In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [2]:
# Load data
data = pd.read_csv(r"C:\Users\svelo\Downloads\Attrition data.csv")

# Display basic information about the data
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   EmployeeID               4410 non-null   int64  
 1   Age                      4410 non-null   int64  
 2   Attrition                4410 non-null   object 
 3   BusinessTravel           4410 non-null   object 
 4   Department               4410 non-null   object 
 5   DistanceFromHome         4410 non-null   int64  
 6   Education                4410 non-null   int64  
 7   EducationField           4410 non-null   object 
 8   EmployeeCount            4410 non-null   int64  
 9   Gender                   4410 non-null   object 
 10  JobLevel                 4410 non-null   int64  
 11  JobRole                  4410 non-null   object 
 12  MaritalStatus            4410 non-null   object 
 13  MonthlyIncome            4410 non-null   int64  
 14  NumCompaniesWorked      

In [3]:
# Drop columns that are not useful or have constant values
data.drop(['Over18', 'StandardHours'], axis=1, inplace=True)

# Separate numeric and categorical columns
numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

# Fill missing values
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# For categorical columns, fill missing values with the mode
for col in categorical_cols:
    mode_value = data[col].mode()
    if not mode_value.empty:
        data[col].fillna(mode_value[0], inplace=True)

# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Display the first few rows to verify preprocessing
data.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(mode_value[0], inplace=True)


Unnamed: 0,EmployeeID,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,Gender,...,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,JobInvolvement,PerformanceRating
0,1,51,0,2,2,6,2,1,1,0,...,1.0,6,1,0,0,3.0,4.0,2.0,3,3
1,2,31,1,1,1,10,1,1,1,0,...,6.0,3,5,1,4,3.0,2.0,4.0,2,4
2,3,32,0,1,1,17,4,4,1,1,...,5.0,2,5,0,3,2.0,2.0,1.0,3,3
3,4,38,0,0,1,2,5,1,1,1,...,13.0,5,8,7,5,4.0,4.0,3.0,2,3
4,5,32,0,2,1,10,1,3,1,1,...,9.0,2,6,0,4,4.0,1.0,3.0,3,3


In [4]:
# Split data into features and target
X = data.drop('Attrition', axis=1)
y = data['Attrition']

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

accuracy, report


(0.9886621315192744,
 '              precision    recall  f1-score   support\n\n           0       0.99      1.00      0.99       741\n           1       1.00      0.93      0.96       141\n\n    accuracy                           0.99       882\n   macro avg       0.99      0.96      0.98       882\nweighted avg       0.99      0.99      0.99       882\n')

In [5]:
# Identify feature importance
feature_importances = pd.Series(model.feature_importances_, index=X.columns)

# Display feature importances
feature_importances.sort_values(ascending=False)


MonthlyIncome              0.084947
Age                        0.084011
TotalWorkingYears          0.078078
DistanceFromHome           0.055244
YearsAtCompany             0.054645
PercentSalaryHike          0.052052
YearsWithCurrManager       0.046428
NumCompaniesWorked         0.046326
JobRole                    0.043447
JobSatisfaction            0.041186
EnvironmentSatisfaction    0.039719
MaritalStatus              0.037641
EducationField             0.034466
TrainingTimesLastYear      0.034075
YearsSinceLastPromotion    0.033390
Education                  0.031288
WorkLifeBalance            0.030841
JobLevel                   0.027609
JobInvolvement             0.027498
StockOptionLevel           0.027141
EmployeeID                 0.026562
BusinessTravel             0.021333
Department                 0.020970
Gender                     0.012326
PerformanceRating          0.008777
EmployeeCount              0.000000
dtype: float64

In [6]:
# Initialize the Dash app
app = dash.Dash(__name__)


In [7]:
# Function to create dynamic figures with descriptive labels
def create_dynamic_figures(data):
    figures = []
    for column in data.columns:
        if column == 'Attrition':
            continue
        
        unique_values = data[column].nunique()
        if data[column].dtype == 'object' or unique_values < 10:
            # Map encoded values back to original labels
            original_labels = label_encoders.get(column, None)
            if original_labels:
                data[column] = original_labels.inverse_transform(data[column])
            
            # Bar plot for categorical or low cardinality features
            fig = px.bar(data, x=column, y='Attrition', title=f'Attrition by {column}', barmode='group')
        else:
            # Box plot for numerical features
            fig = px.box(data, x='Attrition', y=column, title=f'{column} Distribution by Attrition')
        
        figures.append(fig)
    
    # Add feature importance plot
    fig_importance = px.bar(feature_importances.sort_values(ascending=False),
                            title='Feature Importances')
    figures.append(fig_importance)
    
    return figures

# Create initial figures
figures = create_dynamic_figures(data)


In [8]:
# Define the app layout
app.layout = html.Div([
    html.H1('Employee Attrition Dashboard'),
    
    html.Div([
        dcc.Graph(figure=fig) for fig in figures
    ])
])

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)
