In [1]:
# Import Library
import pandas as pd
import numpy as np

# Dataset loading

In [2]:
# Load Data
emp_survey = pd.read_csv('employee_survey_data.csv')
gen_data = pd.read_csv('general_data.csv')
man_survey = pd.read_csv('manager_survey_data.csv')

In [3]:
emp_survey.shape, gen_data.shape, man_survey.shape

((4410, 4), (4410, 24), (4410, 3))

# Dataset concatenation (emp_survey, gen_data, man_survey)

In [4]:
surv_list = [gen_data, man_survey]
df = pd.DataFrame(emp_survey)
for i in surv_list:
    df = df.merge(i, how='left', on=['EmployeeID', 'EmployeeID']) 

In [5]:
df.shape, df.columns

((4410, 29),
 Index(['EmployeeID', 'EnvironmentSatisfaction', 'JobSatisfaction',
        'WorkLifeBalance', 'Age', 'Attrition', 'BusinessTravel', 'Department',
        'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
        'Gender', 'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
        'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
        'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
        'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
        'JobInvolvement', 'PerformanceRating'],
       dtype='object'))

# EDA

In [6]:
# Detect Null Value
df.isnull().sum().sort_values(ascending=False)

WorkLifeBalance            38
EnvironmentSatisfaction    25
JobSatisfaction            20
NumCompaniesWorked         19
TotalWorkingYears           9
EmployeeID                  0
MonthlyIncome               0
JobInvolvement              0
YearsWithCurrManager        0
YearsSinceLastPromotion     0
YearsAtCompany              0
TrainingTimesLastYear       0
StockOptionLevel            0
StandardHours               0
PercentSalaryHike           0
Over18                      0
JobRole                     0
MaritalStatus               0
JobLevel                    0
Gender                      0
EmployeeCount               0
EducationField              0
Education                   0
DistanceFromHome            0
Department                  0
BusinessTravel              0
Attrition                   0
Age                         0
PerformanceRating           0
dtype: int64

In [7]:
# Check the type of the columns that have null values
df.WorkLifeBalance.dtype, df.EnvironmentSatisfaction.dtype, df.JobSatisfaction.dtype, df.NumCompaniesWorked.dtype, df.TotalWorkingYears.dtype

(dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'))

In [8]:
print(f'Numerical Columns: {len(df.select_dtypes(include=["int64", "float64"]).columns)}')
print(f'Categorical Columns: {len(df.select_dtypes(include=["object"]).columns)}')

Numerical Columns: 21
Categorical Columns: 8


In [9]:
# Define the Numerical Features
feat_numerical = sorted(df.select_dtypes(include=["int64", "float64"]).columns)
feat_numerical

['Age',
 'DistanceFromHome',
 'Education',
 'EmployeeCount',
 'EmployeeID',
 'EnvironmentSatisfaction',
 'JobInvolvement',
 'JobLevel',
 'JobSatisfaction',
 'MonthlyIncome',
 'NumCompaniesWorked',
 'PercentSalaryHike',
 'PerformanceRating',
 'StandardHours',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

In [10]:
# Define the Categorical Features
feat_ordinal = sorted(df.select_dtypes(include=["object"]).columns)
feat_ordinal

['Attrition',
 'BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'Over18']

In [11]:
# Find the cardinality of Categorical Features
df.select_dtypes(include=["object"]).nunique().sort_values(ascending=False)

JobRole           9
EducationField    6
BusinessTravel    3
Department        3
MaritalStatus     3
Attrition         2
Gender            2
Over18            1
dtype: int64

# Data preprocessor pipeline

1. Remove the column 'Over18'

Because it provides constant value, which does not provide any discriminatory power and does not contribute to the variation in the data.

2. Label the column 'Gender'

3. Define the numerical columns and nominal columns
('JobRole', 'EducationField','BusinessTravel', 'Department', 'MaritalStatus')

4. Numerical Features
- KNNImputer (For missing values)
- MinMaxScaler (For normalization)

5. Nominal/ Categorical Features
- OneHotEncoder (To encode the categorical features to new columns)




In [12]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import make_pipeline

# Drop the 'Over18' column
df.drop('Over18', axis=1, inplace=True)

# Encode the target variable 'Attrition' using LabelEncoder
target_encoder = LabelEncoder()
df['Attrition'] = target_encoder.fit_transform(df['Attrition'])

# Encode the 'Gender' column using LabelEncoder
df['Gender'] = LabelEncoder().fit_transform(df['Gender'])

# Separate numerical,and nominal columns
feat_numerical = df.select_dtypes(include=["int64", "float64"]).columns
feat_nominal = ['JobRole', 'EducationField','BusinessTravel', 'Department', 'MaritalStatus']

# Create a pipeline for numerical columns
numerical_pipeline = make_pipeline(
    KNNImputer(),
    MinMaxScaler()
)

# Create a pipeline for nominal columns (one-hot encoding)
nominal_pipeline = make_pipeline(
    OneHotEncoder(sparse=False, handle_unknown='ignore')
)

# Create a column transformer for preprocessing
preprocessor = make_column_transformer(
    (numerical_pipeline, feat_numerical),
    (nominal_pipeline, feat_nominal),
    remainder='drop'
)
preprocessor

# Model Evaluation - Logistic Regression

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Split the data into train and validation sets
X_train, X_validation, y_train, y_validation = train_test_split(df.drop(columns=['Attrition']), df.Attrition, test_size=0.3)

# Fit and transform the preprocessor on the training data
preproc_Xtrain = preprocessor.fit_transform(X_train)
preproc_Xvalidation = preprocessor.transform(X_validation)

# Create and train a logistic regression model
logreg = LogisticRegression()
logreg.fit(preproc_Xtrain, y_train)

# Predict on the validation data
y_pred = logreg.predict(preproc_Xvalidation)

# Calculate evaluation metrics
confusion_mat = confusion_matrix(y_validation, y_pred)
accuracy = accuracy_score(y_validation, y_pred)
precision = precision_score(y_validation, y_pred)
recall = recall_score(y_validation, y_pred)
f1 = f1_score(y_validation, y_pred)
report = classification_report(y_validation, y_pred)

# Print the evaluation metrics
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print('Classification Report:\n', report)


Confusion Matrix:
 [[1083   25]
 [ 180   35]]
Accuracy: 0.8450491307634165
Precision: 0.5833333333333334
Recall: 0.16279069767441862
F1 Score: 0.2545454545454546
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.98      0.91      1108
           1       0.58      0.16      0.25       215

    accuracy                           0.85      1323
   macro avg       0.72      0.57      0.58      1323
weighted avg       0.81      0.85      0.81      1323





# Model Evaluation for more models
- Logistic Regression
- DecisionTree Classifier
- RandomForestClassifier
- XGB Classifier
- LGBM Classifier

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

# Create a list of classification models
models = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    xgb.XGBClassifier(),
    lgb.LGBMClassifier()
]

# Define a list to store the performance metrics for each model
results = []

# Iterate through the models
for model in models:
    # Fit the model on the preprocessed training data
    model.fit(preproc_Xtrain, y_train)
    
    # Predict the target variable on the preprocessed validation data
    y_pred = model.predict(preproc_Xvalidation)
    
    # Calculate the evaluation metrics
    accuracy = accuracy_score(y_validation, y_pred)
    precision = precision_score(y_validation, y_pred)
    recall = recall_score(y_validation, y_pred)
    f1 = f1_score(y_validation, y_pred)
    
    # Generate the classification report
    report = classification_report(y_validation, y_pred)
    
    # Generate the confusion matrix
    matrix = confusion_matrix(y_validation, y_pred)
    
    # Store the results in a dictionary
    model_results = {
        'Model': model.__class__.__name__,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Classification Report': report,
        'Confusion Matrix': matrix
    }
    
    # Append the results to the list
    results.append(model_results)

# Display the results
for result in results:
    print(result['Model'])
    print('Confusion Matrix:\n', result['Confusion Matrix'])
    print('Accuracy:', result['Accuracy'])
    print('Precision:', result['Precision'])
    print('Recall:', result['Recall'])
    print('F1-Score:', result['F1-Score'])
    print('Classification Report:\n', result['Classification Report'])
    print('-' * 40)

LogisticRegression
Confusion Matrix:
 [[1083   25]
 [ 180   35]]
Accuracy: 0.8450491307634165
Precision: 0.5833333333333334
Recall: 0.16279069767441862
F1-Score: 0.2545454545454546
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.98      0.91      1108
           1       0.58      0.16      0.25       215

    accuracy                           0.85      1323
   macro avg       0.72      0.57      0.58      1323
weighted avg       0.81      0.85      0.81      1323

----------------------------------------
DecisionTreeClassifier
Confusion Matrix:
 [[1082   26]
 [  25  190]]
Accuracy: 0.9614512471655329
Precision: 0.8796296296296297
Recall: 0.8837209302325582
F1-Score: 0.8816705336426915
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      1108
           1       0.88      0.88      0.88       215

    accuracy                           0.96      1323
 

# Summary
## DecisionTreeClassifier
Confusion Matrix:

 [[1078   24]\
 [  16  205]]

Accuracy: 0.9697656840513983

Precision: 0.8951965065502183

Recall: 0.9276018099547512

F1-Score: 0.9111111111111111

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      1102
           1       0.90      0.93      0.91       221

    accuracy                           0.97      1323

----------------------------------------
## RandomForestClassifier
Confusion Matrix:

 [[1102    0]

 [  19  202]]

Accuracy: 0.9856386999244142

Precision: 1.0

Recall: 0.9140271493212669

F1-Score: 0.9550827423167849

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1102
           1       1.00      0.91      0.96       221

    accuracy                           0.99      1323

----------------------------------------
## XGBClassifier
Confusion Matrix:

 [[1092   10]

 [  10  211]]

Accuracy: 0.9848828420256992

Precision: 0.9547511312217195

Recall: 0.9547511312217195

F1-Score: 0.9547511312217195

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1102
           1       0.95      0.95      0.95       221

    accuracy                           0.98      1323

----------------------------------------
## LGBMClassifier
Confusion Matrix:

 [[1096    6]

 [  16  205]]

Accuracy: 0.983371126228269

Precision: 0.9715639810426541

Recall: 0.9276018099547512

F1-Score: 0.9490740740740741

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1102
           1       0.97      0.93      0.95       221

    accuracy                           0.98      1323
