## MODEL TRAINING

### IMPORTING LIBRARIES

In [50]:
# Importing numpy
import numpy as np

# Importing pandas
import pandas as pd

# Importing warnings
import warnings

# Ignoring warnings
warnings.filterwarnings('ignore')

# Importing pickle
import pickle

# Importing import_ipynb
import import_ipynb

# Importing data_preprocessing file
from data_preprocessing import *

# Importing train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Importing DecisionTreeClassifier from sklearn
from sklearn.tree import DecisionTreeClassifier

# Importing RandomForestClassifier from sklearn
from sklearn.ensemble import RandomForestClassifier

# Importing MLPClassifier from sklearn
from sklearn.neural_network import MLPClassifier

# Importing f1_score, make_scorer from sklearn
from sklearn.metrics import f1_score, make_scorer

# Importing RandomizedSearchCV from sklearn
from sklearn.model_selection import RandomizedSearchCV

# Importing pickle
import pickle

### LOADING DATA

In [51]:
# Read the raw employee records from the Excel workbook into a DataFrame
data=pd.read_excel(io='raw_data.xls')

# Bare last expression: the notebook renders a (truncated) preview of the frame
data

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,E1001000,32,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,10,3,...,4,10,2,2,10,7,0,8,No,3
1,E1001006,47,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,14,4,...,4,20,2,3,7,7,1,7,No,3
2,E1001007,40,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Frequently,5,4,...,3,20,2,3,18,13,1,12,No,4
3,E1001009,41,Male,Human Resources,Divorced,Human Resources,Manager,Travel_Rarely,10,4,...,2,23,2,2,21,6,12,6,No,3
4,E1001010,60,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,16,4,...,4,10,1,3,2,2,2,2,No,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,E100992,27,Female,Medical,Divorced,Sales,Sales Executive,Travel_Frequently,3,1,...,2,6,3,3,6,5,0,4,No,4
1196,E100993,37,Male,Life Sciences,Single,Development,Senior Developer,Travel_Rarely,10,2,...,1,4,2,3,1,0,0,0,No,3
1197,E100994,50,Male,Medical,Married,Development,Senior Developer,Travel_Rarely,28,1,...,3,20,3,3,20,8,3,8,No,3
1198,E100995,34,Female,Medical,Single,Data Science,Data Scientist,Travel_Rarely,9,3,...,2,9,3,4,8,7,7,7,No,3


### PREPROCESSING 

Before training, we split the data into features and target, and then transform the features with the preprocessor loaded from 'pipeline.pkl'.

In [52]:
# Target: the rating column the models will learn to predict
target=data['PerformanceRating']

# Features: every remaining column of the raw data
features=data.drop(columns='PerformanceRating')

In [53]:
# Restore the preprocessor object that was serialized to 'pipeline.pkl'.
# NOTE(review): pickle.load can execute arbitrary code contained in the file,
# so only load a 'pipeline.pkl' that was produced by this project.
with open('pipeline.pkl',mode='rb') as pipeline_file:
    preprocessor=pickle.load(pipeline_file)

In [54]:
# Display the loaded preprocessor (bare last expression -> notebook repr of the object)
preprocessor

In [55]:
# Column names to attach to the array returned by preprocessor.transform.
# NOTE(review): the order here presumably has to match the column order the
# pipeline emits — confirm against the pipeline definition before editing.
preprocessed_columns=[
    'Age',
    'DistanceFromHome',
    'EmpHourlyRate',
    'NumCompaniesWorked',
    'EmpLastSalaryHikePercent',
    'TotalWorkExperienceInYears',
    'TrainingTimesLastYear',
    'ExperienceYearsAtThisCompany',
    'ExperienceYearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
    'EmpEducationLevel',
    'EmpEnvironmentSatisfaction',
    'EmpJobInvolvement',
    'EmpJobLevel',
    'EmpJobSatisfaction',
    'EmpRelationshipSatisfaction',
    'EmpWorkLifeBalance',
    'Gender',
    'OverTime',
    'Attrition',
    'EducationBackground',
    'MaritalStatus',
    'EmpDepartment',
    'EmpJobRole',
    'BusinessTravelFrequency',
]

# Sanity check: number of names in the list
len(preprocessed_columns)

26

In [56]:
# Apply the loaded preprocessor to the raw features and wrap the result in a
# DataFrame labelled with the expected column names.
# index=features.index keeps the original row labels, so the later
# pd.concat(..., axis=1) with `target` lines up row-for-row even when the raw
# data does not carry a default 0..n-1 index (otherwise pandas would align on
# mismatched labels and introduce NaN rows).
preprocessed_features=pd.DataFrame(preprocessor.transform(features),columns=preprocessed_columns,index=features.index)

In [57]:
# Display the preprocessed feature frame (bare last expression -> truncated preview)
preprocessed_features

Unnamed: 0,Age,DistanceFromHome,EmpHourlyRate,NumCompaniesWorked,EmpLastSalaryHikePercent,TotalWorkExperienceInYears,TrainingTimesLastYear,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,...,EmpRelationshipSatisfaction,EmpWorkLifeBalance,Gender,OverTime,Attrition,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency
0,0.333333,0.321429,0.357143,0.111111,0.071429,0.250,0.333333,0.250,0.388889,0.000000,...,4.0,2.0,1.0,0.0,0.0,2.0,2.0,5.0,13.0,2.0
1,0.690476,0.464286,0.171429,0.222222,0.071429,0.500,0.333333,0.175,0.388889,0.066667,...,4.0,3.0,1.0,0.0,0.0,2.0,2.0,5.0,13.0,2.0
2,0.523810,0.142857,0.257143,0.555556,0.714286,0.500,0.333333,0.450,0.722222,0.066667,...,3.0,3.0,1.0,1.0,0.0,1.0,1.0,5.0,13.0,1.0
3,0.547619,0.321429,0.614286,0.333333,0.285714,0.575,0.333333,0.525,0.333333,0.800000,...,2.0,2.0,1.0,0.0,0.0,0.0,0.0,3.0,8.0,2.0
4,1.000000,0.535714,0.771429,0.888889,0.214286,0.250,0.166667,0.050,0.111111,0.133333,...,4.0,3.0,1.0,0.0,0.0,2.0,2.0,5.0,13.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,0.214286,0.071429,0.585714,0.111111,0.642857,0.150,0.500000,0.150,0.277778,0.000000,...,2.0,3.0,0.0,1.0,0.0,3.0,0.0,5.0,13.0,1.0
1196,0.452381,0.321429,0.714286,0.333333,0.428571,0.100,0.333333,0.025,0.000000,0.000000,...,1.0,3.0,1.0,0.0,0.0,1.0,2.0,1.0,15.0,2.0
1197,0.761905,0.964286,0.628571,0.111111,0.000000,0.500,0.500000,0.500,0.444444,0.200000,...,3.0,3.0,1.0,1.0,0.0,3.0,1.0,1.0,15.0,2.0
1198,0.380952,0.285714,0.228571,0.111111,0.214286,0.225,0.500000,0.200,0.388889,0.466667,...,2.0,4.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,2.0


In [58]:
# Put the target back next to the preprocessed features (column-wise, aligned on the row index)
preprocessed_data=pd.concat(objs=[preprocessed_features,target],axis='columns')

In [59]:
# Display the combined frame (preprocessed features + PerformanceRating) used for model training
preprocessed_data

Unnamed: 0,Age,DistanceFromHome,EmpHourlyRate,NumCompaniesWorked,EmpLastSalaryHikePercent,TotalWorkExperienceInYears,TrainingTimesLastYear,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,...,EmpWorkLifeBalance,Gender,OverTime,Attrition,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,PerformanceRating
0,0.333333,0.321429,0.357143,0.111111,0.071429,0.250,0.333333,0.250,0.388889,0.000000,...,2.0,1.0,0.0,0.0,2.0,2.0,5.0,13.0,2.0,3
1,0.690476,0.464286,0.171429,0.222222,0.071429,0.500,0.333333,0.175,0.388889,0.066667,...,3.0,1.0,0.0,0.0,2.0,2.0,5.0,13.0,2.0,3
2,0.523810,0.142857,0.257143,0.555556,0.714286,0.500,0.333333,0.450,0.722222,0.066667,...,3.0,1.0,1.0,0.0,1.0,1.0,5.0,13.0,1.0,4
3,0.547619,0.321429,0.614286,0.333333,0.285714,0.575,0.333333,0.525,0.333333,0.800000,...,2.0,1.0,0.0,0.0,0.0,0.0,3.0,8.0,2.0,3
4,1.000000,0.535714,0.771429,0.888889,0.214286,0.250,0.166667,0.050,0.111111,0.133333,...,3.0,1.0,0.0,0.0,2.0,2.0,5.0,13.0,2.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,0.214286,0.071429,0.585714,0.111111,0.642857,0.150,0.500000,0.150,0.277778,0.000000,...,3.0,0.0,1.0,0.0,3.0,0.0,5.0,13.0,1.0,4
1196,0.452381,0.321429,0.714286,0.333333,0.428571,0.100,0.333333,0.025,0.000000,0.000000,...,3.0,1.0,0.0,0.0,1.0,2.0,1.0,15.0,2.0,3
1197,0.761905,0.964286,0.628571,0.111111,0.000000,0.500,0.500000,0.500,0.444444,0.200000,...,3.0,1.0,1.0,0.0,3.0,1.0,1.0,15.0,2.0,3
1198,0.380952,0.285714,0.228571,0.111111,0.214286,0.225,0.500000,0.200,0.388889,0.466667,...,4.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,2.0,3


The 'preprocessed_data' has 27 columns ( 26 features and 1 target ). To train the model, we have to split the data into features and target again.

In [60]:
# Persist the combined frame to 'preprocessed_data.csv'.
# With pandas' default index=True the row index is written as the first, unnamed column.
preprocessed_data.to_csv(path_or_buf='preprocessed_data.csv')

### DATA SPLITTING

In [61]:
# Target vector for training/evaluation
y=preprocessed_data['PerformanceRating']

# Feature matrix: everything except the target column
X=preprocessed_data.drop(columns='PerformanceRating')

In [62]:
# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same class proportions, and fix the seed so the split is repeatable
x_train,x_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=18,
)

In [63]:
# Class distribution of the training target (number of rows per PerformanceRating value)
y_train.value_counts()

PerformanceRating
3    699
2    155
4    106
Name: count, dtype: int64

In [64]:
# Class distribution of the testing target (number of rows per PerformanceRating value)
y_test.value_counts()

PerformanceRating
3    175
2     39
4     26
Name: count, dtype: int64

There are 1200 rows in the preprocessed_data. We split 80% of the data (960 rows) into the training set and 20% of the data (240 rows) into the testing set, stratified by PerformanceRating.

### MODEL TRAINING

### DECISION TREE

In [65]:
# Baseline decision tree with default hyperparameters and a fixed seed;
# fit() returns the fitted estimator, so construction and training are chained
model=DecisionTreeClassifier(random_state=27).fit(x_train,y_train)

# Predicted ratings for the held-out test rows
y_pred=model.predict(x_test)

In [66]:
# Macro-averaged F1 of the baseline tree on the test set (each class weighted equally)
f1_score(y_true=y_test,y_pred=y_pred,average='macro')

0.8781807804068079

In [67]:
# Confusion table: rows are the true ratings, columns are the predicted ratings
pd.crosstab(index=y_test,columns=y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,31,8,0
3,2,167,6
4,1,2,23


### Hyperparameter Tuning for Decision Tree

In [151]:
# Base estimator for the search; every hyperparameter (including the seed) is
# supplied per candidate through `params` below
model_ht=DecisionTreeClassifier()

# Search space sampled by RandomizedSearchCV.
# The integer ranges start at the smallest value scikit-learn accepts
# (max_depth >= 1, min_samples_split >= 2, min_samples_leaf >= 1). The former
# range(30)/range(20) also contained 0 (and 1 for min_samples_split); those
# candidates (about 17% of the space) are rejected at fit time, and the
# resulting warnings are hidden by the global warnings filter.
# NOTE(review): 'random_state' is searched like a hyperparameter, which picks a
# favourable seed rather than a better model — consider fixing it instead.
params={'criterion':['gini','entropy'],
       'splitter':['best','random'],
       'max_depth': range(1,30),
       'min_samples_split': range(2,20),
       'min_samples_leaf':range(1,20),
       'random_state':range(43)}

In [153]:
# Scorer for the search: macro-averaged F1 (each class weighted equally)
f1=make_scorer(f1_score,average='macro')

# Randomized search over `params` with 5-fold cross-validation on all cores.
# random_state fixes which candidates are drawn, so re-running the cell
# evaluates the same 20000 parameter combinations.
tree_cv=RandomizedSearchCV(estimator=model_ht,
                    param_distributions=params,
                    n_jobs=-1,
                    cv=5,
                    scoring=f1,
                    n_iter=20000,
                    random_state=42,
                    verbose=3)

# Run the search on the training split only (the test split stays untouched)
tree_cv.fit(x_train,y_train)

Fitting 5 folds for each of 20000 candidates, totalling 100000 fits


In [154]:
# Show the best hyperparameter combination found by the search and its mean
# cross-validated score (macro-F1, as configured via `scoring`)
tree_cv.best_params_, tree_cv.best_score_

({'splitter': 'best',
  'random_state': 21,
  'min_samples_split': 16,
  'min_samples_leaf': 3,
  'max_depth': 4,
  'criterion': 'entropy'},
 0.8891446606362795)

### Best Decision Tree Model

In [68]:
# Hyperparameters copied by hand from the best candidate reported by the
# decision-tree randomized search
best_tree_params=dict(
    criterion='entropy',
    splitter='best',
    max_depth=4,
    min_samples_split=16,
    min_samples_leaf=3,
    random_state=21,
)

# Build and train the tuned tree in one step (fit() returns the estimator)
DecisionTreeClassifier_model=DecisionTreeClassifier(**best_tree_params).fit(x_train,y_train)

# Predicted ratings for the held-out test rows
y_pred=DecisionTreeClassifier_model.predict(x_test)

In [69]:
# Macro-F1 of the tuned tree on the test set; kept for the final comparison table
DecisionTreeClassifier_f1_score=f1_score(y_true=y_test,y_pred=y_pred,average='macro')

# Report the score
print('DecisionTreeClassifier_f1_score',DecisionTreeClassifier_f1_score)

# Confusion table: rows are the true ratings, columns are the predicted ratings
pd.crosstab(index=y_test,columns=y_pred)

DecisionTreeClassifier_f1_score 0.9020297512944572


col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,34,5,0
3,3,169,3
4,1,3,22


### RANDOM FOREST

In [70]:
# Baseline random forest with default hyperparameters and a fixed seed;
# fit() returns the fitted estimator, so construction and training are chained
model=RandomForestClassifier(random_state=41).fit(x_train,y_train)

# Predicted ratings for the held-out test rows
y_pred=model.predict(x_test)

In [71]:
# Macro-averaged F1 of the baseline forest on the test set
f1_score(y_true=y_test,y_pred=y_pred,average='macro')

0.9445383411580593

In [72]:
# Confusion table: rows are the true ratings, columns are the predicted ratings
pd.crosstab(index=y_test,columns=y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,35,4,0
3,0,174,1
4,1,2,23


### Hyperparameter Tuning for Random Forest

In [143]:
# Base estimator for the search; every hyperparameter (including the seed) is
# supplied per candidate through `params` below
model_ht=RandomForestClassifier()

# Search space sampled by RandomizedSearchCV.
# The integer ranges start at the smallest value scikit-learn accepts
# (max_depth >= 1, min_samples_split >= 2, min_samples_leaf >= 1). The former
# range(30)/range(20) also contained 0 (and 1 for min_samples_split); those
# candidates (about 17% of the space) are rejected at fit time, and the
# resulting warnings are hidden by the global warnings filter.
# NOTE(review): 'random_state' is searched like a hyperparameter, which picks a
# favourable seed rather than a better model — consider fixing it instead.
params={'n_estimators':[100,300,500,800,1000],
       'criterion':['gini','entropy'],
       'max_depth': range(1,30),
       'min_samples_split': range(2,20),
       'min_samples_leaf':range(1,20),
       'random_state':range(43)}

In [144]:
# Scorer for the search: macro-averaged F1 (each class weighted equally)
f1=make_scorer(f1_score,average='macro')

# Randomized search over `params` with 5-fold cross-validation on all cores.
# random_state fixes which candidates are drawn, so re-running the cell
# evaluates the same 2000 parameter combinations.
forest_cv=RandomizedSearchCV(estimator=model_ht,
                    param_distributions=params,
                    n_jobs=-1,
                    cv=5,
                    scoring=f1,
                    n_iter=2000,
                    random_state=42,
                    verbose=3)

# Run the search on the training split only (the test split stays untouched)
forest_cv.fit(x_train,y_train)

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits


In [145]:
# Show the best hyperparameter combination found by the search and its mean
# cross-validated score (macro-F1, as configured via `scoring`)
forest_cv.best_params_, forest_cv.best_score_

({'random_state': 13,
  'n_estimators': 300,
  'min_samples_split': 10,
  'min_samples_leaf': 1,
  'max_depth': 19,
  'criterion': 'gini'},
 0.8928361944938299)

### Best Random Forest Model

In [73]:
# Hyperparameters copied by hand from the best candidate reported by the
# random-forest randomized search
best_forest_params=dict(
    n_estimators=300,
    criterion='gini',
    max_depth=19,
    min_samples_split=10,
    min_samples_leaf=1,
    random_state=13,
)

# Build and train the tuned forest in one step (fit() returns the estimator)
RandomForestClassifier_model=RandomForestClassifier(**best_forest_params).fit(x_train,y_train)

# Predicted ratings for the held-out test rows
y_pred=RandomForestClassifier_model.predict(x_test)

In [74]:
# Macro-F1 of the tuned forest on the test set; kept for the final comparison table
RandomForestClassifier_f1_score=f1_score(y_true=y_test,y_pred=y_pred,average='macro')

# Report the score
print('RandomForestClassifier_f1_score',RandomForestClassifier_f1_score)

# Confusion table: rows are the true ratings, columns are the predicted ratings
pd.crosstab(index=y_test,columns=y_pred)

RandomForestClassifier_f1_score 0.9388156696021864


col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,34,5,0
3,0,174,1
4,1,2,23


### ARTIFICIAL NEURAL NETWORK

In [75]:
# Baseline multi-layer perceptron with default hyperparameters and a fixed seed;
# fit() returns the fitted estimator, so construction and training are chained
model=MLPClassifier(random_state=14).fit(x_train,y_train)

# Predicted ratings for the held-out test rows
y_pred=model.predict(x_test)

In [76]:
# Macro-averaged F1 of the baseline MLP on the test set
f1_score(y_true=y_test,y_pred=y_pred,average='macro')

0.768470362152117

In [77]:
# Confusion table: rows are the true ratings, columns are the predicted ratings
pd.crosstab(index=y_test,columns=y_pred)

col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,24,15,0
3,13,157,5
4,0,6,20


### Hyperparameter Tuning for MLPClassifier

In [160]:
# Base estimator for the search; every hyperparameter (including the seed) is
# supplied per candidate through `params` below
model_ht=MLPClassifier()

# Search space sampled by RandomizedSearchCV.
# (100,3) and (500,3) were listed twice in 'hidden_layer_sizes', which made the
# search draw those architectures twice as often; each shape now appears once.
# NOTE(review): scikit-learn reads each tuple as "neurons per hidden layer", so
# (1000,3) is a 1000-unit layer followed by a 3-unit layer and (1000,1) ends in
# a single-unit layer. If these were meant as (units, number_of_layers), the
# intended entries would be e.g. (100,100) or (500,500,500) — confirm intent.
# NOTE(review): 'random_state' is searched like a hyperparameter, which picks a
# favourable seed rather than a better model — consider fixing it instead.
params={'hidden_layer_sizes':[(100,),(500,),(100,2),(500,2),(100,3),(500,3),(1000,),(1000,1),(1000,3)],
       'activation':['relu','tanh'],
       'solver':['lbfgs','adam','sgd'],
       'alpha':[0.1,0.01,0.001,0.0001],
       'learning_rate_init':[0.1,0.01,0.001,0.0001],
       'max_iter':[200,500,1000,2000],
       'random_state':range(43)}

In [161]:
# Scorer for the search: macro-averaged F1 (each class weighted equally)
f1=make_scorer(f1_score,average='macro')

# Randomized search over `params` with 5-fold cross-validation on all cores.
# random_state fixes which candidates are drawn, so re-running the cell
# evaluates the same 500 parameter combinations.
ann_cv=RandomizedSearchCV(estimator=model_ht,
                   param_distributions=params,
                   n_jobs=-1,
                   cv=5,
                   verbose=3,
                   n_iter=500,
                   random_state=42,
                   scoring=f1)

# Run the search on the training split only (the test split stays untouched)
ann_cv.fit(x_train,y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


In [162]:
# Show the best hyperparameter combination found by the search and its mean
# cross-validated score (macro-F1, as configured via `scoring`)
ann_cv.best_params_, ann_cv.best_score_

({'solver': 'adam',
  'random_state': 30,
  'max_iter': 2000,
  'learning_rate_init': 0.0001,
  'hidden_layer_sizes': (1000, 3),
  'alpha': 0.0001,
  'activation': 'tanh'},
 0.706237247529969)

### Best MLPClassifier model

In [78]:
# Hyperparameters copied by hand from the best candidate reported by the
# MLP randomized search
best_mlp_params=dict(
    activation='tanh',
    alpha=0.0001,
    hidden_layer_sizes=(1000,3),
    solver='adam',
    learning_rate_init=0.0001,
    random_state=30,
    max_iter=2000,
)

# Build and train the tuned network in one step (fit() returns the estimator)
MLPClassifier_model=MLPClassifier(**best_mlp_params).fit(x_train,y_train)

# Predicted ratings for the held-out test rows
y_pred=MLPClassifier_model.predict(x_test)

In [79]:
# Macro-F1 of the tuned MLP on the test set; kept for the final comparison table
MLPClassifier_f1_score=f1_score(y_true=y_test,y_pred=y_pred,average='macro')

# Report the score
print('MLPClassifier_f1_score',MLPClassifier_f1_score)

# Confusion table: rows are the true ratings, columns are the predicted ratings
pd.crosstab(index=y_test,columns=y_pred)

MLPClassifier_f1_score 0.7929176379176379


col_0,2,3,4
PerformanceRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,25,14,0
3,14,158,3
4,0,5,21


### FINAL MODEL FOR PREDICTION

In [80]:
# One (model name, test-set macro-F1) record per tuned model
model_scores=[
    ('DecisionTreeClassifier_model',DecisionTreeClassifier_f1_score),
    ('RandomForestClassifier_model',RandomForestClassifier_f1_score),
    ('MLPClassifier_model',MLPClassifier_f1_score),
]

# Comparison table built from the records above
finalmodel_comparison=pd.DataFrame(model_scores,columns=['Model','f1_score'])

# Display the table ranked best-first (the sorted copy is shown, not stored)
finalmodel_comparison.sort_values(by='f1_score',ascending=False,ignore_index=True)

Unnamed: 0,Model,f1_score
0,RandomForestClassifier_model,0.938816
1,DecisionTreeClassifier_model,0.90203
2,MLPClassifier_model,0.792918


The best model is RandomForestClassifier_model, so we will use it as the final model for prediction.

In [81]:
# Final model for prediction: an alias for the tuned random forest.
# The choice is hard-coded from the comparison table rather than derived from
# `finalmodel_comparison`, so update it by hand if the ranking changes.
final_model=RandomForestClassifier_model

In [82]:
# Serialize the chosen model to 'final_model.pkl' so it can be loaded later for prediction
with open('final_model.pkl',mode='wb') as model_file:
    pickle.dump(final_model,model_file)

We will use the file 'final_model.pkl' for prediction.