In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost

In [2]:
df=pd.read_csv('HR_attrition_dataset.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [3]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [4]:
df.drop(['EmployeeCount', 'EmployeeID','Over18','StandardHours'],axis=1,inplace=True)

In [5]:
df.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
PercentSalaryHike           0
StockOptionLevel            0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

### Missing value handling

In [6]:
df.fillna({'NumCompaniesWorked':df['NumCompaniesWorked'].mode()[0],'TotalWorkingYears':df['TotalWorkingYears'].mode()[0]},inplace=True)

### Outlier Handling

In [7]:
outlier_lst=['NumCompaniesWorked','YearsSinceLastPromotion','TotalWorkingYears','YearsAtCompany']
# Handling outliers by interquartile rule based capping
for i in outlier_lst:
    Q1 = df[i].quantile(0.25)
    Q3 = df[i].quantile(0.75)
    IQR = Q3 - Q1
    u_lim=(Q3+1.5*IQR)
    df[i].where(df[i] <=u_lim,u_lim, inplace=True)  #replacing all upper outliers with upper-limit
    df[i]=df[i].astype('int')

In [8]:
df1=df.copy()

### Feature Engineering

In [9]:
df['Age']=pd.cut(df['Age'],bins=[17,25,35,45,55,64],labels=['25 & Below','26-35','36-45','46-55','56 & Above'])
df['Age'] = df['Age'].astype('object')
df['Age'].value_counts()

26-35         1818
36-45         1404
46-55          678
25 & Below     369
56 & Above     141
Name: Age, dtype: int64

In [10]:
#Function to categorise income into slabs
def categorise_income(row):  
    if row['MonthlyIncome'] > 10000 and row['MonthlyIncome'] <= 25000:
        return '10k-25k'
    elif row['MonthlyIncome'] > 25001 and row['MonthlyIncome'] <= 50000:
        return '25k-50k'
    elif row['MonthlyIncome'] > 50001 and row['MonthlyIncome'] <= 75000:
        return '50k-75k'
    elif row['MonthlyIncome'] > 75001 and row['MonthlyIncome'] <= 100000:
        return '75k-1L'
    elif row['MonthlyIncome'] > 100001 and row['MonthlyIncome'] <= 125000:
        return '1L-1.25L'
    elif row['MonthlyIncome'] > 125001 and row['MonthlyIncome'] <= 150000:
        return '1.25L-1.5L'
    elif row['MonthlyIncome'] > 150001 and row['MonthlyIncome'] <= 175000:
        return '1.5L-1.75L'
    elif row['MonthlyIncome'] > 10000 and row['MonthlyIncome'] <= 200000:
        return '1.75L-2L'
df['MonthlyIncome'] = df.apply(lambda row: categorise_income(row), axis=1)
df['MonthlyIncome'].value_counts()

25k-50k       1569
50k-75k        930
10k-25k        678
75k-1L         390
1L-1.25L       273
1.75L-2L       243
1.25L-1.5L     171
1.5L-1.75L     156
Name: MonthlyIncome, dtype: int64

In [11]:
df.dtypes

Age                        object
Attrition                  object
BusinessTravel             object
Department                 object
DistanceFromHome            int64
Education                   int64
EducationField             object
Gender                     object
JobLevel                    int64
JobRole                    object
MaritalStatus              object
MonthlyIncome              object
NumCompaniesWorked          int32
PercentSalaryHike           int64
StockOptionLevel            int64
TotalWorkingYears           int32
TrainingTimesLastYear       int64
YearsAtCompany              int32
YearsSinceLastPromotion     int32
YearsWithCurrManager        int64
dtype: object

### Feature selection/elimination

In [12]:
df.drop(columns=['Education','StockOptionLevel','JobLevel','Gender','DistanceFromHome','PercentSalaryHike','TrainingTimesLastYear','NumCompaniesWorked','YearsWithCurrManager','YearsSinceLastPromotion'],inplace=True)

### Feature and target Encoding

In [13]:
# Encoding target variable
df['Attrition']=df['Attrition'].map({'Yes': 1, 'No': 0})
df['Attrition'].value_counts()

0    3699
1     711
Name: Attrition, dtype: int64

In [14]:
# Label encoding BusinessTravel feature
df['BusinessTravel']=df['BusinessTravel'].map({'Travel_Frequently': 2,'Travel_Rarely': 1, 'Non-Travel': 0})
df['BusinessTravel'].value_counts()

1    3129
2     831
0     450
Name: BusinessTravel, dtype: int64

In [15]:
# Feature and Target separation for train test split
X=df.drop(['Attrition'],axis=1)
y=df['Attrition']
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=.3,stratify = y) # 30% data is for testing
print(y_test.value_counts())
print(y_train.value_counts())

0    1110
1     213
Name: Attrition, dtype: int64
0    2589
1     498
Name: Attrition, dtype: int64


In [16]:
# One hot encoding remaining categorical features
ohe = make_column_transformer(
    (OneHotEncoder(drop='first'), ['Age','MonthlyIncome','Department','JobRole','MaritalStatus','EducationField']),
    remainder='passthrough')
ohencoded_X_train = ohe.fit_transform(X_train)
X_train = pd.DataFrame(ohencoded_X_train.todense(), columns=ohe.get_feature_names_out())
ohencoded_X_test=ohe.transform(X_test)
X_test = pd.DataFrame(ohencoded_X_test.todense(), columns=ohe.get_feature_names_out())
# print(X_train.head())
# print(X_test.head())

### Feature Scaling/ Standardisation

In [17]:
ss_scale=StandardScaler()
X_train_std=ss_scale.fit_transform(X_train)
X_train=pd.DataFrame(X_train_std,columns = X_train.columns)
X_test_std=ss_scale.transform(X_test)
X_test=pd.DataFrame(X_test_std,columns = X_test.columns)
X_train.head()

Unnamed: 0,onehotencoder__Age_26-35,onehotencoder__Age_36-45,onehotencoder__Age_46-55,onehotencoder__Age_56 & Above,onehotencoder__MonthlyIncome_1.5L-1.75L,onehotencoder__MonthlyIncome_1.75L-2L,onehotencoder__MonthlyIncome_10k-25k,onehotencoder__MonthlyIncome_1L-1.25L,onehotencoder__MonthlyIncome_25k-50k,onehotencoder__MonthlyIncome_50k-75k,...,onehotencoder__MaritalStatus_Married,onehotencoder__MaritalStatus_Single,onehotencoder__EducationField_Life Sciences,onehotencoder__EducationField_Marketing,onehotencoder__EducationField_Medical,onehotencoder__EducationField_Other,onehotencoder__EducationField_Technical Degree,remainder__BusinessTravel,remainder__TotalWorkingYears,remainder__YearsAtCompany
0,1.209017,-0.694773,-0.420057,-0.178189,-0.185789,-0.245145,-0.429621,-0.259668,1.341168,-0.509398,...,1.097008,-0.695798,-0.822682,-0.343028,1.452157,-0.245145,-0.318308,1.698474,-0.423678,0.293523
1,-0.827118,1.43932,-0.420057,-0.178189,-0.185789,-0.245145,-0.429621,-0.259668,1.341168,-0.509398,...,-0.911571,1.437199,-0.822682,-0.343028,1.452157,-0.245145,-0.318308,-0.159495,-0.285393,0.293523
2,-0.827118,1.43932,-0.420057,-0.178189,-0.185789,-0.245145,-0.429621,-0.259668,1.341168,-0.509398,...,-0.911571,-0.695798,-0.822682,-0.343028,-0.688631,-0.245145,3.141611,-0.159495,1.235734,2.115044
3,1.209017,-0.694773,-0.420057,-0.178189,-0.185789,-0.245145,-0.429621,-0.259668,1.341168,-0.509398,...,1.097008,-0.695798,-0.822682,2.915212,-0.688631,-0.245145,-0.318308,-0.159495,-0.147109,0.495914
4,1.209017,-0.694773,-0.420057,-0.178189,-0.185789,-0.245145,-0.429621,-0.259668,-0.745619,-0.509398,...,1.097008,-0.695798,1.215536,-0.343028,-0.688631,-0.245145,-0.318308,-0.159495,-0.700246,-0.516042


### Model Building and evaluation

In [18]:
# Dictionary with model name and model
models = {    
    '      Logistic Regression' : LogisticRegression(),
    '     KNeighborsClassifier' : KNeighborsClassifier(),
    '           SVC Polynomial' : SVC(),
    ' Decision Tree Classifier' : DecisionTreeClassifier(),
    ' Random Forest Classifier' : RandomForestClassifier(),
    '      XG Boost Classifier' : xgboost.XGBClassifier()
}
# Dictionary with model name and corresponding parameters for hyperparameter tuning
model_parameters={
    '      Logistic Regression' : {'solver':['liblinear','newton-cg','lbfgs'],
                                   'class_weight':['balanced',{ 0:0.16, 1:0.84 }],
                                   'C': [0.01, 0.1, 1]},
    '     KNeighborsClassifier' : {'n_neighbors':[35,40,45,50],
                                   'weights':['distance','uniform'],
                                   'metric':['euclidean','minkowski']},
    '           SVC Polynomial' : {'kernel':['poly'],
                                   'degree':[4,5,6],
                                   'C': [0.1, 0.5, 1],
                                   'class_weight':['balanced',None]},
    ' Decision Tree Classifier' : {'max_depth': [9,10,12,15],
                                   'criterion': ["gini", "entropy"],
                                   'max_features' : ['log2','sqrt']},
    ' Random Forest Classifier' : {'max_depth': [9,10,12,15],
                                   'max_features': ['sqrt','log2'],
                                   'min_samples_leaf': [1,2,4,5],
                                   'criterion': ["gini", "entropy"],
                                   'n_estimators': [100,150,200]},
    '      XG Boost Classifier' : { "learning_rate"    : [ 0.15, 0.20, 0.25, 0.30 ],
                                   "max_depth"        : [9,10, 12,15],
                                   "min_child_weight" : [ 1, 3, 5, 7,9 ],
                                   "gamma"            : [ 0.1, 0.2 , 0.3]}
}
# Dataframe to display the various scores for each model
metrices= pd.DataFrame(columns = ["Confusion_matrix", "Accuracy", "F1_score","Recall","Precision","ROC_AUC"],
              index=['      Logistic Regression','     KNeighborsClassifier','           SVC Polynomial',
                    ' Decision Tree Classifier',' Random Forest Classifier','      XG Boost Classifier'])
model_keys=['      Logistic Regression','     KNeighborsClassifier','           SVC Polynomial',
                    ' Decision Tree Classifier',' Random Forest Classifier','      XG Boost Classifier']


#### Randomised Search based hyperparameter tuning with stratified cross validation

In [19]:
def random_search_cv_best(model_name,model,params):
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=10)
    random_search=RandomizedSearchCV(model,param_distributions=params,n_iter=15,scoring='f1_weighted',cv=cv,verbose=1)
    print(model_name)
    random_search.fit(X_train, y_train)
#     print('Best Score: %s' % random_search.best_score_)
    print('Best Hyperparameters: %s\n' % random_search.best_params_)
    prediction = random_search.predict(X_test)
    con_matrix = confusion_matrix(y_test, prediction)
    metrices.at[model_name,"Confusion_matrix"]=con_matrix
    acc = accuracy_score(y_test, prediction)
    metrices.at[model_name,"Accuracy"]=acc
    f1 = f1_score(y_test, prediction)
    metrices.at[model_name,"F1_score"]=f1
    recal = recall_score(y_test, prediction)
    metrices.at[model_name,"Recall"]=recal
    prec = precision_score(y_test, prediction)
    metrices.at[model_name,"Precision"]=prec
    roc = roc_auc_score(y_test, prediction)
    metrices.at[model_name,"ROC_AUC"]=roc

In [20]:
for i in model_keys:
    model=models.get(i)
    params=model_parameters.get(i)
    random_search_cv_best(i,model,params)
metrices

      Logistic Regression
Fitting 15 folds for each of 15 candidates, totalling 225 fits
Best Hyperparameters: {'solver': 'lbfgs', 'class_weight': 'balanced', 'C': 1}

     KNeighborsClassifier
Fitting 15 folds for each of 15 candidates, totalling 225 fits
Best Hyperparameters: {'weights': 'distance', 'n_neighbors': 35, 'metric': 'euclidean'}

           SVC Polynomial
Fitting 15 folds for each of 15 candidates, totalling 225 fits
Best Hyperparameters: {'kernel': 'poly', 'degree': 5, 'class_weight': 'balanced', 'C': 1}

 Decision Tree Classifier
Fitting 15 folds for each of 15 candidates, totalling 225 fits
Best Hyperparameters: {'max_features': 'log2', 'max_depth': 15, 'criterion': 'gini'}

 Random Forest Classifier
Fitting 15 folds for each of 15 candidates, totalling 225 fits
Best Hyperparameters: {'n_estimators': 200, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 15, 'criterion': 'gini'}

      XG Boost Classifier
Fitting 15 folds for each of 15 candidates, totalling 

Unnamed: 0,Confusion_matrix,Accuracy,F1_score,Recall,Precision,ROC_AUC
Logistic Regression,"[[786, 324], [76, 137]]",0.697657,0.406528,0.643192,0.29718,0.67565
KNeighborsClassifier,"[[1107, 3], [16, 197]]",0.985639,0.953995,0.924883,0.985,0.96109
SVC Polynomial,"[[1019, 91], [32, 181]]",0.907029,0.746392,0.849765,0.665441,0.883892
Decision Tree Classifier,"[[1086, 24], [60, 153]]",0.936508,0.784615,0.71831,0.864407,0.848344
Random Forest Classifier,"[[1104, 6], [33, 180]]",0.970522,0.902256,0.84507,0.967742,0.919833
XG Boost Classifier,"[[1104, 6], [16, 197]]",0.983371,0.947115,0.924883,0.970443,0.959739


Since our dataset is an unbalanced one we will focus on Recall and F1_score to evaluate and decide on the model.

As per the Recall and F1_score we have KNeighborsClassifier as the best performing model with 0.92 Recall and 0.95 F1_score