In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
# Read the HR attrition CSV (path is relative to the working directory) and preview the first rows
df=pd.read_csv('HR_attrition_dataset.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [3]:
# List the column names of the loaded frame
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [4]:
# Remove the identifier / bookkeeping columns before any analysis
df = df.drop(columns=['EmployeeCount', 'EmployeeID', 'Over18', 'StandardHours'])

In [5]:
# Count missing values per column
df.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
PercentSalaryHike           0
StockOptionLevel            0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [6]:
# Fill the missing entries of each listed column with that column's most frequent value
mode_fill_values = {
    'NumCompaniesWorked': df['NumCompaniesWorked'].mode()[0],
    'TotalWorkingYears': df['TotalWorkingYears'].mode()[0],
}
df = df.fillna(mode_fill_values)

In [7]:
# Print the count of distinct values for every column of df
for col in df.columns:
    print(col, " :", df[col].nunique())

# Columns whose actual distinct values are printed as well
ll = [
    'Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
    'EducationField',
    'JobRole', 'MaritalStatus', 'MonthlyIncome',
    'NumCompaniesWorked', 'PercentSalaryHike',
    'TotalWorkingYears', 'TrainingTimesLastYear',
    'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
]
for col in ll:
    print(col, " :", df[col].unique())

Age  : 43
Attrition  : 2
BusinessTravel  : 3
Department  : 3
DistanceFromHome  : 29
Education  : 5
EducationField  : 6
Gender  : 2
JobLevel  : 5
JobRole  : 9
MaritalStatus  : 3
MonthlyIncome  : 1349
NumCompaniesWorked  : 10
PercentSalaryHike  : 15
StockOptionLevel  : 4
TotalWorkingYears  : 40
TrainingTimesLastYear  : 7
YearsAtCompany  : 37
YearsSinceLastPromotion  : 16
YearsWithCurrManager  : 18
Age  : [51 31 32 38 46 28 29 25 45 36 55 47 37 21 35 26 50 53 42 44 49 18 41 39
 58 33 43 52 27 30 54 40 23 48 57 34 24 22 56 60 19 20 59]
BusinessTravel  : ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
Department  : ['Sales' 'Research & Development' 'Human Resources']
DistanceFromHome  : [ 6 10 17  2  8 11 18  1  7 28 14  3  4 16  9  5 20 29 15 13 24 19 22 25
 21 26 27 12 23]
EducationField  : ['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
JobRole  : ['Healthcare Representative' 'Research Scientist' 'Sales Executive'
 'Human Resources' 'Research Dire

In [8]:
outlier_lst=['NumCompaniesWorked','YearsSinceLastPromotion','TotalWorkingYears','YearsAtCompany']
# Handling outliers by interquartile rule based capping:
# every value above Q3 + 1.5*IQR is replaced by that upper limit.
for i in outlier_lst:
    Q1 = df[i].quantile(0.25)
    Q3 = df[i].quantile(0.75)
    IQR = Q3 - Q1
    u_lim=(Q3+1.5*IQR)
    # Assign the capped column back explicitly. Calling .where(..., inplace=True)
    # on the df[i] selection is a chained in-place update: it emits warnings on
    # recent pandas and leaves df untouched under Copy-on-Write.
    capped = df[i].where(df[i] <= u_lim, u_lim)
    # Cast to int as before (a fractional upper limit is truncated by the cast).
    df[i] = capped.astype('int')

In [9]:
# Keep an independent copy of the frame at this stage; later cells re-encode df and then restore it from df1
df1=df.copy()

In [10]:
# Print the count of distinct values for every column of df1 (after outlier capping)
for col in df1.columns:
    print(col, " :", df1[col].nunique())

# Columns whose actual distinct values are printed as well
ll = [
    'Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
    'EducationField',
    'JobRole', 'MaritalStatus', 'MonthlyIncome',
    'NumCompaniesWorked', 'PercentSalaryHike',
    'TotalWorkingYears', 'TrainingTimesLastYear',
    'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
]
for col in ll:
    print(col, " :", df1[col].unique())

Age  : 43
Attrition  : 2
BusinessTravel  : 3
Department  : 3
DistanceFromHome  : 29
Education  : 5
EducationField  : 6
Gender  : 2
JobLevel  : 5
JobRole  : 9
MaritalStatus  : 3
MonthlyIncome  : 1349
NumCompaniesWorked  : 9
PercentSalaryHike  : 15
StockOptionLevel  : 4
TotalWorkingYears  : 29
TrainingTimesLastYear  : 7
YearsAtCompany  : 19
YearsSinceLastPromotion  : 8
YearsWithCurrManager  : 18
Age  : [51 31 32 38 46 28 29 25 45 36 55 47 37 21 35 26 50 53 42 44 49 18 41 39
 58 33 43 52 27 30 54 40 23 48 57 34 24 22 56 60 19 20 59]
BusinessTravel  : ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
Department  : ['Sales' 'Research & Development' 'Human Resources']
DistanceFromHome  : [ 6 10 17  2  8 11 18  1  7 28 14  3  4 16  9  5 20 29 15 13 24 19 22 25
 21 26 27 12 23]
EducationField  : ['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
JobRole  : ['Healthcare Representative' 'Research Scientist' 'Sales Executive'
 'Human Resources' 'Research Direct

In [11]:
# Turn the Yes/No target into 1/0 and show the class balance
attrition_codes = {'Yes': 1, 'No': 0}
df['Attrition'] = df['Attrition'].map(attrition_codes)
df['Attrition'].value_counts()

0    3699
1     711
Name: Attrition, dtype: int64

In [12]:
# Bucket Age into five right-closed bands, stored as integer codes 0-4
age_bin_edges = [17, 25, 35, 45, 55, 64]
age_bin_codes = [0, 1, 2, 3, 4]
df['Age'] = pd.cut(df['Age'], bins=age_bin_edges, labels=age_bin_codes).astype('int')
df['Age'].value_counts()

1    1818
2    1404
3     678
0     369
4     141
Name: Age, dtype: int64

In [13]:
#Function to categorise income into slabs
def categorise_income(row):
    """Return the integer income-slab code for a row's 'MonthlyIncome'.

    Slabs are contiguous, right-closed intervals of width 25,000:
    (10000, 25000] -> 0, (25000, 50000] -> 1, ..., (175000, 200000] -> 7.

    Parameters
    ----------
    row : mapping
        Anything indexable by the key 'MonthlyIncome' (e.g. a DataFrame row).

    Returns
    -------
    int or None
        The slab code, or None when the income is not in (10000, 200000]
        (this includes NaN, for which every comparison is False).
    """
    income = row['MonthlyIncome']
    # Values outside the overall range have no slab.
    if not (income > 10000 and income <= 200000):
        return None
    # Upper edge of each slab; position in the list is the slab code.
    # Using one shared edge between neighbours leaves no gap, so incomes such
    # as 25001 or 50001 land in their own slab rather than in the top one.
    slab_upper_edges = [25000, 50000, 75000, 100000, 125000, 150000, 175000, 200000]
    for slab_code, upper_edge in enumerate(slab_upper_edges):
        if income <= upper_edge:
            return slab_code
    return None
# Replace the raw income by its integer slab code, row by row, and show slab frequencies
df['MonthlyIncome'] = df.apply(categorise_income, axis=1).astype('int')
df['MonthlyIncome'].value_counts()

1    1569
2     930
0     678
3     390
4     273
7     243
5     171
6     156
Name: MonthlyIncome, dtype: int64

In [14]:
# Encode travel frequency as 0 (none) < 1 (rarely) < 2 (frequently)
travel_codes = {'Travel_Frequently': 2, 'Travel_Rarely': 1, 'Non-Travel': 0}
df['BusinessTravel'] = df['BusinessTravel'].map(travel_codes)
df['BusinessTravel'].value_counts()

1    3129
2     831
0     450
Name: BusinessTravel, dtype: int64

In [15]:
# Binary flag: 1 where Gender is "Male", 0 for every other value
df['Gender'] = np.where(df['Gender'].eq("Male"), 1, 0)

In [16]:
# Integer-code each listed categorical column; codes follow the order in which
# the categories first appear in the column
to_encode_list = ['Department', 'EducationField', 'JobRole', 'MaritalStatus']
for col in to_encode_list:
    ordinal_label = {category: code for code, category in enumerate(df[col].unique())}
    print(col, ordinal_label)
    df[col] = df[col].map(ordinal_label)

Department {'Sales': 0, 'Research & Development': 1, 'Human Resources': 2}
EducationField {'Life Sciences': 0, 'Other': 1, 'Medical': 2, 'Marketing': 3, 'Technical Degree': 4, 'Human Resources': 5}
JobRole {'Healthcare Representative': 0, 'Research Scientist': 1, 'Sales Executive': 2, 'Human Resources': 3, 'Research Director': 4, 'Laboratory Technician': 5, 'Manufacturing Director': 6, 'Sales Representative': 7, 'Manager': 8}
MaritalStatus {'Married': 0, 'Single': 1, 'Divorced': 2}


In [17]:
# Show the dtype of every column after the encoding steps above
df.dtypes

Age                        int32
Attrition                  int64
BusinessTravel             int64
Department                 int64
DistanceFromHome           int64
Education                  int64
EducationField             int64
Gender                     int32
JobLevel                   int64
JobRole                    int64
MaritalStatus              int64
MonthlyIncome              int32
NumCompaniesWorked         int32
PercentSalaryHike          int64
StockOptionLevel           int64
TotalWorkingYears          int32
TrainingTimesLastYear      int64
YearsAtCompany             int32
YearsSinceLastPromotion    int32
YearsWithCurrManager       int64
dtype: object

In [18]:
# Feature matrix: every column except the target
X = df.drop(columns=['Attrition'])
X.columns

Index(['Age', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education',
       'EducationField', 'Gender', 'JobLevel', 'JobRole', 'MaritalStatus',
       'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [19]:
# Target vector: the Attrition column of df
y=df['Attrition']
y

0       0
1       1
2       0
3       0
4       0
       ..
4405    0
4406    0
4407    0
4408    0
4409    0
Name: Attrition, Length: 4410, dtype: int64

In [20]:
# Stratified 70/30 train/test split with a fixed seed; print the class counts of each part
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.3,
    random_state=42,
    stratify=y,
)
print(y_test.value_counts())
print(y_train.value_counts())

0    1110
1     213
Name: Attrition, dtype: int64
0    2589
1     498
Name: Attrition, dtype: int64


### Feature Selection

#### 1) Mutual Info/ Info Gain

In [21]:
# Mutual information between each training feature and the target.
# The estimator is randomised, so the seed is fixed to keep the scores
# (and therefore the ranking) identical across runs.
mutual_info = mutual_info_classif(X_train, y_train, random_state=42)
mutual_info = pd.Series(mutual_info)
mutual_info.index = X_train.columns
# Highest score first
mutual_info.sort_values(ascending=False)

YearsWithCurrManager       0.034726
YearsAtCompany             0.031827
TotalWorkingYears          0.027809
Age                        0.023980
MaritalStatus              0.023587
BusinessTravel             0.015916
JobLevel                   0.006505
NumCompaniesWorked         0.006239
Education                  0.004440
TrainingTimesLastYear      0.004315
DistanceFromHome           0.004212
StockOptionLevel           0.003452
PercentSalaryHike          0.001514
EducationField             0.000000
Gender                     0.000000
JobRole                    0.000000
MonthlyIncome              0.000000
Department                 0.000000
YearsSinceLastPromotion    0.000000
dtype: float64

#### 2) Chi Square Test

In [22]:
# chi2 returns two arrays, one entry per feature: the chi-squared statistics and the p-values
f_p_values=chi2(X_train,y_train)
p_values=pd.Series(f_p_values[1])  # second array: p-values
f_values=pd.Series(f_p_values[0])  # first array: chi-squared statistics (not F-values, despite the variable name)
p_values.index=X_train.columns
# Largest p-value first
p_values.sort_values(ascending=False)

EducationField              8.658451e-01
Education                   7.077503e-01
DistanceFromHome            5.739088e-01
JobLevel                    5.173200e-01
Gender                      4.020615e-01
StockOptionLevel            2.527098e-01
MaritalStatus               2.523808e-01
MonthlyIncome               2.224995e-01
PercentSalaryHike           1.870233e-01
TrainingTimesLastYear       1.168798e-01
Department                  7.628477e-02
JobRole                     1.137873e-02
NumCompaniesWorked          9.970815e-03
BusinessTravel              9.711806e-04
YearsSinceLastPromotion     5.107923e-06
Age                         1.562463e-11
YearsWithCurrManager        1.671102e-57
YearsAtCompany              7.879549e-78
TotalWorkingYears          9.992534e-113
dtype: float64

In [23]:
# Label the chi-squared statistics with the feature names and list them, largest first
f_values.index=X_train.columns
f_values.sort_values(ascending=False)

TotalWorkingYears          509.092433
YearsAtCompany             348.763064
YearsWithCurrManager       255.465312
Age                         45.454248
YearsSinceLastPromotion     20.796387
BusinessTravel              10.881719
NumCompaniesWorked           6.640103
JobRole                      6.405190
Department                   3.142330
TrainingTimesLastYear        2.458642
PercentSalaryHike            1.740912
MonthlyIncome                1.488177
MaritalStatus                1.310076
StockOptionLevel             1.308261
Gender                       0.702152
JobLevel                     0.419233
DistanceFromHome             0.316186
Education                    0.140534
EducationField               0.028540
dtype: float64

#### 3) ExtraTreesClassifier

In [24]:
# Impurity-based feature importances from an extremely-randomised trees ensemble.
# The seed is fixed because the ensemble is randomised; without it the
# importances differ on every run.
etc=ExtraTreesClassifier(random_state=42)
etc.fit(X_train,y_train)
ranked_features=pd.Series(etc.feature_importances_,index=X_train.columns)
# Most important feature first
ranked_features.sort_values(ascending=False)

TotalWorkingYears          0.073848
YearsAtCompany             0.069320
Age                        0.062020
YearsWithCurrManager       0.060828
PercentSalaryHike          0.059413
DistanceFromHome           0.057874
MonthlyIncome              0.056114
JobRole                    0.055828
TrainingTimesLastYear      0.054482
NumCompaniesWorked         0.053356
EducationField             0.050368
JobLevel                   0.048170
Education                  0.047863
MaritalStatus              0.047541
YearsSinceLastPromotion    0.047306
StockOptionLevel           0.047271
BusinessTravel             0.040800
Department                 0.036384
Gender                     0.031213
dtype: float64

#### Overall features that can be eliminated are
Gender,
StockOptionLevel,
JobLevel,
Education

In [25]:
# Restart from the df1 snapshot (taken before the feature-selection encodings were applied to df)
df=df1.copy()
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole',
       'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [26]:
# Remove the four features chosen for elimination
df = df.drop(columns=['Education', 'Gender', 'JobLevel', 'StockOptionLevel'])

### Feature Engineering

In [27]:
# Bucket Age into five right-closed bands with readable labels, stored as plain objects
age_bin_edges = [17, 25, 35, 45, 55, 64]
age_bin_labels = ['25 & Below', '26-35', '36-45', '46-55', '56 & Above']
df['Age'] = pd.cut(df['Age'], bins=age_bin_edges, labels=age_bin_labels).astype('object')
df['Age'].value_counts()

26-35         1818
36-45         1404
46-55          678
25 & Below     369
56 & Above     141
Name: Age, dtype: int64

In [28]:
#Function to categorise income into slabs
def categorise_income(row):
    """Return the income-slab label for a row's 'MonthlyIncome'.

    Slabs are contiguous, right-closed intervals of width 25,000:
    (10000, 25000] -> '10k-25k', (25000, 50000] -> '25k-50k', ...,
    (175000, 200000] -> '1.75L-2L'.

    Parameters
    ----------
    row : mapping
        Anything indexable by the key 'MonthlyIncome' (e.g. a DataFrame row).

    Returns
    -------
    str or None
        The slab label, or None when the income is not in (10000, 200000]
        (this includes NaN, for which every comparison is False).
    """
    income = row['MonthlyIncome']
    # Values outside the overall range have no slab.
    if not (income > 10000 and income <= 200000):
        return None
    # (upper edge, label) per slab, in ascending order. Sharing one edge
    # between neighbours leaves no gap, so incomes such as 25001 or 50001 get
    # their own slab rather than falling through to the top one.
    slabs = [
        (25000, '10k-25k'),
        (50000, '25k-50k'),
        (75000, '50k-75k'),
        (100000, '75k-1L'),
        (125000, '1L-1.25L'),
        (150000, '1.25L-1.5L'),
        (175000, '1.5L-1.75L'),
        (200000, '1.75L-2L'),
    ]
    for upper_edge, label in slabs:
        if income <= upper_edge:
            return label
    return None
# Replace the raw income by its slab label, row by row, and show slab frequencies
income_slabs = df.apply(categorise_income, axis=1)
df['MonthlyIncome'] = income_slabs
df['MonthlyIncome'].value_counts()

25k-50k       1569
50k-75k        930
10k-25k        678
75k-1L         390
1L-1.25L       273
1.75L-2L       243
1.25L-1.5L     171
1.5L-1.75L     156
Name: MonthlyIncome, dtype: int64

In [29]:
# Show the dtype of every column after binning Age and MonthlyIncome
df.dtypes

Age                        object
Attrition                  object
BusinessTravel             object
Department                 object
DistanceFromHome            int64
EducationField             object
JobRole                    object
MaritalStatus              object
MonthlyIncome              object
NumCompaniesWorked          int32
PercentSalaryHike           int64
TotalWorkingYears           int32
TrainingTimesLastYear       int64
YearsAtCompany              int32
YearsSinceLastPromotion     int32
YearsWithCurrManager        int64
dtype: object

### Feature and target Encoding

In [30]:
# Encoding target variable: Yes -> 1, No -> 0, then show the class balance
attrition_codes = {'Yes': 1, 'No': 0}
df['Attrition'] = df['Attrition'].map(attrition_codes)
df['Attrition'].value_counts()

0    3699
1     711
Name: Attrition, dtype: int64

In [31]:
# Label encoding BusinessTravel feature: 0 (none) < 1 (rarely) < 2 (frequently)
travel_codes = {'Travel_Frequently': 2, 'Travel_Rarely': 1, 'Non-Travel': 0}
df['BusinessTravel'] = df['BusinessTravel'].map(travel_codes)
df['BusinessTravel'].value_counts()

1    3129
2     831
0     450
Name: BusinessTravel, dtype: int64

In [32]:
# Feature and Target separation for train test split
from sklearn.model_selection import train_test_split
y = df['Attrition']
X = df.drop(columns=['Attrition'])
# Stratified split with a fixed seed; 30% of the rows are held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.3,
    random_state=42,
    stratify=y,
)
print(y_test.value_counts())
print(y_train.value_counts())

0    1110
1     213
Name: Attrition, dtype: int64
0    2589
1     498
Name: Attrition, dtype: int64


In [33]:
# One hot encoding remaining categorical features
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
# The listed columns are one-hot encoded (first category dropped); every other
# column is passed through unchanged.
# sparse_threshold=0 forces a dense array: with the default threshold the
# transformer may return a scipy sparse matrix when the output is mostly zeros,
# and that cannot be wrapped by pd.DataFrame(..., columns=...) below.
ohe = make_column_transformer(
    (OneHotEncoder(drop='first'), ['Age','MonthlyIncome','Department','JobRole','MaritalStatus','EducationField']),
    remainder='passthrough',
    sparse_threshold=0)
# Fit on the training rows only, then apply the same fitted encoding to the test rows.
ohencoded_X_train = ohe.fit_transform(X_train)
ohe_feature_names = ohe.get_feature_names_out()
X_train = pd.DataFrame(ohencoded_X_train, columns=ohe_feature_names)
ohencoded_X_test=ohe.transform(X_test)
X_test = pd.DataFrame(ohencoded_X_test, columns=ohe_feature_names)
# print(X_train.head())
# print(X_test.head())

### Feature Scaling/ Standardisation

In [34]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling on the training features only, then apply that same
# scaling to the test features; both results keep the encoded column names.
ss_scale = StandardScaler()
X_train_std = pd.DataFrame(ss_scale.fit_transform(X_train), columns=X_train.columns)
X_test_std = pd.DataFrame(ss_scale.transform(X_test), columns=X_test.columns)
X_train_std.head()

Unnamed: 0,onehotencoder__Age_26-35,onehotencoder__Age_36-45,onehotencoder__Age_46-55,onehotencoder__Age_56 & Above,onehotencoder__MonthlyIncome_1.5L-1.75L,onehotencoder__MonthlyIncome_1.75L-2L,onehotencoder__MonthlyIncome_10k-25k,onehotencoder__MonthlyIncome_1L-1.25L,onehotencoder__MonthlyIncome_25k-50k,onehotencoder__MonthlyIncome_50k-75k,...,onehotencoder__EducationField_Technical Degree,remainder__BusinessTravel,remainder__DistanceFromHome,remainder__NumCompaniesWorked,remainder__PercentSalaryHike,remainder__TotalWorkingYears,remainder__TrainingTimesLastYear,remainder__YearsAtCompany,remainder__YearsSinceLastPromotion,remainder__YearsWithCurrManager
0,1.209017,-0.694773,-0.420057,-0.178189,-0.185789,-0.245145,-0.429621,-0.259668,1.341168,-0.509398,...,-0.318308,1.698474,0.089003,-0.68864,-0.345294,-0.423678,0.155813,0.293523,0.480121,-1.143162
1,-0.827118,1.43932,-0.420057,-0.178189,-0.185789,-0.245145,-0.429621,-0.259668,1.341168,-0.509398,...,-0.318308,-0.159495,2.04802,-1.103761,-0.345294,-0.285393,1.71749,0.293523,-0.360329,0.828628
2,-0.827118,1.43932,-0.420057,-0.178189,-0.185789,-0.245145,-0.429621,-0.259668,1.341168,-0.509398,...,3.141611,-0.159495,-1.012944,2.217204,0.466322,1.235734,0.155813,2.115044,-0.780554,3.082104
3,1.209017,-0.694773,-0.420057,-0.178189,-0.185789,-0.245145,-0.429621,-0.259668,1.341168,-0.509398,...,-0.318308,-0.159495,-0.033436,-1.103761,0.73686,-0.147109,-0.625025,0.495914,1.740795,0.828628
4,1.209017,-0.694773,-0.420057,-0.178189,-0.185789,-0.245145,-0.429621,-0.259668,-0.745619,-0.509398,...,-0.318308,-0.159495,0.456319,-0.27352,0.195783,-0.700246,-0.625025,-0.516042,-0.360329,-0.579794
