In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the complete attrition data set from the CSV and preview the first five rows.
empdata = pd.read_csv(filepath_or_buffer="Employee-Attrition.csv")
empdata.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# Dimensions of the loaded frame as (rows, columns).
empdata.shape

(1470, 35)

In [4]:
# Per-column summary: non-null counts and dtypes.
empdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [5]:
# Print the distinct values found in every column, one column per entry.
for column_name in empdata.columns:
    distinct_values = empdata[column_name].unique()
    print(column_name, '===>', distinct_values)

Age ===> [41 49 37 33 27 32 59 30 38 36 35 29 31 34 28 22 53 24 21 42 44 46 39 43
 50 26 48 55 45 56 23 51 40 54 58 20 25 19 57 52 47 18 60]
Attrition ===> ['Yes' 'No']
BusinessTravel ===> ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
DailyRate ===> [1102  279 1373 1392  591 1005 1324 1358  216 1299  809  153  670 1346
  103 1389  334 1123 1219  371  673 1218  419  391  699 1282 1125  691
  477  705  924 1459  125  895  813 1273  869  890  852 1141  464 1240
 1357  994  721 1360 1065  408 1211 1229  626 1434 1488 1097 1443  515
  853 1142  655 1115  427  653  989 1435 1223  836 1195 1339  664  318
 1225 1328 1082  548  132  746  776  193  397  945 1214  111  573 1153
 1400  541  432  288  669  530  632 1334  638 1093 1217 1353  120  682
  489  807  827  871  665 1040 1420  240 1280  534 1456  658  142 1127
 1031 1189 1354 1467  922  394 1312  750  441  684  249  841  147  528
  594  470  957  542  802 1355 1150 1329  959 1033 1316  364  438  689
  201 1427  857  933 1181 1395  662

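We can drop the `Over18`, `EmployeeNumber`, `EmployeeCount` and `StandardHours` columns: each is either constant (a single distinct value, far fewer than the other columns) or just a row identifier, so none of them carries useful information about attrition.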

In [6]:
# Remove the four uninformative columns; `columns=` is the keyword form of `axis=1`.
empdata = empdata.drop(
    columns=['Over18', 'EmployeeNumber', 'EmployeeCount', 'StandardHours']
)

### Feature Engineering

Feature engineering is the technique of combining or transforming existing features into a new, more informative feature.

In [7]:
# Column labels currently present in the data set.
empdata.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

`EnvironmentSatisfaction`, `JobSatisfaction`, `JobInvolvement` and `RelationshipSatisfaction` are all satisfaction-related ratings, so we can combine them (together with `WorkLifeBalance`) into new aggregate features.

In [8]:
# Preview the rating-style columns side by side; their min/max values are
# printed individually in the following cells.
empdata.loc[:, [
    'EnvironmentSatisfaction',
    'JobSatisfaction',
    'JobInvolvement',
    'RelationshipSatisfaction',
    'WorkLifeBalance',
]]

Unnamed: 0,EnvironmentSatisfaction,JobSatisfaction,JobInvolvement,RelationshipSatisfaction,WorkLifeBalance
0,2,4,3,1,1
1,3,2,2,4,3
2,4,3,2,2,3
3,4,3,3,3,3
4,1,2,3,4,3
...,...,...,...,...,...
1465,3,4,4,3,3
1466,4,1,2,1,3
1467,2,2,4,2,3
1468,4,2,2,4,2


In [9]:
# Smallest and largest EnvironmentSatisfaction rating, printed as "min max".
print(*empdata['EnvironmentSatisfaction'].agg(['min', 'max']))

1 4


In [10]:
# Smallest and largest RelationshipSatisfaction rating, printed as "min max".
print(*empdata['RelationshipSatisfaction'].agg(['min', 'max']))

1 4


In [11]:
# Smallest and largest WorkLifeBalance rating, printed as "min max".
print(*empdata['WorkLifeBalance'].agg(['min', 'max']))

1 4


In [12]:
# Average the three ratings row by row into one score per employee
# (skipna=False keeps the plain-addition behaviour for missing values),
# then display the overall mean of that score.
empdata['Mean_of_Satisfaction'] = empdata[
    ['EnvironmentSatisfaction', 'RelationshipSatisfaction', 'WorkLifeBalance']
].sum(axis=1, skipna=False) / 3
empdata['Mean_of_Satisfaction'].mean()

2.731746031746032

In [13]:
def employee_satisfaction(empdata, threshold=2.73):
    """Flag a single employee row as satisfied (1) or not satisfied (0).

    Parameters
    ----------
    empdata : mapping or pandas.Series
        One row of the data set; only its ``'Mean_of_Satisfaction'`` entry
        is read. The parameter name is kept for backward compatibility even
        though it shadows the notebook-level frame of the same name.
    threshold : float, default 2.73
        Cut-off score. The default corresponds to the overall mean of
        ``Mean_of_Satisfaction`` printed in the notebook (about 2.73).

    Returns
    -------
    int
        1 when the score is strictly greater than ``threshold``, else 0.
    """
    # Strict comparison: a score exactly equal to the threshold is "not satisfied".
    return 1 if empdata['Mean_of_Satisfaction'] > threshold else 0
# Evaluate the helper once per row (axis=1) and show the resulting 0/1 column.
empdata['employee_satisfaction'] = empdata.apply(employee_satisfaction, axis=1)
empdata['employee_satisfaction']

0       0
1       1
2       1
3       1
4       0
       ..
1465    1
1466    0
1467    0
1468    1
1469    0
Name: employee_satisfaction, Length: 1470, dtype: int64

In [14]:
# Smallest and largest JobSatisfaction rating, printed as "min max".
print(*empdata['JobSatisfaction'].agg(['min', 'max']))

1 4


In [15]:
# Smallest and largest JobInvolvement rating, printed as "min max".
print(*empdata['JobInvolvement'].agg(['min', 'max']))

1 4


In [16]:
# Average the two job-related ratings row by row (skipna=False keeps the
# plain-addition behaviour for missing values), then display the overall mean.
empdata['Mean_of_JobSatisfaction'] = empdata[
    ['JobSatisfaction', 'JobInvolvement']
].sum(axis=1, skipna=False) / 2
empdata['Mean_of_JobSatisfaction'].mean()

2.729251700680272

In [17]:
def Job_Satisfaction(empdata, threshold=2.7):
    """Flag a single employee row as job-satisfied (1) or not (0).

    Parameters
    ----------
    empdata : mapping or pandas.Series
        One row of the data set; only its ``'Mean_of_JobSatisfaction'``
        entry is read. The parameter name is kept for backward
        compatibility even though it shadows the notebook-level frame.
    threshold : float, default 2.7
        Cut-off score. The default sits just below the overall mean of
        ``Mean_of_JobSatisfaction`` printed in the notebook (about 2.73).

    Returns
    -------
    int
        1 when the score is strictly greater than ``threshold``, else 0.
    """
    # Strict comparison: a score exactly equal to the threshold yields 0.
    return 1 if empdata['Mean_of_JobSatisfaction'] > threshold else 0
    
# Evaluate the helper once per row (axis=1) and show the resulting 0/1 column.
empdata['Job_Satisfaction'] = empdata.apply(Job_Satisfaction, axis=1)
empdata['Job_Satisfaction']

0       1
1       0
2       0
3       1
4       0
       ..
1465    1
1466    0
1467    1
1468    0
1469    1
Name: Job_Satisfaction, Length: 1470, dtype: int64

In [18]:
def freshers(empdata, min_age=21, max_age=30, max_income=25000):
    """Flag a single employee row as a "fresher" (1) or not (0).

    A row is flagged when its age lies strictly between ``min_age`` and
    ``max_age`` and its monthly income is strictly below ``max_income``.

    Parameters
    ----------
    empdata : mapping or pandas.Series
        One row of the data set; reads ``'Age'`` and, only when the age
        test passes, ``'MonthlyIncome'``. The parameter name is kept for
        backward compatibility even though it shadows the notebook-level
        frame of the same name.
    min_age : int, default 21
        Exclusive lower age bound.
    max_age : int, default 30
        Exclusive upper age bound.
    max_income : int, default 25000
        Exclusive upper bound on ``'MonthlyIncome'``.
        NOTE(review): confirm this cap lies inside the observed income
        range; if every income is below it, the condition filters nothing.

    Returns
    -------
    int
        1 when all conditions hold, otherwise 0.
    """
    in_age_band = min_age < empdata['Age'] < max_age
    # `and` short-circuits, so the income is not looked up for rows
    # that already fail the age test.
    return 1 if in_age_band and empdata['MonthlyIncome'] < max_income else 0

# Evaluate the helper once per row (axis=1) and show the resulting 0/1 column.
empdata['freshers'] = empdata.apply(freshers, axis=1)
empdata['freshers']

0       0
1       0
2       0
3       0
4       1
       ..
1465    0
1466    0
1467    1
1468    0
1469    0
Name: freshers, Length: 1470, dtype: int64

In [19]:
def frequent_transition(empdata, max_companies=3):
    """Flag a single employee row as a frequent job changer (1) or not (0).

    Parameters
    ----------
    empdata : mapping or pandas.Series
        One row of the data set; only its ``'NumCompaniesWorked'`` entry is
        read. The parameter name is kept for backward compatibility even
        though it shadows the notebook-level frame of the same name.
    max_companies : int, default 3
        Largest number of previous companies that is still considered
        "not frequent".

    Returns
    -------
    int
        1 when ``'NumCompaniesWorked'`` is strictly greater than
        ``max_companies``, otherwise 0.
    """
    return 1 if empdata['NumCompaniesWorked'] > max_companies else 0
    
# Evaluate the helper once per row (axis=1) and show the resulting 0/1 column.
empdata['frequent_transition'] = empdata.apply(frequent_transition, axis=1)
empdata['frequent_transition']

0       1
1       0
2       1
3       0
4       1
       ..
1465    1
1466    1
1467    0
1468    0
1469    0
Name: frequent_transition, Length: 1470, dtype: int64

In [20]:
# Preview the first rows, now including the engineered columns.
empdata.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Mean_of_Satisfaction,employee_satisfaction,Mean_of_JobSatisfaction,Job_Satisfaction,freshers,frequent_transition
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,6,4,0,5,1.333333,0,3.5,1,0,1
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,10,7,1,7,3.333333,1,2.0,0,0,0
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,0,0,0,0,3.0,1,2.5,0,0,1
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,8,7,3,0,3.333333,1,3.0,1,0,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,2,2,2,2,2.666667,0,2.5,0,1,1


In [21]:
# Column labels after feature engineering (original plus derived columns).
empdata.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager',
       'Mean_of_Satisfaction', 'employee_satisfaction',
       'Mean_of_JobSatisfaction', 'Job_Satisfaction', 'freshers',
       'frequent_transition'],
      dtype='object')

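Here I created some new features: `employee_satisfaction` (a flag based on the mean of EnvironmentSatisfaction, RelationshipSatisfaction and WorkLifeBalance), `Job_Satisfaction` (a flag based on the mean of JobSatisfaction and JobInvolvement), `freshers` and `frequent_transition`.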

In [22]:
# Build the modelling frame by removing the raw rating columns and the
# intermediate mean columns that were folded into the 0/1 flags.
# `DataFrame.drop` returns a new frame and leaves `empdata` untouched, so the
# former `empdata.copy()` (immediately overwritten) was a wasted full copy.
# NOTE(review): WorkLifeBalance also feeds Mean_of_Satisfaction but is kept
# here, unlike the other source ratings -- confirm that this is intentional.
new_empdata = empdata.drop(
    columns=[
        'EnvironmentSatisfaction',
        'JobInvolvement',
        'JobSatisfaction',
        'RelationshipSatisfaction',
        'Mean_of_Satisfaction',
        'Mean_of_JobSatisfaction',
    ]
)
new_empdata.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'Gender',
       'HourlyRate', 'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
       'employee_satisfaction', 'Job_Satisfaction', 'freshers',
       'frequent_transition'],
      dtype='object')

In [23]:
# Partition the modelling frame by dtype: numeric columns vs. everything else.
# ('number' is the string alias pandas accepts for np.number.)
empdata_numeric_feature = new_empdata.select_dtypes(include='number')
empdata_categorical_feature = new_empdata.select_dtypes(exclude='number')

# Plain Python lists of the labels in each group. The "nemeric" spelling is
# kept because later cells may reference this variable name.
new_empdata_nemeric_col = list(empdata_numeric_feature.columns)
new_empdata_cat_col = list(empdata_categorical_feature.columns)

# Note: the categorical list includes the target column 'Attrition'.
print("Numerical Features: \n", new_empdata_nemeric_col)
print("\n Categorical Features: ", new_empdata_cat_col)

Numerical Features: 
 ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'HourlyRate', 'JobLevel', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'employee_satisfaction', 'Job_Satisfaction', 'freshers', 'frequent_transition']

 Categorical Features:  ['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']


We need to encode the categorical features, which converts all the categorical columns into numerical ones. After that we can plot a correlation matrix and extract the features that are most relevant to the target variable.