## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from subprocess import check_output

In [2]:
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

In [201]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
# from keras.wrappers.scikit_learn import KerasClassifier
from scikeras.wrappers import KerasClassifier

## Import Dataset

In [4]:
dataset_o = pd.read_csv('HR_Employee_Attrition.csv')

In [5]:
dataset_o.shape

(1470, 35)

In [6]:
dataset_o.head().T

Unnamed: 0,0,1,2,3,4
Age,41,49,37,33,27
Attrition,Yes,No,Yes,No,No
BusinessTravel,Travel_Rarely,Travel_Frequently,Travel_Rarely,Travel_Frequently,Travel_Rarely
DailyRate,1102,279,1373,1392,591
Department,Sales,Research & Development,Research & Development,Research & Development,Research & Development
DistanceFromHome,1,8,2,3,2
Education,2,1,2,4,1
EducationField,Life Sciences,Life Sciences,Other,Life Sciences,Medical
EmployeeCount,1,1,1,1,1
EmployeeNumber,1,2,4,5,7


## Removing Columns

In [7]:
len(dataset_o.columns)

35

In [8]:
dataset_o.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [9]:
remove_cols = ['EmployeeNumber', 'Over18', 'StandardHours']
keep_cols = list(filter(lambda x: (x not in remove_cols), dataset_o.columns))
len(keep_cols)

32

In [10]:
print( keep_cols )

['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']


In [86]:
# making new dataset and re-index columns

dataset = dataset_o[keep_cols]
dataset = dataset.reindex(columns=['Attrition', 'Age', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'])

In [87]:
dataset.head().T

Unnamed: 0,0,1,2,3,4
Attrition,Yes,No,Yes,No,No
Age,41,49,37,33,27
BusinessTravel,Travel_Rarely,Travel_Frequently,Travel_Rarely,Travel_Frequently,Travel_Rarely
DailyRate,1102,279,1373,1392,591
Department,Sales,Research & Development,Research & Development,Research & Development,Research & Development
DistanceFromHome,1,8,2,3,2
Education,2,1,2,4,1
EducationField,Life Sciences,Life Sciences,Other,Life Sciences,Medical
EmployeeCount,1,1,1,1,1
EnvironmentSatisfaction,2,3,4,4,1


## Adding few more derived columns

In [88]:
dataset['JobInvolment_On_Salary'] = dataset['JobInvolvement'] / dataset['MonthlyIncome'] * 1000

In [89]:
dataset['MarriedAndBad_Worklife_Balance'] = np.where(
    dataset['MaritalStatus']=='Married', 
    dataset['WorkLifeBalance']-2, 
    dataset['WorkLifeBalance']+1
)

In [90]:
dataset['DistanceFromHome_rootedTo_JobSatisfaction'] = dataset['DistanceFromHome']**(1/dataset['JobSatisfaction'])

In [91]:
dataset['TotalJobSatisfaction'] = dataset['EnvironmentSatisfaction'] + dataset['JobSatisfaction'] + dataset['RelationshipSatisfaction']

In [92]:
dataset['OldLowEmployeeTendToStay'] = dataset['YearsAtCompany'] / dataset['JobLevel']

In [93]:
dataset['Mothers'] = np.where(
    (dataset['Gender']=='Female') & (dataset['Age']>=36), 
    1,
    0
)

In [94]:
dataset['Rate'] = dataset['DailyRate'] * 20 + dataset['HourlyRate'] * 8 * 20 + dataset['MonthlyRate']

In [95]:
dataset['RateExtended'] = dataset['Rate'] * (8 - dataset['JobSatisfaction'] - dataset['EnvironmentSatisfaction'])

In [97]:
dataset.shape

(1470, 40)

In [96]:
dataset.head(2).T

Unnamed: 0,0,1
Attrition,Yes,No
Age,41,49
BusinessTravel,Travel_Rarely,Travel_Frequently
DailyRate,1102,279
Department,Sales,Research & Development
DistanceFromHome,1,8
Education,2,1
EducationField,Life Sciences,Life Sciences
EmployeeCount,1,1
EnvironmentSatisfaction,2,3


## Making Independent (X) and Dependent (y) Dataset

In [98]:
X = dataset.iloc[:, 1:]
y = dataset.iloc[:, 0]

In [99]:
X.shape

(1470, 39)

In [100]:
y.shape

(1470,)

In [101]:
X.head(2).T

Unnamed: 0,0,1
Age,41,49
BusinessTravel,Travel_Rarely,Travel_Frequently
DailyRate,1102,279
Department,Sales,Research & Development
DistanceFromHome,1,8
Education,2,1
EducationField,Life Sciences,Life Sciences
EmployeeCount,1,1
EnvironmentSatisfaction,2,3
Gender,Female,Male


In [103]:
y.head()

0    Yes
1     No
2    Yes
3     No
4     No
Name: Attrition, dtype: object

## LabelEncoder Categorical Columns

In [123]:
categ_cols = ['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']

for i in categ_cols:
    print( X[i].unique(), '\n')

['Travel_Rarely' 'Travel_Frequently' 'Non-Travel'] 

['Sales' 'Research & Development' 'Human Resources'] 

['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources'] 

['Female' 'Male'] 

['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources'] 

['Single' 'Married' 'Divorced'] 

['Yes' 'No'] 



In [124]:
categ_cols[0]

'BusinessTravel'

In [125]:
categ_cols = ['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']

labelencoder_X_1 = LabelEncoder()
X[categ_cols[0]] = labelencoder_X_1.fit_transform(X[categ_cols[0]].values)

labelencoder_X_3 = LabelEncoder()
X[categ_cols[1]] = labelencoder_X_3.fit_transform(X[categ_cols[1]].values)

labelencoder_X_6 = LabelEncoder()
X[categ_cols[2]] = labelencoder_X_6.fit_transform(X[categ_cols[2]].values)

labelencoder_X_9 = LabelEncoder()
X[categ_cols[3]] = labelencoder_X_9.fit_transform(X[categ_cols[3]].values)

labelencoder_X_13 = LabelEncoder()
X[categ_cols[4]] = labelencoder_X_13.fit_transform(X[categ_cols[4]].values)

labelencoder_X_15 = LabelEncoder()
X[categ_cols[5]] = labelencoder_X_15.fit_transform(X[categ_cols[5]].values)

labelencoder_X_19 = LabelEncoder()
X[categ_cols[6]] = labelencoder_X_19.fit_transform(X[categ_cols[6]].values)

In [126]:
for i in categ_cols:
    print( X[i].unique(), '\n')

[2 1 0] 

[2 1 0] 

[1 4 3 2 5 0] 

[0 1] 

[7 6 2 4 0 3 8 5 1] 

[2 1 0] 

[1 0] 



In [129]:
X.shape

(1470, 39)

In [128]:
X.head().T

Unnamed: 0,0,1,2,3,4
Age,41.0,49.0,37.0,33.0,27.0
BusinessTravel,2.0,1.0,2.0,1.0,2.0
DailyRate,1102.0,279.0,1373.0,1392.0,591.0
Department,2.0,1.0,1.0,1.0,1.0
DistanceFromHome,1.0,8.0,2.0,3.0,2.0
Education,2.0,1.0,2.0,4.0,1.0
EducationField,1.0,1.0,4.0,1.0,3.0
EmployeeCount,1.0,1.0,1.0,1.0,1.0
EnvironmentSatisfaction,2.0,3.0,4.0,4.0,1.0
Gender,0.0,1.0,1.0,0.0,1.0


In [211]:
labelencoder_y= LabelEncoder()
y = labelencoder_y.fit_transform(y)

In [229]:
y[0:20]

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

## OneHotEncoder Categorical Columns

In [127]:
X.shape

(1470, 39)

In [131]:
categ_cols = ['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']
onehotencoder = OneHotEncoder(categories='auto')
onehot_features_arr = onehotencoder1.fit_transform(dataset_o[categ_cols]).toarray()
onehot_features_label = onehotencoder1.categories_

onehot_features_list = []

for i in onehot_features_label:
    for j in i:
        onehot_features_list.append(j)

onehot_features_df = pd.DataFrame(onehot_features_arr, columns=onehot_features_list)
onehot_features_df.shape

(1470, 28)

In [132]:
onehot_features_df.head().T

Unnamed: 0,0,1,2,3,4
Non-Travel,0.0,0.0,0.0,0.0,0.0
Travel_Frequently,0.0,1.0,0.0,1.0,0.0
Travel_Rarely,1.0,0.0,1.0,0.0,1.0
Human Resources,0.0,0.0,0.0,0.0,0.0
Research & Development,0.0,1.0,1.0,1.0,1.0
Sales,1.0,0.0,0.0,0.0,0.0
Human Resources,0.0,0.0,0.0,0.0,0.0
Life Sciences,1.0,1.0,0.0,1.0,0.0
Marketing,0.0,0.0,0.0,0.0,0.0
Medical,0.0,0.0,0.0,0.0,1.0


In [133]:
X = pd.concat([X, onehot_features_df], axis=1, ignore_index=False)

In [134]:
X.shape

(1470, 67)

In [137]:
categ_cols = ['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']
X.drop(categ_cols, axis=1, inplace=True)

In [138]:
X.shape

(1470, 60)

In [140]:
X.columns

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel',
       'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager',
       'JobInvolment_On_Salary', 'MarriedAndBad_Worklife_Balance',
       'DistanceFromHome_rootedTo_JobSatisfaction', 'TotalJobSatisfaction',
       'OldLowEmployeeTendToStay', 'Mothers', 'Rate', 'RateExtended',
       'Non-Travel', 'Travel_Frequently', 'Travel_Rarely', 'Human Resources',
       'Research & Development', 'Sales', 'Human Resources', 'Life Sciences',
       'Marketing', 'Medical', 'Other', 'Technical Degree', 'Female', 'Male',
       'Healthcare Representative', 'Human R

In [141]:
X = X.iloc[:, 0:].values

In [142]:
X.shape

(1470, 60)

In [144]:
X[0]

array([4.10000000e+01, 1.10200000e+03, 1.00000000e+00, 2.00000000e+00,
       1.00000000e+00, 2.00000000e+00, 9.40000000e+01, 3.00000000e+00,
       2.00000000e+00, 4.00000000e+00, 5.99300000e+03, 1.94790000e+04,
       8.00000000e+00, 1.10000000e+01, 3.00000000e+00, 1.00000000e+00,
       0.00000000e+00, 8.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       6.00000000e+00, 4.00000000e+00, 0.00000000e+00, 5.00000000e+00,
       5.00584015e-01, 2.00000000e+00, 1.00000000e+00, 7.00000000e+00,
       3.00000000e+00, 1.00000000e+00, 5.65590000e+04, 1.13118000e+05,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [145]:
X = X.astype(float)

In [147]:
X.shape

(1470, 60)

In [150]:
X[0]

array([4.10000000e+01, 1.10200000e+03, 1.00000000e+00, 2.00000000e+00,
       1.00000000e+00, 2.00000000e+00, 9.40000000e+01, 3.00000000e+00,
       2.00000000e+00, 4.00000000e+00, 5.99300000e+03, 1.94790000e+04,
       8.00000000e+00, 1.10000000e+01, 3.00000000e+00, 1.00000000e+00,
       0.00000000e+00, 8.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       6.00000000e+00, 4.00000000e+00, 0.00000000e+00, 5.00000000e+00,
       5.00584015e-01, 2.00000000e+00, 1.00000000e+00, 7.00000000e+00,
       3.00000000e+00, 1.00000000e+00, 5.65590000e+04, 1.13118000e+05,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

## Splitting `X, y` in Train and Test Set

In [214]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [215]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1176, 60)
(294, 60)
(1176,)
(294,)


In [216]:
X_train[0]

array([5.80000000e+01, 6.05000000e+02, 2.10000000e+01, 3.00000000e+00,
       1.00000000e+00, 4.00000000e+00, 7.20000000e+01, 3.00000000e+00,
       4.00000000e+00, 4.00000000e+00, 1.78750000e+04, 1.17610000e+04,
       4.00000000e+00, 1.30000000e+01, 3.00000000e+00, 3.00000000e+00,
       1.00000000e+00, 2.90000000e+01, 2.00000000e+00, 2.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.67832168e-01, 0.00000000e+00, 2.14069514e+00, 1.10000000e+01,
       2.50000000e-01, 1.00000000e+00, 3.53810000e+04, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [217]:
X_test[0]

array([3.60000000e+01, 6.35000000e+02, 1.00000000e+01, 4.00000000e+00,
       1.00000000e+00, 2.00000000e+00, 3.20000000e+01, 3.00000000e+00,
       3.00000000e+00, 4.00000000e+00, 9.98000000e+03, 1.53180000e+04,
       1.00000000e+00, 1.40000000e+01, 3.00000000e+00, 4.00000000e+00,
       0.00000000e+00, 1.00000000e+01, 3.00000000e+00, 2.00000000e+00,
       1.00000000e+01, 3.00000000e+00, 9.00000000e+00, 7.00000000e+00,
       3.00601202e-01, 3.00000000e+00, 1.77827941e+00, 1.00000000e+01,
       3.33333333e+00, 0.00000000e+00, 3.31380000e+04, 6.62760000e+04,
       1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [227]:
y_train[0:10]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [226]:
y_test[0:10]

array([0, 0, 1, 0, 1, 0, 1, 0, 0, 1])

## StandardScaler `X_train and X_test`

In [160]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [162]:
print( X_train.shape )
print( X_test.shape )

(1176, 60)
(294, 60)


In [163]:
X_train[0]

array([ 2.3389367 , -0.48557354,  1.45567735,  0.08087412,  0.        ,
        1.13998967,  0.33865726,  0.38547835,  1.7590273 ,  1.14972558,
        2.41725694, -0.34865414,  0.514544  , -0.58948704, -0.42092419,
        0.26452003,  0.22505569,  2.30343322, -0.6134953 , -1.07224664,
       -0.98200785, -1.15684058, -0.67381506, -1.15043886, -1.07071855,
       -1.45592203, -0.26045131,  1.49100594, -1.28248422,  1.96456207,
       -0.47619127, -1.45601014, -0.34641016, -0.479714  ,  0.64565275,
       -0.21724413, -1.38234984,  1.53311035, -0.14123725,  1.18321596,
       -0.34332306, -0.68021068, -0.23186945, -0.31637564,  1.24094996,
       -1.24094996, -0.30151134, -0.1948136 , -0.4662524 ,  3.6767538 ,
       -0.3323877 , -0.23791548, -0.50636968, -0.53013748, -0.2438431 ,
       -0.54065487,  1.11547067, -0.69900472, -1.62507442,  1.62507442])

In [164]:
X_test[0]

array([-0.08886552, -0.41164077,  0.10328995,  1.0715821 ,  0.        ,
       -0.68523545, -1.63317557,  0.38547835,  0.85451654,  1.14972558,
        0.74559207,  0.1501725 , -0.67582402, -0.31558868, -0.42092419,
        1.19034013, -0.93068882, -0.14998998,  0.1863611 , -1.07224664,
        0.49540098, -0.33032507,  2.13281956,  0.81437705, -0.77789998,
        0.34319983, -0.34016257,  0.95885936, -0.08027442, -0.50901929,
       -0.67667349, -0.51127958,  2.88675135, -0.479714  , -1.54882017,
       -0.21724413, -1.38234984,  1.53311035, -0.14123725, -0.84515425,
       -0.34332306,  1.47013276, -0.23186945, -0.31637564, -0.80583427,
        0.80583427, -0.30151134, -0.1948136 , -0.4662524 , -0.27197905,
       -0.3323877 , -0.23791548, -0.50636968,  1.88630314, -0.2438431 ,
       -0.54065487, -0.89648256,  1.43060551,  0.61535643, -0.61535643])

In [224]:
y_train[0:10]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [225]:
y_test[0:10]

array([0, 0, 1, 0, 1, 0, 1, 0, 0, 1])

## Making ANN Model

**_Define Hyperparameters_**

In [165]:
dropout = 0.1
epochs = 100
batch_size = 30
optimizer = 'adam'
k = 20

**_define function for model_**

In [167]:
X.shape[1]

60

In [202]:
def build_classifier():
    classifier = Sequential()
    classifier.add(Input(shape=(X.shape[1],)))
    classifier.add(Dense(16, kernel_initializer='truncated_normal', activation='relu'))
    classifier.add(Dropout(dropout))
    classifier.add(Dense(1, kernel_initializer='truncated_normal', activation='sigmoid'))
    classifier.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return classifier

In [203]:
build_classifier

<function __main__.build_classifier()>

**_Initialize the module_**

In [204]:
classifier = KerasClassifier(model=build_classifier, batch_size=batch_size, epochs=epochs, verbose=0)

In [205]:
classifier

## Training the module

In [220]:
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=30)

In [230]:
accuracies

array([0.825     , 0.825     , 0.8       , 0.825     , 0.825     ,
       0.825     , 0.82051282, 0.84615385, 0.84615385, 0.84615385,
       0.84615385, 0.84615385, 0.84615385, 0.84615385, 0.84615385,
       0.84615385, 0.84615385, 0.84615385, 0.84615385, 0.84615385,
       0.84615385, 0.84615385, 0.87179487, 0.82051282, 0.84615385,
       0.84615385, 0.84615385, 0.84615385, 0.82051282, 0.82051282])

In [231]:
max_accuracy = accuracies.max()
print("Best accuracy: ", max_accuracy)

Best accuracy:  0.8717948717948718
