## Import Libraries

In [66]:
import numpy as np
import pandas as pd
from subprocess import check_output

In [67]:
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

In [68]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
# from keras.wrappers.scikit_learn import KerasClassifier
from scikeras.wrappers import KerasClassifier

## Import Dataset

In [4]:
# Load the raw HR attrition data; the path is relative, so the CSV must sit in
# the notebook's working directory.
dataset_o = pd.read_csv('HR_Employee_Attrition.csv')

In [5]:
dataset_o.shape

(1470, 35)

In [6]:
dataset_o.head().T

Unnamed: 0,0,1,2,3,4
Age,41,49,37,33,27
Attrition,Yes,No,Yes,No,No
BusinessTravel,Travel_Rarely,Travel_Frequently,Travel_Rarely,Travel_Frequently,Travel_Rarely
DailyRate,1102,279,1373,1392,591
Department,Sales,Research & Development,Research & Development,Research & Development,Research & Development
DistanceFromHome,1,8,2,3,2
Education,2,1,2,4,1
EducationField,Life Sciences,Life Sciences,Other,Life Sciences,Medical
EmployeeCount,1,1,1,1,1
EmployeeNumber,1,2,4,5,7


## Removing Columns

In [7]:
len(dataset_o.columns)

35

In [8]:
dataset_o.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [9]:
# Columns excluded from modelling; every other column of the raw frame is kept,
# in its original order.
remove_cols = ['EmployeeNumber', 'Over18', 'StandardHours']
keep_cols = [col for col in dataset_o.columns if col not in remove_cols]
len(keep_cols)

32

In [10]:
print( keep_cols )

['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']


In [11]:
# Build the working frame from the retained columns, moving the target column
# ('Attrition') to position 0 so later cells can split features/target by position.
# The order is derived from keep_cols instead of a second hand-maintained column
# list: plain indexing raises KeyError on a misspelled name, whereas reindex would
# silently add an all-NaN column. .copy() makes `dataset` an independent frame, so
# the derived columns added below never write into a slice of dataset_o.
target_col = 'Attrition'
dataset = dataset_o[[target_col] + [col for col in keep_cols if col != target_col]].copy()

In [12]:
dataset.head().T

Unnamed: 0,0,1,2,3,4
Attrition,Yes,No,Yes,No,No
Age,41,49,37,33,27
BusinessTravel,Travel_Rarely,Travel_Frequently,Travel_Rarely,Travel_Frequently,Travel_Rarely
DailyRate,1102,279,1373,1392,591
Department,Sales,Research & Development,Research & Development,Research & Development,Research & Development
DistanceFromHome,1,8,2,3,2
Education,2,1,2,4,1
EducationField,Life Sciences,Life Sciences,Other,Life Sciences,Medical
EmployeeCount,1,1,1,1,1
EnvironmentSatisfaction,2,3,4,4,1


## Adding few more derived columns

In [13]:
# cols = list(dataset.columns.values)
# cols = dataset.columns.tolist()
# cols = sorted(dataset.columns)
# dataset = dataset.reindex(columns=['Attrition', 'Age', 'BusinessTravel', 'DailyRate'])
# dataset = dataset.reindex(sorted(dataset.columns), axis=1)
# print( dataset.columns.get_loc('JobInvolvement') )
# print( dataset.columns.get_loc('MonthlyIncome') )
# dataset.loc[:,'JobInvolvement']
# dataset['JobInvolment_On_Salary'] = np.nan

In [14]:
# Job involvement divided by monthly income, scaled by 1000.
involvement_per_income = dataset['JobInvolvement'].div(dataset['MonthlyIncome'])
dataset['JobInvolment_On_Salary'] = involvement_per_income.mul(1000)

In [15]:
# Married employees get WorkLifeBalance - 2; everyone else gets WorkLifeBalance + 1.
is_married = dataset['MaritalStatus'] == 'Married'
work_life = dataset['WorkLifeBalance']
dataset['MarriedAndBad_Worklife_Balance'] = np.where(is_married, work_life - 2, work_life + 1)

In [16]:
# DistanceFromHome raised to the power 1 / JobSatisfaction
# (i.e. its JobSatisfaction-th root).
root_exponent = 1 / dataset['JobSatisfaction']
dataset['DistanceFromHome_rootedTo_JobSatisfaction'] = dataset['DistanceFromHome'].pow(root_exponent)

In [17]:
# Sum of the three satisfaction scores.
dataset['TotalJobSatisfaction'] = (
    dataset['EnvironmentSatisfaction']
    .add(dataset['JobSatisfaction'])
    .add(dataset['RelationshipSatisfaction'])
)

In [18]:
# Tenure relative to seniority: years at the company per job level.
dataset['OldLowEmployeeTendToStay'] = dataset['YearsAtCompany'].div(dataset['JobLevel'])

In [19]:
# Flag (1/0) for female employees aged 36 or older.
# NOTE(review): the column name treats this group as a proxy for mothers — the
# data shown has no parenthood field, so confirm the assumption is intended.
is_female = dataset['Gender'] == 'Female'
is_36_or_older = dataset['Age'] >= 36
dataset['Mothers'] = np.where(is_female & is_36_or_older, 1, 0)

In [20]:
# Combine the three rate columns into one figure.
# NOTE(review): 20 and 8 are presumably working days per month and hours per
# day — confirm.
daily_component = dataset['DailyRate'] * 20
hourly_component = dataset['HourlyRate'] * 8 * 20
dataset['Rate'] = daily_component + hourly_component + dataset['MonthlyRate']

In [21]:
# Weight Rate by (8 - JobSatisfaction - EnvironmentSatisfaction): the lower the
# two satisfaction scores, the larger the multiplier.
satisfaction_gap = 8 - dataset['JobSatisfaction'] - dataset['EnvironmentSatisfaction']
dataset['RateExtended'] = dataset['Rate'] * satisfaction_gap

In [22]:
dataset.head(2).T

Unnamed: 0,0,1
Attrition,Yes,No
Age,41,49
BusinessTravel,Travel_Rarely,Travel_Frequently
DailyRate,1102,279
Department,Sales,Research & Development
DistanceFromHome,1,8
Education,2,1
EducationField,Life Sciences,Life Sciences
EmployeeCount,1,1
EnvironmentSatisfaction,2,3


## Making Independent (X) and Dependent (y) Dataset
* Also separating the data from the labels (columns name)

In [23]:
# Column 0 is the target; every column to its right is a feature.
y = dataset.iloc[:, 0].to_numpy()
X = dataset.iloc[:, 1:].to_numpy()

In [24]:
X.shape

(1470, 39)

In [25]:
print( X[0] )

[41 'Travel_Rarely' 1102 'Sales' 1 2 'Life Sciences' 1 2 'Female' 94 3 2
 'Sales Executive' 4 'Single' 5993 19479 8 'Yes' 11 3 1 0 8 0 1 6 4 0 5
 0.5005840146837978 2 1.0 7 3.0 1 56559 113118]


In [26]:
y.shape

(1470,)

In [27]:
print( y[0:10] )

['Yes' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No']


In [28]:
dataset.iloc[:, 0].head()

0    Yes
1     No
2    Yes
3     No
4     No
Name: Attrition, dtype: object

In [29]:
dataset.iloc[:, 0].values

array(['Yes', 'No', 'Yes', ..., 'No', 'No', 'No'], dtype=object)

In [30]:
dataset.iloc[:, 1:].head(2).T

Unnamed: 0,0,1
Age,41,49
BusinessTravel,Travel_Rarely,Travel_Frequently
DailyRate,1102,279
Department,Sales,Research & Development
DistanceFromHome,1,8
Education,2,1
EducationField,Life Sciences,Life Sciences
EmployeeCount,1,1
EnvironmentSatisfaction,2,3
Gender,Female,Male


In [31]:
dataset.iloc[:, 1:].values[0]

array([41, 'Travel_Rarely', 1102, 'Sales', 1, 2, 'Life Sciences', 1, 2,
       'Female', 94, 3, 2, 'Sales Executive', 4, 'Single', 5993, 19479, 8,
       'Yes', 11, 3, 1, 0, 8, 0, 1, 6, 4, 0, 5, 0.5005840146837978, 2,
       1.0, 7, 3.0, 1, 56559, 113118], dtype=object)

## LabelEncoder Categorical Columns

In [32]:
# List every value of the first sample next to its column position, to locate
# the categorical (string) columns.
first_row = X[0]
for position in range(len(first_row)):
    print(position, first_row[position])

0 41
1 Travel_Rarely
2 1102
3 Sales
4 1
5 2
6 Life Sciences
7 1
8 2
9 Female
10 94
11 3
12 2
13 Sales Executive
14 4
15 Single
16 5993
17 19479
18 8
19 Yes
20 11
21 3
22 1
23 0
24 8
25 0
26 1
27 6
28 4
29 0
30 5
31 0.5005840146837978
32 2
33 1.0
34 7
35 3.0
36 1
37 56559
38 113118


In [33]:
# categorical columns indexes in X == 1, 3, 6, 9, 13, 15, 19

In [34]:
X[:, 1]

array(['Travel_Rarely', 'Travel_Frequently', 'Travel_Rarely', ...,
       'Travel_Rarely', 'Travel_Frequently', 'Travel_Rarely'],
      dtype=object)

In [35]:
X[:, 9]

array(['Female', 'Male', 'Male', ..., 'Male', 'Male', 'Male'],
      dtype=object)

In [36]:
# First three values of each categorical feature column.
for col_idx in (1, 3, 6, 9, 13, 15, 19):
    print( X[:, col_idx][0:3] )

['Travel_Rarely' 'Travel_Frequently' 'Travel_Rarely']
['Sales' 'Research & Development' 'Research & Development']
['Life Sciences' 'Life Sciences' 'Other']
['Female' 'Male' 'Male']
['Sales Executive' 'Research Scientist' 'Laboratory Technician']
['Single' 'Married' 'Single']
['Yes' 'No' 'Yes']


In [37]:
# Number of distinct values in each categorical feature column.
for col_idx in (1, 3, 6, 9, 13, 15, 19):
    print( len(np.unique(X[:, col_idx])) )

3
3
6
2
9
3
2


In [38]:
# Distinct values of each categorical feature column, separated by a blank line.
for col_idx in (1, 3, 6, 9, 13, 15, 19):
    print( np.unique(X[:, col_idx]), '\n' )

['Non-Travel' 'Travel_Frequently' 'Travel_Rarely'] 

['Human Resources' 'Research & Development' 'Sales'] 

['Human Resources' 'Life Sciences' 'Marketing' 'Medical' 'Other'
 'Technical Degree'] 

['Female' 'Male'] 

['Healthcare Representative' 'Human Resources' 'Laboratory Technician'
 'Manager' 'Manufacturing Director' 'Research Director'
 'Research Scientist' 'Sales Executive' 'Sales Representative'] 

['Divorced' 'Married' 'Single'] 

['No' 'Yes'] 



In [39]:
# One LabelEncoder per categorical column; each string column of X is overwritten
# in place with its integer codes.
encoders_by_col = {col_idx: LabelEncoder() for col_idx in (1, 3, 6, 9, 13, 15, 19)}
for col_idx, encoder in encoders_by_col.items():
    X[:, col_idx] = encoder.fit_transform(X[:, col_idx])

# Keep the per-column encoder names available (dicts preserve insertion order).
(labelencoder_X_1, labelencoder_X_3, labelencoder_X_6, labelencoder_X_9,
 labelencoder_X_13, labelencoder_X_15, labelencoder_X_19) = encoders_by_col.values()

In [40]:
# Encoded values of each categorical feature column.
for col_idx in (1, 3, 6, 9, 13, 15, 19):
    print( X[:, col_idx] )

[2 1 2 ... 2 1 2]
[2 1 1 ... 1 2 1]
[1 1 4 ... 1 3 3]
[0 1 1 ... 1 1 1]
[7 6 2 ... 4 7 2]
[2 1 2 ... 1 1 1]
[1 0 1 ... 1 0 0]


In [41]:
# Cardinality check after label encoding: counts should match the pre-encoding ones.
for col_idx in (1, 3, 6, 9, 13, 15, 19):
    print( len(np.unique(X[:, col_idx])) )

3
3
6
2
9
3
2


In [42]:
# Distinct integer codes of each categorical feature column after encoding.
for col_idx in (1, 3, 6, 9, 13, 15, 19):
    print( np.unique(X[:, col_idx]), '\n' )

[0 1 2] 

[0 1 2] 

[0 1 2 3 4 5] 

[0 1] 

[0 1 2 3 4 5 6 7 8] 

[0 1 2] 

[0 1] 



In [43]:
# Row count of each categorical feature column.
for col_idx in (1, 3, 6, 9, 13, 15, 19):
    print( len(X[:, col_idx]) )

1470
1470
1470
1470
1470
1470
1470


In [44]:
# Every column is numeric after label encoding; convert the object array to float64.
X = X.astype(np.float64)

In [45]:
# Categorical feature columns after the float conversion.
for col_idx in (1, 3, 6, 9, 13, 15, 19):
    print( X[:, col_idx] )

[2. 1. 2. ... 2. 1. 2.]
[2. 1. 1. ... 1. 2. 1.]
[1. 1. 4. ... 1. 3. 3.]
[0. 1. 1. ... 1. 1. 1.]
[7. 6. 2. ... 4. 7. 2.]
[2. 1. 2. ... 1. 1. 1.]
[1. 0. 1. ... 1. 0. 0.]


In [46]:
# Cardinality check after the float conversion.
for col_idx in (1, 3, 6, 9, 13, 15, 19):
    print( len(np.unique(X[:, col_idx])) )

3
3
6
2
9
3
2


In [47]:
# Distinct codes of each categorical feature column after the float conversion.
for col_idx in (1, 3, 6, 9, 13, 15, 19):
    print( np.unique(X[:, col_idx]), '\n' )

[0. 1. 2.] 

[0. 1. 2.] 

[0. 1. 2. 3. 4. 5.] 

[0. 1.] 

[0. 1. 2. 3. 4. 5. 6. 7. 8.] 

[0. 1. 2.] 

[0. 1.] 



In [48]:
# Encode the target labels as integers (classes are sorted, so 'No' -> 0, 'Yes' -> 1).
labelencoder_y = LabelEncoder()
labelencoder_y.fit(y)
y = labelencoder_y.transform(y)

In [49]:
y

array([1, 0, 1, ..., 0, 0, 0])

In [None]:
# Restart from the engineered frame, this time keeping pandas objects.
# .copy() gives X its own data, so the column-wise label encoding applied to X in
# the following cells cannot raise SettingWithCopyWarning or write back into
# `dataset`.
X = dataset.iloc[:, 1:].copy()
y = dataset.iloc[:, 0]

In [None]:
categ_cols = [
    'BusinessTravel', 'Department', 'EducationField', 'Gender',
    'JobRole', 'MaritalStatus', 'OverTime',
]

# Distinct values of each categorical column, separated by a blank line.
for col_name in categ_cols:
    print( X[col_name].unique(), '\n')

In [None]:
categ_cols = ['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']

# One LabelEncoder per categorical column; each column of X is replaced by its
# integer codes.
encoders_by_name = {col_name: LabelEncoder() for col_name in categ_cols}
for col_name, encoder in encoders_by_name.items():
    X[col_name] = encoder.fit_transform(X[col_name].values)

# Keep the per-column encoder names available (dicts preserve insertion order).
(labelencoder_X_1, labelencoder_X_3, labelencoder_X_6, labelencoder_X_9,
 labelencoder_X_13, labelencoder_X_15, labelencoder_X_19) = encoders_by_name.values()

In [None]:
# Encode the target labels as integers (classes are sorted, so 'No' -> 0, 'Yes' -> 1).
labelencoder_y = LabelEncoder()
labelencoder_y.fit(y)
y = labelencoder_y.transform(y)

## OneHotEncoder Categorical Columns

In [50]:
# categorical columns indexes in X == 1, 3, 6, 9, 13, 15, 19
# categorical columns in X == ['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']

In [51]:
# unique, counts = np.unique(X[:,1], return_counts=True)
    # print(np.asarray((unique, counts)).T)
# print(np.asarray((np.unique(X[:,1], return_counts=True))).T)
# unique, counts = np.unique(X[:,1], return_counts=True)
    # len(counts)

In [52]:
# rows, columns = X.shape
# print("Rows:", rows)
# print("Columns:", columns)

# Rows: 1470
# Columns: 38

In [53]:
# X.shape[1]

In [54]:
# for i in range(X.shape[1]):
#     print( 'col_idx:', i, 'values_count:', len(np.unique(X[:,i])) )

In [59]:
# onehotencoder ==> Single Column in Numpy
# onehotencoder1 = ColumnTransformer(transformers=[('onehot', OneHotEncoder(sparse_output=False), [1])], remainder='passthrough')
# X = onehotencoder1.fit_transform(X)
# X = X[:,1:]

In [60]:
# onehotencoder ==> Multiple Columns in Numpy
# transformer = make_column_transformer(
#     (OneHotEncoder(categories='auto', sparse_output=False), [1, 3, 6, 13, 15]), 
#     remainder='passthrough' 
# )

In [None]:
categ_cols = ['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']

# One-hot encode the raw categorical columns. The encoder is now used through the
# name it is bound to in this cell; the previous code called `onehotencoder1`,
# which is defined only in a commented-out cell and raised NameError on a fresh
# kernel.
onehotencoder = OneHotEncoder(categories='auto')
onehot_features_arr = onehotencoder.fit_transform(dataset_o[categ_cols]).toarray()
onehot_features_label = onehotencoder.categories_

# Prefix every category with its source column ("Department_Sales", ...).
# Bare category values are not unique across columns — 'Human Resources' is a
# Department, an EducationField and a JobRole — and would give the frame
# duplicate column labels.
onehot_features_list = [
    f'{col_name}_{category}'
    for col_name, categories in zip(categ_cols, onehot_features_label)
    for category in categories
]

# Reuse the source index so the later column-wise concat with X aligns row by row.
onehot_features_df = pd.DataFrame(
    onehot_features_arr,
    columns=onehot_features_list,
    index=dataset_o.index,
)
onehot_features_df.shape

In [None]:
# Append the one-hot columns to the right of the feature frame (rows align on index).
X = pd.concat((X, onehot_features_df), axis='columns', ignore_index=False)

In [None]:
categ_cols = ['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']
# The one-hot columns now carry this information, so drop the original categorical
# columns. X is rebound to the result instead of being mutated with inplace=True,
# so the frame object from the previous cell is not altered behind the reader's back.
X = X.drop(columns=categ_cols)

In [None]:
# Convert the full feature frame (all columns) to a NumPy array.
X = X.to_numpy()

In [None]:
# Make sure the feature matrix is float64 before scaling.
X = X.astype(np.float64)

In [None]:
# Hold out 20% of the rows as a test set; random_state=0 pins the shuffle so the
# split is reproducible across runs.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# Attrition classes are imbalanced (confirm the class balance first).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Standardise features using statistics learned from the training split only, then
# apply the same transformation to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [82]:
categ_cols = [
    'BusinessTravel', 'Department', 'EducationField', 'Gender',
    'JobRole', 'MaritalStatus', 'OverTime',
]
# Flat 1-D string array holding the categorical column names.
feature_labels2 = np.asarray(categ_cols).reshape(-1)
feature_labels2

array(['BusinessTravel', 'Department', 'EducationField', 'Gender',
       'JobRole', 'MaritalStatus', 'OverTime'], dtype='<U14')

In [None]:
categ_cols = ['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']
# Count how often each combination of categorical values occurs.
# When the notebook is run top to bottom, X is already a float NumPy array here
# (its categorical columns were dropped earlier), so it cannot be indexed by column
# name. The counts are taken from `dataset`, which keeps the categorical columns.
print( dataset[categ_cols].value_counts())

In [79]:
# Minimal demonstration of a column-wise concat: two frames sharing the same index
# are placed side by side. pandas is already imported as `pd` at the top of the
# notebook, so it is not re-imported here, and the result gets its own name rather
# than overwriting df1.
df1 = pd.DataFrame({'Name': ['Alice', 'Bob'], 'Age': [25, 30]})
df2 = pd.DataFrame({'City': ['New York', 'Los Angeles'], 'Salary': [70000, 80000]})

combined_df = pd.concat([df1, df2], axis=1, ignore_index=False)
display(combined_df)

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,New York,70000
1,Bob,30,Los Angeles,80000
