#### Importing relevant libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
emp_data = pd.read_csv('HR_comma_sep.csv.txt')

In [3]:
emp_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [4]:
emp_data.shape

(14999, 10)

In [5]:
# Renaming the column sales to department
emp_data.rename(columns={'sales':'department'}, inplace=True)

In [6]:
emp_data.corr(method ='pearson')['left']

satisfaction_level      -0.388375
last_evaluation          0.006567
number_project           0.023787
average_montly_hours     0.071287
time_spend_company       0.144822
Work_accident           -0.154622
left                     1.000000
promotion_last_5years   -0.061788
Name: left, dtype: float64

In [7]:
# Kendall rank correlation is a non-parametric test that measures the strength of 
# dependence between two variables

emp_data.corr(method ='kendall')['left']

satisfaction_level      -0.300675
last_evaluation         -0.002010
number_project          -0.017601
average_montly_hours     0.038045
time_spend_company       0.244851
Work_accident           -0.154622
left                     1.000000
promotion_last_5years   -0.061788
Name: left, dtype: float64

#### Selecting categorical columns & integer columns

In [8]:
# To study data w.r.t counts, types of data
emp_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
department               14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


#### Preprocessing Categorical Columns

In [9]:
# Label encoder is used to turn the categorical values to numeric
# One hot encoder is used to conver the labelled rows to columns

from sklearn.preprocessing import LabelEncoder

In [10]:
# INitializing
le = LabelEncoder()

In [11]:
emp_data['department'] = le.fit_transform(emp_data['department'])
print(emp_data['department'].unique())

[7 2 3 9 8 4 0 6 5 1]


In [12]:
# Performing One Hot Encoding

dummy = pd.get_dummies(emp_data['department'])
dummy.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,1,0,0


In [13]:
emp_data_merge = pd.concat([emp_data,dummy],axis='columns')

In [14]:
emp_data_merge.columns

Index([   'satisfaction_level',       'last_evaluation',
              'number_project',  'average_montly_hours',
          'time_spend_company',         'Work_accident',
                        'left', 'promotion_last_5years',
                  'department',                'salary',
                             0,                       1,
                             2,                       3,
                             4,                       5,
                             6,                       7,
                             8,                       9],
      dtype='object')

In [15]:
# Since the column department has been transformed through one hot enoding, original column has been dropped
emp_data_merge.drop('department',axis=1, inplace=True)

In [16]:
# splitting the data to x & y data
y = emp_data_merge['left'] # Feature 
x = emp_data_merge.drop('left',axis = 1) # Target data

In [17]:
x.columns

Index([   'satisfaction_level',       'last_evaluation',
              'number_project',  'average_montly_hours',
          'time_spend_company',         'Work_accident',
       'promotion_last_5years',                'salary',
                             0,                       1,
                             2,                       3,
                             4,                       5,
                             6,                       7,
                             8,                       9],
      dtype='object')

In [18]:
# Converting column salary to integer

x['salary'] = x['salary'].replace('low',1)
x['salary'] = x['salary'].replace('medium',2)
x['salary'] = x['salary'].replace('high',3)

x['salary'].unique()

array([1, 2, 3], dtype=int64)

#### Preprocessing Number Data

In [19]:
x.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,0,1,2,3,4,5,6,7,8,9
0,0.38,0.53,2,157,3,0,0,1,0,0,0,0,0,0,0,1,0,0
1,0.8,0.86,5,262,6,0,0,2,0,0,0,0,0,0,0,1,0,0
2,0.11,0.88,7,272,4,0,0,2,0,0,0,0,0,0,0,1,0,0
3,0.72,0.87,5,223,5,0,0,1,0,0,0,0,0,0,0,1,0,0
4,0.37,0.52,2,159,3,0,0,1,0,0,0,0,0,0,0,1,0,0


In [20]:
# Data are transformed to comparable form, MMS is suitable even if the data are not normally distributed
from sklearn.preprocessing import MinMaxScaler

In [21]:
mms = MinMaxScaler()

In [22]:
x[['number_project','average_montly_hours','time_spend_company','Work_accident','promotion_last_5years']] = mms.fit_transform(x[['number_project','average_montly_hours','time_spend_company','Work_accident','promotion_last_5years']])

  return self.partial_fit(X, y)


In [23]:
x.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,0,1,2,3,4,5,6,7,8,9
0,0.38,0.53,0.0,0.285047,0.125,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0
1,0.8,0.86,0.6,0.775701,0.5,0.0,0.0,2,0,0,0,0,0,0,0,1,0,0
2,0.11,0.88,1.0,0.82243,0.25,0.0,0.0,2,0,0,0,0,0,0,0,1,0,0
3,0.72,0.87,0.6,0.593458,0.375,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0
4,0.37,0.52,0.0,0.294393,0.125,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0


In [24]:
x.columns

Index([   'satisfaction_level',       'last_evaluation',
              'number_project',  'average_montly_hours',
          'time_spend_company',         'Work_accident',
       'promotion_last_5years',                'salary',
                             0,                       1,
                             2,                       3,
                             4,                       5,
                             6,                       7,
                             8,                       9],
      dtype='object')

#### Splitting data into train & test data

In [25]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x,y, test_size = 0.2,random_state = 44)

#### Model Training

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [27]:
lr = LogisticRegression()
rf = RandomForestClassifier()
xgb = XGBClassifier(subsample=0.5,max_depth=10)

In [28]:
lr.fit(xtrain,ytrain)
rf.fit(xtrain,ytrain)
xgb.fit(xtrain,ytrain)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=10, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=0.5, verbosity=1)

#### Model Validation

In [29]:
# On train data
print('Score LogisticReg   :',lr.score(xtrain,ytrain))
print('Score RandomForest  :',rf.score(xtrain,ytrain))
print('Score XGBClassifier :',xgb.score(xtrain,ytrain))

Score LogisticReg   : 0.7951495957996499
Score RandomForest  : 0.9989999166597217
Score XGBClassifier : 0.9940828402366864


In [30]:
from sklearn.metrics import recall_score,precision_score, f1_score, classification_report

In [31]:
# Applying predict function
pred_lr = lr.predict(xtest)
pred_rf = rf.predict(xtest)
pred_xgb = xgb.predict(xtest)

In [32]:
print('PrecisionScore LogisticReg   :',precision_score(y_pred=pred_lr, y_true=ytest))
print('PrecisionScore RandomForest  :',precision_score(y_pred=pred_rf, y_true=ytest))
print('PrecisionScore XGBClassifier :',precision_score(y_pred=pred_xgb, y_true=ytest))

PrecisionScore LogisticReg   : 0.5820224719101124
PrecisionScore RandomForest  : 0.9940029985007496
PrecisionScore XGBClassifier : 0.9894578313253012


In [33]:
print('F1 Score LogisticReg   :',f1_score(y_pred=pred_lr, y_true=ytest))
print('F1 Score RandomForest  :',f1_score(y_pred=pred_rf, y_true=ytest))
print('F1 Score XGBClassifier :',f1_score(y_pred=pred_xgb, y_true=ytest))

F1 Score LogisticReg   : 0.4580017683465959
F1 Score RandomForest  : 0.9800443458980045
F1 Score XGBClassifier : 0.9733333333333334
