In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
df = pd.read_csv('/home/learner/Downloads/hr_data.txt')

In [8]:
df.head()

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,churn,promotion_last_5years,sales,salary
0,0,0.38,0.53,2,157,3,0,1,0,sales,low
1,1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,3,0.72,0.87,5,223,5,0,1,0,sales,low
4,4,0.37,0.52,2,159,3,0,1,0,sales,low


## Data Preprocessing

In [12]:
df.shape

(14999, 11)

In [40]:
df['satisfaction_level'].median()

0.64

In [50]:
df['sales'].unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [21]:
df[df['number_project'] > 3].shape[0]

8556

In [26]:
df[df['last_evaluation'] >= 0.80].shape[0]

5740

In [110]:
df.corr()['churn'].abs().sort_values(ascending = False)

churn                    1.000000
satisfaction_level       0.388375
Work_accident            0.154622
time_spend_company       0.144822
id                       0.129047
average_montly_hours     0.071287
promotion_last_5years    0.061788
number_project           0.023787
last_evaluation          0.006567
Name: churn, dtype: float64

In [34]:
#Top Performing employee is assumed to be employees who 1. scored 80% and above, 
#and 2. have completed more that 3 projects
new_df = df[(df['last_evaluation'] >= 0.80) & (df['number_project'] > 3)]

In [35]:
new_df.shape

(4112, 11)

In [41]:
#Rename 'sales' columns to departments
new_df = new_df.rename(columns = {'sales': 'departments'})

In [42]:
new_df

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,churn,promotion_last_5years,departments,salary
1,1,0.80,0.86,5,262,6,0,1,0,sales,medium
2,2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,3,0.72,0.87,5,223,5,0,1,0,sales,low
7,7,0.92,0.85,5,259,5,0,1,0,sales,low
8,8,0.89,1.00,5,224,5,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...,...
14986,14986,0.85,0.85,4,247,6,0,1,0,technical,low
14990,14990,0.89,0.88,5,228,5,1,1,0,support,low
14991,14991,0.09,0.81,6,257,4,0,1,0,support,low
14993,14993,0.76,0.83,6,293,6,0,1,0,support,low


### Convert salary column to numerical values

In [43]:
new_df['salary'].unique()

array(['medium', 'low', 'high'], dtype=object)

In [44]:
new_df['salary'] = new_df['salary'].map({'low': 1, 'medium': 2, 'high': 3})

### Use get_dummies to convert departments column to numerical values

In [51]:
new_df = pd.get_dummies(new_df, columns = ['departments'], prefix =['department'])

In [52]:
new_df

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,churn,promotion_last_5years,salary,department_IT,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical
1,1,0.80,0.86,5,262,6,0,1,0,2,0,0,0,0,0,0,0,1,0,0
2,2,0.11,0.88,7,272,4,0,1,0,2,0,0,0,0,0,0,0,1,0,0
3,3,0.72,0.87,5,223,5,0,1,0,1,0,0,0,0,0,0,0,1,0,0
7,7,0.92,0.85,5,259,5,0,1,0,1,0,0,0,0,0,0,0,1,0,0
8,8,0.89,1.00,5,224,5,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14986,14986,0.85,0.85,4,247,6,0,1,0,1,0,0,0,0,0,0,0,0,0,1
14990,14990,0.89,0.88,5,228,5,1,1,0,1,0,0,0,0,0,0,0,0,1,0
14991,14991,0.09,0.81,6,257,4,0,1,0,1,0,0,0,0,0,0,0,0,1,0
14993,14993,0.76,0.83,6,293,6,0,1,0,1,0,0,0,0,0,0,0,0,1,0


### Set Variables

In [55]:
X = new_df.drop('churn','id', axis = 1)
y = new_df['churn']

In [56]:
X.shape

(4112, 19)

In [57]:
#Split data into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [59]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3289, 19)
(823, 19)
(3289,)
(823,)


In [60]:
#scale data
from sklearn.preprocessing import StandardScaler

In [62]:
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [63]:
#Transform data
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

In [66]:
pd.DataFrame(X_test_scale).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,-1.44037,-1.660482,1.342679,1.29927,0.940928,0.040304,-0.374212,-0.127978,-0.900144,-0.304452,-0.2163,-0.229819,-0.219362,-0.197105,-0.240617,-0.254452,1.636756,-0.434341,-0.4858
1,-1.433584,1.0548,-0.502663,-0.932218,0.174868,0.752986,-0.374212,-0.127978,-0.900144,-0.304452,-0.2163,-0.229819,-0.219362,-0.197105,-0.240617,-0.254452,-0.610965,-0.434341,2.058461
2,-1.281014,0.425649,-1.341455,0.183526,0.898369,0.752986,-0.374212,-0.127978,-0.900144,-0.304452,-0.2163,-0.229819,-0.219362,-0.197105,-0.240617,-0.254452,-0.610965,-0.434341,2.058461


### Build a Logistic Regression Model

In [91]:
new_df['churn'].value_counts(normalize = True).round(2) * 100

0    59.0
1    41.0
Name: churn, dtype: float64

In [72]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(random_state = 42)

In [73]:
log_model.fit(X_train_scale, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [95]:
y_train_pred = log_model.predict(X_train_scale)
y_test_pred = log_model.predict(X_test_scale)

In [99]:
from sklearn.metrics import r2_score

In [132]:
#R Squared score for train set
r2_score(y_train, y_train_pred)

0.5313365041162875

In [134]:
#R Squared score for test set
r2_score(y_test, y_test_pred)

0.6241322616002922

In [104]:
log_model.coef_

array([[-0.38251993,  0.32324145,  0.26633192,  1.25793785,  1.80216199,
         1.25724167, -0.37824419, -0.60399608, -0.44030039, -0.01227458,
        -0.09908908,  0.00930147,  0.11771788, -0.1004764 , -0.03579084,
        -0.06217089,  0.02615229,  0.03916185,  0.03395091]])

### Using a Different Set of Variables

In [111]:
new_df.corr()['churn'].abs().sort_values(ascending = False)

churn                     1.000000
average_montly_hours      0.568372
number_project            0.501633
time_spend_company        0.429484
satisfaction_level        0.304891
Work_accident             0.186979
salary                    0.177143
id                        0.140755
promotion_last_5years     0.104443
last_evaluation           0.077717
department_management     0.053731
department_RandD          0.045591
department_technical      0.040360
department_marketing      0.014895
department_sales          0.011526
department_product_mng    0.011486
department_accounting     0.009944
department_hr             0.007689
department_IT             0.002579
department_support        0.001319
Name: churn, dtype: float64

In [221]:
exclude_cols = ['id','department_support','department_IT','department_hr','department_accounting',
               'department_product_mng','churn','department_sales',
                'department_marketing','department_technical','department_RandD','department_management']

In [169]:
# set variables

X2 = new_df.drop(columns = exclude_cols, axis = 1)
y2 = new_df['churn']

In [170]:
#Split data into train and test sets
from sklearn.model_selection import train_test_split
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size = 0.2, random_state = 42)

In [171]:
print(X2_train.shape)
print(X2_test.shape)
print(y2_train.shape)
print(y2_test.shape)

(3289, 8)
(823, 8)
(3289,)
(823,)


In [172]:
#Instantiate the standardScaler class
scaler2 = StandardScaler()

#Scale the train set
scaler2.fit(X2_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [173]:
#Transform train and test sets
X2_train_scale = scaler2.transform(X2_train)
X2_test_scale = scaler2.transform(X2_test)

In [174]:
#Instantiate the LogisticRegression class
log_model_2 = LogisticRegression(random_state = 42)

In [175]:
#Fit the model
log_model_2.fit(X2_train_scale, y2_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [176]:
#Make predictions
y2_train_pred = log_model_2.predict(X2_train_scale)
y2_test_pred = log_model_2.predict(X2_test_scale)

In [177]:
#Get R2 score
r2_score(y2_test, y2_test_pred)

0.6391669711362806

# Decision Tree Classifier

In [180]:
from sklearn.tree import DecisionTreeClassifier

In [181]:
#Set variables
X3 = new_df.drop(columns = exclude_cols, axis = 1)
y3 = new_df['churn']

In [182]:
#Split data into train and test sets
from sklearn.model_selection import train_test_split
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size = 0.2, random_state = 42)

In [183]:
print(X2_train.shape)
print(X2_test.shape)
print(y2_train.shape)
print(y2_test.shape)

(3289, 8)
(823, 8)
(3289,)
(823,)


In [205]:
#Instantaite the class
dt_model = DecisionTreeClassifier(max_depth = 3, random_state = 42)

In [206]:
#Fit the model
dt_model.fit(X3_train, y3_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [207]:
#Make predictions
y3_train_pred = dt_model.predict(X3_train)
y3_test_pred = dt_model.predict(X3_test)

In [190]:
from sklearn.metrics import accuracy_score, recall_score

In [208]:
#Get Accuracy scores
accuracy_score(y3_test, y3_test_pred)

0.9270959902794653

In [209]:
#Get Recall score
recall_score(y3_test, y3_test_pred)

0.9764705882352941

In [210]:
dt_model.score(X3_test, y3_test)

0.9270959902794653

# Random Forest Classifier

In [195]:
from sklearn.ensemble import RandomForestClassifier

In [211]:
rf_model = RandomForestClassifier(max_depth = 3, random_state = 42)

In [212]:
rf_model.fit(X3_train, y3_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [213]:
y4_train_pred = rf_model.predict(X3_train)
y4_test_pred = rf_model.predict(X3_test)

In [214]:
accuracy_score(y3_test, y4_test_pred)

0.976913730255164

In [215]:
recall_score(y3_test, y4_test_pred)

0.9735294117647059

In [216]:
import shap

In [217]:
import eli5

The sklearn.metrics.scorer module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.
The sklearn.feature_selection.base module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.feature_selection. Anything that cannot be imported from sklearn.feature_selection is now part of the private API.


In [218]:
from xgboost import XGBClassifier

In [222]:
# set variables

X5 = new_df.drop(columns = exclude_cols, axis = 1)
y5 = new_df['churn']

In [223]:
X5_rest, X5_test, y5_rest, y5_test = train_test_split(X5,
                                                 y5,
                                                 random_state = 42,
                                                 test_size = 0.2)

In [224]:
#split rest set into train and validation set
X5_train, X5_val, y5_train, y5_val = train_test_split(X5_rest,
                                                  y5_rest,
                                                  random_state = 42,
                                                 test_size = 0.25)

In [225]:
xgb_model = XGBClassifier(max_depth = 6,
                         learning_rate = 0.3,
                         n_estimators = 100,
                         booster = 'gbtree',
                         scale_pos_weight = 1,
                         colsample_bytree = 1,
                         subsample = 1,
                         random_state = 42)

In [226]:
xgb_model.fit(X5_train, y5_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.3, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [227]:
xgb_model.score (X5_train, y5_train)

1.0

In [228]:
xgb_model.score (X5_val, y5_val)

0.9756986634264885

In [231]:
from sklearn.metrics import recall_score