In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')

In [6]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [7]:
print('Samples:' ,df.shape[0])
print('Features:' ,df.shape[1])

Samples: 1470
Features: 35


### Data Cleaning

In [8]:
# Removal of obviously useless features
df.drop(['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'], axis=1, inplace=True)

In [9]:
print('Features:' ,df.shape[1])

Features: 31


In [10]:
y = df['Attrition']
X = df.drop(['Attrition'], axis=1)

In [11]:
y.unique()

array(['Yes', 'No'], dtype=object)

In [12]:
df['Attrition'].value_counts()

No     1233
Yes     237
Name: Attrition, dtype: int64

In [13]:
from sklearn import preprocessing
le = preprocessing.LabelBinarizer()

In [14]:
y = le.fit_transform(y)

In [15]:
y.shape

(1470, 1)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 31 columns):
Age                         1470 non-null int64
Attrition                   1470 non-null object
BusinessTravel              1470 non-null object
DailyRate                   1470 non-null int64
Department                  1470 non-null object
DistanceFromHome            1470 non-null int64
Education                   1470 non-null int64
EducationField              1470 non-null object
EnvironmentSatisfaction     1470 non-null int64
Gender                      1470 non-null object
HourlyRate                  1470 non-null int64
JobInvolvement              1470 non-null int64
JobLevel                    1470 non-null int64
JobRole                     1470 non-null object
JobSatisfaction             1470 non-null int64
MaritalStatus               1470 non-null object
MonthlyIncome               1470 non-null int64
MonthlyRate                 1470 non-null int64
NumCompaniesWorked    

In [17]:
# inspect string based features
df.select_dtypes(['object'])

Unnamed: 0,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,OverTime
0,Yes,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Yes
1,No,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,No
2,Yes,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Yes
3,No,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Yes
4,No,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,No
5,No,Travel_Frequently,Research & Development,Life Sciences,Male,Laboratory Technician,Single,No
6,No,Travel_Rarely,Research & Development,Medical,Female,Laboratory Technician,Married,Yes
7,No,Travel_Rarely,Research & Development,Life Sciences,Male,Laboratory Technician,Divorced,No
8,No,Travel_Frequently,Research & Development,Life Sciences,Male,Manufacturing Director,Single,No
9,No,Travel_Rarely,Research & Development,Medical,Male,Healthcare Representative,Married,No


In [18]:
# optimize with for loop
# one hot encoding
ind_BusinessTravel = pd.get_dummies(df['BusinessTravel'], prefix='BusinessTravel')
ind_Department = pd.get_dummies(df['Department'], prefix='Department')
ind_EducationField = pd.get_dummies(df['EducationField'], prefix='EducationField')
ind_Gender = pd.get_dummies(df['Gender'], prefix='Gender')
ind_JobRole = pd.get_dummies(df['JobRole'], prefix='JobRole')
ind_MaritalStatus = pd.get_dummies(df['MaritalStatus'], prefix='MaritalStatus')
ind_OverTime = pd.get_dummies(df['OverTime'], prefix='OverTime')

In [19]:
ind_Department.head()

Unnamed: 0,Department_Human Resources,Department_Research & Development,Department_Sales
0,0,0,1
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0


In [20]:
# combines dummied categorical data with numeric data from original dataframe
df1 = pd.concat([ind_BusinessTravel, ind_Department, ind_EducationField, ind_Gender,
                ind_JobRole, ind_MaritalStatus, ind_OverTime, 
                df.select_dtypes(['int64'])], axis=1)

In [21]:
df1.head()

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,0,0,1,0,0,1,0,1,0,0,...,3,1,0,8,0,1,6,4,0,5
1,0,1,0,0,1,0,0,1,0,0,...,4,4,1,10,3,3,10,7,1,7
2,0,0,1,0,1,0,0,0,0,0,...,3,2,0,7,3,3,0,0,0,0
3,0,1,0,0,1,0,0,1,0,0,...,3,3,0,8,3,3,8,7,3,0
4,0,0,1,0,1,0,0,0,0,1,...,3,4,1,6,3,3,2,2,2,2


In [22]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [23]:
from sklearn.model_selection import cross_val_predict, cross_val_score

In [24]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    '''Print scoring metrics for classifier on training data'''
    if train:
        print("Train Result:\n")
        print("Accuracy score: {0:.4f}".format(accuracy_score(y_train, clf.predict(X_train))))
        print("Classification Report:\n {}".format(classification_report(y_train, clf.predict(X_train))))
        print("Confusion Matrix:\n {} \n".format(confusion_matrix(y_train, clf.predict(X_train))))
        
        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t {0:.4f}".format(np.std(res)))
        
      
    elif train == False:
        '''Print scoring metrics for classifier on test data'''  
        
        print("Test Result:\n")
        print("Accuracy score: {0:.4f}".format(accuracy_score(y_test, clf.predict(X_test))))
        print("Classification Report:\n {}".format(classification_report(y_test, clf.predict(X_test))))
        print("Confusion Matrix:\n {} \n".format(confusion_matrix(y_test, clf.predict(X_test))))

## Model 1: Decision Tree

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df1, y, test_size=0.25)

In [29]:
y_train = y_train.ravel()
y_test = y_test.ravel()

In [25]:
from sklearn.tree import DecisionTreeClassifier

In [26]:
tree_clf = DecisionTreeClassifier()

In [30]:
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [33]:
print_score(tree_clf, X_train, y_train, X_test, y_test, train=True)

Train Result:

Accuracy score: 1.0000
Classification Report:
              precision    recall  f1-score   support

          0       1.00      1.00      1.00       922
          1       1.00      1.00      1.00       180

avg / total       1.00      1.00      1.00      1102

Confusion Matrix:
 [[922   0]
 [  0 180]] 

Average accuracy: 	 0.7758
Accuracy SD: 	 0.0305


In [32]:
print_score(tree_clf, X_train, y_train, X_test, y_test, train=False)

Test Result:

Accuracy score: 0.7826
Classification Report:
              precision    recall  f1-score   support

          0       0.88      0.86      0.87       311
          1       0.32      0.37      0.34        57

avg / total       0.79      0.78      0.79       368

Confusion Matrix:
 [[267  44]
 [ 36  21]] 



In [34]:
## Model 2: Random Forest

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
rf_clf = RandomForestClassifier()

In [37]:
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [38]:
print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)

Train Result:

Accuracy score: 0.9773
Classification Report:
              precision    recall  f1-score   support

          0       0.97      1.00      0.99       922
          1       1.00      0.86      0.93       180

avg / total       0.98      0.98      0.98      1102

Confusion Matrix:
 [[922   0]
 [ 25 155]] 

Average accuracy: 	 0.8584
Accuracy SD: 	 0.0208


In [39]:
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

Test Result:

Accuracy score: 0.8533
Classification Report:
              precision    recall  f1-score   support

          0       0.86      0.99      0.92       311
          1       0.67      0.11      0.18        57

avg / total       0.83      0.85      0.81       368

Confusion Matrix:
 [[308   3]
 [ 51   6]] 



In [40]:
en_en = pd.DataFrame()

In [41]:
en_en['tree_clf'] = pd.DataFrame(tree_clf.predict_proba(X_train))[1]
en_en['rf_clf'] = pd.DataFrame(rf_clf.predict_proba(X_train))[1]

col_name = en_en.columns
en_en = pd.concat([en_en, pd.DataFrame(y_train).reset_index(drop=True)], axis=1)

In [43]:
en_en.head()

Unnamed: 0,tree_clf,rf_clf,0
0,0.0,0.0,0
1,0.0,0.0,0
2,0.0,0.1,0
3,1.0,0.7,1
4,1.0,0.9,1


In [44]:
tmp = list(col_name)
tmp.append('ind')
en_en.columns = tmp

In [47]:
en_en.head()

Unnamed: 0,tree_clf,rf_clf,ind
0,0.0,0.0,0
1,0.0,0.0,0
2,0.0,0.1,0
3,1.0,0.7,1
4,1.0,0.9,1


## Meta Classifier 

In [52]:
from sklearn.linear_model import LogisticRegression

In [53]:
m_clf = LogisticRegression(fit_intercept=False)

In [54]:
m_clf.fit(en_en[['tree_clf', 'rf_clf']], en_en['ind'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [55]:
en_test = pd.DataFrame()

In [56]:
# grabs n_classes from predict_proba -> probability itself
en_test['tree_clf'] = pd.DataFrame(tree_clf.predict_proba(X_test))[1]
en_test['rf_clf'] = pd.DataFrame(rf_clf.predict_proba(X_test))[1]

col_name = en_en.columns
en_test['combined'] = m_clf.predict(en_test[['tree_clf', 'rf_clf']])

In [57]:
col_name = en_test.columns
tmp = list(col_name)
tmp.append('ind')

In [59]:
en_test = pd.concat([en_test, pd.DataFrame(y_test).reset_index(drop=True)], axis=1)

In [60]:
en_test.columns = tmp

In [61]:
print(pd.crosstab(en_test['ind'], en_test['combined']))

combined    0   1
ind              
0         267  44
1          36  21


In [66]:
print(round(accuracy_score(en_test['ind'], en_test['combined']), 4))

0.7826


In [67]:
print(classification_report(en_test['ind'], en_test['combined']))

             precision    recall  f1-score   support

          0       0.88      0.86      0.87       311
          1       0.32      0.37      0.34        57

avg / total       0.79      0.78      0.79       368



## Single Classifier

In [84]:
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier

In [85]:
# ratio of imbalanced attrition class
class_weight = {0:0.834, 1:0.166}

In [86]:
pd.Series(list(y_train)).value_counts() / pd.Series(list(y_train)).count()

0    0.836661
1    0.163339
dtype: float64

In [87]:
forest = RandomForestClassifier(class_weight=class_weight, n_jobs=-1)

In [88]:
ada = AdaBoostClassifier(base_estimator=forest, n_estimators=1000,
                        learning_rate=0.5, random_state=42)

In [89]:
ada.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight={0: 0.834, 1: 0.166},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          learning_rate=0.5, n_estimators=1000, random_state=42)

In [91]:
print_score(ada, X_train, y_train, X_test, y_test, train=False)

Test Result:

Accuracy score: 0.8696
Classification Report:
              precision    recall  f1-score   support

          0       0.87      1.00      0.93       311
          1       1.00      0.16      0.27        57

avg / total       0.89      0.87      0.83       368

Confusion Matrix:
 [[311   0]
 [ 48   9]] 



In [92]:
# can add a Bagging Classifier on top of this for even more fun.