In [1]:
import os 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np  
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Read in csv data
attrition_df = pd.read_csv("data/IBM_attrition_data.csv")
attrition_df 

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [11]:
# Set X and y values
X = attrition_df.drop(['Attrition'], axis=1)
X = pd.get_dummies(X)
y = attrition_df['Attrition']

X

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,41,1102,1,2,1,1,2,94,3,2,...,0,0,1,0,0,0,1,1,0,1
1,49,279,8,1,1,2,3,61,2,2,...,0,1,0,0,0,1,0,1,1,0
2,37,1373,2,2,1,4,4,92,2,1,...,0,0,0,0,0,0,1,1,0,1
3,33,1392,3,4,1,5,4,56,3,1,...,0,1,0,0,0,1,0,1,0,1
4,27,591,2,1,1,7,1,40,3,1,...,0,0,0,0,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,884,23,2,1,2061,3,41,4,2,...,0,0,0,0,0,1,0,1,1,0
1466,39,613,6,1,1,2062,4,42,2,3,...,0,0,0,0,0,1,0,1,1,0
1467,27,155,4,3,1,2064,2,87,4,2,...,0,0,0,0,0,1,0,1,0,1
1468,49,1023,2,3,1,2065,4,63,2,2,...,0,0,1,0,0,1,0,1,1,0


In [17]:
X_cleaned = X.drop(["OverTime_No", "Gender_Male"], axis=1)
X_cleaned

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_Yes
0,41,1102,1,2,1,1,2,94,3,2,...,0,0,0,1,0,0,0,1,1,1
1,49,279,8,1,1,2,3,61,2,2,...,0,0,1,0,0,0,1,0,1,0
2,37,1373,2,2,1,4,4,92,2,1,...,0,0,0,0,0,0,0,1,1,1
3,33,1392,3,4,1,5,4,56,3,1,...,0,0,1,0,0,0,1,0,1,1
4,27,591,2,1,1,7,1,40,3,1,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,884,23,2,1,2061,3,41,4,2,...,0,0,0,0,0,0,1,0,1,0
1466,39,613,6,1,1,2062,4,42,2,3,...,0,0,0,0,0,0,1,0,1,0
1467,27,155,4,3,1,2064,2,87,4,2,...,1,0,0,0,0,0,1,0,1,1
1468,49,1023,2,3,1,2065,4,63,2,2,...,0,0,0,1,0,0,1,0,1,0


In [18]:

print(X_cleaned.shape, y.shape)

(1470, 53) (1470,)


In [19]:

# Train and test split data
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y, random_state=42)

X_train.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_Yes
1343,29,592,7,3,1,1883,4,59,3,1,...,0,0,0,0,0,0,0,1,1,0
1121,36,884,1,4,1,1585,2,73,3,2,...,0,0,0,1,0,0,0,1,1,0
1048,34,1326,3,3,1,1478,4,81,1,2,...,0,0,0,1,0,0,0,1,1,0
1393,27,954,9,3,1,1965,4,44,3,2,...,0,0,0,1,0,0,0,1,1,0
527,32,929,10,3,1,722,4,55,3,2,...,0,0,0,1,0,0,0,1,1,0


In [6]:
# Scale data 
#X_scaler = StandardScaler().fit(X_train)


#X_train_scaled = X_scaler.fit_transform(X_train)
#X_test_scaled = X_scaler.transform(X_test)

In [20]:
# Random Forest w/ no Hyperparameter Tuning
model = RandomForestClassifier(n_estimators=200, criterion='gini', min_samples_split=5, min_samples_leaf=2, max_features='auto', bootstrap=True, n_jobs=-1, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [21]:
labels = X_cleaned.columns
model_importances = model.feature_importances_
model_importances_array = zip(model_importances, labels)
sorted(model_importances_array, reverse=True)[:10]

[(0.08026483116726953, 'MonthlyIncome'),
 (0.06997669663682671, 'OverTime_Yes'),
 (0.057930742305350155, 'Age'),
 (0.05044817695832942, 'TotalWorkingYears'),
 (0.04486888241538795, 'DailyRate'),
 (0.04445703604820404, 'EmployeeNumber'),
 (0.04168846486651877, 'MonthlyRate'),
 (0.04167191291121488, 'YearsAtCompany'),
 (0.039399566595043865, 'DistanceFromHome'),
 (0.0385548445254479, 'HourlyRate')]

In [100]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred))

[[304  16]
 [ 44   4]]
0.8369565217391305


In [90]:
# Random Forest Hyperparameter tuning
# Number of trees in random forest
n_estimators = [int(x) for x in range(200,2000,200)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
from pprint import pprint
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800]}


In [101]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train_scaled, y_train.ravel())

#Best Params
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   35.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.2min finished


{'n_estimators': 200,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 60,
 'bootstrap': True}

In [102]:
# Random Model Accuracy
from sklearn import metrics

def evaluate(model, test_features, test_labels):
    y_pred = model.predict(X_test_scaled)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print (accuracy)
    print(metrics.confusion_matrix(y_test,y_pred))

best_random = rf_random.best_estimator_
evaluate(best_random, X_test_scaled, y_test)

0.8614130434782609
[[315   5]
 [ 46   2]]
