In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score

def default_random_forest(df):
    y = df.pop('attrition').values
    X = df.values
    X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)
    
    y_preds = rf.predict(X_test)
    
    accuracy_score_ = accuracy_score(y_test, y_preds)
    precision_score_ = precision_score(y_test, y_preds)
    recall_score_ = recall_score(y_test, y_preds)
    
    return f'Accuracy Score: {accuracy_score_}', f'Precision Score: {precision_score_}', f'Recall Score: {recall_score_}'

In [2]:
df = pd.read_csv('./data/clean_one_hot_data.csv')

In [3]:
default_random_forest(df)

('Accuracy Score: 0.8668478260869565',
 'Precision Score: 0.9166666666666666',
 'Recall Score: 0.1864406779661017')

In [4]:
# The default random forest with number of trees = 100 is at about 86.6% accurate which is only
# slightly better than creating a RF by just purely guessing (84%) so will use SMOTE to account for this imbalance

In [11]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)
[K     |████████████████████████████████| 167 kB 1.3 MB/s eta 0:00:01
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.7.0


In [24]:
from imblearn.over_sampling import SMOTE

df = pd.read_csv("./data/clean_one_hot_data.csv")
df.drop(df.columns[0], axis=1,inplace=True)

In [25]:
df.head()

Unnamed: 0,age,attrition,business_travel,daily_rate,distance_from_home,education,environment_satisfaction,gender,hourly_rate,job_involvement,...,job_role_sales representative,department_research & development,department_sales,education_field_life sciences,education_field_marketing,education_field_medical,education_field_other,education_field_technical degree,marital_status_married,marital_status_single
0,41,1,1,1102,1,2,2,0,94,3,...,0,0,1,1,0,0,0,0,0,1
1,49,0,2,279,8,1,3,1,61,2,...,0,1,0,1,0,0,0,0,1,0
2,37,1,1,1373,2,2,4,1,92,2,...,0,1,0,0,0,0,1,0,0,1
3,33,0,2,1392,3,4,4,0,56,3,...,0,1,0,1,0,0,0,0,1,0
4,27,0,1,591,2,1,1,1,40,3,...,0,1,0,0,0,1,0,0,1,0


In [26]:
y = df.pop('attrition').values
X = df.values
# smt = SMOTE(random_state=42)
# X_train_SMOTE, y_train_SMOTE = smt.fit_sample(X_train, y_train)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

In [28]:
smt = SMOTE(random_state=42)

In [29]:
X_train_SMOTE, y_train_SMOTE = smt.fit_sample(X_train, y_train)

In [30]:
rf = RandomForestClassifier(random_state=42)

In [31]:
rf.fit(X_train_SMOTE, y_train_SMOTE)

RandomForestClassifier(random_state=42)

In [32]:
y_preds = rf.predict(X_test)

In [33]:
accuracy_score(y_test, y_preds)

0.8722826086956522

In [34]:
# With SMOTE, I received an OOB Accuracy Score of 87.2%

In [36]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_preds)

array([[308,  12],
       [ 35,  13]])

In [37]:
df = pd.read_csv("./data/clean_one_hot_data.csv")
df.drop(df.columns[0], axis=1,inplace=True)

In [38]:
def smote_default_random_forest(df):
    y = df.pop('attrition').values
    X = df.values
    X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)
    smt = SMOTE(random_state=42)
    X_train_SMOTE, y_train_SMOTE = smt.fit_sample(X_train, y_train)
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train_SMOTE, y_train_SMOTE)
    y_preds = rf.predict(X_test)
    
    accuracy_score_ = accuracy_score(y_test, y_preds)
    precision_score_ = precision_score(y_test, y_preds)
    recall_score_ = recall_score(y_test, y_preds)
    
    return f'Accuracy Score: {accuracy_score_}', f'Precision Score: {precision_score_}', f'Recall Score: {recall_score_}'

In [39]:
smote_default_random_forest(df)

('Accuracy Score: 0.8722826086956522',
 'Precision Score: 0.52',
 'Recall Score: 0.2708333333333333')

In [40]:
def smote_1000_tree_random_forest(df):
    y = df.pop('attrition').values
    X = df.values
    X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)
    smt = SMOTE(random_state=42)
    X_train_SMOTE, y_train_SMOTE = smt.fit_sample(X_train, y_train)
    rf = RandomForestClassifier(n_estimators=1000,random_state=42)
    rf.fit(X_train_SMOTE, y_train_SMOTE)
    y_preds = rf.predict(X_test)
    
    accuracy_score_ = accuracy_score(y_test, y_preds)
    precision_score_ = precision_score(y_test, y_preds)
    recall_score_ = recall_score(y_test, y_preds)
    
    return f'Accuracy Score: {accuracy_score_}', f'Precision Score: {precision_score_}', f'Recall Score: {recall_score_}'

In [42]:
df = pd.read_csv("./data/clean_one_hot_data.csv")
df.drop(df.columns[0], axis=1,inplace=True)
smote_1000_tree_random_forest(df)

('Accuracy Score: 0.8586956521739131',
 'Precision Score: 0.42857142857142855',
 'Recall Score: 0.25')

In [44]:
# Interesting that a 1000 tree forest decreased the accuracy, precision score and recall score

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score

df = pd.read_csv("./data/clean_one_hot_data.csv")
df.drop(df.columns[0], axis=1,inplace=True)

In [5]:
# Tuning the hyperparameter class_weight to see if I can improve the Recall score

def default_random_forest(df):
    y = df.pop('attrition').values
    X = df.values
    X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)
    rf = RandomForestClassifier(random_state=42, class_weight = 'balanced_subsample')
    rf.fit(X_train, y_train)
    
    y_preds = rf.predict(X_test)
    
    accuracy_score_ = accuracy_score(y_test, y_preds)
    precision_score_ = precision_score(y_test, y_preds)
    recall_score_ = recall_score(y_test, y_preds)
    
    return f'Accuracy Score: {accuracy_score_}', f'Precision Score: {precision_score_}', f'Recall Score: {recall_score_}'

In [4]:
default_random_forest(df) #with class_weight = 'balanced'

('Accuracy Score: 0.8804347826086957',
 'Precision Score: 0.8333333333333334',
 'Recall Score: 0.10416666666666667')

In [8]:
df = pd.read_csv("./data/clean_one_hot_data.csv")
df.drop(df.columns[0], axis=1,inplace=True)
default_random_forest(df) #with class_weight = "balanced_subsample"

('Accuracy Score: 0.8804347826086957',
 'Precision Score: 1.0',
 'Recall Score: 0.08333333333333333')

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix

import matplotlib.pyplot as plt
plt.style.use("ggplot")

df = pd.read_csv("./data/clean_one_hot_data.csv")
df.drop(df.columns[0], axis=1,inplace=True)

y = df.pop('attrition').values
X = df.values
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

rf = RandomForestClassifier(n_jobs=-1)

param_grid = {
    'min_samples_split': [3, 5, 10], 
    'n_estimators' : [100, 300],
    'max_depth': [3, 5, 15, 25],
    'max_features': [3, 5, 10, 20]
}

scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}

In [2]:
def grid_search_wrapper(refit_score='recall_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    skf = StratifiedKFold(n_splits=10)
    grid_search = GridSearchCV(rf, param_grid, scoring=scorers, refit=refit_score,
                           cv=skf, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # make the predictions
    y_pred = grid_search.predict(X_test)

    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # confusion matrix on the test data.
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    print(pd.DataFrame(confusion_matrix(y_test, y_pred),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
    return grid_search

In [3]:
rf_grid_search = grid_search_wrapper(refit_score='recall_score')

Best params for recall_score
{'max_depth': 25, 'max_features': 20, 'min_samples_split': 3, 'n_estimators': 100}

Confusion matrix of Random Forest optimized for recall_score on the test data:
     pred_neg  pred_pos
neg       317         3
pos        43         5
