In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats
from statsmodels.stats.proportion import proportions_ztest
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from collections import Counter
from imblearn.under_sampling import TomekLinks
import pickle

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)


churn = pd.read_csv('churn.csv', index_col=0)

In [2]:
X = churn.drop('churn', axis = 1)
y = churn.churn

In [3]:
y.value_counts()

0    2850
1     483
Name: churn, dtype: int64

# Decision Tree - Grid Search

In [4]:
X_traintl, X_testtl, y_traintl, y_testtl = train_test_split(X, y, random_state=1)

In [5]:
y_traintl.value_counts()

0    2138
1     361
Name: churn, dtype: int64

In [6]:
## I am going to use TomekLinks to downsample because I have a significant class imbalance 
## in my churn variable.

tl = TomekLinks()
X_res, y_res = tl.fit_resample(X_traintl, y_traintl)
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({0: 2000, 1: 361})


In [7]:
tl = TomekLinks()
X_res, y_res = tl.fit_sample(X_traintl, y_traintl)


In [8]:
param_dict={'max_depth': range(1,10,1),'criterion': ['gini','entropy'], 'min_samples_leaf' : range(10,40,1), 'max_leaf_nodes': range(0,30,1), 'class_weight': ['balanced']}

In [9]:
gs = GridSearchCV(DecisionTreeClassifier(random_state=1),
                  param_grid= param_dict,
                  scoring='f1', cv=3,verbose = 1, n_jobs=-1)

In [10]:
gs.fit(X_res, y_res)

Fitting 3 folds for each of 16200 candidates, totalling 48600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 1100 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 5100 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 10700 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 17900 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done 26700 tasks      | elapsed:   35.7s
[Parallel(n_jobs=-1)]: Done 37100 tasks      | elapsed:   49.2s
[Parallel(n_jobs=-1)]: Done 48600 out of 48600 | elapsed:  1.1min finished


GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=1), n_jobs=-1,
             param_grid={'class_weight': ['balanced'],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': range(1, 10),
                         'max_leaf_nodes': range(0, 30),
                         'min_samples_leaf': range(10, 40)},
             scoring='f1', verbose=1)

In [11]:
gs.best_score_

0.7443197808703079

In [12]:
gs.best_params_

{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': 6,
 'max_leaf_nodes': 21,
 'min_samples_leaf': 10}

In [23]:
y_predsgs=gs.best_estimator_.predict(X_testtl)


In [40]:
recall_score(y_testtl,y_predsgs)

0.7622950819672131

In [41]:
f1_score(y_testtl,y_predsgs)

0.7153846153846154

In [60]:
mod = open('gs.pkl', 'wb')
pickle.dump(gs.best_estimator_, mod)
mod.close()

## - Second Decision Tree Grid Search
## - This time scoring on Recall instead of F1

In [80]:
gs2 = GridSearchCV(DecisionTreeClassifier(random_state=1),
                  param_grid= param_dict,
                  scoring='recall', cv=3,verbose = 1, n_jobs=-1)

In [81]:
gs2.fit(X_res, y_res)

Fitting 3 folds for each of 16200 candidates, totalling 48600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 920 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 4920 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 10520 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done 17720 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done 26520 tasks      | elapsed:   37.1s
[Parallel(n_jobs=-1)]: Done 36920 tasks      | elapsed:   51.3s
[Parallel(n_jobs=-1)]: Done 48600 out of 48600 | elapsed:  1.2min finished


GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=1), n_jobs=-1,
             param_grid={'class_weight': ['balanced'],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': range(1, 10),
                         'max_leaf_nodes': range(0, 30),
                         'min_samples_leaf': range(10, 40)},
             scoring='recall', verbose=1)

In [83]:
gs2.best_score_

0.8504361799816346

In [84]:
gs2.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 4,
 'max_leaf_nodes': 6,
 'min_samples_leaf': 10}

In [85]:
y_predsgs2=gs2.best_estimator_.predict(X_testtl)


In [86]:
recall_score(y_testtl,y_predsgs2)

0.7459016393442623

In [87]:
f1_score(y_testtl,y_predsgs2)

0.5723270440251572

## - Though it got a higher score on the training set, it did worse on the test set
## - It appears it overfits to training data when I use recall score
## - I will keep the 1st Decision Tree GridSearch and I won't save this one

# Random Forest - Grid Search

## For my first Random Forest Grid Search, I will be scoring on F1

In [15]:
rf = RandomForestClassifier(class_weight='balanced', random_state=1)
param_dictrf={'n_estimators': [100,200], 'max_depth': range(3,10,1), 'criterion': ['gini','entropy'],'min_samples_leaf' : range(5,20,1), 'max_leaf_nodes': range(0,30,1)}
gs_forest=GridSearchCV(rf,param_dictrf,scoring='f1',cv=3,verbose=1,n_jobs=-2)
gs_forest.fit(X_res,y_res)

Fitting 3 folds for each of 12600 candidates, totalling 37800 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-2)]: Done 334 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-2)]: Done 834 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-2)]: Done 1534 tasks      | elapsed:   55.6s
[Parallel(n_jobs=-2)]: Done 2339 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-2)]: Done 2889 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-2)]: Done 3539 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-2)]: Done 4289 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-2)]: Done 5139 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-2)]: Done 6089 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-2)]: Done 7139 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-2)]: Done 8289 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-2)]: Done 9539 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-2)]: Done 10889 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-2)]: Done 12339 tasks    

GridSearchCV(cv=3,
             estimator=RandomForestClassifier(class_weight='balanced',
                                              random_state=1),
             n_jobs=-2,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(3, 10),
                         'max_leaf_nodes': range(0, 30),
                         'min_samples_leaf': range(5, 20),
                         'n_estimators': [100, 200]},
             scoring='f1', verbose=1)

In [16]:
gs_forest.best_params_

{'criterion': 'entropy',
 'max_depth': 6,
 'max_leaf_nodes': 29,
 'min_samples_leaf': 5,
 'n_estimators': 200}

In [45]:
gs_forest.best_estimator_.feature_importances_

array([0.02577201, 0.13089939, 0.05739601, 0.02420921, 0.2853505 ,
       0.02119778, 0.08372057, 0.02053281, 0.0414663 , 0.04456235,
       0.05409738, 0.2072108 , 0.00358489])

In [76]:
gs_forest.best_score_

y_predsgsf=gs_forest.best_estimator_.predict(X_testtl)
recall_score(y_testtl,y_predsgsf)

0.819672131147541

In [75]:
gs_forest.best_score_

y_predsgsf=gs_forest.best_estimator_.predict(X_testtl)
recall_score(y_testtl,y_predsgsf)

f1_score(y_testtl,y_predsgsf)


0.7299270072992702

In [74]:

mod = open('gs_forest.pkl', 'wb')
pickle.dump(gs_forest.best_estimator_, mod)
mod.close()

## For my second Random Forest Grid Search, I will be scoring on recall

In [72]:
rf2 = RandomForestClassifier(class_weight='balanced', random_state=1)
param_dictrf2={'n_estimators': [100,200], 'max_depth': range(3,7,1), 'criterion': ['gini','entropy'],'min_samples_leaf' : range(1,27,1), 'max_leaf_nodes': range(20,32,1)}
gs_forest2=GridSearchCV(rf2,param_dictrf2,scoring='recall',cv=3,verbose=1,n_jobs=-2)
gs_forest2.fit(X_res,y_res)

Fitting 3 folds for each of 4992 candidates, totalling 14976 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done  28 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-2)]: Done 178 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-2)]: Done 428 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-2)]: Done 778 tasks      | elapsed:   33.7s
[Parallel(n_jobs=-2)]: Done 1228 tasks      | elapsed:   54.2s
[Parallel(n_jobs=-2)]: Done 1778 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-2)]: Done 2428 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-2)]: Done 3178 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-2)]: Done 4028 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-2)]: Done 4978 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-2)]: Done 6028 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-2)]: Done 7178 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-2)]: Done 8428 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-2)]: Done 9778 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-2)]: Done 11228 tasks      

GridSearchCV(cv=3,
             estimator=RandomForestClassifier(class_weight='balanced',
                                              random_state=1),
             n_jobs=-2,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(3, 7),
                         'max_leaf_nodes': range(20, 32),
                         'min_samples_leaf': range(1, 27),
                         'n_estimators': [100, 200]},
             scoring='recall', verbose=1)

In [73]:
gs_forest2.best_score_

0.8421028466483013

In [77]:
y_predsgsf2=gs_forest2.best_estimator_.predict(X_testtl)
recall_score(y_testtl,y_predsgsf2)

0.8278688524590164

In [78]:
f1_score(y_testtl,y_predsgsf2)

0.6579804560260586

## I will be saving both the 1st and 2nd Random Forest Grid Search. Although the second Grid Search had a higher recall, it came at a slight hit to F1. I am primarily interested in Recall but do not want a poor F1, either. I will be saving both models to use in my voting classifier and see what each model can contribute.


In [79]:
mod = open('gs_forest2.pkl', 'wb')
pickle.dump(gs_forest2.best_estimator_, mod)
mod.close()

## Voting Classifier

In [88]:
knn3 = pickle.load(open('knn3.pkl', 'rb'))

In [89]:
X_trainvc, X_testvc, y_trainvc, y_testvc = train_test_split(X, y, random_state=1)

In [90]:
vclf = VotingClassifier(estimators=[('gs_forest', gs_forest), ('knn3', knn3), ('gs', gs), ('gs_forest2', gs_forest2)], voting = 'hard')



In [92]:
vclf = vclf.fit(X_trainvc,y_trainvc)

Fitting 3 folds for each of 12600 candidates, totalling 37800 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done  28 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-2)]: Done 213 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-2)]: Done 711 tasks      | elapsed:   24.6s
[Parallel(n_jobs=-2)]: Done 1064 tasks      | elapsed:   38.8s
[Parallel(n_jobs=-2)]: Done 1514 tasks      | elapsed:   57.0s
[Parallel(n_jobs=-2)]: Done 2064 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-2)]: Done 2714 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-2)]: Done 3464 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-2)]: Done 4314 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-2)]: Done 5264 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-2)]: Done 6314 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-2)]: Done 7464 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-2)]: Done 8714 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-2)]: Done 10064 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-2)]: Done 11514 tasks    

Fitting 3 folds for each of 16200 candidates, totalling 48600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 1640 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 5640 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 11240 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 18440 tasks      | elapsed:   26.5s
[Parallel(n_jobs=-1)]: Done 27240 tasks      | elapsed:   38.4s
[Parallel(n_jobs=-1)]: Done 37640 tasks      | elapsed:   52.3s
[Parallel(n_jobs=-1)]: Done 48600 out of 48600 | elapsed:  1.2min finished
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.


Fitting 3 folds for each of 4992 candidates, totalling 14976 fits


[Parallel(n_jobs=-2)]: Done  28 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-2)]: Done 178 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-2)]: Done 428 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-2)]: Done 778 tasks      | elapsed:   33.7s
[Parallel(n_jobs=-2)]: Done 1228 tasks      | elapsed:   52.6s
[Parallel(n_jobs=-2)]: Done 1778 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-2)]: Done 2428 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-2)]: Done 3178 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-2)]: Done 4028 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-2)]: Done 4978 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-2)]: Done 6028 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-2)]: Done 7178 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-2)]: Done 8428 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-2)]: Done 9778 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-2)]: Done 11228 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-2)]: Done 12778 tasks      | elapsed: 10

In [93]:
preds = vclf.predict(X_testvc)

In [94]:
f1_score(y_testvc, preds)

0.7389558232931727

In [95]:
recall_score(y_testvc, preds)

0.7540983606557377

In [58]:
mod = open('vclf.pkl', 'wb')
pickle.dump(vclf, mod)
mod.close()

In [96]:
vclf2 = VotingClassifier(estimators=[('gs_forest', gs_forest), ('knn3', knn3), ('gs', gs), ('gs_forest2', gs_forest2)], voting = 'soft')
vclf2 = vclf2.fit(X_trainvc,y_trainvc)

Fitting 3 folds for each of 12600 candidates, totalling 37800 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-2)]: Done 334 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-2)]: Done 834 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-2)]: Done 1534 tasks      | elapsed:   56.2s
[Parallel(n_jobs=-2)]: Done 2086 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-2)]: Done 2636 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-2)]: Done 3286 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-2)]: Done 4036 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-2)]: Done 4886 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-2)]: Done 5836 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-2)]: Done 6886 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-2)]: Done 8036 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-2)]: Done 9286 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-2)]: Done 10636 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-2)]: Done 12086 tasks    

Fitting 3 folds for each of 16200 candidates, totalling 48600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 1640 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 5640 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 11240 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done 18440 tasks      | elapsed:   25.6s
[Parallel(n_jobs=-1)]: Done 27240 tasks      | elapsed:   37.5s
[Parallel(n_jobs=-1)]: Done 37640 tasks      | elapsed:   51.3s
[Parallel(n_jobs=-1)]: Done 48600 out of 48600 | elapsed:  1.1min finished
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.


Fitting 3 folds for each of 4992 candidates, totalling 14976 fits


[Parallel(n_jobs=-2)]: Done  28 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-2)]: Done 178 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-2)]: Done 428 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-2)]: Done 778 tasks      | elapsed:   33.0s
[Parallel(n_jobs=-2)]: Done 1228 tasks      | elapsed:   51.5s
[Parallel(n_jobs=-2)]: Done 1778 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-2)]: Done 2428 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-2)]: Done 3178 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-2)]: Done 4028 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-2)]: Done 4978 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-2)]: Done 6028 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-2)]: Done 7178 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-2)]: Done 8428 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-2)]: Done 9778 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-2)]: Done 11228 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-2)]: Done 12778 tasks      | elapsed:  9

In [97]:
preds2 = vclf2.predict(X_testvc)

In [99]:
f1_score(y_testvc, preds2)

0.7603305785123967

In [100]:
recall_score(y_testvc, preds2)

0.7540983606557377

In [65]:
mod = open('vclf2.pkl', 'wb')
pickle.dump(vclf2, mod)
mod.close()

In [71]:
vclf2.n_features_in_

13