In [149]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
import numpy.random as rand
from itertools import islice
from sklearn.pipeline import Pipeline
from sklearn.ensemble import (GradientBoostingClassifier, 
                              AdaBoostClassifier,
                              RandomForestClassifier)
import sklearn.datasets as datasets
import sklearn.model_selection as cv
import sklearn.metrics as metrics
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

plt.style.use('ggplot')

from pylab import rcParams
rcParams['figure.figsize'] = (9, 7)
# from IPython.display import HTML
from DataCleaning import data_cleaning

In [124]:
X, y = data_cleaning('data/churn_train.csv')

In [125]:
X_train, X_test, y_train, y_test = cv.train_test_split(X, y)

In [126]:
rfc = RandomForestClassifier(n_estimators=100, max_depth=2, n_jobs=-1)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [128]:
def model_scores(X, y, estimator, p=False, train=False):
    if train:
        scores = cv.cross_validate(estimator, X, y, scoring=['accuracy', 'precision', 'recall', 'neg_log_loss'], cv=5)
        acc = np.mean(scores['test_accuracy'])
        precision = np.mean(scores['test_precision'])
        recall = np.mean(scores['test_recall'])
        log_loss = -np.mean(scores['test_neg_log_loss'])
    else:
        y_hat = estimator.predict(X)
        acc = metrics.accuracy_score(y, y_hat)
        precision = metrics.precision_score(y, y_hat)
        recall = metrics.recall_score(y, y_hat)
        log_loss = metrics.log_loss(y, estimator.predict_proba(X))
    if p:
        print ("Accuracy: {0:2.3} | Precision: {1:2.3} | Recall: {2:2.3} | Log_loss: {3:2.3}".format(
                                                                    acc, precision, recall, log_loss))                                                           
    else:
        return acc, precision, recall, log_loss

In [148]:
print ('Train_Data')
model_scores(X_train, y_train, rfc, p=True, train=False)

Train_Data
Accuracy: 0.728 | Precision: 0.75 | Recall: 0.41 | Log_loss: 0.559


In [130]:
print ('Test_Data')
model_scores(X_test, y_test, rfc, p=True)

Test_Data
Accuracy: 0.735 | Precision: 0.775 | Recall: 0.424 | Log_loss: 0.555


In [132]:
1 - y.mean()

0.6242

In [None]:
null
churn = .625
retain = .375

In [133]:
metrics.confusion_matrix(y_test, rfc.predict(X_test))

array([[5742,  468],
       [2182, 1608]])

In [135]:
1608 / (2182+1608)

0.42427440633245384

In [136]:
from sklearn.linear_model import LogisticRegression

In [138]:
LR = LogisticRegression(solver='lbfgs')
LR.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [139]:
metrics.confusion_matrix(y_test, LR.predict(X_test))

array([[5283,  927],
       [1876, 1914]])

In [142]:
metrics.accuracy_score(y_test, LR.predict(X_test))

0.7197

In [146]:
null_y = np.ones(len(y_test*y_test.mean()))

In [147]:
metrics.confusion_matrix(y_test, null_y)

array([[   0, 6210],
       [   0, 3790]])

In [None]:
logistic_regression_grid = {'fit_intercept':
                            'solver': ['lbfgs', 'saga'],
                            'random_state': [2]}

lr_gridsearch = cv.GridSearchCV(LogisticRegression(),
                                logistic_regression_grid,
                                n_jobs=-1,
                                verbose=True,
                                scoring='neg_log_loss',
                                cv=5)
lr_gridsearch.fit(X_train, y_train)

print("best parameters:        ", lr_gridsearch.best_params_)

best_estimator = rf_gridsearch.best_estimator_
print('best estimator:         ', best_estimator)

best_prediction = best_estimator.predict(X_test)
print('Log_loss of best estimator:  ', metrics.log_loss(y_test, best_prediction))

In [151]:
random_forest_grid = {'max_depth': [1, 2, None],
                      'max_features': ['sqrt', 'log2', None],
                      'min_samples_split': [2, 4],
                      'min_samples_leaf': [1, 2, 4],
                      'bootstrap': [True, False],
                      'n_estimators': [30, 50, 75, 100],
                      'random_state': [2]}

rf_gridsearch = cv.GridSearchCV(RandomForestClassifier(),
                             random_forest_grid,
                             n_jobs=-1,
                             verbose=True,
                             scoring='neg_log_loss',
                             cv=5)
rf_gridsearch.fit(X_train, y_train)

print("best parameters:        ", rf_gridsearch.best_params_)

best_estimator = rf_gridsearch.best_estimator_
print('best estimator:         ', best_estimator)

best_prediction = best_estimator.predict(X_test)

In [154]:
print('Score of estimator:  ', metrics.recall_score(y_test, best_prediction))

Log_loss of best estimator:   0.6617414248021108


In [None]:
gradient_boosted_grid = {'max_depth': [1, 2, None],
                         'max_features': ['sqrt', 'log2', None],
                         'min_samples_split': [2, 4],
                         'min_samples_leaf': [1, 2, 4],
                         'bootstrap': [True, False],
                         'n_estimators': [30, 50, 75, 100],
                         'random_state': [2]}

gbr_gridsearch = cv.GridSearchCV(GradientBoostingClassifier(),
                             gradient_boosted_grid,
                             n_jobs=-1,
                             verbose=True,
                             scoring='neg_log_loss',
                             cv=5)
gbr_gridsearch.fit(X_train, y_train)

print("best parameters:        ", rf_gridsearch.best_params_)

best_estimator = rf_gridsearch.best_estimator_
print('best estimator:         ', best_estimator)

best_prediction = best_estimator.predict(X_test)
print('Log_loss of best estimator:  ', metrics.log_loss(y_test, best_prediction))

In [None]:
X_test, y_test = data_cleaning('data/churn_test.csv')

In [None]:
best_model.fit(X, y)
best_model.predict(y_test)