In [96]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
import numpy.random as rand
from itertools import islice
from sklearn.pipeline import Pipeline
from sklearn.ensemble import (GradientBoostingRegressor, 
                              GradientBoostingClassifier, 
                              AdaBoostClassifier,
                              RandomForestClassifier, 
                              RandomForestRegressor)
import sklearn.datasets as datasets
import sklearn.model_selection as cv
import sklearn.metrics as metrics
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

plt.style.use('ggplot')

from pylab import rcParams
rcParams['figure.figsize'] = (9, 7)
# from IPython.display import HTML

In [51]:
df = pd.read_csv('data/churn_train.csv')
ltd = df['last_trip_date']
df.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct
0,6.94,5.0,5.0,1.0,Astapor,2014-05-03,Android,2014-01-12,0.0,0,False,100.0
1,8.06,5.0,5.0,1.0,Astapor,2014-01-26,Android,2014-01-25,0.0,2,True,0.0
2,21.5,4.0,,1.0,Winterfell,2014-05-21,iPhone,2014-01-02,0.0,1,True,100.0
3,9.46,5.0,,2.75,Winterfell,2014-01-10,Android,2014-01-09,100.0,1,False,100.0
4,13.77,5.0,,1.0,Winterfell,2014-05-13,iPhone,2014-01-31,0.0,0,False,100.0


In [53]:
churn = ltd.apply(lambda s: s.split('-')[1] == '06')
churn.value_counts()

In [54]:
phone = df.phone.apply(lambda ph: ph=='iPhone')
Astapor, Winterfell, KingsLanding = df.city.apply(lambda c: c=='Astapor'), df.city.apply(lambda c: c=='Winterfell'), df.city.apply(lambda c: c=="King's Landing")

In [72]:
df['luxury_car_user'] = df['luxury_car_user'].apply(lambda s: s==True)
df['avg_rating_of_driver'] = df['avg_rating_of_driver'].fillna(4.5)
df['avg_rating_by_driver'] = df['avg_rating_by_driver'].fillna(5.0)

In [65]:
df['Astapor'], df['Winterfell'], df["King's Landing"] = Astapor, Winterfell, KingsLanding
df['phone'] = phone

In [73]:
X = df.drop(columns=['city','last_trip_date', 'signup_date']).values
y = churn

In [74]:
X_train, X_test, y_train, y_test = cv.train_test_split(X, y)

In [76]:
rfc = RandomForestClassifier(n_estimators=100, max_depth=2, n_jobs=-1)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [77]:
rfc.predict(X_test)

array([ True, False, False, ..., False, False,  True])

In [102]:
def model_scores(X, y, estimator, p=False, t=False):
    if t:
        scores = cv.cross_validate(estimator, X, y, scoring=['accuracy', 'f1', 'neg_log_loss'], cv=5)
        acc = np.mean(scores['test_accuracy'])
        f_1 = np.mean(scores['test_f1'])
        log_loss = -np.mean(scores['test_neg_log_loss'])
    else:
        y_hat = estimator.predict(X)
        acc = metrics.accuracy_score(y, y_hat)
        f_1 = metrics.f1_score(y, y_hat)
        log_loss = metrics.log_loss(y, y_hat)
    if p:
        print ("Accuracy: {0:2.3} | F1: {1:2.3} | Log_loss: {2:2.3}".format(
                                                                    acc, f_1, log_loss))                                                           
    else:
        return acc, f_1, brier, log_loss

In [103]:
print ('Train_Data')
model_scores(X_train, y_train, rfc, p=True, t=True)

Train_Data
Accuracy: 0.727 | F1: 0.507 | Log_loss: 0.561


In [104]:
print ('Test_Data')
model_scores(X_test, y_test, rfc, p=True)

Test_Data
Accuracy: 0.722 | F1: 0.499 | Log_loss: 9.61


In [95]:
rfc.score(X_test, y_test)

0.7217