In [1]:
%matplotlib inline
import pandas as pd
import numpy as np

df = pd.read_csv('churn_ibm.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [2]:
y = df['Churn']
X = df.drop(['Churn','customerID'],axis=1)

for column in X.columns:
    if X[column].dtype == np.object:
        X = pd.concat([X,pd.get_dummies(X[column], prefix=column, drop_first=True)],axis=1).drop([column],axis=1)
        
y = pd.get_dummies(y, prefix='churn', drop_first=True)

## Random forest

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

rf = RandomForestClassifier(n_estimators=20)

# Some algorithms need a transformed version of the dependent variable . Hence, the data is reshaped using ravel()
rf.fit(X_train,y_train.values.ravel())
prediction = rf.predict(X_test)
print('Accuracy:', accuracy_score(y_test,prediction))
print('AUC:',roc_auc_score(y_test,prediction))

Accuracy: 0.7919431279620853
AUC: 0.6867081718585022


In [4]:
print(rf)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


We can also change the parameters. Let's build a larger forest using more trees using parameter ```n_estimators```:

In [5]:
rf2 = RandomForestClassifier(n_estimators=100)
rf2.fit(X_train,y_train.values.ravel())
prediction = rf2.predict(X_test)

print('Accuracy:', accuracy_score(y_test,prediction))
print('AUC:',roc_auc_score(y_test,prediction))

Accuracy: 0.7971563981042654
AUC: 0.6955165778239809


## AdaBoost

In [6]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier()
ada.fit(X_train,y_train.values.ravel())
prediction = ada.predict(X_test)
print('Accuracy:', accuracy_score(y_test,prediction))
print('AUC:',roc_auc_score(y_test,prediction))

Accuracy: 0.8061611374407583
AUC: 0.7215587962407843


In [7]:
print(ada)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)


Let's increase that as well:

In [8]:
ada2 = AdaBoostClassifier(n_estimators=100)
ada2.fit(X_train,y_train.values.ravel())
prediction = ada2.predict(X_test)

print('Accuracy:', accuracy_score(y_test,prediction))
print('AUC:',roc_auc_score(y_test,prediction))

Accuracy: 0.804739336492891
AUC: 0.7211833636201717


## Grid search

A lot of these efforts can be streamlined using GridSearch. Below, you can find code that tests different parameters using cross-validation for random forests:

In [9]:
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.model_selection import GridSearchCV


parameters = {'min_samples_leaf':[1,5],'max_depth':[None,10]}


grid_search = GridSearchCV(RandomForestClassifier(n_estimators=20), parameters, cv=10)
grid_search.fit(X_train, y_train.values.ravel())

# The best predictor will be used for the prediction
prediction = grid_search.predict(X_test)
    
best_classifier = grid_search.best_estimator_

print('Best classifier:',best_classifier)
print('Accuracy:', accuracy_score(y_test,prediction))
print('AUC:',roc_auc_score(y_test,prediction))

Best classifier: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Accuracy: 0.8080568720379147
AUC: 0.7181478890209533


It seems that having a minimum of 5 samples per leaf, and a maximum depth of 10 are preferable.

## Feature importance

 This is the (average) reduction in Gini impurity across all trees:

In [10]:
# Random forest - 5 most important features
for c, column in enumerate(X_test.columns):
    if rf.feature_importances_[c] in sorted(rf.feature_importances_)[-5:]:
        print('Variable',column,rf.feature_importances_[c])

Variable tenure 0.16839043806476744
Variable MonthlyCharges 0.1598015604954677
Variable TotalCharges 0.1941393730940047
Variable InternetService_Fiber optic 0.04346388408050021
Variable PaymentMethod_Electronic check 0.047127592824661335


In [12]:
# AdaBoost - 5 most important features
for c, column in enumerate(X_test.columns):
    if ada.feature_importances_[c] in sorted(ada.feature_importances_)[-5:]:
        print('Variable',column,ada.feature_importances_[c])

Variable tenure 0.2
Variable MonthlyCharges 0.22
Variable TotalCharges 0.3
Variable InternetService_Fiber optic 0.04
Variable Contract_Two year 0.04
