# Churn Modelling

## Reading the data

In [59]:
import numpy as np 
import pandas as pd
from time import time
from IPython.display import display

%matplotlib inline

try:
    raw_data = pd.read_excel("Churn-Modelling.xlsx")
    # print(raw_data.head())
    target = raw_data['Exited']
    data = raw_data.drop(['RowNumber', 'CustomerId', 'Surname', 'Exited'], axis=1)
    print(data.head())
except:
    print("Could not load the data.")

   CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          619    France  Female   42       2       0.00              1   
1          608     Spain  Female   41       1   83807.86              1   
2          502    France  Female   42       8  159660.80              3   
3          699    France  Female   39       1       0.00              2   
4          850     Spain  Female   43       2  125510.82              1   

   HasCrCard  IsActiveMember  EstimatedSalary  
0          1               1        101348.88  
1          0               1        112542.58  
2          1               0        113931.57  
3          0               0         93826.63  
4          1               1         79084.10  


## Preprocessing
### One-Hot-Encoding and FeatureScaling

In [60]:
#Scaling Features
from sklearn.preprocessing import MinMaxScaler
# from sklearn.preprocessing import get_dummies

data = pd.get_dummies(data, columns = ['Geography','Gender'])

scaler = MinMaxScaler() # default=(0, 1)
numerical = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
data[numerical] = scaler.fit_transform(data[numerical])

display(data.head())

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,0.538,0.324324,0.2,0.0,0.0,1,1,0.506735,1,0,0,1,0
1,0.516,0.310811,0.1,0.334031,0.0,0,1,0.562709,0,0,1,1,0
2,0.304,0.324324,0.8,0.636357,0.666667,1,0,0.569654,1,0,0,1,0
3,0.698,0.283784,0.1,0.0,0.333333,0,0,0.46912,1,0,0,1,0
4,1.0,0.337838,0.2,0.500246,0.0,1,1,0.3954,0,0,1,1,0


## Training The Model

### Importing Libraries

In [79]:
# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

#Evaluation Metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

#Cross Validation
from sklearn.model_selection import cross_val_score

### Training and Evaluating

In [80]:
# Training classifiers and obtaining some measures

def train_predict(learner, X_train, y_train, X_test, y_test): 
    
    results = {}

    start = time() # Get start time    
    learner = learner.fit(X_train.values, y_train)
    end = time() # Get end time
    

    results['train_time'] = end - start
    

    start = time() 
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train)
    end = time() 
    
    # The total prediction time
    results['pred_time'] = end - start
            
    # Training accuracy
    results['acc_train'] = accuracy_score(y_train, predictions_train)
        
    # Testing accuracy
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    
    # F-score 0.5 on training data
    results['f_train'] = fbeta_score(predictions_train, y_train, beta=0.5)
        
    # F-score 0.5 on training data
    results['f_test'] = fbeta_score(predictions_test, y_test, beta=0.5)
       
    # Success
    print("{} trained succesfully.".format(learner.__class__.__name__))
        
    # Return the results
    return results

In [81]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

clf_A = KNeighborsClassifier()
clf_B = SVC(random_state=10)
clf_C = RandomForestClassifier(random_state=1)

# Collect results on the learners
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
#     print(clf_name)
    results[clf_name] = {}
    results[clf_name] = train_predict(clf, X_train, y_train, X_test, y_test)
    
display(results)


KNeighborsClassifier trained succesfully.
SVC trained succesfully.
RandomForestClassifier trained succesfully.


  'recall', 'true', average, warn_for)


{'KNeighborsClassifier': {'train_time': 0.029981613159179688,
  'pred_time': 0.7938754558563232,
  'acc_train': 0.865625,
  'acc_test': 0.8255,
  'f_train': 0.501469017094017,
  'f_test': 0.3687396807925151},
 'SVC': {'train_time': 2.0419929027557373,
  'pred_time': 1.4015541076660156,
  'acc_train': 0.795375,
  'acc_test': 0.8,
  'f_train': 0.0,
  'f_test': 0.0},
 'RandomForestClassifier': {'train_time': 0.10503673553466797,
  'pred_time': 0.015624761581420898,
  'acc_train': 0.98475,
  'acc_test': 0.8525,
  'f_train': 0.9410234171725932,
  'f_test': 0.4542966611932128}}

## The Chosen Model is RandomForestClassifier

## Tuning Parameters

In [87]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.grid_search import GridSearchCV

parameters = {'n_estimators': [5,10, 15,25 ], 'min_samples_split': [2,5,10], 'min_samples_leaf': [50, 100, 500]}
scorer = make_scorer(fbeta_score, beta=0.5)
grid_obj = GridSearchCV(clf_C, param_grid=parameters, scoring=scorer)
grid_fit = grid_obj.fit(X_train, y_train)
best_clf = grid_fit.best_estimator_
predictions = (clf_C.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-afterscores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)))


Unoptimized model
------
Accuracy score on testing data: 0.8525
F-score on testing data: 0.6346

Optimized Model
------
Final accuracy score on the testing data: 0.8555
Final F-score on the testing data: 0.6491


### Cross Validation

In [88]:
scores = cross_val_score(clf_C, data, target, cv=10)
print(scores.mean())

0.8524983115983116
