# Machine Learning

In [1]:
#Import libraries need for cleaning and EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Import libraries for modeling
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import neighbors
from sklearn import tree 
from sklearn import svm 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE

In [4]:
df=pd.read_csv('Bank_Churned.csv')

# Under Sampling Method

In [5]:
#Created the undersampling method by randomly choosing the same amount of 0 values as 1 
df1 = df[df.Attrition_Flag == 1]
df0 = df[df.Attrition_Flag == 0].sample(n=len(df1),random_state = 27)
dfn = pd.concat([df1,df0],axis=0)

In [6]:
#Setting X and y variables
X = dfn.drop(columns=['Attrition_Flag'])
y = dfn.Attrition_Flag

Pipeline and GridSearch will be used to find which model performs the best.

In [7]:
#Spliting the data into train and test
X_train_us,X_test_us,y_train_us,y_test_us = train_test_split(X,y,test_size = 0.3,random_state=27)

In [8]:
%%time
#Created a pipeline to find the best model

estimators = [('normalise', StandardScaler()),
              ('model', svm.SVC())]

pipe = Pipeline(estimators)

param_grid = [
            {'model': [svm.SVC()], 
             'normalise': [StandardScaler(), MinMaxScaler()],
             'model__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
             'model__tol': [0.001, 0.01, 0.1, 1, 10, 100],
             'model__C': [0.001, 0.01, 0.1, 1, 10, 100]},
            {'model': [linear_model.LogisticRegression()], 
             'normalise': [StandardScaler(), MinMaxScaler()],
             'model__tol': [0.001, 0.01, 0.1, 1, 10, 100],
             'model__C': [0.001, 0.01, 0.1, 1, 10, 100]},
            {'model':[neighbors.KNeighborsClassifier()],
             'normalise':[StandardScaler(), MinMaxScaler()],
             'model__n_neighbors':[1,11,21,31,41,51,]},
            {'model':[tree.DecisionTreeClassifier()],
             'normalise':[StandardScaler(),MinMaxScaler()],
             'model__max_depth':[1,11,21,31,41,51],
             'model__min_samples_leaf':[1,2,3]}
]

grid = GridSearchCV(pipe, param_grid, cv=5,n_jobs=-1)
fittedgrid = grid.fit(X_train_us, y_train_us)

Wall time: 47.4 s


In [9]:
#Found the best model for the undersampling method
fittedgrid.best_estimator_

Pipeline(steps=[('normalise', MinMaxScaler()),
                ('model',
                 DecisionTreeClassifier(max_depth=21, min_samples_leaf=3))])

The best estimator was found to be DecisionTreeClassifier with a max_depth value of 21,min sample leaf = 3

In [11]:
#Fitting the best undersampling model
scaler = StandardScaler()
scaler.fit(X_train_us)
X_train_uS = scaler.transform(X_train_us)
X_test_uS = scaler.transform(X_test_us)
tree_US = tree.DecisionTreeClassifier(max_depth=41,min_samples_leaf=3)
tree_US.fit(X_train_uS,y_train_us)
print(f'The Train score was : {tree_US.score(X_train_uS,y_train_us)}')

#Undersampling predictions
y_pred_us = tree_US.predict(X_test_uS)

#Created a confusion matrix to illistrate how well the model preformed
pd.DataFrame(confusion_matrix(y_test_us,y_pred_us),index=['True Exisiting Customer','True Attrited Customer'],columns=['Predicted Existing Customer','Predicted Attrited Customer'])

print('Tree Model Classification Report:')
print('')
print(classification_report(y_test_us,y_pred_us))

The Train score was : 0.9758454106280193
Tree Model Classification Report:

              precision    recall  f1-score   support

           0       0.91      0.91      0.91       479
           1       0.91      0.92      0.91       498

    accuracy                           0.91       977
   macro avg       0.91      0.91      0.91       977
weighted avg       0.91      0.91      0.91       977



With under sampling the model can predict with an accuracy score of about 91% of the target. This tells us the model can predict up too 91% accuracy and 92% precision whether a customer is likely to churn or not. 

# Oversampling Method

For the oversample method I will be using the SMOTE package since it will make for the target variables.

In [12]:
#Seperating the data into X and y variables
X = df.drop(columns=['Attrition_Flag'])
y = df.Attrition_Flag

In [13]:
#Using the SMOTE library to oversample the data by randomly creating data 
sm = SMOTE(random_state=27)
X,y = sm.fit_resample(X,y)

In [14]:
#Spliting the data into train and test
X_train_os,X_test_os,y_train_os,y_test_os = train_test_split(X,y,test_size=0.3)

In [15]:
%%time
#Again using the same pipeline to find the best model
estimators = [('normalise', StandardScaler()),
              ('model', svm.SVC())]

pipe = Pipeline(estimators)

param_grid = [
            {'model': [svm.SVC()], 
             'normalise': [StandardScaler(), MinMaxScaler()],
             'model__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
             'model__tol': [0.001, 0.01, 0.1, 1, 10, 100],
             'model__C': [0.001, 0.01, 0.1, 1, 10, 100]},
            {'model': [linear_model.LogisticRegression()], 
             'normalise': [StandardScaler(), MinMaxScaler()],
             'model__tol': [0.001, 0.01, 0.1, 1, 10, 100],
             'model__C': [0.001, 0.01, 0.1, 1, 10, 100]},
            {'model':[neighbors.KNeighborsClassifier()],
             'normalise':[StandardScaler(), MinMaxScaler()],
             'model__n_neighbors':[1,11,21,31,41,51,]},
            {'model':[tree.DecisionTreeClassifier()],
             'normalise':[StandardScaler(),MinMaxScaler()],
             'model__max_depth':[1,11,21,31,41,51],
             'model__min_samples_leaf':[1,2,3]}
]

grid = GridSearchCV(pipe, param_grid, cv=5,n_jobs=-1)
fittedgrid = grid.fit(X_train_os, y_train_os)

Wall time: 20min 39s


In [16]:
#Found the best model
fittedgrid.best_estimator_

Pipeline(steps=[('normalise', StandardScaler()),
                ('model', SVC(C=10, gamma=0.1, tol=0.1))])

The best estimator was found to be SVM with C = 10, gamma = 0.1 and tol = 0.1

In [17]:
#Fitting the best oversampling model
scaler = StandardScaler()
scaler.fit(X_train_os)
X_train_oS = scaler.transform(X_train_os)
X_test_oS = scaler.transform(X_test_os)
SVM_OS = svm.SVC(C=10,gamma=0.1,tol=0.1)
SVM_OS.fit(X_train_oS,y_train_os)
print(f'The Train score was : {SVM_OS.score(X_train_oS,y_train_os)}')

#Oversampling predictions
y_pred_os = SVM_OS.predict(X_test_oS)

#Created a confusion matrix to illistrate the model on its performance in classifying between Existing and Attrited customers
pd.DataFrame(confusion_matrix(y_test_os,y_pred_os),index=['True Exisiting Customer','True Attrited Customer'],columns=['Predicted Existing Customer','Predicted Attrited Customer'])


print('SVM Model Classification Report:')
print('')
print(classification_report(y_test_os,y_pred_os))

The Train score was : 0.9978991596638656
SVM Model Classification Report:

              precision    recall  f1-score   support

           0       0.97      0.94      0.96      2543
           1       0.95      0.97      0.96      2557

    accuracy                           0.96      5100
   macro avg       0.96      0.96      0.96      5100
weighted avg       0.96      0.96      0.96      5100



Surprisingly the oversampling method using SMOTE, seemed to find a better model with an even higher accuracy and precision score with them being 96% and 97% respectively