Bank Customer Churn Modeling

###**Connecting Drive with Colab**

In [0]:
from google.colab import drive

In [2]:
drive.mount("/content/gdrive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


###**Load Key Libraries**

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

%matplotlib inline
sns.set_style('whitegrid')

import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [0]:
tf.reset_default_graph()
tf.set_random_seed(40)

Read the dataset

In [26]:
#read data

bank_df = pd.read_csv('/content/gdrive/My Drive/bank.csv')
bank_df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [27]:
bank_df.Geography.unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [0]:
Geo = pd.get_dummies(bank_df['Geography'],prefix = "Geo",prefix_sep = "_",drop_first = True)
bank_df.drop(['Geography'],axis = 1,inplace = True)
bank_df = pd.concat([bank_df,Geo],axis=1)

In [29]:
bank_df.Gender.unique()

array(['Female', 'Male'], dtype=object)

In [0]:
labelencoder = LabelEncoder()
bank_df['Gender'] = labelencoder.fit_transform(bank_df['Gender'])

In [31]:
bank_df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geo_Germany,Geo_Spain
0,1,15634602,Hargrave,619,0,42,2,0.0,1,1,1,101348.88,1,0,0
1,2,15647311,Hill,608,0,41,1,83807.86,1,0,1,112542.58,0,0,1
2,3,15619304,Onio,502,0,42,8,159660.8,3,1,0,113931.57,1,0,0
3,4,15701354,Boni,699,0,39,1,0.0,2,0,0,93826.63,0,0,0
4,5,15737888,Mitchell,850,0,43,2,125510.82,1,1,1,79084.1,0,0,1


Drop the columns which are unique for all users like IDs 

In [32]:
bank_df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Gender', 'Age',
       'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
       'EstimatedSalary', 'Exited', 'Geo_Germany', 'Geo_Spain'],
      dtype='object')

In [0]:
bank_df.drop(['RowNumber', 'CustomerId','Surname'], axis=1, inplace = True)

In [34]:
bank_df.head(5)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geo_Germany,Geo_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,0,0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0,1
2,502,0,42,8,159660.8,3,1,0,113931.57,1,0,0
3,699,0,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0,1


Distinguish the feature and target set 

In [0]:
X = bank_df[['CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'HasCrCard', 'IsActiveMember', 'EstimatedSalary','Geo_Germany', 'Geo_Spain']]

Y = bank_df['Exited']

Divide the data set into Train and test sets

In [0]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

In [37]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(7000, 11)
(3000, 11)
(7000,)
(3000,)


Normalize the train and test data 

Initialize and build the model 

In [0]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.BatchNormalization(input_shape=(11,)))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

In [39]:
model.fit(X_train,Y_train,          
          validation_data=(X_test,Y_test),
          epochs=50)

Train on 7000 samples, validate on 3000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f1a48b57b00>

In [0]:
Y_pred = model.predict(X_test)
Y_pred=Y_pred>0.50

In [43]:
confusion_matrix(Y_test, Y_pred)

array([[2303,   73],
       [ 500,  124]])

In [44]:
accuracy_score(Y_test, Y_pred)

0.809

Optimize the model 

In [0]:

model_nn = tf.keras.models.Sequential()

model_nn.add(tf.keras.layers.BatchNormalization(input_shape=(11,)))

model_nn.add(tf.keras.layers.Dense(10, activation='sigmoid'))

model_nn.add(tf.keras.layers.Dense(6, activation='sigmoid'))

model_nn.add(tf.keras.layers.Dense(4, activation='sigmoid'))

model_nn.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model_nn.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

In [46]:
model_nn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_v1_3 (Ba (None, 11)                44        
_________________________________________________________________
dense_3 (Dense)              (None, 10)                120       
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 66        
_________________________________________________________________
dense_5 (Dense)              (None, 4)                 28        
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 5         
Total params: 263
Trainable params: 241
Non-trainable params: 22
_________________________________________________________________


In [47]:
model_nn.fit(X_train,Y_train,          
          validation_data=(X_test,Y_test), 
          epochs=50,
          batch_size=30)

Train on 7000 samples, validate on 3000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f1a4866cbe0>

In [0]:
Y_pred = model_nn.predict(X_test)
Y_pred = Y_pred>0.50

In [49]:
confusion_matrix(Y_test, Y_pred)

array([[2376,    0],
       [ 624,    0]])

In [50]:
accuracy_score(Y_test, Y_pred)

0.792

Optimize with SGD

In [0]:
model_nn = tf.keras.models.Sequential()

model_nn.add(tf.keras.layers.BatchNormalization(input_shape=(11,)))

model_nn.add(tf.keras.layers.Dense(10, activation='sigmoid'))

model_nn.add(tf.keras.layers.Dense(6, activation='sigmoid'))

model_nn.add(tf.keras.layers.Dense(4, activation='sigmoid'))

model_nn.add(tf.keras.layers.Dense(1, activation='sigmoid'))

sgd_optimizer = tf.keras.optimizers.SGD(lr=0.03)

model_nn.compile(optimizer= sgd_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [52]:
model_nn.fit(X_train,Y_train,          
          validation_data=(X_test,Y_test), 
          epochs=50,
          batch_size=30)

Train on 7000 samples, validate on 3000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f1a4866c3c8>

In [0]:
Y_pred = model_nn.predict(X_test)
Y_pred = Y_pred>0.50

In [54]:
confusion_matrix(Y_test, Y_pred)

array([[2376,    0],
       [ 624,    0]])

In [55]:
accuracy_score(Y_test, Y_pred)

0.792