### **1) Mounting Google Drive**

In [2]:
## To access data , we need to mount google drive at colab.
## Initially the steps were longer for mounting drive but now product has become stable.

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
os.getcwd() ## need to know current working directory

'/content'

In [5]:
!ls  ## gives list of folders ## drive is google drive

drive  sample_data


In [6]:
## change directory to access churn modelling dataset since we are in content location

In [28]:
os.chdir('/content/drive/MyDrive/Forsk_Dataset/Churn_Modelling.csv')  ## Throwing error since Churn_Modelling.csv file is not in directory.

NotADirectoryError: ignored

In [29]:
os.chdir('/content/drive/MyDrive/Forsk_Dataset')  ## Correct Directory


In [30]:
import os ## Current working Directory
os.getcwd()

'/content/drive/MyDrive/Forsk_Dataset'

### **2) Import Pandas for Downloading Dataset**

In [31]:
import pandas as pd ## Loading the data
dataset = pd. read_csv("Churn_Modelling.csv")

In [33]:
type(dataset)  ## Type is dataframe for object dataset

pandas.core.frame.DataFrame

In [34]:
dataset.shape ## There are 10000 and 14 columns

(10000, 14)

## Problem Statement
- Data from a Australian Bank.
- Telling we have customers with customer id and other values.
- Tenure tells how many years back acc opened.
- Number of Products - if customer using debit card, credit card or loan taken etc
- Is active member  - if regularly using account
- Exited tells if customer has left the bank or not
- 1 tells exited and 0 tells custoemr is with the bank

**We neeed to built a model which can predict about customer, if customer will leave the bank or not (or close the account in othe terms)**

**If customer will continue with the bank or not**


- Exited is label and rest of column is features

In [35]:
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


- We have to do decision making if all columns will add value to ML model
- For simplicity we will not consider RowNumber (since system generated), CustomerId (since given by bank) and Surname  in features.
- They are not important in Decision making since they will not decide wheter the customer will exit or not.
- This process is called feature selection

### **3) Import Libraries**

In [36]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [37]:
features = dataset.iloc[:, 3:13].values ## Set of InDependent variable
labels = dataset.iloc[:,13].values  ## Set of Dependent variable

In [38]:
# labels = dataset.iloc[:,13]

In [39]:
## iloc used for selection of rows and columns

In [40]:
type (features)

numpy.ndarray

In [41]:
features.shape ## 10000 rows and 10 columns

(10000, 10)

In [42]:
features

array([[619, 'France', 'Female', ..., 1, 1, 101348.88],
       [608, 'Spain', 'Female', ..., 0, 1, 112542.58],
       [502, 'France', 'Female', ..., 1, 0, 113931.57],
       ...,
       [709, 'France', 'Female', ..., 0, 1, 42085.58],
       [772, 'Germany', 'Male', ..., 1, 0, 92888.52],
       [792, 'France', 'Female', ..., 1, 0, 38190.78]], dtype=object)

In [43]:
features[0,:] ## Print first row features. iloc does not work since it works for dataframe.

array([619, 'France', 'Female', 42, 2, 0.0, 1, 1, 1, 101348.88],
      dtype=object)

In [44]:
##'France', 'Female' is in text format but we need to convert into numeric format for training purpose.

In [45]:
labels

array([1, 0, 1, ..., 1, 1, 0])

In [46]:
labels.shape

(10000,)

In [47]:
dataset.dtypes  ## we have removed first 3 columns from this dataset 

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [48]:
## If features data has text data in column, we have to convert text/categorical data into numeric format.
## We have to encode categorical data into numeric format.
## We call that as onehot encoding 

In [49]:
features[0,]

array([619, 'France', 'Female', 42, 2, 0.0, 1, 1, 1, 101348.88],
      dtype=object)

In [50]:
features[0:10, 1] ## Sample Data

array(['France', 'Spain', 'France', 'France', 'Spain', 'Spain', 'France',
       'Germany', 'France', 'France'], dtype=object)

In [51]:
features[0:10, 0:2]

array([[619, 'France'],
       [608, 'Spain'],
       [502, 'France'],
       [699, 'France'],
       [850, 'Spain'],
       [645, 'Spain'],
       [822, 'France'],
       [376, 'Germany'],
       [501, 'France'],
       [684, 'France']], dtype=object)

In [52]:
dataset['Geography'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

### **4) Feature Engineering**

In [53]:
## features has 2 columns which are categorical. We have to use one hot encoding

In [54]:
features.shape

(10000, 10)

In [55]:
features

array([[619, 'France', 'Female', ..., 1, 1, 101348.88],
       [608, 'Spain', 'Female', ..., 0, 1, 112542.58],
       [502, 'France', 'Female', ..., 1, 0, 113931.57],
       ...,
       [709, 'France', 'Female', ..., 0, 1, 42085.58],
       [772, 'Germany', 'Male', ..., 1, 0, 92888.52],
       [792, 'France', 'Female', ..., 1, 0, 38190.78]], dtype=object)

In [56]:
labels

array([1, 0, 1, ..., 1, 1, 0])

In [57]:
## features has 2 columns which are categorical. We have to use one hot encoding

In [58]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [1,2])], remainder='passthrough') ## We have created columnTransformer object

features = np.array(columnTransformer.fit_transform(features), dtype = np.float32) ## np is numpy


In [59]:
###### Code explanation
## from sklearn.compose import ColumnTransformer
## from sklearn.preprocessing import OneHotEncoder
 # We are going to call method ColumnTransformer which tranform column to new format

## But we need to tell ColumnTransformer that we need to convert categorical to numerical data.

## columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [1,2])], remainder='passthrough') 
 ## By calling ColumnTransformer we tell 
 ## We have created columnTransformer object
 # We have to specify to ColumnTransformer to do one hot encoding.
 # We tell that we need to perform encoding  'encoder'  by using OneHotEncoder() is for one hot coding
 # [1,2] - Columnindex on which we perform one hot encoding
 # remainder='passthrough' - We are requesting ColumnTransformer to passthrough and don't do any changes on any other columns

## We have created object and given instructions to do one hot encoding
## But We have to give data 

## features = np.array(columnTransformer.fit_transform(features), dtype = np.float32)
 # We are passing the data to fit_transform so data get transformed. We also specified that all numerical value is in float format


In [60]:
features ## Now categorical data have been transformed to numerical data. 

array([[1.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        1.0000000e+00, 1.0134888e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, ..., 0.0000000e+00,
        1.0000000e+00, 1.1254258e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 1.1393157e+05],
       ...,
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        1.0000000e+00, 4.2085578e+04],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 9.2888523e+04],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 3.8190781e+04]], dtype=float32)

In [61]:
features.shape

(10000, 13)

In [62]:
features[0]

array([1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
       0.0000000e+00, 6.1900000e+02, 4.2000000e+01, 2.0000000e+00,
       0.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00,
       1.0134888e+05], dtype=float32)

In [63]:
## Note: After One hot encoding the dummy variables comes in the beginning.
## 1.0000000e+00, 0.0000000e+00, 0.0000000e+00 represents Geography
## 1.0000000e+00, 0.0000000e+00 represents Gender
## We can also do one hot encoding for label data

In [64]:
features = features[:, 1:] ## Drop the first dummy column for Geography

In [65]:
features[0]  ## First column contatining 1.0000000e+00 is dropped

array([0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00,
       6.1900000e+02, 4.2000000e+01, 2.0000000e+00, 0.0000000e+00,
       1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0134888e+05],
      dtype=float32)

In [66]:
## Now we have to drop the column from Gender also

In [67]:
features = features[:, [0,1,3,4,5,6,7,8,9,10,11]] ## Drop the column for Gender

In [68]:
features.shape  ## We have 10000 records and 11 columns

(10000, 11)

In [69]:
## The above process is feature engineering
## Before we feed to ANN

### **5) Train Test Split**

In [70]:
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.2, random_state = 0)

### **6) Scaling**

In [71]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler() ## Creating Object
features_train = sc.fit_transform(features_train) ## Coverting features_train  ## fit_transform used for train data for calcualting standard and mean deviation
features_test = sc.transform(features_test) ## Coverting features_train ## Once calculated mean and standard deviation in above step, we have used same to scale 
# test # Therefore we have not used fit_transform for features_test

In [72]:
features_train[0]  ## Some value is positive and some negative because of standard scaling

array([-0.56984437,  1.7430905 , -1.0916871 ,  0.16958177, -0.46460798,
        0.00666099, -1.2157176 ,  0.8095029 ,  0.642595  , -1.0322704 ,
        1.1064317 ], dtype=float32)

In [73]:
## We have loaded data using pandas
## Seperated features and labels
## We did one hot encoding
## We did feature scaling 
## Now we will jump into Deep Learning


### **7) Creating ANN for for solving problem and do prediction**

In [74]:
# Importing the Keras libraries and packages

import keras
from keras.models import Sequential ## Method used to create any model 
from keras.layers import Dense

In [75]:
## A sequential class will create empty container , no layers in it 

In [76]:
classifier = Sequential()  ## Empty container created 

In [77]:
features.shape ## 10000 rows and 11 columns, therfore we have 11 features 
 ## Number of nodes in Input Layer = Number of features  = 11 

(10000, 11)

In [78]:
## We need to add layers in Empty container created 
## the first layer we add is hidden layer
## We are not adding input layer first since while adding hidden layer we specify dimenions or no of features 
## The above process of adding hidden layer will add input layer 

In [79]:
# adding the first hidden layer
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = 11))

In [80]:
###### Code explanation
## classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = 11))
 # classifier is the container created  
 # Dense means output of each node in one layer will go to all the nodes in next layer.
 # units = 6 is telling to add six nodes in hidden layer. No fixed count.
 # kernel_initializer = 'uniform' means to initialize weights by uniform value, since initially weight are randomly initialized
 # activation = 'relu; --  Since in Dense layer relu activation function used.
 # input_dim = 11  -- because there are 11 columns for features
 # The above code will add one hidden layer and one input layer

In [81]:
# Adding the second hidden layer
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))

# Adding the output layer
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [82]:
###### Code explanation
## Adding the second hidden layer
## classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))
 # again adding hidden layer, 6 noded, initializing weight uniformly and activation function relu
 # we are not adding input_dim = 11, since we know that input to this layer will come from previous hidden layer

## Adding the output layer
## classifier.add(Dense(units = 1, kernel_initializer = 'uniform')
 # Adding one unit because 1 because it is binary classification problem if customer will exit or not
 # Last layer we have to use sigmoid activation function


## Compiling the ANN
 # classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
 # optimizer will update weight for us, adam is the most popular. Will perform backpropagation or update weights base on loss function
 # loss = 'binary_crossentropy' loss is binary_crossentropy. It is a loss function which will check good or bad prediction based on
 # which optimizer will update weight

## How many hidden layer we can add
 # We can experiment with any hidden layer based on accuracy. We have taken here 2 for Learning purpose.


In [83]:
classifier.fit(features_train, labels_train, batch_size = 10, epochs = 10)  ## Used for model fitting

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fdaa25d5e50>

In [85]:
###### Code explanation
## classifier.fit(features_train, labels_train, batch_size = 10, epochs = 10)  ## Used for model fitting
 # We pass data into batches. Since if data sixe large and we pass into memory it will give memory error.
 # 10000 records we are passing with batch size 10. Number of batches required  = 10000/10 =  1000
 # The above process is called epoch of passing data in batches
 # epochs = 10 , means above activity or training will happen 10 times
 # batch size dependent on RAM SIZE. Increasing  batch size  and epochs will take more time
 # It will take more time if data is in in image format

### **8) Prediction**

In [86]:
labels_pred = classifier.predict_classes(features_test)  
## Is throwing error https://stackoverflow.com/questions/68836551/keras-attributeerror-sequential-object-has-no-attribute-predict-classes
#labels_pred = (labels_pred > 0.5)

AttributeError: ignored

In [87]:
## The error solution is found in 
 # https://stackoverflow.com/questions/68836551/keras-attributeerror-sequential-object-has-no-attribute-predict-classes/70689389#70689389
 # In the newest version of Tensorflow, the predict_classes function has been deprecated 
 # (there was a warning in previous versions about this). The new syntax is as follows:
 # predictions = np.argmax(model.predict(x_test),axis=1)
 # labels_pred = np.argmax(classifier.predict(features_test),axis=1)  
 # is used for muliclass classification 
 # (e.g. if it uses a softmax last-layer activation)
 # (model.predict(x) > 0.5).astype("int32"), if your model does binary classification (e.g. if it uses a sigmoid last-layer activation).


In [88]:
labels_pred = (classifier.predict(features_test) > 0.5).astype("int32")



In [89]:
labels_pred

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int32)

### **9) Checking Accuracy**

In [90]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(labels_test, labels_pred)

In [91]:
cm

array([[1506,   89],
       [ 202,  203]])

In [92]:
## Using confusion matrix we can say how many cases are predicted are correct and wrong
## diagonal values 1506 and 203 are correctly predicted
## 1506 - I predicted 0 and it is 0
## 203 - I predicted 1 and it is 1
## 89 - I predicted 1 and it is 0
## 202 - I predicted 0 and it is 1