# Artificial Neural Network

Predict whether a bank customer will leave the bank (churn) or stay.

### Import Libraries

In [2]:
!pip install tensorflow --user



In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
tf.__version__  # Since TensorFlow 2.0, Keras is bundled with TensorFlow as tf.keras

'2.1.0'

### Step 1: Data Preprocessing

#### A) Import dataset

In [3]:
import os

# Show what sits in the parent directory, to check where the dataset file lives.
parent_entries = os.listdir(r'../')
print(parent_entries)

['.DS_Store', '.ipynb_checkpoints', 'ChurnPrediction_ANN.ipynb', 'Churn_Modelling.csv']


In [4]:
# Load the churn dataset; the path is relative to the notebook's working directory.
path = r'Churn_Modelling.csv'
dataset = pd.read_csv(path)
# Display the column labels to confirm the file was parsed as expected.
dataset.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [5]:
# Preview the first rows to sanity-check the parsed values.
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
# Features: columns from position 3 (CreditScore) up to, but not including, the last column.
# RowNumber, CustomerId and Surname (positions 0-2) are left out; they are treated here as identifiers, not predictors.
x = dataset.iloc[:,3:-1].values
y = dataset.iloc[:,-1].values     # Target: the last column, Exited

In [7]:
# Show the raw feature matrix and the target vector before any cleaning or encoding.
print(x)
print(y)

[[619 'France' 'Female' ... 1 1 101348.88]
 [608 'Spain' 'Female' ... 0 1 112542.58]
 [502 'France' 'Female' ... 1 0 113931.57]
 ...
 [709 'France' 'Female' ... 0 1 42085.58]
 [772 'Germany' 'Male' ... 1 0 92888.52]
 [792 'France' 'Female' ... 1 0 38190.78]]
[1 0 1 ... 1 1 0]


#### B) Take care of missing data

In [8]:
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
from sklearn.impute import SimpleImputer

# Columns 1 and 2 hold the text features (Geography, Gender). Any missing entry
# in them is replaced by the most frequent value of its column.
categorical_cols = slice(1, 3)
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x[:, categorical_cols] = imputer.fit_transform(x[:, categorical_cols])

print(x)

[[619 'France' 'Female' ... 1 1 101348.88]
 [608 'Spain' 'Female' ... 0 1 112542.58]
 [502 'France' 'Female' ... 1 0 113931.57]
 ...
 [709 'France' 'Female' ... 0 1 42085.58]
 [772 'Germany' 'Male' ... 1 0 92888.52]
 [792 'France' 'Female' ... 1 0 38190.78]]


#### C) Encoding Categorical Data

<b><u> Encoding a binary categorical feature </u> </b> <br>
<strong> Label Encoding the "Gender" Column (an independent variable, not the target) </strong>

In [9]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
# Replace the Gender strings in column 2 with integer codes.
# Classes are numbered in sorted order, so Female becomes 0 and Male becomes 1.
x[:,2] = encoder.fit_transform(x[:,2])

In [10]:
print(x)  # column 2 (Gender) now holds integer codes instead of strings

[[619 'France' 0 ... 1 1 101348.88]
 [608 'Spain' 0 ... 0 1 112542.58]
 [502 'France' 0 ... 1 0 113931.57]
 ...
 [709 'France' 0 ... 0 1 42085.58]
 [772 'Germany' 1 ... 1 0 92888.52]
 [792 'France' 0 ... 1 0 38190.78]]


<b><u> Encoding independent variable </u> </b> <br>
<strong> One Hot Encoding the "Geography" (country) Column </strong>

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 1 (Geography); every other column passes through unchanged.
# The encoded columns are placed first, so the Geography indicators become the leading columns of x.
# sparse_threshold=0 forces a dense result, so np.array() below always yields a regular 2-D array
# instead of depending on the automatic sparse/dense decision.
# A stray, unused `encoder = OneHotEncoder()` assignment was removed here: it served no purpose
# and overwrote the fitted Gender LabelEncoder held in `encoder`.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1])],  # [1] selects column 1 - Geography
    remainder='passthrough',
    sparse_threshold=0,
)
x = np.array(ct.fit_transform(x))

print(x)

[[1.0 0.0 0.0 ... 1 1 101348.88]
 [0.0 0.0 1.0 ... 0 1 112542.58]
 [1.0 0.0 0.0 ... 1 0 113931.57]
 ...
 [1.0 0.0 0.0 ... 0 1 42085.58]
 [0.0 1.0 0.0 ... 1 0 92888.52]
 [1.0 0.0 0.0 ... 1 0 38190.78]]


In [25]:
x[2]  # one fully encoded row: 3 Geography indicator columns followed by the 9 remaining features (12 values in total)

array([1.0, 0.0, 0.0, 502, 0, 42, 8, 159660.8, 3, 1, 0, 113931.57],
      dtype=object)

#### D) Splitting the dataset into training & test sets

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle so the split is repeatable.
x_train,x_test, y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=0)

#### E) Feature Scaling --- Mandatory step in deep learning

In [13]:
from sklearn.preprocessing import StandardScaler

# Standardise every column, the one-hot indicator columns included.
# The scaler learns its statistics from the training rows only and reuses them
# for the test rows, so nothing about the test set influences preprocessing.
# (An alternative is to scale only columns 3 onward and leave the indicators as 0/1.)
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [14]:
print(x_train)  # standardised training features

[[-1.01460667 -0.5698444   1.74309049 ...  0.64259497 -1.03227043
   1.10643166]
 [-1.01460667  1.75486502 -0.57369368 ...  0.64259497  0.9687384
  -0.74866447]
 [ 0.98560362 -0.5698444  -0.57369368 ...  0.64259497 -1.03227043
   1.48533467]
 ...
 [ 0.98560362 -0.5698444  -0.57369368 ...  0.64259497 -1.03227043
   1.41231994]
 [-1.01460667 -0.5698444   1.74309049 ...  0.64259497  0.9687384
   0.84432121]
 [-1.01460667  1.75486502 -0.57369368 ...  0.64259497 -1.03227043
   0.32472465]]


### Step 2: Build ANN

#### A) Initializing ANN as a sequence of layers

In [15]:
# Start from an empty Sequential model; layers are appended one by one in the cells below.
# Re-run this cell before re-running those cells, otherwise add() keeps stacking layers onto the existing model.
ann = tf.keras.models.Sequential()

#### B) Adding input layer & first hidden Layer

In [16]:
# Dense adds a fully connected layer. This first call also fixes the input width of the network.
# The width is read from the training matrix rather than hard-coded: after one-hot encoding
# Geography there are 12 feature columns, so the former literal 11 did not match the data
# later passed to fit() and predict().
# units=6 (neurons in the first hidden layer) is an experimental choice.
ann.add(tf.keras.layers.Dense(units=6, activation='relu', kernel_initializer='uniform', input_dim=x_train.shape[1]))

#### C) Adding second hidden Layer

In [17]:
# Second fully connected hidden layer; its input width is inferred from the previous layer.
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))   # units = number of neurons in this layer; 6 is an experimental choice

#### D) Adding the output layer

In [18]:
# Output layer: a single sigmoid unit whose value is read as the probability that the customer exits.
# One neuron is enough because the two outcomes (exit / stay) are complementary.
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
# Notes are kept as comments: a bare string literal at the end of a cell is echoed as the cell's output.
# - Binary classification: one output unit with a sigmoid activation.
# - More than two classes: one output unit per class with a softmax activation.

### Step 3: Training the ANN

#### A) Compile the ANN with optimizer/loss function

In [20]:
# compile(optimizer, loss, metrics) configures how the network is trained.
# (These notes are comments rather than a trailing string literal, which would be
# echoed as the cell's output.)
#
# optimizer: name of an optimizer or an optimizer instance (see tf.keras.optimizers).
#     'adam' is an adaptive variant of stochastic gradient descent: after each batch it
#     updates the weights in the direction that reduces the loss.
# loss: name of a loss function, a callable with the signature
#     scalar_loss = fn(y_true, y_pred), or a tf.keras.losses.Loss instance (see tf.keras.losses).
#     A single sigmoid output predicting a binary outcome pairs with 'binary_crossentropy'.
#     With more than two classes use 'categorical_crossentropy' (one-hot labels) or
#     'sparse_categorical_crossentropy' (integer labels).
#     For a multi-output model a dict or list of losses may be passed; the value that is
#     minimised is then the sum of the individual losses.
# metrics: list of metrics evaluated during training and testing, typically ['accuracy'].
#     A multi-output model can also take a dict such as
#     {'output_a': 'accuracy', 'output_b': ['accuracy', 'mse']} or a list of lists such as
#     [['accuracy'], ['accuracy', 'mse']].
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

"\n # compile(optimizer, loss, metrics)\n \n optimizer: String (name of optimizer) or optimizer instance. See `tf.keras.optimizers`.\n        The best optimizer is the Gradient descent &\n        the best in them is the Adam Optimizer that performs stocastic Gradient Descent (update wts to reduce loss/error in each ). \n loss: String (name of objective function), objective function or\n        `tf.keras.losses.Loss` instance. See `tf.keras.losses`. An objective\n        function is any callable with the signature\n        `scalar_loss = fn(y_true, y_pred)`. If the model has multiple\n        outputs, you can use a different loss on each output by passing a\n        dictionary or a list of losses. The loss value that will be\n        minimized by the model will then be the sum of all individual\n        losses.\n        For binary classification, where u have to predict a binary outcome, the loss function will be binary_crossentropy.\n        For non-binary classification, use categoric

#### B) Training the ANN on the training set

In [21]:
# Train the network on the scaled training data.
# fit() takes the features and labels as usual; for a neural network we also choose:
#   batch_size - samples processed per weight update (32 is also the Keras default)
#   epochs     - full passes over the training set; too few and the network under-trains
# In the recorded run accuracy levelled off at about 86% around epoch 40,
# so roughly 40 epochs would have been enough.
ann.fit(x=x_train, y=y_train, epochs=100, batch_size=32)

Train on 8000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100

<tensorflow.python.keras.callbacks.History at 0x15022257f28>

### Step 4: Making the predictions & evaluating the model

#### Predicting result of a single observation

Input feature values:<ul>
<li>Geography: France</li>
<li>Credit Score: 600</li>
<li>Gender: Male</li>
<li>Age: 40 yrs old</li>
<li>Tenure: 3 yrs</li>
<li>Balance: &#36; 60000 </li>
<li>No. of products: 2 </li>
<li>Does customer have a credit card?: Yes</li>
<li>Is this customer an Active Member?: Yes</li>
<li>Estimated Salary: &#36;50000</li>
</ul>
So will this customer say Goodbye?

In [26]:
# predict() expects a 2-D array, hence the double brackets around the single row.
# Column order: Geography one-hot (France, Germany, Spain), CreditScore, Gender (Male = 1),
# Age, Tenure, Balance, NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary.
# France is therefore encoded as 1, 0, 0.
single_customer = [[1, 0, 0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]]

# The network was trained on scaled features, so the row goes through the same fitted scaler first.
# The sigmoid output layer returns a probability; it is thresholded into a yes/no answer in the next cell.
exit_probability = ann.predict(sc.transform(single_customer))
print(exit_probability)

# In the recorded run the probability of this customer leaving the bank is very low.

[[0.02870246]]


In [27]:
# Threshold the predicted probability at 0.5 to report True (customer will leave)
# or False (customer will not leave) instead of the raw probability.
will_exit = ann.predict(sc.transform([[1, 0, 0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]])) > 0.5
print(will_exit)

[[False]]


#### Predicting the Test set results

In [36]:
# Predicted exit probabilities for the held-out rows, thresholded at 0.5 into True / False.
y_pred = ann.predict(x_test) > 0.5

# y_pred is 2-D (one column, e.g. [[False] [False] ...]) while y_test is 1-D
# (e.g. [0 1 0 ...]), so both are reshaped into single columns and printed
# side by side as: predicted | actual.
predicted_col = y_pred.reshape(len(y_pred), 1)
actual_col = y_test.reshape(len(y_test), 1)
print(np.concatenate((predicted_col, actual_col), axis=1))

[[0 0]
 [0 1]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


#### Making the confusion matrix

In [37]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Rows of the matrix are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_test,y_pred)
print(cm)

# Fraction of test rows classified correctly, i.e. the matrix diagonal divided by the total count.
accuracy_score(y_test, y_pred)

[[1509   86]
 [ 195  210]]


0.8595