In [2]:
import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Using TensorFlow backend.


#### Import data & divide
- **X** = feature variables
- **y** = output

In [28]:
dataset = pd.read_csv("Churn_Modelling.csv")
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

In [29]:
dataset

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.00,2,1,1,10062.80,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.50,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


#### Encode Catergorical Data
- one-hot encoding must be used for multi-class categorical data
- in this case, **Geography** at index 1 must one-hot enocded
- after the encoding, one dummy variable is removed to avoid falling into the "Dummy Variable trap" 

In [71]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Feature - Geography
label_encoder_1 = LabelEncoder()
X[:, 1] = label_encoder_1.fit_transform( X[:, 1] )

# Feature - Gender
label_encoder_2 = LabelEncoder()
X[:, 2] = label_encoder_1.fit_transform( X[:, 2] )

one_hot = OneHotEncoder( categorical_features=[1] )
X = one_hot.fit_transform(X).toarray()

pd.DataFrame(X) # Note columns 0, 1, 2 - Three dummy variables for feature "Geography"

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.0,1.0,0.0,1.0,0.0,228.0,0.0,42.0,2.0,0.00,1.0,1.0,1.0,101348.88
1,0.0,1.0,0.0,1.0,1.0,217.0,0.0,41.0,1.0,83807.86,1.0,0.0,1.0,112542.58
2,0.0,1.0,0.0,1.0,0.0,111.0,0.0,42.0,8.0,159660.80,3.0,1.0,0.0,113931.57
3,0.0,1.0,0.0,1.0,0.0,308.0,0.0,39.0,1.0,0.00,2.0,0.0,0.0,93826.63
4,0.0,1.0,0.0,1.0,1.0,459.0,0.0,43.0,2.0,125510.82,1.0,1.0,1.0,79084.10
5,0.0,1.0,0.0,1.0,1.0,254.0,1.0,44.0,8.0,113755.78,2.0,1.0,0.0,149756.71
6,0.0,1.0,0.0,1.0,0.0,431.0,1.0,50.0,7.0,0.00,2.0,1.0,1.0,10062.80
7,1.0,0.0,1.0,0.0,0.0,8.0,0.0,29.0,4.0,115046.74,4.0,1.0,0.0,119346.88
8,0.0,1.0,0.0,1.0,0.0,110.0,1.0,44.0,4.0,142051.07,2.0,0.0,1.0,74940.50
9,0.0,1.0,0.0,1.0,0.0,293.0,1.0,27.0,2.0,134603.88,1.0,1.0,1.0,71725.73


In [72]:
# Remove one dummy variable
X = X[:, 1:]
pd.DataFrame(X) # Note index 0, 1 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.0,0.0,1.0,0.0,228.0,0.0,42.0,2.0,0.00,1.0,1.0,1.0,101348.88
1,1.0,0.0,1.0,1.0,217.0,0.0,41.0,1.0,83807.86,1.0,0.0,1.0,112542.58
2,1.0,0.0,1.0,0.0,111.0,0.0,42.0,8.0,159660.80,3.0,1.0,0.0,113931.57
3,1.0,0.0,1.0,0.0,308.0,0.0,39.0,1.0,0.00,2.0,0.0,0.0,93826.63
4,1.0,0.0,1.0,1.0,459.0,0.0,43.0,2.0,125510.82,1.0,1.0,1.0,79084.10
5,1.0,0.0,1.0,1.0,254.0,1.0,44.0,8.0,113755.78,2.0,1.0,0.0,149756.71
6,1.0,0.0,1.0,0.0,431.0,1.0,50.0,7.0,0.00,2.0,1.0,1.0,10062.80
7,0.0,1.0,0.0,0.0,8.0,0.0,29.0,4.0,115046.74,4.0,1.0,0.0,119346.88
8,1.0,0.0,1.0,0.0,110.0,1.0,44.0,4.0,142051.07,2.0,0.0,1.0,74940.50
9,1.0,0.0,1.0,0.0,293.0,1.0,27.0,2.0,134603.88,1.0,1.0,1.0,71725.73


#### Split into training and test set
- 80% : 20 %

In [73]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
list(map(len, (X_train, X_test, y_train, y_test )))

#### Feature Scaling

In [66]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_train)

Values before **standard scaling**

In [68]:
X_train

array([ 0.5698444 , -0.5698444 ,  0.5698444 ,  1.74309049,  0.1692298 ,
       -1.09168714, -0.46460796,  0.00666099, -1.21571749,  0.8095029 ,
        0.64259497, -1.03227043,  1.10643166])

Values after **standard scaling**

In [43]:
X_train

array([[-0.5698444 ,  0.5698444 ,  0.5698444 , ...,  0.64259497,
        -1.03227043,  1.10643166],
       [ 1.75486502, -1.75486502, -1.75486502, ...,  0.64259497,
         0.9687384 , -0.74866447],
       [-0.5698444 ,  0.5698444 ,  0.5698444 , ...,  0.64259497,
        -1.03227043,  1.48533467],
       ...,
       [-0.5698444 ,  0.5698444 ,  0.5698444 , ...,  0.64259497,
        -1.03227043,  1.41231994],
       [-0.5698444 ,  0.5698444 ,  0.5698444 , ...,  0.64259497,
         0.9687384 ,  0.84432121],
       [ 1.75486502, -1.75486502, -1.75486502, ...,  0.64259497,
        -1.03227043,  0.32472465]])

#### The Sequential() model keeps all layers in track

In [56]:
import keras
from keras.models import Sequential
from keras.layers import Dense
classifier = Sequential()

## Adding dense layers to our model

`Dense` implements the operation:
`output = activation(dot(input, kernel) + bias)`
`activation` is the element-wise activation function
`kernel` is a weights matrix 
`bias` is a bias vector (only applicable if `use_bias` is `True`).

Note: if the input to the layer has a rank greater than 2, then
it is flattened prior to the initial dot product with `kernel`.

#### Example


    # as first layer in a sequential model:
    model = Sequential()
    model.add(Dense(32, input_shape=(16,)))
    # now the model will take as input arrays of shape (*, 16)
    # and output arrays of shape (*, 32)

    # after the first layer, you don't need to specify
    # the size of the input anymore:
    model.add(Dense(32))

#### Params
    input_shape : dim of inputs, MUST be list
    units: dimensionality of the output
    activation: Activation function to use, MUST be specified
    kernel_initializer: 'uniform'
    kernel_regularizer: Regularizer function applied to
    bias_regularizer: Regularizer function applied to the bias vector
    

In [57]:
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_shape = [11]))
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

Compiling the model with:
- Optimizer: Adam optimizer 
- Loss function: binary cross entropy (as our output variable is binary)
- Metrics: accuracy (legacay metrics recall, precision cease to exist)

In [58]:
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

Training the model with both training sets

In [59]:
classifier.fit(x = X_train, y = y_train, batch_size = 10, epochs = 100)

ValueError: Error when checking input: expected dense_8_input to have shape (None, 11) but got array with shape (8000, 13)

In [62]:
len(X_train)

13

In [146]:
y_pred = classifier.predict(X_test)

Notice all the output are probabilities from our sigmod funciton

In [147]:
y_pred

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)

We threshold all the values at 0.5

In [148]:
y_pred = y_pred > 0.5
y_pred 

array([[False],
       [False],
       [False],
       ...,
       [False],
       [False],
       [False]])

In [151]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[1595    0]
 [ 405    0]]


#### Predicitng from a single obeservation
- input must be [ [] ], as all inputs are 2D matrices
- must be scaled identically to the training data

In [163]:
to_predict = np.array([ [0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000] ])
to_predict = sc.fit_transform(to_predict) # Scaling
    



In [1]:
classifier.predict( x = to_predict)

NameError: name 'classifier' is not defined

In [None]:
new_prediction = classifier.predict( x = np.array())