## 1 - Import Modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

%matplotlib inline
# Notebook-wide matplotlib defaults, applied to every figure created below.
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
# Draw images without smoothing between pixels.
plt.rcParams['image.interpolation'] = 'nearest'
# Use the grayscale colormap for images by default.
plt.rcParams['image.cmap'] = 'gray'


## 2 - Activation Helper Functions

- Sigmoid
- Relu
- Relu_backward
- Sigmoid_backward


In [None]:
def sigmoid(Z):
    """
    Sigmoid activation in numpy.

    Arguments:
    Z = numpy array of any shape

    Returns:
    A = output of sigmoid(Z); same shape as Z
    cache = Z itself, returned unchanged; used during backpropagation
    """
    A = 1 / (1 + np.exp(-Z))
    cache = Z

    return A, cache

def relu(Z):
    """
    RELU activation: element-wise max(0, Z).

    Arguments:
    Z = output of the linear layer; numpy array of any shape

    Returns:
    A = post-activation value; same shape as Z
    cache = Z itself, returned unchanged; used during backpropagation
    """
    A = np.maximum(0, Z)

    # Sanity check: the activation must not change the shape.
    assert A.shape == Z.shape

    cache = Z
    return A, cache

def relu_backward(dA, cache):
    """
    Backward propagation for a single RELU unit.

    Arguments:
    dA = post-activation gradient of any shape
    cache = Z, as stored by relu() during the forward pass

    Returns:
    dZ = gradient of the cost with respect to Z; same shape as Z
    """
    Z = cache

    # Work on a copy so the caller's dA is left untouched.
    dZ = np.array(dA, copy=True)

    # The RELU derivative is 0 wherever Z <= 0 and 1 elsewhere.
    dZ[Z <= 0] = 0

    assert dZ.shape == Z.shape

    return dZ

def sigmoid_backward(dA, cache):
    """
    Backward propagation for a single SIGMOID unit.

    Arguments:
    dA = post-activation gradient of any shape
    cache = Z, as stored by sigmoid() during the forward pass

    Returns:
    dZ = gradient of the cost with respect to Z; same shape as Z
    """
    Z = cache

    # Recompute sigmoid(Z); its derivative is s * (1 - s).
    s = 1 / (1 + np.exp(-Z))
    dZ = dA * s * (1 - s)

    assert dZ.shape == Z.shape

    return dZ

## 3 - Initialise L-Layer Neural Network

In [None]:
def initialize_parameters_deep(layer_dims):
    """
    Randomly initialise the weights and zero the biases of every layer.

    Arguments:
    layer_dims = python array (list) containing the dimensions of each layer in
                 the network, starting with the input size

    Returns:
    parameters = python dictionary containing parameters "W1", "b1", ..., "WL", "bL":
                    Wl = weight matrix of shape (layer_dims[l], layer_dims[l-1])
                    bl = bias vector of shape (layer_dims[l], 1)
    """
    parameters = {}

    # layer_dims includes the input size, so it has one more entry than
    # there are weight layers.
    L = len(layer_dims)

    for l in range(1, L):
        n_out, n_in = layer_dims[l], layer_dims[l - 1]

        # Standard-normal draws scaled by 1/sqrt(fan-in).
        parameters['W' + str(l)] = np.random.randn(n_out, n_in) / np.sqrt(n_in)
        parameters['b' + str(l)] = np.zeros((n_out, 1))

        assert parameters['W' + str(l)].shape == (n_out, n_in)
        assert parameters['b' + str(l)].shape == (n_out, 1)

    return parameters

## 4 - Forward Propagation Module

### 4.1 - Linear Forward 

The linear forward module (vectorized over all the examples) computes the following equations:

$Z^{[l]} = W^{[l]}A^{[l-1]} +b^{[l]}$

where $A^{[0]} = X$. 


In [None]:
def linear_forward(A, W, b):
    """
    Linear part of a layer's forward propagation: Z = W A + b.

    Arguments:
    A = activations from previous layer (or input data); shape = (size of previous layer, number of examples)
    W = weights matrix; numpy array shape = (size of current layer, size of previous layer)
    b = bias vector; numpy array shape = (size of the current layer, 1)

    Returns:
    Z = the input of the activation function, also called pre-activation parameter
    cache = the tuple (A, W, b); stored for computing the backward pass efficiently
    """
    # b is broadcast across the example (column) axis.
    Z = np.dot(W, A) + b

    assert Z.shape == (W.shape[0], A.shape[1])
    cache = (A, W, b)

    return Z, cache
    

### 4.2 - Linear Activation Forward

Forward propagation of the *LINEAR->ACTIVATION* layer.

$A^{[l]} = g(Z^{[l]}) = g(W^{[l]}A^{[l-1]} +b^{[l]})$ 


In [None]:
def linear_activation_forward(A_prev, W, b, activation):
    """
    Forward propagation for one LINEAR -> ACTIVATION layer.

    Arguments:
    A_prev = activations from previous layer (or input data); shape = (size of previous layer, number of examples)
    W = weights matrix; numpy array shape = (size of current layer, size of previous layer)
    b = bias vector; numpy array shape = (size of the current layer, 1)
    activation = the activation to be used in this layer - "sigmoid" or "relu"

    Returns:
    A = the output of the activation function
    cache = the tuple (linear_cache, activation_cache); used in backpropagation

    Raises:
    ValueError if activation is neither "sigmoid" nor "relu"
    """
    # Reject unknown activations up front; otherwise A would never be
    # assigned and the failure would surface as an UnboundLocalError.
    if activation not in ("sigmoid", "relu"):
        raise ValueError(
            "activation must be 'sigmoid' or 'relu', got %r" % (activation,)
        )

    # The linear step is identical for both activations.
    Z, linear_cache = linear_forward(A_prev, W, b)

    if activation == "sigmoid":
        A, activation_cache = sigmoid(Z)
    else:
        A, activation_cache = relu(Z)

    assert A.shape == (W.shape[0], A_prev.shape[1])
    cache = (linear_cache, activation_cache)

    return A, cache

### 4.3 - L-Layer Model  Forward

The model applies $L-1$ LINEAR -> RELU layers followed by a single LINEAR -> SIGMOID output layer.

`AL` denotes $A^{[L]} = \sigma(Z^{[L]}) = \sigma(W^{[L]} A^{[L-1]} + b^{[L]})$

In [None]:
def L_model_forward(X, parameters):
    """
    Forward propagation for the [LINEAR -> RELU] * (L-1) -> LINEAR -> SIGMOID model.

    Arguments:
    X = data; numpy array shape = (input size, number of examples)
    parameters = output of initialize_parameters_deep()

    Returns:
    AL = last post-activation value; shape = (1, number of examples)
    caches = list of caches containing:
                every cache of linear_activation_forward() with relu (there are L-1 of them, indexed from 0 to L-2)
                the cache of linear_activation_forward() with sigmoid (there is one, indexed L-1)
    """
    caches = []
    A = X

    # Each layer contributes one "W" and one "b" entry.
    L = len(parameters) // 2

    # LINEAR -> RELU for layers 1 .. L-1
    for l in range(1, L):
        A, cache = linear_activation_forward(
            A, parameters["W" + str(l)], parameters["b" + str(l)], "relu"
        )
        caches.append(cache)

    # LINEAR -> SIGMOID for the output layer L
    AL, cache = linear_activation_forward(
        A, parameters["W" + str(L)], parameters["b" + str(L)], "sigmoid"
    )
    caches.append(cache)

    # The model has a single output unit.
    assert AL.shape == (1, X.shape[1])

    return AL, caches


## 5 - Cost function

Cross-entropy cost $J$: 

$J = -\frac{1}{m} \sum\limits_{i = 1}^{m} \left(y^{(i)}\log\left(a^{[L] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right)\right)$

In [None]:
def compute_cost(AL, Y):
    """
    Cross-entropy cost averaged over the examples.

    Arguments:
    AL = probability vector corresponding to label predictions; shape = (1, number of examples)
    Y = labels vector; shape = (1, number of examples)

    Returns:
    cost = cross-entropy cost; a 0-d numpy array
    """
    m = Y.shape[1]

    # Keep probabilities strictly inside (0, 1). A saturated sigmoid can
    # output exactly 0.0 or 1.0, and log(0) = -inf would turn the cost
    # into nan via 0 * -inf.
    eps = 1e-15
    AL = np.clip(AL, eps, 1 - eps)

    # Compute loss from AL and Y.
    cost = -(1 / m) * np.sum(
        np.dot(Y, np.log(AL).T) + np.dot(1 - Y, np.log(1 - AL).T)
    )

    cost = np.squeeze(cost)      # ensure shape of cost is correct
    assert cost.shape == ()

    return cost
    

## 6 - Backpropagation

6.1 = LINEAR backward

6.2 = LINEAR -> ACTIVATION backward where ACTIVATION computes the derivative of either the ReLU or sigmoid activation

6.3 = LINEAR -> RELU $\times$ (L-1) -> LINEAR -> SIGMOID backward
### 6.1 - Linear Backward

$dZ^{[l]} = \frac{\partial \mathcal{L} }{\partial Z^{[l]}}$

The three outputs $(dW^{[l]}, db^{[l]}, dA^{[l]})$ are computed using the input $dZ^{[l]}$

$ dW^{[l]} = \frac{\partial \mathcal{L} }{\partial W^{[l]}} = \frac{1}{m} dZ^{[l]} A^{[l-1] T}$

$ db^{[l]} = \frac{\partial \mathcal{L} }{\partial b^{[l]}} = \frac{1}{m} \sum_{i = 1}^{m} dZ^{[l](i)}$

$ dA^{[l-1]} = \frac{\partial \mathcal{L} }{\partial A^{[l-1]}} = W^{[l] T} dZ^{[l]}$

In [None]:
def linear_backward(dZ, cache):
    """
    Linear part of the backward propagation for a single layer.

    Arguments:
    dZ = gradient of the cost with respect to the linear output (of current layer l)
    cache = tuple of values (A_prev, W, b) from the forward propagation in the current layer

    Returns:
    dA_prev = gradient of the cost with respect to the activation (of the previous layer l-1); same shape as A_prev
    dW = gradient of the cost with respect to W (current layer l); same shape as W
    db = gradient of the cost with respect to b (current layer l); same shape as b
    """
    A_prev, W, b = cache
    m = A_prev.shape[1]  # examples are stored as columns

    dW = (1 / m) * np.dot(dZ, A_prev.T)
    # keepdims keeps db as a column vector, matching b.
    db = (1 / m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    assert dA_prev.shape == A_prev.shape
    assert dW.shape == W.shape
    assert db.shape == b.shape

    return dA_prev, dW, db

### 6.2 - Linear-Activation Backward

$dZ^{[l]} = dA^{[l]} * g'(Z^{[l]})$  


In [None]:
def linear_activation_backward(dA, cache, activation):
    """
    Backward propagation for one LINEAR -> ACTIVATION layer.

    Arguments:
    dA = post-activation gradient for current layer l
    cache = tuple of values (linear_cache, activation_cache); used for backpropagation
    activation = the activation used in this layer - "sigmoid" or "relu"

    Returns:
    dA_prev = gradient of the cost with respect to the activation (of the previous layer l-1); same shape as A_prev
    dW = gradient of the cost with respect to W (current layer l); same shape as W
    db = gradient of the cost with respect to b (current layer l); same shape as b

    Raises:
    ValueError if activation is neither "sigmoid" nor "relu"
    """
    # Reject unknown activations up front; otherwise the return statement
    # would fail with an UnboundLocalError.
    if activation not in ("sigmoid", "relu"):
        raise ValueError(
            "activation must be 'sigmoid' or 'relu', got %r" % (activation,)
        )

    linear_cache, activation_cache = cache

    # Undo the activation first, then the linear step.
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    else:
        dZ = sigmoid_backward(dA, activation_cache)

    dA_prev, dW, db = linear_backward(dZ, linear_cache)

    return dA_prev, dW, db

### 6.3 - L-Layer Model Backward 

Backpropagation for the LINEAR->RELU $\times$ (L-1) -> LINEAR -> SIGMOID model.

`dAL` $= \frac{\partial \mathcal{L}}{\partial A^{[L]}}$.


```python
dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL)) # derivative of cost with respect to AL
```

In [None]:
def L_model_backward(AL, Y, caches):
    """
    Backward propagation for the [LINEAR -> RELU] * (L-1) -> LINEAR -> SIGMOID model.

    Arguments:
    AL = probability vector; output of the forward propagation (L_model_forward())
    Y = labels vector; reshaped here to the shape of AL
    caches = list of caches containing:
                every cache of linear_activation_forward() with relu (it's caches[l], for l in range(L-1) i.e l = 0...L-2)
                the cache of linear_activation_forward() with sigmoid (it's caches[L-1])

    Returns:
    grads = A dictionary with the gradients
             grads["dW" + str(l)] = gradient of the cost with respect to W of layer l
             grads["db" + str(l)] = gradient of the cost with respect to b of layer l
             grads["dA" + str(l)] = the dA_prev returned for layer l, i.e. the gradient
                                    with respect to the activations feeding layer l
                                    (note the key is offset by one from the math notation)
    """
    grads = {}
    L = len(caches)  # number of layers
    Y = Y.reshape(AL.shape)  # make Y the same shape as AL

    # Keep probabilities strictly inside (0, 1) so the divisions below
    # cannot produce inf/nan when the sigmoid output saturates.
    eps = 1e-15
    AL = np.clip(AL, eps, 1 - eps)

    # Initializing the backpropagation: derivative of cost with respect to AL
    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    # Lth layer SIGMOID -> LINEAR gradients
    current_cache = caches[L - 1]
    (grads["dA" + str(L)],
     grads["dW" + str(L)],
     grads["db" + str(L)]) = linear_activation_backward(
        dAL, current_cache, activation="sigmoid"
    )

    for l in reversed(range(L - 1)):
        # lth layer RELU -> LINEAR gradients.
        # caches[l] belongs to layer l+1; its incoming gradient was stored
        # under "dA" + str(l + 2) by the layer above.
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(
            grads["dA" + str(l + 2)], current_cache, activation="relu"
        )
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads


## 7 - Update Parameters


Update the parameters using gradient descent: 

$ W^{[l]} = W^{[l]} - \alpha \text{ } dW^{[l]}$

$ b^{[l]} = b^{[l]} - \alpha \text{ } db^{[l]}$

where $\alpha$ is the learning rate.

In [None]:
def update_parameters(parameters, grads, learning_rate):
    """
    Apply one gradient-descent step to every W and b.

    Arguments:
    parameters = python dictionary containing parameters "W1", "b1", ..., "WL", "bL"
    grads = python dictionary containing gradients; output of L_model_backward
    learning_rate = step size of the update

    Returns:
    parameters = the same dictionary object, with its entries rebound to the updated values
                  parameters["W" + str(l)] = W_l - learning_rate * dW_l
                  parameters["b" + str(l)] = b_l - learning_rate * db_l
    """
    # Each layer contributes one "W" and one "b" entry.
    L = len(parameters) // 2

    for l in range(1, L + 1):
        w_key, b_key = "W" + str(l), "b" + str(l)
        parameters[w_key] = parameters[w_key] - learning_rate * grads["d" + w_key]
        parameters[b_key] = parameters[b_key] - learning_rate * grads["d" + b_key]

    return parameters
    

## 8 - L-Layer Neural Network


In [None]:
### CONSTANTS ###
# Layer sizes from input to output: 13 input features -> 10 -> 5 -> 1 output unit.
# The list has 4 entries, which initialize_parameters_deep() turns into
# 3 weight layers (W1..W3).
layers_dims = [13, 10, 5, 1]

In [None]:
def L_layer_model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000, print_cost=False):#lr was 0.009
    """
    Train an L-layer network ([LINEAR -> RELU] * (L-1) -> LINEAR -> SIGMOID)
    with batch gradient descent, then plot the recorded cost curve.

    Arguments:
    X = data; numpy array shape = (input size, number of examples)
    Y = labels vector; shape = (1, number of examples)
    layers_dims = list containing the input size and each layer size, of length (number of layers + 1)
    learning_rate = learning rate of the gradient descent update rule
    num_iterations = number of iterations of the optimization loop
    print_cost = if True, print cost every 100 steps

    Returns:
    parameters = parameters learnt by the model - used for prediction
    """
    costs = []                         # cost sampled every 100 iterations

    # Parameters initialization.
    parameters = initialize_parameters_deep(layers_dims)

    # Loop (gradient descent)
    for i in range(num_iterations):

        # Forward propagation: LINEAR -> RELU *(L-1) -> LINEAR -> SIGMOID
        AL, caches = L_model_forward(X, parameters)

        # Compute cost
        cost = compute_cost(AL, Y)

        # Backpropagation
        grads = L_model_backward(AL, Y, caches)

        # Update parameters
        parameters = update_parameters(parameters, grads, learning_rate)

        # Sample the cost every 100 iterations. It is always recorded so the
        # plot below has data even when printing is disabled.
        if i % 100 == 0:
            costs.append(cost)
            if print_cost:
                print("Cost after iteration %i: %f" % (i, cost))

    # plot the cost
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters

## 9 - Train the Model


Split data into training and test sets.

In [None]:
# Hold out 33% of the rows as a test set; random_state is fixed so the
# split is the same on every run.
# NOTE(review): X and y are not defined in the visible part of this
# notebook - presumably a data-loading cell is missing; confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=324)

# Bare expression: the notebook displays it as the cell output.
X_train.shape

Train the model.

In [None]:
# The inputs are transposed before being passed to the model, which expects
# X as (input size, number of examples).
# NOTE(review): compute_cost reads Y.shape[1], so y_train.T must be 2-D -
# presumably y has shape (m, 1); confirm against the data-loading code.
parameters = L_layer_model(X_train.T, y_train.T, layers_dims, num_iterations = 2500, print_cost = True)

## 10 - Predict Classifications of Test Data

Prediction function.

In [None]:
def predict(parameters, X):
    """
    Classify each example as 0 or 1 using the trained network.

    Arguments:
    parameters = python dictionary containing your parameters
    X = input data; size = (n_x, m)

    Returns
    predictions = vector of predictions; float array with the same shape as the
                  network output, holding 1 where the probability is > 0.5, else 0
    """
    # Computes probabilities using forward propagation.
    probabilities, _ = L_model_forward(X, parameters)

    # Threshold at 0.5 in one vectorised step; cast to float to keep the
    # 0.0 / 1.0 values the loop-based version produced.
    predictions = (probabilities > 0.5).astype(float)

    return predictions
    

Run predictions.

In [None]:
# Transpose the held-out features before prediction; predict() documents
# its input as (n_x, m).

test_data = X_test.T

# 0/1 predictions for the test set, as returned by predict().
predictions = predict(parameters, test_data)


Calculate accuracy.

In [None]:
# Test labels, transposed the same way as the test features.

Y = y_test.T

# Count the matches: Y . pred^T counts examples where both are 1,
# (1 - Y) . (1 - pred)^T counts examples where both are 0.
correct = np.dot(Y, predictions.T) + np.dot(1 - Y, 1 - predictions.T)

# Squeeze to a 0-d value before float(): converting a size-1 array with
# ndim > 0 to a Python scalar is deprecated in recent NumPy releases.
accuracy = float(np.squeeze(correct)) / float(Y.size) * 100

# %d truncates the percentage to a whole number.
print('Accuracy: %d' % accuracy + '%')