# A Deep Neural Network Classifying Cats

Introduction:

Build a neural network to classify cats vs. non-cats. The network has $L$ layers: $L - 1$ layers with a ReLU activation function, followed by an output layer with a sigmoid activation function.


In [18]:
import numpy as np
import h5py
import matplotlib.pyplot as plt
from testCases_v4a import *

%matplotlib inline
# Notebook-wide matplotlib defaults, applied through the global rcParams mapping.
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
# Default image interpolation mode: 'nearest'.
plt.rcParams['image.interpolation'] = 'nearest'
# Default colormap for images: 'gray'.
plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2

np.random.seed(1)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 0 - Helper functions

In [13]:
def sigmoid(Z):
    """
    Apply the logistic sigmoid element-wise.

    Arguments:
    Z -- pre-activation value(s); a scalar or numpy array

    Returns:
    1 / (1 + exp(-Z)), with the same shape as Z
    """
    denominator = np.exp(-Z) + 1
    return 1 / denominator

In [14]:
def relu(Z):
    """
    Apply the rectified linear unit element-wise.

    Arguments:
    Z -- pre-activation value(s); a scalar or numpy array

    Returns:
    the element-wise maximum of 0 and Z
    """
    rectified = np.maximum(0, Z)
    return rectified

In [33]:
def sigmoid_backward(dA, Z):
    """
    Backward pass through a sigmoid activation.

    Arguments:
    dA -- gradient of the cost with respect to the activation output
    Z -- pre-activation value the sigmoid was applied to

    Returns:
    dZ -- gradient with respect to Z: dA * s * (1 - s), where s = sigmoid(Z)
    """
    # Recompute the sigmoid output from Z; its derivative is s * (1 - s).
    sig = 1 / (np.exp(-Z) + 1)
    return dA * sig * (1 - sig)

In [34]:
def relu_backward(dA, Z):
    """
    Backward pass through a ReLU activation.

    Arguments:
    dA -- gradient of the cost with respect to the activation output
    Z -- pre-activation value the ReLU was applied to

    Returns:
    dZ -- a copy of dA with entries zeroed wherever Z <= 0 (dA itself is not modified)
    """
    grad = np.array(dA, copy=True)
    # The ReLU gradient is zero wherever the unit was not active.
    inactive = Z <= 0
    grad[inactive] = 0
    return grad

### 1 - Initialization

In [2]:
def initialize_parameters(layer_dim):
    """
    Create small random weights and zero biases for every layer.

    Arguments:
    layer_dim -- python array (list) containing the sizes of each layer in the network

    Returns:
    parameters -- python dictionary containing parameters "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layer_dim[l], layer_dim[l-1])
                    bl -- bias vector of shape (layer_dim[l], 1)

    Note: the global numpy random seed is reset to 3 on every call, so the
    returned weights are the same for the same layer_dim.
    """
    np.random.seed(3)

    parameters = {}
    n_layers = len(layer_dim)
    layer = 1
    while layer < n_layers:
        fan_out = layer_dim[layer]
        fan_in = layer_dim[layer - 1]
        suffix = str(layer)
        # Weights are scaled down by 0.01; biases start at zero.
        parameters['W' + suffix] = np.random.randn(fan_out, fan_in) * 0.01
        parameters['b' + suffix] = np.zeros((fan_out, 1))
        layer += 1

    return parameters

### 2 - Forward propagation

$$Z^{[l]} = W^{[l]}A^{[l-1]} +b^{[l]}\tag{4}$$

where $A^{[0]} = X$. 

In [19]:
def linear_activation_forward(A_prev, W, b, activation):
    '''
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    A_prev -- activation from the previous layer
    W -- weights matrix of the current layer
    b -- bias vector of the current layer
    activation: the activation function to be used in this layer, stored as string 'relu' or 'sigmoid'

    Returns:
    A -- output of the activation function
    cache -- a python tuple containing 'linear_cache' and 'activation_cache'; stored for computing backward pass more efficiently
             linear_cache is (A_prev, W, b) and activation_cache is Z

    Raises:
    ValueError -- if activation is neither 'sigmoid' nor 'relu'
    '''
    # Reject unknown activations up front instead of failing later with an
    # unbound local variable.
    if activation not in ('sigmoid', 'relu'):
        raise ValueError(
            "activation must be 'sigmoid' or 'relu', got " + repr(activation)
        )

    # Linear step: Z = W . A_prev + b
    Z = np.dot(W, A_prev) + b
    linear_cache = (A_prev, W, b)

    if activation == 'sigmoid':
        A = sigmoid(Z)
    else:
        A = relu(Z)

    # Z is kept so the backward pass can evaluate the activation derivative.
    activation_cache = Z
    cache = (linear_cache, activation_cache)

    return A, cache

In [25]:
def forward_propagation(X, parameters):
    """
    Run the full forward pass: [LINEAR->RELU]*(L-1) followed by LINEAR->SIGMOID.

    Arguments:
    X -- data, numpy array of shape (input size, number of examples)
    parameters -- output of initialize_parameters()

    Returns:
    AL -- the activation value of the output layer
    caches -- list of caches containing every cache of linear_activation_forward(),
              in layer order (hidden layers first, output layer last)
    """
    # Each layer contributes one W and one b entry.
    num_layers = len(parameters) // 2
    caches = []
    activation = X

    # Hidden layers 1 .. L-1 use ReLU.
    hidden = 1
    while hidden < num_layers:
        weights = parameters['W' + str(hidden)]
        bias = parameters['b' + str(hidden)]
        activation, layer_cache = linear_activation_forward(activation, weights, bias, 'relu')
        caches.append(layer_cache)
        hidden += 1

    # Output layer L uses the sigmoid.
    out_weights = parameters['W' + str(num_layers)]
    out_bias = parameters['b' + str(num_layers)]
    AL, out_cache = linear_activation_forward(activation, out_weights, out_bias, 'sigmoid')
    caches.append(out_cache)

    return AL, caches

In [26]:
# Smoke-test forward_propagation on a fixture (presumably supplied by the
# testCases_v4a star import -- confirm) and print the output activations and
# the number of caches collected.
X, parameters = L_model_forward_test_case_2hidden()
AL, caches = forward_propagation(X, parameters)
print("AL = " + str(AL))
print("Length of caches list = " + str(len(caches)))

AL = [[0.03921668 0.70498921 0.19734387 0.04728177]]
Length of caches list = 3


### 3 - Compute cost
$$-\frac{1}{m} \sum\limits_{i = 1}^{m} (y^{(i)}\log\left(a^{[L] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right)) \tag{7}$$


In [27]:
def compute_cost(AL, Y, eps=1e-15):
    """
    Compute the mean binary cross-entropy cost.

    Arguments:
    AL -- predicted probabilities from the output layer; indexed element-wise against Y
    Y -- true label array; its second dimension (Y.shape[1]) is used as the number of examples m
    eps -- clipping margin; AL is limited to [eps, 1 - eps] before the logs are
           taken so that a saturated prediction (exactly 0 or 1) does not
           produce log(0) and turn the cost into nan/inf. The default is
           chosen for float64 inputs.

    Returns:
    cost -- the cross-entropy cost, squeezed to a 0-d value
    """
    m = Y.shape[1]
    # Values strictly inside [eps, 1 - eps] pass through unchanged.
    AL_safe = np.clip(AL, eps, 1 - eps)
    log_likelihood = Y * np.log(AL_safe) + (1 - Y) * np.log(1 - AL_safe)
    cost = -np.sum(log_likelihood) / m
    cost = np.squeeze(cost)
    return cost

### 4 - Backward propagation

In [41]:
def linear_activation_backward(dA, cache, activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    dA -- gradient of the cost with respect to this layer's activation output
    cache -- tuple (linear_cache, activation_cache) where linear_cache is
             (A_prev, W, b) and activation_cache is Z
    activation -- the activation used in this layer, stored as string 'relu' or 'sigmoid'

    Returns:
    dA_prev -- gradient of the cost with respect to the previous layer's activation
    dW -- gradient of the cost with respect to W
    db -- gradient of the cost with respect to b

    Raises:
    ValueError -- if activation is neither 'sigmoid' nor 'relu'
    """
    # Reject unknown activations up front instead of failing later with an
    # unbound local variable.
    if activation not in ('sigmoid', 'relu'):
        raise ValueError(
            "activation must be 'sigmoid' or 'relu', got " + repr(activation)
        )

    # The second axis of dA is used as the number of examples.
    m = dA.shape[1]
    linear_cache, activation_cache = cache
    Z = activation_cache

    if activation == 'sigmoid':
        dZ = sigmoid_backward(dA, Z)
    else:
        dZ = relu_backward(dA, Z)

    A_prev, W, b = linear_cache
    # dW and db are averaged over the m examples.
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

In [53]:
def backward_propagation(AL, Y, caches):
    """
    Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group

    Arguments:
    AL -- probability vector, output of the forward propagation
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat)
    caches -- list of caches containing:
                every cache of linear_activation_forward() with "relu" (it's caches[l], for l in range(L-1) i.e l = 0...L-2)
                the cache of linear_activation_forward() with "sigmoid" (it's caches[L-1])

    Returns:
    grads -- A dictionary with the gradients
             grads["dA" + str(l)] = gradient w.r.t. the activation of layer l, for l = 0...L-1
             grads["dW" + str(l)] = gradient w.r.t. W of layer l, for l = 1...L
             grads["db" + str(l)] = gradient w.r.t. b of layer l, for l = 1...L
    """
    L = len(caches)
    Y = Y.reshape(AL.shape)
    grads = {}

    # layer L: sigmoid
    # Derivative of the cross-entropy cost with respect to AL.
    dAL = - np.divide(Y, AL) + np.divide(1-Y, 1-AL)
    grads['dA' + str(L-1)], grads['dW' + str(L)], grads['db' + str(L)] = linear_activation_backward(dAL, caches[L-1], 'sigmoid')

    # layer L-1 ~ 1: relu
    for i in range(L-1, 0, -1):
        dA_prev, dW, db = linear_activation_backward(grads['dA' + str(i)], caches[i-1], 'relu')
        # Store the upstream gradient so the next (earlier) layer can read it;
        # without this, networks with three or more layers fail with KeyError.
        grads['dA' + str(i-1)] = dA_prev
        grads['dW' + str(i)] = dW
        grads['db' + str(i)] = db

    return grads

In [54]:
# Smoke-test backward_propagation on a fixture and print the resulting
# gradients (fixture and print_grads are presumably supplied by the
# testCases_v4a star import -- confirm).
AL, Y_assess, caches = L_model_backward_test_case()
grads = backward_propagation(AL, Y_assess, caches)
print_grads(grads)

dW1 = [[0.41010002 0.07807203 0.13798444 0.10502167]
 [0.         0.         0.         0.        ]
 [0.05283652 0.01005865 0.01777766 0.0135308 ]]
db1 = [[-0.22007063]
 [ 0.        ]
 [-0.02835349]]
dA1 = [[ 0.12913162 -0.44014127]
 [-0.14175655  0.48317296]
 [ 0.01663708 -0.05670698]]


### 5 - Update parameters

In [55]:
def update_parameters(parameters, grads, learning_rate):
    """
    Apply one gradient-descent step to every layer's parameters.

    Arguments:
    parameters -- dictionary holding "W1", "b1", ..., "WL", "bL"
    grads -- dictionary holding the matching "dW1", "db1", ..., "dWL", "dbL"
    learning_rate -- step size multiplying each gradient

    Returns:
    parameters -- the same dictionary object that was passed in; entries are
                  updated with -=, which for numpy arrays modifies the stored
                  arrays in place
    """
    num_layers = len(parameters) // 2

    for layer in range(1, num_layers + 1):
        suffix = str(layer)
        # Weights first, then biases, for each layer.
        for prefix in ('W', 'b'):
            parameters[prefix + suffix] -= learning_rate * grads['d' + prefix + suffix]

    return parameters