In [60]:
import numpy as np
import matplotlib.pyplot as plt
from utils import *
from test_cases import *

In [61]:
def initialise_deep(layer_dims):
    """
    Layer dims is a list indicating the various layers
    """
    params = {}
    L = len(layer_dims)
    
    #Setting the shape for parameters W and b of layer l to be (l, l-1) and (l, 1) respectively
    for l in range(1, L):
        params["W" + str(l)] = np.random.rand(layer_dims[l], layer_dims[l-1])*0.01
        params["b" + str(l)] = np.zeros(shape = (layer_dims[l], 1))
        
        assert(params["W" + str(l)].shape == (layer_dims[l], layer_dims[l-1]))
        assert(params["b" + str(l)].shape == (layer_dims[l], 1)) 
        
    return params

In [62]:
# Init test
params = initialise_deep([2,1])
params

{'W1': array([[0.00078808, 0.00805757]]), 'b1': array([[0.]])}

In [63]:
# foward prop without activation
def linear_foward(A_prev, W, b):
    
    Z = np.dot(W, A_prev) + b
    
    assert(Z.shape == (W.shape[0], A_prev.shape[1]))
    
    cache = (A_prev, W, b)
    
    return Z, cache

In [64]:
# Linear_foward_test
def linear_foward_test():
    A_prev = np.random.rand(3,2)
    W = np.random.rand(4,3)
    b = np.zeros(shape = (4,1))
    return A_prev, W, b
    

In [74]:
# Linear_test_foward
A_prev, W, b = linear_foward_test()
Z, cache = linear_foward(A_prev, W, b)
print("Z :", Z.shape)
for i in cache:
    print("content:", i)

Z : (4, 2)
content: [[0.43179347 0.26522888]
 [0.93116502 0.65949471]
 [0.94000788 0.98403165]]
content: [[0.17763874 0.51384609 0.68409248]
 [0.64985922 0.76776227 0.80978965]
 [0.96936564 0.09262521 0.89938912]
 [0.58247105 0.75353202 0.89115137]]
content: [[0.]
 [0.]
 [0.]
 [0.]]


In [66]:
# Linear foward with activation 
def linear_activation_foward(A_prev, W, b, activation):
    Z, cache = linear_foward(A_prev, W, b)
    if activation == "sigmoid":
        A, activation_cache = sigmoid(Z)
    elif activation == "relu":
        A, activation_cache = relu(Z)
    
    cache = (A_prev, W, b, Z)
    return A, cache
    

In [90]:
# linear foward with activation test
A, cache = linear_activation_foward(A_prev, W, b, activation="sigmoid")
print("A: ", A)
cache
# for i in cache:
#     print("content: ", i)


A:  [[0.76820995 0.74253044]
 [0.85279926 0.81390029]
 [0.79417017 0.76909657]
 [0.85703258 0.82176619]]


(array([[0.43179347, 0.26522888],
        [0.93116502, 0.65949471],
        [0.94000788, 0.98403165]]),
 array([[0.17763874, 0.51384609, 0.68409248],
        [0.64985922, 0.76776227, 0.80978965],
        [0.96936564, 0.09262521, 0.89938912],
        [0.58247105, 0.75353202, 0.89115137]]),
 array([[0.],
        [0.],
        [0.],
        [0.]]),
 array([[1.19823107, 1.05916235],
        [1.75672699, 1.47555524],
        [1.35024797, 1.20321696],
        [1.79085916, 1.52835968]]))

In [68]:
# model with L layers foward path
def L_model_foward(X, params):
    """
    X: input tensor
    params: dict containing weights and biases for all layers
    cache: tuple containing all the previous values of A_prev, W, b and Z
    
    """
    # list for storing all the intermediate values
    caches = []
    L = len(params) // 2
    
    A = X
    
    for l in range(1, L):
        A_prev = A
        A, cache = linear_activation_foward(A_prev, 
                                            params["W" + str(l)],
                                            params["b" + str(l)], 
                                            "relu")
        caches.append(cache)
        
    AL, cache = linear_activation_foward(A, 
                                         params["W" + str(L)],
                                         params["b" + str(L)],
                                         "sigmoid")
    caches.append(cache)
    
    return AL, caches
    

In [69]:
# L model test case
def L_model_foward_test_case():
    X = np.random.rand(3,2)
    params = initialise_deep([3,4,2])
    return X, params

In [70]:
# L model test
X, params = L_model_foward_test_case()
AL, caches = L_model_foward(X, params)
print("Al: ", AL)
for i in caches:
    print("cache_content: " + str(i[3]))

Al:  [[0.50005532 0.50003367]
 [0.50004026 0.50002263]]
cache_content: [[0.01426713 0.00848462]
 [0.0111772  0.00553742]
 [0.00767508 0.0051125 ]
 [0.00392853 0.00238374]]
cache_content: [[2.21269071e-04 1.34661790e-04]
 [1.61022315e-04 9.05228591e-05]]


In [71]:
# Cost
def cost(AL, Y):
    m = Y.shape[0]
    
    cost = (-1/m) * np.sum(np.multiply(Y, np.log(AL))+ np.multiply(1-Y, np.log(1-AL)))
    
    cost = np.squeeze(cost)      # To make sure your cost's shape is what we expect (e.g. this turns [[17]] into 17).
    assert(cost.shape==())
           
    return cost

In [73]:
# cost test
Y = np.random.rand(2,2)
d = cost(AL, Y)
d

1.386230336061199

In [85]:
# Back propagation (linear)
def linear_backward(dZ, cache):
    A_prev, W, b, _ = cache
    m = A_prev.shape[1]
    dW = np.dot(dZ, A_prev.T)* (1/m)
    db = np.sum(dZ, axis = 1, keepdims = True)
    dA_prev = np.dot(W.T, dZ)*(1/m)
    db.shape
    assert(dA_prev.shape == A_prev.shape)
    assert(dW.shape == W.shape)
    assert(db.shape == b.shape)
    
    return dA_prev, dW, db


In [80]:
# Linear backprop test
def linear_backward_test_case(Z):
    dZ = np.random.rand(Z.shape[0], Z.shape[1])
    return dZ

In [84]:
dZ = linear_backward_test_case(Z)
dA_prev, dW, db = linear_backward(dZ, cache)


In [89]:
def linear_activation_backward(dA, cache, activation):
    A_prev, W, b, Z = cache
    m = A_prev.shape[1]
    if activation == "sigmoid":
        dZ = sigmoid_backward(dA, Z)
    if activation == "relu":
        dZ = relu_backward(dA, Z)
    dA_prev, dW, db = linear_backward(dZ, cache)
    return dA_prev, dW, db

In [94]:
# linear_activation_backward test case
def linear_activation_backward_test_case(dZ):
    dA = np.random.rand(dZ.shape[0], dZ.shape[1])
    return dA
dA = linear_activation_backward_test_case(dZ)
dA_prev, dW, db = linear_activation_backward(dA, cache, activation="relu")
print("dA_prev: ", dA_prev)
print("dW: ", dW)
print("db: ", db)

dA_prev:  [[0.95716363 0.69459362]
 [0.70108167 0.61876646]
 [1.14190043 0.97355165]]
dW:  [[0.12724061 0.30954539 0.44030162]
 [0.29317912 0.66586242 0.79443795]
 [0.27631724 0.62345184 0.7296329 ]
 [0.2000835  0.44174081 0.4832428 ]]
db:  [[0.89918816]
 [1.65430102]
 [1.52289117]
 [1.01718826]]


In [None]:
## Model Back prop
def L_momdel_backward(AL, Y, caches):
    grads = {}
    L = len(caches)   #num layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)
    
    
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    current_cache = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, "sigmoid")
    dA = grads["dA" + str(L-1)]
    
    for l in range(L-1):
        current_cache = cache[l]
        dA_temp, dW_temp, db_temp = linear_activation_backward(dA, current_cache, "relu")
        
        grads["dA" + str(l+1)],
        grads["dW" + str(l+1)],
        grads["db" + str(l+1)] = dA_temp, dW_temp, db_temp
        
    return grads

In [None]:
def L_model_backward(AL, Y, caches):
    """
    Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group
    
    Arguments:
    AL -- probability vector, output of the forward propagation (L_model_forward())
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat)
    caches -- list of caches containing:
                every cache of linear_activation_forward() with "relu" (it's caches[l], for l in range(L-1) i.e l = 0...L-2)
                the cache of linear_activation_forward() with "sigmoid" (it's caches[L-1])
    
    Returns:
    grads -- A dictionary with the gradients
             grads["dA" + str(l)] = ... 
             grads["dW" + str(l)] = ...
             grads["db" + str(l)] = ... 
    """
    grads = {}
    L = len(caches) # the number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape) # after this line, Y is the same shape as AL
    
    # Initializing the backpropagation
    ### START CODE HERE ### (1 line of code)
    
    ### END CODE HERE ###
    
    # Lth layer (SIGMOID -> LINEAR) gradients. Inputs: "AL, Y, caches". Outputs: "grads["dAL"], grads["dWL"], grads["dbL"]
    ### START CODE HERE ### (approx. 2 lines)
    current_cache = caches[-1]
    grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation = "sigmoid")
    ### END CODE HERE ###
    
    for l in reversed(range(L-1)):
        # lth layer: (RELU -> LINEAR) gradients.
        # Inputs: "grads["dA" + str(l + 2)], caches". Outputs: "grads["dA" + str(l + 1)] , grads["dW" + str(l + 1)] , grads["db" + str(l + 1)] 
        ### START CODE HERE ### (approx. 5 lines)
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_backward(sigmoid_backward(dAL, current_cache[1]), current_cache[0])
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
        ### END CODE HERE ###

    return grads


In [None]:
def update_parameters(parameters, grads, learning_rate):
    """
    Update parameters using gradient descent

    Arguments:
    parameters -- python dictionary containing your parameters 
    grads -- python dictionary containing your gradients, output of L_model_backward

    Returns:
    parameters -- python dictionary containing your updated parameters 
                  parameters["W" + str(l)] = ... 
                  parameters["b" + str(l)] = ...
    """

    L = len(parameters) // 2 # number of layers in the neural network

    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l + 1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l + 1)]
    
    return parameters

In [None]:
def two_layer_model(X, Y, layers_dims, learning_rate=0.0075, num_iterations=3000, print_cost=False):
    """
    Implements a two-layer neural network: LINEAR->RELU->LINEAR->SIGMOID.
    
    Arguments:
    X -- input data, of shape (n_x, number of examples)
    Y -- true "label" vector (containing 0 if cat, 1 if non-cat), of shape (1, number of examples)
    layers_dims -- dimensions of the layers (n_x, n_h, n_y)
    num_iterations -- number of iterations of the optimization loop
    learning_rate -- learning rate of the gradient descent update rule
    print_cost -- If set to True, this will print the cost every 100 iterations 
    
    Returns:
    parameters -- a dictionary containing W1, W2, b1, and b2
    """
    
    np.random.seed(1)
    grads = {}
    costs = []                              # to keep track of the cost
    m = X.shape[1]                           # number of examples
    (n_x, n_h, n_y) = layers_dims
    
    # Initialize parameters dictionary, by calling one of the functions you'd previously implemented
    ### START CODE HERE ### (≈ 1 line of code)
    parameters = initialize_parameters(n_x, n_h, n_y)
    ### END CODE HERE ###
    
    # Get W1, b1, W2 and b2 from the dictionary parameters.
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    
    # Loop (gradient descent)

    for i in range(0, num_iterations):

        # Forward propagation: LINEAR -> RELU -> LINEAR -> SIGMOID. Inputs: "X, W1, b1". Output: "A1, cache1, A2, cache2".
        ### START CODE HERE ### (≈ 2 lines of code)
        A1, cache1 = linear_activation_forward(X, W1, b1, 'relu')
        A2, cache2 = linear_activation_forward(A1, W2, b2, 'sigmoid')
        ### END CODE HERE ###
        
        # Compute cost
        ### START CODE HERE ### (≈ 1 line of code)
        cost = compute_cost(A2, Y)
        ### END CODE HERE ###
        
        # Initializing backward propagation
        dA2 = - (np.divide(Y, A2) - np.divide(1 - Y, 1 - A2))
        
        # Backward propagation. Inputs: "dA2, cache2, cache1". Outputs: "dA1, dW2, db2; also dA0 (not used), dW1, db1".
        ### START CODE HERE ### (≈ 2 lines of code)
        dA1, dW2, db2 = linear_activation_backward(dA2, cache2, 'sigmoid')
        dA0, dW1, db1 = linear_activation_backward(dA1, cache1, 'relu')
        ### END CODE HERE ###
        
        # Set grads['dWl'] to dW1, grads['db1'] to db1, grads['dW2'] to dW2, grads['db2'] to db2
        grads['dW1'] = dW1
        grads['db1'] = db1
        grads['dW2'] = dW2
        grads['db2'] = db2
        
        # Update parameters.
        ### START CODE HERE ### (approx. 1 line of code)
        parameters = update_parameters(parameters, grads, learning_rate)
        ### END CODE HERE ###

        # Retrieve W1, b1, W2, b2 from parameters
        W1 = parameters["W1"]
        b1 = parameters["b1"]
        W2 = parameters["W2"]
        b2 = parameters["b2"]
        
        # Print the cost every 100 training example
        if print_cost and i % 100 == 0:
            print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
        if print_cost and i % 100 == 0:
            costs.append(cost)
       
    # plot the cost

    plt.plot(np.squeeze(costs))B
    plt.ylabel('cost')
    plt.xlabel('iterations (per tens)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()
    
    return parameters
