In [1]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt

% matplotlib inline

# (0) Activation functions      :------------------------------------------------------------

In [83]:
def sigmoid(Z):
    """
    Logistic sigmoid activation, applied element-wise.

    Arguments:
    Z -- pre-activation value(s) of the current layer

    Returns  :
    A -- 1/(1+exp(-Z)), same shape as Z
    Z -- the input itself, handed back unchanged so it can serve as the activation cache
    """
    exp_neg_Z = np.exp(-Z)
    activation = 1/(1+exp_neg_Z)
    return activation, Z
def sigmoid_backward(dA, activation_cache):
    """
    Backward pass through a sigmoid unit.

    Arguments:
    dA -- gradient of the cost with respect to the activation output
    activation_cache -- the pre-activation Z that sigmoid() returned as its cache

    Returns  :
    dZ -- gradient of the cost with respect to Z, i.e. dA * s * (1 - s) with s = sigmoid(Z)
    """
    Z = np.array(activation_cache)
    # Evaluate the sigmoid a single time and reuse it for both factors of s * (1 - s).
    s = 1/(1+np.exp(-Z))
    dZ = np.multiply(dA, s*(1-s))
    return (dZ)

In [None]:
def tanh(Z):
    """
    Hyperbolic-tangent activation, applied element-wise.

    Arguments:
    Z -- pre-activation value(s) of the current layer

    Returns  :
    A -- np.tanh(Z)
    Z -- the input itself, handed back unchanged so it can serve as the activation cache
    """
    return np.tanh(Z), Z
def tanh_backward(dA, activation_cache):
    """
    Backward pass through a tanh unit.

    Arguments:
    dA -- gradient of the cost with respect to the activation output
    activation_cache -- the pre-activation Z that tanh() returned as its cache

    Returns  :
    dZ -- gradient of the cost with respect to Z, i.e. dA * (1 - tanh(Z)**2)
    """
    Z = np.array(activation_cache)
    squashed = np.tanh(Z)
    local_slope = 1-(squashed**2)
    return np.multiply(dA, local_slope)

In [None]:
def relu(Z):
    """
    Rectified-linear activation, applied element-wise.

    Arguments:
    Z -- pre-activation value(s) of the current layer (scalar or numpy array)

    Returns  :
    A -- max(0, Z) computed element-wise, same shape as Z
    Z -- the input itself, handed back unchanged so it can serve as the activation cache
    """
    # The built-in max() compares the whole array against 0 and raises
    # "truth value of an array ... is ambiguous"; np.maximum works per element.
    A = np.maximum(0, Z)
    return (A, Z)
def relu_backward(dA, activation_cache):
    """
    Backward pass through a ReLU unit.

    Arguments:
    dA -- gradient of the cost with respect to the activation output
    activation_cache -- the pre-activation Z that relu() returned as its cache

    Returns  :
    dZ -- gradient of the cost with respect to Z: dA where Z >= 0, and 0 where Z < 0
    """
    Z = np.array(activation_cache)
    # Element-wise derivative mask; a plain `if Z < 0` is ambiguous for arrays.
    # The derivative at exactly Z == 0 is taken as 1, as in the scalar version.
    der_relu = np.where(Z < 0, 0, 1)
    dZ = np.multiply(dA, der_relu)
    return (dZ)

# (1) Parameter initialization  :-----------------------------------------------------------

In [84]:
def parameter_initialization(network_architecture):
    """
    Build the initial weights and biases for every layer of the network.

    Arguments:
    network_architecture -- python list containing the NN architecture [n0,n1,n2,...,nL]

    Returns  :
    parameters -- python dictionary containing parameters "W1","b1","W2","b2",...,"WL","bL";
                  each Wl is drawn from np.random.randn and scaled by 0.01, each bl is all zeros

    """
    layer_params = {}
    fan_pairs = zip(network_architecture[:-1], network_architecture[1:])
    for layer_no, (n_prev, n_curr) in enumerate(fan_pairs, start=1):
        suffix = str(layer_no)
        layer_params['W'+suffix] = 0.01 * np.random.randn(n_curr, n_prev)
        layer_params['b'+suffix] = np.zeros((n_curr, 1))
    return layer_params

# (2) Forward pass functions  :----------------------------------------------------------

In [85]:
def compute_Zl(A_prev, W, b):
    """
    Linear part of a layer's forward step: Z = W . A_prev + b.

    Arguments:
    A_prev -- output of the previous layer
    W -- weights matrix for the current layer: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector for the current layer   : numpy array of shape (size of the current layer, 1)

    Returns  :
    Z -- input to the activation function for the current layer
    cache -- the tuple (A_prev, W, b), stored for computing the backward pass efficiently

    """
    pre_activation = np.dot(W, A_prev) + b
    expected_shape = (W.shape[0], A_prev.shape[1])
    assert pre_activation.shape == expected_shape
    return pre_activation, (A_prev, W, b)

def compute_Al(A_prev, W, b, activation):
    """
    Full forward step of one layer: linear part followed by the chosen activation.

    Arguments:
    A_prev -- output of the previous layer
    W -- weights matrix for the current layer: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector for the current layer   : numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "tanh" or "relu"

    Returns  :
    A -- output of the current layer
    cache -- tuple (linear_cache, activation_cache): linear_cache is the (A_prev, W, b) tuple from compute_Zl(),
             activation_cache is the Z returned by sigmoid() / tanh() / relu(); stored for the backward pass

    Raises   :
    ValueError -- if activation is not one of "sigmoid", "tanh", "relu"

    """
    # Reject unknown names up front; otherwise A would never be assigned and the
    # caller would only see an opaque UnboundLocalError further down.
    if activation not in ("sigmoid", "tanh", "relu"):
        raise ValueError('activation must be "sigmoid", "tanh" or "relu", got %r' % (activation,))

    # The linear step is the same for every activation, so compute it once.
    Z, linear_cache = compute_Zl(A_prev,W,b)
    if activation == "sigmoid":
        A, activation_cache = sigmoid(Z)
    elif activation == "tanh":
        A, activation_cache = tanh(Z)
    else:
        A, activation_cache = relu(Z)
    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)
    return (A, cache)

def forward_propagation(X, parameters, activation):
    """
    Forward pass through the whole network: hidden layers use `activation`,
    the output layer always uses sigmoid.

    Arguments:
    X -- data, numpy array of shape (n0,m)
    parameters -- output of parameter_initialization() during training and update_parameters() during testing
    activation -- the activation to be used for hidden layers, stored as a text string: "sigmoid" or "tanh" or "relu"
    
    Returns  :
    AL -- output of the layer L
    caches -- list of caches containing every cache of compute_Al() (one per layer, so L of them)

    Raises   :
    ValueError -- if the network has hidden layers and activation is not "sigmoid", "tanh" or "relu"
    
    """
    caches= []
    A = X
    L = len(parameters)//2  # each layer contributes one W and one b entry

    # An unknown name used to skip every hidden layer silently and feed X
    # straight into the output layer; fail loudly instead.
    if L > 1 and activation not in ("sigmoid", "tanh", "relu"):
        raise ValueError('activation must be "sigmoid", "tanh" or "relu", got %r' % (activation,))

    # Hidden layers 1 .. L-1
    for l in range(1, L):
        A, cache = compute_Al(A, parameters['W' + str(l)], parameters['b' + str(l)], activation)
        caches.append(cache)

    # Output layer L
    AL, cache = compute_Al(A, parameters['W' + str(L)], parameters['b' + str(L)], "sigmoid")
    caches.append(cache)
    assert(AL.shape == (1,X.shape[1]))
    return (AL, caches)


# (3) Compute the Cost           :----------------------------------------------------------

In [86]:
def compute_cost(AL, Y):
    """
    Binary cross-entropy cost averaged over the examples.

    Arguments:
    AL -- probability vector corresponding to your label predictions, shape (1, number of examples)
     Y -- ground truth vector: numpy array of shape (1, number of training examples(m_train))
    
    Returns  :
    cost -- cross-entropy cost (0-dimensional value)
    """
    m = Y.shape[1]
    # When the sigmoid saturates, AL can be exactly 0.0 or 1.0; log(0) then gives
    # -inf and 0 * -inf gives nan. Keep the probabilities strictly inside (0, 1).
    eps = 1e-15
    AL_safe = np.clip(np.array(AL, dtype=float), eps, 1 - eps)
    log_likelihood = np.multiply(Y, np.log(AL_safe)) + np.multiply((1 - Y), np.log(1 - AL_safe))
    cost = -np.sum(log_likelihood) / m
    cost = np.squeeze(cost)
    assert(cost.shape == ())
    return (cost)

# (4) Backward pass functions:----------------------------------------------------------

In [87]:
def compute_dParam(dZ, cache):
    """
    Linear part of a layer's backward step.

    Arguments:
    dZ -- gradient of the cost with respect to the linear output (of current layer l)
    cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer

    Returns:
    dA_prev -- gradient of the cost with respect to the activation (of the previous layer l-1)
    dW -- gradient of the cost with respect to W (current layer l)
    db -- gradient of the cost with respect to b (current layer l)
    
    """
    prev_activation, weights, _ = cache
    n_examples = prev_activation.shape[1]
    scale = 1/n_examples  # averaging factor over the examples

    grad_prev = np.dot(weights.T, dZ)
    grad_W = scale*np.dot(dZ, prev_activation.T)
    grad_b = scale*np.sum(dZ, axis=1, keepdims=True)
    return grad_prev, grad_W, grad_b

def compute_dZ_dParam(dA, cache, activation):
    """
    Full backward step of one layer: activation derivative followed by the linear part.

    Arguments:
    dA -- post-activation gradient for current layer l 
    cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "tanh" or "relu" (same as used in forward propagation)
    
    Returns  :
    dA_prev -- gradient of the cost with respect to the activation (of the previous layer l-1)
    dW -- gradient of the cost with respect to W (current layer l)
    db -- gradient of the cost with respect to b (current layer l)

    Raises   :
    ValueError -- if activation is not one of "sigmoid", "tanh", "relu"
    
    """
    # Reject unknown names up front; otherwise dZ would never be assigned and the
    # function would end in an opaque UnboundLocalError.
    if activation not in ("sigmoid", "tanh", "relu"):
        raise ValueError('activation must be "sigmoid", "tanh" or "relu", got %r' % (activation,))

    linear_cache, activation_cache = cache
    if activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
    elif activation == "tanh":
        dZ = tanh_backward(dA, activation_cache)
    else:
        dZ = relu_backward(dA, activation_cache)

    # The linear backward step does not depend on the activation, so call it once.
    dA_prev, dW, db = compute_dParam(dZ, linear_cache)
    return (dA_prev, dW, db)

def backward_propagation(AL, Y, caches, activation):
    """
    Backward pass through the whole network: sigmoid output layer first,
    then the hidden layers in reverse order using `activation`.

    Arguments:
    AL -- output of forward propagation (forward_propagation())
     Y -- true "label" vector 
    caches -- list of caches containing every cache of compute_Al() (one per layer, so L of them)
    activation -- the activation to be used for hidden layers, stored as a text string: "sigmoid" or "tanh" or "relu"
    
    Returns  :
    grads -- A dictionary with the gradients
             grads["dA" + str(l)] = ... 
             grads["dW" + str(l)] = ...
             grads["db" + str(l)] = ... 
    """
    grads = {}
    L = len(caches) 
    Y = Y.reshape(AL.shape) 

    # If the output sigmoid saturates, AL is exactly 0.0 or 1.0 and the divisions
    # below produce inf / nan that then spread to every parameter. Keep the
    # probabilities strictly inside (0, 1) before differentiating the cost.
    eps = 1e-15
    AL_safe = np.clip(np.array(AL, dtype=float), eps, 1 - eps)
    dAL = - (np.divide(Y, AL_safe) - np.divide(1 - Y, 1 - AL_safe))
   
    # Output layer L (cache index L-1) always uses sigmoid.
    current_cache = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = compute_dZ_dParam(dAL, current_cache, "sigmoid")
    
    # Hidden layers: cache index l holds layer l+1; walk from layer L-1 down to layer 1.
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = compute_dZ_dParam(grads["dA" + str(l + 1)], current_cache, activation)
        grads["dA" + str(l)]     = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return (grads)

# (5) Updating the Parameters:----------------------------------------------------------

In [88]:
def update_parameters(parameters, grads, learning_rate):
    """
    Apply one gradient-descent step to every weight matrix and bias vector.

    Arguments:
    parameters -- python dictionary containing parameters 
    grads -- python dictionary containing gradients, output of backward_propagation()
    learning_rate -- step size multiplying each gradient
    
    Returns  :
    parameters -- the same dictionary object, with its entries replaced by the updated values
                  parameters["W" + str(l)] = ... 
                  parameters["b" + str(l)] = ...
    """
    n_layers = len(parameters) // 2
    for layer_no in range(1, n_layers + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer_no)
            step = learning_rate*grads["d" + key]
            parameters[key] = parameters[key]-step
    return parameters

# --------------------------<< Training the Network >>----------------------------------

In [None]:
def train_NN(X, Y, network_architecture, activation, num_iterations, learning_rate, print_cost):
    """
    Trains an L-layer neural network with batch gradient descent and plots the cost curve
    
    Arguments:
    X -- training data: numpy array of shape (number of input features(n0), number of training examples(m_train))
    Y -- ground truth vector: numpy array of shape (1, number of training examples(m_train))
    network_architecture -- python list containing the NN architecture [n0,n1,n2,...nL]
    activation -- the activation to be used for hidden layers, stored as a text string: "sigmoid" or "tanh" or "relu"
    num_iterations -- number of iterations of the optimization loop
    learning_rate -- learning rate of the gradient descent update rule
    print_cost -- if True, it prints the cost every 100 steps
    
    Returns :
    parameters -- parameters learnt by the model. They can then be used to predict.
    
    """
    costs = []
    parameters = parameter_initialization(network_architecture)
    for i in range(0, int(num_iterations)):
        AL, caches = forward_propagation(X, parameters, activation)
        cost  = compute_cost(AL, Y)
        grads = backward_propagation(AL, Y, caches, activation)
        parameters = update_parameters(parameters, grads, learning_rate)
        if i % 100 == 0:
            # Always sample the cost so the curve below is populated even when
            # printing is switched off; print_cost only controls the console output.
            costs.append(cost)
            if print_cost:
                print ("Cost after iteration %i: %f" %(i, cost))
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    # One point is recorded every 100 iterations, so the x axis counts hundreds.
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()
    
    return (parameters)

# --------------------------<<     Implementation     >>----------------------------------

In [89]:
def predict(X, Y, parameters,activation):
    """
    Predict binary labels for X with a trained network.

    Arguments:
    X -- data, numpy array of shape (n0, number of examples)
    Y -- ground truth vector; not used by the prediction, kept so existing callers keep working
    parameters -- parameters learnt by the model (output of train_NN())
    activation -- the activation used for hidden layers: "sigmoid" or "tanh" or "relu"

    Returns  :
    Y_prediction -- array with the shape of the network output, holding 0 where the
                    output probability is <= 0.5 and 1 otherwise
    """
    AL = forward_propagation(X, parameters, activation)[0]
    # Vectorized threshold; same rule as the element-wise test (<= 0.5 -> 0, else 1).
    Y_prediction = np.where(AL <= 0.5, 0.0, 1.0)
    return (Y_prediction)

In [91]:
def NN_model(X_train, Y_train, X_test, Y_test, network_architecture, activation, num_iterations, learning_rate, print_cost):
    """
    Trains an L-layer neural network and prints its training and test accuracy
    
    Arguments:
    X_train -- training data: numpy array of shape (number of input features(n0), number of training examples(m_train))
    Y_train -- ground truth vector: numpy array of shape (1, number of training examples(m_train))
    X_test  -- test data: numpy array of shape (number of input features(n0), number of test examples(m_test))
    Y_test  -- ground truth vector: numpy array of shape (1, number of test examples(m_test))
    network_architecture -- python list containing the NN architecture [n0,n1,n2,...nL]
    activation -- the activation to be used for hidden layers, stored as a text string: "sigmoid" or "tanh" or "relu"
    num_iterations -- number of iterations of the optimization loop
    learning_rate -- learning rate of the gradient descent update rule
    print_cost -- if True, it prints the cost every 100 steps

    """
    def _accuracy_percent(predicted, truth):
        # Percentage of labels that match: 100 minus the mean absolute error in percent.
        return 100 - np.mean(np.abs(predicted - truth)) * 100

    learned = train_NN(X_train, Y_train, network_architecture, activation, num_iterations, learning_rate, print_cost)

    predicted_train = predict(X_train, Y_train, learned, activation)
    predicted_test = predict(X_test , Y_test , learned, activation)

    print("train accuracy: {} %".format(_accuracy_percent(predicted_train, Y_train)))
    print("test accuracy: {} %".format(_accuracy_percent(predicted_test, Y_test)))