# Deep Learning - Assigment 1

Sagiv Melamed - I.D.
Dan Peled - I.D. 211547013

In [1]:
# imports
import numpy as np
# from util import *
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from typing import Callable
import tensorflow as tf

# Part 1
This section contains the implementations of the forward propagation

In [2]:
def initialize_parameters(layers_dims: list) -> dict:
    """
    Create an ANN architecture depending on layers_dims
    :param layers_dims: list of layers dimentions
    :type layers_dims: list
    :return: dictionary built as follows:
        W: list of matrices representing layer's weights, initialized randomly,
        b: list of biases for each layer, initialized to zero
    :rtype: dict
    """
    # Create W

    W_sizes = [(next_dim, current_dim) for current_dim, next_dim in zip(layers_dims[:-1], layers_dims[1:])]
    W = [np.random.randn(*Wi_size) for Wi_size in W_sizes]

    # create b

    b_sizes = layers_dims[1:]
    b = [np.zeros((bi_size, 1)) for bi_size in b_sizes]

    return {
        "W": W,
        "b": b
    }


In [3]:
def linear_forward(A: np.ndarray, W: np.ndarray, B: np.ndarray) -> dict:
    """
    Performing linear forward on NN
    :param A: Activation vector of previous layer
    :type A: np.ndarray
    :param W: Weight matrix of the current layer
    :type W: np.ndarray
    :param B: Bias vector of the current layer
    :type B: np.ndarray
    :return: dictionary built as follows:
        Z: linear component of activation function
        linear_cache: A,W,B
    :rtype: dict
    """
    return {
        "Z": W.dot(A) + B,
        "linear_cache": {
            "A": A,
            "W": W,
            "B": B
        }
    }

Activation functions:

In [4]:
def softmax(Z: np.ndarray) -> dict:
    """
    Applying softmax on Z
    :param Z: the linear component of the activation function
    :type Z: np.ndarray
    :return: dictionary built as follows:
        A: Activation of th layer
        activation_cache: Z
    :rtype: dict
    """
    # To avoid overflow in the exponent we will subtract the max value from z
    # and perform softmax on that. The mathematical proof provided in the report.
    z = np.copy(Z)
    z -= z.max(axis=0)
    return {
        "A": np.exp(z) / np.exp(z).sum(),
        "activation_cache": {
            "Z": Z
        }
    }


def relu(Z: np.ndarray) -> dict:
    """
        Applying relu on Z
        :param Z: the linear component of the activation function
        :type Z: np.ndarray
        :return: dictionary built as follows:
            A: Activation of th layer
            activation_cache: Z
        :rtype: dict
        """
    return {
        "A": np.maximum(0, Z),
        "activation_cache": {
            "Z": Z
        }
    }


In [5]:
def linear_activation_forward(A_prev: np.ndarray, W: np.ndarray, B: np.ndarray,
                              activation: Callable[[np.ndarray], dict]) -> dict:
    cache = {}
    linear = linear_forward(A_prev, W, B)
    z, linear_cache = linear['Z'], linear['linear_cache']

    active = activation(z)
    a, activation_cache = active['A'], active['activation_cache']

    cache['linear_cache'] = linear_cache
    cache['activation_cache'] = activation_cache

    return {
        "A": a,
        "cache": cache
    }


In [6]:
def L_model_forward(X: np.ndarray, parameters: dict, use_batchnorm: bool = False):
    """

    :param X: matrix of inputs
    :type X: np.ndarray
    :param parameters: a dict like object containing W and b
    :type parameters: dict
    :param use_batchnorm: whether to use batch normalization or not
    :type use_batchnorm: bool
    :return:
        dictionary containing the activation of the ANN represented by the parameters on X and cache actions
    :rtype:
        dict
    """
    cache_list = list()
    a = X

    # Relu layers
    for W_i, b_i in zip(parameters["W"][:-1], parameters["b"][:-1]):
        results = linear_activation_forward(a, W_i, b_i, relu)
        a = results['A']
        if use_batchnorm:
            a = apply_batchnorm(a)

        cache_list.append(results['cache'])

    # Softmax layer
    results = linear_activation_forward(a, parameters["W"][-1], parameters["b"][-1], softmax)
    cache_list.append(results['cache'])
    return results['A'], cache_list



In [7]:
def compute_cost(Al: np.ndarray, Y: np.ndarray):
    """
    Compute loss(cost) using prediction(Al) and true values(Y)
    :param Al:
    :type Al:
    :param Y:
    :type Y:
    :return:
    :rtype:
    """
    return np.sum(Y * np.log(Al)) / -Y.shape[0]

In [8]:
def apply_batchnorm(A: np.ndarray) -> np.ndarray:
    mean = A.mean()
    std = A.std()

    return (A - mean) / np.sqrt(std ** 2 + .0001)  # plus .0001 to avoid zero division

# Part 2

This part contains functions related to the back propagation

In [9]:
def linear_backward(dZ: np.ndarray, cache: dict):
    """
Implements the linear part of the backward propagation process for a single layer
    :param dZ: the gradient of the cost with respect to the linear output of the current laye
    :type dZ: np.ndarray
    :param cache:
    :type cache: dict
    :return:
        tuple of derivatives dA,dW,dB
    :rtype:
    """
    dA = cache["W"].T.dot(dZ)
    dW = dZ.dot(cache['A'].T)
    dB = np.sum(dZ, axis=1, keepdims=True)
    return dA, dW, dB

In [10]:
def linear_activation_backward(dA: np.ndarray, cache: dict, activation):
    """
    Implements the backward propagation for the LINEAR->ACTIVATION layer. The function first computes dZ and then applies the linear_backward function.
    :param dA: post activation gradient of the current layer
    :type dA: np.ndarray
    :param cache: contains both the linear cache and the activations cache
    :type cache: dict
    :param activation: activation backward function
    :type activation: function
    :return:
                tuple of derivatives dA,dW,dB
    :rtype:
    """
    dZ = activation(dA, cache['activation_cache'])
    return linear_backward(dZ, cache['linear_cache'])

In [11]:
def relu_backward(dA: np.ndarray, activation_catch: dict):
    """
    Implements backward propagation for a ReLU unit
    :param dA: the post-activation gradient
    :type dA: np.ndarray
    :param activation_catch: contains Z (stored during the forward propagation)
    :type activation_catch: dict
    :return:
        derivative of Z
    :rtype:
        np.ndarray
    """
    dZ = np.array(dA, copy=True)
    dZ[activation_catch['Z'] <= 0] = 0
    return dZ


def softmax_backward(dA, activation_cache):
    return dA - activation_cache['Y']


In [12]:
def l_model_backward(Al: np.ndarray, Y: np.ndarray, caches: list):
    """
    Implement the backward propagation process for the entire network.
    :param Al: the probabilities vector, the output of the forward propagation
    :type Al: np.ndarray
    :param Y: the true labels vector (the "ground truth" - true classifications)
    :type Y: np.ndarray
    :param caches: contains Z (stored during the forward propagation)
    :type caches: dict
    :return:
    gradient of the cost with respect to Z
    :rtype:
    np.ndarray
    """
    layers = len(caches) - 1
    grads = dict()

    ## compute the gradient on predictions

    # softmax layer update
    layer = caches.pop()
    layer['activation_cache']['Y'] = Y
    grads[f"dA_{layers}"], grads[f"dW_{layers}"], grads[f"dB_{layers}"] = \
            linear_activation_backward(Al, layer, softmax_backward)
    # Layers update
    for i, layer in enumerate(reversed(caches)):
        grads[f"dA_{layers - i - 1}"], grads[f"dW_{layers - i - 1}"], grads[f"dB_{layers - i - 1}"] = \
            linear_activation_backward(grads[f"dA_{layers - i}"], layer, relu_backward)
        # dA = grads[f"dA_{layers - i}"]
    return grads


In [13]:
def update_parameters(parameters: dict, grads: dict, learning_rate: float):
    """
    Updates parameters using gradient descent
    :param parameters: parameters of the ANN
    :type parameters: dict
    :param grads: – a python dictionary containing the gradients (generated by L_model_backward)
    :type grads: dict
    :param learning_rate: the learning rate used to update
    :type learning_rate: float
    :return:
        Updated parameters of the ANN
    :rtype:
        dict
    """
    for index in range(len(parameters["W"])):
        parameters['W'][index] -= learning_rate * grads[f'dW_{index}']
        parameters['b'][index] -= learning_rate * grads[f'dB_{index}']
    return parameters


# Part 3

This part contains the functions of training and testing a model

In [14]:
def L_layer_model(X, Y, layer_dims, learning_rate, num_iterations, batch_size, ckpt=None):
    """
    Train a model for one epoch.

    :param X: The training data
    :param Y: The training data's labels
    :param layer_dims: iterable of integers represents the number of neurons in every layer.
    :param learning_rate: the learning rate of the model
    :param num_iterations: number of iteration to perform during 1 epoch
    :param batch_size: size of 1 individual batch to feed the model at once.
    :param ckpt: optional, pre-trained parameters for further training. If None, initialize new parameters.
    :return: tuple of (parameters, costs). parameters are the trained model, and costs are the loss for every 100 iterations.
    """
    params = ckpt if ckpt is not None else initialize_parameters(layer_dims)
    batch_mask = np.zeros(X.shape[1])
    batch_mask[:batch_size] = 1
    costs = []
    for i in range(1, num_iterations + 1):
        np.random.shuffle(batch_mask)
        x = X[:, batch_mask.astype(bool)]
        al, caches = L_model_forward(x, params)
        y = Y[:, batch_mask.astype(bool)]
        if i % 100 == 0:
            costs.append(compute_cost(al, y))
        grads = l_model_backward(al, y, caches)
        params = update_parameters(params, grads, learning_rate)
    return params, costs


In [15]:
def predict(X, Y, parameters, use_batchnorm: bool=False):
    predicted, _ = L_model_forward(X, parameters, use_batchnorm)
    diff = np.argmax(predicted, axis=0) == np.argmax(Y, axis=0)
    return diff.sum() / len(diff)


# Part 4
Training the model over MNIST dataset.

## Loading the dataset

In [16]:
def _to_matrix(y):
    """
    This function takes the y vector from the MNIST dataset and transform it to 1/0 matrix
    :param y:
    :return:
    """
    return np.eye(10)[y].T

In [21]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

x_train = np.moveaxis(x_train, 0, -1).reshape((784, -1))
x_test = np.moveaxis(x_test, 0, -1).reshape((784, -1))

y_train = _to_matrix(y_train)
y_test = _to_matrix(y_test)

valdition_count = int(0.2 * y_train.shape[1])
val_mask = np.zeros(y_train.shape[1])
val_mask[:valdition_count] = 1
np.random.shuffle(val_mask)
val_mask = val_mask.astype(bool)
x_val, y_val = x_train[:, val_mask], y_train[:, val_mask]
x_train, y_train = x_train[:, ~val_mask], y_train[:, ~val_mask]

print(f"Train set size = {x_train.shape[1]}\nValidation set size = {x_val.shape[1]}\nTest set size = {x_test.shape[1]}")

Train set size = 48000
Validation set size = 12000
Test set size = 10000


## Training the model

In [22]:
learning_rate = .009
layers = [784, 20, 7, 5, 10]
epoch = 0
last_acc = 0  # used for the stopping criterion
batch_size = 256

costs = []

params, c = L_layer_model(x_train, y_train, layers, learning_rate, 100, 4)
print(params)

{'W': [array([[ 0.22798101,  0.52787335,  0.8497621 , ...,  0.39710117,
        -0.6736147 , -0.50238817],
       [-0.10572573,  1.2034748 ,  0.23126104, ...,  0.57854146,
         1.91213453, -0.0121829 ],
       [-1.21930458, -2.06363699, -0.61587093, ...,  0.57028286,
        -0.99211108,  1.42395384],
       ...,
       [-0.52891753, -0.84789733,  1.18593307, ...,  0.25205403,
         0.97201266, -0.13648241],
       [ 1.66975654,  0.36441743,  0.01132962, ..., -0.73811998,
         0.32376731,  0.83642622],
       [ 0.54576875, -0.5449489 , -0.93497351, ..., -1.61328202,
        -1.82176762,  1.43355309]]), array([[-5.08444848e+01,  7.70808931e+04,  7.73047537e+19,
         2.28099438e+00, -1.06234734e+01, -3.81135278e+65,
        -8.71267339e+61, -5.38182281e+01, -3.91629853e+65,
        -4.34364770e+63, -4.43855972e+01, -3.27464360e+01,
        -4.41611113e+43, -5.41663311e+43, -1.67469996e+47,
        -4.78424823e+01,  7.19476323e-01, -2.99832326e+02,
         1.26649694e+01, 