In [5]:
import numpy as np
import matplotlib.pyplot as plt
import math
import os
import random
import cv2
from dataset_initialization import train_x_set,train_y_set,test_x_set,test_y_set

Shape of Training-X set before flatten:  (3500, 64, 64, 3)
Shape of Testing-X set before flatten:  (500, 64, 64, 3)
Shape of Training-y set:  (1, 3500)
Shape of Testing-y set:  (1, 500)
Shape of Training-X set after flatten:  (12288, 3500)
Shape of Testing-X set after flatten:  (12288, 500)


## Initializing parameters

parameters 'W' and 'B' initialized for each layer. 

The function `initialize_params()` takes a list `layer_dims` that contains the number of neurons in each layer.

i.e. layer_dims = [64,16,8,1]

layer 0 (input layer) has 64 neurons, layer 1 has 16, layer 2 has 8, and layer 3 (output layer) has 1.

In [None]:
def initialize_params(layer_dims):
    """
    Build the weight / bias dictionary for a fully connected network.

    Arguments:
    layer_dims -- list with the number of units in each layer, input layer first

    Returns:
    parameters -- dict holding, for every layer i >= 1,
                  'W<i>': standard-normal samples of shape (layer_dims[i], layer_dims[i-1])
                          scaled by sqrt(2 / layer_dims[i-1]),
                  'B<i>': zeros of shape (layer_dims[i], 1)
    """
    np.random.seed(3)  # fixed seed: every call produces the same initial weights

    parameters = {}
    consecutive_sizes = zip(layer_dims[:-1], layer_dims[1:])

    for index, (fan_in, fan_out) in enumerate(consecutive_sizes, start=1):
        scale = math.sqrt(2. / fan_in)
        parameters['W%d' % index] = np.random.randn(fan_out, fan_in) * scale
        parameters['B%d' % index] = np.zeros((fan_out, 1))

    return parameters

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [None]:
def relu(x):
    """Rectified linear unit: element-wise max(0, x)."""
    floor = 0
    return np.maximum(floor, x)

In [None]:
def tanh(x):
    """Hyperbolic tangent, applied element-wise (thin wrapper around numpy)."""
    squashed = np.tanh(x)
    return squashed

## Forward propagation

The linear forward module (vectorized over all the examples) computes the following equations:

$$Z^{[l]} = W^{[l]}A^{[l-1]} +b^{[l]}$$

where $A^{[0]} = X$ (the input layer).

`forward()` returns the output activation together with a cache dictionary that contains 'Z' and 'A' for each layer.


In [None]:
def forward(X,parameters):
    """
    Run forward propagation through every layer of the network.

    The activation used by layer i is read from the module-level
    `layer_info[i-1][1]` ('relu', 'sigmoid' or 'tanh').

    Arguments:
    X -- input data; used as A0 in Z1 = W1 . A0 + B1
    parameters -- dict with 'W<i>' and 'B<i>' for every layer i = 1..L

    Returns:
    A -- activation of the last layer
    cache -- dict with 'Z<i>' (pre-activation) and 'A<i>' (activation) of every layer

    Raises:
    ValueError -- if a layer names an activation that is not supported
    """
    activations = {
        'relu': lambda Z: np.maximum(0, Z),
        'sigmoid': lambda Z: 1 / (1 + np.exp(-Z)),
        'tanh': np.tanh,
    }

    A = X
    cache = {}
    L = len(parameters) // 2    # parameters holds one W and one B per layer

    for i in range(1,L+1):
        activation = layer_info[i-1][1]
        if activation not in activations:
            # Previously an unknown name fell through every branch and the
            # previous layer's activation was silently reused as this layer's output.
            raise ValueError(
                "Layer {}: unsupported activation {!r}; expected one of {}".format(
                    i, activation, sorted(activations)))

        Z = np.dot(parameters['W'+str(i)],A) + parameters['B'+str(i)]
        A = activations[activation](Z)

        cache['Z'+str(i)] = Z
        cache['A'+str(i)] = A

    return  A, cache
    

##  Backward Propagation

For layer $l$, the linear part is: $Z^{[l]} = W^{[l]} A^{[l-1]} + b^{[l]}$ (followed by an activation).

Suppose you have already calculated the derivative $dZ^{[l]} = \frac{\partial \mathcal{L} }{\partial Z^{[l]}}$. You want to get $(dW^{[l]}, db^{[l]}, dA^{[l-1]})$.

<img src="images/linearback_kiank.png" style="width:250px;height:300px;">
<caption><center> **Figure 4** </center></caption>

The three outputs $(dW^{[l]}, db^{[l]}, dA^{[l-1]})$ are computed using the input $dZ^{[l]}$. Here are the formulas you need:
$$ dW^{[l]} = \frac{\partial \mathcal{L} }{\partial W^{[l]}} = \frac{1}{m} dZ^{[l]} A^{[l-1] T} $$
$$ db^{[l]} = \frac{\partial \mathcal{L} }{\partial b^{[l]}} = \frac{1}{m} \sum_{i = 1}^{m} dZ^{[l](i)}$$
$$ dA^{[l-1]} = \frac{\partial \mathcal{L} }{\partial A^{[l-1]}} = W^{[l] T} dZ^{[l]} $$


In [None]:
def backward(AL, Y, X, caches, parameters):
    """
    Back-propagate the cross-entropy loss through every layer.

    The activation of layer i is read from the module-level `layer_info[i-1][1]`.

    Arguments:
    AL -- activation of the output layer (result of forward propagation)
    Y -- true labels; reshaped to AL's shape
    X -- input data, used as A0 when computing dW1
    caches -- dict with 'Z<i>' and 'A<i>' for every layer, as returned by forward()
    parameters -- dict with 'W<i>' and 'B<i>' for every layer

    Returns:
    grads -- dict with 'dW<i>' (same shape as W<i>) and 'dB<i>' (shape (units, 1))

    Raises:
    ValueError -- if a layer names an activation that is not supported
    """
    grads = {}
    L = len(caches) // 2        # caches holds one Z and one A per layer
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)

    # Keep AL strictly inside (0, 1): a saturated sigmoid returns exactly 0.0 or 1.0,
    # which turns the expression below into 0/0 and poisons every gradient with NaN.
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)

    # Derivative of the cross-entropy loss with respect to the last activation.
    dA = -(Y/AL_safe - (1-Y)/(1-AL_safe))

    # One uniform loop, output layer first; layer 1 uses X as its input activation.
    for i in range(L,0,-1):
        activation = layer_info[i-1][1]
        Z = caches['Z'+str(i)]

        if activation == 'relu':
            dZ = np.array(dA, copy=True)
            dZ[Z <= 0] = 0
        elif activation == 'sigmoid':
            s = 1 / (1 + np.exp(-Z))
            dZ = dA * s * (1-s)
        elif activation == 'tanh':
            # dA already equals W[i+1].T . dZ[i+1]; using it directly also makes
            # tanh work when it is the last (or only) layer.
            dZ = dA * (1 - np.power(caches['A'+str(i)],2))
        else:
            raise ValueError(
                "Layer {}: unsupported activation {!r}".format(i, activation))

        A_prev = X if i == 1 else caches['A'+str(i-1)]

        grads['dW'+str(i)] = np.dot(dZ,A_prev.T) / m
        # Sum over the examples only (axis 1): one bias gradient per neuron,
        # not a single scalar shared by the whole layer.
        grads['dB'+str(i)] = np.sum(dZ, axis=1, keepdims=True) / m

        if i > 1:
            dA = np.dot(parameters['W'+str(i)].T,dZ)

    return grads

### Update Parameters

In this section you will update the parameters of the model, using gradient descent: 

$$ W^{[l]} = W^{[l]} - \alpha \text{ } dW^{[l]} $$
$$ b^{[l]} = b^{[l]} - \alpha \text{ } db^{[l]} $$

where $\alpha$ is the learning rate. After computing the updated parameters, store them in the parameters dictionary. 

In [None]:
def update_params(parameters, grads, learning_rate):
    """
    Apply one gradient-descent step to every weight and bias.

    Arguments:
    parameters -- dict with 'W<i>' and 'B<i>' for every layer; updated in place
    grads -- dict with the matching 'dW<i>' and 'dB<i>' gradients
    learning_rate -- step size multiplying each gradient

    Returns:
    parameters -- the same dictionary object, after the update
    """
    layer_count = len(parameters) // 2

    for layer in range(1, layer_count + 1):
        for prefix in ('W', 'B'):
            key = prefix + str(layer)
            parameters[key] -= learning_rate * grads['d' + key]

    return parameters

In [None]:
def predict(X,parameters):
    """
    Classify every example in X with the current parameters.

    Runs forward() and thresholds the output activation at 0.5.

    Returns:
    predictions -- float array shaped like the output activation: 1.0 where it is > 0.5, else 0.0
    """
    output_activation, _ = forward(X,parameters)
    is_positive = output_activation > 0.5

    return is_positive.astype(float)

## Cost function

Now you will implement forward and backward propagation. You need to compute the cost, because you want to check if your model is actually learning.

Compute the cross-entropy cost $J$, using the following formula: $$-\frac{1}{m} \sum\limits_{i = 1}^{m} (y^{(i)}\log\left(a^{[L] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right)) $$


In [None]:
def compute_cost(AL, Y):
    """
    Cross-entropy cost averaged over all examples.

    Arguments:
    AL -- activation of the output layer, shape (1, number of examples)
    Y -- true labels, shape (1, number of examples)

    Returns:
    cost -- 0-d array holding the cross-entropy cost
    """
    m = Y.shape[1]

    # A saturated sigmoid yields exactly 0.0 or 1.0; log(0) = -inf and 0 * -inf = NaN,
    # so even a perfectly classified example would turn the whole cost into NaN.
    eps = 1e-15
    AL = np.clip(AL, eps, 1 - eps)

    cost = - (np.dot(Y, np.log(AL).T) + np.dot((1 - Y), np.log(1 - AL).T)) / m

    cost = np.squeeze(cost)      # To make sure your cost's shape is what we expect (e.g. this turns [[17]] into 17).

    return cost

### Initialization of layer info

1. **"Enter number of layer including output layer : "** i.e. **3** (don't include the input layer)
2. **"Layer 1 -> Enter no. of neurons and activation function : "** i.e. **16 relu**
3. **"Layer 2 -> Enter no. of neurons and activation function : "** i.e. **8 relu**
4. **"Layer 3 -> Enter no. of neurons and activation function : "** i.e. **1 sigmoid**

In [None]:
def layer_initialization(input_size=None):
    """
    Ask the user for the network layout on standard input.

    Arguments:
    input_size -- number of input features (size of layer 0). When omitted, it is
                  read from the module-level array `X` as `X.shape[0]`, which is
                  what this function always did before the parameter existed.

    Returns:
    layer_info -- list of [neurons, activation] pairs, one per layer (input layer excluded)
    layer_dims -- list of layer sizes, starting with the input size

    Raises:
    ValueError -- if an answer is not "<neurons> <activation>" or names an
                  activation other than relu / sigmoid / tanh
    """
    supported = ('relu', 'sigmoid', 'tanh')

    layers = int(input("Enter number of layer including output layer : "))
    layer_info = []
    for i in range(layers):
        # split() without an argument tolerates repeated / leading / trailing blanks,
        # which split(' ') turned into empty fields and an unpacking error.
        answer = input("Layer "+str(i+1)+" -> Enter no. of neurons and activation function : ").split()
        if len(answer) != 2:
            raise ValueError(
                "Layer {}: expected '<neurons> <activation>', got {!r}".format(i+1, ' '.join(answer)))
        neurons, activation = answer

        # The propagation code compares against lower-case names, so normalise here
        # and reject anything it would not recognise.
        activation = activation.lower()
        if activation not in supported:
            raise ValueError(
                "Layer {}: unsupported activation {!r}; choose one of {}".format(
                    i+1, activation, ', '.join(supported)))

        layer_info.append([int(neurons),activation])

    if input_size is None:
        input_size = X.shape[0]     # module-level X, see docstring

    layer_dims = [input_size] + [neurons for neurons, _ in layer_info]

    return layer_info , layer_dims

In [None]:
# Module-level description of the network. The training functions declare both
# names `global` and overwrite them with the result of layer_initialization().
layer_info = layer_dims = None

In [None]:
def model(X, Y, no_of_epoch, learning_rate=0.075):
    """
    Train the network with plain gradient descent.

    Prompts for the layer layout (layer_initialization()), stores it in the
    module-level `layer_info` / `layer_dims`, then runs `no_of_epoch` iterations of
    forward propagation, cost, backward propagation and parameter update.
    Every 100 iterations the cost and the accuracy on the module-level
    `train_x_set` / `test_x_set` are printed.

    Arguments:
    X -- training inputs passed to forward() / backward()
    Y -- training labels
    no_of_epoch -- number of gradient-descent iterations
    learning_rate -- step size for update_params()

    Returns:
    parameters -- dict with the trained 'W<i>' / 'B<i>' (previously the trained
                  weights were discarded and the function returned None)
    """
    global layer_info , layer_dims
    layer_info , layer_dims = layer_initialization()
    parameters = initialize_params(layer_dims)

    for i in range(no_of_epoch):
        AL, cache = forward(X,parameters)
        cost = compute_cost(AL,Y)
        grads = backward(AL,Y,X,cache,parameters)
        parameters = update_params(parameters,grads,learning_rate)

        if i%100 == 0:
            # Accuracy is measured with the freshly updated parameters; `cost` is
            # the value computed before this iteration's update.
            Y_predict_test = predict(test_x_set,parameters)
            Y_predict_train = predict(train_x_set,parameters)
            print("Cost after ",str(i)," iteration : ",cost,end='')
            print("\ttrain accuracy: {} %".format(100 - np.mean(np.abs(Y_predict_train - train_y_set)) * 100),end='')
            print("\ttest accuracy: {} %".format(100 - np.mean(np.abs(Y_predict_test - test_y_set)) * 100))

    return parameters

In [None]:
# `X` and `Y` are not assigned in any visible cell of this notebook, so on a fresh
# kernel this call (and layer_initialization(), which reads the global `X`) would
# stop with a NameError. Bind them to the imported training split, the same arrays
# model() reports its training accuracy on.
# NOTE(review): assumes train_x_set / train_y_set are the intended training data - confirm.
X, Y = train_x_set, train_y_set

model(X, Y, no_of_epoch=2000, learning_rate=0.075)

## 2- L2 Regularization

The standard way to avoid overfitting is called **L2 regularization**. It consists of appropriately modifying your cost function, from:
$$J = -\frac{1}{m} \sum\limits_{i = 1}^{m} \large{(}\small  y^{(i)}\log\left(a^{[L](i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right) \large{)} \tag{1}$$
To:
$$J_{regularized} = \small \underbrace{-\frac{1}{m} \sum\limits_{i = 1}^{m} \large{(}\small y^{(i)}\log\left(a^{[L](i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right) \large{)} }_\text{cross-entropy cost} + \underbrace{\frac{1}{m} \frac{\lambda}{2} \sum\limits_l\sum\limits_k\sum\limits_j W_{k,j}^{[l]2} }_\text{L2 regularization cost} \tag{2}$$

Let's modify your cost and observe the consequences.

Implement `compute_cost_with_regularization()` which computes the cost given by formula (2). To calculate $\sum\limits_k\sum\limits_j W_{k,j}^{[l]2}$  , use :
```python
np.sum(np.square(Wl))
```
Note that you have to do this for $W^{[1]}$, $W^{[2]}$ and $W^{[3]}$, then sum the three terms and multiply by $ \frac{1}{m} \frac{\lambda}{2} $.

In [None]:
def compute_cost_with_regularization(AL, Y, parameters, lambd):
    """
    Implement the cost function with L2 regularization. See formula (2) above.

    Arguments:
    AL -- post-activation, output of forward propagation, of shape (output size, number of examples)
    Y -- "true" labels vector, of shape (output size, number of examples)
    parameters -- python dictionary containing parameters of the model ('W<i>' / 'B<i>' per layer)
    lambd -- regularization strength (lambda in formula (2))

    Returns:
    cost - value of the regularized loss function (formula (2))
    """

    m = Y.shape[1]
    L = len(parameters) // 2    # parameters holds one W and one B per layer

    # Sum of squared weights over all layers; biases are not regularized.
    total = sum(np.sum(np.square(parameters['W'+str(i)])) for i in range(1,L+1))
    regularization_cost = total * lambd / (2*m)

    # A saturated sigmoid yields exactly 0.0 or 1.0; log(0) = -inf and 0 * -inf = NaN,
    # so keep AL strictly inside (0, 1) before taking logarithms.
    eps = 1e-15
    AL = np.clip(AL, eps, 1 - eps)

    cost = - (np.dot(Y, np.log(AL).T) + np.dot((1 - Y), np.log(1 - AL).T)) / m
    cost = np.squeeze(cost)

    return cost + regularization_cost
    

Because you changed the cost, you have to change backward propagation as well! All the gradients have to be computed with respect to this new cost. 

Implement the changes needed in backward propagation to take into account regularization. The changes only concern dW1, dW2,.... For each, you have to add the regularization term's gradient ($\frac{d}{dW} ( \frac{1}{2}\frac{\lambda}{m}  W^2) = \frac{\lambda}{m} W$).

In [None]:
def backward_with_regularization(AL,Y,X,caches,parameters,lambd):
    """
    Back-propagate the L2-regularized cross-entropy loss through every layer.

    The activation of layer i is read from the module-level `layer_info[i-1][1]`.

    Arguments:
    AL -- activation of the output layer (result of forward propagation)
    Y -- true labels; reshaped to AL's shape
    X -- input data, used as A0 when computing dW1
    caches -- dict with 'Z<i>' and 'A<i>' for every layer, as returned by forward()
    parameters -- dict with 'W<i>' and 'B<i>' for every layer
    lambd -- regularization strength; adds (lambd / m) * W<i> to every dW<i>

    Returns:
    grads -- dict with 'dW<i>' (same shape as W<i>) and 'dB<i>' (shape (units, 1))

    Raises:
    ValueError -- if a layer names an activation that is not supported
    """
    grads = {}
    L = len(caches) // 2        # caches holds one Z and one A per layer
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)

    # Keep AL strictly inside (0, 1): a saturated sigmoid returns exactly 0.0 or 1.0,
    # which turns the expression below into 0/0 and poisons every gradient with NaN.
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)

    # Derivative of the cross-entropy loss with respect to the last activation.
    dA = -(Y/AL_safe - (1-Y)/(1-AL_safe))

    # One uniform loop, output layer first; layer 1 uses X as its input activation.
    for i in range(L,0,-1):
        activation = layer_info[i-1][1]
        Z = caches['Z'+str(i)]
        W = parameters['W'+str(i)]

        if activation == 'relu':
            dZ = np.array(dA, copy=True)
            dZ[Z <= 0] = 0
        elif activation == 'sigmoid':
            s = 1 / (1 + np.exp(-Z))
            dZ = dA * s * (1-s)
        elif activation == 'tanh':
            # dA already equals W[i+1].T . dZ[i+1]; using it directly also makes
            # tanh work when it is the last (or only) layer.
            dZ = dA * (1 - np.power(caches['A'+str(i)],2))
        else:
            raise ValueError(
                "Layer {}: unsupported activation {!r}".format(i, activation))

        A_prev = X if i == 1 else caches['A'+str(i-1)]

        # The second term is the gradient of the L2 penalty (lambd / 2m) * sum(W^2).
        grads['dW'+str(i)] = np.dot(dZ,A_prev.T) / m + (lambd/m * W)
        # Sum over the examples only (axis 1): one bias gradient per neuron,
        # not a single scalar shared by the whole layer.
        grads['dB'+str(i)] = np.sum(dZ, axis=1, keepdims=True) / m

        if i > 1:
            dA = np.dot(W.T,dZ)

    return grads

In [None]:
def L2_regularization_model(X, Y, no_of_epoch, learning_rate=0.075,lambd=0.5):
    """
    Train the network with gradient descent and L2 regularization.

    Prompts for the layer layout (layer_initialization()), stores it in the
    module-level `layer_info` / `layer_dims`, then runs `no_of_epoch` iterations
    using the regularized cost and the regularized backward pass.
    Every 100 iterations the cost and the accuracy on the module-level
    `train_x_set` / `test_x_set` are printed.

    Arguments:
    X -- training inputs
    Y -- training labels
    no_of_epoch -- number of gradient-descent iterations
    learning_rate -- step size for update_params()
    lambd -- L2 regularization strength

    Returns:
    parameters -- dict with the trained 'W<i>' / 'B<i>' (previously the trained
                  weights were discarded and the function returned None)
    """
    global layer_info , layer_dims
    layer_info , layer_dims = layer_initialization()
    parameters = initialize_params(layer_dims)

    for i in range(no_of_epoch):
        AL, cache = forward(X,parameters)
        cost = compute_cost_with_regularization(AL,Y,parameters,lambd)
        grads = backward_with_regularization(AL,Y,X,cache,parameters,lambd)
        parameters = update_params(parameters,grads,learning_rate)

        if i%100 == 0:
            # Accuracy is measured with the freshly updated parameters; `cost` is
            # the value computed before this iteration's update.
            Y_predict_test = predict(test_x_set,parameters)
            Y_predict_train = predict(train_x_set,parameters)
            print("Cost after ",str(i)," iteration : ",cost,end='')
            print("\ttrain accuracy: {} %".format(100 - np.mean(np.abs(Y_predict_train - train_y_set)) * 100),end='')
            print("\ttest accuracy: {} %".format(100 - np.mean(np.abs(Y_predict_test - test_y_set)) * 100))

    return parameters

In [None]:
# Train with L2 regularization (lambd=0.5). The call prompts for the layer layout via input().
# `X` / `Y` must already be bound at module level: layer_initialization() reads the global `X`.
L2_regularization_model(X, Y, no_of_epoch=2000, learning_rate=0.075,lambd=0.5)

**Observations**:
- The value of $\lambda$ is a hyperparameter that you can tune using a dev set.
- L2 regularization makes your decision boundary smoother. If $\lambda$ is too large, it is also possible to "oversmooth", resulting in a model with high bias.

**What is L2-regularization actually doing?**:

L2-regularization relies on the assumption that a model with small weights is simpler than a model with large weights. Thus, by penalizing the square values of the weights in the cost function you drive all the weights to smaller values. It becomes too costly for the cost to have large weights! This leads to a smoother model in which the output changes more slowly as the input changes.

## 3 - Dropout

**dropout** is a widely used regularization technique that is specific to deep learning. 
**It randomly shuts down some neurons in each iteration.** Watch these two videos to see what this means!


<center>
<video width="620" height="440" src="images/dropout1_kiank.mp4" type="video/mp4" controls>
</video>
</center>
<br>
<caption><center> <u> Figure 2 </u>: Drop-out on the second hidden layer. <br> At each iteration, you shut down (= set to zero) each neuron of a layer with probability $1 - keep\_prob$ or keep it with probability $keep\_prob$ (50% here). The dropped neurons don't contribute to the training in both the forward and backward propagations of the iteration. </center></caption>

<center>
<video width="620" height="440" src="images/dropout2_kiank.mp4" type="video/mp4" controls>
</video>
</center>

<caption><center> <u> Figure 3 </u>: Drop-out on the first and third hidden layers. <br> $1^{st}$ layer: we shut down on average 40% of the neurons.  $3^{rd}$ layer: we shut down on average 20% of the neurons. </center></caption>


When you shut some neurons down, you actually modify your model. The idea behind drop-out is that at each iteration, you train a different model that uses only a subset of your neurons. With dropout, your neurons thus become less sensitive to the activation of one other specific neuron, because that other neuron might be shut down at any time. 

### Forward propagation with dropout

Implement the forward propagation with dropout. You are using a L layer neural network, and will add dropout to the layers with relu activation. We will not apply dropout to the input layer or output layer. 

**Instructions**:
You would like to shut down some neurons in the layers with relu activation. To do that, you are going to carry out 4 Steps:

1. creating a variable $d^{[1]}$ with the same shape as $a^{[1]}$ using `np.random.rand()` to randomly get numbers between 0 and 1. Here, you will use a vectorized implementation, so create a random matrix $D^{[1]} = [d^{[1](1)} d^{[1](2)} ... d^{[1](m)}] $ of the same dimension as $A^{[1]}$.

2. Set each entry of $D^{[1]}$ to be 0 with probability (`1-keep_prob`) or 1 with probability (`keep_prob`), by thresholding values in $D^{[1]}$ appropriately. Hint: to set all the entries of a matrix X to 1 (if the entry is less than 0.5) or 0 (otherwise) you would do: `X = (X < 0.5)`. Note that 0 and 1 are respectively equivalent to False and True.

3. Set $A^{[1]}$ to $A^{[1]} * D^{[1]}$. (You are shutting down some neurons). You can think of $D^{[1]}$ as a mask, so that when it is multiplied with another matrix, it shuts down some of the values.

4. Divide $A^{[1]}$ by `keep_prob`. By doing this you are assuring that the result of the cost will still have the same expected value as without drop-out. (This technique is also called inverted dropout.)

In [None]:
def forward_propagation_with_dropout(X, parameters, keep_prob = 0.5):
    """
    Forward propagation with inverted dropout on the hidden relu layers.

    The activation of layer i is read from the module-level `layer_info[i-1][1]`.
    Dropout is never applied to the input or to the output layer.

    Arguments:
    X -- input data; used as A0
    parameters -- dict with 'W<i>' and 'B<i>' for every layer
    keep_prob -- probability of keeping a unit active, in (0, 1]

    Returns:
    A -- activation of the last layer
    cache -- dict with 'Z<i>' and 'A<i>' of every layer ('A<i>' is stored after dropout)
    drop -- dict with the 0/1 mask 'D<i>' of every layer that had dropout applied

    Raises:
    ValueError -- if keep_prob is outside (0, 1] or an activation is not supported
    """
    if not 0 < keep_prob <= 1:
        # keep_prob == 0 would divide by zero below; values above 1 are not probabilities.
        raise ValueError("keep_prob must be in (0, 1], got {!r}".format(keep_prob))

    A = X
    cache = {}
    drop =  {}
    L = len(parameters) // 2    # parameters holds one W and one B per layer

    for i in range(1,L+1):
        activation = layer_info[i-1][1]
        Z = np.dot(parameters['W'+str(i)],A) + parameters['B'+str(i)]

        if activation == 'relu':
            A = np.maximum(0,Z)
            if i < L:
                # Hidden layers only: dropping units of the output layer would
                # randomly zero the network's predictions.
                D = np.random.rand(Z.shape[0],Z.shape[1])     # Step 1: uniform random matrix shaped like A
                D = (D < keep_prob) * 1     # Step 2: 1 with probability keep_prob, else 0
                drop['D'+str(i)] = D
                A = A * D                   # Step 3: shut down the masked units
                A = A / keep_prob           # Step 4: rescale so the expected activation is unchanged
        elif activation == 'sigmoid':
            A = 1 / ( 1 + np.exp(-Z) )
        elif activation == 'tanh':
            A = np.tanh(Z)
        else:
            # Previously an unknown name fell through and the previous layer's
            # activation was silently reused as this layer's output.
            raise ValueError(
                "Layer {}: unsupported activation {!r}".format(i, activation))

        cache['Z'+str(i)] = Z
        cache['A'+str(i)] = A

    return  A, cache, drop

### 3.2 - Backward propagation with dropout

Implement the backward propagation with dropout. As before, you are training a L layer network. Add dropout to the layers with relu activation, using the masks $D^{[l]}$ that forward propagation stored in the `drop` dictionary. 

**Instruction**:
Backpropagation with dropout is actually quite easy. You will have to carry out 2 Steps:
1. You had previously shut down some neurons during forward propagation, by applying a mask $D^{[1]}$ to `A1`. In backpropagation, you will have to shut down the same neurons, by reapplying the same mask $D^{[1]}$ to `dA1`. 
2. During forward propagation, you had divided `A1` by `keep_prob`. In backpropagation, you'll therefore have to divide `dA1` by `keep_prob` again (the calculus interpretation is that if $A^{[1]}$ is scaled by `keep_prob`, then its derivative $dA^{[1]}$ is also scaled by the same `keep_prob`).


In [None]:
def backward_propagation_with_dropout(AL,Y,X, caches,parameters,drop, keep_prob=0.5):
    """
    Back-propagate the cross-entropy loss through a network trained with dropout.

    The activation of layer i is read from the module-level `layer_info[i-1][1]`.

    Arguments:
    AL -- activation of the output layer (result of forward propagation)
    Y -- true labels; reshaped to AL's shape
    X -- input data, used as A0 when computing dW1
    caches -- dict with 'Z<i>' and 'A<i>' for every layer
    parameters -- dict with 'W<i>' and 'B<i>' for every layer
    drop -- dict with the dropout mask 'D<i>' of every layer that was dropped
            during forward propagation; layers without an entry are left untouched
    keep_prob -- keep probability that was used during forward propagation

    Returns:
    grads -- dict with 'dW<i>' (same shape as W<i>) and 'dB<i>' (shape (units, 1))

    Raises:
    ValueError -- if a layer names an activation that is not supported
    """
    grads = {}
    L = len(caches) // 2        # caches holds one Z and one A per layer
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)

    # Keep AL strictly inside (0, 1): a saturated sigmoid returns exactly 0.0 or 1.0,
    # which turns the expression below into 0/0 and poisons every gradient with NaN.
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)

    # Derivative of the cross-entropy loss with respect to the last activation.
    dA = -(Y/AL_safe - (1-Y)/(1-AL_safe))

    # One uniform loop, output layer first; layer 1 uses X as its input activation.
    for i in range(L,0,-1):
        activation = layer_info[i-1][1]
        Z = caches['Z'+str(i)]

        if activation == 'relu':
            dZ = np.array(dA, copy=True)
            dZ[Z <= 0] = 0
        elif activation == 'sigmoid':
            s = 1 / (1 + np.exp(-Z))
            dZ = dA * s * (1-s)
        elif activation == 'tanh':
            # Use the incoming dA (already masked where needed) instead of
            # recomputing it from W[i+1] and a stale dZ.
            dZ = dA * (1 - np.power(caches['A'+str(i)],2))
        else:
            raise ValueError(
                "Layer {}: unsupported activation {!r}".format(i, activation))

        A_prev = X if i == 1 else caches['A'+str(i-1)]

        grads['dW'+str(i)] = np.dot(dZ,A_prev.T) / m
        # Sum over the examples only (axis 1): one bias gradient per neuron,
        # not a single scalar shared by the whole layer.
        grads['dB'+str(i)] = np.sum(dZ, axis=1, keepdims=True) / m

        if i > 1:
            dA = np.dot(parameters['W'+str(i)].T,dZ)

            # Only layers that were actually dropped have a mask. Looking it up
            # unconditionally raised KeyError whenever layer i-1 was not relu, and
            # the old tanh branch skipped the mask of a preceding relu layer.
            mask = drop.get('D'+str(i-1))
            if mask is not None:
                dA = dA * mask          # Step 1: shut down the same units as in the forward pass
                dA = dA / keep_prob     # Step 2: same rescaling as in the forward pass

    return grads

In [None]:
def dropout_model(X, Y, no_of_epoch, learning_rate=0.075,keep_prob=0.5):
    """
    Train the network with gradient descent and dropout.

    Prompts for the layer layout (layer_initialization()), stores it in the
    module-level `layer_info` / `layer_dims`, then runs `no_of_epoch` iterations
    using the dropout forward / backward passes. Every 100 iterations the cost and
    the accuracy on the module-level `train_x_set` / `test_x_set` are printed;
    predict() runs the plain forward pass, so no units are dropped at evaluation time.

    Arguments:
    X -- training inputs
    Y -- training labels
    no_of_epoch -- number of gradient-descent iterations
    learning_rate -- step size for update_params()
    keep_prob -- probability of keeping a unit active during training

    Returns:
    parameters -- dict with the trained 'W<i>' / 'B<i>' (previously the trained
                  weights were discarded and the function returned None)
    """
    global layer_info , layer_dims
    layer_info , layer_dims = layer_initialization()
    parameters = initialize_params(layer_dims)

    for i in range(no_of_epoch):
        AL, cache, drop = forward_propagation_with_dropout(X,parameters,keep_prob)
        cost = compute_cost(AL,Y)
        grads = backward_propagation_with_dropout(AL,Y,X,cache,parameters,drop, keep_prob)
        parameters = update_params(parameters,grads,learning_rate)

        if i%100 == 0:
            # Accuracy is measured with the freshly updated parameters; `cost` is
            # the value computed before this iteration's update.
            Y_predict_test = predict(test_x_set,parameters)
            Y_predict_train = predict(train_x_set,parameters)
            print("Cost after ",str(i)," iteration : ",cost,end='')
            print("\ttrain accuracy: {} %".format(100 - np.mean(np.abs(Y_predict_train - train_y_set)) * 100),end='')
            print("\ttest accuracy: {} %".format(100 - np.mean(np.abs(Y_predict_test - test_y_set)) * 100))

    return parameters

In [None]:
# Train with dropout (keep_prob=0.5). The call prompts for the layer layout via input().
# `X` / `Y` must already be bound at module level: layer_initialization() reads the global `X`.
dropout_model(X, Y, no_of_epoch=2000, learning_rate=0.075,keep_prob=0.5)

**Note**:
- A **common mistake** when using dropout is to use it both in training and testing. You should use dropout (randomly eliminate nodes) only in training. 