In [23]:
import numpy as np
import h5py
import copy
import matplotlib.pyplot as plt

In [24]:
def load_data():
    train_dataset = h5py.File('train_catvnoncat.h5', "r")
    train_set_x_orig = np.array(train_dataset["train_set_x"][:]) # your train set features
    train_set_y_orig = np.array(train_dataset["train_set_y"][:]) # your train set labels

    test_dataset = h5py.File('test_catvnoncat.h5', "r")
    test_set_x_orig = np.array(test_dataset["test_set_x"][:]) # your test set features
    test_set_y_orig = np.array(test_dataset["test_set_y"][:]) # your test set labels

    classes = np.array(test_dataset["list_classes"][:]) # the list of classes
    
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))
    
    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes

In [25]:
train_x_orig, train_y, test_x_orig, test_y, classes = load_data()

# <a name='4'></a>
## 4 - Two-layer Neural Network

<a name='ex-1'></a>


Use the helper functions you have implemented in the previous assignment to build a 2-layer neural network with the following structure: *LINEAR -> RELU -> LINEAR -> SIGMOID*. The functions and their inputs are:
```python
def initialize_parameters(n_x, n_h, n_y):
    ...
    return parameters 
def linear_activation_forward(A_prev, W, b, activation):
    ...
    return A, cache
def compute_cost(AL, Y):
    ...
    return cost
def linear_activation_backward(dA, cache, activation):
    ...
    return dA_prev, dW, db
def update_parameters(parameters, grads, learning_rate):
    ...
    return parameters
```

In [26]:
def relu(Z): # relu function returns Activation Value and prior Z value
    
    A = np.maximum(0, Z)
    cache = Z
    
    return A, cache

def sigmoid(Z): # sigmoid function returns Activation Value and prior Z value
    
    A = 1 / (1 + np.exp(-Z))
    cache = Z
    
    return A, cache

def relu_backward(dA, cache):
    
    Z = cache
    dZ = np.array(dA, copy=True) # just converting dz to a correct object.
    
    # When z <= 0, you should set dz to 0 as well. 
    dZ[Z <= 0] = 0
    
    assert (dZ.shape == Z.shape)
    
    return dZ

def sigmoid_backward(dA, cache):
    
    Z = cache
    
    s = 1/(1+np.exp(-Z))
    dZ = dA * s * (1-s)
    
    assert (dZ.shape == Z.shape)
    
    return dZ

In [27]:
# for initilizing parameter shapes for W1, W2, b1, b2 
def initialize_parameters(n_x, n_h, n_y):
    
    W1 = np.random.randn(n_h, n_x) * 0.01
    W2 = np.random.randn(n_y, n_h) * 0.01
    b1 = np.random.randn(n_h, 1)
    b2 = np.random.randn(n_y, 1)
    
    parameters = {
        
    'W1': W1,
    'b1': b1,
    'W2': W2,
    'b2': b2
        
    }
        
    return parameters

In [28]:
def linear_forward(A, W, b):
    
    Z = np.dot(W, A) + b
    cache = (A, W, b)
    
    return Z, cache

In [29]:
def linear_activation_forward(A_prev, W, b, activation):
    
    if activation == "relu":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)
        
    elif activation == "sigmoid":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = sigmoid(Z)
        
    cache = (linear_cache, activation_cache)
        
    return A, cache

In [30]:
def compute_cost(AL, Y):
    
    m = Y.shape[1] 
    
    cost = -(1/m) * np.sum(Y * (np.log(AL)).T + (1 - Y) * (np.log(1 - AL)).T)
    cost = np.squeeze(cost)
   
    return cost

# <a name='6-1'></a>
### 6.1 - Linear Backward

For layer $l$, the linear part is: $Z^{[l]} = W^{[l]} A^{[l-1]} + b^{[l]}$ (followed by an activation).

Suppose you have already calculated the derivative $dZ^{[l]} = \frac{\partial \mathcal{L} }{\partial Z^{[l]}}$. You want to get $(dW^{[l]}, db^{[l]}, dA^{[l-1]})$.

The three outputs $(dW^{[l]}, db^{[l]}, dA^{[l-1]})$ are computed using the input $dZ^{[l]}$.

Here are the formulas you need:
$$ dW^{[l]} = \frac{\partial \mathcal{J} }{\partial W^{[l]}} = \frac{1}{m} dZ^{[l]} A^{[l-1] T} \tag{8}$$
$$ db^{[l]} = \frac{\partial \mathcal{J} }{\partial b^{[l]}} = \frac{1}{m} \sum_{i = 1}^{m} dZ^{[l](i)}\tag{9}$$
$$ dA^{[l-1]} = \frac{\partial \mathcal{L} }{\partial A^{[l-1]}} = W^{[l] T} dZ^{[l]} \tag{10}$$


$A^{[l-1] T}$ is the transpose of $A^{[l-1]}$. 

In [31]:
def linear_backward(dZ, cache):
    
    A_prev, W, b = cache
    
    m = A_prev.shape[1]
    
    dW = (1/m)*np.dot(dZ, A_prev.T)
    db = (1/m)*np.sum(dZ, axis = 1, keepdims = True)
    dA_prev = np.dot(W.T, dZ)

    return dW, db, dA_prev

# <a name='6-2'></a>
### 6.2 - Linear-Activation Backward

Next, you will create a function that merges the two helper functions: **`linear_backward`** and the backward step for the activation **`linear_activation_backward`**. 

To help you implement `linear_activation_backward`, two backward functions have been provided:
- **`sigmoid_backward`**: Implements the backward propagation for SIGMOID unit. You can call it as follows:

```python
dZ = sigmoid_backward(dA, activation_cache)
```

- **`relu_backward`**: Implements the backward propagation for RELU unit. You can call it as follows:

```python
dZ = relu_backward(dA, activation_cache)
```

If $g(.)$ is the activation function, 
`sigmoid_backward` and `relu_backward` compute $$dZ^{[l]} = dA^{[l]} * g'(Z^{[l]}). \tag{11}$$  

<a name='ex-8'></a>
### Exercise 8 -  linear_activation_backward

Implement the backpropagation for the *LINEAR->ACTIVATION* layer.

In [32]:
def linear_activation_backward(dA, cache, activation):
    
    linear_cache, activation_cache = cache
    
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
        dW, db, dA_prev = linear_backward(dZ, linear_cache) 
        
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
        dW, db, dA_prev = linear_backward(dZ, linear_cache)
        
    return dA_prev, dW, db

In [33]:
def update_params(params, grads, learning_rate = 0.1):
    
    parameters = copy.deepcopy(params)
    
    L = len(parameters) // 2
    
    for l in range(L):
        parameters["W" + str(l + 1)] -= learning_rate * grads["dW" + str(l + 1)]
        parameters["b" + str(l + 1)] -= learning_rate * grads["db" + str(l + 1)]
    
    return parameters

In [34]:
train_x_orig = (train_x_orig.reshape(train_x_orig.shape[0], -1)).T # flatten to (features, examples m)

m = train_x_orig.shape[0] # number of training examples
n_x = train_x_orig.shape[1] # number of inputs/features
print("The shape of our training set is " + str(train_x_orig.shape))
print("The shape of our testing set is " + str(train_y.shape))

The shape of our training set is (12288, 209)
The shape of our testing set is (1, 209)


# Debugging (Step by Step)

In [67]:
(n_x, n_h, n_y) = (12288, 4, 1)
parameters = initialize_parameters(n_x, n_h, n_y)

# L = len(parameters) // 2
print("Layers: " + str(L))
print("\n")

# check parameters shapes
print("Shapes of Params: ")
for i in range(0, L):
    
    print("W" + str(i + 1) + " " + str(parameters["W" + str(i + 1)].shape))
    print("b" + str(i + 1) + " " + str(parameters["b" + str(i + 1)].shape))

    
print("\n")
    
print("X Shape: " + str(X.shape))
print("Y Shape: " + str(Y.shape))
print("\n")



W1 = parameters["W1"]
W2 = parameters["W2"]
b1 = parameters["b1"]
b2 = parameters["b2"]

X = train_x_orig
Y = train_y

# first pass of forward propagation
A1, cache1 = linear_activation_forward(X, W1, b1, "relu")
print("A1 Shape: " + str(A1.shape))

# second pass of forward propagation
A2, cache2 = linear_activation_forward(A1, W2, b2, "sigmoid")
print("A2 Shape: " + str(A2.shape))
print("\n")

# compute cost
cost = compute_cost(A2, Y)
print("Cost: " + str(cost))
print("\n")


# find dA2 to find dW's and db's
dA2 = - (np.divide(Y, A2) - np.divide(1 - Y, 1 - A2))
print("dA2 Shape: " + str(dA2.shape))
print("\n")

# first pass of back prop
dA1, dW2, db2 = linear_activation_backward(dA2, cache2, "sigmoid")
print("dA1 Shape: " + str(dA1.shape))
print("dW2 Shape: " + str(dW2.shape))
print("db2 Shape: " + str(db2.shape))
print("\n")


# second pass of back prop
dA0, dW1, db1 = linear_activation_backward(dA2, cache2, "sigmoid")
print("dA0 Shape: " + str(dA0.shape))
print("dW1 Shape: " + str(dW1.shape))
print("db1 Shape: " + str(db1.shape))

grads = {
            "dW1" : dW1,
            "dW2" : dW2,
            "db1" : db1,
            "db2" : db2
        }

parameters = update_params(parameters, grads)


Layers: 2


Shapes of Params: 
W1 (4, 12288)
b1 (4, 1)
W2 (1, 4)
b2 (1, 1)


X Shape: (12288, 209)
Y Shape: (1, 209)


A1 Shape: (4, 209)
A2 Shape: (1, 209)


Cost: 271.45042049379026


dA2 Shape: (1, 209)


dA1 Shape: (4, 209)
dW2 Shape: (1, 4)
db2 Shape: (1, 1)


dA0 Shape: (4, 209)
dW1 Shape: (1, 4)
db1 Shape: (1, 1)


ValueError: operands could not be broadcast together with shapes (4,12288) (1,4) (4,12288) 

In [35]:
def model(X, Y, layers_dims, number_of_iterations = 100, learning_rate = 0.01):
    
    cost = ()
    
    (n_x, n_h, n_y) = layers_dims
    
    parameters = initialize_parameters(n_x, n_h, n_y)
    
    for i in range(number_of_iterations):
        
        W1 = parameters["W1"]
        W2 = parameters["W2"]
        b1 = parameters["b1"]
        b2 = parameters["b2"]
    
        # forward propagation (returns (A, cache))
        A1, cache1 = linear_activation_forward(X, W1, b1, "relu")
        A2, cache2 = linear_activation_forward(A1, W2, b2, "sigmoid")
        
        # compute costs
        cost = compute_cost(A2, Y)

        #compute dA2 with respect to cost function, in order to find parameters, dW, db ...
        dA2 = - (np.divide(Y, A2) - np.divide(1 - Y, 1 - A2))
        
        # backward propagation
        dA1, dW2, db2 = linear_activation_backward(dA2, cache2, "sigmoid")
        dA0, dW1, db1 = linear_activation_backward(dA2, cache2, "relu")
        
        grads = {
            "dW1" : dW1,
            "dW2" : dW2,
            "db1" : db1,
            "db2" : db2
        }
        
        # update parameters
        parameters = update_params(parameters, grads)
        
        print(cost)
        
    

In [36]:
model(train_x_orig, train_y, (12288, 4, 1))

ValueError: operands could not be broadcast together with shapes (4,12288) (1,4) (4,12288) 