# Shallow NN

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
import sklearn.linear_model

#np.random.seed(42)

ModuleNotFoundError: No module named 'matplotlib'

<img src="images/nn.png" style="width:600px;height:300px;">

**Mathematically**:

For one example $x^{(i)}$:
$$z^{[1] (i)} =  W^{[1]} x^{(i)} + b^{[1]}\tag{1}$$ 
$$a^{[1] (i)} = \tanh(z^{[1] (i)})\tag{2}$$
$$z^{[2] (i)} = W^{[2]} a^{[1] (i)} + b^{[2]}\tag{3}$$
$$\hat{y}^{(i)} = a^{[2] (i)} = \sigma(z^{ [2] (i)})\tag{4}$$
$$y^{(i)}_{prediction} = \begin{cases} 1 & \mbox{if } a^{[2](i)} > 0.5 \\ 0 & \mbox{otherwise } \end{cases}\tag{5}$$

Given the predictions on all the examples, you can also compute the cost $J$ as follows: 
$$J = - \frac{1}{m} \sum\limits_{i = 1}^{m} \large\left(\small y^{(i)}\log\left(a^{[2] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[2] (i)}\right)  \large  \right) \small \tag{6}$$

**Reminder**: The general methodology to build a Neural Network is to:
    1. Define the neural network structure ( # of input units,  # of hidden units, etc). 
    2. Initialize the model's parameters
    3. Loop:
        - Implement forward propagation
        - Compute loss
        - Implement backward propagation to get the gradients
        - Update parameters (gradient descent)

You often build helper functions to compute steps 1-3 and then merge them into one function, called `shallo_nn_model()` in this notebook. Once you've built `shallo_nn_model()` and learnt the right parameters, you can make predictions on new data.

In [None]:
def shallo_nn_model(X, Y, n_h, num_iterations=10000, print_cost=False):
    """Train a one-hidden-layer neural network with batch gradient descent.

    Arguments:
    X -- input dataset of shape (input size, number of examples)
    Y -- labels of shape (output size, number of examples)
    n_h -- size of the hidden layer (hyperparameter)
    num_iterations -- number of gradient descent steps to run
    print_cost -- if True, print the cost every 1000 iterations

    Returns:
    parameters -- python dictionary containing the learnt W1, b1, W2, b2
    """
    np.random.seed(3)

    # Layer sizes come straight from the data: one input unit per row of X
    # (feature) and one output unit per row of Y.
    n_x = X.shape[0]
    n_y = Y.shape[0]

    # Initialize the weights
    parameters = initialize_parameters(n_x, n_h, n_y)

    # Gradient descent
    for i in range(num_iterations):
        # Forward pass: the cache keeps the intermediate values that the
        # backward pass needs.
        A2, cache = forward_propagation(X, parameters)

        cost = compute_cost(A2, Y, parameters)

        grads = backward_propagation(parameters, cache, X, Y)

        parameters = update_parameters(parameters, grads)

        if print_cost and i % 1000 == 0:
            print("Cost after iteration %i: %f" % (i, cost))

    return parameters

In [3]:
def layer_sizes(X, Y):
    """Read the input and output layer sizes off the data.

    Arguments:
    X -- input dataset of shape (input size, number of examples)
    Y -- labels of shape (output size, number of examples)

    Returns:
    n_x -- the size of the input layer (number of features, i.e. rows of X)
    n_y -- the size of the output layer (number of rows of Y)

    The hidden layer size n_h is a hyperparameter chosen by the caller, so
    it is not derived here.
    """
    n_x = X.shape[0]
    n_y = Y.shape[0]
    return (n_x, n_y)

NameError: name 'load_planar_dataset' is not defined

In [None]:
def initialize_parameters(n_x, n_h, n_y):
    """Create the initial weights and biases of the two-layer network.

    Arguments:
    n_x -- size of the input layer
    n_h -- size of the hidden layer
    n_y -- size of the output layer

    Returns:
    parameters -- python dictionary containing your parameters:
                    W1 -- weight matrix of shape (n_h, n_x)
                    b1 -- bias vector of shape (n_h, 1)
                    W2 -- weight matrix of shape (n_y, n_h)
                    b2 -- bias vector of shape (n_y, 1)
    """
    # Fixed seed so every call produces the same initial weights.
    np.random.seed(2)

    # np.random.randn draws from a standard normal distribution. Scaling by
    # 0.01 keeps the initial weights close to zero, so tanh starts in its
    # steep, near-linear region instead of saturating; the randomness breaks
    # the symmetry between hidden units. Biases can safely start at zero.
    W1 = np.random.randn(n_h, n_x) * 0.01
    b1 = np.zeros(shape=(n_h, 1))
    W2 = np.random.randn(n_y, n_h) * 0.01
    b2 = np.zeros(shape=(n_y, 1))

    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}
    return parameters

In [None]:
def forward_propagation(X, parameters):
    """Run the forward pass: Z1 -> A1 -> Z2 -> A2.

        Z1 = W1.X + b1
        A1 = tanh(Z1)
        Z2 = W2.A1 + b2
        A2 = sigmoid(Z2)

    Arguments:
    X -- input data of shape (input size, number of examples)
    parameters -- python dictionary containing W1, b1, W2, b2

    Returns:
    A2 -- the sigmoid output of the second layer
    cache -- python dictionary containing "Z1", "A1", "Z2" and "A2"
    """
    # Fetch the weights from the parameters dictionary
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]

    # Hidden layer: np.dot(W, X) is the matrix product; the bias column
    # broadcasts across all examples.
    Z1 = np.dot(W1, X) + b1
    A1 = np.tanh(Z1)

    # Output layer: fed by the hidden activations A1 (not by X).
    Z2 = np.dot(W2, A1) + b2
    A2 = 1.0 / (1.0 + np.exp(-Z2))  # logistic sigmoid

    cache = {"Z1": Z1,
             "A1": A1,
             "Z2": Z2,
             "A2": A2}

    return A2, cache

Now that you have computed $A^{[2]}$ (in the Python variable "`A2`"), which contains $a^{[2](i)}$ for every example, you can compute the cost function as follows:

$$J = - \frac{1}{m} \sum\limits_{i = 1}^{m} \large{(} \small y^{(i)}\log\left(a^{[2] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[2] (i)}\right) \large{)} \small\tag{13}$$

**Exercise**: Implement `compute_cost()` to compute the value of the cost $J$.

**Instructions**:
- There are many ways to implement the cross-entropy loss. To help you, we give you how we would have implemented
$- \sum\limits_{i=1}^{m}  y^{(i)}\log(a^{[2](i)})$:
```python
logprobs = np.multiply(np.log(A2),Y)
cost = - np.sum(logprobs)                # no need to use a for loop!
```

In [None]:
def compute_cost(A2, Y, parameters):
    """Compute the cross-entropy cost averaged over all examples.

        J = -(1/m) * sum(Y * log(A2) + (1 - Y) * log(1 - A2))

    Arguments:
    A2 -- sigmoid output of the second layer, of shape (1, number of examples)
    Y -- true labels, of shape (1, number of examples)
    parameters -- python dictionary containing W1, b1, W2, b2 (not used by
                  the cost itself; kept so existing callers keep working)

    Returns:
    cost -- the cross-entropy cost as a Python float
    """
    # Number of training examples: one column of Y per example
    m = Y.shape[1]

    # Keep the activations strictly inside (0, 1): a saturated sigmoid gives
    # exactly 0 or 1, and log(0) would turn the cost into inf/nan.
    eps = 1e-15
    A2 = np.clip(A2, eps, 1 - eps)

    # Calculate the cost
    logprobs = np.multiply(np.log(A2), Y) + np.multiply(np.log(1 - A2), 1 - Y)
    cost = -np.sum(logprobs) / m

    # Makes sure cost is the dimension we expect, e.g. turns [[17]] into 17
    cost = float(np.squeeze(cost))
    return cost

In [None]:
def backward_propagation(parameters, cache, X, Y):
    """Compute the gradients of the cost with respect to W1, b1, W2 and b2.

    Arguments:
    parameters -- python dictionary containing the weights; "W2" is read here
    cache -- python dictionary holding the forward-pass values "A1" and "A2"
    X -- input data of shape (input size, number of examples)
    Y -- true labels of shape (output size, number of examples)

    Returns:
    grads -- python dictionary containing dW1, db1, dW2, db2
    """
    # Number of training examples: one column of X per example
    m = X.shape[1]

    # Only W2 is needed: it carries the output error back to the hidden layer.
    W2 = parameters["W2"]

    # Fetch A1 and A2 from the cache dictionary
    A1 = cache["A1"]
    A2 = cache["A2"]

    # Backward propagation: calculate dW1, db1, dW2, db2.
    dZ2 = A2 - Y
    dW2 = (1 / m) * np.dot(dZ2, A1.T)
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)
    # 1 - A1^2 is the derivative of tanh evaluated at Z1.
    dZ1 = np.multiply(np.dot(W2.T, dZ2), 1 - np.power(A1, 2))
    dW1 = (1 / m) * np.dot(dZ1, X.T)
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

    # Return the gradients
    grads = {"dW1": dW1,
             "db1": db1,
             "dW2": dW2,
             "db2": db2}
    return grads

**Question**: Implement the update rule. Use gradient descent. You have to use (dW1, db1, dW2, db2) in order to update (W1, b1, W2, b2).

**General gradient descent rule**: $ \theta = \theta - \alpha \frac{\partial J }{ \partial \theta }$ where $\alpha$ is the learning rate and $\theta$ represents a parameter.

In [None]:
def update_parameters(parameters, grads, learning_rate=1.2):
    """Apply one gradient descent step: theta = theta - learning_rate * dtheta.

    Arguments:
    parameters -- python dictionary containing W1, b1, W2, b2
    grads -- python dictionary containing dW1, db1, dW2, db2
    learning_rate -- step size of the update

    Returns:
    parameters -- a new python dictionary containing the updated W1, b1, W2,
                  b2; the input dictionary and its arrays are left untouched
    """
    # Fetch weights from the parameters dictionary
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]

    # Fetch gradients from the grads dictionary
    dW1 = grads["dW1"]
    db1 = grads["db1"]
    dW2 = grads["dW2"]
    db2 = grads["db2"]

    # Update the weights using the gradients. The subtraction builds new
    # arrays rather than modifying the caller's in place.
    W1 = W1 - learning_rate * dW1
    b1 = b1 - learning_rate * db1
    W2 = W2 - learning_rate * dW2
    b2 = b2 - learning_rate * db2

    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}
    return parameters


In [None]:
def predict(parameters, X):
    """Run a forward pass on X and round the output activations.

    Arguments:
    parameters -- python dictionary of weights passed to forward_propagation
    X -- input data passed to forward_propagation

    Returns:
    predictions -- the output-layer activations rounded to the nearest integer
    """
    # Only the final activations are needed here; the cache is discarded.
    output_activations, _ = forward_propagation(X, parameters)
    return np.round(output_activations)

In [None]:
def sigmoid(z):
    """Compute the logistic sigmoid 1 / (1 + exp(-z)) element-wise.

    Arguments:
    z -- a scalar or numpy array of any shape

    Returns:
    s -- sigmoid(z), with the same shape as z
    """
    s = 1.0 / (1.0 + np.exp(-z))
    return s

### Dataset

In [None]:
from sklearn.model_selection import train_test_split
%matplotlib inline
import sklearn.datasets

In [None]:
# Generate a two-class "moons" dataset. The result of an earlier make_circles
# call was overwritten straight away, so only this dataset was ever used; to
# experiment with circles instead, swap in
# sklearn.datasets.make_circles(n_samples=1000, factor=.5, noise=.3).
X, Y = sklearn.datasets.make_moons(n_samples=1000, noise=.2)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# The network code expects one example per column: transpose the inputs and
# turn each label vector into a single row.
X, Y = X.T, Y.reshape(1, -1)
X_train, Y_train = X_train.T, Y_train.reshape(1, -1)
X_test, Y_test = X_test.T, Y_test.reshape(1, -1)

# Plot the two input features against each other, coloured by label.
plt.scatter(X[0, :], X[1, :], c=Y[0], s=10, cmap=plt.cm.Spectral)