In [10]:
import numpy as np
import matplotlib.pyplot as plt
import h5py
import time
import scipy
from PIL import Image
from scipy import ndimage
# import other packages here if required

In [None]:
# Template cell: the right-hand side of the first assignment is intentionally left empty
# and must be filled in with your own data-loading code before this cell can run.
# NOTE: the functions defined further down document Y as having shape (1, number of examples).
X_train, Y= # TODO: load your own dataset here
shape_X = X_train.shape
shape_Y = Y.shape
m = X_train.shape[0] # number of examples, read from the first axis of X_train (adjust if your data set is laid out differently)
num_px = X_train.shape[1]# image side length, read from the second axis of X_train (assumes square images; adjust for your data set)

In [None]:
# Flatten each training example into a single column: (m, ...) becomes (features, m).
train_x_flatten = X_train.reshape((X_train.shape[0], -1)).transpose()   # "-1" lets reshape infer the flattened length

# Rescale by 255 so feature values lie between 0 and 1 (assumes 8-bit pixel values).
X = train_x_flatten / 255.0


print("shape: {}".format(X.shape))

# Basic Neural Network with one hidden layer

In this network we consider one input layer, followed by one hidden layer and one output layer.
The tanh activation function is used for the hidden layer. Because the output must be either 0 or 1, we use the sigmoid activation function for the output layer.

### Steps to build a model
1. Initialize W (weights) and b(bias)
2. Optimize the loss iteratively to learn the parameters (W, b):
      (i) Forward propagation, followed by computing the cost
      (ii) Back propagation, followed by updating the parameters using gradient descent
3. Use the learned (W,b) to predict the labels for the given set of examples

### The basic structure of the model build here is as follows

We will be using 5 methods:
1. Initialize parameters
2. Forward propagation
3. Computing cost
4. Back Propagation
5. Update parameters

All five of these methods are called from a separate method that builds the neural network model. This is followed by a method that makes predictions once the model is trained.

After all these methods are implemented, two lines of code are enough to run the model.


### Initializing parameters:
Since we have one hidden layer, we need two weight matrices and two biases.

One weight and bias between the input layer and hidden layer and the other weight and bias between hidden layer and output layer

In [2]:
def initialize_parameters(n_x, n_h, n_y):
    """
    Build the starting weights and biases for a network with one hidden layer.

    Weights are sampled from a standard normal distribution and scaled by 0.01;
    biases start at zero.

    Arguments:
    n_x -- size of the input layer
    n_h -- size of the hidden layer
    n_y -- size of the output layer

    Returns:
    parameters -- python dictionary with the keys:
                    "W1" -- weight matrix of shape (n_h, n_x)
                    "b1" -- bias vector of shape (n_h, 1)
                    "W2" -- weight matrix of shape (n_y, n_h)
                    "b2" -- bias vector of shape (n_y, 1)
    """
    weight_scale = 0.01

    # Sampling order (W1 before W2) is kept so a seeded run reproduces the same values.
    hidden_weights = weight_scale * np.random.randn(n_h, n_x)
    hidden_bias = np.zeros(shape=(n_h, 1))
    output_weights = weight_scale * np.random.randn(n_y, n_h)
    output_bias = np.zeros(shape=(n_y, 1))

    parameters = {
        "W1": hidden_weights,
        "b1": hidden_bias,
        "W2": output_weights,
        "b2": output_bias,
    }

    # Sanity-check every array against the shape promised in the docstring.
    expected_shapes = {
        "W1": (n_h, n_x),
        "b1": (n_h, 1),
        "W2": (n_y, n_h),
        "b2": (n_y, 1),
    }
    for key, shape in expected_shapes.items():
        assert parameters[key].shape == shape

    return parameters

### Forward Propagation:
In this model the input data is fed to the hidden layer, this hidden layer processes the data as per the activation function and passes it to the output layer. Output layer again processes the data as per its activation function and gives us the output.(There may be more than one hidden layer where output from first hidden layer is fed to the next one and so on, the output of the last hidden layer is fed into the output layer)

Below are the formulae used in this model to compute forward propagation. a1 is the output of hidden layer and a2 is the actual output of the model.

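$$z_1 = W_1 x + b_1$$

$$a_1 = \tanh(z_1) \quad \text{(tanh activation function)}$$

$$z_2 = W_2 a_1 + b_2$$

$$\hat{y}^{(i)} = a_2 = \sigma(z_2) \quad \text{(sigmoid activation function)}$$

$$y^{(i)}_{prediction} = \begin{cases} 1 & \text{if } a_2 > 0.5 \\ 0 & \text{otherwise} \end{cases}$$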


In [20]:
def forward_propagation(X, parameters):
    """
    Run one forward pass: linear -> tanh -> linear -> sigmoid.

    Argument:
    X -- input data of size (n_x, m)
    parameters -- python dictionary holding "W1", "b1", "W2" and "b2"
                  (output of the initialization function)

    Returns:
    A2 -- the sigmoid output of the second layer, of shape (1, m)
    cache -- a dictionary containing "Z1", "A1", "Z2" and "A2"
    """
    W1, b1 = parameters["W1"], parameters["b1"]
    W2, b2 = parameters["W2"], parameters["b2"]

    # Hidden layer: affine transform followed by tanh.
    hidden_linear = np.dot(W1, X) + b1
    hidden_activation = np.tanh(hidden_linear)

    # Output layer: affine transform followed by the sigmoid (probabilities).
    output_linear = np.dot(W2, hidden_activation) + b2
    output_activation = 1.0 / (1.0 + np.exp(-output_linear))

    # The model has a single output unit, so there is one probability per example.
    assert output_activation.shape == (1, X.shape[1])

    cache = dict(Z1=hidden_linear,
                 A1=hidden_activation,
                 Z2=output_linear,
                 A2=output_activation)

    return output_activation, cache

### Computing Cost:
The cost is computed with the method below.

Note: `logprobs` holds the cross-entropy loss of each example (scaled by 1/m). The formula used in the code is one of several possible loss functions.

`cost` is the cross-entropy cost, which is simply the sum of `logprobs` over all the examples.


In [2]:
def compute_cost(A2, Y):
    """
    Computes the cross-entropy cost, averaged over the examples.

    Arguments:
    A2 -- The sigmoid output of the second activation, of shape (1, number of examples)
    Y -- actual labels vector of shape (1, number of examples)

    Returns:
    cost -- cross-entropy cost, as a python float
    """

    # Take the number of examples from the activations themselves rather than from a
    # notebook-level global, so the cost is correct for whatever data is passed in.
    m = A2.shape[1]

    # Per-example cross-entropy loss, already scaled by 1/m; summing gives the cost.
    logprobs = (-1/m)*(Y*np.log(A2) + (1-Y)*np.log(1-A2))
    cost = np.sum(logprobs)

    cost = float(np.squeeze(cost))  # makes sure cost is a plain float,
                                    # e.g. turns [[17]] into 17
    assert(isinstance(cost, float))

    return cost

### Back propagation:
The back propagation step moves backward through the network and fine-tunes the weights. In other words, we compute the derivatives of the equations used in forward propagation.

Note: the dZ2, dW2, db2, dZ1, dW1 and db1 values computed below are the derivatives of the cost with respect to the corresponding quantities from forward propagation.


In [3]:
def backward_propagation(parameters, cache, X, Y):
    """
    Computes the gradients of the cost for the one-hidden-layer network.

    Arguments:
    parameters -- python dictionary containing our parameters (only "W2" is read here)
    cache -- a dictionary containing "Z1", "A1", "Z2" and "A2" (only "A1" and "A2" are read here)
    X -- input data of shape (n_x, number of examples)
    Y -- actual labels vector of shape (1, number of examples)

    Returns:
    grads -- python dictionary containing the gradients "dW1", "db1", "dW2" and "db2"
    """
    # Take the number of examples from the data passed in rather than from a
    # notebook-level global, so the gradients are scaled correctly for any batch.
    m = X.shape[1]

    W2 = parameters["W2"]

    A1 = cache["A1"]
    A2 = cache["A2"]

    # Output layer: with a sigmoid output and cross-entropy loss, dZ2 reduces to A2 - Y.
    dZ2 = A2-Y
    dW2 = (1/m)*(np.dot(dZ2,A1.T))
    db2 = (1/m)*np.sum(dZ2,axis=1,keepdims=True)

    # Hidden layer: (1 - A1^2) is the derivative of tanh evaluated at Z1.
    dZ1 = np.dot(W2.T,dZ2)*(1-np.power(A1,2))
    dW1 = (1/m)*np.dot(dZ1,X.T)
    db1 = (1/m)*np.sum(dZ1,axis=1,keepdims=True)

    grads = {"dW1": dW1,
             "db1": db1,
             "dW2": dW2,
             "db2": db2}

    return grads

### Updating Parameters:
We will update the weights and biases using Gradient descent.

Gradient descent update rule:

θ=θ−α∂J/∂θ where α is the learning rate and θ represents a parameter.

In [None]:
def update_parameters(parameters, grads, learning_rate = 1.2):
    """
    Apply one gradient descent step: theta = theta - learning_rate * d(theta).

    Arguments:
    parameters -- python dictionary containing "W1", "b1", "W2" and "b2"
    grads -- python dictionary containing "dW1", "db1", "dW2" and "db2"
    learning_rate -- step size of the update

    Returns:
    parameters -- a new python dictionary containing the updated parameters
                  (the input dictionary and its arrays are left untouched)
    """
    names = ("W1", "b1", "W2", "b2")

    # Look up every parameter first, then every gradient.
    current_values = [parameters[name] for name in names]
    gradients = [grads["d" + name] for name in names]

    updated = {}
    for name, value, gradient in zip(names, current_values, gradients):
        updated[name] = value - (learning_rate * gradient)

    return updated

### Lets build a Neural Networks model using the above methods

In [28]:
def nn_model(X, Y, n_h, num_iterations = 10000, print_cost=False):
    """
    Train the one-hidden-layer network with batch gradient descent.

    Arguments:
    X -- dataset of shape (number of features, number of examples)
    Y -- labels of shape (1, number of examples)
    n_h -- size of the hidden layer
    num_iterations -- number of iterations of the gradient descent loop
    print_cost -- if True, print the cost every 1000 iterations

    Returns:
    parameters -- parameters learnt by the model. They can then be used to predict.
    """
    # Layer sizes come straight from the data: rows of X are features, rows of Y are outputs.
    parameters = initialize_parameters(X.shape[0], n_h, Y.shape[0])

    for step in range(num_iterations):
        # Forward pass, cost, backward pass, then one gradient descent update.
        A2, cache = forward_propagation(X, parameters)
        cost = compute_cost(A2, Y)
        grads = backward_propagation(parameters, cache, X, Y)
        parameters = update_parameters(parameters, grads)

        # Report progress only on every 1000th iteration, and only when asked to.
        if not print_cost or step % 1000:
            continue
        print ("Cost after iteration %i: %f" %(step, cost))

    return parameters

### Predictions:
If A2 (y predicted) value is greater than 0.5 then we will consider A2=1 else 0

In [7]:
def predict(parameters, X):
    """
    Classify every example in X with the learned parameters.

    Arguments:
    parameters -- python dictionary containing the learned parameters
    X -- input data of size (n_x, m)

    Returns
    predictions -- boolean array, True where the predicted probability is above 0.5
    """
    # Only the output probabilities are needed; the forward-pass cache is discarded.
    probabilities, _ = forward_propagation(X, parameters)
    return probabilities > 0.5

### Running the model
Change the values of n_h and num_iterations if required.

In [None]:
# Train the one-hidden-layer network; n_h sets the number of hidden units.
parameters = nn_model(
    X,
    Y,
    n_h=4,
    num_iterations=10000,
    print_cost=True,
)

In [None]:
predictions = predict(parameters, X)
# Accuracy is the share of examples whose thresholded prediction equals the label.
# np.mean yields a scalar directly, which avoids calling float() on a (1, 1) array
# (deprecated in recent NumPy releases).
accuracy = float(np.mean(predictions == Y)) * 100
print ('Accuracy: %d' % accuracy + '%')

### Test the model with a image

In [None]:
my_image = "my_image.jpg"  # TODO: replace with the name of your image file inside the "images/" folder
my_label_y = [1] # the actual class of your image (1 or 0), change it to 0 if the true class of your image is 0


fname = "images/" + my_image
# scipy.ndimage.imread and scipy.misc.imresize no longer exist in current SciPy, so the
# image is loaded and resized with PIL instead. convert("RGB") guarantees three channels,
# which the reshape to (num_px*num_px*3, 1) below relies on.
image = np.array(Image.open(fname).convert("RGB"))
my_image = np.array(Image.fromarray(image).resize((num_px, num_px), Image.BILINEAR)).reshape((num_px*num_px*3,1))
my_image = my_image/255.
# predict() takes (parameters, X); the true label is only used in the printed comparison.
my_predicted_image = predict(parameters, my_image)

plt.imshow(image)
predicted_label = int(np.squeeze(my_predicted_image))
print ("y = " + str(predicted_label) + ", your one-hidden-layer model predicts class " + str(predicted_label) + " (true class: " + str(my_label_y[0]) + ").")

# Neural Networks model with multiple hidden layers

This network contains one input layer followed by multiple hidden layers, and the last hidden layer is followed by an output layer.

Relu Activation function is used for all the hidden layers and sigmoid activation function is used for the output layer as this is a model for binary classification(output either 1 or 0)


Same as above  model with one hidden layer we will be building 5 methods
1. Initialize parameters
2. Forward propagation
3. Computing cost
4. Back Propagation(2 methods are used)
5. Update parameters

All five of these methods are called from a separate method that builds the neural network model. This is followed by a method that makes predictions once the model is trained.

After all these methods are built 2 lines of code will be written to run the model


### useful methods in the model
The sigmoid and ReLU activation functions are used in this neural network, so each of them (and its derivative) is implemented in its own method to keep the code clear.

In [21]:
#sigmoid activation function
def sigmoid(Z):
    """
    Apply the sigmoid activation element-wise.

    Arguments:
    Z -- numpy array of any shape

    Returns:
    A -- sigmoid(Z), same shape as Z
    cache -- Z itself, handed back because backpropagation needs it
    """
    activation = 1.0 / (1.0 + np.exp(-Z))
    return activation, Z

In [23]:
# relu activation function
def relu(Z):
    """
    Apply the ReLU activation element-wise.

    Arguments:
    Z -- output of the linear layer, of any shape

    Returns:
    A -- max(0, Z), of the same shape as Z
    cache -- Z itself, handed back because backpropagation needs it
    """
    activation = np.maximum(0, Z)

    # ReLU is element-wise, so the shape must be unchanged.
    assert activation.shape == Z.shape

    return activation, Z

In [24]:
# derivative of the sigmoid function; this method is used in back propagation
def sigmoid_backward(dA, cache):
    """
    Backward pass through a single sigmoid unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- the pre-activation value 'Z' saved during the forward pass

    Returns:
    dZ -- gradient of the cost with respect to Z
    """
    Z = cache

    # sigmoid'(Z) = sigmoid(Z) * (1 - sigmoid(Z)); the chain rule multiplies it by dA.
    sig = 1.0 / (1.0 + np.exp(-Z))
    dZ = dA * sig * (1 - sig)

    assert dZ.shape == Z.shape

    return dZ

In [25]:
# derivative of the relu function; this method is used in back propagation
def relu_backward(dA, cache):
    """
    Backward pass through a single ReLU unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- the pre-activation value 'Z' saved during the forward pass

    Returns:
    dZ -- gradient of the cost with respect to Z
    """
    Z = cache

    # Work on a copy so the caller's dA is never modified.
    grad = np.array(dA, copy=True)

    # ReLU passes the gradient through where Z > 0 and blocks it everywhere else.
    blocked = Z <= 0
    grad[blocked] = 0

    assert grad.shape == Z.shape

    return grad

### Initializing parameters:
Initialize the weights and bias for any number of hidden layers using one for loop.

In [None]:
def initialize_parameters_deep(layer_dims):
    """
    Build the starting weights and biases for a network of arbitrary depth.

    Arguments:
    layer_dims -- python array (list) containing the dimensions of each layer in our network,
                  starting with the input size

    Returns:
    parameters -- python dictionary containing "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1]),
                          drawn from a standard normal distribution and scaled by 0.01
                    bl -- bias vector of shape (layer_dims[l], 1), all zeros
    """
    parameters = {}

    # Pair every layer size with the size of the layer feeding into it.
    consecutive_sizes = zip(layer_dims[:-1], layer_dims[1:])

    for l, (n_prev, n_curr) in enumerate(consecutive_sizes, start=1):
        w_key = 'W' + str(l)
        b_key = 'b' + str(l)

        parameters[w_key] = np.random.randn(n_curr, n_prev) * 0.01
        parameters[b_key] = np.zeros((n_curr, 1))

        assert parameters[w_key].shape == (layer_dims[l], layer_dims[l-1])
        assert parameters[b_key].shape == (layer_dims[l], 1)

    return parameters

### Forward Propagation:
As already mentioned relu activation function is used for hidden layers and sigmoid activation function is used for output layer. Below are the formulae used in this model.

Z=(𝑊l*A)+𝑏l (A=X for the first hidden layer)

A=relu(𝑧) (relu activation function)

The above mentioned equations are for hidden layers where 

Wl is Weight matrix of l th layer (l=1,2,3,.....L-1)

bl is bias matrix of l th layer (l=1,2,3,.....L-1) 

ZL=(𝑊L*A)+𝑏L

𝑦̂ (𝑖)=AL=𝜎(ZL) (sigmoid activation function)

The above mentioned two equations are for output layer where 

WL is Weight matrix of L th layer where L is the last layer in the network

bL is bias matrix of L th layer where L is the last layer in the network

$$y_{prediction} = \begin{cases} 1 & \text{if } A_L > 0.5 \\ 0 & \text{otherwise} \end{cases}$$
           


Note: Calculations for W,b,Z, A, cache can be implemented in another method and can be called in this method which reduces the number of lines of code. To make the code more clear I have implemented it in this way.

In [11]:
def L_model_forward(X, parameters):
    """
    Implement forward propagation for the [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID computation

    Arguments:
    X -- data, numpy array of shape (input size, number of examples)
    parameters -- output of initialize_parameters_deep()

    Returns:
    AL -- last post-activation value, of shape (1, number of examples)
    caches -- list with one entry per layer (L entries, indexed from 0 to L-1). Each entry is
              the tuple (linear_cache, activation_cache) that linear_activation_backward()
              unpacks, where linear_cache is (A_prev, W, b) and activation_cache is Z.
    """

    caches = []
    A = X
    L = len(parameters) // 2                  # number of layers in the neural network

    # Implement [LINEAR -> RELU]*(L-1). Add each layer's cache to the "caches" list.
    for l in range(1, L):
        A_prev = A
        W = parameters["W" + str(l)]
        b = parameters["b" + str(l)]
        Z = np.dot(W, A_prev) + b
        A, activation_cache = relu(Z)
        # Store the layer inputs as well as Z: the backward pass needs A_prev and W.
        caches.append(((A_prev, W, b), activation_cache))

    # Implement LINEAR -> SIGMOID. The output layer is indexed with L directly rather than
    # with the loop variable, which is undefined when the network has a single layer.
    W = parameters["W" + str(L)]
    b = parameters["b" + str(L)]
    Z = np.dot(W, A) + b
    AL, activation_cache = sigmoid(Z)
    caches.append(((A, W, b), activation_cache))

    # The network has a single output unit, so there is one probability per example.
    assert(AL.shape == (1, X.shape[1]))

    return AL, caches

### Computing Cost
In the above model with single hidden layer we have implemented the cost function in 2 lines of code(logprobs, cost) but in this case we have just combined both of them into a single line of code. 

In [12]:
def L_model_compute_cost(AL, Y):
    """
    Implement the cross-entropy cost function, averaged over the examples.

    Arguments:
    AL -- probability vector corresponding to your label predictions, shape (1, number of examples)
    Y -- actual "label" vector (for example: containing 0 if non-cat, 1 if cat), shape (1, number of examples)

    Returns:
    cost -- cross-entropy cost (0-dimensional numpy value)
    """

    # Take the number of examples from the predictions themselves rather than from a
    # notebook-level global, so the cost is correct for whatever data is passed in.
    m = AL.shape[1]

    # Compute loss from AL and Y.
    cost = (-1/m)*np.sum(((Y*np.log(AL))+((1-Y)*(np.log(1-AL)))))

    cost = np.squeeze(cost) # To make sure your cost's shape is what we expect (e.g. this turns [[17]] into 17).
    assert(cost.shape == ())

    return cost

### Backpropagation:
To implement Back propagation step two methods are implemented for easy understanding.
1. Finding dZ, dW, db, dA_prev for both sigmoid and relu activation functions
2. Carrying out back propagation step using the above mentioned method

Note: In the code below, dW, db and dA_prev could be computed in a separate method and called from here, which would reduce the number of lines of code. They are written out here to keep the code easy to follow.

In [13]:
def linear_activation_backward(dA, cache, activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer.

    Arguments:
    dA -- post-activation gradient for current layer l
    cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently;
             linear_cache is (A_prev, W, b) and activation_cache is passed on to the activation's backward function
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b

    Raises:
    ValueError -- if activation is neither "relu" nor "sigmoid"
    """
    linear_cache, activation_cache = cache
    A_prev, W, b = linear_cache
    m = A_prev.shape[1]

    # Only the activation derivative differs between the two layer types.
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError('activation must be "relu" or "sigmoid", got %r' % (activation,))

    # The linear part of the backward pass is identical for both activations.
    dW = (1/m)*(np.dot(dZ,A_prev.T))
    db = (1/m)*np.sum(dZ,axis=1,keepdims=True)
    dA_prev = np.dot(W.T,dZ)

    return dA_prev, dW, db

In [14]:
def L_model_backward(AL, Y, caches):
    """
    Backward pass for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID network.

    Arguments:
    AL -- probability vector, output of the forward propagation (L_model_forward())
    Y -- actual "label" vector (containing 0 if non-cat, 1 if cat)
    caches -- list of per-layer caches produced by L_model_forward():
                caches[l] for l = 0...L-2 belong to the "relu" layers
                caches[L-1] belongs to the final "sigmoid" layer

    Returns:
    grads -- A dictionary with the gradients
             grads["dA" + str(l)] = ...
             grads["dW" + str(l)] = ...
             grads["db" + str(l)] = ...
    """
    grads = {}
    num_layers = len(caches)
    Y = Y.reshape(AL.shape)  # from here on Y has the same shape as AL

    # Derivative of the cross-entropy cost with respect to the network output AL.
    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    # Output layer (SIGMOID -> LINEAR): produces dA of layer L-1 and dW, db of layer L.
    dA_prev, dW, db = linear_activation_backward(dAL, caches[num_layers - 1], activation = "sigmoid")
    grads["dA" + str(num_layers - 1)] = dA_prev
    grads["dW" + str(num_layers)] = dW
    grads["db" + str(num_layers)] = db

    # Hidden layers (RELU -> LINEAR), walking from layer L-1 down to layer 1.
    for layer in range(num_layers - 1, 0, -1):
        upstream = grads["dA" + str(layer)]
        dA_prev, dW, db = linear_activation_backward(upstream, caches[layer - 1], activation = "relu")
        grads["dA" + str(layer - 1)] = dA_prev
        grads["dW" + str(layer)] = dW
        grads["db" + str(layer)] = db

    return grads

### Updating parameters:
Same gradient descent rule that is used in the above model with one hidden layer will be used in this model also.

Note: L = len(parameters) // 2
Here L is the number of layers. `//` is floor division; we divide by 2 because the dictionary "parameters" contains both a weight matrix and a bias vector for every layer.

In [16]:
def L_model_update_parameters(parameters, grads, learning_rate):
    """
    Apply one gradient descent step to every layer's weights and biases.

    Arguments:
    parameters -- python dictionary containing your parameters
    grads -- python dictionary containing your gradients, output of L_model_backward
    learning_rate -- step size of the update

    Returns:
    parameters -- the same dictionary, with its entries replaced by the updated values
                  parameters["W" + str(l)] = ...
                  parameters["b" + str(l)] = ...
    """
    # Every layer contributes one "W" and one "b" entry, hence the division by two.
    num_layers = len(parameters) // 2

    for layer in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer)
            parameters[key] = parameters[key] - (learning_rate*grads["d" + key])

    return parameters

### Lets build a model using the above implemented methods

In [18]:
def L_layer_model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000, print_cost=False):
    """
    Train an L-layer neural network: [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID.

    Arguments:
    X -- data, numpy array of shape (number of features, number of examples)
    Y -- actual binary "label" vector (0 or 1), of shape (1, number of examples)
    layers_dims -- list containing the input size and each layer size, of length (number of layers + 1).
    learning_rate -- learning rate of the gradient descent update rule
    num_iterations -- number of iterations of the optimization loop
    print_cost -- if True, the cost is printed and recorded every 100 iterations

    Returns:
    parameters -- parameters learnt by the model. They can then be used to predict.
    """
    costs = []  # cost recorded every 100 iterations (only filled when print_cost is True)

    parameters = initialize_parameters_deep(layers_dims)

    for iteration in range(num_iterations):
        # Forward pass, cost, backward pass, then one gradient descent update.
        AL, caches = L_model_forward(X, parameters)
        cost = L_model_compute_cost(AL, Y)
        grads = L_model_backward(AL, Y, caches)
        parameters = L_model_update_parameters(parameters, grads, learning_rate)

        # Report and record the cost every 100 iterations.
        if print_cost and iteration % 100 == 0:
            print ("Cost after iteration %i: %f" %(iteration, cost))
            costs.append(cost)

    # Plot the recorded learning curve.
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters

### Predictions:
If AL value is greater than 0.5 it is considered to be 1

In [17]:
def L_model_predict(parameters, X):
    """
    Classify every example in X with the learned parameters of the L-layer model.

    Arguments:
    parameters -- python dictionary containing the learned parameters
    X -- input data of size (n_x, m)

    Returns
    predictions -- boolean array, True where the predicted probability is above 0.5
    """
    # Only the output probabilities are needed; the forward-pass caches are discarded.
    probabilities, _ = L_model_forward(X, parameters)
    return probabilities > 0.5

### Running the model
Change the values of layers_dims and num_iterations if required

In [None]:
# 4-layer model. The input size is read from the data instead of being hard-coded, and the
# model is trained on X and Y, the arrays prepared at the top of this notebook.
layers_dims = [X.shape[0], 20, 7, 5, 1]
parameters = L_layer_model(X, Y, layers_dims, num_iterations = 2500, print_cost = True)

In [None]:
# The prediction function of the L-layer model is L_model_predict.
predictions = L_model_predict(parameters, X)
# Accuracy is the share of examples whose thresholded prediction equals the label.
# np.mean yields a scalar directly, which avoids calling float() on a (1, 1) array
# (deprecated in recent NumPy releases).
accuracy = float(np.mean(predictions == Y)) * 100
print ('Accuracy: %d' % accuracy + '%')

### Test the model with your own image

In [None]:
my_image = "my_image.jpg"  # TODO: replace with the name of your image file inside the "images/" folder
my_label_y = [1] # the true class of your image (1 or 0), change it to 0 if the true class of your image is 0


fname = "images/" + my_image
# scipy.ndimage.imread and scipy.misc.imresize no longer exist in current SciPy, so the
# image is loaded and resized with PIL instead. convert("RGB") guarantees three channels,
# which the reshape to (num_px*num_px*3, 1) below relies on.
image = np.array(Image.open(fname).convert("RGB"))
my_image = np.array(Image.fromarray(image).resize((num_px, num_px), Image.BILINEAR)).reshape((num_px*num_px*3,1))
my_image = my_image/255.
# L_model_predict() takes (parameters, X); the true label is only used in the printed comparison.
my_predicted_image = L_model_predict(parameters, my_image)

plt.imshow(image)
predicted_label = int(np.squeeze(my_predicted_image))
print ("y = " + str(predicted_label) + ", your L-layer model predicts class " + str(predicted_label) + " (true class: " + str(my_label_y[0]) + ").")