# In [None]:
def backward_propagation_step(dL_dA, cache, activation):
    """
    Backward pass through a single layer: activation part first, then linear part.

    dL_dA - activation gradient for current layer l
    cache - (W, b, A_prev, Z) stored for current layer l
    activation - string: "sigmoid" or "relu"

    Returns:
    dL_dA_prev - Gradient activation of the previous layer l-1, same shape as A_prev
    dL_dW - Gradient of W current layer l, same shape as W
    dL_db - Gradient of b (current layer l), same shape as b

    Raises:
    ValueError - if activation is neither "relu" nor "sigmoid"
    """
    W, b, A_prev, Z = cache

    # backward activation part: local derivative g'(Z) of the layer's activation
    if activation == "relu":
        dg_dz = relu_backward(Z)
    elif activation == "sigmoid":
        dg_dz = sigmoid_backward(Z)
    else:
        # Without this branch dg_dz would stay unbound and the call would fail
        # further down with an UnboundLocalError that hides the real cause.
        raise ValueError(
            'activation must be "relu" or "sigmoid", got {!r}'.format(activation)
        )

    assert (dL_dA.shape == dg_dz.shape)
    dL_dZ = dL_dA * dg_dz

    # backward linear part: dW and db are scaled by 1 / (number of columns of A_prev)
    inv_m = 1 / A_prev.shape[1]
    dL_dW = inv_m * np.dot(dL_dZ, A_prev.T)
    dL_db = inv_m * np.sum(dL_dZ, axis=1, keepdims=True)
    dL_dA_prev = np.dot(W.T, dL_dZ)

    # sanity checks: every gradient has the shape of the quantity it belongs to
    assert (dL_dA_prev.shape == A_prev.shape)
    assert (dL_dW.shape == W.shape)
    assert (dL_db.shape == b.shape)

    return dL_dA_prev, dL_dW, dL_db
        

def relu_backward(Z):
    """
    Local derivative of the ReLU activation evaluated at Z.

    Z - array of pre-activation values

    Returns:
    integer array with the same shape as Z: 1 where Z > 0, 0 everywhere else
    """
    # Boolean mask of the strictly positive entries, cast to 1 / 0.
    is_positive = np.asarray(Z > 0)
    relu_grad = is_positive.astype(int)
    assert (relu_grad.shape == Z.shape)
    return relu_grad



def sigmoid_backward(Z):
    """
    Local derivative of the sigmoid activation evaluated at Z.

    Z - pre-activation values, passed to sigmoid()

    Returns:
    dg_dz - sigmoid(Z) * (1 - sigmoid(Z)), same shape as Z
    """
    # Evaluate the sigmoid once and reuse it instead of computing it twice.
    s = sigmoid(Z)
    dg_dz = s * (1 - s)
    assert (dg_dz.shape == Z.shape)
    return dg_dz



def forward_propagation_whole_process(X, parameters):
    """
    Forward pass for [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID

    X - data, array of shape (input size, number of examples)
    parameters - initialized parameters: entries 'W' and 'b', each holding
                 the values for layers 1, 2, ..., L under those keys

    Returns:
    A_last - last activation value (y_pred), shape (1, number of examples)
    caches - dict of caches indexed from 0 to L-1; caches[l-1] is the tuple
             (W, b, A_prev, Z) of layer l, which is the layout that
             backward_propagation_whole_process reads
    """

    caches = {}
    A = X

    L = len(parameters['W'])  # number of layers in the neural network

    # [LINEAR -> RELU]*(L-1)
    for l in range(1, L):
        W, b = parameters['W'][l], parameters['b'][l]
        A_prev = A  # input of layer l; this (not the output) is what backprop needs
        Z = np.dot(W, A_prev) + b
        A = np.maximum(0, Z)
        caches[l - 1] = (W, b, A_prev, Z)

    # LINEAR -> SIGMOID
    W, b = parameters['W'][L], parameters['b'][L]
    Z = np.dot(W, A) + b  # LINEAR
    A_last = sigmoid(Z)
    caches[L - 1] = (W, b, A, Z)  # A is still the input of the last layer here

    assert (A_last.shape == (1, X.shape[1]))  # (1,m)

    return A_last, caches


def backward_propagation_whole_process(A_last, Y, caches):
    """
    backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID

    A_last - probability vector, output (y_pred) of the forward propagation
    Y - true labels (0 if non-cat, 1 if cat); reshaped to the shape of A_last
    caches - dict indexed from 0 to L-1; caches[l-1] is the cache
             (W, b, A_prev, Z) of layer l

    Returns:
    grads - dict with keys 'dW' and 'db', each a dict with keys 1..L holding
            the gradient of the corresponding layer's W / b
    """
    dL_dA = {}
    dL_dW = {}
    dL_db = {}

    L = len(caches)  # the number of layers

    # Make sure Y lines up element-wise with A_last; a column-vector Y would
    # otherwise broadcast to an (m, m) matrix in the expression below.
    Y = np.asarray(Y).reshape(A_last.shape)

    # Initialize the backpropagation: derivative of the binary cross-entropy
    # loss with respect to A_last.
    # NOTE: entries of A_last that are exactly 0 or 1 cause a division by zero here.
    dL_dA[L] = - (np.divide(Y, A_last) - np.divide(1 - Y, 1 - A_last))

    # layer L: (SIGMOID -> LINEAR) gradients
    current_cache = caches[L - 1]
    dL_dA[L - 1], dL_dW[L], dL_db[L] = backward_propagation_step(
        dL_dA[L], current_cache, activation='sigmoid')

    # Loop over layers l = L-1 down to l = 1 (cache indices L-2 down to 0)
    for l in reversed(range(1, L)):
        # l-th layer: (RELU -> LINEAR) gradients.
        current_cache = caches[l - 1]
        dL_dA[l - 1], dL_dW[l], dL_db[l] = backward_propagation_step(
            dL_dA[l], current_cache, activation='relu')

    grads = {'dW': dL_dW, 'db': dL_db}

    return grads