### How to implement backpropagation

In [None]:
import numpy as np

def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + exp(-z))`` element-wise.

    The value is computed in an overflow-free form: the naive expression
    evaluates ``exp(-z)``, which overflows to ``inf`` (and emits a
    RuntimeWarning) for large negative ``z``.

    Args:
        z: A scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array with the same
        shape as ``z``.
    """
    z = np.asarray(z)
    # exp(-|z|) always lies in (0, 1], so it can never overflow.
    e = np.exp(-np.abs(z))
    # For z >= 0: 1 / (1 + exp(-z)).  For z < 0 the algebraically equal
    # form exp(z) / (1 + exp(z)) is used, which keeps precision in the tail.
    out = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
    # Indexing with () turns a 0-d array back into a NumPy scalar and is a
    # no-op (view) for arrays with one or more dimensions.
    return out[()]

def sigmoid_derivative(z):
    """Return the derivative of the sigmoid evaluated at ``z``.

    Uses the identity ``sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z))``.
    Note that ``z`` is the pre-activation value, not the sigmoid output.
    """
    activation = sigmoid(z)
    return activation * (1 - activation)

# def binary_cross_entropy_loss(y, t):
#     eps = 1e-9
#     return -np.mean(t * np.log(y + eps) + (1 - t) * np.log(1 - y + eps))

def forward(X, params):
    """Run the forward pass of a three-layer, all-sigmoid network.

    Args:
        X: Input batch. Assumed to be shaped (n_samples, n_features) so that
            ``np.dot(X, W1)`` is valid -- confirm against the caller.
        params: Mapping with the keys "W1", "b1", "W2", "b2", "W3", "b3".

    Returns:
        A dict cache with the pre-activations ("z1", "z2", "z3"), the hidden
        activations ("a1", "a2"), the network output ("y") and the input
        ("X"). The input is stored because the first-layer weight gradient
        is computed from it during the backward pass.
    """
    W1, b1 = params["W1"], params["b1"]
    W2, b2 = params["W2"], params["b2"]
    W3, b3 = params["W3"], params["b3"]

    # Hidden layer 1
    z1 = np.dot(X, W1) + b1
    a1 = sigmoid(z1)

    # Hidden layer 2
    z2 = np.dot(a1, W2) + b2
    a2 = sigmoid(z2)

    # Output layer
    z3 = np.dot(a2, W3) + b3
    y = sigmoid(z3)

    # Everything the backward pass may need is returned as a cache.
    return {"X": X, "z1": z1, "a1": a1, "z2": z2, "a2": a2, "z3": z3, "y": y}

def backprop(y_true, W1, b1, W2, b2, W3, b3, cache, X=None):
    """Compute parameter gradients for the three-layer sigmoid network.

    The output error is taken as ``y - y_true``, which is the gradient with
    respect to the last pre-activation of a binary cross-entropy loss on a
    sigmoid output. All gradients are averaged over the batch.

    Args:
        y_true: Target values. A 1-D array is treated as a column when the
            network output has a single column; otherwise it is assumed to
            have the same shape as the output -- confirm against the caller.
        W1, b1, W2, b2, W3, b3: Network parameters. Only ``W2`` and ``W3``
            are read here; the rest are accepted to keep the call signature.
        cache: Dict holding at least "a1", "a2" and "y" from the forward
            pass, and optionally the input under "X".
        X: Input batch used in the forward pass. When omitted it is looked
            up in ``cache["X"]`` and, failing that, in a module-level ``X``
            (the behaviour this function originally relied on).

    Returns:
        Tuple ``(dw1, db1, dw2, db2, dw3, db3)``.

    Raises:
        ValueError: If the input batch cannot be found in any of the places
            listed above.
    """
    a1, a2, y = cache["a1"], cache["a2"], cache["y"]

    if X is None:
        X = cache.get("X")
    if X is None:
        # Legacy call style: the input used to be read from a global.
        X = globals().get("X")
    if X is None:
        raise ValueError(
            "backprop needs the input batch: pass X=..., or store it in cache['X']"
        )

    y_true = np.asarray(y_true)
    if y_true.ndim == 1 and y.ndim == 2 and y.shape == (y_true.shape[0], 1):
        # Without this, (m, 1) - (m,) would silently broadcast to (m, m).
        y_true = y_true.reshape(-1, 1)

    m = y_true.shape[0]

    # Output layer gradient
    dz3 = y - y_true
    dw3 = np.dot(a2.T, dz3) / m
    db3 = np.sum(dz3, axis=0, keepdims=True) / m

    # Hidden layer 2 gradient. a2 is already sigmoid(z2), so the derivative
    # is a2 * (1 - a2); applying the sigmoid to a2 again would be wrong.
    da2 = np.dot(dz3, W3.T)
    dz2 = da2 * a2 * (1 - a2)
    dw2 = np.dot(a1.T, dz2) / m
    db2 = np.sum(dz2, axis=0, keepdims=True) / m

    # Hidden layer 1 gradient (same reasoning: a1 is already activated).
    da1 = np.dot(dz2, W2.T)
    dz1 = da1 * a1 * (1 - a1)
    dw1 = np.dot(X.T, dz1) / m
    db1 = np.sum(dz1, axis=0, keepdims=True) / m

    return (dw1, db1, dw2, db2, dw3, db3)







    