### https://medium.freecodecamp.org/building-a-3-layer-neural-network-from-scratch-99239c4af5d3

In [None]:
def softmax(z):
    """Return the row-wise softmax of a 2-D array of scores.

    Each row of ``z`` is exponentiated and divided by the sum of its own
    exponentials, so every row of the result sums to 1.

    Args:
        z: 2-D array-like of scores; normalisation runs along axis 1.

    Returns:
        Array with the same shape as ``z`` holding the normalised values.
    """
    # Shift each row so its largest entry is 0. Softmax is invariant to a
    # per-row constant shift, and this keeps np.exp from overflowing to inf
    # (which would turn the division below into nan) for large scores.
    shifted = z - np.max(z, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

In [11]:
def loss_derivative(y, y_hat):
    """Return the element-wise difference ``y_hat - y``.

    backward_prop uses this value as the error signal at the output layer.

    Args:
        y: Target values.
        y_hat: Predicted values, compatible in shape with ``y``.

    Returns:
        ``y_hat - y``.
    """
    error = y_hat - y
    return error

def tanh_derivative(x):
    """Return ``1 - x**2`` element-wise.

    The callers in this file pass the tanh *activation* (a1 / a2), not the
    pre-activation, so the result is the tanh derivative expressed in terms
    of its own output.

    Args:
        x: Scalar or array of values.

    Returns:
        ``1 - x**2`` with the same shape as ``x``.
    """
    squared = np.power(x, 2)
    return 1 - squared

In [12]:
def forward_prop(model, a0):
    """Run one forward pass through the three-layer network.

    The first two layers apply an affine map followed by tanh; the third
    applies an affine map followed by softmax.

    Args:
        model: Mapping holding the parameters under the keys
            'W1', 'b1', 'W2', 'b2', 'W3' and 'b3'.
        a0: Network input (assumed to have one sample per row, since softmax
            normalises along axis 1 -- confirm against the caller).

    Returns:
        Dict with the input ('a0'), the pre-activations ('z1', 'z2', 'z3')
        and the activations ('a1', 'a2', 'a3') of every layer.
    """
    # Fetch all parameters up front so a missing key fails before any math.
    w_first, b_first, w_second, b_second, w_out, b_out = (
        model['W1'], model['b1'],
        model['W2'], model['b2'],
        model['W3'], model['b3'],
    )

    cache = {'a0': a0}

    # Layer 1: affine + tanh
    cache['z1'] = a0.dot(w_first) + b_first
    cache['a1'] = np.tanh(cache['z1'])

    # Layer 2: affine + tanh
    cache['z2'] = cache['a1'].dot(w_second) + b_second
    cache['a2'] = np.tanh(cache['z2'])

    # Layer 3: affine + softmax
    logits = cache['a2'].dot(w_out) + b_out
    cache['a3'] = softmax(logits)
    cache['z3'] = logits

    return cache

In [None]:
def backward_prop(model, cache, y):
    """Compute the gradients of all weights and biases by backpropagation.

    Args:
        model: Mapping holding the parameters; only 'W2' and 'W3' are read
            here (they carry the error back through layers 3 and 2).
        cache: Forward-pass results from forward_prop; the keys 'a0', 'a1',
            'a2' and 'a3' are read.
        y: Targets; ``y.shape[0]`` is taken as the number of samples and
            ``a3 - y`` as the output error (so y is presumably one-hot with
            the same shape as a3 -- confirm against the caller).

    Returns:
        Dict with the gradients under the keys 'dW3', 'db3', 'dW2', 'db2',
        'dW1' and 'db1', each averaged over the samples.
    """
    # Only the weights of the layers the error is propagated through are needed.
    W2, W3 = model['W2'], model['W3']

    # Load forward propagation results
    a0, a1, a2, a3 = cache['a0'], cache['a1'], cache['a2'], cache['a3']

    # Averaging factor over the number of samples. Written with a float
    # literal so it is never truncated to 0 by integer division.
    inv_m = 1.0 / y.shape[0]

    # Error at the output (third) layer
    dz3 = loss_derivative(y=y, y_hat=a3)

    # Gradients of the third-layer weights and bias
    dW3 = inv_m * (a2.T).dot(dz3)
    db3 = inv_m * np.sum(dz3, axis=0)

    # Error pushed back to the second layer through W3 and the tanh activation
    dz2 = np.multiply(dz3.dot(W3.T), tanh_derivative(a2))

    # Gradients of the second-layer weights and bias
    dW2 = inv_m * np.dot(a1.T, dz2)
    db2 = inv_m * np.sum(dz2, axis=0)

    # Error pushed back to the first layer through W2 and the tanh activation
    dz1 = np.multiply(dz2.dot(W2.T), tanh_derivative(a1))

    # Gradients of the first-layer weights and bias
    dW1 = inv_m * np.dot(a0.T, dz1)
    db1 = inv_m * np.sum(dz1, axis=0)

    # Store gradients
    grads = {'dW3': dW3, 'db3': db3, 'dW2': dW2, 'db2': db2, 'dW1': dW1, 'db1': db1}
    return grads


In [None]:
#TRAINING PHASE
def initialize_parameters(nn_input_dim, nn_hdim, nn_output_dim):
    """Create randomly initialised parameters for the three-layer network.

    Both hidden layers share the same width ``nn_hdim``. Weights are drawn
    from NumPy's global RNG; biases start at zero.

    Args:
        nn_input_dim: Number of input features.
        nn_hdim: Number of units in each of the two hidden layers.
        nn_output_dim: Number of output units.

    Returns:
        Dict with the keys 'W1', 'b1', 'W2', 'b2', 'W3' and 'b3'.
    """
    # NOTE(review): W1 and W2 use randn (normal samples, scaled and shifted)
    # while W3 uses rand (uniform, mapped to [-1, 1)). The "2 * ... - 1" form
    # suggests rand may have been intended everywhere -- confirm before
    # changing, since it alters the seeded results.
    first_weights = 2 * np.random.randn(nn_input_dim, nn_hdim) - 1
    second_weights = 2 * np.random.randn(nn_hdim, nn_hdim) - 1
    output_weights = 2 * np.random.rand(nn_hdim, nn_output_dim) - 1

    return {
        'W1': first_weights,
        'b1': np.zeros((1, nn_hdim)),
        'W2': second_weights,
        'b2': np.zeros((1, nn_hdim)),
        'W3': output_weights,
        'b3': np.zeros((1, nn_output_dim)),
    }

In [None]:
def update_parameters(model, grads, learning_rate):
    """Apply one gradient-descent step to every parameter.

    Each parameter is updated with ``-=``, so NumPy arrays held by ``model``
    are modified in place; the returned dict is a new dict referring to the
    updated values.

    Args:
        model: Mapping with the keys 'W1', 'b1', 'W2', 'b2', 'W3', 'b3'.
        grads: Mapping with the matching gradients under 'dW1', 'db1',
            'dW2', 'db2', 'dW3', 'db3'.
        learning_rate: Step size multiplying each gradient.

    Returns:
        New dict with the updated 'W1', 'b1', 'W2', 'b2', 'W3', 'b3'.
    """
    # Fetch every parameter before touching any of them.
    current = {name: model[name] for name in ('W1', 'b1', 'W2', 'b2', 'b3', 'W3')}

    steps = (
        ('W1', 'dW1'),
        ('b1', 'db1'),
        ('W2', 'dW2'),
        ('b2', 'db2'),
        ('W3', 'dW3'),
        ('b3', 'db3'),
    )

    updated = {}
    for param_name, grad_name in steps:
        value = current[param_name]
        value -= learning_rate * grads[grad_name]
        updated[param_name] = value
    return updated


In [None]:
def predict(model, x):
    """Return the index of the largest network output for each row of ``x``.

    Args:
        model: Parameter mapping accepted by forward_prop.
        x: Network input.

    Returns:
        Result of ``np.argmax`` along axis 1 of the final activation 'a3'.
    """
    output_activations = forward_prop(model, x)['a3']
    return np.argmax(output_activations, axis=1)


In [9]:
# Training-loss history; train() appends one value per epoch so the curve can
# be plotted afterwards. It is never cleared, so repeated train() calls extend it.
losses = []


def train(model, X_, y_, learning_rate, epochs=20000):
    """Fit the network with full-batch gradient descent.

    Every epoch runs a forward pass, records the loss, backpropagates and
    applies one parameter update.

    Args:
        model: Parameter mapping as produced by initialize_parameters.
        X_: Training inputs passed to forward_prop.
        y_: Training targets passed to backward_prop (assumed one-hot with
            the same shape as the network output -- confirm against caller).
        learning_rate: Step size for update_parameters.
        epochs: Number of gradient-descent iterations.

    Returns:
        The parameter mapping after the final update.
    """
    for _ in range(epochs):
        cache = forward_prop(model, X_)

        # Record the mean cross-entropy of this epoch's predictions (taken
        # before the update). Probabilities are clipped away from 0 so that
        # np.log never returns -inf.
        probs = np.clip(np.asarray(cache['a3']), 1e-12, None)
        targets = np.asarray(y_)
        losses.append(float(-np.sum(targets * np.log(probs)) / targets.shape[0]))

        grads = backward_prop(model, cache, y_)
        model = update_parameters(model=model, grads=grads, learning_rate=learning_rate)

    return model


In [None]:
# Seed NumPy's global RNG so the random weight initialisation is repeatable.
np.random.seed(0)
# Build a network with 13 inputs, 5 units per hidden layer and 3 outputs, then
# fit it on X / y (both expected to be defined earlier in the notebook).
model = train(
    initialize_parameters(nn_input_dim=13, nn_hdim=5, nn_output_dim=3),
    X,
    y,
    learning_rate=0.07,
    epochs=4500,
)
# Plot whatever has been collected in the module-level `losses` list.
plt.plot(losses)