# Implementing the following 3-layer network
![title](img/chap6_3layer.png)

In [2]:
import numpy as np

# np.random.seed(1)

def relu(x):
    """Rectified linear unit: keep positive entries of x, replace the rest with 0.

    Implemented with np.maximum instead of masking by multiplication, so that
    negative inputs become a plain 0 (not -0.0) and -inf becomes 0 rather than
    nan (which is what 0 * -inf evaluates to).
    """
    return np.maximum(x, 0)

# Hidden-layer width, and the learning rate (alpha is not used by this forward-pass-only cell)
hidden_size = 4
alpha = 0.2

# Four input patterns, three values each
streetlights = np.array([
    [1, 0, 1],
    [0, 1, 1],
    [0, 0, 1],
    [1, 1, 1],
])

# 4x1 column vector, one value per row of streetlights (not used by the forward pass below)
walk_vs_stop = np.array([[1, 1, 0, 0]]).T

# Random weights scaled from [0, 1) to [-1, 1):
# a 3x4 matrix for layer 0 -> layer 1, then a 4x1 matrix for layer 1 -> layer 2
weights_0_1 = np.random.random((3, hidden_size)) * 2 - 1
weights_1_2 = np.random.random((hidden_size, 1)) * 2 - 1

print("Weights from 0 to 1:", weights_0_1, sep="\n")
print("Weights from 1 to 2:", weights_1_2, sep="\n")

# Forward pass for the first input pattern only
layer_0 = streetlights[0]
layer_1 = relu(layer_0.dot(weights_0_1))
print("Layer 1's 4 node values are:", layer_1, sep="\n")

layer_2 = layer_1.dot(weights_1_2)
print("Layer 2's output is:", layer_2, sep="\n")


Weights from 0 to 1:
[[ 0.49762021 -0.83559291  0.15844297  0.06489488]
 [ 0.45836711  0.780598    0.81965766  0.04149217]
 [-0.72932099  0.04604929 -0.73109746  0.17238014]]
Weights from 1 to 2:
[[ 0.03471422]
 [ 0.89945896]
 [ 0.12471945]
 [-0.0187963 ]]
Layer 1's 4 node values are:
[-0.         -0.         -0.          0.23727502]
Layer 2's output is:
[-0.00445989]


### Now, using backpropagation:
Try to follow through with the vector operations on paper


In [8]:
import numpy as np

# Seed NumPy's global random generator so the random weight initialisation
# further down produces the same starting weights on every run.
np.random.seed(1)

def relu(x):
    """Rectified linear unit: keep positive entries of x, replace the rest with 0.

    Implemented with np.maximum instead of masking by multiplication, so that
    negative inputs become a plain 0 (not -0.0) and -inf becomes 0 rather than
    nan (which is what 0 * -inf evaluates to).
    """
    return np.maximum(x, 0)

def reluDerivative(x):
    """Slope of the relu function, evaluated element-wise.

    Returns a boolean result: True (slope 1) wherever x is strictly positive,
    and False (slope 0) everywhere else, including at exactly 0. Multiplying
    by the result therefore keeps or zeroes the corresponding entries.
    """
    return 0 < x

# Hidden-layer width and learning rate
hidden_size = 4
alpha = 0.2

# Four training inputs (one per row) and their targets as a 4x1 column
streetlights = np.array([
    [1, 0, 1],
    [0, 1, 1],
    [0, 0, 1],
    [1, 1, 1],
])
walk_vs_stop = np.array([[1, 1, 0, 0]]).T

# Random starting weights in [-1, 1): 3x4 for layer 0 -> 1, then 4x1 for layer 1 -> 2
weights_0_1 = np.random.random((3, hidden_size)) * 2 - 1
weights_1_2 = np.random.random((hidden_size, 1)) * 2 - 1

for iteration in range(60):
    # Summed squared error over all samples in this pass
    layer_2_error = 0
    for i in range(streetlights.shape[0]):
        # Slicing with [i : i + 1] (instead of indexing with [i]) keeps the row 2-D: shape 1x3
        layer_0 = streetlights[i : i + 1]

        # Forward pass: 1x3 . 3x4 -> 1x4 hidden values, then 1x4 . 4x1 -> 1x1 output
        layer_1 = relu(layer_0.dot(weights_0_1))
        layer_2 = layer_1.dot(weights_1_2)

        # Output delta is prediction minus target; its square is this sample's error
        layer_2_delta = layer_2 - walk_vs_stop[i : i + 1]
        layer_2_error += np.sum(layer_2_delta ** 2)

        # Push the output delta back through weights_1_2 to get one delta per hidden node.
        # Hidden nodes that relu set to zero get a delta of 0 via reluDerivative.
        # This must be computed before weights_1_2 is modified below.
        layer_1_delta = reluDerivative(layer_1) * layer_2_delta.dot(weights_1_2.T)

        # Gradient-descent step on both weight matrices (input transposed . delta, scaled by alpha)
        weights_1_2 -= alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 -= alpha * layer_0.T.dot(layer_1_delta)

    # Report the error on every 10th pass
    if (iteration + 1) % 10 == 0:
        print("Error: {}".format(layer_2_error))

# Some simple tests on the trained network:
# input [0, 1, 0] should give a prediction of 1, input [0, 0, 1] a prediction of 0
for layer_0 in ([0, 1, 0], [0, 0, 1]):
    layer_1 = relu(np.dot(layer_0, weights_0_1))
    layer_2 = np.dot(layer_1, weights_1_2)
    print("{} has prediction: {}".format(layer_0, layer_2))


Error: 0.6342311598444467
Error: 0.35838407676317513
Error: 0.0830183113303298
Error: 0.006467054957103705
Error: 0.0003292669000750734
Error: 1.5055622665134859e-05
[0, 1, 0] has prediction: [0.99592182]
[0, 0, 1] has prediction: [0.00263144]


The 'delta' for a node tells us how much we want the output value of that node to change. Consider layer_1: once we know the layer_1 deltas (layer_2_delta multiplied by the transposed weights_1_2, then by reluDerivative(layer_1)), we know how to adjust the weights in weights_0_1 that feed layer_1: multiply each node's delta by the layer_0 input on that weight, scale by alpha, and subtract the result from the current weight. (weights_1_2 is adjusted the same way, using layer_2_delta and the layer_1 values as inputs.) We multiply by reluDerivative(layer_1) because if relu switched a node's value to zero, that node contributed nothing to the output, so we can't attribute any error to it.

Backpropagation is about calculating deltas for intermediate layers so that you can perform gradient descent.