# Backpropagation

The algorithm has two phases
<!--  --> The Forward pass where inputs are passed through the network and output prediction obtained
<!--  --> The Backward pass where we compute the gradient of the loss function at the final layer and use
<!--  --> this gradient to recursively apply the chain rule to update the weights in the network

In [167]:
import numpy as np

In [168]:
class NeuralNetwork:
    def __init__(self, layers, alpha=0.1):
        self.W = []
        self.alpha = alpha
        self.layers = layers
        
        # Weight initialization for  set of weight lines 1
        for i in np.arange(0, len(layers) - 2):
            w = np.random.randn(layers[i] + 1, layers[i + 1] + 1)
            self.W.append(w / np.sqrt(layers[i]))
        
        w = np.random.randn(layers[-2] + 1, layers[-1])
        self.W.append(w /np.sqrt(layers[-2]))
        
    # Python magic function to format string
    def __repr__(self):
        return "Neural Network: {}".format("-".join(str(l) for l in self.layers))
    
    # Sigmoid activation function
    def sigmoid(self, x):
        return 1. / (1 + np.exp(x))
    
    # In backpropagation when you choose activation function make sure to choose the one which is differentiable
    # Below is a derivative of sigmoid function
    def sigmoid_deriv(self, x):
        return x * ( 1 - x)
    
    # Fit method for actual learning
    def fit(self, X, y, epochs=1000, displayUpdate=100):
        X = np.c_[X, np.ones((X.shape[0]))]
        
        # Train for a desired number of epochs
        for epoch in np.arange(0, epochs):
            for (x, target) in zip(X, y):
                self.partial_fit(x, target)
        
        # Display a training update
        if epoch == 0 or (epoch + 1) % displayUpdate == 0:
            loss = self.calculate_loss(X, y)
            print("[INFO] epoch={}, loss={}".format(str(epoch + 1), str(loss)))
        
    # partial_fit method
    def partial_fit(self, x, y):
        # Construct a list of activations
        #The first activation is a special case -- its just the input feature vector itself
        A = [np.atleast_2d(x)]
        
        for layer in np.arange(0, len(self.W)):
            # Feed forward the activation at the current layer by taking the dot product btn the activation and weight matrix
            # This is called the net input to the current layer
            net = A[layer].dot(self.W[layer])
            
            # Compute the net output by pass the net input to the activation function in our case is sigmoid
            out = self.sigmoid(net)
            
            # Append the output to our activation list
            A.append(out)
            
        #print(A)
        # Backpropagation Phase
        # The first step in backpropagation phase is to get the error
        error = A[-1] - y
        
        # We have taken A[-1] since this is our last node in our network with a prediction
        # From here we need to apply the chain rule and build our list of deltas D
        # The first entry in our delta list is simply the error of the input layer times the derivative of our activation function for the output value
        D = [error * self.sigmoid_deriv(A[-1])]
        
        # Chain rule application
        for layer in np.arange(len(A) - 2, 0, -1):
            # Delta of the current layer is equal to the delta of the previous layer dotted with the weight matrix of the current layer
            # followed by the multiplying the delta by the derivetive of nonlinear activation function for the activation of the current later
            delta = D[-1].dot(self.W[layer].T)
            delta = delta * self.sigmoid_deriv(A[layer])
            D.append(delta)
        
        #Reverse the deltas
        D = D[::-1]
        # print(D)
        
        # Weight update
        for layer in np.arange(0, len(self.W)):
            # Update weight by taking he dot product of the layer activations with their respective deltas, then multiplying this value
            # by some small learning rate and adding to our weight matrix: this is where the actual learning takes place
            self.W[layer] = -self.alpha * A[layer].T.dot(D[layer])
        
        #print(self.W) # This mark the end of backpropagation phase
        
    # Predict method to give out predictions
    def predict(self, X, addBias=False):
        # While prediction initialize the output prediction as the input features, to help with forward pass to obtain the final prediction
        p = np.atleast_2d(X)
        
        # Check to see if we should add a bias column
        if addBias:
            # insert a columns of 1's to the input feature to reflect the bias trick
            p = np.c_[p, np.ones((X.shape[0]))]
            
            # Loop over the network layers
            for layer in np.arange(0, len(self.W)):
                # compute the prediction by taking the dot product btn the current activation value p and the weight matrix associated 
                # with the current layer, then pass this value through a nonlinear activation function (sigmoid function)
                p = self.sigmoid(np.dot(p, self.W[layer]))
        
        # return the predicted value
        return p
        
    def calculate_loss(self, X, targets):
        # Make predictions for the input data points then compute the loss
        targets = np.atleast_2d(targets)
        predictions = self.predict(X, addBias=False)
        loss = 0.5 * (predictions - targets) ** 2
        
        return loss

In [169]:
# Weights initialized
nn= NeuralNetwork([2,2,1])
nn.W

[array([[ 1.19681089, -0.54812268,  0.95172947],
        [-1.17212331, -0.38055493, -0.01901262],
        [ 0.18420064,  0.22515644, -0.25415963]]),
 array([[-2.03208034],
        [ 0.09260364],
        [ 0.53151717]])]

In [170]:
p = nn.W
len(p)

2

In [171]:
# Network architecture
print(nn)

Neural Network: 2-2-1


In [172]:
X = np.asarray([[0,0], [0,1], [1,0], [1,1]])
y = np.asarray([[0], [1], [1], [0]])
# nn.fit(X, y)

In [173]:
nn2 = NeuralNetwork([2,2,1], alpha=0.5)
nn2.fit(X, y, epochs=2000)

[INFO] epoch=2000, loss=[[0.  0.  0.5]
 [0.5 0.  0. ]
 [0.  0.5 0. ]
 [0.5 0.5 0.5]]


In [174]:
#Now that our network is trained we can loop over the XOR data points
for (x, target) in zip(X, y):
    # make prediction and display results
    pred = nn2.predict(x)[0][0]
    step = 1 if pred > 0.5 else 0
    print("[INFO] data={}, ground_truth={}, pred={:.4f}, step={}".format(x, target[0], pred, step))

[INFO] data=[0 0], ground_truth=0, pred=0.0000, step=0
[INFO] data=[0 1], ground_truth=1, pred=0.0000, step=0
[INFO] data=[1 0], ground_truth=1, pred=1.0000, step=1
[INFO] data=[1 1], ground_truth=0, pred=1.0000, step=1
