# Backpropagation Algorithm

The algorithm consists of two phases: 
    1). Forward pass/Propagation phase, where our inputs are passed through the network and output predictions obtained
    2). Backward pass /Weight update phase, where we compute the gradient of the loss function at the final layer i.e., 
          prediction layer of the network and use this gradient to recursively apply the chain rule to update the weights in our 
          network

In [1]:
import numpy as np

In [2]:
# W implies weight matrix
# layers is a list or intergers to define our network architecture eg (2, 2, 1) Architecture (Define #inputs)
# alpha is our learning rate which controls the size of our step toward an optimal gradient decent with local/global minimum
class NeuralNetwork:
    # Construclor initialization
    def __init__(self, layers, alpha=0.1):
        self.layers = layers
        self.W = []
        self.alpha = alpha
        
        for i in np.arange(0, len(layers) - 2):
            # randomy initialize the weight matrix connecting the number of nodes in each respective layer together
            # Add extra node for bias trick
            # Append the weights into W by scalling them with square root of number of node in the current layer
            w = np.random.randn(layers[i] + 1, layers[i + 1] + 1)
            self.W.append(w / np.sqrt(layers[i]))
            
        # the last two layers
        # layer[-2] + 1 is an input layer which require a bias trick thats why we have added 1 to it
        # layer[-1] is an output layer which does not require a bias trick
        w = np.random.randn(layers[-2] + 1, layers[-1])
        self.W.append(w/ np.sqrt(layers[-2]))
    
    # Python magic function for debugging
    def __repr__(self):
        # construct and return a string that represents the network architecture
        return "Neural Network: {}".format("-".join(str(layer) for layer in self.layers))
    
    # Define sigmoid activation function
    def sigmoid(self, x):
        return 1.0 / (1 + np.exp(-x)) 
    
    # Define derivative of the sigmoid to be used during the backward pass
    # Note that whenever you perform backpropagation, you’ll always want to choose an activation 
    #function that is differentiable.
    def sigmoid_deriv(self, x):
        # compute the derivative of the sigmoid function assuming that x has already been passed 
        #through the sigmoid function
        return x * (1 - x)
    
    # Define fit() method to train our network
    def fit(self, X, y, epochs=1000, displayUpdate=100):
        # Perform bias trick on input features
        X = np.c_[X, np.ones((X.shape[0]))]
        
        # Loop over a desired number of epochs to train
        for epoch in np.arange(0,epochs):
            # Loop over individual data point to train the network
            for (x, target) in zip(X, y):
                self.fit_partial(x, target)
            
            #check if we should display a training update
            if epoch == 0 or (epoch + 1) % displayUpdate ==0:
                loss = self.calculate_loss(X, y)
                print("[INFO] epoch={}, loss={:.7f}".format(epoch + 1, loss))
        
        # Define partial_fit() method which is the heart of backppropagation algorithm
    def fit_partial(self, x, y):
        # Create a list to store activation outputs and initialize it to x 
        A = [np.atleast_2d(x)]

        # Feedforward the inputs to the network by looping over the layers in the network
        for layer in np.arange(0, len(self.W)):
            # Net input operation = dot product btn the activation and the weights matrix
            net = A[layer].dot(self.W[layer])

            # Compute the "net output" by applying nonlinear activation function to the net input
            out = self.sigmoid(net)

            # Add net output to the list of activation
            A.append(out)

            #Back_propagation
            #Compute error = difference btn our prediction and the true target value
            #Prediction = last entry in our activation output list
            error = A[-1] - y

            #Apply chain rule and build a list of deltas 'D'
            #The first entry in the deltas is simply the error of the output layer times the derivative 
            #of activation function for the output value
            D = [error * self.sigmoid_deriv(A[-1])]

            #Compute the deltas for the entire network
            for layer in np.arange(len(A) - 2, 0, -1):
                #Note: the delta for the current layer is equal to the delta of the previous layer dotted 
                #with the weight matrix of the current layer, followed by multiplying the delta by the 
                #derivative of the nonlinear activation function for the activations of the current layer
                delta = D[-1].dot(self.W[layer].T)
                delta = delta * self.sigmoid_deriv(A[layer])
                D.append(delta)

            #Reverse the list of deltas since we looped over the layer in reverse order
            D = D[::-1]
            
            print("Delta length", len(D))
            print("Weight length", len(self.W))
            print("Activation length", len(A))
            
            #Weight update phase
            for layer in np.arange(0, len(self.W)):
                #Update weights by taking the dot product of the layer activations with their respective 
                #deltas, then multiplying this value by some small learning rate and adding to our weight 
                #matrix. This is where the actual learning  takes place
                self.W[layer] +=  -self.alpha *  A[layer].T.dot(D[layer])
                #After weight update the back propagation is official done
                
                
                
        
        #Define predict() method to make predictions on the testing data
        def predict(self, X, addBias=True):
            #initialize the output prediction as the input feature.
            #we will use this value to feedforward our network
            p = np.atleast_2d(X)
            
            #check to see if the bias column should be added
            if addBias:
                #insert a column of 1's as the last entry in the feature matrix
                p = np.c_[p, np.ones((p.shape[0]))]
            
            #loop over the layers
            for layer in np.arange(0, len(self.W)):
                #compute the output prediction by taking the dot product btn the current activation value 
                #p and the weight matrix associated with the current layer, then pas the value to a nonlinear
                #activation function
                p = self.sigmoid(np.dot(p, self.W[layer]))
            
            return p
        
        #Define calculate_loss() to calculate the loss across our entire training set
        def calculate_loss(self, X, targets):
            #make prediction for the input data then calculate the loss using sum squared error
            targets = np.atleast_2d(targets)
            predictions = self.predict(X, addBias=False)
            loss = 0.5 * np.sum((predictions - targets) ** 2)
            
            return loss
            
            

# Train a neural network on XOR dataset

In [3]:
# Costruct the XOR dataset
X = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.asarray([[0], [1], [1], [0]])

In [4]:
X.shape

(4, 2)

In [5]:
# Define the network
nn = NeuralNetwork([2, 2, 1], alpha=0.5)
# Train a network
nn.fit_partial([0,0,1], [0])

Delta length 1
Weight length 2
Activation length 2


IndexError: list index out of range

In [None]:
# Make predictions on the data point
for (x, target) in zip(X, y):
    pred = nn.predict(x)[0][0]
    step = 1 if pred > 0.5 else 0
    print("[INFO] data={}, ground_truth={}, pred={:.4f}, step={}".format(x, target[0], pred, step))