This notebook complements the lecture slides on backpropagation.

The recursion for backpropagation and the corresponding gradient calculations are implemented using Numpy.

The results are validated by comparing them with PyTorch based gradient calculations.



In [43]:
import numpy as np
import torch
import torch.nn as nn

In [44]:
# Helper function for nonlinear activations and their derivatives
# Only sigmoid and Relu are implemented; but can be extended to other activations
# Activation functions and their derivatives
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    # Calculate the sigmoid output
    sig = sigmoid(x)
    # Use the output to calculate the derivative
    return sig * (1 - sig)

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return (x > 0).astype(float)

# Parameters

- W contains  L weight matrices one for each layer

    -  Ex: for a three layer network, W = [W1, W2, W3] where $Wl$ is the weight matrix at $l$-th layer, i.e., $W^{[l]}$

- activations contain the activations from input layer to output layer

    - Ex: for a three layer network, 
    
        activations = [a0, a1, a2, a3],
        
        where $al = \mathbf{a}^{[l]}$, 'a0' is the input ($\mathbf{a}^{[0]}$), and 'a3' is the output ($\mathbf{a}^{[3]}$)

- linear_values stores the output of linear function for each layer

    - Ex: for a three layer network,
        linear_values = [f1, f2, f3] 
        
        where $fl = \mathbf{f}^{[l]}$ is the output of linear function at $l$-th layer 

In [45]:
# Forward pass without bias
def forward_pass(W, x, activation_func):
    activations = [x]
    linear_values = []
    for l in range(len(W)):
        f = np.dot(W[l].T, activations[-1])
        g = activation_func(f)
        linear_values.append(f)
        activations.append(g)
        #print(f"Layer {l+1}: f (linear) = {f.ravel()}, a (activation) = {g.ravel()}")
    return activations, linear_values



$$\frac{\partial \hat{y}}{\partial W^{[l]}} =  \mathbf{a}^{[l-1]} (\delta^{[l]})^\top$$

where

$$\delta^{[l]} = \left[ \mathbf{g}'^{[l]}(\mathbf{f}^{[l]}) \right] \circ \left( W^{[l+1]} \delta^{[l+1]} \right)$$

The operator '$\circ$' indicate the Hadamard product

In [39]:
# Backward pass without bias
def backward_pass(W, activations, linear_values, y_true, activation_derivative):
    #L: number of layers
    L = len(W)
    # initialize deltas with zeros; 
    # one vector for each layer and (L+1) vectors are created to make the implementation consistent with formula
    deltas = [None] * (L + 1)
    #final activation layer is output
    y_hat = activations[-1]

    # Delta for the output layer
    deltas[L] = 2 * (y_hat - y_true) * activation_derivative(linear_values[-1])

    gradients_W = [None] * L

    for l in range(L - 1, -1, -1):
        if l > 0:
            #dg(f)/df -> diagonal elements of this matrix           
            g_derivative = activation_derivative(linear_values[l])
            #Recursion for delta values
            # The operator * indicates Hadamard product
            deltas[l] = g_derivative * np.dot(W[l], deltas[l + 1])

        
        grad_W = np.outer(activations[l], deltas[l + 1])
        gradients_W[l] = grad_W

    return gradients_W

#Simulating a three layer neural network
# Two hidden layer and one output layer
# Example weights and biases
# At hidden layer 1 and 2
#W = [[W_11, W_12], [W_21, W_22]]
# At output layer
#W = [[W_11], [W_21]]
W_np = [
    np.array([[0.2, 0.5], [0.2, 0.4]]),   #Weights for hidden layer 1
    np.array([[0.5, 0.7], [0.6, 0.8]]),   #Weights for hidden layer 2
    np.array([[0.9], [1.0]])                #Weights for output layer 
]

#Input is  a vector 
x_np = np.array([[0], [1]])
y_true_np = np.array([[1]])

# Forward and backward pass
activations_np, linear_values_np = forward_pass(W_np, x_np, relu)
gradients_W_np = backward_pass(W_np, activations_np, linear_values_np, y_true_np, relu_derivative)

In [40]:
# PyTorch network 
class ThreeLayerNet(nn.Module):
    def __init__(self):
        super(ThreeLayerNet, self).__init__()
        self.fc1 = nn.Linear(2, 2, bias=False)
        self.fc2 = nn.Linear(2, 2, bias=False)
        self.fc3 = nn.Linear(2, 1, bias=False)

    def forward(self, x):
        f1 = self.fc1(x)
        a1 = torch.relu(f1)
        #print(f"PyTorch Layer 1: f (linear) = {f1.detach().numpy().ravel()}, a (activation) = {a1.detach().numpy().ravel()}")
        
        f2 = self.fc2(a1)
        a2 = torch.relu(f2)
        #print(f"PyTorch Layer 2: f (linear) = {f2.detach().numpy().ravel()}, a (activation) = {a2.detach().numpy().ravel()}")
        
        f3 = self.fc3(a2)
        a3 = torch.relu(f3)
        #print(f"PyTorch Layer 3: f (linear) = {f3.detach().numpy().ravel()}, a (activation) = {a3.detach().numpy().ravel()}")
        
        return a1, f1, a2, f2, a3, f3

# Initialize PyTorch network
net = ThreeLayerNet()

# Initialize weights for consistency between NumPy and PyTorch
with torch.no_grad():
    net.fc1.weight.copy_(torch.tensor(W_np[0].T))
    net.fc2.weight.copy_(torch.tensor(W_np[1].T))
    net.fc3.weight.copy_(torch.tensor(W_np[2].T))


x_torch = torch.tensor([[0., 1.]], requires_grad=True)
y_true_torch = torch.tensor([[1.]])

# PyTorch forward pass
a1, f1, a2, f2, a3, f3 = net(x_torch)

# Compute loss and backward pass

# Select MSE Loss
criterion = nn.MSELoss()

#Calculation of loss
loss = criterion(a3, y_true_torch)

#Calculation of gradients
loss.backward()

# Checking the forward pass with numpy and pytorch

In [47]:
print(f"Numpy Layer 1: f (linear) = {linear_values_np[0].ravel()}, a (activation) = {activations_np[1].ravel()}")
print(f"PyTorch Layer 1: f (linear) = {f1.ravel()}, a (activation) = {a1.ravel()}")

print(f"Numpy Layer 2: f (linear) = {linear_values_np[1].ravel()}, a (activation) = {activations_np[2].ravel()}")
print(f"PyTorch Layer 2: f (linear) = {f2.ravel()}, a (activation) = {a2.ravel()}")

print(f"Numpy Layer 3: f (linear) = {linear_values_np[2].ravel()}, a (activation) = {activations_np[3].ravel()}")
print(f"PyTorch Layer 3: f (linear) = {f2.ravel()}, a (activation) = {a2.ravel()}")


Numpy Layer 1: f (linear) = [0.2 0.4], a (activation) = [0.2 0.4]
PyTorch Layer 1: f (linear) = tensor([0.2000, 0.4000], grad_fn=<ViewBackward0>), a (activation) = tensor([0.2000, 0.4000], grad_fn=<ViewBackward0>)
Numpy Layer 2: f (linear) = [0.34 0.46], a (activation) = [0.34 0.46]
PyTorch Layer 2: f (linear) = tensor([0.3400, 0.4600], grad_fn=<ViewBackward0>), a (activation) = tensor([0.3400, 0.4600], grad_fn=<ViewBackward0>)
Numpy Layer 3: f (linear) = [0.766], a (activation) = [0.766]
PyTorch Layer 3: f (linear) = tensor([0.3400, 0.4600], grad_fn=<ViewBackward0>), a (activation) = tensor([0.3400, 0.4600], grad_fn=<ViewBackward0>)


# Checking backpropagation with numpy and pytorch

In [42]:
# Compare gradients with numpy and pytorch
print(f"Layer 1 gradients for W in Numpy :\n{gradients_W_np[0]}\n")
print(f"Layer 1 gradients for W in PyTorch:\n{net.fc1.weight.grad.T}")
print(f"Layer 2 gradients for W in Numpy :\n{gradients_W_np[1]}\n")
print(f"Layer 2 gradients for W in PyTorch:\n{net.fc2.weight.grad.T}")
print(f"Layer 3 gradients for W in Numpy :\n{gradients_W_np[2]}\n")
print(f"Layer 3 gradients for W in PyTorch:\n{net.fc3.weight.grad.T}")

Layer 1 gradients for W in Numpy :
[[-0.      -0.     ]
 [-0.5382  -0.62712]]

Layer 1 gradients for W in PyTorch:
tensor([[ 0.0000,  0.0000],
        [-0.5382, -0.6271]])
Layer 2 gradients for W in Numpy :
[[-0.08424 -0.0936 ]
 [-0.16848 -0.1872 ]]

Layer 2 gradients for W in PyTorch:
tensor([[-0.0842, -0.0936],
        [-0.1685, -0.1872]])
Layer 3 gradients for W in Numpy :
[[-0.15912]
 [-0.21528]]

Layer 3 gradients for W in PyTorch:
tensor([[-0.1591],
        [-0.2153]])
