In [36]:
import numpy as np
import pandas as pd

### part 1: Implementing an autograde system - Micrograde 
### Goal : to create an automatic differentiation system that tracks mathematical operations, builds a computational graph,   and calculates gradients

#### Variable class will represent each variable in the computational graph. It will store the value, gradient, and any operation that created it. 

In [39]:
class Variable:
    def __init__(self, value, grad=0.0, _prev=(), _op=''):
        self.value = value  # The actual value of the variable
        self.grad = grad    # Gradient of the variable (initialized to 0)
        self._prev = set(_prev)  # Previous variables (inputs) used to create this variable
        self._op = _op      # The operation that produced this variable (for visualization)
        self._backward = lambda: None  # Function to compute the gradient for this variable

    def __add__(self, other):
        # Create a new Variable for the sum
        other = other if isinstance(other, Variable) else Variable(other)
        out = Variable(self.value + other.value, _prev=(self, other), _op='+')
        
        # Define the backward pass for addition
        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        # Create a new Variable for the product
        other = other if isinstance(other, Variable) else Variable(other)
        out = Variable(self.value * other.value, _prev=(self, other), _op='*')
        
        # Define the backward pass for multiplication
        # The backward function performs backpropagation by computing gradients for each variable in the computational graph
        def _backward():
            self.grad += other.value * out.grad
            other.grad += self.value * out.grad
        out._backward = _backward
        return out

    # The backward pass defines how gradients are calculated for each operation using the chain rule.
    def backward(self):
        # Set the gradient of the output variable to 1
        self.grad = 1.0
        # Perform a topological sort to ensure correct gradient computation order
        topo = []
        visited = set()
        
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
                
        build_topo(self)

        # Traverse backward in reverse topological order
        for v in reversed(topo):
            v._backward()

    def __repr__(self):
        return f"Variable(value={self.value}, grad={self.grad})"


In [40]:
x = Variable(5.0)
y = (x + 2) * 3
# Perform backpropagation
y.backward()
print("Value of y:", y)
print("Gradient of x:", x)

Value of y: Variable(value=21.0, grad=1.0)
Gradient of x: Variable(value=5.0, grad=3.0)


In [5]:
pip install graphviz

Note: you may need to restart the kernel to use updated packages.


In [41]:
from graphviz import Digraph

In [42]:
def draw_graph(var):
    dot = Digraph(format='png', graph_attr={'rankdir': 'LR'})
    
    # Recursive function to add nodes and edges to the graph
    def add_nodes_edges(v):
        if v not in seen:
            # Add the node for the variable
            seen.add(v)
            node_id = str(id(v))
            dot.node(node_id, f"{v._op} | {v.value:.4f} | grad={v.grad:.4f}", shape='record')
            
            # Add edges for each previous variable
            for child in v._prev:
                child_id = str(id(child))
                dot.edge(child_id, node_id)
                add_nodes_edges(child)
    
    seen = set()
    add_nodes_edges(var)
    
    return dot

# Draw the graph for y
dot = draw_graph(y)
dot.render('computational_graph', view=True)

'computational_graph.png'

## part 2: manually implement backpropagation for a 2-layer Multi-Layer Perceptron (MLP) with batch normalization, cross-entropy loss, and tanh activation to gain insights into how neural networks learn by manually following the steps of backpropagation

## Setting the MLP structure


In [20]:
class LinearLayer:
    def __init__(self, input_size, output_size):
        self.weights = np.random.randn(input_size, output_size) * 0.01
        self.biases = np.zeros(output_size)
    
    def forward(self, input):
        self.input = input  # Store input for use in backward pass
        self.output = input.dot(self.weights) + self.biases  # Store output for accuracy calculation
        return self.output
    
    def backward(self, grad_output):
        # Compute gradients for weights, biases, and input
        self.grad_weights = self.input.T.dot(grad_output)
        self.grad_biases = np.sum(grad_output, axis=0)
        grad_input = grad_output.dot(self.weights.T)
        return grad_input


### Implementing Batch Normalization

In [14]:
#Normalize the inputs to have zero mean and unit variance, making training more stable and efficient
class BatchNormalization:
    def __init__(self, dim, epsilon=1e-5):
        self.gamma = np.ones(dim)
        self.beta = np.zeros(dim)
        self.epsilon = epsilon

    def forward(self, input):
        # Store input for use in backward pass
        self.input = input
        
        # Calculate mean and variance for normalization
        self.mean = np.mean(input, axis=0)
        self.var = np.var(input, axis=0)
        
        # Normalize input
        self.x_normalized = (input - self.mean) / np.sqrt(self.var + self.epsilon)
        
        # Scale and shift
        output = self.gamma * self.x_normalized + self.beta
        return output

    def backward(self, grad_output):
        # Get batch size
        N = self.input.shape[0]
        
        # Compute gradients with respect to gamma and beta
        self.grad_gamma = np.sum(grad_output * self.x_normalized, axis=0)
        self.grad_beta = np.sum(grad_output, axis=0)

        # Backpropagate through normalization
        dx_normalized = grad_output * self.gamma
        dvar = np.sum(dx_normalized * (self.input - self.mean) * -0.5 * np.power(self.var + self.epsilon, -1.5), axis=0)
        dmean = np.sum(dx_normalized * -1.0 / np.sqrt(self.var + self.epsilon), axis=0) + dvar * np.mean(-2.0 * (self.input - self.mean), axis=0)

        dx = dx_normalized / np.sqrt(self.var + self.epsilon) + dvar * 2.0 * (self.input - self.mean) / N + dmean / N
        return dx



### Implementing the activation fucntion

In [15]:
class TanhActivation:
    def forward(self, x):
        self.output = np.tanh(x)
        return self.output
    
    def backward(self, grad_output):
        # Derivative of tanh is 1 - tanh^2(x)
        return grad_output * (1 - self.output ** 2)


### Calculating cross entropy loss function

In [16]:
class CrossEntropyLoss:
    def forward(self, predictions, targets):
        # Apply softmax to predictions
        self.predictions = np.exp(predictions - np.max(predictions, axis=1, keepdims=True))
        self.predictions /= np.sum(self.predictions, axis=1, keepdims=True)
        # Compute the loss
        N = targets.shape[0]
        self.targets = targets
        loss = -np.sum(targets * np.log(self.predictions + 1e-15)) / N
        return loss
    
    def backward(self):
        # Gradient of cross-entropy loss with softmax
        N = self.targets.shape[0]
        grad_output = (self.predictions - self.targets) / N
        return grad_output


### Assembling everything to make two layer MLP to implement forward and backward pass 

In [17]:
class MLP:
    def __init__(self, input_size, hidden_size, output_size):
        # Define the layers
        self.layer1 = LinearLayer(input_size, hidden_size)
        self.batchnorm1 = BatchNormalization(hidden_size)
        self.activation1 = TanhActivation()
        self.layer2 = LinearLayer(hidden_size, output_size)
        self.batchnorm2 = BatchNormalization(output_size)
        self.loss = CrossEntropyLoss()
    
    def forward(self, x, targets):
        # Forward pass
        out = self.layer1.forward(x)
        out = self.batchnorm1.forward(out)
        out = self.activation1.forward(out)
        out = self.layer2.forward(out)
        out = self.batchnorm2.forward(out)
        loss = self.loss.forward(out, targets)
        return loss
    
    def backward(self):
        # Backward pass
        grad_output = self.loss.backward()
        grad_output = self.batchnorm2.backward(grad_output)
        grad_output = self.layer2.backward(grad_output)
        grad_output = self.activation1.backward(grad_output)
        grad_output = self.batchnorm1.backward(grad_output)
        grad_output = self.layer1.backward(grad_output)


### Testing the MLP 

In [28]:
# Sample data
np.random.seed(0)
X = np.random.randn(10, 3)  # 10 samples, 3 features
y = np.zeros((10, 2))       # Binary classification (2 output classes)
y[np.arange(10), np.random.randint(0, 2, size=10)] = 1  # One-hot encoding for targets

# Define the accuracy function
def calculate_accuracy(predictions, targets):
    pred_labels = np.argmax(predictions, axis=1)  # Class with highest probability
    true_labels = np.argmax(targets, axis=1)      # True labels
    accuracy = np.mean(pred_labels == true_labels) * 100  # Percentage accuracy
    return accuracy

# Create the MLP
mlp = MLP(input_size=3, hidden_size=5, output_size=2)

# Forward pass
loss = mlp.forward(X, y)

# Calculate predictions and accuracy
predictions = mlp.layer2.output  # Output layer activations
accuracy = calculate_accuracy(predictions, y)

# Backward pass
mlp.backward()

# Output the loss, gradients, and accuracy
print("Loss:", loss)
print("Gradient of first layer weights:\n", mlp.layer1.grad_weights)
print("Accuracy:", accuracy, "%")


Loss: 0.9381647656067867
Gradient of first layer weights:
 [[  3.04320263  -2.24298385  -1.39316963  -5.66049313   8.20292955]
 [  3.49245225  -2.98273095   0.46860402  -2.5291769   -1.29853867]
 [-10.3840303   -9.84227041  -1.41877909  -4.45135564 -11.42525166]]
Accuracy: 60.0 %


### we can clearly see that accuracy is 60 % becuase of following reasons
#### 1. we are using very less data set that is of 10 rows
#### 2. we go through only 1 epoch
#### 3. not using optimization algorithm like gradient descent

In [33]:
# Number of epochs to train the model
num_epochs = 10

learning_rate = 0.01  # Define a learning rate

for epoch in range(num_epochs):
    # Forward pass
    loss = mlp.forward(X, y)
    
    # Calculate predictions and accuracy for monitoring
    predictions = mlp.layer2.output
    accuracy = calculate_accuracy(predictions, y)
    
    # Backward pass
    mlp.backward()
    
    # Gradient Descent Step (parameter update)
    # For each layer, we update weights and biases with the gradients
    mlp.layer1.weights -= learning_rate * mlp.layer1.grad_weights
    mlp.layer1.biases -= learning_rate * mlp.layer1.grad_biases
    mlp.layer2.weights -= learning_rate * mlp.layer2.grad_weights
    mlp.layer2.biases -= learning_rate * mlp.layer2.grad_biases
    
    # Print loss and accuracy for each epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss}, Accuracy: {accuracy}%")


Epoch 1/10, Loss: 0.28518137879923966, Accuracy: 80.0%
Epoch 2/10, Loss: 0.31056300092884503, Accuracy: 80.0%
Epoch 3/10, Loss: 0.28213459184854156, Accuracy: 80.0%
Epoch 4/10, Loss: 0.3066689973028275, Accuracy: 80.0%
Epoch 5/10, Loss: 0.2792739391772824, Accuracy: 80.0%
Epoch 6/10, Loss: 0.30292140876107176, Accuracy: 80.0%
Epoch 7/10, Loss: 0.27659827725459285, Accuracy: 80.0%
Epoch 8/10, Loss: 0.2993409089012876, Accuracy: 90.0%
Epoch 9/10, Loss: 0.2740621480535676, Accuracy: 80.0%
Epoch 10/10, Loss: 0.2958966945590865, Accuracy: 90.0%
