# Day 5: Training the MLP — Gradient Descent in Action

**Building LLMs from Scratch** — Following Andrej Karpathy's micrograd lectures.

---

## 1. Introduction

Training a neural network involves four core steps:

1. **Forward pass** — Run inputs through the model to get predictions
2. **Loss computation** — Measure how wrong the predictions are (e.g., MSE)
3. **Backward pass** — Compute gradients via backpropagation
4. **Parameter update** — Adjust weights using gradient descent: `p.data -= lr * p.grad`

We'll implement the full training loop on a tiny dataset and watch the loss decrease.

## 2. The Value Class + Neuron/Layer/MLP

Complete autograd engine and MLP from Day 4. The Value class tracks computation for backprop; Neuron/Layer/MLP build the network.

In [None]:
import math
import random

class Value:
    """A scalar value that tracks its computation graph for autograd.

    Every arithmetic operation returns a new ``Value`` that remembers its
    operands (``_prev``), a label for the operation (``_op``) and a closure
    (``_backward``) that adds this node's contribution to the operands'
    ``grad``.

    Attributes:
        data: The wrapped scalar.
        grad: Accumulated gradient; starts at 0.0 and is only ever updated
            with ``+=`` by the ``_backward`` closures, so callers must reset
            it themselves between backward passes.
    """

    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0.0
        # Leaf nodes have no operands, so there is nothing to propagate.
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op

    def __add__(self, other):
        """Return ``self + other``; plain numbers are wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            # Addition passes the upstream gradient through unchanged.
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        """Return ``self * other``; plain numbers are wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            # Product rule: each operand's gradient is scaled by the other.
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __pow__(self, other):
        """Return ``self ** other`` for a constant int/float exponent."""
        assert isinstance(other, (int, float)), "only supporting int/float powers"
        out = Value(self.data ** other, (self,), f'**{other}')

        def _backward():
            # Power rule: d(x**n)/dx = n * x**(n - 1).
            self.grad += (other * self.data ** (other - 1)) * out.grad
        out._backward = _backward
        return out

    def __neg__(self):
        return self * -1

    def __sub__(self, other):
        return self + (-other)

    def __radd__(self, other):
        return self + other

    def __rmul__(self, other):
        return self * other

    def __rsub__(self, other):
        """Return ``other - self`` when ``other`` is a plain number."""
        return (-self) + other

    def __truediv__(self, other):
        """Return ``self / other``, expressed as ``self * other**-1``."""
        return self * other ** -1

    def __rtruediv__(self, other):
        """Return ``other / self`` when ``other`` is a plain number."""
        return other * self ** -1

    def tanh(self):
        """Return ``tanh(self)`` as a new Value."""
        t = math.tanh(self.data)
        out = Value(t, (self,), 'tanh')

        def _backward():
            # d tanh(x)/dx = 1 - tanh(x)**2.
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward
        return out

    def backward(self):
        """Backpropagate from this node through everything it depends on.

        Sets ``self.grad`` to 1.0 and then runs each node's ``_backward`` in
        reverse topological order. Gradients of other nodes are accumulated,
        not reset, by this call.
        """
        # Depth-first post-order with an explicit stack. A recursive walk
        # raises RecursionError on long chains (for example a sum over more
        # than ~1000 terms); this visits nodes in the same order without
        # using the call stack.
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, children = stack[-1]
            for child in children:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # Every operand of ``node`` has been emitted already.
                topo.append(node)
                stack.pop()

        self.grad = 1.0
        for v in reversed(topo):
            v._backward()

    def __repr__(self):
        return f"Value(data={self.data})"

class Neuron:
    """A single tanh unit computing ``tanh(b + sum_i w_i * x_i)``.

    Weights are drawn uniformly from [-1, 1]; the bias starts at 0.
    """

    def __init__(self, nin):
        """Create ``nin`` random weights and a zero bias."""
        weights = []
        for _ in range(nin):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights
        self.b = Value(0)

    def __call__(self, x):
        """Return the neuron's activation for the inputs ``x``."""
        # Start from the bias and fold in one weighted input at a time,
        # left to right; extra weights or inputs are ignored by zip.
        pre_activation = self.b
        for weight, feature in zip(self.w, x):
            pre_activation = pre_activation + weight * feature
        return pre_activation.tanh()

    def parameters(self):
        """Return the weights followed by the bias."""
        return [*self.w, self.b]

class Layer:
    """A group of ``nout`` neurons that all read the same ``nin`` inputs."""

    def __init__(self, nin, nout):
        """Create ``nout`` neurons, each taking ``nin`` inputs."""
        units = []
        for _ in range(nout):
            units.append(Neuron(nin))
        self.neurons = units

    def __call__(self, x):
        """Apply every neuron to ``x``.

        Returns the single output directly when the layer has one neuron,
        otherwise a list with one output per neuron.
        """
        outputs = [neuron(x) for neuron in self.neurons]
        if len(outputs) == 1:
            return outputs[0]
        return outputs

    def parameters(self):
        """Return the parameters of all neurons, in neuron order."""
        params = []
        for neuron in self.neurons:
            params.extend(neuron.parameters())
        return params

class MLP:
    """A multi-layer perceptron built from a stack of ``Layer`` objects."""

    def __init__(self, nin, nouts):
        """Build the network.

        Args:
            nin: Number of inputs to the first layer.
            nouts: Sequence with the number of neurons in each layer, in
                order; the last entry is the output size.
        """
        # Unpacking (rather than list concatenation) lets ``nouts`` be any
        # sequence, e.g. a tuple, not only a list.
        sz = [nin, *nouts]
        self.layers = [Layer(n_in, n_out) for n_in, n_out in zip(sz, sz[1:])]

    def __call__(self, x):
        """Run ``x`` through every layer and return the last layer's output.

        The result is a single ``Value`` when the last layer has one neuron,
        otherwise a list of ``Value`` objects.
        """
        for layer in self.layers:
            # A width-1 layer hands back a bare Value instead of a list;
            # wrap it so the next layer can iterate over its inputs.
            if isinstance(x, Value):
                x = [x]
            x = layer(x)
        return x

    def parameters(self):
        """Return the parameters of all layers, in layer order."""
        return [p for layer in self.layers for p in layer.parameters()]

## 3. The Dataset

A simple 4-sample dataset: 3 inputs per sample, 1 target per sample.

In [None]:
# Four training samples with three input features each.
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
]
# One desired output per sample, in the same order as ``xs``.
yt = [1.0, -1.0, -1.0, 1.0]

print("Dataset:")
for i, x in enumerate(xs):
    y = yt[i]
    print(f"  Sample {i}: x={x} -> target={y}")

## 4. The Training Loop

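Full gradient descent: forward → loss → zero_grad → backward → update. The extra zero_grad step is needed because each `_backward` accumulates into `grad` with `+=`, so gradients left over from the previous iteration must be cleared before calling `backward()`. Run for 50 iterations.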

In [None]:
# Fixed seed so the initial weights, and therefore the whole run, repeat.
random.seed(42)
model = MLP(3, [4, 4, 1])
learning_rate = 0.05
n_iters = 50
loss_history = []

for step in range(n_iters):
    # 1) Forward: one prediction per training sample.
    preds = [model(x) for x in xs]

    # 2) Loss: squared errors summed (not averaged) over the samples.
    loss = 0
    for pred, target in zip(preds, yt):
        loss = loss + (pred - target)**2
    loss_history.append(loss.data)

    # 3) Reset: gradients accumulate with +=, so clear the previous step's.
    params = model.parameters()
    for p in params:
        p.grad = 0.0

    # 4) Backward: fill in p.grad for every parameter.
    loss.backward()

    # 5) Update: step each parameter against its gradient.
    for p in params:
        p.data -= learning_rate * p.grad

    # Report the first step and every fifth step after that.
    if step == 0 or (step + 1) % 5 == 0:
        print(f"Step {step + 1:3d} | Loss: {loss.data:.6f}")

print("\nTraining complete!")

## 5. Visualizing Training

Plot loss over iterations to see convergence.

In [None]:
import matplotlib.pyplot as plt

# Loss curve for the training run. The title reports however many
# iterations were actually recorded instead of a hard-coded count, so it
# stays correct when the number of training iterations is changed.
plt.figure(figsize=(8, 4))
plt.plot(loss_history, 'b-', linewidth=2)
plt.xlabel('Iteration')
plt.ylabel('Loss (MSE)')
plt.title(f'Training Loss over {len(loss_history)} Iterations')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 6. Final Predictions

Compare model predictions vs targets after training.

In [None]:
# Re-run the trained model on each sample and show it next to its target.
print("Predictions vs Targets (after training):")
print("-" * 40)
for i, x in enumerate(xs):
    target = yt[i]
    pred = model(x)
    # Use the wrapped scalar when the output has a .data attribute,
    # otherwise format the output as it is.
    pred_val = getattr(pred, 'data', pred)
    print(f"  Sample {i}: pred={pred_val:7.4f}  target={target:7.4f}")

## 7. Learning Rate Exploration

Try different learning rates to see how they affect convergence. Too small → slow convergence; too large → instability or divergence.

In [None]:
def train_with_lr(learning_rate, n_iters=50):
    """Train a fresh MLP(3, [4, 4, 1]) on ``xs``/``yt`` and return its losses.

    The random module is re-seeded with 42 on every call, so each learning
    rate starts from the same initial weights.

    Args:
        learning_rate: Step size used in the parameter update.
        n_iters: Number of gradient-descent iterations to run.

    Returns:
        A list with the loss value recorded before each update.
    """
    random.seed(42)
    model = MLP(3, [4, 4, 1])
    loss_history = []

    for _ in range(n_iters):
        # Forward pass over the whole dataset.
        preds = [model(x) for x in xs]

        # Squared errors summed over the samples.
        loss = 0
        for pred, target in zip(preds, yt):
            loss = loss + (pred - target)**2
        loss_history.append(loss.data)

        # Gradients accumulate, so clear them before backpropagating.
        params = model.parameters()
        for p in params:
            p.grad = 0.0
        loss.backward()

        # Gradient-descent step.
        for p in params:
            p.data -= learning_rate * p.grad

    return loss_history

# Train once per learning rate, then draw all loss curves on one figure.
learning_rates = [0.001, 0.01, 0.1, 1.0]
histories = {rate: train_with_lr(rate) for rate in learning_rates}

plt.figure(figsize=(10, 5))
for lr, history in histories.items():
    plt.plot(history, label=f'lr={lr}')

plt.xlabel('Iteration')
plt.ylabel('Loss (MSE)')
plt.title('Effect of Learning Rate on Convergence')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

---

**Blog:** [Day 5 — Training the MLP](https://omkarray.com/llm-day5.html)

**Prev:** [Day 4 — Neuron & MLP](llm_day04_neuron_mlp.ipynb) · **Next:** [Day 6](llm_day06.ipynb)