# Day 8: Training Loops, Loss Functions & Evaluation Splits

**Building LLMs from Scratch** — Following Andrej Karpathy's makemore lectures.

---

## 1. Introduction

The count-based bigram model from Day 6 *counted* character pairs and normalized. Today we replace counting with a **neural bigram model**: a 27×27 weight matrix trained via gradient descent. We build proper training infrastructure — forward pass, loss, backward, parameter updates — and introduce **train/dev/test splits** for evaluation.

## 2. Dataset Setup

Use the same names list from Day 6. Build bigram pairs `(xs, ys)` as integer tensors: `xs` = current character index, `ys` = next character index.

In [None]:
import torch

words = ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'mia', 'charlotte', 'amelia', 'harper', 'evelyn',
         'abigail', 'emily', 'ella', 'elizabeth', 'camila', 'luna', 'sofia', 'avery', 'mila', 'aria']

# Use the full lowercase alphabet instead of only the letters that happen to
# occur in this small sample. The model cells are hard-wired to 27 classes
# ('.' plus 26 letters), so stoi/itos must cover every index 0..26. Deriving
# the vocabulary from these 20 names yields fewer symbols (21), which leaves
# itos without entries for indices a 27-way softmax can still sample.
chars = list('abcdefghijklmnopqrstuvwxyz')
stoi = {'.': 0, **{c: i + 1 for i, c in enumerate(chars)}}  # '.' is the start/end token
itos = {i: c for c, i in stoi.items()}

print(f"Dataset: {len(words)} names")
print(f"Vocabulary size: {len(stoi)}")

# Build bigram pairs (xs, ys) as integer tensors:
# xs[k] is the index of the current character, ys[k] the index of the next one.
xs, ys = [], []
for w in words:
    # Pad with '.' on both sides so the model also learns how names start and end.
    chs = ['.'] + list(w) + ['.']
    for c1, c2 in zip(chs[:-1], chs[1:]):
        xs.append(stoi[c1])
        ys.append(stoi[c2])

xs = torch.tensor(xs)
ys = torch.tensor(ys)

print(f"Bigram pairs: {len(xs)}")
print(f"xs (current): {xs[:10].tolist()}...")
print(f"ys (next):    {ys[:10].tolist()}...")

## 3. Neural Bigram Model

One-hot encode inputs, multiply by weight matrix W (27×27), apply softmax to get probabilities. We use `torch.nn.functional.one_hot` and `torch.nn.functional.softmax`.

In [None]:
import torch.nn.functional as F

# Seed the global RNG first so the random initial weights are reproducible.
torch.manual_seed(42)
# Trainable 27x27 weight matrix drawn from a standard normal distribution.
W = torch.randn((27, 27), requires_grad=True)

# Forward: one-hot -> W -> logits -> softmax -> probs
def forward(xs, W):
    """Run the neural bigram forward pass.

    Args:
        xs: 1-D integer tensor of N current-character indices.
        W: (V, V) weight matrix; row i holds the logits for the character
            that follows character i.

    Returns:
        A ``(logits, probs)`` tuple, both of shape (N, V). ``probs`` is the
        row-wise softmax of ``logits``.
    """
    # Take the class count from W rather than hard-coding 27, so the one-hot
    # width always matches the matrix it is multiplied with.
    xenc = F.one_hot(xs, num_classes=W.shape[0]).float()  # (N, V)
    logits = xenc @ W                                     # (N, V): selects row xs[k] of W
    probs = F.softmax(logits, dim=1)
    return logits, probs

# Run the model once over every bigram and report the tensor shapes involved.
logits, probs = forward(xs, W)
# Row sums of the softmax output for the first five examples.
first_row_sums = probs.sum(1)[:5].tolist()
print(f"Input shape: xs {xs.shape}")
print("One-hot shape: (N, 27)")
print(f"Logits shape: {logits.shape}")
print(f"Probs shape: {probs.shape}")
print(f"Probs sum per row: {first_row_sums}...")

## 4. The Training Loop

Full loop: forward → NLL loss (using `F.cross_entropy`, equivalent to manual NLL) → backward → update. Add L2 regularization. Run 200 iterations, print loss every 20.

In [None]:
# Re-seed and re-create W so this cell gives the same result on every run.
torch.manual_seed(42)
W = torch.randn(27, 27, requires_grad=True)
lr = 50.0   # step size for plain gradient descent
reg = 0.01  # weight of the mean-squared-weight penalty

for step in range(200):
    # Forward. Only the logits are needed: F.cross_entropy applies softmax itself.
    logits, _ = forward(xs, W)

    # Loss: cross_entropy(logits, ys) = -mean(log(softmax(logits)[i, ys[i]]))
    # Equivalent to: -mean(log(probs[range(N), ys]))
    loss_ce = F.cross_entropy(logits, ys)
    loss_reg = reg * (W ** 2).mean()
    loss = loss_ce + loss_reg

    # Backward: drop the previous gradient, then backpropagate the new loss.
    W.grad = None
    loss.backward()

    # Update in place under no_grad instead of going through W.data, which
    # sidesteps autograd's tracking of in-place modifications.
    with torch.no_grad():
        W -= lr * W.grad

    if step % 20 == 0:
        print(f"step {step:3d}: loss = {loss.item():.4f} (ce={loss_ce.item():.4f}, reg={loss_reg.item():.4f})")

# This is the loss computed in the last iteration, i.e. before its update step.
print(f"\nFinal loss: {loss.item():.4f}")

In [None]:
# Sanity check: the mean negative log-probability assigned to the true next
# characters must match what F.cross_entropy computes from the raw logits.
logits, probs = forward(xs, W)
row_ids = torch.arange(len(ys))
manual_nll = -probs[row_ids, ys].log().mean()
ce_loss = F.cross_entropy(logits, ys)
print(f"Manual NLL: {manual_nll.item():.4f}")
print(f"F.cross_entropy: {ce_loss.item():.4f}")
print(f"Same: {torch.allclose(manual_nll, ce_loss)}")

## 5. Train/Dev/Test Split

Split data 80/10/10. Evaluate loss on each split to detect overfitting.

In [None]:
# Randomly partition the bigram pairs 80/10/10 into train/dev/test.
torch.manual_seed(42)
n = len(xs)
perm = torch.randperm(n)  # shuffled example indices

# The test split takes whatever is left after rounding down train and dev.
n_train, n_dev = int(0.8 * n), int(0.1 * n)
n_test = n - (n_train + n_dev)

train_idx, dev_idx, test_idx = (
    perm[:n_train],
    perm[n_train:n_train + n_dev],
    perm[n - n_test:],
)

xs_train, ys_train = xs[train_idx], ys[train_idx]
xs_dev, ys_dev = xs[dev_idx], ys[dev_idx]
xs_test, ys_test = xs[test_idx], ys[test_idx]

print(f"Train: {len(xs_train)}, Dev: {len(xs_dev)}, Test: {len(xs_test)}")

In [None]:
# Train on train split only, evaluate on all splits
torch.manual_seed(42)
W = torch.randn(27, 27, requires_grad=True)
lr = 50.0
reg = 0.01


def _split_losses(W):
    """Return (train, dev, test) cross-entropy losses for W as Python floats."""
    splits = ((xs_train, ys_train), (xs_dev, ys_dev), (xs_test, ys_test))
    # Evaluation only: no autograd graph is needed.
    with torch.no_grad():
        return tuple(F.cross_entropy(forward(x, W)[0], y).item() for x, y in splits)


for step in range(200):
    logits, _ = forward(xs_train, W)
    loss_ce = F.cross_entropy(logits, ys_train)
    loss_reg = reg * (W ** 2).mean()
    loss = loss_ce + loss_reg

    W.grad = None
    loss.backward()
    # Update in place under no_grad instead of going through W.data, which
    # sidesteps autograd's tracking of in-place modifications.
    with torch.no_grad():
        W -= lr * W.grad

    if step % 50 == 0:
        # Reported losses exclude the regularization term and are measured
        # after this step's update.
        train_loss, dev_loss, test_loss = _split_losses(W)
        print(f"step {step:3d}: train={train_loss:.4f} dev={dev_loss:.4f} test={test_loss:.4f}")

train_loss, dev_loss, test_loss = _split_losses(W)
print(f"\nFinal: train={train_loss:.4f} dev={dev_loss:.4f} test={test_loss:.4f}")

## 6. Sampling

Generate names from the trained model. Start with `.`, sample next char from softmax, repeat until `.`.

In [None]:
def sample_name(W, itos):
    """Sample one name from the bigram model, one character at a time.

    Sampling starts from the boundary token (index 0) and repeatedly draws the
    next index from the softmax of the current index's row of ``W`` until
    index 0 is drawn again.

    Args:
        W: (V, V) weight matrix; row i holds the logits for the character
            that follows character i.
        itos: dict mapping integer index -> character.

    Returns:
        The sampled characters joined into a string, without boundary tokens.
    """
    num_in, num_out = W.shape

    # Only indices that itos can decode (plus the end token 0) may be drawn.
    # W may have more columns than the vocabulary has symbols; without this
    # mask, drawing such a column would raise KeyError on itos[ix].
    allowed = torch.zeros(num_out)
    allowed[0] = 1.0
    for i in itos:
        if 0 <= i < num_out:
            allowed[i] = 1.0

    out = []
    ix = 0
    with torch.no_grad():
        while True:
            # Size the one-hot vector from W instead of hard-coding 27.
            xenc = F.one_hot(torch.tensor([ix]), num_classes=num_in).float()
            # multinomial accepts unnormalized non-negative weights, so the
            # masked probabilities do not need to be renormalized.
            probs = F.softmax(xenc @ W, dim=1) * allowed
            ix = torch.multinomial(probs, num_samples=1).item()
            if ix == 0:
                break
            out.append(itos[ix])
    return ''.join(out)

# Draw ten names from the current weights, printing each as it is sampled.
print("Generated names:")
for name in (sample_name(W, itos) for _ in range(10)):
    print(name)

## 7. Comparing to Count-Based

The neural model trained with gradient descent should converge to probabilities similar to those of the count-based bigram model from Day 6. Build the count-based P matrix and compare.

In [None]:
import torch

# Count-based bigram (from Day 6): N[i, j] = number of times j follows i.
N = torch.zeros((27, 27), dtype=torch.int32)
for w in words:
    # Pairing '.' + w with w + '.' walks the padded name one bigram at a time.
    for prev_ch, next_ch in zip('.' + w, w + '.'):
        N[stoi[prev_ch], stoi[next_ch]] += 1

# Add-1 smoothing, then normalize each row into a probability distribution.
smoothed = (N + 1).float()
P_count = smoothed / smoothed.sum(dim=1, keepdim=True)

# Neural bigram: a one-hot input selects a row of W, so the model's
# next-character distribution for index i is softmax(W[i]).
P_neural = F.softmax(W.detach(), dim=1)

print("Count-based P (sample row 0, first 10):")
print(P_count[0, :10].tolist())
print("\nNeural P (sample row 0, first 10):")
print(P_neural[0, :10].tolist())

# NLL comparison over all bigram pairs
nll_count = -P_count[xs, ys].log().mean().item()
all_logits, _ = forward(xs, W)
nll_neural = F.cross_entropy(all_logits, ys).item()
print(f"\nCount-based NLL: {nll_count:.4f}")
print(f"Neural NLL: {nll_neural:.4f}")
print(f"\nBoth should be ~2.45 — neural model learns the same distribution via optimization.")

---

**Blog:** [Day 8 — Training Loops & Evaluation Splits](https://omkarray.com/llm-day8.html)

**Prev:** [Day 7 — MLP Language Model](llm_day07_mlp_lm.ipynb) · **Next:** [Day 9 — Bengio 2003 MLP](llm_day09_bengio.ipynb)