# Day 20: Training GPT — From Random Noise to Coherent Text

**Building LLMs from Scratch** — Following Andrej Karpathy's makemore lectures.

---

## 1. Introduction

We have a complete GPT. Today we train it on real text and watch it learn to generate coherent output — the capstone of the series.

**What we cover:**
1. Preparing a character-level dataset
2. The training loop with AdamW and a cosine LR schedule
3. Overfitting diagnostics — train vs val loss
4. Generation: temperature, top-k, greedy
5. Scaling laws — why bigger + more data = better
6. The path from this GPT to GPT-4

## 2. Dataset Preparation

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import math
from dataclasses import dataclass

# Seed the global RNG so batch sampling, weight init and sampling are repeatable.
torch.manual_seed(42)

# Use tiny shakespeare or a fallback text
try:
    # Explicit encoding: without it open() falls back to the platform's locale
    # encoding, so the same file could decode differently on another machine.
    with open('../input.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    print(f"Loaded dataset: {len(text):,} characters")
except FileNotFoundError:
    # Fallback: short public domain text (first lines of Hamlet)
    # NOTE: the passage is repeated verbatim, so the 90/10 split below puts the
    # same passage in both train and val; val loss then says nothing about
    # generalization in fallback mode.
    text = """
To be, or not to be, that is the question:
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles
And by opposing end them. To die, to sleep,
No more; and by a sleep to say we end
The heart-ache and the thousand natural shocks
That flesh is heir to: 'tis a consummation
Devoutly to be wish'd. To die, to sleep;
To sleep, perchance to dream. Ay, there's the rub,
For in that sleep of death what dreams may come,
When we have shuffled off this mortal coil,
Must give us pause. There's the respect
That makes calamity of so long life.
""" * 50  # repeat to get enough data
    print(f"Using fallback text: {len(text):,} characters")

# Build vocabulary: one token id per distinct character, ids assigned in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
stoi = {c: i for i, c in enumerate(chars)}   # character -> token id
itos = {i: c for c, i in stoi.items()}       # token id -> character
encode = lambda s: [stoi[c] for c in s]            # str -> list of ids (KeyError on unseen chars)
decode = lambda l: ''.join(itos[i] for i in l)     # iterable of ids -> str

print(f"Vocab size: {vocab_size}")
print(f"Chars: {''.join(chars[:20])}...")

# Train/val split: first 90% of the token stream for training, the rest for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data   = data[n:]
print(f"Train tokens: {len(train_data):,}, Val tokens: {len(val_data):,}")

## 3. Data Loader

In [None]:
def get_batch(split, block_size=64, batch_size=32):
    """Sample a random batch of next-character training pairs.

    Args:
        split: 'train' reads from train_data; any other value reads from val_data.
        block_size: number of tokens in each sampled window.
        batch_size: number of windows to sample.

    Returns:
        (x, y), each of shape (batch_size, block_size); y holds the same
        windows as x advanced by one position in the source sequence.
    """
    source = train_data if split == 'train' else val_data
    # The exclusive upper bound leaves room for the one-token-shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show that the targets are the inputs shifted by one.
x, y = get_batch('train')
x_shape, y_shape = x.shape, y.shape
sample_input = decode(x[0, :20].tolist())    # first 20 characters of the first row
sample_target = decode(y[0, :20].tolist())
print(f"x shape: {x_shape}  (batch_size, block_size)")
print(f"y shape: {y_shape}  (targets are x shifted by 1)")
print(f"\nSample input:  '{sample_input}'")
print(f"Sample target: '{sample_target}'")
print("(Target is input shifted by 1 — predict next character at every position)")

## 4. GPT Model (from Day 19)

In [None]:
@dataclass
class GPTConfig:
    """Hyperparameters that fix the shape of the GPT model."""

    # Number of distinct token ids (rows of the token embedding / output logits).
    vocab_size: int = 65
    # Maximum context length (rows of the position embedding, side of the causal mask).
    block_size: int = 64
    # Number of stacked transformer blocks.
    n_layer: int = 4
    # Attention heads per block; must divide n_embd evenly.
    n_head: int = 4
    # Width of the residual stream.
    n_embd: int = 128
    # Drop probability handed to every nn.Dropout in the model.
    dropout: float = 0.1

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention in which each position only attends to itself and earlier positions."""

    def __init__(self, config):
        """Build the projections, dropouts and causal mask.

        Args:
            config: object exposing n_embd, n_head, block_size and dropout;
                n_embd must be divisible by n_head.
        """
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.n_head, self.n_embd = config.n_head, config.n_embd
        self.head_size = config.n_embd // config.n_head
        # One fused projection produces queries, keys and values together.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
        self.attn_drop = nn.Dropout(config.dropout)
        self.resid_drop = nn.Dropout(config.dropout)
        # Lower-triangular matrix of ones: entry (i, j) is 1 when position i may see position j.
        # Registered as a buffer (named 'bias') so it follows the module but is not trained.
        self.register_buffer('bias', torch.tril(torch.ones(config.block_size, config.block_size)))

    def forward(self, x):
        """Apply causal multi-head attention.

        Args:
            x: tensor of shape (B, T, C) with C == n_embd and T <= block_size.

        Returns:
            Tensor of shape (B, T, C).
        """
        B, T, C = x.shape
        H, hs = self.n_head, self.head_size
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # (B, T, C) -> (B, H, T, hs): give every head its own slice of the channels.
        q = q.view(B, T, H, hs).transpose(1, 2)
        k = k.view(B, T, H, hs).transpose(1, 2)
        v = v.view(B, T, H, hs).transpose(1, 2)
        # Scale first, then mask the *score tensor*. The mask must not be chained
        # onto the scalar scale factor, which is a plain float with no masked_fill.
        scores = (q @ k.transpose(-2, -1)) * (hs ** -0.5)
        scores = scores.masked_fill(self.bias[:T, :T] == 0, float('-inf'))
        att = self.attn_drop(F.softmax(scores, dim=-1))
        # (B, H, T, hs) -> (B, T, C): concatenate the heads back together.
        merged = (att @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.resid_drop(self.c_proj(merged))

class MLP(nn.Module):
    """Feed-forward sub-layer: widen to 4x n_embd, GELU, project back, dropout."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        layers = [
            nn.Linear(width, hidden),
            nn.GELU(),
            nn.Linear(hidden, width),
            nn.Dropout(config.dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run x through the feed-forward stack; the last dimension stays n_embd."""
        return self.net(x)

class Block(nn.Module):
    """One transformer block: LayerNorm -> attention and LayerNorm -> MLP, each added back residually."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return x after the attention and MLP residual updates (shape unchanged)."""
        attn_out = self.attn(self.ln1(x))
        x = x + attn_out
        mlp_out = self.mlp(self.ln2(x))
        return x + mlp_out

class GPT(nn.Module):
    """Decoder-only transformer language model: embeddings, a stack of Blocks, final norm, LM head."""

    def __init__(self, config):
        """Build the model.

        Args:
            config: object exposing vocab_size, block_size, n_layer, n_embd and
                dropout (plus whatever Block reads from it).
        """
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),   # token embedding
            wpe=nn.Embedding(config.block_size, config.n_embd),   # learned position embedding
            drop=nn.Dropout(config.dropout),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: the token embedding and the output projection share one matrix.
        self.transformer.wte.weight = self.lm_head.weight
        # Re-initialise every parameter with at least 2 dims from N(0, 0.02);
        # 1-D parameters keep their PyTorch defaults.
        for p in self.parameters():
            if p.dim() >= 2:
                nn.init.normal_(p, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: integer tensor of token ids, shape (B, T); T must not exceed
                config.block_size because positions index the position embedding.
            targets: optional tensor with B*T elements holding the target ids.

        Returns:
            (logits, loss): logits has shape (B, T, vocab_size); loss is the
            cross-entropy over all positions, or None when targets is None.
        """
        B, T = idx.shape
        pos = torch.arange(T, device=idx.device)
        x = self.transformer.drop(self.transformer.wte(idx) + self.transformer.wpe(pos))
        for block in self.transformer.h:
            x = block(x)
        logits = self.lm_head(self.transformer.ln_f(x))
        if targets is None:
            loss = None
        else:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Autoregressively extend idx by max_new_tokens tokens.

        Args:
            idx: integer tensor of shape (B, T) holding the prompt token ids.
            max_new_tokens: number of tokens to append.
            temperature: divisor applied to the logits before sampling.
                temperature == 0 selects greedy decoding (argmax) rather than
                dividing by zero.
            top_k: if truthy, sample only among the top_k highest-scoring tokens.
                Ignored when decoding greedily.

        Returns:
            Tensor of shape (B, T + max_new_tokens): the prompt followed by the
            generated ids. The mode (train/eval) of the module is not changed here.
        """
        for _ in range(max_new_tokens):
            # Crop the context to the last block_size tokens the model can address.
            logits, _ = self(idx[:, -self.config.block_size:])
            logits = logits[:, -1, :]
            if temperature == 0:
                # Greedy decoding: the zero-temperature limit of sampling.
                next_token = logits.argmax(dim=-1, keepdim=True)
            else:
                logits = logits / temperature
                if top_k:
                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                    # Everything below the k-th largest logit gets zero probability.
                    logits[logits < v[:, [-1]]] = float('-inf')
                next_token = torch.multinomial(F.softmax(logits, dim=-1), 1)
            idx = torch.cat([idx, next_token], dim=1)
        return idx

# Size the model to the dataset's vocabulary; every other hyperparameter keeps its default.
config = GPTConfig(vocab_size=vocab_size)
model = GPT(config)
n_params = sum(p.numel() for p in model.parameters())
print(f"Parameters: {n_params:,}")

## 5. Training Loop with Cosine LR Schedule

In [None]:
@torch.no_grad()
def estimate_loss(model, eval_iters=50):
    """Estimate the mean loss on the train and val splits.

    Averages the loss over eval_iters freshly sampled batches per split, with
    the model in eval mode and gradients disabled.

    Args:
        model: module called as model(x, y) and returning (logits, loss).
        eval_iters: number of batches to average per split (must be > 0).

    Returns:
        dict with keys 'train' and 'val' mapping to the mean loss as a float.
    """
    # Remember the caller's mode so evaluating never silently re-enables
    # dropout on a model that was deliberately put in eval mode.
    was_training = model.training
    model.eval()
    try:
        losses = {}
        for split in ['train', 'val']:
            batch_losses = []
            for _ in range(eval_iters):
                x, y = get_batch(split, block_size=config.block_size)
                _, loss = model(x, y)
                batch_losses.append(loss.item())
            losses[split] = sum(batch_losses) / len(batch_losses)
    finally:
        # Restore the original mode even if a batch raised.
        model.train(was_training)
    return losses


# AdamW with cosine LR schedule (GPT-2 style)
max_lr    = 3e-4          # peak learning rate, reached at the end of warmup
min_lr    = max_lr / 10   # floor the cosine decays to
max_steps = 3000          # step at which the decay reaches min_lr
warmup    = 100           # number of linear warmup steps

def get_lr(step):
    """Return the learning rate for a 0-based step.

    Schedule: linear warmup over the first `warmup` steps, cosine decay from
    max_lr to min_lr until `max_steps`, then constant min_lr.

    Args:
        step: 0-based optimizer step index.

    Returns:
        The learning rate as a float.
    """
    if step < warmup:
        # (step + 1) so the very first update uses a non-zero learning rate;
        # with `step / warmup` step 0 would be a wasted no-op update.
        return max_lr * (step + 1) / warmup
    # `>=` gives the same value as the cosine at step == max_steps (min_lr) and
    # avoids a division by zero below when max_steps == warmup.
    if step >= max_steps:
        return min_lr
    ratio = (step - warmup) / (max_steps - warmup)   # 0 at end of warmup, 1 at max_steps
    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * ratio))

# The lr passed here is only an initial value: the loop overwrites it from
# get_lr(step) before every update.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=max_lr,
    betas=(0.9, 0.95),
    weight_decay=0.1,
)

# One entry is appended to each list at every evaluation step.
train_losses, val_losses, lrs = [], [], []
eval_every = 300

for step in range(max_steps + 1):
    # Apply the scheduled learning rate to every parameter group.
    lr = get_lr(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # Periodic evaluation (also runs at step 0 and at the final step).
    if step % eval_every == 0:
        losses = estimate_loss(model)
        train_losses.append(losses['train'])
        val_losses.append(losses['val'])
        lrs.append(lr)
        print(f"step {step:4d} | train {losses['train']:.4f} | val {losses['val']:.4f} | lr {lr:.2e}")

    # The extra iteration at step == max_steps exists only for the final evaluation.
    if step == max_steps:
        break

    # One optimization step on a fresh random batch.
    x, y = get_batch('train', block_size=config.block_size)
    optimizer.zero_grad()
    _, loss = model(x, y)
    loss.backward()
    # Cap the global gradient norm at 1.0 before the update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

## 6. Training Curves

In [None]:
# x-axis values: the steps at which the training loop recorded an evaluation.
steps_eval = list(range(0, max_steps + 1, eval_every))

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
loss_ax, lr_ax = axes[0], axes[1]

# Left panel: train and validation loss on shared axes.
curves = (
    (train_losses, 'Train', 'steelblue'),
    (val_losses, 'Val', 'tomato'),
)
for series, label, color in curves:
    loss_ax.plot(steps_eval, series, label=label, color=color, linewidth=2)
loss_ax.set_xlabel('Step')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Train vs Validation Loss\n(gap = overfitting)')
loss_ax.legend()
loss_ax.grid(True, alpha=0.3)

# Right panel: the learning rate recorded at each evaluation step.
lr_ax.plot(steps_eval, lrs, color='green', linewidth=2)
lr_ax.set_xlabel('Step')
lr_ax.set_ylabel('Learning Rate')
lr_ax.set_title('Cosine LR Schedule\n(warmup → cosine decay → min_lr)')
lr_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# A freshly initialised model should start near the uniform-guess loss log(vocab_size).
uniform_loss = math.log(vocab_size)
print(f"Initial loss: {train_losses[0]:.4f} (expected ≈ {uniform_loss:.4f} = log(vocab_size))")
print(f"Final train:  {train_losses[-1]:.4f}")
print(f"Final val:    {val_losses[-1]:.4f}")

## 7. Generate Text!

In [None]:
# Switch off dropout for sampling.
model.eval()

rule = "=" * 60

print(rule)
print("Generated text (temperature=1.0, top_k=40):")
print(rule)
# Prompt is a single token with id 0, i.e. the first character of the sorted vocabulary.
context = torch.zeros(1, 1, dtype=torch.long)
generated = model.generate(context, max_new_tokens=300, temperature=1.0, top_k=40)
print(decode(generated[0].tolist()))

print("\n" + rule)
print("More deterministic (temperature=0.5):")
print(rule)
# A lower temperature sharpens the distribution; top_k=10 narrows the candidates further.
fresh_context = torch.zeros(1, 1, dtype=torch.long)
generated = model.generate(fresh_context, max_new_tokens=200, temperature=0.5, top_k=10)
print(decode(generated[0].tolist()))

## 8. Scaling Laws — The Path to GPT-4

In [None]:
# Chinchilla scaling law: loss ≈ A/N^alpha + B/D^beta + C
# Optimal: N params, D = 20*N tokens (Chinchilla ratio)

# Rows: (name, parameter count, training tokens, illustrative loss label).
models = [
    # 4 blocks at n_embd=128 come to roughly 0.8M parameters (807,808 with vocab_size=65).
    ("Our GPT (today)",     0.8e6,     1e6,      "~4.0"),
    ("GPT-2 Small",        124e6,     9e9,       "~3.3"),
    ("GPT-2 XL",          1542e6,    40e9,       "~2.9"),
    ("GPT-3",            175e9,    300e9,        "~2.0"),
    ("Chinchilla (70B)",   70e9,   1400e9,       "~1.9"),
    ("Llama 3 (70B)",      70e9,  15000e9,       "~1.7"),
]


def fmt(n):
    """Format a count with a K/M/B/T suffix, keeping at most two decimals.

    Trailing zeros are dropped, so 124e6 -> "124M" while 1542e6 -> "1.54B"
    (instead of being rounded to a misleading "2B").
    """
    if n >= 1e12:
        scaled, suffix = n / 1e12, "T"
    elif n >= 1e9:
        scaled, suffix = n / 1e9, "B"
    elif n >= 1e6:
        scaled, suffix = n / 1e6, "M"
    else:
        scaled, suffix = n / 1e3, "K"
    digits = f"{scaled:.2f}".rstrip("0").rstrip(".")
    return f"{digits}{suffix}"


print(f"{'Model':<25} {'Params':>12} {'Tokens':>14} {'Est. Loss':>12}")
print("-" * 68)
# Loop names are chosen so they do not overwrite the training loop's `loss`.
for name, n_params_row, n_tokens_row, est_loss in models:
    print(f"{name:<25} {fmt(n_params_row):>12} {fmt(n_tokens_row):>14} {est_loss:>12}")

print("\nKey insight: Loss scales predictably with compute (Hoffmann et al., 2022)")
print("More params + proportionally more data = consistently lower loss")

# Visualize (illustrative, not exact)
fig, ax = plt.subplots(figsize=(9, 5))
names = [label for label, *_ in models]
params_list = [math.log10(count) for _, count, *_ in models]   # x: log10 of parameter count
losses = [float(est.replace('~', '')) for *_, est in models]    # y: numeric part of the "~x.y" label

scatter = ax.scatter(params_list, losses, s=150, c=range(len(models)), cmap='viridis', zorder=3)
ax.plot(params_list, losses, '--', color='gray', alpha=0.5)
# Label each point with the model name, dropping any parenthesised suffix.
for name, x_pos, y_pos in zip(names, params_list, losses):
    short_name = name.split('(')[0].strip()
    ax.annotate(short_name, (x_pos, y_pos),
                textcoords='offset points', xytext=(5, 5), fontsize=8)
ax.set_xlabel('log₁₀(Parameters)')
ax.set_ylabel('Estimated Loss (lower = better)')
ax.set_title('Scaling Laws: More Parameters → Lower Loss')
ax.grid(True, alpha=0.3)
# Flip the y-axis so lower loss is drawn higher up.
ax.invert_yaxis()
plt.tight_layout()
plt.show()

## 9. Series Summary

| Day | Topic | Key Concept |
|-----|-------|-------------|
| 1-5 | Micrograd | Autograd from scratch |
| 6-8 | Bigram + Tensors | Language modeling basics |
| 9-10 | MLP + Embeddings | Neural LM, lookup tables |
| 11-12 | Activations + BatchNorm | Training stability |
| 13-14 | Backprop + Cross-Entropy | Gradient mechanics |
| 15 | WaveNet | Hierarchical context fusion |
| 16 | Self-Attention | Query-key-value mechanism |
| 17 | Multi-Head + Positional Enc | Parallel attention, order |
| 18 | Transformer Block | Residuals + LayerNorm |
| 19 | GPT Architecture | Full model assembly |
| **20** | **Training GPT** | **LR schedule, generation** |

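The full GPT we built today has the **same architecture** as GPT-2. The main differences between our baby GPT (roughly 0.8M parameters with the default config) and GPT-4 are listed below. GPT-4's details have not been published, so its figures are widely reported estimates rather than official numbers:
- **Scale**: ~0.8M → a reported ~1.7 trillion parameters
- **Data**: ~1M tokens → a reported ~13 trillion tokens  
- **RLHF**: Reinforcement Learning from Human Feedback for alignment
- **Compute**: a MacBook → thousands of A100s for months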

---

**Building LLMs from Scratch** — [Day 20: Training GPT](https://omkarray.com/llm-day20.html) | [← Prev](llm_day19_gpt.ipynb)

*Series complete. You built a GPT from scratch — from a single Value node all the way to an autoregressive transformer.*