# Day 19: GPT — Putting It All Together

**Building LLMs from Scratch** — Following Andrej Karpathy's makemore lectures.

---

## 1. Introduction

GPT (Generative Pre-trained Transformer) is the architecture behind ChatGPT, GPT-4, and most modern LLMs. Today we build the complete GPT from scratch using everything we've learned:

- Token + positional embeddings (Day 17)
- Multi-head causal self-attention (Day 16-17)
- Transformer blocks with residuals + LayerNorm (Day 18)
- Language modeling head

**GPT architecture:**
```
Input tokens
  → Token Embedding + Positional Embedding
  → Block 1 (MHA + FFN)
  → Block 2 (MHA + FFN)
  → ...
  → Block N (MHA + FFN)
  → LayerNorm
  → Linear head → logits over vocab
```

## 2. Complete GPT Implementation

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass

torch.manual_seed(42)


@dataclass
class GPTConfig:
    """Hyperparameters that size the GPT model built in this notebook."""

    # Number of distinct token ids (65 for the character-level setup).
    vocab_size: int = 65
    # Context window: the longest sequence the model accepts.
    block_size: int = 64
    # How many transformer blocks are stacked.
    n_layer: int = 4
    # Attention heads per block; n_embd must be divisible by this.
    n_head: int = 4
    # Embedding dimension (width of the residual stream).
    n_embd: int = 128
    # Probability passed to every nn.Dropout layer in the model.
    dropout: float = 0.1


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention in which a position only sees itself and earlier positions.

    Queries, keys and values for all heads come from one fused linear layer
    (``c_attn``); the concatenated head outputs go through ``c_proj``.
    A lower-triangular buffer named ``bias`` holds the causal mask.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        # Each head gets an equal slice of the embedding, so it must divide evenly.
        assert config.n_embd % config.n_head == 0
        embd, heads, ctx = config.n_embd, config.n_head, config.block_size
        self.n_head = heads
        self.n_embd = embd
        self.head_size = embd // heads
        # One matmul yields Q, K and V stacked along the last dimension.
        self.c_attn = nn.Linear(embd, 3 * embd, bias=False)
        self.c_proj = nn.Linear(embd, embd, bias=False)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        # 1 where attention is allowed (column <= row), 0 for future positions.
        causal = torch.ones(ctx, ctx).tril()
        self.register_buffer('bias', causal)

    def forward(self, x):
        """Apply causal attention to ``x`` of shape (B, T, C); the result has the same shape."""
        batch, seq_len, channels = x.shape
        fused = self.c_attn(x)
        # Split into Q/K/V and move the head axis forward: (B, H, T, head_size).
        query, key, value = (
            part.view(batch, seq_len, self.n_head, self.head_size).transpose(1, 2)
            for part in fused.split(self.n_embd, dim=2)
        )
        # Scaled dot-product scores, shape (B, H, T, T).
        scores = (query @ key.transpose(-2, -1)) * (self.head_size ** -0.5)
        # Future positions get -inf so softmax assigns them zero weight.
        hidden = self.bias[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(hidden, float('-inf'))
        weights = self.attn_dropout(F.softmax(scores, dim=-1))
        # Merge the heads back into a single (B, T, C) tensor.
        merged = (weights @ value).transpose(1, 2).contiguous().view(batch, seq_len, channels)
        return self.resid_dropout(self.c_proj(merged))


class MLP(nn.Module):
    """Feed-forward sub-layer: widen to 4 * n_embd, GELU, project back, dropout."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        self.c_fc = nn.Linear(width, hidden)
        self.c_proj = nn.Linear(hidden, width)
        self.act = nn.GELU()
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Transform the last dimension of ``x`` (size n_embd) and return the same shape."""
        expanded = self.c_fc(x)
        activated = self.act(expanded)
        projected = self.c_proj(activated)
        return self.dropout(projected)


class Block(nn.Module):
    """Transformer block: attention and MLP sub-layers, each applied to a
    LayerNorm-ed input and added back onto the residual stream."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return ``x`` after the attention residual step and then the MLP residual step."""
        # Normalisation happens before each sub-layer; the skip path is left untouched.
        after_attn = x + self.attn(self.ln_1(x))
        return after_attn + self.mlp(self.ln_2(after_attn))


class GPT(nn.Module):
    """Decoder-only transformer language model.

    Token ids are embedded, summed with learned positional embeddings, passed
    through ``config.n_layer`` transformer blocks and a final LayerNorm, then
    projected to vocabulary logits by ``lm_head``. The token-embedding matrix
    and ``lm_head`` share a single weight tensor (weight tying).
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),    # token embeddings
            wpe=nn.Embedding(config.block_size, config.n_embd),    # positional embeddings
            drop=nn.Dropout(config.dropout),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: share token embedding and lm_head weights
        self.transformer.wte.weight = self.lm_head.weight
        self._init_weights()

    def _init_weights(self):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases.

        Only the plain normal initialisation is applied; no depth-dependent
        scaling of the residual projections is performed.
        """
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: integer token ids of shape (B, T) with T <= config.block_size.
            targets: optional token ids, one per position of ``idx``; when
                given, the mean cross-entropy over all positions is returned.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape (B, T, vocab_size)
            and ``loss`` is ``None`` when ``targets`` is ``None``.

        Raises:
            ValueError: if T exceeds ``config.block_size``.
        """
        B, T = idx.shape
        # Explicit check (not an assert) so it still runs under `python -O`;
        # longer inputs would otherwise index past the positional table.
        if T > self.config.block_size:
            raise ValueError(
                f"sequence length {T} exceeds block_size {self.config.block_size}"
            )
        pos = torch.arange(T, device=idx.device)
        tok_emb = self.transformer.wte(idx)   # (B, T, n_embd)
        pos_emb = self.transformer.wpe(pos)   # (T, n_embd), broadcast over the batch
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)              # (B, T, vocab_size)
        loss = None
        if targets is not None:
            # Flatten batch and time so every position is one classification example.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Autoregressively sample ``max_new_tokens`` tokens after the prompt ``idx``.

        This method does not change the module's train/eval mode.

        Args:
            idx: prompt token ids of shape (B, T).
            max_new_tokens: number of tokens to append to each row.
            temperature: positive divisor applied to the logits before softmax.
            top_k: if given, sample only from the ``top_k`` highest logits
                (capped at the vocabulary size).

        Returns:
            Tensor of shape (B, T + max_new_tokens): the prompt followed by
            the sampled tokens.

        Raises:
            ValueError: if ``temperature`` is not positive or ``top_k`` is
                less than 1.
        """
        # Fail early with a clear message instead of producing inf/nan logits
        # (temperature) or an indexing error on an empty top-k result.
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")
        if top_k is not None and top_k < 1:
            raise ValueError(f"top_k must be at least 1, got {top_k}")
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.config.block_size:]  # crop to block_size
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature      # last token only
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                # Anything below the k-th largest logit gets zero probability.
                logits[logits < v[:, [-1]]] = float('-inf')
            probs = F.softmax(logits, dim=-1)
            next_tok = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_tok], dim=1)
        return idx


# Build the model from the default configuration and report its size.
config = GPTConfig()
model = GPT(config)
params = sum(weight.numel() for weight in model.parameters())
print(f"GPT config: {config}")
print(f"Parameters: {params:,}")

# Smoke-test the forward pass on random token ids: batch of 2, 32 tokens each.
idx, targets = (torch.randint(0, config.vocab_size, (2, 32)) for _ in range(2))
logits, loss = model(idx, targets)
print(f"\nlogits: {logits.shape}, loss: {loss.item():.4f}")
# Reference value: the cross-entropy of a uniform guess over the vocabulary is ln(vocab_size).
uniform_loss = torch.log(torch.tensor(config.vocab_size))
print(f"Expected initial loss ≈ {uniform_loss:.4f} (random baseline)")

## 3. Weight Tying — Why Share Token + Output Weights?

In [None]:
# Weight tying: the token-embedding matrix and the lm_head matrix are the SAME tensor.
# Intuition: the embedding of token i should resemble the output vector that
# makes token i the prediction — two views of one representation.
# Sharing also removes vocab_size * n_embd parameters from the model.
tied_weight = model.transformer.wte.weight
saved_params = config.vocab_size * config.n_embd

print("Weight tying verification:")
print(f"  wte.weight is lm_head.weight: {tied_weight is model.lm_head.weight}")
print(f"  Shape: {tied_weight.shape}")
print(f"  Saves: {saved_params:,} parameters")

# One line per trainable tensor: name, shape, element count.
print("\nParameter breakdown:")
trainable = ((name, p) for name, p in model.named_parameters() if p.requires_grad)
for name, p in trainable:
    print(f"  {name:<40} {str(p.shape):<25} {p.numel():>10,}")

## 4. Generation — Before Training (Random)

In [None]:
# Stand-in character vocabulary: the 65 ASCII characters with codes 32..96
# (space, punctuation, digits, uppercase letters and the backtick).
chars = list(map(chr, range(32, 97)))
stoi = {ch: ix for ix, ch in enumerate(chars)}
itos = dict(enumerate(chars))

# Sample from the untrained model; eval mode turns dropout off.
model.eval()
context = torch.zeros(1, 1, dtype=torch.long)  # one prompt holding only token id 0
sampled = model.generate(context, max_new_tokens=100, temperature=1.0)
generated = sampled[0].tolist()
# Ids missing from the vocabulary are rendered as '?'.
text = ''.join(itos.get(tok, '?') for tok in generated)
print("Before training (random noise):")
print(repr(text))

## 5. Comparing GPT to Earlier Architectures

In [None]:
# Architecture comparison table: one row per model built so far in the series.
# Row layout: (model, context mixing, aggregation, receptive field, parameter count).
architectures = [
    ("Bigram (Day 6)",        "None",         "None",    "1-gram",   "<1K"),
    ("MLP (Day 9)",           "None",         "Concat",  "fixed",    "~50K"),
    ("WaveNet (Day 15)",      "None",         "Hierarch","log(T)",   "~150K"),
    ("GPT (today)",           "Self-Attn",    "Residual","full T",   f"{params:,}"),
]

header = f"{'Model':<25} {'Context Mixing':<15} {'Aggregation':<12} {'Receptive Field':<18} {'Params':<12}"
print(header)
# Size the rule from the header so the two always line up.
print("-" * len(header))
for row in architectures:
    print(f"{row[0]:<25} {row[1]:<15} {row[2]:<12} {row[3]:<18} {row[4]:<12}")

# The attention in this model is causally masked, so a token sees only itself
# and earlier positions — never later ones.
print("\nKey insight: GPT's causal self-attention lets EVERY token attend to EVERY earlier token")
print("with learned, data-dependent weights — the most expressive mixing possible.")

## 6. Top-K and Temperature Sampling

In [None]:
import matplotlib.pyplot as plt

# Toy logits used to visualise how temperature and top-k reshape a distribution.
logits = torch.tensor([3.0, 2.0, 1.0, 0.5, 0.1, -0.5, -1.0, -2.0])

fig, axes = plt.subplots(2, 3, figsize=(14, 8))


def _plot_probs(ax, probs, title, color):
    """Draw one probability bar chart on ``ax`` with the shared axis styling."""
    ax.bar(range(len(probs)), probs.numpy(), color=color)
    ax.set_title(title)
    ax.set_xlabel('Token')
    ax.set_ylabel('Probability')
    ax.set_ylim(0, 1)


# Top row: divide the logits by the temperature before the softmax.
temps = [0.5, 1.0, 2.0]
for ax, temperature in zip(axes[0], temps):
    if temperature < 1:
        shape_label = "peaked"
    elif temperature == 1:
        shape_label = "balanced"
    else:
        shape_label = "flat"
    _plot_probs(
        ax,
        F.softmax(logits / temperature, dim=0),
        f'Temperature = {temperature}\n{shape_label}',
        'steelblue',
    )

# Bottom row: keep the k largest logits, push the rest to -inf, then softmax.
ks = [2, 4, 8]
for ax, k in zip(axes[1], ks):
    kth_largest = torch.topk(logits, k).values[-1]
    masked = logits.clone()
    masked[masked < kth_largest] = float('-inf')
    _plot_probs(
        ax,
        F.softmax(masked, dim=0),
        f'Top-K = {k}\n(zeros out {len(logits)-k} lowest tokens)',
        'tomato',
    )

plt.suptitle('Sampling Strategies: Temperature (top) vs Top-K (bottom)', y=1.01)
plt.tight_layout()
plt.show()

print("Temperature < 1: more deterministic (good for code/facts)")
print("Temperature > 1: more random (good for creative text)")
print("Top-K: only sample from K most likely tokens (prevents rare token sampling)")

---

**Building LLMs from Scratch** — [Day 19: GPT](https://omkarray.com/llm-day19.html) | [← Prev](llm_day18_transformer_block.ipynb) | [Next →](llm_day20_training_gpt.ipynb)