In [None]:
#@title üéß Download Narration Audio & Play Introduction
import os as _os
if not _os.path.exists("/content/narration"):
    !pip install -q gdown
    import gdown
    gdown.download(id="17rFuCNZUUY1xHrMq1WTamV-JWh_IDZe8", output="/content/narration.zip", quiet=False)
    !unzip -q /content/narration.zip -d /content/narration
    !rm /content/narration.zip
    print(f"Loaded {len(_os.listdir('/content/narration'))} narration segments")
else:
    print("Narration audio already loaded.")

from IPython.display import Audio, display
display(Audio("/content/narration/04_00_intro.mp3"))


In [None]:
#@title üéß Code Walkthrough: Data Tokenizer
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_02_data_tokenizer.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# üîß Setup: Run this cell first!
# Check GPU availability and install dependencies

import random
import sys

import numpy as np
import torch

# Query CUDA once, pick the device from the answer, then report what was found.
_cuda_ok = torch.cuda.is_available()
device = torch.device('cuda' if _cuda_ok else 'cpu')
if not _cuda_ok:
    print("‚ö†Ô∏è No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime ‚Üí Change runtime type ‚Üí GPU")
else:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"‚úÖ GPU available: {gpu_name}")
    print(f"   Memory: {gpu_bytes / 1e9:.1f} GB")

python_version = sys.version.split()[0]
print(f"\nüì¶ Python {python_version}")
print(f"üî• PyTorch {torch.__version__}")

# Seed every random number generator the notebook touches (Python, NumPy,
# PyTorch CPU, and all CUDA devices when present) for reproducibility.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if _cuda_ok:
    torch.cuda.manual_seed_all(SEED)

print(f"üé≤ Random seed set to {SEED}")

%matplotlib inline

In [None]:
#@title üéß Listen: Motivation
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_01_motivation.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title üéß Listen: Intuition
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_01_intuition.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


# Building a Tiny Language Model (Mini-GPT) -- Vizuara

---

## 1. Why Does This Matter?

You have now built every component of a Transformer language model from scratch: N-gram counting, word embeddings, self-attention, multi-head attention, positional encoding, and Transformer blocks. In this notebook, we bring it all together.

We will build, train, and generate text from a **complete GPT-style language model** -- a miniature version of the architecture behind GPT-2, GPT-3, and every modern LLM. The model will learn from Shakespeare's text and generate new passages in his style.

This is the culmination of the entire "Foundations of Language Modeling" journey: from counting word pairs to training a Transformer that can write coherent text.

**What you will build:**
- A complete mini-GPT model (~1M parameters)
- A character-level tokenizer
- The training loop with next-token prediction loss
- Text generation with temperature-controlled sampling
- Perplexity evaluation and training visualizations
- Everything runs on a free T4 GPU in under 10 minutes

Let us build a language model.

---

## 2. Building Intuition

A GPT-style language model is conceptually simple:

1. Take a sequence of tokens
2. Pass them through token embeddings + positional encoding
3. Pass through N stacked Transformer blocks (with causal masking)
4. Project the final hidden states to vocabulary logits
5. The loss is: at each position, how well did the model predict the *next* token?

The training objective -- **next-token prediction** -- is self-supervised. The training data IS the labels. Every token in a sentence serves as the prediction target for the tokens before it.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import time
import math

# Let's load some text data -- Shakespeare's works
# On Colab, we can download it directly

import urllib.request

# Tiny Shakespeare corpus from the char-rnn repository.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
try:
    # The context manager closes the connection even if the read or decode
    # fails; the timeout keeps an unreachable host from hanging the notebook.
    with urllib.request.urlopen(url, timeout=30) as response:
        text = response.read().decode('utf-8')
except Exception as download_error:
    # Best-effort fallback so the notebook still runs offline. Catching
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through,
    # and the reason for the fallback is reported instead of hidden.
    print(f"Download failed ({download_error!r}); using a small built-in sample instead.")
    # NOTE: this sample has a much smaller character set than the full corpus
    # (for example it contains no 'H'), so later cells that encode fixed
    # strings such as 'Hello' will raise KeyError while the fallback is active.
    text = """First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
""" * 100

print(f"Text length: {len(text):,} characters")
print(f"First 200 characters:")
print(text[:200])

In [None]:
# Build a character-level tokenizer
# GPT uses BPE (byte-pair encoding), but character-level is simpler to understand

# Vocabulary: every distinct character in the corpus, sorted so ids are stable.
chars = sorted(set(text))
vocab_size = len(chars)

# Build the id -> character table first, then invert it for character -> id lookups.
idx2char = dict(enumerate(chars))
char2idx = {ch: i for i, ch in idx2char.items()}

# Encode/decode functions
def encode(s):
    """Convert a string into a list of integer token ids, one per character.

    Args:
        s: text to tokenize.

    Returns:
        list of ints looked up in ``char2idx``.

    Raises:
        KeyError: if ``s`` contains a character that is not in ``char2idx``,
            i.e. one that never occurred in the text the vocabulary was built from.
    """
    try:
        return [char2idx[c] for c in s]
    except KeyError as err:
        # Same exception type as a plain dict lookup, but with a message that
        # says what went wrong instead of just echoing the character.
        raise KeyError(f"character {err.args[0]!r} is not in the vocabulary") from None

def decode(indices):
    """Map a sequence of integer token ids back to the string they spell."""
    pieces = (idx2char[token_id] for token_id in indices)
    return ''.join(pieces)

print(f"Vocabulary size: {vocab_size}")
print(f"Characters: {''.join(chars[:30])}...")

# Round-trip a short string through the tokenizer as a sanity check.
hello_ids = encode('Hello')
print(f"\nEncoding 'Hello': {hello_ids}")
print(f"Decoding back: '{decode(hello_ids)}'")

# Encode the entire dataset into a single 1-D tensor of token ids
data = torch.tensor(encode(text), dtype=torch.long)
print(f"\nFull dataset: {len(data):,} tokens")

# Train/val split (90/10): the final 10% of the tokens are held out
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]
print(f"Train: {len(train_data):,} tokens")
print(f"Val:   {len(val_data):,} tokens")

In [None]:
#@title üéß Listen: Mathematics
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_04_mathematics.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 3. The Mathematics

The training loss is **cross-entropy** averaged over all positions:

$$\mathcal{L} = -\frac{1}{T}\sum_{t=1}^{T} \log P(w_t \mid w_1, w_2, \ldots, w_{t-1})$$

Where $P(w_t \mid w_{1:t-1})$ comes from softmax over the model's logits at position $t$.

The key insight: the causal mask ensures that position $t$ can only attend to positions $1, \ldots, t$ -- never to later ones -- so its prediction of the next token $w_{t+1}$ is a genuine prediction about the future.

**Perplexity** is the exponentiated average loss:

$$\text{Perplexity} = \exp(\mathcal{L})$$

A perplexity of $k$ means the model is as uncertain as if it were choosing uniformly among $k$ options at each step.

In [None]:
#@title üéß Code Walkthrough: Loss Example
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_05_loss_example.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Let's understand the loss with a concrete example

# Suppose our model processes "Hello" and we want to compute the loss
example = "Hello"
example_tokens = torch.tensor(encode(example))

# Inputs are every token but the last; targets are every token but the first.
input_ids = example_tokens[:-1].tolist()
target_ids = example_tokens[1:].tolist()
input_chars = [idx2char[tok] for tok in input_ids]
target_chars = [idx2char[tok] for tok in target_ids]
print(f"Input tokens:  {input_ids} ‚Üí {input_chars}")
print(f"Target tokens: {target_ids} ‚Üí {target_chars}")

print(f"\nAt each position, the model predicts the NEXT character:")
# Pair each growing prefix with the character that follows it.
for i, target in enumerate(example[1:]):
    context = example[:i+1]
    print(f"  Given '{context}' ‚Üí predict '{target}'")

# Reference points: a uniform guess over the vocabulary vs. a perfect model.
uniform_loss = -np.log(1/vocab_size)
print(f"\nThe loss measures how well these predictions match reality.")
print(f"A random model with vocab_size={vocab_size} has:")
print(f"  Loss = -log(1/{vocab_size}) = {uniform_loss:.2f}")
print(f"  Perplexity = {vocab_size}")
print(f"\nA perfect model has Loss = 0, Perplexity = 1")

In [None]:
#@title üéß Transition: Transition Build
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_06_transition_build.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 4. Let's Build It -- Component by Component

In [None]:
#@title üéß Code Walkthrough: Hyperparameters
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_07_hyperparameters.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Hyperparameters for our mini-GPT
# --- Data ---
# How many sequences to process in parallel
BATCH_SIZE = 64
# Maximum context length
BLOCK_SIZE = 128

# --- Model ---
# Embedding dimension
D_MODEL = 128
# Number of attention heads
NUM_HEADS = 4
# Number of transformer blocks
NUM_LAYERS = 4
# Dropout rate
DROPOUT = 0.1

# --- Optimisation and evaluation ---
# Adam learning rate
LEARNING_RATE = 3e-4
# Training iterations
MAX_ITERS = 3000
# Evaluate every N iterations
EVAL_INTERVAL = 300
# Number of batches for evaluation
EVAL_ITERS = 100

# Device is kept as a plain string here; tensors and the model are moved with .to(device).
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

In [None]:
#@title üéß Code Walkthrough: Data Batching
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_08_data_batching.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Data loading: create random batches of (context, target) pairs

def get_batch(split):
    """
    Sample a random mini-batch of contiguous windows from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        x: (BATCH_SIZE, BLOCK_SIZE) input tokens
        y: (BATCH_SIZE, BLOCK_SIZE) target tokens -- the same windows, one position later

    Both tensors are moved to ``device`` before being returned.
    """
    source = val_data
    if split == 'train':
        source = train_data

    # One random window start per sequence; the upper bound leaves room for the
    # extra token that the shifted target window needs.
    starts = torch.randint(len(source) - BLOCK_SIZE, (BATCH_SIZE,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + BLOCK_SIZE])
        targets.append(source[start + 1:start + BLOCK_SIZE + 1])

    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

# Test
xb, yb = get_batch('train')
print(f"Input batch shape:  {xb.shape}")
print(f"Target batch shape: {yb.shape}")

# Decode the first 30 tokens of the first sequence so the one-position shift
# between input and target is visible as text.
preview_input = decode(xb[0, :30].tolist())
preview_target = decode(yb[0, :30].tolist())
print(f"\nFirst sequence (first 30 chars):")
print(f"  Input:  '{preview_input}'")
print(f"  Target: '{preview_target}'")
print(f"\nNotice: target is input shifted right by 1 position.")

In [None]:
#@title üéß Listen: Model Intro
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_09_model_intro.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


Now let us build the complete model.

In [None]:
#@title üéß Code Walkthrough: 10a Model Components
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_10a_model_components.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title üéß Code Walkthrough: 10b Full Model
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_10b_full_model.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title üéß Code Walkthrough: Training Loop
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_16_training_loop.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
class MultiHeadSelfAttention(nn.Module):
    """Multi-head self-attention with causal masking.

    Args:
        d_model: width of the input and output features; must be divisible by
            ``num_heads``.
        num_heads: number of attention heads; each head works on
            ``d_model // num_heads`` features.
        block_size: longest sequence length the precomputed causal mask supports.
        dropout: dropout probability applied to the attention weights and to
            the output projection.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads, block_size, dropout=0.1):
        super().__init__()
        # Fail at construction time with a clear message; otherwise the head
        # reshape in forward() fails later with an opaque size-mismatch error.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # A single projection produces Q, K and V together; it is split in forward().
        self.qkv_proj = nn.Linear(d_model, 3 * d_model, bias=False)
        self.out_proj = nn.Linear(d_model, d_model, bias=False)

        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)

        # Causal mask: prevent attending to future positions.
        # True above the diagonal marks the (query, key) pairs to block.
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(block_size, block_size), diagonal=1).bool()
        )

    def forward(self, x):
        """Apply causal multi-head self-attention.

        Args:
            x: tensor of shape (B, T, C) where C equals ``d_model`` and T is at
                most ``block_size``.

        Returns:
            Tensor of shape (B, T, C).

        Raises:
            ValueError: if T exceeds the size of the causal mask.
        """
        B, T, C = x.shape
        if T > self.mask.size(0):
            raise ValueError(
                f"sequence length {T} exceeds block_size {self.mask.size(0)}"
            )

        # Compute Q, K, V in one projection
        qkv = self.qkv_proj(x)
        Q, K, V = qkv.chunk(3, dim=-1)

        # Reshape for multi-head: (B, T, C) -> (B, num_heads, T, d_k)
        Q = Q.view(B, T, self.num_heads, self.d_k).transpose(1, 2)
        K = K.view(B, T, self.num_heads, self.d_k).transpose(1, 2)
        V = V.view(B, T, self.num_heads, self.d_k).transpose(1, 2)

        # Scaled dot-product attention; masked entries become -inf so softmax
        # assigns them zero weight.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        scores = scores.masked_fill(self.mask[:T, :T], float('-inf'))
        weights = F.softmax(scores, dim=-1)
        weights = self.attn_dropout(weights)

        out = torch.matmul(weights, V)

        # Concatenate heads: (B, num_heads, T, d_k) -> (B, T, C)
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        out = self.resid_dropout(self.out_proj(out))

        return out


class MiniGPTBlock(nn.Module):
    """Pre-norm Transformer block: causal self-attention, then a position-wise
    feed-forward network, each wrapped in a residual connection."""

    def __init__(self, d_model, num_heads, block_size, dropout=0.1):
        super().__init__()
        hidden = 4 * d_model  # feed-forward expansion width

        # Attention sub-layer (its LayerNorm is applied before it, see forward).
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = MultiHeadSelfAttention(d_model, num_heads, block_size, dropout)

        # Feed-forward sub-layer: expand, apply GELU, project back, dropout.
        self.ln2 = nn.LayerNorm(d_model)
        ffn_layers = [
            nn.Linear(d_model, hidden),
            nn.GELU(),
            nn.Linear(hidden, d_model),
            nn.Dropout(dropout),
        ]
        self.ffn = nn.Sequential(*ffn_layers)

    def forward(self, x):
        # Each sub-layer normalises its input first and adds its output back
        # onto the residual stream (pre-norm, as in GPT-2 and later).
        attn_out = self.attn(self.ln1(x))
        x = x + attn_out
        ffn_out = self.ffn(self.ln2(x))
        return x + ffn_out


class MiniGPT(nn.Module):
    """
    A miniature GPT language model.

    Architecture:
    - Token embedding + positional embedding
    - N stacked Transformer blocks (with causal attention)
    - Final layer norm + linear projection to vocabulary

    Args:
        vocab_size: number of distinct tokens.
        d_model: embedding / hidden width.
        num_heads: attention heads per block.
        num_layers: number of stacked ``MiniGPTBlock`` modules.
        block_size: maximum sequence length (size of the positional table).
        dropout: dropout probability used throughout the model.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers,
                 block_size, dropout=0.1):
        super().__init__()
        self.block_size = block_size

        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(block_size, d_model)
        self.drop = nn.Dropout(dropout)

        self.blocks = nn.Sequential(*[
            MiniGPTBlock(d_model, num_heads, block_size, dropout)
            for _ in range(num_layers)
        ])

        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size, bias=False)

        # Weight tying: share weights between token embedding and output projection
        self.head.weight = self.token_emb.weight

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear and Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """
        Args:
            idx: (batch_size, seq_len) token indices, with seq_len <= block_size
            targets: optional (batch_size, seq_len) target indices
        Returns:
            logits: (batch_size, seq_len, vocab_size)
            loss: scalar cross-entropy loss (if targets provided), else None
        Raises:
            ValueError: if seq_len exceeds ``block_size``.
        """
        B, T = idx.shape
        # The positional table only has block_size rows; reject longer inputs
        # with a clear message instead of an out-of-range embedding lookup.
        if T > self.block_size:
            raise ValueError(
                f"sequence length {T} exceeds block_size {self.block_size}"
            )

        # Token + position embeddings
        tok_emb = self.token_emb(idx)
        pos_emb = self.pos_emb(torch.arange(T, device=idx.device))
        x = self.drop(tok_emb + pos_emb)

        # Transformer blocks
        x = self.blocks(x)

        # Final norm + project to vocab
        x = self.ln_f(x)
        logits = self.head(x)

        # Compute loss if targets are provided
        loss = None
        if targets is not None:
            # Flatten batch and time so every position is one classification example.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Autoregressive text generation.

        At each step: predict next token -> sample -> append -> repeat.

        Sampling always runs in eval mode so dropout does not perturb the
        predicted distribution; the model's previous train/eval mode is
        restored before returning.

        Args:
            idx: (batch_size, seq_len) tensor of starting token indices.
            max_new_tokens: number of tokens to append.
            temperature: positive divisor applied to the logits before softmax.
            top_k: if given, sample only from the ``top_k`` highest-scoring tokens.

        Returns:
            (batch_size, seq_len + max_new_tokens) tensor: the input followed
            by the sampled tokens.

        Raises:
            ValueError: if ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")

        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # Crop to block_size if sequence is too long
                idx_cond = idx[:, -self.block_size:]

                # Forward pass
                logits, _ = self(idx_cond)

                # Get logits for the last position only
                logits = logits[:, -1, :] / temperature

                # Optional top-k filtering: everything below the k-th largest
                # logit gets -inf, i.e. zero probability after softmax.
                if top_k is not None:
                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                    logits[logits < v[:, [-1]]] = -float('Inf')

                # Sample from the distribution
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)

                # Append sampled token
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            # Leave the model in whatever mode the caller had it in.
            self.train(was_training)

        return idx


# Create the model
# Collect the architecture settings in one place, then build the model and move it to the device.
model_config = dict(
    vocab_size=vocab_size,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    block_size=BLOCK_SIZE,
    dropout=DROPOUT,
)
model = MiniGPT(**model_config).to(device)

# Count parameters
total_params = sum(param.numel() for param in model.parameters())

# Emit the summary as one block of text (same lines as printing each separately).
summary_lines = [
    f"MiniGPT Model Summary:",
    f"  Vocabulary size:  {vocab_size}",
    f"  d_model:          {D_MODEL}",
    f"  Attention heads:  {NUM_HEADS}",
    f"  Transformer layers: {NUM_LAYERS}",
    f"  Context length:   {BLOCK_SIZE}",
    f"  Total parameters: {total_params:,}",
    f"\n  This is {total_params/1e6:.1f}M parameters ‚Äî about 100x smaller than GPT-2 Small (124M)",
]
print("\n".join(summary_lines))

In [None]:
#@title üéß Before You Start: Todo1 Intro
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_11_todo1_intro.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 5. Your Turn

**TODO 1: Generate Text BEFORE Training**

Let us see what the untrained model produces -- it should be complete gibberish.

In [None]:
# TODO: Generate text from the untrained model
#
# Instructions:
# 1. Create a starting context (e.g., a newline character)
# 2. Call model.generate() with max_new_tokens=200
# 3. Decode and print the output
# 4. It should be random garbage -- the model hasn't learned anything yet

# YOUR CODE HERE
# context = torch.zeros((1, 1), dtype=torch.long, device=device)
# generated = model.generate(context, max_new_tokens=200)
# print("Untrained model output:")
# print(decode(generated[0].tolist()))

In [None]:
#@title üéß Before You Start: Todo1 After
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_12_todo1_after.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title üéß Before You Start: Todo2 Intro
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_13_todo2_intro.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


**TODO 2: Implement Perplexity Tracking**

Write a function that computes perplexity during training.

In [None]:
# TODO: Implement perplexity computation
#
# Recall: Perplexity = exp(average_loss)
#
# Instructions:
# 1. Average the loss over EVAL_ITERS random batches
# 2. Compute perplexity = exp(average_loss)
# 3. Report both train and val perplexity

# YOUR CODE HERE
# @torch.no_grad()
# def estimate_perplexity():
#     model.eval()
#     results = {}
#     for split in ['train', 'val']:
#         losses = []
#         for _ in range(EVAL_ITERS):
#             X, Y = get_batch(split)
#             _, loss = model(X, Y)
#             losses.append(loss.item())
#         avg_loss = np.mean(losses)
#         results[split] = {
#             'loss': avg_loss,
#             'perplexity': np.exp(avg_loss)
#         }
#     model.train()
#     return results

In [None]:
#@title üéß Before You Start: Todo2 After
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_14_todo2_after.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title üéß Transition: Training Intro
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_15_training_intro.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 6. Putting It All Together

Time to train the model.

In [None]:
@torch.no_grad()
def estimate_loss():
    """Return the mean loss over EVAL_ITERS random batches for each split.

    The model is switched to eval mode while measuring and put back into
    training mode before returning.

    Returns:
        dict with keys 'train' and 'val', each mapping to a Python float.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(EVAL_ITERS)
        for k in range(EVAL_ITERS):
            _, batch_loss = model(*get_batch(split))
            batch_losses[k] = batch_loss.item()
        averages[split] = batch_losses.mean().item()
    model.train()
    return averages

# Training loop
# AdamW over every model parameter.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Evaluation history: one entry is appended to each list per evaluation.
train_losses, val_losses = [], []
train_perplexities, val_perplexities = [], []

table_header = f"{'Iter':>6s} | {'Train Loss':>10s} | {'Val Loss':>10s} | {'Train PPL':>10s} | {'Val PPL':>10s}"
print(f"Training MiniGPT for {MAX_ITERS} iterations...")
print(table_header)
print("-" * 65)

start_time = time.time()
last_iter = MAX_ITERS - 1

for iter_num in range(MAX_ITERS):
    # Evaluate on the regular schedule, and always on the final iteration.
    if iter_num == last_iter or iter_num % EVAL_INTERVAL == 0:
        losses = estimate_loss()
        # Perplexity is the exponential of the mean cross-entropy loss.
        train_ppl, val_ppl = math.exp(losses['train']), math.exp(losses['val'])

        train_losses.append(losses['train'])
        val_losses.append(losses['val'])
        train_perplexities.append(train_ppl)
        val_perplexities.append(val_ppl)

        elapsed = time.time() - start_time
        print(f"{iter_num:6d} | {losses['train']:10.4f} | {losses['val']:10.4f} | "
              f"{train_ppl:10.1f} | {val_ppl:10.1f}  [{elapsed:.0f}s]")

    # One optimisation step on a fresh random training batch.
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    # Clip the global gradient norm to 1.0 (standard practice for Transformers).
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

total_time = time.time() - start_time
print(f"\nTraining complete in {total_time:.1f} seconds")
print(f"Final train loss: {train_losses[-1]:.4f} (perplexity: {train_perplexities[-1]:.1f})")
print(f"Final val loss:   {val_losses[-1]:.4f} (perplexity: {val_perplexities[-1]:.1f})")

In [None]:
#@title üéß Listen: Results Intro
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_17_results_intro.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 7. Training and Results

In [None]:
#@title üéß What to Look For: Loss Perplexity Viz
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_18_loss_perplexity_viz.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Visualize training progress

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# x positions for the curves. The training loop evaluates every EVAL_INTERVAL
# iterations plus once at the final iteration (MAX_ITERS - 1), so rebuild those
# exact iteration numbers rather than assuming evenly spaced points.
eval_steps = [i for i in range(MAX_ITERS) if i % EVAL_INTERVAL == 0 or i == MAX_ITERS - 1]
if len(eval_steps) >= len(train_losses):
    iters = eval_steps[:len(train_losses)]
else:
    # The schedule no longer matches the recorded history (e.g. hyperparameters
    # were edited after training); fall back to even spacing so the plot still renders.
    iters = [i * EVAL_INTERVAL for i in range(len(train_losses))]

# Loss curves
axes[0].plot(iters, train_losses, 'b-o', linewidth=2, markersize=5, label='Train')
axes[0].plot(iters, val_losses, 'r-o', linewidth=2, markersize=5, label='Validation')
axes[0].set_xlabel('Iteration', fontsize=12)
axes[0].set_ylabel('Cross-Entropy Loss', fontsize=12)
axes[0].set_title('Training and Validation Loss', fontsize=14, fontweight='bold')
axes[0].legend(fontsize=12)
axes[0].grid(alpha=0.3)

# Perplexity curves, with the uniform-guess baseline (perplexity == vocab_size) for reference
axes[1].plot(iters, train_perplexities, 'b-o', linewidth=2, markersize=5, label='Train')
axes[1].plot(iters, val_perplexities, 'r-o', linewidth=2, markersize=5, label='Validation')
axes[1].axhline(y=vocab_size, color='gray', linestyle='--', alpha=0.5, label=f'Random ({vocab_size})')
axes[1].set_xlabel('Iteration', fontsize=12)
axes[1].set_ylabel('Perplexity', fontsize=12)
axes[1].set_title('Perplexity (lower = better)', fontsize=14, fontweight='bold')
axes[1].legend(fontsize=12)
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Random baseline perplexity: {vocab_size} (choosing uniformly among {vocab_size} chars)")
print(f"Our model perplexity: {val_perplexities[-1]:.1f} ‚Äî it is {vocab_size / val_perplexities[-1]:.1f}x better than random!")

In [None]:
#@title üéß Code Walkthrough: Generation Temp
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_19_generation_temp.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title üéß Wrap-Up: Closing
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_25_closing.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Generate text with different temperatures

banner = "=" * 70
print(banner)
print("TEXT GENERATION FROM TRAINED MINI-GPT")
print(banner)

temperatures = [0.5, 0.8, 1.0, 1.5]
prompts = ["\nFirst Citizen:\n", "\nWhat is ", "\nTo be or "]

for prompt in prompts:
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Prompt: '{prompt.strip()}'")
    print(rule)

    # Encode the prompt once as a batch of size 1; it is reused for every temperature.
    prompt_ids = encode(prompt)
    context = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)
    shown_chars = len(prompt) + 150

    for temp in temperatures:
        generated = model.generate(context, max_new_tokens=150, temperature=temp, top_k=20)
        output_text = decode(generated[0].tolist())

        print(f"\n--- Temperature = {temp} ---")
        print(output_text[:shown_chars])

In [None]:
#@title üéß What to Look For: Temperature Viz
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_20_temperature_viz.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Compare generation quality as a function of training progress
# (If we had saved checkpoints, we would compare here.
#  Instead, let's show the effect of temperature visually.)

fig, ax = plt.subplots(figsize=(12, 6))

# Generate multiple samples at different temperatures
temps = np.linspace(0.3, 2.0, 8)
diversities = []

# Every sample starts from the same single-newline prompt.
context = torch.tensor(encode("\n"), dtype=torch.long, device=device).unsqueeze(0)

for temp in temps:
    samples = [
        decode(model.generate(context, max_new_tokens=50, temperature=temp)[0].tolist())
        for _ in range(10)
    ]

    # Measure diversity: for every unordered pair of samples, the fraction of
    # aligned character positions at which the two strings differ.
    diffs = []
    for i, s1 in enumerate(samples):
        for s2 in samples[i + 1:]:
            min_len = min(len(s1), len(s2))
            mismatches = sum(a != b for a, b in zip(s1[:min_len], s2[:min_len]))
            diffs.append(mismatches / min_len)
    diversities.append(np.mean(diffs))

ax.plot(temps, diversities, 'o-', color='#1E88E5', linewidth=2, markersize=8)
ax.set_xlabel('Temperature', fontsize=13)
ax.set_ylabel('Output Diversity (pairwise difference)', fontsize=13)
ax.set_title('Temperature Controls Randomness in Generation', fontsize=14, fontweight='bold')
ax.axvline(x=1.0, color='gray', linestyle='--', alpha=0.5, label='Default (1.0)')
ax.legend(fontsize=12)
ax.grid(alpha=0.3)

# Annotate the low- and high-temperature regions
region_notes = (
    ('More deterministic\n(repetitive)', (0.5, diversities[1])),
    ('More random\n(creative but noisy)', (1.7, diversities[-2])),
)
for note_text, note_xy in region_notes:
    ax.annotate(note_text, xy=note_xy, fontsize=10, color='#E53935', ha='center')

plt.tight_layout()
plt.show()

In [None]:
#@title üéß Listen: Final Output Intro
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_21_final_output_intro.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 8. Final Output

In [None]:
#@title üéß Code Walkthrough: Journey Recap
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_22_journey_recap.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# The complete journey: from N-grams to Mini-GPT

# Banner, then a plain-text recap of the four notebooks in the series.
recap_banner = "=" * 70
journey_recap = """
Notebook 1: N-gram Language Models
  ‚Üí Predicted the next word by COUNTING word pair frequencies.
  ‚Üí Simple and fast, but zero probability for unseen pairs.
  ‚Üí No notion of word similarity.

Notebook 2: Neural Language Models
  ‚Üí Replaced counting with LEARNED embeddings and neural networks.
  ‚Üí Similar words get similar vectors (cat ‚âà dog in vector space).
  ‚Üí RNNs extended context but hit the vanishing gradient wall.

Notebook 3: Self-Attention & Transformers
  ‚Üí Every word ATTENDS to every other word directly.
  ‚Üí Multi-head attention captures multiple relationship types.
  ‚Üí Full parallelism, no vanishing gradients.

Notebook 4: Building a Tiny Language Model (this notebook)
  ‚Üí Put it ALL together into a working Mini-GPT.
  ‚Üí Trained on Shakespeare with next-token prediction.
  ‚Üí Generated coherent text from a ~1M parameter model.

The core insight: NEXT-TOKEN PREDICTION is all you need.
  ‚Üí The training data IS the labels (self-supervised).
  ‚Üí Scale up the model + data ‚Üí GPT-2 ‚Üí GPT-3 ‚Üí GPT-4.
  ‚Üí The same architecture powers every modern LLM.
"""

print(recap_banner)
print("THE COMPLETE JOURNEY: FOUNDATIONS OF LANGUAGE MODELING")
print(recap_banner)

print(journey_recap)

In [None]:
#@title üéß What to Look For: Comparison Viz
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_23_comparison_viz.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Final comparison: N-gram vs Neural LM vs our Mini-GPT
# generating from the same prompt

prompt = "the cat "

# N-gram style: just pick from bigram statistics
ngram_output = prompt + "sat on the mat"  # hand-crafted from bigram table

# Our Mini-GPT: sample a continuation of the same prompt
context = torch.tensor(encode(prompt), dtype=torch.long, device=device).unsqueeze(0)
gpt_generated = model.generate(context, max_new_tokens=60, temperature=0.8, top_k=20)
gpt_output = decode(gpt_generated[0].tolist())

# Display both continuations so the side-by-side comparison is actually visible
# in the cell output (repr() keeps newlines in the sample on one line).
print(f"Prompt: {prompt!r}")
print(f"  N-gram (hand-crafted): {ngram_output!r}")
print(f"  Mini-GPT (sampled):    {gpt_output!r}")

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: Architecture evolution
architectures = ['N-gram\n(1990s)', 'Neural LM\n(2003)', 'RNN/LSTM\n(2010s)', 'Transformer\n(2017+)']
capabilities = [2, 4, 6, 10]  # Relative capability score (illustrative, not measured)
colors = ['#FFCDD2', '#FFE0B2', '#C8E6C9', '#BBDEFB']

bars = axes[0].bar(architectures, capabilities, color=colors, edgecolor='white', linewidth=2)
axes[0].set_ylabel('Relative Capability', fontsize=12)
axes[0].set_title('Evolution of Language Modeling', fontsize=14, fontweight='bold')
axes[0].set_ylim(0, 12)

# One-word summary of each approach, drawn just above its bar
labels = ['Count', 'Learn', 'Remember', 'Attend']
for bar, label in zip(bars, labels):
    axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.3,
                 label, ha='center', fontsize=11, fontweight='bold')

# Right: Parameter scaling (all values in millions of parameters)
models_scale = ['Our Mini-GPT', 'GPT-2\nSmall', 'GPT-2\nLarge', 'GPT-3', 'GPT-4\n(est.)']
params = [total_params/1e6, 124, 774, 175000, 1800000]

axes[1].bar(models_scale, params, color=['#4CAF50', '#2196F3', '#2196F3', '#FF9800', '#F44336'],
            edgecolor='white', linewidth=2)
axes[1].set_ylabel('Parameters (Millions)', fontsize=12)
axes[1].set_title('Model Size Scaling', fontsize=14, fontweight='bold')
# Log scale: the values span several orders of magnitude
axes[1].set_yscale('log')

for i, (name, p) in enumerate(zip(models_scale, params)):
    # Below 1000M show millions; from 1000M upward show billions
    label = f'{p:.1f}M' if p < 1000 else f'{p/1000:.0f}B'
    axes[1].text(i, p * 1.5, label, ha='center', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()

print(f"\nOur model: {total_params:,} parameters ({total_params/1e6:.1f}M)")
print(f"GPT-3: 175,000,000,000 parameters (175B)")
print(f"Scale difference: {175e9/total_params:.0f}x")
print(f"\nSame architecture. Same training objective. Just... bigger.")
print(f"\nSame architecture. Same training objective. Just... bigger.")

In [None]:
#@title üéß Listen: Reflection Next Steps
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/04_24_reflection_next_steps.mp3"
# Guard clause: without the downloaded clip there is nothing to play, so point
# the reader at the setup cell instead.
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 9. Reflection and Next Steps

**What we built:**

1. **A complete GPT-style language model** with token embeddings, positional embeddings, multi-head causal self-attention, feed-forward layers, residual connections, and layer normalization.

2. **The training pipeline:** character-level tokenization, batch data loading, next-token prediction loss, AdamW optimizer with gradient clipping.

3. **Text generation:** autoregressive sampling with temperature control and top-k filtering.

**Key takeaways from the entire course:**

- **N-grams** (Notebook 1): Language modeling as counting. Simple but brittle -- zero probability for unseen patterns, no word similarity.

- **Neural LMs** (Notebook 2): Replace counting with learning. Embeddings capture similarity, but fixed context windows and sequential processing limit capability.

- **Transformers** (Notebook 3): Self-attention lets every word attend to every other word in parallel. Multi-head attention captures multiple relationship types simultaneously.

- **Mini-GPT** (Notebook 4): Put it all together. The same architecture, scaled up 100,000x and trained on internet-scale data, gives you GPT-3/4, Claude, and Gemini.

**Where to go from here:**
- **Self-Attention from First Principles** (next pod): Dive deeper into the mathematics of attention
- **Building a Full GPT from Scratch** (later pod): Scale up with BPE tokenization, larger datasets
- **Training Pipeline Engineering**: Mixed precision, distributed training, data loading

In [None]:
# Closing banner, emitted as one block of text (same lines as separate print calls).
closing_banner = "=" * 70
closing_lines = [
    closing_banner,
    "  COURSE COMPLETE: Foundations of Language Modeling",
    "",
    "  You traced the complete journey:",
    "    Counting ‚Üí Learning ‚Üí Attending ‚Üí Building",
    "",
    "  You now understand the foundations on which",
    "  every modern LLM is built. Well done!",
    closing_banner,
]
print("\n".join(closing_lines))