# Model Comparison: The Evolution

*From amoeba to specialized mammal.*

---

This notebook loads all trained models from the Lil Transformy sequence and compares their outputs on the same prompts. The complete evolutionary journey.

**The lineage:**
- 03: Bigram — the amoeba
- 04: + Attention — eyes
- 05: + Position — knows where it is
- 06: + FFN — can think
- 07: + Residuals & LayerNorm — spine
- 08: + Stacked blocks — legs (crawled onto land)
- 09: + Multi-head — bigger brain (mammal)
- 10: + MoE — specialized brain regions

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from pathlib import Path
import json
import math

# Seed the global RNG so sampled generations can be repeated
torch.manual_seed(42)

# Choose the compute device, preferring CUDA, then Apple MPS, then CPU
device = (
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)
print(f"Using device: {device}")

Using device: mps


In [2]:
from transformers import GPT2TokenizerFast

class LilTokenizer:
    """Compact tokenizer for Lil Transformy.

    Text is first tokenized with the pretrained GPT-2 tokenizer; each GPT-2
    token id is then remapped to a "compact" id through ``gpt2_to_compact``.
    Decoding applies ``compact_to_gpt2`` in the opposite direction. Ids 0, 1
    and 2 are reserved for padding, unknown tokens and end-of-sequence.
    """

    def __init__(self, gpt2_to_compact, compact_to_gpt2, vocab_size):
        """Store the id mappings and load the underlying GPT-2 tokenizer.

        Args:
            gpt2_to_compact: dict mapping GPT-2 token id -> compact id.
            compact_to_gpt2: dict mapping compact id -> GPT-2 token id.
            vocab_size: size of the compact vocabulary (what ``len()`` reports).
        """
        self.gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
        self.gpt2_to_compact = gpt2_to_compact
        self.compact_to_gpt2 = compact_to_gpt2
        self.vocab_size = vocab_size
        self.pad_id = 0
        self.unk_id = 1
        self.eos_id = 2

    def encode(self, text, add_eos=True):
        """Return the compact token ids for ``text``.

        GPT-2 ids that have no compact equivalent are replaced by ``unk_id``.
        When ``add_eos`` is true, ``eos_id`` is appended to the result.
        """
        gpt2_tokens = self.gpt2_tokenizer.encode(text)
        compact_tokens = [self.gpt2_to_compact.get(t, self.unk_id) for t in gpt2_tokens]
        if add_eos:
            compact_tokens.append(self.eos_id)
        return compact_tokens

    def decode(self, token_ids):
        """Return the text for an iterable of compact token ids.

        The special ids (pad, unk, eos) and any id missing from
        ``compact_to_gpt2`` are skipped silently.
        """
        # A tuple keeps the original equality-based membership test.
        special_ids = (self.pad_id, self.unk_id, self.eos_id)
        gpt2_tokens = [
            self.compact_to_gpt2[tid]
            for tid in token_ids
            if tid not in special_ids and tid in self.compact_to_gpt2
        ]
        return self.gpt2_tokenizer.decode(gpt2_tokens)

    def __len__(self):
        """Return the compact vocabulary size."""
        return self.vocab_size

    @classmethod
    def load(cls, path):
        """Build a tokenizer from a JSON config file.

        The file must contain ``gpt2_to_compact``, ``compact_to_gpt2`` and
        ``vocab_size``. JSON object keys are always strings, so the keys of
        both mappings are converted back to ``int`` here.
        """
        # Read as UTF-8 explicitly; the default text encoding is platform dependent.
        with open(path, 'r', encoding='utf-8') as f:
            config = json.load(f)
        gpt2_to_compact = {int(k): v for k, v in config['gpt2_to_compact'].items()}
        compact_to_gpt2 = {int(k): v for k, v in config['compact_to_gpt2'].items()}
        return cls(gpt2_to_compact, compact_to_gpt2, config['vocab_size'])


# Rebuild the compact tokenizer from its saved JSON config and record its vocabulary size
tokenizer = LilTokenizer.load('tokenizer/tokenizer.json')
VOCAB_SIZE = tokenizer.vocab_size
print(f"Vocabulary size: {VOCAB_SIZE:,}")

Vocabulary size: 4,096


---

## Model Definitions

We need to redefine each model architecture here so that its saved weights can be loaded. This section will grow as we add more notebooks.

In [3]:
# === 03: Bigram ===

class BigramLM(nn.Module):
    """Notebook 03: the simplest autoregressive model.

    Every position predicts its successor from its own token alone:
    an embedding lookup followed by a linear projection to vocabulary logits.
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.unembed = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map token ids to next-token logits (one extra trailing vocab dimension)."""
        embedded = self.embedding(x)
        return self.unembed(embedded)

    def generate(self, prompt_tokens, max_new_tokens=50, temperature=1.0):
        """Sample up to ``max_new_tokens`` token ids following ``prompt_tokens``.

        Puts the model in eval mode. Sampling stops early once token id 2 (EOS)
        is drawn; that token is included in the returned list. Only the newly
        generated ids are returned, not the prompt.
        """
        eos_id = 2  # EOS
        self.eval()
        device = next(self.parameters()).device
        history = list(prompt_tokens)
        prompt_len = len(history)

        with torch.no_grad():
            for _ in range(max_new_tokens):
                # A bigram model conditions on the most recent token only
                last = torch.tensor([[history[-1]]], device=device)
                scaled = self.forward(last)[0, 0] / temperature
                probs = torch.softmax(scaled, dim=-1)
                sampled = torch.multinomial(probs, num_samples=1).item()
                history.append(sampled)
                if sampled == eos_id:
                    break

        return history[prompt_len:]

In [4]:
# === 04: Attention ===

class CausalSelfAttention(nn.Module):
    """Single-head causal self-attention.

    Queries, keys, values and the output are bias-free linear projections of
    width ``d_model``. A boolean upper-triangular mask (True strictly above the
    diagonal) hides future positions; it supports sequences of at most
    ``max_seq_len`` tokens.
    """

    def __init__(self, d_model, max_seq_len=256):
        super().__init__()
        self.d_model = d_model
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)
        self.W_o = nn.Linear(d_model, d_model, bias=False)

        # Registered as a buffer so it follows the module across devices and
        # is stored in the state dict under the name 'mask'.
        mask = torch.triu(torch.ones(max_seq_len, max_seq_len), diagonal=1).bool()
        self.register_buffer('mask', mask)
        self.scale = math.sqrt(d_model)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, C); returns a tensor of the same shape.

        T must not exceed the ``max_seq_len`` the mask was built for.
        """
        B, T, C = x.shape
        Q, K, V = self.W_q(x), self.W_k(x), self.W_v(x)
        scores = (Q @ K.transpose(-2, -1)) / self.scale
        # -inf scores become zero weight after softmax, hiding future tokens
        scores = scores.masked_fill(self.mask[:T, :T], float('-inf'))
        attn = F.softmax(scores, dim=-1)
        return self.W_o(attn @ V)


class AttentionLM(nn.Module):
    """Notebook 04: Bigram + single-head attention.

    Pipeline: token embedding -> causal self-attention -> linear unembedding.
    There are no positional embeddings in this model.
    """

    def __init__(self, vocab_size, d_model, max_seq_len=256):
        super().__init__()
        # Remembered so generate() can crop its context to what the mask supports.
        self.max_seq_len = max_seq_len
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.attention = CausalSelfAttention(d_model, max_seq_len)
        self.unembed = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map token ids of shape (B, T) to next-token logits (B, T, vocab_size)."""
        return self.unembed(self.attention(self.embedding(x)))

    def generate(self, prompt_tokens, max_new_tokens=50, temperature=1.0):
        """Sample up to ``max_new_tokens`` token ids following ``prompt_tokens``.

        Puts the model in eval mode. The context fed to the model is cropped to
        the last ``max_seq_len`` tokens. Sampling stops early once token id 2
        (EOS) is drawn; that token is included in the result. Only the newly
        generated ids are returned, not the prompt.
        """
        self.eval()
        tokens = list(prompt_tokens)
        generated = []

        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Crop to the configured window rather than a fixed 256: a longer
                # context than the attention mask covers would fail in masked_fill.
                context = tokens[-self.max_seq_len:]
                x = torch.tensor([context], device=next(self.parameters()).device)
                logits = self.forward(x)
                probs = F.softmax(logits[0, -1] / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1).item()

                generated.append(next_token)
                tokens.append(next_token)

                if next_token == 2:  # EOS
                    break

        return generated


# === 05: Attention + Position ===

class PositionalAttentionLM(nn.Module):
    """Notebook 05: single-head attention over token + learned position embeddings."""

    def __init__(self, vocab_size, d_model, max_seq_len=256):
        super().__init__()
        self.max_seq_len = max_seq_len
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(max_seq_len, d_model)
        self.attention = CausalSelfAttention(d_model, max_seq_len)
        self.unembed = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map token ids of shape (B, T) to next-token logits."""
        _, seq_len = x.shape
        positions = torch.arange(seq_len, device=x.device)
        # The position embedding broadcasts across the batch dimension
        hidden = self.token_embedding(x) + self.position_embedding(positions)
        return self.unembed(self.attention(hidden))

    def generate(self, prompt_tokens, max_new_tokens=50, temperature=1.0):
        """Sample up to ``max_new_tokens`` token ids following ``prompt_tokens``.

        Puts the model in eval mode, feeds at most the last ``max_seq_len``
        tokens as context, and stops early after drawing token id 2 (EOS),
        which is included. Returns only the newly generated ids.
        """
        eos_id = 2
        self.eval()
        history = list(prompt_tokens)
        new_tokens = []

        with torch.no_grad():
            for _ in range(max_new_tokens):
                window = history[-self.max_seq_len:]
                batch = torch.tensor([window], device=next(self.parameters()).device)
                last_logits = self.forward(batch)[0, -1]
                probs = F.softmax(last_logits / temperature, dim=-1)
                sampled = torch.multinomial(probs, num_samples=1).item()

                new_tokens.append(sampled)
                history.append(sampled)

                if sampled == eos_id:
                    break

        return new_tokens


# === 06: Attention + Position + FFN ===

class FeedForward(nn.Module):
    """Position-wise feedforward network: Linear -> ReLU -> Linear.

    The hidden width defaults to ``4 * d_model`` when ``d_ff`` is not given.
    """

    def __init__(self, d_model, d_ff=None):
        super().__init__()
        hidden_width = 4 * d_model if d_ff is None else d_ff
        self.linear1 = nn.Linear(d_model, hidden_width)
        self.linear2 = nn.Linear(hidden_width, d_model)

    def forward(self, x):
        """Apply the two-layer network along the last dimension of ``x``."""
        expanded = self.linear1(x)
        activated = torch.relu(expanded)
        return self.linear2(activated)


class AttentionFFNLM(nn.Module):
    """Notebook 06: positional attention followed by a feedforward network.

    No residual connections or normalization: the FFN output goes straight
    to the unembedding.
    """

    def __init__(self, vocab_size, d_model, d_ff=None, max_seq_len=256):
        super().__init__()
        self.max_seq_len = max_seq_len
        d_ff = 4 * d_model if d_ff is None else d_ff
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(max_seq_len, d_model)
        self.attention = CausalSelfAttention(d_model, max_seq_len)
        self.ffn = FeedForward(d_model, d_ff)
        self.unembed = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map token ids of shape (B, T) to next-token logits."""
        _, seq_len = x.shape
        positions = torch.arange(seq_len, device=x.device)
        hidden = self.token_embedding(x) + self.position_embedding(positions)
        hidden = self.attention(hidden)
        hidden = self.ffn(hidden)
        return self.unembed(hidden)

    def generate(self, prompt_tokens, max_new_tokens=50, temperature=1.0):
        """Sample up to ``max_new_tokens`` token ids following ``prompt_tokens``.

        Puts the model in eval mode, feeds at most the last ``max_seq_len``
        tokens as context, and stops early after drawing token id 2 (EOS),
        which is included. Returns only the newly generated ids.
        """
        eos_id = 2
        self.eval()
        history = list(prompt_tokens)
        prompt_len = len(history)

        with torch.no_grad():
            for _ in range(max_new_tokens):
                window = history[-self.max_seq_len:]
                batch = torch.tensor([window], device=next(self.parameters()).device)
                scaled = self.forward(batch)[0, -1] / temperature
                sampled = torch.multinomial(F.softmax(scaled, dim=-1), num_samples=1).item()
                history.append(sampled)
                if sampled == eos_id:
                    break

        return history[prompt_len:]


# === 07: Transformer Block (Residuals + LayerNorm) ===

class TransformerBlock(nn.Module):
    """One pre-norm transformer block.

    Each sub-layer (attention, then feedforward) sees a LayerNorm-ed copy of
    the stream and its output is added back onto the un-normalized input.
    """

    def __init__(self, d_model, d_ff=None, max_seq_len=256):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.attention = CausalSelfAttention(d_model, max_seq_len)
        self.ffn = FeedForward(d_model, d_ff)

    def forward(self, x):
        """Return ``x`` after the attention and feedforward residual updates."""
        attended = self.attention(self.ln1(x))
        x = x + attended
        transformed = self.ffn(self.ln2(x))
        return x + transformed


class TransformerLM(nn.Module):
    """Notebook 07: a single transformer block with residuals and LayerNorm.

    Pipeline: token + position embeddings -> one TransformerBlock ->
    final LayerNorm -> linear unembedding.
    """

    def __init__(self, vocab_size, d_model, d_ff=None, max_seq_len=256):
        super().__init__()
        self.max_seq_len = max_seq_len
        d_ff = 4 * d_model if d_ff is None else d_ff
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(max_seq_len, d_model)
        self.block = TransformerBlock(d_model, d_ff, max_seq_len)
        self.ln_final = nn.LayerNorm(d_model)
        self.unembed = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map token ids of shape (B, T) to next-token logits."""
        _, seq_len = x.shape
        positions = torch.arange(seq_len, device=x.device)
        hidden = self.token_embedding(x) + self.position_embedding(positions)
        hidden = self.ln_final(self.block(hidden))
        return self.unembed(hidden)

    def generate(self, prompt_tokens, max_new_tokens=50, temperature=1.0):
        """Sample up to ``max_new_tokens`` token ids following ``prompt_tokens``.

        Puts the model in eval mode, feeds at most the last ``max_seq_len``
        tokens as context, and stops early after drawing token id 2 (EOS),
        which is included. Returns only the newly generated ids.
        """
        eos_id = 2
        self.eval()
        history = list(prompt_tokens)
        new_tokens = []

        with torch.no_grad():
            for _ in range(max_new_tokens):
                window = history[-self.max_seq_len:]
                batch = torch.tensor([window], device=next(self.parameters()).device)
                last_logits = self.forward(batch)[0, -1]
                probs = F.softmax(last_logits / temperature, dim=-1)
                sampled = torch.multinomial(probs, num_samples=1).item()

                new_tokens.append(sampled)
                history.append(sampled)

                if sampled == eos_id:
                    break

        return new_tokens

In [5]:
# === 08: Stacked Blocks ===

class StackedTransformerLM(nn.Module):
    """Notebook 08: ``n_layers`` single-head transformer blocks applied in sequence."""

    def __init__(self, vocab_size, d_model, n_layers, d_ff=None, max_seq_len=256):
        super().__init__()
        self.max_seq_len = max_seq_len
        d_ff = 4 * d_model if d_ff is None else d_ff

        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(max_seq_len, d_model)
        layers = [TransformerBlock(d_model, d_ff, max_seq_len) for _ in range(n_layers)]
        self.blocks = nn.ModuleList(layers)
        self.ln_final = nn.LayerNorm(d_model)
        self.unembed = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map token ids of shape (B, T) to next-token logits."""
        _, seq_len = x.shape
        positions = torch.arange(seq_len, device=x.device)
        hidden = self.token_embedding(x) + self.position_embedding(positions)
        for layer in self.blocks:
            hidden = layer(hidden)
        return self.unembed(self.ln_final(hidden))

    def generate(self, prompt_tokens, max_new_tokens=50, temperature=1.0):
        """Sample up to ``max_new_tokens`` token ids following ``prompt_tokens``.

        Puts the model in eval mode, feeds at most the last ``max_seq_len``
        tokens as context, and stops early after drawing token id 2 (EOS),
        which is included. Returns only the newly generated ids.
        """
        eos_id = 2
        self.eval()
        history = list(prompt_tokens)
        prompt_len = len(history)

        with torch.no_grad():
            for _ in range(max_new_tokens):
                window = history[-self.max_seq_len:]
                batch = torch.tensor([window], device=next(self.parameters()).device)
                scaled = self.forward(batch)[0, -1] / temperature
                sampled = torch.multinomial(F.softmax(scaled, dim=-1), num_samples=1).item()
                history.append(sampled)
                if sampled == eos_id:
                    break

        return history[prompt_len:]


# === 09: Multi-Head Attention ===

class MultiHeadAttention(nn.Module):
    """Multi-head causal self-attention.

    The ``d_model`` channels are split into ``n_heads`` contiguous groups of
    ``head_dim`` channels; each group attends independently under the same
    causal mask, and the concatenated result goes through ``W_o``.
    """

    def __init__(self, d_model, n_heads, max_seq_len=256):
        super().__init__()
        assert d_model % n_heads == 0

        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads

        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)
        self.W_o = nn.Linear(d_model, d_model, bias=False)

        # True strictly above the diagonal marks the future positions to hide
        future = torch.triu(torch.ones(max_seq_len, max_seq_len), diagonal=1).bool()
        self.register_buffer('mask', future)
        self.scale = math.sqrt(self.head_dim)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, C); returns a tensor of the same shape."""
        batch, seq_len, width = x.shape

        def split_heads(projected):
            # (B, T, C) -> (B, n_heads, T, head_dim)
            return projected.view(batch, seq_len, self.n_heads, self.head_dim).transpose(1, 2)

        queries = split_heads(self.W_q(x))
        keys = split_heads(self.W_k(x))
        values = split_heads(self.W_v(x))

        scores = (queries @ keys.transpose(-2, -1)) / self.scale
        causal = self.mask[:seq_len, :seq_len]
        weights = F.softmax(scores.masked_fill(causal, float('-inf')), dim=-1)

        # (B, n_heads, T, head_dim) -> (B, T, C)
        merged = (weights @ values).transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.W_o(merged)


class MultiHeadTransformerBlock(nn.Module):
    """Pre-norm transformer block whose attention sub-layer is multi-head."""

    def __init__(self, d_model, n_heads, d_ff=None, max_seq_len=256):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.attention = MultiHeadAttention(d_model, n_heads, max_seq_len)
        self.ffn = FeedForward(d_model, d_ff)

    def forward(self, x):
        """Return ``x`` after the attention and feedforward residual updates."""
        attended = self.attention(self.ln1(x))
        x = x + attended
        transformed = self.ffn(self.ln2(x))
        return x + transformed


class MultiHeadTransformerLM(nn.Module):
    """Notebook 09: stacked transformer blocks with multi-head attention."""

    def __init__(self, vocab_size, d_model, n_layers, n_heads, d_ff=None, max_seq_len=256):
        super().__init__()
        self.max_seq_len = max_seq_len
        d_ff = 4 * d_model if d_ff is None else d_ff

        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(max_seq_len, d_model)
        layers = [
            MultiHeadTransformerBlock(d_model, n_heads, d_ff, max_seq_len)
            for _ in range(n_layers)
        ]
        self.blocks = nn.ModuleList(layers)
        self.ln_final = nn.LayerNorm(d_model)
        self.unembed = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map token ids of shape (B, T) to next-token logits."""
        _, seq_len = x.shape
        positions = torch.arange(seq_len, device=x.device)
        hidden = self.token_embedding(x) + self.position_embedding(positions)
        for layer in self.blocks:
            hidden = layer(hidden)
        return self.unembed(self.ln_final(hidden))

    def generate(self, prompt_tokens, max_new_tokens=50, temperature=1.0):
        """Sample up to ``max_new_tokens`` token ids following ``prompt_tokens``.

        Puts the model in eval mode, feeds at most the last ``max_seq_len``
        tokens as context, and stops early after drawing token id 2 (EOS),
        which is included. Returns only the newly generated ids.
        """
        eos_id = 2
        self.eval()
        history = list(prompt_tokens)
        new_tokens = []

        with torch.no_grad():
            for _ in range(max_new_tokens):
                window = history[-self.max_seq_len:]
                batch = torch.tensor([window], device=next(self.parameters()).device)
                last_logits = self.forward(batch)[0, -1]
                probs = F.softmax(last_logits / temperature, dim=-1)
                sampled = torch.multinomial(probs, num_samples=1).item()

                new_tokens.append(sampled)
                history.append(sampled)

                if sampled == eos_id:
                    break

        return new_tokens


# === 10: Mixture of Experts ===

class MoELayer(nn.Module):
    """
    Mixture of Experts layer with top-1 routing.

    A bias-free linear router scores every token against each expert, and the
    token's output is taken from the single highest-scoring expert. Every
    expert is evaluated on every token; the unselected results are discarded.
    """

    def __init__(self, d_model, d_ff=None, n_experts=2):
        super().__init__()
        d_ff = 4 * d_model if d_ff is None else d_ff

        self.n_experts = n_experts
        self.d_model = d_model
        self.experts = nn.ModuleList(
            [FeedForward(d_model, d_ff) for _ in range(n_experts)]
        )
        self.router = nn.Linear(d_model, n_experts, bias=False)

    def forward(self, x):
        """Route each token of ``x`` (B, T, D) to one expert; returns (B, T, D)."""
        _, _, width = x.shape

        gate_probs = F.softmax(self.router(x), dim=-1)
        chosen = gate_probs.argmax(dim=-1)  # (B, T) index of the winning expert

        per_expert = [expert(x) for expert in self.experts]
        stacked = torch.stack(per_expert, dim=2)  # (B, T, n_experts, D)

        # Expand the winner index to (B, T, 1, D) so gather picks a full vector
        selector = chosen[:, :, None, None].expand(-1, -1, -1, width)
        return stacked.gather(dim=2, index=selector).squeeze(2)


class MoETransformerBlock(nn.Module):
    """Pre-norm transformer block using a Mixture of Experts in place of the dense FFN."""

    def __init__(self, d_model, n_heads, d_ff=None, n_experts=2, max_seq_len=256):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.attention = MultiHeadAttention(d_model, n_heads, max_seq_len)
        self.moe = MoELayer(d_model, d_ff, n_experts)

    def forward(self, x):
        """Return ``x`` after the attention and expert residual updates."""
        attended = self.attention(self.ln1(x))
        x = x + attended
        routed = self.moe(self.ln2(x))
        return x + routed


class MoETransformerLM(nn.Module):
    """Notebook 10: stacked multi-head transformer blocks with Mixture of Experts."""

    def __init__(self, vocab_size, d_model, n_layers, n_heads, n_experts=2, d_ff=None, max_seq_len=256):
        super().__init__()
        self.max_seq_len = max_seq_len
        d_ff = 4 * d_model if d_ff is None else d_ff

        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(max_seq_len, d_model)
        layers = [
            MoETransformerBlock(d_model, n_heads, d_ff, n_experts, max_seq_len)
            for _ in range(n_layers)
        ]
        self.blocks = nn.ModuleList(layers)
        self.ln_final = nn.LayerNorm(d_model)
        self.unembed = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map token ids of shape (B, T) to next-token logits."""
        _, seq_len = x.shape
        positions = torch.arange(seq_len, device=x.device)
        hidden = self.token_embedding(x) + self.position_embedding(positions)
        for layer in self.blocks:
            hidden = layer(hidden)
        return self.unembed(self.ln_final(hidden))

    def generate(self, prompt_tokens, max_new_tokens=50, temperature=1.0):
        """Sample up to ``max_new_tokens`` token ids following ``prompt_tokens``.

        Puts the model in eval mode, feeds at most the last ``max_seq_len``
        tokens as context, and stops early after drawing token id 2 (EOS),
        which is included. Returns only the newly generated ids.
        """
        eos_id = 2
        self.eval()
        history = list(prompt_tokens)
        prompt_len = len(history)

        with torch.no_grad():
            for _ in range(max_new_tokens):
                window = history[-self.max_seq_len:]
                batch = torch.tensor([window], device=next(self.parameters()).device)
                scaled = self.forward(batch)[0, -1] / temperature
                sampled = torch.multinomial(F.softmax(scaled, dim=-1), num_samples=1).item()
                history.append(sampled)
                if sampled == eos_id:
                    break

        return history[prompt_len:]


# Confirmation message, printed after the model class definitions in this cell have run.
print("Model architectures defined.")

Model architectures defined.


---

## Load Available Models

Check which checkpoints exist and load them.

In [6]:
# Registry of models, one entry per notebook in lineage order:
# (checkpoint_file, model_class, display_name, extra_kwargs, uses_n_layers, uses_n_heads, uses_n_experts)
MODEL_REGISTRY = [
    ('03_bigram.pt', BigramLM, '03: Bigram', {}, False, False, False),
    ('04_attention.pt', AttentionLM, '04: + Attention', {}, False, False, False),
    ('05_positional.pt', PositionalAttentionLM, '05: + Position', {'max_seq_len': 256}, False, False, False),
    ('06_ffn.pt', AttentionFFNLM, '06: + FFN', {'max_seq_len': 256}, False, False, False),
    ('07_transformer_block.pt', TransformerLM, '07: + Residual', {'max_seq_len': 256}, False, False, False),
    ('08_stacked.pt', StackedTransformerLM, '08: + 2 Blocks', {'max_seq_len': 256}, True, False, False),
    ('09_multihead.pt', MultiHeadTransformerLM, '09: + 2 Heads', {'max_seq_len': 256}, True, True, False),
    ('10_moe.pt', MoETransformerLM, '10: + MoE', {'max_seq_len': 256}, True, True, True),
]

models = {}  # display name -> loaded model in eval mode on `device`
stats = {}   # display name -> {'params': ..., 'final_ppl': ...}

print("Loading models...")
print("=" * 60)

for checkpoint_file, model_class, name, extra_kwargs, *arch_flags in MODEL_REGISTRY:
    path = Path(checkpoint_file)
    if not path.exists():
        print(f"✗ {name} (not found: {checkpoint_file})")
        continue

    # NOTE: weights_only=False lets torch.load unpickle arbitrary objects,
    # so only load checkpoint files you trust.
    checkpoint = torch.load(path, map_location='cpu', weights_only=False)

    # Constructor arguments: the checkpoint's hyperparameters plus registry extras
    kwargs = {
        'vocab_size': checkpoint['vocab_size'],
        'd_model': checkpoint['d_model'],
        **extra_kwargs,
    }
    if 'd_ff' in checkpoint:
        kwargs['d_ff'] = checkpoint['d_ff']
    # Architecture-specific hyperparameters are copied only for the models that take them
    for wanted, key in zip(arch_flags, ('n_layers', 'n_heads', 'n_experts')):
        if wanted and key in checkpoint:
            kwargs[key] = checkpoint[key]

    # Rebuild the architecture, restore its weights and prepare it for inference
    model = model_class(**kwargs)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()

    n_params = sum(p.numel() for p in model.parameters())
    final_ppl = checkpoint['history']['val_perplexity'][-1]
    models[name] = model
    stats[name] = {'params': n_params, 'final_ppl': final_ppl}

    print(f"✓ {name}")
    print(f"    Parameters: {n_params:,}")
    print(f"    Final perplexity: {final_ppl:.1f}")

print()
print(f"Loaded {len(models)} models.")

Loading models...
✓ 03: Bigram
    Parameters: 1,052,672
    Final perplexity: 35.8
✓ 04: + Attention
    Parameters: 1,118,208
    Final perplexity: 25.0
✓ 05: + Position
    Parameters: 1,150,976
    Final perplexity: 17.7
✓ 06: + FFN
    Parameters: 1,282,688
    Final perplexity: 13.4
✓ 07: + Residual
    Parameters: 1,283,456
    Final perplexity: 10.9
✓ 08: + 2 Blocks
    Parameters: 1,481,216
    Final perplexity: 8.7
✓ 09: + 2 Heads
    Parameters: 1,481,216
    Final perplexity: 8.3
✓ 10: + MoE
    Parameters: 1,745,152
    Final perplexity: 8.1

Loaded 8 models.


---

## Side-by-Side Generation

Give all models the same prompt and see what each one produces.

In [7]:
def compare_generations(prompt, max_tokens=50, temperature=1.0, seed=None):
    """
    Generate from all loaded models with the same prompt and print the results.

    If seed is provided, it's combined with a checksum of the prompt so that:
    - Same prompt + same seed = reproducible across runs
    - Different prompts + same seed = different outputs (not stuck in attractors)
    - All models for a given prompt get the same seed (fair comparison)

    Args:
        prompt: text to continue; encoded without a trailing EOS token.
        max_tokens: passed to each model's generate() as max_new_tokens.
        temperature: sampling temperature passed to each model's generate().
        seed: base RNG seed, or None to leave the RNG state untouched.

    Returns:
        None. For every model in ``models``, prints its name, its final
        perplexity from ``stats``, the prompt and the decoded generation.
    """
    import zlib  # local import: this cell is the only user

    prompt_tokens = tokenizer.encode(prompt, add_eos=False)

    # Combine seed with a prompt checksum so different prompts get different randomness.
    # The built-in hash() is salted per process for str, so it would give a
    # different seed on every run; CRC-32 of the UTF-8 bytes is stable.
    if seed is not None:
        prompt_seed = seed + zlib.crc32(prompt.encode('utf-8')) % 10000
    else:
        prompt_seed = None

    for name, model in models.items():
        # Re-seed before each model so all of them sample from the same RNG state
        if prompt_seed is not None:
            torch.manual_seed(prompt_seed)

        generated = model.generate(prompt_tokens, max_new_tokens=max_tokens, temperature=temperature)
        text = tokenizer.decode(generated)

        ppl = stats[name]['final_ppl']
        print(f"{name} (ppl={ppl:.1f}):")
        print()
        print(f"Prompt: {prompt}")
        print(f"Generated: {text}")
        print()
        print("-" * 70)
        print()

In [8]:
# The classics: short, open-ended openings
banner = "#" * 70
print("\n" + banner)
print("# CLASSIC PROMPTS")
print(banner)

for classic_prompt in ("Once upon a time", "The little girl", "He was very"):
    compare_generations(classic_prompt, seed=42)


######################################################################
# CLASSIC PROMPTS
######################################################################
03: Bigram (ppl=35.8):

Prompt: Once upon a time
Generated: . They saw his spaceship. He had no money on leaves, Tom tried to climb. They measured the door fell out the first time, he had never found a big crystal and are a big secret in the bathe." They ran to shoot

----------------------------------------------------------------------

04: + Attention (ppl=25.0):

Prompt: Once upon a time
Generated: . So, there was a little girl named Lily. One day Tom tried to climb. But, she asked her mother him first time, she decided to catch it again, her mom if she started to jump alligator, she laughed. The train

----------------------------------------------------------------------

05: + Position (ppl=17.7):

Prompt: Once upon a time
Generated: , there was a smart frog named Timmy. Timmy loved to explore such a big castle. He alway

In [9]:
# Story starters: longer openings that set up a scene
banner = "#" * 70
print("\n" + banner)
print("# STORY STARTERS")
print(banner)

for story_prompt in (
    "Once upon a time there was a little girl named Lily. She",
    "The big dog and the small cat were",
    "One sunny day, the children went to the park to",
):
    compare_generations(story_prompt, seed=42)


######################################################################
# STORY STARTERS
######################################################################
03: Bigram (ppl=35.8):

Prompt: Once upon a time there was a little girl named Lily. She
Generated:  made something was sad, leaving the magical voice.
The butterfly that sometimes we have such a kind to help?"
Lily felt cool. That is not bite. They are faster again soon became dark, and stared. One day on their bikes

----------------------------------------------------------------------

04: + Attention (ppl=25.0):

Prompt: Once upon a time there was a little girl named Lily. She
Generated:  loved to sit on a helpful friend didn't know what they are having so much fun. One day, Sarah said, "I want you did not think he doing. But remember to run away. In, and still. Can we share their bikes

----------------------------------------------------------------------

05: + Position (ppl=17.7):

Prompt: Once upon a time there was a l

In [10]:
# More challenging - requires context
banner = "#" * 70
print("\n" + banner)
print("# CONTEXT-DEPENDENT PROMPTS")
print(banner)

# A sensible continuation of these depends on remembering earlier context
for context_prompt in (
    "Lily had a red ball. She loved to play with her",
    "The boy was sad because his toy was broken. His mom said",
):
    compare_generations(context_prompt, seed=42)


######################################################################
# CONTEXT-DEPENDENT PROMPTS
######################################################################
03: Bigram (ppl=35.8):

Prompt: Lily had a red ball. She loved to play with her
Generated:  toys. Buzz was thick box on, mom if they opened his mom said.

----------------------------------------------------------------------

04: + Attention (ppl=25.0):

Prompt: Lily had a red ball. She loved to play with her
Generated:  toys instead picking things to always ask her mom if they could go inside. One day, they say they see the house.

----------------------------------------------------------------------

05: + Position (ppl=17.7):

Prompt: Lily had a red ball. She loved to play with her
Generated:  toys. She put on her favourite toy mom and saw that his toy car a toy. She had a turn on the lid.

"Oh, no!" Mia tried it had made an true. It made bread the meat on the side of

--------------------------------------------

---

## Summary Statistics

In [11]:
# Print one table row per loaded model: name, parameter count, final perplexity
rule = "=" * 60
print(rule)
print("MODEL COMPARISON")
print(rule)
print()
print(f"{'Model':<25} {'Parameters':<15} {'Perplexity':<12}")
print("-" * 52)

for name in models:
    row = stats[name]
    print(f"{name:<25} {row['params']:>12,}   {row['final_ppl']:>8.1f}")

print()
print("Lower perplexity = less surprised by correct answer = better.")
print()
print("From amoeba (35.8) to specialized mammal (8.1) in 8 notebooks.")

MODEL COMPARISON

Model                     Parameters      Perplexity  
----------------------------------------------------
03: Bigram                   1,052,672       35.8
04: + Attention              1,118,208       25.0
05: + Position               1,150,976       17.7
06: + FFN                    1,282,688       13.4
07: + Residual               1,283,456       10.9
08: + 2 Blocks               1,481,216        8.7
09: + 2 Heads                1,481,216        8.3
10: + MoE                    1,745,152        8.1

Lower perplexity = less surprised by correct answer = better.

From amoeba (35.8) to specialized mammal (8.1) in 8 notebooks.


---

## Try Your Own Prompts

Modify the cell below to test whatever you want.

In [12]:
# Your prompt here!
# Runs every loaded model on one custom prompt via compare_generations;
# edit the arguments below to experiment.
compare_generations(
    "Once upon a time there was a little girl who lived in a magical forest. She",
    max_tokens=60,  # upper bound on the number of tokens sampled per model
    temperature=1.0,  # logits are divided by this before sampling
    seed=42  # Set to None for random each time
)

03: Bigram (ppl=35.8):

Prompt: Once upon a time there was a little girl who lived in a magical forest. She
Generated:  would finally chose a smart and the little girl enjoyed the couch, and it. You made a big smile on her mommy loved to look out. She loved to be careful not so sad and tried to guess the ground in the wood. One day, she also important to chase after a loud voice

----------------------------------------------------------------------

04: + Attention (ppl=25.0):

Prompt: Once upon a time there was a little girl who lived in a magical forest. She
Generated:  would finally chose a smart and loved to take a good job, so proud of the witch.

----------------------------------------------------------------------

05: + Position (ppl=17.7):

Prompt: Once upon a time there was a little girl who lived in a magical forest. She
Generated:  loved to play with her and finding leaves in the garden flowers, and it. One day, so he decided to try new pole and look out. The red and supp