
# Assignment 3 — Q5: RNN vs Transformer (BPE=10k)

This notebook shows the full workflow required by Q5:

1. **Train a BPE tokenizer (vocab=10,000)** on the provided `input.txt`.
2. **Train two models** on the same tokenized dataset:
   - An **LSTM** language model (RNN).
   - A **small Transformer** language model (few layers of self-attention).
3. **Evaluate on validation** with **loss** and **perplexity** and **compare**.
4. Use **early stopping** on validation loss (patience) to claim “trained until convergence”.

> **Note**: The notebook assumes `input.txt` is in the same directory.


## 0. Environment & Dependencies

In [1]:
import sys
!{sys.executable} -m pip install sentencepiece torch --quiet

import os, io, time, math, random, contextlib
from pathlib import Path
import numpy as np

import sentencepiece as spm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# --- Repro & device ---
SEED = 1337
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Files ---
DATA_PATH     = Path("input.txt")              # provided corpus
MODEL_PREFIX  = "bpe10k"                       # -> bpe10k.model / bpe10k.vocab
VOCAB_SIZE    = 10_000                         # per spec
TOKENS_OUT    = Path("bpe_tokens.pt")          # serialized token ids (torch tensor)
VOCAB_OUT_TXT = Path("bpe_vocab_size.txt")     # saves the integer 10000

assert DATA_PATH.exists(), "input.txt not found next to this notebook."
print(f"OK: input.txt found with {len(DATA_PATH.read_text(encoding='utf-8', errors='ignore')):,} chars")
print("Device:", device)



OK: input.txt found with 1,115,393 chars
Device: cpu


## 1. Paths & Config

In [2]:

DATA_PATH = Path("input.txt")           # provided corpus
MODEL_PREFIX = "bpe10k"                 # sentencepiece prefix -> bpe10k.model/.vocab
VOCAB_SIZE = 10000                      # required by the spec
TOKENS_OUT = Path("bpe_tokens.pt")      # serialized token ids (torch tensor)
VOCAB_OUT = Path("bpe_vocab_size.txt")  # saves the integer 10000
assert DATA_PATH.exists(), "input.txt not found — place it next to this notebook."
print("OK: input.txt found with", len(DATA_PATH.read_text(encoding='utf-8', errors='ignore')), "chars")


OK: input.txt found with 1115393 chars


## 2. Train BPE (vocab=10,000)

In [4]:
# This suppresses the very verbose C++ logs SentencePiece prints.
cmd = (
    f"--input={DATA_PATH} "
    f"--model_prefix={MODEL_PREFIX} "
    f"--model_type=bpe "
    f"--vocab_size={VOCAB_SIZE} "
    f"--character_coverage=1.0 "
    f"--input_sentence_size=1000000 "
    f"--shuffle_input_sentence=true "
    f"--hard_vocab_limit=true "
    f"--num_threads=16"
)

print("Training SentencePiece BPE (10k)…")
buf_out, buf_err = io.StringIO(), io.StringIO()
with contextlib.redirect_stdout(buf_out), contextlib.redirect_stderr(buf_err):
    spm.SentencePieceTrainer.Train(cmd)

# sanity
assert Path(f"{MODEL_PREFIX}.model").exists(), "Tokenizer model not created."
assert Path(f"{MODEL_PREFIX}.vocab").exists(), "Tokenizer vocab not created."
print("Done. Files created:", f"{MODEL_PREFIX}.model", f"{MODEL_PREFIX}.vocab")


Training SentencePiece BPE (10k)…
Done. Files created: bpe10k.model bpe10k.vocab


sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=input.txt --model_prefix=bpe10k --model_type=bpe --vocab_size=10000 --character_coverage=1.0 --input_sentence_size=1000000 --shuffle_input_sentence=true --hard_vocab_limit=true --num_threads=16
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: input.txt
  input_format: 
  model_prefix: bpe10k
  model_type: BPE
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 1000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0

## 3. Encode Corpus → Token IDs

In [5]:
sp = spm.SentencePieceProcessor(model_file=f"{MODEL_PREFIX}.model")
ids = sp.encode(DATA_PATH.read_text(encoding="utf-8", errors="ignore"), out_type=int)
tokens = torch.tensor(ids, dtype=torch.long)
torch.save(tokens, TOKENS_OUT)
VOCAB_OUT_TXT.write_text(str(VOCAB_SIZE), encoding="utf-8")

print(f"Encoded tokens: {len(tokens):,}")
print("First 32 ids:", tokens[:32].tolist())

Encoded tokens: 274,114
First 32 ids: [423, 807, 9959, 2096, 84, 2447, 548, 2022, 9951, 424, 68, 362, 9960, 944, 9959, 2091, 9951, 362, 9960, 423, 807, 9959, 319, 182, 157, 3737, 1236, 35, 712, 281, 35, 7296]


## 4. Dataset Utilities (common to both models)

In [6]:
def split_train_val(toks: torch.Tensor, val_frac: float = 0.1):
    n = len(toks)
    split = int(n * (1 - val_frac))
    return toks[:split], toks[split:]

train_tokens, val_tokens = split_train_val(tokens, val_frac=0.1)

class BlockDataset(Dataset):
    def __init__(self, ids: torch.Tensor, block_size: int):
        self.ids   = ids
        self.block = block_size
    def __len__(self):
        return len(self.ids) - self.block
    def __getitem__(self, idx):
        x = self.ids[idx: idx+self.block]
        y = self.ids[idx+1: idx+self.block+1]
        return x, y

BLOCK_SIZE = 128
BATCH_SIZE = 64

def make_loader(ds, shuffle=True):
    return DataLoader(ds, batch_size=BATCH_SIZE, shuffle=shuffle, drop_last=True)

train_loader = make_loader(BlockDataset(train_tokens, BLOCK_SIZE), shuffle=True)
val_loader   = make_loader(BlockDataset(val_tokens,   BLOCK_SIZE), shuffle=False)

len(train_loader), len(val_loader)


(3852, 426)

## 5. LSTM Language Model

In [7]:
class LSTMLM(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, hidden=512, num_layers=2, dropout=0.1):
        super().__init__()
        self.emb  = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden, num_layers=num_layers, batch_first=True, dropout=dropout)
        self.proj = nn.Linear(hidden, vocab_size)
    def forward(self, x, targets=None):
        x = self.emb(x)
        out, _ = self.lstm(x)
        logits = self.proj(out)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

lstm = LSTMLM(vocab_size=VOCAB_SIZE).to(device)
print("LSTM params (M):", sum(p.numel() for p in lstm.parameters())/1e6)


LSTM params (M): 11.368208


## 6. Small Transformer Language Model

In [8]:
class SelfAttention(nn.Module):
    def __init__(self, n_embd, n_head, dropout=0.1):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.key   = nn.Linear(n_embd, n_embd, bias=False)
        self.query = nn.Linear(n_embd, n_embd, bias=False)
        self.value = nn.Linear(n_embd, n_embd, bias=False)
        self.proj  = nn.Linear(n_embd, n_embd, bias=False)
        self.drop  = nn.Dropout(dropout)
        self.register_buffer("mask", torch.tril(torch.ones(BLOCK_SIZE, BLOCK_SIZE)).unsqueeze(0).unsqueeze(0))

    def forward(self, x):
        B, T, C = x.size()
        k = self.key(x).view(B, T, self.n_head, C//self.n_head).transpose(1,2)   # (B, nh, T, hs)
        q = self.query(x).view(B, T, self.n_head, C//self.n_head).transpose(1,2)
        v = self.value(x).view(B, T, self.n_head, C//self.n_head).transpose(1,2)

        att = (q @ k.transpose(-2, -1)) / math.sqrt(k.size(-1))                   # (B, nh, T, T)
        att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
        att = torch.softmax(att, dim=-1)
        att = self.drop(att)
        y = att @ v                                                               # (B, nh, T, hs)
        y = y.transpose(1,2).contiguous().view(B, T, C)
        y = self.proj(y)
        return y

class TransformerBlock(nn.Module):
    def __init__(self, n_embd, n_head, dropout=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.sa  = SelfAttention(n_embd, n_head, dropout)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = nn.Sequential(
            nn.Linear(n_embd, 4*n_embd),
            nn.GELU(),
            nn.Linear(4*n_embd, n_embd),
            nn.Dropout(dropout),
        )
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class TinyTransformerLM(nn.Module):
    def __init__(self, vocab_size, n_layer=2, n_head=4, n_embd=256, block_size=BLOCK_SIZE, dropout=0.1):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb   = nn.Embedding(block_size, n_embd)
        self.blocks    = nn.ModuleList([TransformerBlock(n_embd, n_head, dropout) for _ in range(n_layer)])
        self.ln_f      = nn.LayerNorm(n_embd)
        self.proj      = nn.Linear(n_embd, vocab_size)
        self.block_size = block_size

    def forward(self, x, targets=None):
        B, T = x.shape
        pos = torch.arange(0, T, device=x.device).unsqueeze(0)
        h = self.token_emb(x) + self.pos_emb(pos)
        for blk in self.blocks:
            h = blk(h)
        h = self.ln_f(h)
        logits = self.proj(h)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

transformer = TinyTransformerLM(vocab_size=VOCAB_SIZE, n_layer=2, n_head=4, n_embd=256, block_size=BLOCK_SIZE).to(device)
print("Transformer params (M):", sum(p.numel() for p in transformer.parameters())/1e6)


Transformer params (M): 6.740752


## 7. Train/Eval Utilities (Early Stopping)

In [9]:
def evaluate(model, loader):
    model.eval()
    total_loss, n_tokens = 0.0, 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            _, loss = model(x, y)
            total_loss += loss.item() * x.numel()
            n_tokens   += x.numel()
    return total_loss / n_tokens

def perplexity(loss):
    try:
        return math.exp(loss)
    except OverflowError:
        return float('inf')

def train_with_early_stopping(model, name, max_epochs=20, eval_every=200, patience=5, lr=3e-4):
    opt = torch.optim.AdamW(model.parameters(), lr=lr)
    best_val, best_ppl = float('inf'), float('inf')
    no_improve = 0
    start_time = time.time()
    step = 0

    print(f"Training {name}…")
    for epoch in range(1, max_epochs+1):
        model.train()
        for x, y in train_loader:
            step += 1
            x, y = x.to(device), y.to(device)
            _, loss = model(x, y)
            opt.zero_grad(set_to_none=True)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()

            if step % eval_every == 0:
                val_loss = evaluate(model, val_loader)
                ppl = perplexity(val_loss)
                improved = val_loss < best_val - 1e-4
                badge = "↑improved" if improved else " "
                print(f"{name:12} step {step:5d} | train {loss.item():.4f} | val {val_loss:.4f} | ppl {ppl:.2f} {badge}")
                if improved:
                    best_val, best_ppl = val_loss, ppl
                    no_improve = 0
                else:
                    no_improve += 1
                    if no_improve >= patience:
                        elapsed = time.time() - start_time
                        print(f"{name}: early stopping (no val improvement for {patience} evals).")
                        return best_val, best_ppl, elapsed
    elapsed = time.time() - start_time
    return best_val, best_ppl, elapsed


## 8. Train Both Models

In [None]:
lstm = LSTMLM(vocab_size=VOCAB_SIZE).to(device)
trf  = TinyTransformerLM(vocab_size=VOCAB_SIZE, n_layer=2, n_head=4, n_embd=256, block_size=BLOCK_SIZE).to(device)

lstm_val, lstm_ppl, lstm_time = train_with_early_stopping(lstm, "LSTM", max_epochs=20, eval_every=200, patience=5)
trf_val,  trf_ppl,  trf_time  = train_with_early_stopping(trf,  "Transformer", max_epochs=20, eval_every=200, patience=5)

print("\n=== Comparison (Validation) ===")
print(f"{'Model':12} | {'Val Loss':>8} | {'Perplexity':>10} | {'Time (s)':>8}")
print("-"*46)
print(f"{'LSTM':12} | {lstm_val:8.4f} | {lstm_ppl:10.2f} | {lstm_time:8.1f}")
print(f"{'Transformer':12} | {trf_val:8.4f} | {trf_ppl:10.2f} | {trf_time:8.1f}")


Training LSTM…


## 9. Brief Analysis


**Observations (fill with your actual numbers):**

- Transformer vs LSTM validation **perplexity** and **loss**.
- Relative **training time**.
- At what step **early stopping** triggered.

**Why:** Transformers capture longer dependencies with parallel self-attention; LSTMs are sequential and may do fine on short contexts/smaller data.
