In [1]:
import numpy as np
import pandas as pd
import tiktoken as tk
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Model Hyperparameters

- `d_model = 256`  
  - Dimension of token embeddings and hidden representations.  
  - All attention computations, residuals, and FFN inputs/outputs use this size.  

- `num_heads = 8`  
  - Number of attention heads in each MultiHeadAttention layer.  
  - Each head works in a subspace of size `head_dim = d_model / num_heads = 32`.  

- `d_ff = 1024`  
  - Hidden size of the feed-forward network inside each Transformer block.  
  - Typically 4x `d_model` in GPT architectures → expansion-bottleneck style.  

- `num_layers = 4`  
  - Number of stacked TransformerBlocks in the model.  
  - More layers → more capacity, deeper contextual understanding, but slower to train.  

- `max_len = 256`  
  - Maximum sequence length the positional embeddings can handle.  
  - Sequences longer than this will need truncation or extension of positional embeddings.  

- `vocab_size = 50257`  
  - Size of GPT-2’s BPE tokenizer vocabulary (includes special tokens).  
  - Needed for `EmbeddingLayer` and final output `head`.  

- `head_dim = 32`  
  - Dimension of each attention head (`d_model / num_heads`).  
  - Scales the attention scores: smaller head_dim → less expressive, larger → more compute.

- `lr = 2e-4`
  - learning rate of the optimization algorithm (AdamW here)
  - increase or decrease according to the dataset and needs.

- `epochs = 10000`
  - Total epochs for training the model
  - decrease for faster training but poorer results

In [2]:
# Model / training configuration (see the hyperparameter notes above).
d_model = 256        # embedding / hidden width
num_heads = 8        # attention heads per block
d_ff = 1024          # feed-forward hidden width (4 x d_model)
num_layers = 4       # number of stacked Transformer blocks
max_len = 256        # intended maximum sequence length
vocab_size = 50257   # GPT-2 BPE vocabulary size
head_dim = d_model // num_heads  # 32; derived so it cannot drift from d_model / num_heads
lr = 2e-4            # AdamW learning rate
# NOTE(review): not referenced by the training cells visible in this notebook
# (train() has its own `epochs` default) -- confirm whether it is still needed.
epochs = 10000
# Fall back to CPU when no GPU is visible instead of failing at the first .to(device).
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Tokenizing
- I use OpenAI's tiktoken library here mainly because it's fast.
- However, if access to GPUs is restricted, a custom tokenizer from HuggingFace should be used for faster results.
- The tiktoken GPT-2 encoding is the one used for GPT-2 and has been sufficiently battle-tested, hence its use here.

In [3]:
# GPT-2 byte-pair encoding from tiktoken.
encoder = tk.get_encoding("gpt2")
# Token ids of the end-of-text marker, kept as a list so it can be passed to list.extend().
EOT = encoder.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})  ### [50256]

# Round-trip sanity check. The results get their own names so this cell does not
# bind `enc` / `dec`, which are the names of the helper functions defined later;
# re-running this cell after that one would otherwise replace the functions with data.
hello_ids = encoder.encode("Hello world", allowed_special={"<|endoftext|>"})

hello_text = encoder.decode(hello_ids)

print(hello_ids)

print(hello_text)

[15496, 995]
Hello world


# Data Loader

### `enc(s: str) -> list[int]`
- Takes a string `s` and converts it into a list of token ids using the GPT-2 encoder.  
- Keeps special tokens like `<|endoftext|>` intact.  
- Returns a Python list of integers representing tokens.  
- Simple wrapper around `encoder.encode()` for consistency.  
- Quick and light, no device or batch handling.  


### `dec(ids: list[int]) -> str`
- Converts a list of token ids back into human readable text.  
- Wrapper around `encoder.decode()`.  
- Useful for seeing what the model actually “says”.  
- Works with special tokens, no filtering by default.  
- Takes a list of ints, returns a single string.  

### `build_ids(data, add_eot: bool = True) -> torch.Tensor`
- Turns a string or list of strings into a 1D `LongTensor` of token ids.  
- Automatically adds an `<|endoftext|>` token after each string if `add_eot=True`.  
- Can handle both single string and list of strings transparently.  
- Concatenates all tokens into one flat tensor for training.  
- Returns a tensor ready to be sliced into batches for the model.  

### `batch_loader(raw_dataset, T: int = 64, B: int = 8, device: str = "cuda")`
- Creates random batches of sequences for next-token prediction.  
- `x` is the input sequence, `y` is the target sequence shifted by 1 token.  
- Samples `B` starting points from the dataset, each of length `T`.  
- Moves the batch to the specified `device` in one go for speed.  
- Raises an error if the dataset is too small for the requested sequence length.

In [4]:
def enc(s: str) -> list[int]:
    """Encode `s` into GPT-2 token ids, allowing the literal `<|endoftext|>` marker."""
    token_ids = encoder.encode(s, allowed_special={"<|endoftext|>"})
    return token_ids
def dec(ids: list[int]) -> str:
    """Decode a list of token ids back into a single string."""
    text = encoder.decode(ids)
    return text


def build_ids(data, add_eot: bool = True) -> torch.Tensor:
    """
    Tokenize text into one flat sequence of token ids.

    data:    str | iterable of str; a single string is treated as a one-item list
    add_eot: when True, the end-of-text ids (`EOT`) are appended after every string
    returns: 1D LongTensor holding the concatenated token ids
    """
    texts = [data] if isinstance(data, str) else list(data)

    token_ids = []
    for text in texts:
        token_ids += enc(text)
        if add_eot:
            token_ids += EOT
    return torch.tensor(token_ids, dtype=torch.long)





@torch.no_grad()  # pure data sampling, no autograd graph needed
def batch_loader(raw_dataset, T: int = 64, B: int = 8, device: str = "cuda"):
    """
    Sample a random batch of next-token-prediction pairs.

    raw_dataset: raw text (str | list[str]), tokenized here via build_ids with an
                 end-of-text token after each string, OR an already tokenized
                 1D LongTensor [N]. Passing the tensor avoids re-tokenizing the
                 same text on every call.
    T:   sequence length (context size)
    B:   batch size
    device: device the returned tensors are moved to
    returns: x, y each [B, T] on `device`; y[b, t] is the token following x[b, t]
    raises: ValueError if the token sequence is not 1D or has N <= T + 1 tokens
    """
    if isinstance(raw_dataset, torch.Tensor):
        ids = raw_dataset
        if ids.dim() != 1:
            raise ValueError(f"Expected a 1D tensor of token ids, got shape {tuple(ids.shape)}.")
    else:
        # Encode the raw text
        ids = build_ids(raw_dataset, add_eot=True)

    # Check that the token sequence is long enough
    N = ids.size(0)
    if N <= T + 1:
        raise ValueError(f"Need more tokens (got {N}) than T+1 ({T+1}).")

    # Sample B starting positions. The target slice ends at j+T+1 <= N, so valid
    # starts are 0 .. N-T-1 inclusive. randint's upper bound is exclusive, hence
    # N - T (the previous N - T - 1 could never pick the last window).
    starts = torch.randint(0, N - T, (B,)).tolist()

    # Gather slices first, then move each stacked tensor once
    # (cheaper than many tiny transfers).
    x = torch.stack([ids[j:j + T] for j in starts], dim=0)
    y = torch.stack([ids[j + 1:j + T + 1] for j in starts], dim=0)
    return x.to(device, non_blocking=True), y.to(device, non_blocking=True)

## Test/Demo

In [5]:
# Three toy sentences: a tiny corpus, just enough tokens to draw T=6 windows from.
txts = [
    "transformers are spicy attention machines.",
    "attention is all you need, allegedly.",
    "lets build the beast today."
]

x, y = batch_loader(txts, T=6, B=8, device="cpu")
print(x.shape, y.shape)           # torch.Size([8, 6]) torch.Size([8, 6])

print(x)

# Decode every sampled input window back to text.
for i in range(0,8):
    print(dec(x[i].tolist()))

# Show input/target pairs: each y row is the matching x row advanced by one token.
for i in range(3):
    print("x:", dec(x[i].tolist()))
    print("y:", dec(y[i].tolist()))
    print()

torch.Size([8, 6]) torch.Size([8, 6])
tensor([[ 7910,    13, 50256,  5289,  1382,   262],
        [35636,   364,   389, 26880,  3241,  8217],
        [ 7910,    13, 50256,  5289,  1382,   262],
        [35636,   364,   389, 26880,  3241,  8217],
        [26880,  3241,  8217,    13, 50256,  1078],
        [   11,  7910,    13, 50256,  5289,  1382],
        [  364,   389, 26880,  3241,  8217,    13],
        [  389, 26880,  3241,  8217,    13, 50256]])
 allegedly.<|endoftext|>lets build the
transformers are spicy attention machines
 allegedly.<|endoftext|>lets build the
transformers are spicy attention machines
 spicy attention machines.<|endoftext|>att
, allegedly.<|endoftext|>lets build
ers are spicy attention machines.
 are spicy attention machines.<|endoftext|>
x:  allegedly.<|endoftext|>lets build the
y: .<|endoftext|>lets build the beast

x: transformers are spicy attention machines
y: ers are spicy attention machines.

x:  allegedly.<|endoftext|>lets build the
y: .<|endoftext|>let

# Embedding and Attention

### `EmbeddingLayer`
- Combines token embeddings and positional embeddings into one vector per token.  
- `self.tok_embed` maps each token id to a learnable `d_model` dimensional vector.  
- `self.pos_embed` assigns a learnable vector to each position in the sequence up to `max_len`.  
- In `forward()`, we create position ids `[0,1,...,T-1]` and look up their embeddings.  
- Returns `tok + pos` → the model can know **what the token is** and **where it is** in the sequence.  
- Nuance: the sum `tok + pos` assumes `d_model` for both; if dimensions mismatch, PyTorch will error.  
- Another nuance: positional embeddings are learned (unlike sinusoidal) — model has to figure out position info from scratch.  

### `SingleHeadAttention`
- Implements a single “self-attention head” from the Transformer paper.  
- `W_q`, `W_k`, `W_v` are linear layers projecting `d_model` -> `d_k` for queries, keys, and values.  
- `forward(x)`:
  - Q = W_q(x), K = W_k(x), V = W_v(x)  
  - Computes attention scores: `scores = Q K^T / sqrt(d_k)`  
    - Divide by `sqrt(d_k)` to stabilize gradients (so softmax isn’t too peaky).  
  - Apply `softmax` along the last dimension → each token attends to all other tokens.  
  - Multiply `attn` with `V` → weighted sum of values gives final representation for each token.  
- Returns `out` (transformed tokens) and `attn` (attention map for analysis/debug).  
- Nuances:
  - This is **full self-attention**, O(T²) complexity — slow for long sequences.  
  - No masking here — so if you use it for autoregressive generation, you’d need to mask future tokens outside this module.  
  - `d_k` can be smaller than `d_model`; if multihead is used, each head works in its own subspace.  
- Fun fact: Q, K, V are all learned projections; the model decides **what to “pay attention to”** via training.

In [6]:
class EmbeddingLayer(nn.Module):
    """Sum of a learned token embedding and a learned absolute position embedding."""

    def __init__(self, vocab_size, d_model, max_len=2048):
        super().__init__()
        # One d_model vector per token id, and one per position index below max_len.
        self.tok_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(max_len, d_model)

    def forward(self, x):
        """x: [B, T] token ids -> [B, T, d_model] embeddings."""
        _, seq_len = x.shape
        # Position ids 0..T-1 with a leading axis of size 1 so they broadcast over the batch.
        positions = torch.arange(seq_len, device=x.device)[None, :]
        return self.tok_embed(x) + self.pos_embed(positions)

In [7]:

class SingleHeadAttention(nn.Module):
    """One unmasked scaled dot-product self-attention head projecting d_model -> d_k."""

    def __init__(self, d_model, d_k):
        super().__init__()
        # Bias-free linear projections for queries, keys and values.
        self.W_q = nn.Linear(d_model, d_k, bias=False)
        self.W_k = nn.Linear(d_model, d_k, bias=False)
        self.W_v = nn.Linear(d_model, d_k, bias=False)

    def forward(self, x):
        """
        x: (batch_size, seq_len, d_model)
        returns: (out, attn) where out is (batch_size, seq_len, d_k) and attn is
                 the (batch_size, seq_len, seq_len) attention map
        """
        queries, keys, values = self.W_q(x), self.W_k(x), self.W_v(x)

        # QK^T scaled by sqrt(d_k), as in the Transformer paper.
        scale = queries.size(-1) ** 0.5
        scores = (queries @ keys.transpose(-2, -1)) / scale

        # Normalize over the key axis so each query's weights sum to 1.
        attn = torch.softmax(scores, dim=-1)

        # Attention-weighted sum of the values.
        return attn @ values, attn

## Test/Demo

In [8]:
embed = EmbeddingLayer(vocab_size, d_model, max_len)
attn = SingleHeadAttention(d_model, head_dim)


emb = embed(x)   # [B, T, d_model]

# SingleHeadAttention.forward returns a (output, attention map) pair, so unpack it
# instead of binding the whole tuple to `out`.
out, attn_weights = attn(emb)  # out: [B, T, head_dim], attn_weights: [B, T, T]

# MultiHeadAttention

### `MultiHeadAttentionOld`
- Implements multi-head attention **naively by creating independent SingleHeadAttention objects** for each head.  
- `num_heads` separate SingleHeadAttention instances, each mapping `d_model -> d_head`.  
- Forward pass:
  - Loops through each head, runs forward separately: **slow, non-batched**.  
  - Concatenates outputs along feature dimension → shape `(B, L, d_model)`.  
  - Final linear `W_o` mixes all heads back together.  
- Nuances:
  - Easier to understand conceptually (each head is a standalone attention).  
  - **Extremely slow for long sequences** because each head is computed sequentially.  
  - Harder to optimize on GPU due to many small matrix multiplies.  
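  - Parameter count is not actually higher: the `num_heads` per-head matrices of size `d_model × d_head` add up to the same `d_model × d_model` weights per projection as the batched version; the cost is speed, not size.  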

### `MultiHeadAttentionNew`
- Implements multi-head attention **efficiently using one big linear projection** per Q, K, V.  
- `d_model` is split into `num_heads` heads, each of size `d_head = d_model // num_heads`.  
- Forward pass:
  - Project Q, K, V all at once via `self.W_q`, `self.W_k`, `self.W_v`.  
  - Reshape to `(B, num_heads, L, d_head)` to separate heads.  
  - Compute **scaled dot-product attention** in a batched fashion: `scores = QK^T / sqrt(d_head)`.  
  - Optional mask applied to prevent attending to certain positions (useful for autoregressive tasks).  
  - Multiply attention weights by V, then merge heads back: `(B, L, d_model)`.  
  - Final linear `W_o` mixes information across heads.  
- Nuances:
  - **Batched attention** → fast, GPU-friendly, memory efficient.  
  - All heads live inside a **single projection layer** (one big linear per Q/K/V) → same weight count as separate per-head layers, but far better vectorization.  
  - Supports optional `k` and `v` inputs for cross-attention style usage.  

###  Key Differences
- **Computation style:**
  - `New` → all heads projected and computed in **one big batch** (fast, GPU optimized).  
  - `Old` → each head computed **separately in a Python loop** (slow, memory inefficient).  
- **Parameter sharing:**
  - `New` → single linear layer per Q/K/V, implicitly contains all heads.  
  - `Old` → each head has its own small Q/K/V linear layers (same total weight count, just stored as separate modules).  
- **Performance:**
  - `New` → fast, scalable, modern style used in GPT and Transformer implementations.  
  - `Old` → slow, mainly for learning / pedagogical purposes.  
- **Flexibility:**
  - `New` → supports optional `k` and `v` for cross-attention.  
  - `Old` → strictly self-attention unless you modify each head manually.

In [9]:
class MultiHeadAttentionOld(nn.Module):
    """
    Naive multi-head self-attention built from independent SingleHeadAttention heads.

    Each head maps d_model -> d_model // num_heads; the head outputs are concatenated
    along the feature axis and mixed by the final linear layer W_o.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.num_heads = num_heads
        self.d_head = d_model // num_heads

        # One independent attention head per slot.
        self.heads = nn.ModuleList([
            SingleHeadAttention(d_model, self.d_head)
            for _ in range(num_heads)
        ])

        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, q, k=None, v=None, mask=None):
        """
        q: (B, L, d_model) input sequence.
        k, v: accepted for signature compatibility; must be omitted or be the same
              tensor as `q`, because the underlying heads are self-attention only.
        mask: must be None; SingleHeadAttention has no masking support.
        returns: (B, L, d_model)
        raises: NotImplementedError for cross-attention inputs or a mask, rather
                than silently ignoring them.
        """
        if (k is not None and k is not q) or (v is not None and v is not q):
            raise NotImplementedError(
                "MultiHeadAttentionOld only supports self-attention (k and v must be q)."
            )
        if mask is not None:
            raise NotImplementedError("MultiHeadAttentionOld does not support masking.")

        # Run each head separately (slow; kept for teaching purposes).
        # SingleHeadAttention.forward takes one tensor and returns (out, attn):
        # keep only `out` so the results can be concatenated.
        out_per_head = [head(q)[0] for head in self.heads]

        concat = torch.cat(out_per_head, dim=-1)  # (B, L, d_model)

        return self.W_o(concat)

In [10]:
class MultiHeadAttentionNew(nn.Module):
    """
    Batched multi-head attention: one d_model -> d_model projection each for
    Q, K and V, reshaped into `num_heads` heads of size d_model // num_heads.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.d_head = d_model // num_heads

        # Single large projections that contain all heads.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

        # Final output mixer
        self.W_o = nn.Linear(d_model, d_model)

    def _split_heads(self, t):
        """(B, L, d_model) -> (B, num_heads, L, d_head), using t's own length L."""
        B, L, _ = t.shape
        return t.view(B, L, self.num_heads, self.d_head).transpose(1, 2)

    def forward(self, q, k = None, v = None, mask=None):
        """
        q: (B, L_q, d_model) queries.
        k, v: optional (B, L_k, d_model) keys / values; each defaults to `q`.
        mask: optional tensor broadcastable to (B, num_heads, L_q, L_k); positions
              where it equals 0 are excluded from attention.
        returns: (B, L_q, d_model)
        raises: ValueError if keys and values have different sequence lengths.
        """
        if k is None:
            k = q
        if v is None:
            v = q
        if k.size(1) != v.size(1):
            raise ValueError(
                f"k and v must have the same sequence length (got {k.size(1)} and {v.size(1)})."
            )

        B, L_q, _ = q.shape

        # Each tensor is reshaped with its own length, so keys/values may be
        # longer or shorter than the queries (cross-attention).
        Q = self._split_heads(self.W_q(q))  # (B, num_heads, L_q, d_head)
        K = self._split_heads(self.W_k(k))  # (B, num_heads, L_k, d_head)
        V = self._split_heads(self.W_v(v))  # (B, num_heads, L_k, d_head)

        # Scaled dot-product attention, batched over heads.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_head ** 0.5)
        if mask is not None:
            # Use the dtype's most negative finite value so the fill also works
            # in reduced precision, where -1e9 is out of range.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
        attn = torch.softmax(scores, dim=-1)

        out = torch.matmul(attn, V)  # (B, num_heads, L_q, d_head)

        # Merge heads back to (B, L_q, d_model)
        out = out.transpose(1, 2).contiguous().view(B, L_q, -1)
        return self.W_o(out)

# MLP/FFNN layer

### `FeedForward`
- Implements the **position-wise feed-forward network** used in Transformers.  
- Two linear layers: `fc1` expands `d_model` -> `d_ff`, `fc2` projects back `d_ff` -> `d_model`.  
- Forward pass:
  - `fc1(x)` → expands each token vector to higher dimension (`d_ff`) for richer representation.  
  - `F.gelu(x)` → non-linear activation, smooth version of ReLU; used in GPTs.  
  - `fc2(x)` → projects back to original embedding size so residuals can be added.  
  - `Dropout` applied after second layer → prevents overfitting, stabilizes training.  
- Nuances:
  - Applied independently **per position** (no mixing across sequence here).  
  - GELU helps gradients flow better than ReLU for deep stacks.  
  - `d_ff` is typically 4x `d_model` in GPT architectures, giving a bottleneck-expansion style.  
- Fun fact: Even though simple, this tiny MLP is a key part of why Transformers can model complex relationships.

In [11]:
class FeedForward(nn.Module):
    """Position-wise MLP: d_model -> d_ff -> d_model with GELU, then dropout."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # expansion
        self.fc2 = nn.Linear(d_ff, d_model)   # projection back to the model width
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """x: (B, T, d_model) -> (B, T, d_model)."""
        hidden = F.gelu(self.fc1(x))            # (B, T, d_ff)
        # Dropout is applied to the projected output.
        return self.dropout(self.fc2(hidden))

# The Transformer

### `TransformerBlock`
- Represents a single Transformer block, the basic building unit of GPT.  
- Components:
  - `ln1` → LayerNorm before multi-head attention.  
  - `mha` → MultiHeadAttentionNew, performs self-attention over the sequence.  
  - `ln2` → LayerNorm before feed-forward network.  
  - `ffn` → position-wise feed-forward network (see previous doc).  
- Forward pass:
  - `x = x + mha(ln1(x))` → residual connection adds attention output back to input.  
  - `x = x + ffn(ln2(x))` → residual connection adds FFN output back.  
- Nuances:
  - Pre-LayerNorm style (norm before sub-layer) used in GPTs → improves stability for deep stacks.  
  - Residual connections allow gradients to flow through deep networks easily.  
  - Mask can be passed to attention for autoregressive tasks (prevent looking ahead).  
- Fun fact: stacking multiple blocks lets the model capture **hierarchical patterns in sequences**.  

### `Transformer`
- Full GPT-style Transformer for language modeling.  
- Components:
  - `embed` → EmbeddingLayer (token + positional embeddings).  
  - `blocks` → stack of `num_layers` TransformerBlock instances.  
  - `ln_final` → final LayerNorm before output.  
  - `head` → linear layer mapping `d_model` → `vocab_size` for logits.  
- Forward pass:
  - Embed input tokens → `(B, T, d_model)`.  
  - Pass through each TransformerBlock sequentially.  
  - Apply final LayerNorm.  
  - Output logits for each token → `(B, T, vocab_size)`.  
- Nuances:
  - Can handle arbitrary batch sizes and sequence lengths up to `max_len`.  
  - Residual connections + LayerNorm in each block stabilize training for deep stacks.  
  - Output logits are **raw, unnormalized scores**, suitable for `CrossEntropyLoss`.  
  - Fully autoregressive if used with causal masking in attention.  
- Fun fact: This is essentially a “mini GPT” — with enough layers, heads, and parameters, it can learn impressive language patterns.

In [12]:
class TransformerBlock(nn.Module):
    """Pre-LayerNorm block: x + MHA(LN(x)), followed by x + FFN(LN(x))."""

    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        # Sub-modules are registered in the order ln1, ln2, mha, ffn.
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.mha = MultiHeadAttentionNew(d_model, num_heads)
        self.ffn = FeedForward(d_model, d_ff)

    def forward(self, x, mask=None):
        """x: (B, T, d_model); `mask` is forwarded unchanged to the attention layer."""
        # Attention sub-layer with residual connection.
        attn_out = self.mha(self.ln1(x), mask=mask)
        hidden = x + attn_out

        # Feed-forward sub-layer with residual connection.
        return hidden + self.ffn(self.ln2(hidden))

In [13]:
class Transformer(nn.Module):
    """
    GPT-style decoder-only language model: token + position embeddings, a stack of
    TransformerBlocks, a final LayerNorm and a linear head producing vocabulary logits.

    vocab_size: number of token ids (embedding rows and output logits)
    d_model:    embedding / hidden width
    num_heads:  attention heads per block
    d_ff:       feed-forward hidden width
    num_layers: number of stacked blocks
    max_len:    size of the positional-embedding table, i.e. the longest sequence
                the model can embed (previously fixed at 512)
    """

    def __init__(self, vocab_size, d_model=256, num_heads=8, d_ff=1024, num_layers=7, max_len=512):
        super().__init__()

        self.embed = EmbeddingLayer(vocab_size, d_model, max_len=max_len)

        self.blocks = nn.ModuleList([
            TransformerBlock(d_model, num_heads, d_ff)
            for _ in range(num_layers)
        ])

        self.ln_final = nn.LayerNorm(d_model)

        self.head = nn.Linear(d_model, vocab_size, bias=False)  # output logits

    def forward(self, x, mask=None, causal=True):
        """
        x:      [B, T] token ids
        mask:   optional attention mask passed to every block as given
                (entries equal to 0 are blocked)
        causal: when no `mask` is supplied, build a lower-triangular mask so that
                position t only attends to positions <= t. Without it, a model
                trained on next-token targets can read the answer straight from
                the input, which makes the loss meaningless for generation.
                Pass causal=False for unrestricted (bidirectional) attention.
        returns: [B, T, vocab_size] raw logits
        """
        if mask is None and causal:
            T = x.size(1)
            # [T, T] boolean mask; it broadcasts over the batch and head axes.
            mask = torch.tril(torch.ones(T, T, dtype=torch.bool, device=x.device))

        x = self.embed(x)

        for block in self.blocks:
            x = block(x, mask=mask)

        x = self.ln_final(x)

        return self.head(x)

# Generator

### `generate`
- Autoregressive text generation function for a trained Transformer / GPT model.  
- Input:
  - `start_text` → initial prompt to kick off generation.  
  - `max_tokens` → maximum number of tokens to generate.  
  - `temperature` → controls randomness: higher → more diverse, lower → more deterministic.  
- Forward pass:
  - Encode `start_text` using the tokenizer → initial tensor `[1, seq_len]`.  
  - Loop for `max_tokens`:
    - Pass current sequence `x` through model → get logits `[1, seq_len, vocab_size]`.  
    - Only consider **last token’s logits** for next token prediction (`logits[:, -1, :]`).
    - Set a `token penalty` to avoid repeating words
    - `Top-K filtering` keeps only the `top_k` highest-scoring candidate tokens and discards the unlikely tail.
    - `Top-P (nucleus) filtering` keeps the smallest set of top tokens whose cumulative softmax probability exceeds `top_p`.  
    - Scale logits by `temperature` and apply `softmax` → probability distribution.  
    - Sample next token from this distribution using `torch.multinomial`.  
    - Append next token to the sequence.  
    - Stop if the generated token is `<|endoftext|>` (EOT).  
- Output:
  - Decodes the full sequence of token ids back to a human-readable string.  
- Nuances:
  - Uses `@torch.no_grad()` → no gradient tracking, saves memory, faster inference.  
  - Temperature scaling allows control over creativity vs coherence.  
  - Sampling (instead of argmax) introduces stochasticity → multiple runs produce different continuations.  
  - Sequence grows by one token per step; the total length (prompt + generated tokens) must still fit within the model's positional-embedding table (`max_len`).

In [14]:
@torch.no_grad()
def generate(
    model,
    start_text,
    tokenizer = encoder,
    max_tokens=50,
    temperature=0.7,
    top_k= 15,
    top_p= 0.9,
    repetition_penalty = 1.5,
    device="cuda",
    max_context=None,
):
    """
    Autoregressively sample a continuation of `start_text`.

    model:       callable mapping token ids [1, T] to logits [1, T, vocab_size]
    start_text:  prompt; must encode to at least one token
    tokenizer:   object with encode(), decode() and an `eot_token` attribute
    max_tokens:  maximum number of tokens to sample
    temperature: > 0; logits are divided by it before sampling
    top_k:       keep only the k highest logits (None disables)
    top_p:       nucleus threshold on cumulative probability (None disables)
    repetition_penalty: factor applied to logits of tokens already in the
                 sequence (None or 1.0 disables)
    device:      device the token tensor is created on
    max_context: feed the model at most this many trailing tokens. When None it is
                 read from `model.embed.pos_embed.num_embeddings` if the model has
                 that attribute; otherwise the full sequence is always passed.
    returns:     the decoded text, prompt included
    raises:      ValueError for a non-positive temperature or an empty prompt

    Note: puts the model in eval mode and does not switch it back.
    """
    if temperature <= 0:
        raise ValueError("temperature must be > 0")

    prompt_ids = tokenizer.encode(start_text)
    if not prompt_ids:
        raise ValueError("start_text must encode to at least one token")

    if max_context is None:
        # Best-effort lookup of the positional table size; stays None when absent.
        pos_table = getattr(getattr(model, "embed", None), "pos_embed", None)
        max_context = getattr(pos_table, "num_embeddings", None)

    model.eval()

    # Encode starting text
    x = torch.tensor([prompt_ids], dtype=torch.long, device=device)  # [1, seq_len]

    neg_inf = -float("Inf")

    for _ in range(max_tokens):
        # Crop to the most recent tokens so position ids never run past the
        # positional-embedding table once the sequence has grown.
        context = x if max_context is None else x[:, -max_context:]
        logits = model(context)  # [1, ctx_len, vocab_size]
        logits = logits[:, -1, :] / temperature   # last token's logits, scaled

        # --- Repetition penalty ---
        # Push down every token id already present in the full sequence: negative
        # logits are multiplied, non-negative ones divided. Done in one indexed
        # update instead of a Python loop over token ids.
        if repetition_penalty is not None and repetition_penalty != 1.0:
            seen = torch.unique(x[0])
            seen_logits = logits[0, seen]
            logits[0, seen] = torch.where(
                seen_logits < 0,
                seen_logits * repetition_penalty,
                seen_logits / repetition_penalty,
            )

        # --- Top-K filtering ---
        if top_k is not None:
            k = min(top_k, logits.size(-1))  # safety
            kth_value = torch.topk(logits, k).values[:, -1:]  # cutoff threshold
            logits = logits.masked_fill(logits < kth_value, neg_inf)

        # --- Top-P (nucleus) filtering ---
        if top_p is not None:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)

            # Mask out tokens above nucleus probability
            mask = cumulative_probs > top_p

            # Shift mask right so the token that crosses the threshold is kept,
            # and always keep the top token.
            mask[..., 1:] = mask[..., :-1].clone()
            mask[..., 0] = False

            sorted_logits[mask] = neg_inf
            # Re-map back to original indices
            logits = torch.full_like(logits, neg_inf)
            logits.scatter_(1, sorted_indices, sorted_logits)

        # Turn logits into probabilities
        probs = torch.softmax(logits, dim=-1)

        # Sample from distribution
        next_token = torch.multinomial(probs, num_samples=1)

        x = torch.cat([x, next_token], dim=1)  # append to sequence

        # Stop if we hit <|endoftext|>
        if next_token.item() == tokenizer.eot_token:
            break

    # Decode back to text
    return tokenizer.decode(x[0].tolist())


## Test/Demo

In [15]:
vocab_size = 50257  # GPT-2 BPE vocabulary size (same value as in the config cell)
model = Transformer(vocab_size=vocab_size)  # untrained instance using the class's default hyperparameters

# Forward-pass smoke test on the demo batch `x` sampled earlier.
logits = model(x)  # x: [batch_size, seq_len] -> logits: [batch_size, seq_len, vocab_size]
print(logits.shape)

torch.Size([8, 6, 50257])


# Training Loop

### Mini-GPT Training Loop

- **Model setup:**  
  - Instantiate `Transformer` with your chosen hyperparams (`d_model`, `num_heads`, `d_ff`, `num_layers`, etc).  
  - Move model to `device` (GPU if available).  

- **Loss & optimizer:**  
  - `CrossEntropyLoss` used for next-token prediction.  
  - `AdamW` optimizer with learning rate `lr` (`2e-4` in the config cell).  
  - Optional: gradient clipping (`clip_grad_norm_`) for stability.  

- **Batching:**  
  - Use your `batch_loader` function to sample `[B, T]` sequences.  
  - `x_batch` = input tokens, `y_batch` = next-token targets.  

- **Training step:**  
  1. `optimizer.zero_grad()` → reset gradients.  
  2. `logits = model(x_batch)` → forward pass.  
  3. Flatten logits & targets: `[B*T, V]` vs `[B*T]` for `CrossEntropyLoss`.  
  4. `loss.backward()` → compute gradients.  
  5. `optimizer.step()` → update weights.  
  6. Accumulate loss for logging.  

- **Sampling / checking progress:**  
  - Use `generate(model, start_text="The Emperor")` to see if model learns flavor.  
  - Sampled text can be truncated for quick checks.  

- **Nuances:**  
  - Sequence flattening is important because `CrossEntropyLoss` expects `[N, C]` logits vs `[N]` targets.  
  - Gradient clipping prevents explosions, especially for untrained mini-GPTs.  
  - Keep `seq_len` and `batch_size` small enough if you’re on a limited GPU.  
  - You can increase `epochs` and feed more batches as dataset grows.

In [16]:
# Fresh model built from the notebook-level hyperparameters, moved to `device`.
model = Transformer(vocab_size, d_model, num_heads, d_ff, num_layers).to(device)

# CrossEntropyLoss takes logits [N, C] and class-index targets [N];
# train() flattens [B, T, V] logits and [B, T] targets to that layout.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=lr)

def train(raw_dataset, epochs = 2000, seq_len = 64, batch_size = 10, device = 'cuda', log_every = 500):
    """
    Run `epochs` optimization steps on random batches drawn from `raw_dataset`.

    Uses the module-level `model`, `criterion` and `optimizer`. Each "epoch" here
    is one gradient step on one freshly sampled batch, not a full pass over the data.

    raw_dataset: text passed straight to batch_loader
    epochs:      number of optimization steps
    seq_len:     tokens per training sequence (T)
    batch_size:  sequences per batch (B)
    device:      device for the batches and for the sample generation
    log_every:   print the loss and a generated sample every this many steps
                 (0 or None disables logging)

    NOTE(review): no attention mask is passed to the model here, so whether
    training is causal depends entirely on the model's own forward() default.
    """
    for epoch in range(epochs):
        # generate() below switches the model to eval mode, so re-enable training
        # mode at the start of every step.
        model.train()

        x_batch, y_batch = batch_loader(raw_dataset, T=seq_len, B=batch_size, device=device)

        optimizer.zero_grad()
        logits = model(x_batch)  # [B, T, V]

        # Reshape for CrossEntropy: [B*T, V] vs [B*T]. The class count is taken
        # from the logits themselves rather than from a global.
        loss = criterion(logits.view(-1, logits.size(-1)), y_batch.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # gradient clipping

        optimizer.step()

        # Periodically report the current batch loss and a short sample.
        if log_every and epoch % log_every == 0:
            print(f"Epoch {epoch+1}/{epochs} | Loss: {loss.item():.4f}")
            # Generate on the same device the batches use; generate()'s own
            # default is "cuda" regardless of what was passed to train().
            sample_text = generate(model, start_text="Potter was very excited, he", device=device)
            print("Sample:", sample_text[:200], "...\n")

In [17]:
# Kaggle dataset path; adjust when running outside Kaggle.
# The encoding is given explicitly so the read does not depend on the platform's
# default locale (the text contains non-ASCII punctuation such as curly quotes).
with open("/kaggle/input/harry-potter-lstm/Harry_Potter_all_char_separated.txt", encoding="utf-8") as f:
    raw_dataset = f.read()


def chunk_text_words(text, chunk_size=10000):
    """
    Split `text` into consecutive chunks of at most `chunk_size` whitespace-separated words.

    Words inside a chunk are re-joined with single spaces, so the original
    whitespace (including newlines) is not preserved. Returns a list of strings;
    an empty or whitespace-only text yields an empty list.
    """
    tokens = text.split()
    return [
        " ".join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]

# Despite the name, these are fixed-size chunks of 10,000 whitespace-separated words
# (the chunk_text_words default), not actual book chapters.
chapters = chunk_text_words(raw_dataset)

In [None]:
def train_on_chapters(train_fn, chapters, epochs=1):
    """
    Call `train_fn` once per chapter, repeating the whole pass `epochs` times.

    train_fn: callable taking a single chapter
    chapters: sized iterable of chapters (len() is used for the progress line)
    epochs:   number of full passes over `chapters`
    """
    def _run_pass():
        # One sweep over every chapter, in order, with a progress line before each.
        for i, ch in enumerate(chapters):
            print(f"  Training on chapter {i+1}/{len(chapters)}...")
            train_fn(ch)

    for e in range(epochs):
        print(f"Epoch {e+1}/{epochs}")
        _run_pass()

# Three passes over all chunks; each chunk is handed to train(), which runs with
# its own default settings (only the text is passed through).
train_on_chapters(train, chapters, epochs = 3)

Epoch 1/3
  Training on chapter 1/134...
Epoch 1/2000 | Loss: 11.0356
Sample: Potter was very excited, he Bucket Umb COMM Simply Feed RollingburyEnd Fruitmercerue Supply campingprison meticulously46 ninth Johnsomes Oct 1897 pri depends refuge certify hopelessathered Tray nutri ...

Epoch 501/2000 | Loss: 1.7629
Sample: Potter was very excited, he didn’s that they were like this as she always got so if before back on his mother on cat it people who lived two presents at least seen the way here about anything looked u ...

Epoch 1001/2000 | Loss: 0.2080
Sample: Potter was very excited, he and then remember later put to bed that lay silent woken as Kent Mr Dursley drove around its eyes fixed unD for work dolphins it nice yes would soon quite plainly even wort ...

Epoch 1501/2000 | Loss: 0.0733
Sample: Potter was very excited, he Potter are possible at Uncle Vernon through purs yourself should expect astonishing their leather weren could phone there too Hagrid swung up close by surprise 

In [None]:
# Sample a continuation from the trained model using generate()'s default sampling settings.
generate(model, start_text="harry was quite sad when...")