### 🧱 Step 1 – Notebook setup

In [1]:
# Step 1: Basic setup
import torch
import torch.nn as nn
import torch.nn.functional as F

# Fix the global PyTorch seed so weight init and sampling repeat across runs
torch.manual_seed(0)

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)


Using device: cuda


### 🧱 Step 2 – Tiny toy “corpus”

In [2]:
# Step 2: a handful of short greetings is the entire training set
corpus = [
    "hello world",
    "hi there",
    "how are you",
    "hello there",
    "hi world",
]

# List every sentence next to its index in the corpus
for i, s in enumerate(corpus):
    print(i, s, sep=": ")


0: hello world
1: hi there
2: how are you
3: hello there
4: hi world


### 🧱 Step 3 – Character-level tokenizer

In [3]:
# Step 3: Character-level tokenizer

class CharTokenizer:
    """Character-level tokenizer over a fixed, lowercase vocabulary.

    The vocabulary is three special tokens (``<pad>``, ``<bos>``, ``<eos>``)
    followed by ``a``-``z``, space, comma, ``!`` and ``?``.

    Attributes:
        special_tokens: list of the special token strings, in ID order.
        itos: list mapping token ID -> token string.
        stoi: dict mapping token string -> token ID.
    """

    def __init__(self):
        specials = ["<pad>", "<bos>", "<eos>"]
        alphabet = "abcdefghijklmnopqrstuvwxyz ,!?"

        # Specials come first, so they receive IDs 0, 1 and 2
        self.special_tokens = specials
        self.itos = [*specials, *alphabet]                        # index -> string
        self.stoi = dict(zip(self.itos, range(len(self.itos))))   # string -> index

    @property
    def pad_id(self):
        """ID of the ``<pad>`` token."""
        return self.stoi["<pad>"]

    @property
    def bos_id(self):
        """ID of the ``<bos>`` (beginning-of-sequence) token."""
        return self.stoi["<bos>"]

    @property
    def eos_id(self):
        """ID of the ``<eos>`` (end-of-sequence) token."""
        return self.stoi["<eos>"]

    @property
    def vocab_size(self):
        """Total number of tokens, special tokens included."""
        return len(self.itos)

    def encode(self, text, add_special_tokens=True):
        """Convert ``text`` to a list of token IDs.

        The text is lowercased first. Characters that are not in the
        vocabulary are silently dropped. When ``add_special_tokens`` is true
        the result is wrapped in ``<bos>`` ... ``<eos>``.
        """
        lookup = self.stoi
        body = [lookup[ch] for ch in text.lower() if ch in lookup]
        if not add_special_tokens:
            return body
        return [self.bos_id, *body, self.eos_id]

    def decode(self, ids, skip_special=True):
        """Convert an iterable of token IDs back to a string.

        Each ID is passed through ``int()`` before lookup. Special tokens are
        omitted from the output unless ``skip_special`` is false.
        """
        hidden = frozenset(self.special_tokens) if skip_special else frozenset()
        pieces = (self.itos[int(token_id)] for token_id in ids)
        return "".join(piece for piece in pieces if piece not in hidden)


In [4]:
# Round-trip one sentence through the tokenizer as a quick sanity check
tok = CharTokenizer()
sample_ids = tok.encode("hello world")

print("Vocab size:", tok.vocab_size)
print("Token IDs of 'hello world':", sample_ids)
print("Decoded back:", tok.decode(sample_ids))

Vocab size: 33
Token IDs of 'hello world': [1, 10, 7, 14, 14, 17, 29, 25, 17, 20, 14, 6, 2]
Decoded back: hello world


### 🧱 Step 4 – Build a small batch of token IDs

In [5]:
# Step 4: turn every sentence into token IDs (each wrapped in <bos> ... <eos>)

encoded = list(map(tok.encode, corpus))

# Show each sentence beside its ID sequence
print("Encoded sequences:")
for s, ids in zip(corpus, encoded):
    print(repr(s), "->", ids)

Encoded sequences:
'hello world' -> [1, 10, 7, 14, 14, 17, 29, 25, 17, 20, 14, 6, 2]
'hi there' -> [1, 10, 11, 29, 22, 10, 7, 20, 7, 2]
'how are you' -> [1, 10, 17, 25, 29, 3, 20, 7, 29, 27, 17, 23, 2]
'hello there' -> [1, 10, 7, 14, 14, 17, 29, 22, 10, 7, 20, 7, 2]
'hi world' -> [1, 10, 11, 29, 25, 17, 20, 14, 6, 2]


In [6]:
# The longest encoded sequence sets the padded width of the batch
max_len = max(map(len, encoded))
print("Max length:", max_len)

Max length: 13


In [7]:
# Right-pad every sequence with <pad> until it is max_len tokens long
pad_id = tok.pad_id
padded = [
    seq + [pad_id for _ in range(max_len - len(seq))]
    for seq in encoded
]

In [8]:
# Stack the padded rows into one (B, T) integer tensor on the chosen device
input_ids = torch.tensor(data=padded, device=device, dtype=torch.int64)
print("input_ids shape:", input_ids.shape)
print(input_ids)

input_ids shape: torch.Size([5, 13])
tensor([[ 1, 10,  7, 14, 14, 17, 29, 25, 17, 20, 14,  6,  2],
        [ 1, 10, 11, 29, 22, 10,  7, 20,  7,  2,  0,  0,  0],
        [ 1, 10, 17, 25, 29,  3, 20,  7, 29, 27, 17, 23,  2],
        [ 1, 10,  7, 14, 14, 17, 29, 22, 10,  7, 20,  7,  2],
        [ 1, 10, 11, 29, 25, 17, 20, 14,  6,  2,  0,  0,  0]], device='cuda:0')


### 🧩 Step 5 – Model config

In [9]:
# Step 5: Model config

from dataclasses import dataclass

@dataclass
class GPTConfig:
    """Hyperparameters for the tiny decoder-only transformer in this notebook.

    Only ``vocab_size`` is required; every other field has a default.
    """
    vocab_size: int        # rows of the token embedding table and width of the LM head output
    d_model: int = 64      # embedding size; must be divisible by n_heads
    n_heads: int = 4       # number of attention heads
    n_layers: int = 2      # number of transformer blocks stacked in the model
    block_size: int = 32   # max sequence length (rows in the positional embedding table)
    dropout: float = 0.1   # dropout probability used by the embedding, attention and MLP layers

# Build the config for this run. block_size is set to the padded batch width,
# so the positional embedding table has exactly one row per position in
# input_ids and cannot embed anything longer than that.
cfg = GPTConfig(
    vocab_size=tok.vocab_size,
    block_size=input_ids.size(1),
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout=0.1,
)

print(cfg)


GPTConfig(vocab_size=33, d_model=64, n_heads=4, n_layers=2, block_size=13, dropout=0.1)


### 🧩 Step 6 – Embedding layer (token + position)

In [10]:
# Step 6: Embedding module (token + positional)

class TokenPositionalEmbedding(nn.Module):
    """Sum of a learned token embedding and a learned absolute-position embedding.

    The positional table has ``cfg.block_size`` rows, so inputs longer than
    ``block_size`` tokens cannot be embedded and are rejected up front.
    """

    def __init__(self, cfg: "GPTConfig"):
        super().__init__()
        self.cfg = cfg
        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.d_model)
        self.pos_emb = nn.Embedding(cfg.block_size, cfg.d_model)
        self.dropout = nn.Dropout(cfg.dropout)

    def forward(self, input_ids):
        """
        input_ids: (B, T) of token indices
        returns: (B, T, d_model)

        Raises:
            IndexError: if T is larger than the positional table (block_size).
        """
        _, seq_len = input_ids.shape

        # Fail early with a readable message instead of an opaque embedding
        # lookup error (which on CUDA surfaces as a device-side assert).
        max_positions = self.pos_emb.num_embeddings
        if seq_len > max_positions:
            raise IndexError(
                f"sequence length {seq_len} exceeds block_size {max_positions}"
            )

        # 1) Token embeddings: (B, T, d_model)
        token_vecs = self.tok_emb(input_ids)

        # 2) Positional embeddings for positions [0, 1, ..., T-1]
        positions = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)  # (1, T)
        pos_vecs = self.pos_emb(positions)  # (1, T, d_model), broadcast over B

        # 3) Add them, then apply dropout (a no-op in eval mode)
        return self.dropout(token_vecs + pos_vecs)

# Sanity check: embed the batch and confirm the result is (B, T, d_model).
# The module is still in its default training mode here, so dropout is active.
embed = TokenPositionalEmbedding(cfg)
embed = embed.to(device)
x = embed(input_ids)
print("Embedding output shape:", x.shape)


Embedding output shape: torch.Size([5, 13, 64])


### 🧩 Step 7 – Causal Self-Attention

In [11]:
# Step 7: Causal self-attention

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention where each position attends only to itself
    and to earlier positions."""

    def __init__(self, cfg):
        super().__init__()
        assert cfg.d_model % cfg.n_heads == 0, "d_model must be divisible by n_heads"
        self.cfg = cfg
        self.n_heads = cfg.n_heads
        self.head_dim = cfg.d_model // cfg.n_heads

        # Fused projection that produces Q, K and V at once (d_model -> 3 * d_model)
        self.qkv = nn.Linear(cfg.d_model, 3 * cfg.d_model)
        # Output projection applied after the heads are merged back together
        self.proj = nn.Linear(cfg.d_model, cfg.d_model)
        # One dropout module, applied to the attention weights and to the output
        self.dropout = nn.Dropout(cfg.dropout)

    def forward(self, x):
        """
        x: (B, T, d_model)
        returns: (B, T, d_model)
        """
        batch, seq_len, width = x.shape  # width = d_model

        def to_heads(t):
            # (B, T, d_model) -> (B, n_heads, T, head_dim)
            return t.view(batch, seq_len, self.n_heads, self.head_dim).transpose(1, 2)

        # Project once, cut the result into Q / K / V, and split each into heads
        queries, keys, values = (to_heads(part) for part in self.qkv(x).chunk(3, dim=-1))

        # Scaled dot-product scores: (B, n_heads, T, T)
        scores = (queries @ keys.transpose(-2, -1)) / (self.head_dim ** 0.5)

        # Causal mask: entry [t, s] is True when key position s lies after
        # query position t; those scores become -inf so softmax gives them 0.
        steps = torch.arange(seq_len, device=x.device)
        is_future = steps.unsqueeze(0) > steps.unsqueeze(1)   # (T, T), broadcast over B and heads
        scores = scores.masked_fill(is_future, float('-inf'))

        # Normalise over key positions, then drop out attention weights
        weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of values: (B, n_heads, T, head_dim)
        mixed = weights @ values

        # Merge heads back into a single d_model-wide vector per position
        merged = mixed.transpose(1, 2).contiguous().view(batch, seq_len, width)

        # Final projection followed by dropout
        return self.dropout(self.proj(merged))


In [12]:
# Smoke test: push the embedded batch x (from x = embed(input_ids)) through
# one attention layer without tracking gradients and check the output shape.
attn = CausalSelfAttention(cfg)
attn = attn.to(device)

with torch.no_grad():
    y = attn(x)
print("Attention output shape:", y.shape)


Attention output shape: torch.Size([5, 13, 64])


### 🧩 Step 8 – Transformer Block

In [13]:
# Step 8: Transformer Block (Attention + MLP + residuals)

class Block(nn.Module):
    """Pre-LayerNorm transformer block.

    Two residual sub-layers in sequence: causal self-attention, then a
    two-layer GELU MLP whose hidden width is 4 * d_model.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg.d_model
        hidden = 4 * width

        self.ln1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(cfg)
        self.ln2 = nn.LayerNorm(width)
        self.mlp = nn.Sequential(
            nn.Linear(width, hidden),
            nn.GELU(),
            nn.Linear(hidden, width),
            nn.Dropout(cfg.dropout),
        )

    def forward(self, x):
        """x: (B, T, d_model) -> (B, T, d_model)."""
        # Attention sub-layer: normalise, attend, add back onto the residual stream
        attn_out = self.attn(self.ln1(x))
        x = x + attn_out

        # MLP sub-layer: normalise, transform, add back onto the residual stream
        mlp_out = self.mlp(self.ln2(x))
        return x + mlp_out


In [14]:
# Smoke test: run one transformer block on the embedded batch x (output of the
# embedding layer) without tracking gradients and check the output shape.
block = Block(cfg)
block = block.to(device)

with torch.no_grad():
    x_block = block(x)
print("Block output shape:", x_block.shape)


Block output shape: torch.Size([5, 13, 64])


### 🧩 Step 9 – Full Decoder-only LLM

In [15]:
# Step 9: Full tiny decoder-only LLM

class DecoderOnlyLM(nn.Module):
    """Tiny decoder-only language model.

    Pipeline: token + positional embedding -> ``cfg.n_layers`` transformer
    blocks -> final LayerNorm -> linear head producing per-position logits
    over the vocabulary.

    Args:
        cfg: config object providing ``d_model``, ``vocab_size`` and
            ``n_layers`` (plus whatever the embedding and block modules read).
        pad_id: token ID excluded from the loss. When left as ``None`` the
            model falls back to the notebook-level ``tok.pad_id`` at loss
            time, which is the original behaviour; pass it explicitly to use
            the model without relying on that global.
    """

    def __init__(self, cfg, pad_id=None):
        super().__init__()
        self.cfg = cfg
        self.pad_id = pad_id
        self.embed = TokenPositionalEmbedding(cfg)
        self.blocks = nn.ModuleList([Block(cfg) for _ in range(cfg.n_layers)])
        self.ln_f = nn.LayerNorm(cfg.d_model)
        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)

    def forward(self, input_ids, targets=None):
        """
        input_ids: (B, T) token IDs
        targets: (B, T) token IDs for computing next-token loss (optional)
        returns: logits (B, T, vocab_size), loss (or None)
        """
        # 1) Embedding: tokens + positions -> (B, T, d_model)
        x = self.embed(input_ids)

        # 2) Transformer blocks
        for blk in self.blocks:
            x = blk(x)

        # 3) Final LayerNorm
        x = self.ln_f(x)

        # 4) LM head: per-token scores over the vocabulary
        logits = self.lm_head(x)   # (B, T, vocab_size)

        loss = None
        if targets is not None:
            # Prefer the pad ID given at construction; only fall back to the
            # notebook's global tokenizer when none was supplied.
            ignore_index = self.pad_id if self.pad_id is not None else tok.pad_id

            # Next-token prediction:
            # logits for positions 0..T-2 predict targets at 1..T-1
            shifted_logits = logits[:, :-1, :]    # (B, T-1, V)
            shifted_targets = targets[:, 1:]      # (B, T-1)

            loss = F.cross_entropy(
                shifted_logits.reshape(-1, shifted_logits.size(-1)),
                shifted_targets.reshape(-1),
                ignore_index=ignore_index,  # pad positions contribute nothing to the loss
            )

        return logits, loss


In [16]:
# Build the full model and run one forward pass before any training; the
# batch is passed as both input and target to get the starting loss.
model = DecoderOnlyLM(cfg)
model = model.to(device)

logits, loss = model(input_ids, targets=input_ids)
print(f"Logits shape: {logits.shape}")
print(f"Initial loss: {loss.item()}")


Logits shape: torch.Size([5, 13, 33])
Initial loss: 3.646306276321411


### 🧩 Step 10 – Simple training loop

In [17]:
# Step 10: fit the model on the single toy batch

# Relies on model and input_ids created in the earlier cells

learning_rate = 1e-3
num_steps = 300  # keep small at first
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

model.train()
for step in range(1, num_steps + 1):
    # Clear gradients left over from the previous iteration
    optimizer.zero_grad()

    # The batch is both input and target; the model shifts targets internally
    logits, loss = model(input_ids, targets=input_ids)

    # Backprop and parameter update
    loss.backward()
    optimizer.step()

    # Report on the first step and on every 50th step
    if step == 1 or step % 50 == 0:
        print(f"Step {step:3d} | loss = {loss.item():.4f}")


Step   1 | loss = 3.5746
Step  50 | loss = 0.3736
Step 100 | loss = 0.1986
Step 150 | loss = 0.1684
Step 200 | loss = 0.1640
Step 250 | loss = 0.1525
Step 300 | loss = 0.1584


### 🧩 Step 11 – Add a generate function

In [18]:
# Step 11: Text generation helper

@torch.no_grad()
def generate_text(model, tok, prompt, max_new_tokens=20, greedy=True):
    """Autoregressively extend ``prompt`` and return the decoded text.

    Args:
        model: callable module returning ``(logits, loss)`` for a (1, T)
            tensor of token IDs. If it exposes ``model.cfg.block_size``, the
            context fed to it is cropped to that many most-recent tokens.
        tok: tokenizer providing ``encode``, ``decode`` and ``eos_id``.
        prompt: text to continue.
        max_new_tokens: upper bound on the number of generated tokens.
        greedy: pick the highest-scoring token when true; otherwise sample
            from the softmax distribution.

    Returns:
        The prompt plus generated continuation, decoded with the tokenizer's
        default settings. Generation stops early once ``<eos>`` is produced.

    The model is switched to eval mode for generation and put back into
    whatever mode it was in beforehand.
    """
    # Run on the model's own device rather than a notebook-level global
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    # The positional table cannot embed more than block_size positions
    block_size = getattr(getattr(model, "cfg", None), "block_size", None)

    # 1) Encode the prompt; drop the trailing <eos> so the model continues
    #    the text instead of seeing an already-finished sequence
    ids = tok.encode(prompt)
    if ids and ids[-1] == tok.eos_id:
        ids = ids[:-1]

    input_ids_gen = torch.tensor([ids], dtype=torch.long, device=run_device)  # (1, T)

    was_training = model.training
    model.eval()
    try:
        for _ in range(max_new_tokens):
            # 2) Feed at most block_size of the most recent tokens
            if block_size is None:
                context = input_ids_gen
            else:
                context = input_ids_gen[:, -block_size:]
            logits, _loss = model(context)    # (1, T', vocab_size)
            last_logits = logits[:, -1, :]    # (1, vocab_size)

            # 3) Choose the next token
            if greedy:
                # softmax is monotonic, so argmax over logits picks the same token
                next_id = last_logits.argmax(dim=-1, keepdim=True)   # (1, 1)
            else:
                probs = F.softmax(last_logits, dim=-1)               # (1, vocab_size)
                next_id = torch.multinomial(probs, num_samples=1)    # (1, 1)

            # 4) Append to the running sequence
            input_ids_gen = torch.cat([input_ids_gen, next_id], dim=1)

            # 5) Stop once the model emits <eos>
            if next_id.item() == tok.eos_id:
                break
    finally:
        # Do not leave a model that was training stuck in eval mode
        model.train(was_training)

    # Decode the whole sequence (the tokenizer skips special tokens by default)
    return tok.decode(input_ids_gen[0].tolist())


### 🧩 Step 12 – Try generating some text

In [19]:
# Greedy-decode a continuation for several short prompts

prompts = ["h", "hello", "hi", "how", "he"]

for p in prompts:
    out = generate_text(model, tok, p, max_new_tokens=20, greedy=True)
    print("Prompt:", repr(p), "->", repr(out))


Prompt: 'h' -> 'hi there'
Prompt: 'hello' -> 'hello world'
Prompt: 'hi' -> 'hi there'
Prompt: 'how' -> 'how are you'
Prompt: 'he' -> 'hello world'


In [24]:
# Bare trailing expression: the notebook displays the repr of the returned string.
generate_text(model, tok, "hello", max_new_tokens=20, greedy=True)

'hello world'