# Day 04 — Multi-Head Attention & Positional Encoding

## Why this day matters
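A single attention head captures one notion of relevance between tokens.
Multi-head attention adds **reasoning diversity**: several heads run in parallel, each free to attend to a different kind of relationship.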

Today’s work assembles the **first real Transformer block** —
the core unit behind GPT, BERT, LLaMA, and modern LLMs.

## What is implemented
- Multi-head masked self-attention (from scratch)
- Learned positional embeddings
- Feed-forward network
- Residual connections + LayerNorm
- A complete mini Transformer language model

## Key realization
Transformers are not magic.
They are **repeatable blocks** composed of simple, well-motivated parts.


In [1]:
# Raw dataset: a single short sentence used as the whole training corpus.
# The leading space is part of the data.
text = (
    " In the beginning the universe was created. "
    "This has made a lot of people very angry "
    "and been widely regarded as bad move"
)
print(text)


 In the beginning the universe was created. This has made a lot of people very angry and been widely regarded as bad move


In [2]:
# Character-level tokenization: the vocabulary is every distinct character
# that occurs in `text`, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

print("Charatcters:", chars)
print("Vocab Size:", vocab_size)


Charatcters: [' ', '.', 'I', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']
Vocab Size: 25


In [3]:
# Lookup tables between characters and integer token ids.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of token ids for the characters of string `s`.

    Raises KeyError if `s` contains a character that is not in `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the token ids in `l`.

    Raises KeyError if an id is not in `itos`.
    """
    return "".join(itos[i] for i in l)


# Round-trip sanity check.
print(encode("the"))
print(decode(encode("the")))



[20, 11, 8]
the


In [4]:
import torch

# Encode the whole corpus as one 1-D tensor of integer token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:20])
print("Total tokens:", len(data))

# Hyperparameters for batching.
batch_size = 4
block_size = 8  # context length


tensor([ 0,  2, 15,  0, 20, 11,  8,  0,  5,  8, 10, 12, 15, 15, 12, 15, 10,  0,
        20, 11])
Total tokens: 121


In [5]:
import torch

# NOTE: this cell repeats the preceding cell statement for statement;
# running it again rebuilds the same `data`, `block_size` and `batch_size`.
data = torch.tensor(
    encode(text),
    dtype=torch.long,
)
print(data[:20])
print("Total tokens:", len(data))

block_size, batch_size = 8, 4  # context length, sequences per batch


tensor([ 0,  2, 15,  0, 20, 11,  8,  0,  5,  8, 10, 12, 15, 15, 12, 15, 10,  0,
        20, 11])
Total tokens: 121


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [8]:
#SINGLE ATTENTION HEAD (MASKED)
class Head(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    Args:
        embed_size: Size of the last dimension of the input.
        head_size: Output size of the key, query and value projections.
        block_size: Side length of the lower-triangular mask buffer.
    """

    def __init__(self, embed_size, head_size, block_size):
        super().__init__()
        # Bias-free linear projections of the input.
        self.key = nn.Linear(embed_size, head_size, bias=False)
        self.query = nn.Linear(embed_size, head_size, bias=False)
        self.value = nn.Linear(embed_size, head_size, bias=False)

        # Lower-triangular ones matrix, registered as a buffer rather than
        # a parameter; zeros mark the positions a query may not look at.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("tril", causal_mask)

    def forward(self, x):
        """Attend over `x` of shape (B, T, C) and return (B, T, head_size).

        Position t only receives weight from positions 0..t.
        """
        _, seq_len, _ = x.shape
        keys = self.key(x)
        queries = self.query(x)
        values = self.value(x)

        # Scaled dot-product scores between every query and every key.
        scale = keys.shape[-1] ** 0.5
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / scale

        # Future positions get -inf so softmax assigns them zero weight.
        hidden = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(hidden, float("-inf"))
        attn = scores.softmax(dim=-1)

        return torch.matmul(attn, values)


In [9]:
#Now we combine heads.
class MultiHeadAttention(nn.Module):
    """Run several `Head`s on the same input and mix their outputs.

    Each head produces `embed_size // num_heads` features; the heads'
    outputs are concatenated along the last dimension and passed through
    a final linear projection of size `embed_size`.

    Args:
        embed_size: Size of the last dimension of the input and output.
        num_heads: Number of attention heads; must be positive and divide
            `embed_size` evenly.
        block_size: Forwarded to each `Head` for its mask buffer.

    Raises:
        ValueError: If `num_heads` is not positive or does not divide
            `embed_size`.
    """

    def __init__(self, embed_size, num_heads, block_size):
        super().__init__()
        # Without this check a non-divisible configuration builds fine but
        # fails in forward(): the concatenated width (num_heads * head_size)
        # no longer matches the input width expected by `proj`.
        if num_heads <= 0 or embed_size % num_heads != 0:
            raise ValueError(
                f"embed_size ({embed_size}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        head_size = embed_size // num_heads
        self.heads = nn.ModuleList([
            Head(embed_size, head_size, block_size)
            for _ in range(num_heads)
        ])
        self.proj = nn.Linear(embed_size, embed_size)

    def forward(self, x):
        """Apply every head to `x`, concatenate on the last dim, project."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        return out


In [10]:
#POSITIONAL EMBEDDINGS (LEARNED)
#GPT uses learned positional embeddings
class PositionalEncoding(nn.Module):
    """Learned positional embeddings: one trainable vector per position.

    Args:
        block_size: Number of distinct positions in the embedding table.
        embed_size: Size of each position vector.
    """

    def __init__(self, block_size, embed_size):
        super().__init__()
        self.position_embedding = nn.Embedding(
            num_embeddings=block_size,
            embedding_dim=embed_size,
        )

    def forward(self, x):
        """Return the vectors for positions 0..T-1 of a (B, T) input.

        Only the shape and device of `x` are used, not its values. The
        result has shape (T, embed_size).
        """
        _, seq_len = x.shape
        idx = torch.arange(seq_len, device=x.device)
        return self.position_embedding(idx)



In [11]:
#FEED-FORWARD NETWORK
class FeedForward(nn.Module):
    """Two-layer MLP applied to the last dimension of its input.

    Expands `embed_size` features to `4 * embed_size`, applies ReLU, then
    projects back down to `embed_size`.
    """

    def __init__(self, embed_size):
        super().__init__()
        hidden_size = 4 * embed_size
        expand = nn.Linear(embed_size, hidden_size)
        contract = nn.Linear(hidden_size, embed_size)
        self.net = nn.Sequential(expand, nn.ReLU(), contract)

    def forward(self, x):
        """Return `x` passed through expand -> ReLU -> contract."""
        return self.net(x)


In [12]:
#TRANSFORMER BLOCK (ATTN + FFN)
class TransformerBlock(nn.Module):
    """Pre-norm Transformer block: attention then feed-forward.

    Each sub-layer is applied to a LayerNorm of its input, and its output
    is added back onto that input (residual connection).

    Args:
        embed_size: Feature size of the input and output.
        num_heads: Number of heads for the attention sub-layer.
        block_size: Forwarded to the attention sub-layer.
    """

    def __init__(self, embed_size, num_heads, block_size):
        super().__init__()
        self.attn = MultiHeadAttention(
            embed_size,
            num_heads,
            block_size,
        )
        self.ffn = FeedForward(embed_size)
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)

    def forward(self, x):
        """Return `x` after the attention and feed-forward residual steps."""
        attended = self.attn(self.ln1(x))
        x = x + attended
        transformed = self.ffn(self.ln2(x))
        return x + transformed


In [13]:
class TransformerLanguageModel(nn.Module):
    """Mini language model: embeddings, one Transformer block, output head.

    Args:
        vocab_size: Number of token ids; also the size of the output logits.
        embed_size: Feature size used throughout the model.
        block_size: Number of positions in the positional embedding table.
        num_heads: Number of attention heads in the block.
    """

    def __init__(self, vocab_size, embed_size=64, block_size=8, num_heads=4):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, embed_size)
        self.pos_embed = PositionalEncoding(block_size, embed_size)
        self.block = TransformerBlock(embed_size, num_heads, block_size)
        self.ln = nn.LayerNorm(embed_size)
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x, targets=None):
        """Compute logits for token ids `x`, plus a loss when targets are given.

        Returns:
            `(logits, loss)`. Without `targets`, `logits` keeps its
            three-dimensional shape and `loss` is None. With `targets`,
            `logits` is flattened to (B*T, C) and `loss` is the
            cross-entropy against the flattened targets.
        """
        # Token and position embeddings are summed.
        hidden = self.token_embed(x) + self.pos_embed(x)
        hidden = self.ln(self.block(hidden))
        logits = self.fc(hidden)

        if targets is None:
            return logits, None

        # cross_entropy is fed one row of logits per target id.
        batch, seq_len, n_classes = logits.shape
        flat_logits = logits.view(batch * seq_len, n_classes)
        flat_targets = targets.view(batch * seq_len)
        return flat_logits, F.cross_entropy(flat_logits, flat_targets)


## Observations

- Multi-head attention is expected to stabilize training (no training run is shown in this notebook, so this is not measured here)
- Different heads learn different token relationships
- Positional embeddings restore sequence order
- Transformer block is a reusable, composable unit

## Key Insight (Day 4)
Transformers scale not through size alone,
but because **the same block can be stacked arbitrarily deep** — the model above uses a single block, and a deeper model would chain more of the same.
