# Understanding Transformers using PyTorch
https://www.geeksforgeeks.org/deep-learning/transformer-using-pytorch/

<img src="https://media.geeksforgeeks.org/wp-content/uploads/20250325174552667398/transformer.png">

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

## Multi-Head Attention

In [87]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The inputs are projected to queries, keys and values, split into
    ``num_heads`` heads of size ``d_k = d_model // num_heads``, attended
    independently per head, merged back and passed through an output
    projection.

    Args:
        d_model: embedding size (e.g. 512). Must be divisible by ``num_heads``.
        num_heads: number of attention heads (e.g. 8).
    """

    def __init__(self, d_model, num_heads):
        super().__init__()

        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model  # embedding size (e.g. 512)
        self.num_heads = num_heads  # number of attention heads (e.g. 8)

        # Every head gets an equal slice of the embedding (e.g. 512 // 8 = 64).
        self.d_k = d_model // num_heads
        print(f"Number of dimensions each head gets: {self.d_k}")

        # Projections producing Q (query), K (key) and V (value).
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

        # Output projection applied to the re-combined heads.
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Compute ``softmax(Q K^T / sqrt(d_k)) V``.

        Args:
            Q, K, V: tensors whose last two dimensions are
                ``(seq_len, d_k)``; when called from ``forward`` they are
                ``(batch, heads, seq_len, d_k)``.
            mask: optional tensor broadcastable to the score matrix.
                Positions where ``mask == 0`` are excluded from attention
                (e.g. a padding mask or a causal/future-token mask).

        Returns:
            The attention-weighted combination of ``V``, with the same
            leading dimensions as ``Q``.
        """
        # Similarity between every query and every key. Dividing by
        # sqrt(d_k) keeps the scores from growing with the head size,
        # which stabilises the softmax.
        # Shape from forward: (batch, heads, query_len, key_len).
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # A large negative score becomes a ~0 probability after softmax.
            scores = scores.masked_fill(mask == 0, -1e9)

        # Normalise over the key dimension so each query's weights sum to 1.
        attn = torch.softmax(scores, dim=-1)

        return torch.matmul(attn, V)

    def split_heads(self, x):
        """Reshape ``(batch, seq_len, d_model)`` to ``(batch, heads, seq_len, d_k)``."""
        batch_size, seq_length, _ = x.size()  # example: (32, 512, 512)
        # example result: (32, 8, 512, 64)
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        """Reshape ``(batch, heads, seq_len, d_k)`` back to ``(batch, seq_len, d_model)``."""
        batch_size, _, seq_len, _ = x.size()
        # transpose() returns a non-contiguous view; view() needs contiguous
        # memory, hence the contiguous() call in between.
        return x.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Apply multi-head attention.

        Args:
            Q, K, V: tensors of shape ``(batch, seq_len, d_model)``.
            mask: optional mask forwarded to
                ``scaled_dot_product_attention`` (0 = masked position).

        Returns:
            Tensor of shape ``(batch, query_len, d_model)``.
        """
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        # Forward the caller's mask; hard-coding None here would silently
        # disable padding and causal masking.
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

        return self.W_o(self.combine_heads(attn_output))

### Shape Summary

| Step          | Shape                          |
| ------------- | ------------------------------ |
| Input         | `(batch, seq_len, d_model)`    |
| Split heads   | `(batch, heads, seq_len, d_k)` |
| Attention     | `(batch, heads, seq_len, d_k)` |
| Combine heads | `(batch, seq_len, d_model)`    |
| Final output  | `(batch, seq_len, d_model)`    |

### Why Multi-Head Attention is Powerful

- ✔ Captures multiple relationships
- ✔ Works in parallel (faster than RNNs)
- ✔ Handles long-range dependencies
- ✔ Foundation of Transformers

## Position-Wise Feed Forward

In a Transformer, after attention mixes information across tokens, the position-wise feed-forward network (FFN):
- Processes each token independently
- Adds non-linearity
- Expands and compresses feature space

Think of it as a small neural network applied to every token separately.

In [88]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied along the last dimension.

    Expands features from ``d_model`` to ``d_ff``, applies ReLU, then
    projects back to ``d_model``.

    Args:
        d_model: embedding size.
        d_ff: hidden size of the feed-forward network.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()

        # Expansion and compression projections.
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

        # Non-linearity between the two projections.
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

## Positional Encoding
This block defines the Positional Encoding class, which adds positional information to the token embeddings, allowing the model to retain information about word positions in the input sequence.

This class implements Sinusoidal Positional Encoding, which is essential for Transformers because attention alone has no sense of order.

Transformers:
- Do not use RNNs or CNNs
- Process all tokens in parallel
- Have no inherent notion of sequence order

So we inject position information into embeddings.

This class creates fixed (non-learned) sinusoidal positional encodings.

In [89]:
class PositionalEncoding(nn.Module):
    """Fixed (non-learned) sinusoidal positional encoding.

    Precomputes a ``(1, max_seq_length, d_model)`` table in which even
    feature indices hold ``sin(position * freq)`` and odd feature indices
    hold ``cos(position * freq)``, and adds the matching slice to the input.

    Args:
        d_model: embedding size. Both even and odd values are supported.
        max_seq_length: number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_length):
        super().__init__()

        # Empty positional-encoding matrix.
        pe = torch.zeros(max_seq_length, d_model)

        # Column vector [[0], [1], ..., [max_seq_length - 1]],
        # shape (max_seq_length, 1).
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)

        # One frequency per sin/cos pair: different wavelengths for
        # different dimensions.
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)
        )

        # Shape (max_seq_length, ceil(d_model / 2)).
        angles = position * div_term

        # Sine on even feature indices.
        pe[:, 0::2] = torch.sin(angles)
        # Cosine on odd feature indices. For odd d_model there is one fewer
        # odd column than even columns, so the last frequency is dropped.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A buffer moves with the module (device, state_dict) but is not a
        # parameter, so the optimiser never updates it.
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_seq_length, d_model)

    def forward(self, x):
        """Add positional encodings to ``x`` of shape ``(batch, seq_len, d_model)``."""
        return x + self.pe[:, :x.size(1)]

## Encoder Layer
This block defines the Encoder Layer class, which contains the multi-head attention mechanism and the position-wise feed-forward network, with layer normalisation and dropout applied.

This class is one complete Transformer Encoder block.

It combines everything you’ve learned so far:
- Multi-Head Self-Attention
- Position-wise Feed-Forward Network
- Residual connections
- Layer Normalisation
- Dropout

### What Is an Encoder Layer?
In a Transformer encoder, each layer does two things:
- Self-attention → tokens look at each other
- Feed-forward network → each token is refined individually

This block is stacked N times (e.g., 6 or 12 layers).

In [90]:
class EncoderLayer(nn.Module):
    """One Transformer encoder block.

    Self-attention followed by a position-wise feed-forward network. Each
    sub-layer's output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised.

    Args:
        d_model: embedding size (e.g. 512).
        num_heads: attention heads (e.g. 8).
        d_ff: feed-forward hidden size (e.g. 2048).
        dropout: dropout probability used for regularisation.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()

        # Sub-layers.
        self.self_attn = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model=d_model, d_ff=d_ff)

        # One LayerNorm per residual connection.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Shared dropout applied to both sub-layer outputs.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run the block on ``x``; ``mask`` is passed through to ``self_attn``."""
        # Self-attention sub-layer: query, key and value are all x.
        attended = self.dropout(self.self_attn(x, x, x, mask))
        x = self.norm1(x + attended)

        # Feed-forward sub-layer.
        refined = self.dropout(self.feed_forward(x))
        return self.norm2(x + refined)

In [91]:
encoder = EncoderLayer(d_model=512, num_heads=8, d_ff=2048, dropout=0.2)

Number of dimensions each head gets: 64


In [92]:
# Display the encoder layer's repr (its tree of sub-modules).
encoder

EncoderLayer(
  (self_attn): MultiHeadAttention(
    (W_q): Linear(in_features=512, out_features=512, bias=True)
    (W_k): Linear(in_features=512, out_features=512, bias=True)
    (W_v): Linear(in_features=512, out_features=512, bias=True)
    (W_o): Linear(in_features=512, out_features=512, bias=True)
  )
  (feed_forward): PositionWiseFeedForward(
    (fc1): Linear(in_features=512, out_features=2048, bias=True)
    (fc2): Linear(in_features=2048, out_features=512, bias=True)
    (relu): ReLU()
  )
  (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [95]:

# Toy configuration for a quick forward pass
batch_size, seq_len = 2, 5             # input dimensions
d_model, num_heads, d_ff = 8, 2, 32    # model dimensions
dropout = 0.1                          # dropout probability

# Random input batch of shape (batch_size, seq_len, d_model)
x = torch.randn((batch_size, seq_len, d_model))
x

tensor([[[-0.0620, -0.5188, -1.4813, -0.3911, -0.1161,  0.0293, -2.4177,
           0.8640],
         [-1.0250, -1.9186,  0.5580, -1.3278,  0.0911,  2.6778, -1.0107,
           0.5521],
         [-0.5026, -1.4435, -1.2836,  0.0404, -1.4624, -0.6389,  2.6402,
          -0.2110],
         [-0.1491,  0.2950,  0.9525,  0.4953,  0.0340, -1.3236,  0.1164,
          -0.1951],
         [-1.3562, -1.1630,  0.8303,  1.5787, -0.6076, -1.3429,  0.4135,
          -0.5871]],

        [[ 0.3376,  0.4662,  0.3402, -1.0618,  0.5530,  0.8334, -0.1288,
           0.8086],
         [-1.2023, -0.9707,  0.9154,  1.1936,  1.7496,  0.6047, -0.5812,
           0.6910],
         [-1.2275, -0.3438,  0.6108, -0.8516,  0.8066, -0.2044, -1.7381,
           0.5873],
         [-0.0929,  0.4559,  1.3546,  0.7567, -0.1896,  1.8262, -1.1430,
          -0.6867],
         [-1.9211,  0.0942, -0.0342, -0.8717, -0.0406,  1.1138,  0.4076,
           0.1987]]])

In [94]:
# All-ones mask (1 = keep, 0 = mask), so no position is marked as masked
mask = torch.ones((batch_size, 1, 1, seq_len))

# Build a single encoder layer and run the batch through it
encoder = EncoderLayer(d_model=d_model, num_heads=num_heads, d_ff=d_ff, dropout=dropout)
output = encoder(x, mask)

# Report the shapes and the resulting tensor
print("Input shape :", x.shape)
print("Output shape:", output.shape)
print("\nSample output tensor:\n", output)

Number of dimensions each head gets: 4
Input shape : torch.Size([2, 5, 8])
Output shape: torch.Size([2, 5, 8])

Sample output tensor:
 tensor([[[ 0.3806,  0.1950, -0.2536, -1.4823, -1.0373, -0.2126,  2.0797,
           0.3305],
         [ 0.2097, -1.2543, -0.2978,  0.7728, -1.4621, -0.4739,  0.9399,
           1.5656],
         [-0.5283,  0.2674, -0.0336, -0.6224, -1.2941,  0.0087,  2.3583,
          -0.1560],
         [ 0.9262,  1.6344, -1.0152, -0.0511, -0.5642, -1.3574, -0.5562,
           0.9835],
         [ 1.1277, -0.0372, -0.9250, -0.9798, -1.4394,  0.4068,  1.6193,
           0.2276]],

        [[-0.3407, -1.3028, -0.3811,  2.1654,  0.0465, -0.7679,  0.8343,
          -0.2536],
         [-0.1522, -1.0708,  0.3779,  0.4899,  1.6798,  0.0306, -1.8409,
           0.4857],
         [ 0.2907,  1.6394, -0.8440,  0.6338, -0.5269, -0.1237, -1.8092,
           0.7399],
         [-0.3635,  1.2514,  0.0345,  1.3420,  0.8607, -0.9406, -1.5861,
          -0.5984],
         [ 0.6888, -0.7602