## Assignment No-4
**Name** - Vishal Pattar  
**Roll No** - 43556  
**Class** - BE AIML  
**Batch** - Batch D  
**Subject** - Natural Language Processing

### Problem Statement

Create a Transformer from scratch using the PyTorch library.

In [1]:
import torch
import torch.nn as nn
import math

In [2]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    A table of shape ``(1, max_len, d_model)`` is precomputed once and stored
    as the non-trainable buffer ``pe``. Even feature columns hold
    ``sin(pos * freq)`` and odd feature columns hold ``cos(pos * freq)``,
    where ``freq = 10000 ** (-2i / d_model)`` for the i-th column pair.

    Args:
        d_model: Size of the feature (last) dimension. May be even or odd.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        # One angle per (position, even-column) pair:
        # shape (max_len, ceil(d_model / 2)).
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so only the first d_model // 2 frequencies get a cosine partner.
        # For even d_model this slice is the whole table.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading axis of size 1 lets the table broadcast over the batch.
        # A buffer follows the module across devices but is not trained.
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose axis 1 indexes positions and whose last axis has
                size ``d_model``, e.g. ``(batch, seq_len, d_model)``. The
                sequence length must not exceed ``max_len``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        return x + self.pe[:, :x.size(1), :]

In [3]:
class MultiHeadSelfAttention(nn.Module):
    """Unmasked multi-head scaled dot-product self-attention.

    One fused linear layer produces queries, keys and values. The feature
    dimension is split into ``num_heads`` heads of width
    ``d_k = d_model // num_heads``, each head attends over the full sequence,
    and the concatenated heads pass through a final linear projection.

    Args:
        d_model: Feature size of the input and output.
        num_heads: Number of attention heads; must divide ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # The fused projection's output is laid out as [Q | K | V] along the
        # last axis, each part d_model wide.
        self.qkv = nn.Linear(d_model, d_model * 3)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Attend every position of ``x`` to every position of ``x``.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.

        Returns:
            Tensor of shape ``(batch, seq_len, d_model)``.
        """
        batch, seq_len, width = x.size()

        # Split the fused projection into Q, K and V, then move the head axis
        # forward so each is (batch, heads, seq_len, d_k).
        query, key, value = (
            part.reshape(batch, seq_len, self.num_heads, self.d_k).transpose(1, 2)
            for part in self.qkv(x).chunk(3, dim=-1)
        )

        # Scale by sqrt(d_k) before normalising over the key axis.
        logits = (query @ key.transpose(-2, -1)) / math.sqrt(self.d_k)
        probs = logits.softmax(dim=-1)
        context = probs @ value

        # Bring heads next to their features and flatten to d_model.
        merged = context.transpose(1, 2).reshape(batch, seq_len, width)
        return self.out(merged)

In [4]:
class TransformerBlock(nn.Module):
    """One encoder layer: self-attention, then a position-wise feed-forward net.

    Each sub-layer's output goes through dropout, is added back to the
    sub-layer's input (residual connection), and the sum is layer-normalised.

    Args:
        d_model: Feature size of the input and output.
        num_heads: Number of attention heads.
        ff_hidden_dim: Width of the feed-forward hidden layer.
        dropout: Dropout probability used after each sub-layer.
    """

    def __init__(self, d_model, num_heads, ff_hidden_dim, dropout=0.1):
        super().__init__()

        # Attention sub-layer.
        self.attn = MultiHeadSelfAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)

        # Feed-forward sub-layer: expand, apply ReLU, project back.
        widen = nn.Linear(d_model, ff_hidden_dim)
        narrow = nn.Linear(ff_hidden_dim, d_model)
        self.ff = nn.Sequential(widen, nn.ReLU(), narrow)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        """Apply both sub-layers to ``x`` and return a tensor of the same shape."""
        # Residual around attention, normalised after the addition.
        x = self.norm1(x + self.dropout1(self.attn(x)))
        # Residual around the feed-forward net, normalised the same way.
        return self.norm2(x + self.dropout2(self.ff(x)))

In [5]:
class TransformerEncoder(nn.Module):
    """Token-level Transformer encoder with a linear output head.

    Integer token ids are embedded, combined with positional encodings, run
    through ``num_layers`` stacked ``TransformerBlock`` layers, and finally
    projected from ``d_model`` back to ``input_dim`` scores per position.

    Args:
        input_dim: Number of distinct token ids (embedding rows and output
            width).
        d_model: Embedding / hidden feature size.
        num_layers: Number of stacked encoder blocks.
        num_heads: Attention heads per block.
        ff_hidden_dim: Feed-forward hidden width per block.
        max_len: Number of positions the positional encoding precomputes.
    """

    def __init__(self, input_dim, d_model, num_layers, num_heads, ff_hidden_dim, max_len=512):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)

        blocks = [
            TransformerBlock(d_model, num_heads, ff_hidden_dim)
            for _ in range(num_layers)
        ]
        self.layers = nn.ModuleList(blocks)

        self.out = nn.Linear(d_model, input_dim)

    def forward(self, x):
        """Map token ids ``(batch, seq_len)`` to scores ``(batch, seq_len, input_dim)``."""
        hidden = self.pos_encoding(self.embedding(x))
        for block in self.layers:
            hidden = block(hidden)
        logits = self.out(hidden)
        return logits

In [None]:
# Smoke test: run a random batch of token ids through a small encoder.
vocab_size = 100
model = TransformerEncoder(
    input_dim=vocab_size,
    d_model=64,
    num_layers=2,
    num_heads=4,
    ff_hidden_dim=128,
)

# Random token ids in [0, vocab_size), shaped (batch_size, seq_len).
batch_size, seq_len = 2, 10
input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))
output = model(input_ids)

# The encoder returns one score per vocabulary entry at every position.
print("Input shape:", input_ids.shape)
print("Output shape:", output.shape)

Input shape: torch.Size([2, 10])
Output shape: torch.Size([2, 10, 100])
