### Transformer Built with the PyTorch Library

In [None]:
import torch
import torch.nn as nn
import math

class UrduTransformer(nn.Module):
    """Encoder-only Transformer built on ``nn.TransformerEncoder``.

    Token ids are embedded, scaled by ``sqrt(d_model)``, summed with fixed
    sinusoidal positional encodings, run through a stack of batch-first
    encoder layers and projected to one logit per vocabulary entry.

    Args:
        vocab_size: Size of the embedding table and of the output logits.
        d_model: Width of the embeddings and of every encoder layer.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each layer's feed-forward network.
        dropout: Dropout rate passed to every encoder layer.
        max_len: Number of positions for which encodings are precomputed.
            Inputs longer than this cannot be encoded.
    """

    def __init__(self, vocab_size, d_model=512, nhead=8, num_layers=6,
                 dim_feedforward=2048, dropout=0.1, max_len=5000):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        # NOTE: despite its name this attribute holds the encoder stack, not
        # the positional encoding. The name is kept so that existing
        # state_dict keys keep loading.
        self.pos_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True
            ),
            num_layers=num_layers
        )
        self.fc_out = nn.Linear(d_model, vocab_size)

        # Fixed sinusoidal positional encoding table of shape (max_len, d_model).
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim the frequencies to fit the slice being filled.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Registered as a buffer so it follows the module across devices and
        # is saved in the state_dict; the leading axis broadcasts over batch.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x, src_mask=None, src_key_padding_mask=None):
        """Return vocabulary logits for a batch of token ids.

        Args:
            x: Integer token ids of shape (batch, seq).
            src_mask: Optional attention mask forwarded to the encoder as
                ``mask``.
            src_key_padding_mask: Optional padding mask forwarded to the
                encoder unchanged.

        Returns:
            Tensor of shape (batch, seq, vocab_size).
        """
        # Scale embeddings before adding positions, then slice the table to
        # the actual sequence length.
        x = self.embedding(x) * math.sqrt(self.d_model)
        x = x + self.pe[:, :x.size(1)]

        x = self.pos_encoder(x, mask=src_mask, src_key_padding_mask=src_key_padding_mask)

        return self.fc_out(x)

# Instantiate the nn.TransformerEncoder-based model with its default sizes
vocab_size = 30000  # Urdu vocabulary size
model = UrduTransformer(vocab_size=vocab_size)

# Smoke test: random token ids laid out as (batch_size, seq_length)
dummy_input = torch.randint(low=0, high=vocab_size, size=(32, 10))
output = model(dummy_input)
print("Output shape:", output.shape)  # Expected: (32, 10, 30000)

### Transformer from Scratch

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# Positional Encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch-first tensor.

    The table is indexed by ``x.size(1)``, so dimension 1 of the input is
    treated as the sequence position and the encoding is broadcast over
    dimension 0.

    Args:
        d_model: Size of the last (feature) dimension of the inputs.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim the frequencies to fit the slice being filled.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # A buffer follows the module through .to()/.cuda()/.double().
        # persistent=False keeps it out of the state_dict, so checkpoints
        # saved before this attribute became a buffer still load.
        self.register_buffer('pe', pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions."""
        # .to() is a no-op when the buffer already lives on x's device; it is
        # kept for callers that never moved this module explicitly.
        return x + self.pe[:, :x.size(1)].to(x.device)

# Transformer Encoder Block
class TransformerEncoderLayer(nn.Module):
    """Post-norm encoder block: self-attention, then a ReLU feed-forward net.

    Each sub-layer is wrapped in a residual connection followed by layer
    normalisation. Dropout is applied to the feed-forward output only.
    The attention module is built with its default layout, so inputs are
    expected as (seq, batch, d_model).

    Args:
        d_model: Feature width of the inputs and outputs.
        nhead: Number of attention heads.
        dim_feedforward: Hidden width of the feed-forward network.
        dropout: Dropout rate applied to the feed-forward output.
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        """Apply the block to ``src`` and return a tensor of the same shape.

        Args:
            src: Input of shape (seq, batch, d_model).
            src_mask: Optional attention mask passed to the attention module
                as ``attn_mask``.
            src_key_padding_mask: Optional (batch, seq) mask passed to the
                attention module as ``key_padding_mask``.
        """
        # Index 0 selects the attention output; the weights are discarded.
        attn_out = self.self_attn(
            src, src, src,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask,
        )[0]
        src = self.norm1(src + attn_out)
        ff_out = self.linear2(F.relu(self.linear1(src)))
        src = self.norm2(src + self.dropout(ff_out))
        return src

# Full Transformer Model
class CustomTransformer(nn.Module):
    """Encoder-only Transformer assembled from this file's own building blocks.

    Token ids laid out as (seq_length, batch_size) are embedded, given
    sinusoidal positional encodings, passed through ``num_layers`` encoder
    layers and projected to one logit per vocabulary entry.

    Args:
        vocab_size: Size of the embedding table and of the output logits.
        d_model: Width of the embeddings and of every encoder layer.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        self.encoder_layers = nn.ModuleList([TransformerEncoderLayer(d_model, nhead) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return logits of shape (seq_length, batch_size, vocab_size).

        Args:
            x: Integer token ids of shape (seq_length, batch_size).
        """
        x = self.embedding(x)
        # PositionalEncoding slices its table by x.size(1), i.e. it treats
        # dim 1 as the position axis, while this model is sequence-first.
        # Swap the axes around the call so encodings follow the sequence
        # position instead of the batch index.
        x = self.positional_encoding(x.transpose(0, 1)).transpose(0, 1)
        for layer in self.encoder_layers:
            x = layer(x)
        return self.fc_out(x)

# Hyperparameters (vocab_size is the Urdu vocabulary size)
vocab_size, d_model, nhead, num_layers = 30000, 512, 8, 6

# Build the from-scratch model
model = CustomTransformer(
    vocab_size=vocab_size,
    d_model=d_model,
    nhead=nhead,
    num_layers=num_layers,
)

# Smoke test: random token ids laid out as (seq_length, batch_size)
dummy_input = torch.randint(low=0, high=vocab_size, size=(10, 32))
output = model(dummy_input)

print("Output shape:", output.shape)  # Expected: (seq_length, batch_size, vocab_size)
