In [None]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    A ``(1, max_len, d_model)`` table is built once in the constructor, with
    sine values in the even feature columns and cosine values in the odd
    ones. ``forward`` adds the first ``x.size(1)`` rows of that table to the
    input, so dimension 1 of the input is treated as the position axis.

    Args:
        d_model: Size of the feature (last) dimension of the inputs. Odd
            values are supported.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # one column per even feature index
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so the cosine half is trimmed to fit.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Non-persistent buffer: it moves with the module on .to()/.cuda()
        # but stays out of state_dict, so existing checkpoints still load.
        self.register_buffer('pe', pe.unsqueeze(0), persistent=False)  # Shape: (1, max_len, d_model)

    def forward(self, x):
        """Return ``x`` with the encodings for its first ``x.size(1)`` positions added."""
        # .to() is a no-op when the module already lives on x's device; it is
        # kept so the layer still works if only the input was moved.
        return x + self.pe[:, :x.size(1)].to(x.device)


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention in which a position cannot see later positions.

    Wraps ``nn.MultiheadAttention`` configured with ``batch_first=True``, so
    inputs are laid out as ``(batch, seq_len, d_model)``.

    Args:
        d_model: Embedding size handed to ``nn.MultiheadAttention``.
        nhead: Number of attention heads.
    """

    def __init__(self, d_model, nhead):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, nhead, batch_first=True)

    def forward(self, x):
        """Run masked self-attention over ``x`` and return only the attention output.

        The attention weights returned by the wrapped layer are discarded.
        """
        seq_len = x.size(1)
        # Boolean mask built directly on the input's device: True marks a
        # (query, key) pair that must NOT attend, i.e. every key strictly to
        # the right of the query. A boolean mask is converted by PyTorch to
        # the query dtype, so the layer is not tied to float32 inputs.
        mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )
        return self.attn(x, x, x, attn_mask=mask)[0]


class TransformerDecoderBlock(nn.Module):
    """One post-norm decoder block: causal self-attention followed by a feed-forward net.

    Each sub-layer output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalised.

    Args:
        d_model: Feature size of the inputs and outputs.
        nhead: Number of attention heads.
        dim_feedforward: Hidden size of the feed-forward network.
        dropout: Dropout probability applied to both sub-layer outputs.
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.self_attn = CausalSelfAttention(d_model, nhead)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # nn.Dropout holds no state, so one instance serves both sub-layers.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers to ``x`` and return the result."""
        attn_out = self.self_attn(x)
        # Dropout on the attention branch as well, so both residual branches
        # are regularised the same way (no effect in eval mode).
        x = self.norm1(x + self.dropout(attn_out))  # Residual connection
        ff_out = self.linear2(torch.relu(self.linear1(x)))
        x = self.norm2(x + self.dropout(ff_out))  # Another residual connection
        return x


class GPT(nn.Module):
    """Decoder-only language model.

    Pipeline: token embedding -> positional encoding -> ``num_layers``
    decoder blocks -> linear projection to ``vocab_size`` scores per position.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Width of the embeddings and of every decoder block.
        nhead: Number of attention heads in each block.
        num_layers: How many decoder blocks to stack.
        max_len: Number of positions the positional encoding precomputes.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers, max_len=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)
        self.decoder_layers = nn.ModuleList()
        for _ in range(num_layers):
            self.decoder_layers.append(TransformerDecoderBlock(d_model, nhead))
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map a tensor of token ids to per-position scores over the vocabulary."""
        # Token embeddings with position information added on top.
        hidden = self.positional_encoding(self.embedding(x))
        # Run the stack of transformer blocks in order.
        for block in self.decoder_layers:
            hidden = block(hidden)
        # Project back to vocabulary size to predict the next token.
        logits = self.fc_out(hidden)
        return logits

# Model parameters
vocab_size = 30000  # Urdu vocabulary size
d_model = 512
nhead = 8
num_layers = 6

# Instantiate model
model = GPT(vocab_size, d_model, nhead, num_layers)

# Dummy input. The attention layers are created with batch_first=True and the
# positional encoding slices on dimension 1, so token ids must be laid out as
# (batch_size, seq_length), not (seq_length, batch_size).
dummy_input = torch.randint(0, vocab_size, (32, 10))  # (batch_size, seq_length)
output = model(dummy_input)

print("Output shape:", output.shape)  # (batch_size, seq_length, vocab_size)


In [None]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    The constructor precomputes a ``(1, max_len, d_model)`` table (sine in
    even feature columns, cosine in odd ones) and registers it as the buffer
    ``pe``. ``forward`` adds the first ``x.size(1)`` rows to the input, so
    dimension 1 of the input is the position axis.

    Args:
        d_model: Size of the feature (last) dimension of the inputs. Odd
            values are supported.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # one column per even feature index
        pe[:, 0::2] = torch.sin(angles)
        # An odd d_model has one fewer odd column than even column, so the
        # cosine half is trimmed to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Register buffer instead of assigning as attribute
        self.register_buffer('pe', pe.unsqueeze(0))  # Shape: (1, max_len, d_model)

    def forward(self, x):
        """Return ``x`` with the encodings for its first ``x.size(1)`` positions added."""
        return x + self.pe[:, :x.size(1)]  # No need for .to(x.device) with register_buffer


class CausalSelfAttention(nn.Module):
    """Self-attention layer restricted to the current and earlier positions.

    Thin wrapper around ``nn.MultiheadAttention`` created with
    ``batch_first=True``; inputs are ``(batch, seq_len, d_model)``.

    Args:
        d_model: Embedding size handed to ``nn.MultiheadAttention``.
        nhead: Number of attention heads.
    """

    def __init__(self, d_model, nhead):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, nhead, batch_first=True)

    def forward(self, x):
        """Return the masked self-attention output for ``x``; attention weights are dropped."""
        seq_len = x.size(1)
        # Causal mask: True on the strict upper triangle, i.e. on every key
        # that lies after the query and must be hidden from it. Allocated on
        # the input's device to avoid a host-to-device copy per call, and
        # boolean so it works with any floating-point input dtype.
        future = torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device)
        mask = torch.triu(future, diagonal=1)
        # Apply attention with mask
        return self.attn(x, x, x, attn_mask=mask)[0]


class TransformerDecoderBlock(nn.Module):
    """Post-norm decoder block: causal self-attention, then a two-layer feed-forward net.

    Args:
        d_model: Feature size of the inputs and outputs.
        nhead: Number of attention heads.
        dim_feedforward: Hidden size of the feed-forward network.
        dropout: Probability used by both dropout layers.
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        # Attention sub-layer
        self.self_attn = CausalSelfAttention(d_model, nhead)
        # Feed-forward sub-layer: expand to dim_feedforward, project back
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        # One LayerNorm after each residual sum
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # dropout1 acts on the attention output, dropout2 inside the feed-forward net
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Apply both sub-layers to ``x`` and return a tensor of the same shape."""
        # Attention branch, residual sum, then normalisation.
        attended = self.self_attn(x)
        x = self.norm1(x + self.dropout1(attended))

        # Feed-forward branch: linear -> activation -> dropout -> linear.
        hidden = self.linear1(x)
        hidden = self.activation(hidden)
        hidden = self.dropout2(hidden)
        projected = self.linear2(hidden)

        # Second residual sum and normalisation.
        return self.norm2(x + projected)


class GPT(nn.Module):
    """Decoder-only language model producing next-token logits.

    Pipeline: token embedding scaled by ``sqrt(d_model)`` -> positional
    encoding -> ``num_layers`` decoder blocks -> linear projection to
    ``vocab_size`` logits per position.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Width of the embeddings and of every decoder block.
        nhead: Number of attention heads in each block.
        num_layers: How many decoder blocks to stack.
        max_len: Number of positions the positional encoding precomputes.
        dim_feedforward: Hidden size of each block's feed-forward network.
        dropout: Dropout probability used inside each block.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers, max_len=512,
                 dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)
        # The block hyper-parameters are forwarded so they can be tuned from
        # here; the defaults match the block's own defaults.
        self.decoder_layers = nn.ModuleList([
            TransformerDecoderBlock(
                d_model, nhead, dim_feedforward=dim_feedforward, dropout=dropout
            )
            for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, x):
        """Map token ids of shape ``(batch_size, seq_len)`` to logits over the vocabulary."""
        # x shape: (batch_size, seq_len)
        x = self.embedding(x) * math.sqrt(self.d_model)  # Scale embeddings
        x = self.positional_encoding(x)
        for layer in self.decoder_layers:
            x = layer(x)
        return self.fc_out(x)  # Returns logits for next token prediction
    
    
    
# Hyperparameters for the demo model
vocab_size = 30000  # Urdu vocabulary size
d_model = 512
nhead = 8
num_layers = 6

# Build the model
model = GPT(
    vocab_size=vocab_size,
    d_model=d_model,
    nhead=nhead,
    num_layers=num_layers,
)

# Random token ids laid out as (batch_size, seq_len) = (32, 10)
dummy_input = torch.randint(low=0, high=vocab_size, size=(32, 10))
output = model(dummy_input)

print("Output shape:", output.shape)  # Expected: (32, 10, 30000)