In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [9]:
class DummySequenceDataset(Dataset):
    """Synthetic next-token dataset made of runs of consecutive integers.

    Every stored sample is a 1-D integer tensor holding ``seq_len + 1``
    consecutive token ids that begins at a random offset. Indexing returns the
    first ``seq_len`` ids as the input and the same run shifted by one position
    as the target, so the target at each position is the input id plus one.

    Args:
        num_sequences: Number of runs to generate.
        seq_len: Length of each returned input/target sequence.
        vocab_size: Number of distinct token ids. Must be greater than
            ``seq_len`` so that a full run fits inside ``[0, vocab_size)``.

    Raises:
        ValueError: If ``vocab_size <= seq_len`` (no valid start offset exists).
    """

    def __init__(self, num_sequences=1000, seq_len=5, vocab_size=20):
        # Fail early with a readable message; torch.randint would otherwise
        # reject the empty range [0, vocab_size - seq_len) with an opaque error.
        if vocab_size <= seq_len:
            raise ValueError(
                f"vocab_size ({vocab_size}) must be greater than seq_len ({seq_len})."
            )

        self.vocab_size = vocab_size
        self.data = []
        for _ in range(num_sequences):
            # start <= vocab_size - seq_len - 1, so the largest id produced is
            # vocab_size - 1 and no wrap-around is needed.
            start = torch.randint(0, vocab_size - seq_len, (1,)).item()
            self.data.append(torch.arange(start, start + seq_len + 1))

    def __len__(self):
        """Return the number of generated runs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input, target)`` where target is input shifted by one position."""
        full_seq = self.data[idx]
        return full_seq[:-1], full_seq[1:]  # Input sequence, Target sequence

In [10]:
# Encodes the position of variable-length inputs of up to max_len tokens
class LearnablePositionalEncoding(nn.Module):
    """Trainable position table that is added element-wise to its input.

    Args:
        max_len: Largest sequence length the table covers.
        d_model: Width of each position vector.
    """

    def __init__(self, max_len: int, d_model: int):
        super().__init__()
        self.max_len = max_len
        # One trainable row per position, initialised from a standard normal.
        initial_table = torch.randn(self.max_len, d_model)
        self.pos_embedding = nn.Parameter(initial_table)

    def forward(self, x):
        """Add the first ``seq_len`` position vectors to ``x``.

        Args:
            x: Tensor laid out as (batch_size, seq_len, d_model).

        Returns:
            ``x`` plus the position rows, broadcast over the batch dimension.

        Raises:
            ValueError: If the sequence length of ``x`` exceeds ``max_len``.
        """
        length = x.shape[1]
        if length <= self.max_len:
            positions = self.pos_embedding[:length]  # (seq_len, d_model)
            # A leading singleton axis lets the table broadcast across the batch.
            return x + positions[None]
        raise ValueError(f"Input sequence length {length}"
                         f" exceeds maximum positional encoding length {self.max_len}.")

In [11]:
# Example transformer using the created positional encoding
class SimpleTransformerModel(nn.Module):
    """Token embedding, learnable positions, one encoder layer, vocab projection.

    Args:
        vocab_size: Size of the embedding table and of the output logits.
        seq_len: Maximum sequence length handed to the positional encoding.
        embed_dim: Embedding / model width.
    """

    def __init__(self, vocab_size, seq_len, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_encoding = LearnablePositionalEncoding(seq_len, embed_dim)

        # NOTE(review): nn.TransformerEncoder works on its own copy of the layer
        # it is given, so `transformer_block` stays registered (and appears in
        # parameters()/state_dict) without being used in forward(). It is kept
        # so attribute and checkpoint names do not change.
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=2)
        self.transformer_block = encoder_layer
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=1)
        self.fc_out = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Map token ids (batch_size, seq_len) to logits (batch_size, seq_len, vocab_size)."""
        embedded = self.pos_encoding(self.embedding(x))  # (batch_size, seq_len, embed_dim)
        # The encoder is built with its default layout, which is sequence-first:
        # swap to (seq_len, batch, embed_dim) going in and swap back coming out.
        encoded = self.transformer(embedded.transpose(0, 1))
        return self.fc_out(encoded.transpose(0, 1))

def train_model(vocab_size=20, seq_len=5, embed_dim=32, batch_size=32, epochs=10,
                lr=1e-3, seed=None):
    """Train SimpleTransformerModel on DummySequenceDataset and print per-epoch loss.

    Called with no arguments this uses the original hard-coded hyperparameters.

    Args:
        vocab_size: Number of token ids; shared by the dataset and the model.
        seq_len: Sequence length; shared by the dataset and the model so the
            positional encoding always covers the generated inputs.
        embed_dim: Embedding / model width.
        batch_size: Mini-batch size for the DataLoader.
        epochs: Number of passes over the dataset.
        lr: Adam learning rate.
        seed: If given, passed to ``torch.manual_seed`` before any data or
            weights are created so a run can be reproduced. ``None`` leaves the
            global RNG untouched.

    Returns:
        None. One line per epoch is printed containing the sum of the
        per-batch losses for that epoch.
    """
    if seed is not None:
        torch.manual_seed(seed)

    # Pass seq_len explicitly: previously the dataset silently relied on its own
    # default, which only matched the model by coincidence.
    dataset = DummySequenceDataset(seq_len=seq_len, vocab_size=vocab_size)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model = SimpleTransformerModel(vocab_size, seq_len, embed_dim)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    model.train()  # explicit; a freshly built module is already in training mode
    for epoch in range(epochs):
        total_loss = 0.0
        for x, y in dataloader:
            logits = model(x)  # (batch_size, seq_len, vocab_size)
            # Flatten batch and time so every position is one classification
            # example; reshape works whatever the memory layout of the logits.
            loss = criterion(logits.reshape(-1, vocab_size), y.reshape(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")

# Run the training loop defined above; it prints one loss line per epoch.
train_model()



Epoch 1 | Loss: 40.7638
Epoch 2 | Loss: 12.3947
Epoch 3 | Loss: 6.6316
Epoch 4 | Loss: 4.0531
Epoch 5 | Loss: 2.7611
Epoch 6 | Loss: 1.9551
Epoch 7 | Loss: 1.5061
Epoch 8 | Loss: 1.1706
Epoch 9 | Loss: 0.9389
Epoch 10 | Loss: 0.7955
