In [1]:
import torch
print(torch.__version__)

2.8.0+cu126


In [2]:
from datasets import load_dataset

multi30k_dataset = load_dataset("bentrevett/multi30k")

print(multi30k_dataset['train'][0])

{'en': 'Two young, White males are outside near many bushes.', 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}


In [3]:
dataset = load_dataset("bentrevett/multi30k")

# The dataset is a 'DatasetDict' containing train, validation, and test splits.
print("\nDataset loaded successfully!")
print("Dataset structure:")
print(dataset)

# Let's inspect the first example from the training set.
# Each example is a dictionary with 'en' (English) and 'de' (German) keys.
print("\n--- First Training Example ---")
first_example = dataset['train'][0]
print(f"English: {first_example['en']}")
print(f"German:  {first_example['de']}")
print("----------------------------")


Dataset loaded successfully!
Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

--- First Training Example ---
English: Two young, White males are outside near many bushes.
German:  Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.
----------------------------


In [4]:
# step2_tokenize.py

import os
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

def get_text_iterator(dataset, lang):
    """
    Returns an iterator that yields sentences from the dataset for a given language.
    """
    for example in dataset:
        yield example[lang]

def train_tokenizer(dataset, lang, vocab_size=10000):
    """
    Trains a BPE tokenizer from a dataset iterator and saves it.
    """
    # Initialize a tokenizer with a BPE model
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    # Define the trainer with special tokens required for a seq2seq model
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"]
    )

    # Get an iterator for the text data
    text_iterator = get_text_iterator(dataset['train'], lang=lang)

    # Train the tokenizer
    print(f"Training tokenizer for language: '{lang}'...")
    tokenizer.train_from_iterator(text_iterator, trainer=trainer)
    print("Training complete.")

    # Save the tokenizer
    output_path = f"tokenizer_{lang}.json"
    tokenizer.save(output_path)
    print(f"Tokenizer saved to {output_path}")
    return tokenizer


print("--- Starting Tokenization ---")
# Load the dataset from Step 1
dataset = load_dataset("bentrevett/multi30k")

# Train and save the English tokenizer
tokenizer_en = train_tokenizer(dataset, lang='en')

# Train and save the German tokenizer
tokenizer_de = train_tokenizer(dataset, lang='de')

print("\n--- Tokenization Demo ---")
# Example sentence
english_sentence = "A man in a blue shirt is running."
german_sentence = "Ein Mann in einem blauen Hemd rennt."

# Encode the English sentence
encoded_en = tokenizer_en.encode(english_sentence)
print(f"\nOriginal English: {english_sentence}")
print(f"Encoded IDs: {encoded_en.ids}")
print(f"Tokens: {encoded_en.tokens}")

# Encode the German sentence
encoded_de = tokenizer_de.encode(german_sentence)
print(f"\nOriginal German: {german_sentence}")
print(f"Encoded IDs: {encoded_de.ids}")
print(f"Tokens: {encoded_de.tokens}")
print("-------------------------")


--- Starting Tokenization ---
Training tokenizer for language: 'en'...
Training complete.
Tokenizer saved to tokenizer_en.json
Training tokenizer for language: 'de'...
Training complete.
Tokenizer saved to tokenizer_de.json

--- Tokenization Demo ---

Original English: A man in a blue shirt is running.
Encoded IDs: [30, 93, 83, 57, 191, 160, 101, 353, 15]
Tokens: ['A', 'man', 'in', 'a', 'blue', 'shirt', 'is', 'running', '.']

Original German: Ein Mann in einem blauen Hemd rennt.
Encoded IDs: [109, 124, 100, 111, 276, 265, 561, 14]
Tokens: ['Ein', 'Mann', 'in', 'einem', 'blauen', 'Hemd', 'rennt', '.']
-------------------------


In [None]:
import torch
import torch.nn as nn
import math

# This is a standard building block for Transformers.
# It adds information about the position of each token in the sequence,
# as the self-attention mechanism itself doesn't consider order.
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# Helper module that converts token IDs into embeddings.
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# The main model that brings everything together.
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        
        # Using PyTorch's built-in Transformer components
        self.transformer = nn.Transformer(d_model=emb_size,
                                          nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward,
                                          dropout=dropout)
        
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self,
                src,
                trg,
                src_mask,
                tgt_mask,
                src_padding_mask,
                tgt_padding_mask,
                memory_key_padding_mask):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        
        return self.generator(outs)

    def encode(self, src, src_mask):
        return self.transformer.encoder(self.positional_encoding(self.src_tok_emb(src)), src_mask)

    def decode(self, tgt, memory, tgt_mask):
        return self.transformer.decoder(self.positional_encoding(self.tgt_tok_emb(tgt)), memory, tgt_mask)


def generate_square_subsequent_mask(sz, device):
    mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


# --- Main execution block to demonstrate the model ---

# These parameters are chosen based on the project plan's advice to keep the model small. 
SRC_VOCAB_SIZE = 10000  # Placeholder from our tokenizer
TGT_VOCAB_SIZE = 10000  # Placeholder from our tokenizer
EMB_SIZE = 256          # d_model = 256 
NHEAD = 8               # Number of attention heads
FFN_HID_DIM = 512       # Feedforward network hidden dimension
NUM_ENCODER_LAYERS = 3  # 3-4 layers recommended 
NUM_DECODER_LAYERS = 3  # 3-4 layers recommended 

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

# Instantiate the model

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                    NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
                                    FFN_HID_DIM)

# Move model to the appropriate device
transformer = transformer.to(DEVICE)

# Create some dummy input tensors to test the model's forward pass
SRC_LEN = 10  # Length of the source sentence
TGT_LEN = 12  # Length of the target sentence
BATCH_SIZE = 4

src = torch.randint(0, SRC_VOCAB_SIZE, (SRC_LEN, BATCH_SIZE)).to(DEVICE)
tgt = torch.randint(0, TGT_VOCAB_SIZE, (TGT_LEN, BATCH_SIZE)).to(DEVICE)

# Create the necessary masks for the Transformer
tgt_mask = generate_square_subsequent_mask(TGT_LEN, DEVICE)
src_mask = torch.zeros((SRC_LEN, SRC_LEN), device=DEVICE).type(torch.bool)

src_padding_mask = torch.zeros((BATCH_SIZE, SRC_LEN), device=DEVICE).type(torch.bool)
tgt_padding_mask = torch.zeros((BATCH_SIZE, TGT_LEN), device=DEVICE).type(torch.bool)

# Get the model's output
logits = transformer(src, tgt, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

print("\n--- Model Test ---")
print(f"Model instantiated successfully.")
print(f"Shape of dummy source input: {src.shape} (SeqLen, BatchSize)")
print(f"Shape of dummy target input: {tgt.shape} (SeqLen, BatchSize)")
print(f"Shape of model output logits: {logits.shape} (SeqLen, BatchSize, TgtVocabSize)")
print("--------------------")

Using device: cuda





--- Model Test ---
Model instantiated successfully.
Shape of dummy source input: torch.Size([10, 4]) (SeqLen, BatchSize)
Shape of dummy target input: torch.Size([12, 4]) (SeqLen, BatchSize)
Shape of model output logits: torch.Size([12, 4, 10000]) (SeqLen, BatchSize, TgtVocabSize)
--------------------




In [6]:
# step4_training_loop.py

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset
from tokenizers import Tokenizer
from tqdm import tqdm # For a nice progress bar
import math

# --- 1. Hyperparameters and Setup ---
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

# Define special symbols and their indices
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3

# Model Hyperparameters (must match step3_model.py)
SRC_VOCAB_SIZE = 10000
TGT_VOCAB_SIZE = 10000
EMB_SIZE = 256
NHEAD = 8
FFN_HID_DIM = 512
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

# Training Hyperparameters
BATCH_SIZE = 32 # Reduced for easier training on consumer GPUs
NUM_EPOCHS = 5
LEARNING_RATE = 0.0001

# --- 2. Data Loading and Preprocessing ---
# Load tokenizers
tokenizer_en = Tokenizer.from_file("tokenizer_en.json")
tokenizer_de = Tokenizer.from_file("tokenizer_de.json")

# Load dataset
dataset = load_dataset("bentrevett/multi30k")

# This function is crucial for preparing batches of data.
# It tokenizes, adds special tokens, pads sequences to the same length,
# and converts everything to tensors.
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for item in batch:
        src_sentence, tgt_sentence = item['en'], item['de']
        
        # Tokenize and add Start-of-Sentence (SOS) and End-of-Sentence (EOS) tokens
        src_tokens = [SOS_IDX] + tokenizer_en.encode(src_sentence).ids + [EOS_IDX]
        tgt_tokens = [SOS_IDX] + tokenizer_de.encode(tgt_sentence).ids + [EOS_IDX]
        
        src_batch.append(torch.tensor(src_tokens))
        tgt_batch.append(torch.tensor(tgt_tokens))

    # Pad sequences to the length of the longest sequence in the batch
    src_padded = nn.utils.rnn.pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_padded = nn.utils.rnn.pad_sequence(tgt_batch, padding_value=PAD_IDX)
    
    return src_padded, tgt_padded

# Create DataLoaders
train_dataloader = DataLoader(dataset['train'], batch_size=BATCH_SIZE, collate_fn=collate_fn)
val_dataloader = DataLoader(dataset['validation'], batch_size=BATCH_SIZE, collate_fn=collate_fn)

# --- 3. Model, Loss, and Optimizer Initialization ---
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
transformer = transformer.to(DEVICE)

# Initialize weights with a common strategy for Transformers
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

# Loss function: CrossEntropyLoss, ignoring padding tokens
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Optimizer: Adam is a standard choice for Transformers
optimizer = torch.optim.Adam(transformer.parameters(), lr=LEARNING_RATE, betas=(0.9, 0.98), eps=1e-9)

# --- 4. Training and Evaluation Functions ---
def train_epoch(model, optimizer, dataloader):
    model.train()
    losses = 0
    
    for src, tgt in tqdm(dataloader, desc="Training"):
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        # Prepare target data for decoder input (shifted right) and loss calculation
        tgt_input = tgt[:-1, :]
        tgt_out = tgt[1:, :]

        # Create masks
        src_seq_len = src.shape[0]
        tgt_seq_len = tgt_input.shape[0]
        
        tgt_mask = generate_square_subsequent_mask(tgt_seq_len, DEVICE)
        src_mask = torch.zeros((src_seq_len, src_seq_len), device=DEVICE).type(torch.bool)
        
        src_padding_mask = (src == PAD_IDX).transpose(0, 1)
        tgt_padding_mask = (tgt_input == PAD_IDX).transpose(0, 1)

        # Forward pass
        logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

        # Zero gradients, calculate loss, backpropagate, and update weights
        optimizer.zero_grad()
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()
        optimizer.step()
        losses += loss.item()

    return losses / len(list(dataloader))

def evaluate(model, dataloader):
    model.eval()
    losses = 0

    with torch.no_grad():
        for src, tgt in tqdm(dataloader, desc="Validating"):
            src = src.to(DEVICE)
            tgt = tgt.to(DEVICE)

            tgt_input = tgt[:-1, :]
            tgt_out = tgt[1:, :]

            src_seq_len = src.shape[0]
            tgt_seq_len = tgt_input.shape[0]

            tgt_mask = generate_square_subsequent_mask(tgt_seq_len, DEVICE)
            src_mask = torch.zeros((src_seq_len, src_seq_len), device=DEVICE).type(torch.bool)
            
            src_padding_mask = (src == PAD_IDX).transpose(0, 1)
            tgt_padding_mask = (tgt_input == PAD_IDX).transpose(0, 1)

            logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)
            
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            losses += loss.item()

    return losses / len(list(dataloader))


# --- 5. Main Training Loop ---
best_val_loss = float('inf')

for epoch in range(1, NUM_EPOCHS + 1):
    train_loss = train_epoch(transformer, optimizer, train_dataloader)
    val_loss = evaluate(transformer, val_dataloader)

    print(f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}")
    
    # Save the model if it has the best validation loss so far
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(transformer.state_dict(), 'best_model.pth')
        print(f"Saved new best model to 'best_model.pth'")

print("Training complete. Best model saved as 'best_model.pth'")

Using device: cuda


Training: 100%|██████████| 907/907 [00:23<00:00, 38.83it/s]
Validating: 100%|██████████| 32/32 [00:00<00:00, 113.15it/s]


Epoch: 1, Train loss: 5.386, Val loss: 4.471
Saved new best model to 'best_model.pth'


Training: 100%|██████████| 907/907 [00:22<00:00, 39.73it/s]
Validating: 100%|██████████| 32/32 [00:00<00:00, 107.64it/s]


Epoch: 2, Train loss: 4.074, Val loss: 3.836
Saved new best model to 'best_model.pth'


Training: 100%|██████████| 907/907 [00:23<00:00, 38.17it/s]
Validating: 100%|██████████| 32/32 [00:00<00:00, 103.10it/s]


Epoch: 3, Train loss: 3.567, Val loss: 3.441
Saved new best model to 'best_model.pth'


Training: 100%|██████████| 907/907 [00:23<00:00, 38.32it/s]
Validating: 100%|██████████| 32/32 [00:00<00:00, 93.80it/s]


Epoch: 4, Train loss: 3.216, Val loss: 3.167
Saved new best model to 'best_model.pth'


Training: 100%|██████████| 907/907 [00:23<00:00, 39.24it/s]
Validating: 100%|██████████| 32/32 [00:00<00:00, 101.20it/s]


Epoch: 5, Train loss: 2.950, Val loss: 2.944
Saved new best model to 'best_model.pth'
Training complete. Best model saved as 'best_model.pth'
