# Seq2Seq Neural Machine Translation

This notebook implements an LSTM-based sequence-to-sequence (Seq2Seq) model that translates tokenized English sentences into Italian. The implementation is organized into modular sections for clarity and maintainability.

## Table of Contents
1. Imports and Setup
2. Data Handling
3. Model Architecture
4. Training Utilities
5. Translation Utilities
6. Training and Evaluation

## 1. Imports and Setup

In [17]:
from collections import Counter
from pathlib import Path
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import polars as pl
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

# Select the compute device: CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


## 2. Data Handling

### 2.1 Vocabulary Builder

In [18]:
def build_vocab(token_list: List[str], min_freq: int = 1) -> Tuple[Dict[str, int], int]:
    """Build a token-to-index vocabulary from a flat list of tokens.

    The special tokens ``<pad>``, ``<sos>``, ``<eos>`` and ``<unk>`` always
    occupy indices 0-3, in that order. Every other token that occurs at least
    ``min_freq`` times is appended in first-seen order, so the resulting
    indices are contiguous from 0 to ``size - 1``.

    Args:
        token_list: Flat list of tokens (all sentences concatenated).
        min_freq: Minimum number of occurrences a token needs to be kept.

    Returns:
        A ``(token_to_idx, size)`` tuple, where ``size == len(token_to_idx)``.
    """
    special_tokens = ["<pad>", "<sos>", "<eos>", "<unk>"]
    token_to_idx = {token: idx for idx, token in enumerate(special_tokens)}

    # Counter keeps first-seen order, which makes index assignment deterministic.
    counter = Counter(token_list)
    for token, count in counter.items():
        # A corpus token that spells a special token must not override its
        # reserved index: that would leave a gap in the index range and make
        # the largest index exceed the reported vocabulary size.
        if count >= min_freq and token not in token_to_idx:
            token_to_idx[token] = len(token_to_idx)

    return token_to_idx, len(token_to_idx)

### 2.2 Dataset Class

In [19]:
class TranslationDataset(Dataset):
    """Dataset of (source, target) token sequences encoded as index tensors.

    Args:
        dataframe: Table-like object where ``dataframe[col].to_list()`` yields
            one list of tokens per sentence.
        src_vocab: Token-to-index mapping for the source language.
        tgt_vocab: Token-to-index mapping for the target language.
        src_col: Name of the source-language column.
        tgt_col: Name of the target-language column.

    Target sentences are expected to be stored without ``<sos>``/``<eos>``
    markers; they are added in ``__getitem__``.
    """

    def __init__(self, dataframe, src_vocab, tgt_vocab, src_col="en", tgt_col="it"):
        self.src_sentences = dataframe[src_col].to_list()
        self.tgt_sentences = dataframe[tgt_col].to_list()
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.src_sentences)

    @staticmethod
    def _encode(tokens, vocab):
        """Map tokens to indices, sending unknown tokens to ``<unk>``."""
        # Fall back to 0 only for vocabularies that do not define "<unk>".
        unk_idx = vocab.get("<unk>", 0)
        return [vocab.get(token, unk_idx) for token in tokens]

    def __getitem__(self, idx):
        """Return ``{"src": LongTensor, "tgt": LongTensor}`` for sentence pair ``idx``."""
        src_indices = self._encode(self.src_sentences[idx], self.src_vocab)
        tgt_indices = self._encode(self.tgt_sentences[idx], self.tgt_vocab)

        # Training shifts the target by one position (input tgt[:-1], label tgt[1:])
        # and inference starts from <sos> and stops at <eos>, so the stored target
        # must carry both markers for the decoder to ever learn them.
        if "<sos>" in self.tgt_vocab:
            tgt_indices.insert(0, self.tgt_vocab["<sos>"])
        if "<eos>" in self.tgt_vocab:
            tgt_indices.append(self.tgt_vocab["<eos>"])

        return {
            "src": torch.tensor(src_indices, dtype=torch.long),
            "tgt": torch.tensor(tgt_indices, dtype=torch.long)
        }

### 2.3 Data Loading Utilities

In [20]:
def collate_fn(batch):
    """Pad the variable-length "src" and "tgt" tensors of a batch to a common length.

    Each field is right-padded with 0 up to the longest sequence of that field
    in the batch and stacked batch-first.
    """
    padded = {}
    for field in ("src", "tgt"):
        sequences = [sample[field] for sample in batch]
        padded[field] = rnn_utils.pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded

def create_dataloaders(train_dataset, val_dataset, batch_size, shuffle=True):
    """Wrap the train and validation datasets in DataLoaders.

    Both loaders share the batch size and the padding collate function. Only
    the training loader honours ``shuffle``; the validation loader never shuffles.

    Returns:
        A ``(train_loader, val_loader)`` tuple.
    """
    shared_kwargs = dict(batch_size=batch_size, collate_fn=collate_fn)
    return (
        DataLoader(train_dataset, shuffle=shuffle, **shared_kwargs),
        DataLoader(val_dataset, shuffle=False, **shared_kwargs),
    )

## 3. Model Architecture

In [21]:
class Seq2Seq(nn.Module):
    """LSTM sequence-to-sequence model with teacher-forced decoding.

    The source sequence is embedded and run through the LSTM; the resulting
    (hidden, cell) state initialises a second pass of the *same* LSTM over the
    embedded target sequence. A linear layer projects every decoder output to
    target-vocabulary logits.

    Args:
        input_dim: Size of the source vocabulary.
        output_dim: Size of the target vocabulary.
        emb_dim: Embedding dimension (shared by source and target embeddings).
        hid_dim: LSTM hidden size.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and, when
            ``n_layers > 1``, between LSTM layers.
    """

    def __init__(self, input_dim: int, output_dim: int, emb_dim: int, 
                 hid_dim: int, n_layers: int, dropout: float):
        super().__init__()

        # Index 0 is the padding index used when batching, so keep its embedding
        # fixed at zero and exclude it from gradient updates.
        self.encoder = nn.Embedding(input_dim, emb_dim, padding_idx=0)
        self.decoder = nn.Embedding(output_dim, emb_dim, padding_idx=0)
        # nn.LSTM only applies dropout *between* layers; passing a non-zero value
        # with a single layer has no effect and triggers a UserWarning.
        rnn_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=rnn_dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, tgt):
        """Return logits of shape (batch, tgt_len, output_dim).

        Args:
            src: Batch-first tensor of source token indices.
            tgt: Batch-first tensor of target token indices fed to the decoder.
        """
        # Encode source: only the final (hidden, cell) state is kept.
        src_emb = self.dropout(self.encoder(src))
        _, (hidden, cell) = self.rnn(src_emb)

        # Decode target, starting from the encoder's final state.
        tgt_emb = self.dropout(self.decoder(tgt))
        outputs, _ = self.rnn(tgt_emb, (hidden, cell))

        predictions = self.fc_out(outputs)
        return predictions

## 4. Training Utilities

In [22]:
def train_model(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    epochs: int,
    learning_rate: float,
    device: torch.device
) -> Tuple[List[float], List[float]]:
    """Train the model with teacher forcing and return the loss history.

    Args:
        model: Model called as ``model(src, tgt_input)`` that returns logits
            whose last dimension is the target vocabulary size.
        train_loader: Yields dicts with batch-first "src" and "tgt" index tensors.
        val_loader: Same format as ``train_loader``; evaluated after each epoch.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate for the Adam optimizer.
        device: Device the model and every batch are moved to.

    Returns:
        ``(train_losses, val_losses)``: one mean per-batch loss per epoch each.
    """
    # Batches are padded with index 0, so padded target positions must not
    # contribute to the loss or the model is rewarded for predicting padding.
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    model.to(device)
    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        # validate_model switches to eval mode, so re-enable training mode each epoch.
        model.train()
        train_loss = 0.0

        for batch in train_loader:
            src = batch['src'].to(device)
            tgt = batch['tgt'].to(device)

            optimizer.zero_grad()
            # Teacher forcing: the decoder reads tgt[:-1] and must predict tgt[1:].
            logits = model(src, tgt[:, :-1])
            flat_logits = logits.reshape(-1, logits.shape[-1])
            flat_targets = tgt[:, 1:].reshape(-1)

            loss = criterion(flat_logits, flat_targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        val_loss = validate_model(model, val_loader, criterion, device)

        train_losses.append(train_loss / len(train_loader))
        val_losses.append(val_loss)

        print(f"Epoch: {epoch + 1}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}")

    return train_losses, val_losses

def validate_model(
    model: nn.Module,
    val_loader: DataLoader,
    criterion: nn.Module,
    device: torch.device
) -> float:
    """Compute the mean per-batch loss of the model over the validation loader.

    The model is put in eval mode and run without gradient tracking. For each
    batch the decoder is fed ``tgt[:, :-1]`` and scored against ``tgt[:, 1:]``.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in val_loader:
            src, tgt = (batch[key].to(device) for key in ("src", "tgt"))
            decoder_input, expected = tgt[:, :-1], tgt[:, 1:]

            logits = model(src, decoder_input)
            flat_logits = logits.reshape(-1, logits.shape[-1])
            batch_losses.append(criterion(flat_logits, expected.reshape(-1)).item())

    return sum(batch_losses) / len(val_loader)

def plot_training_history(train_losses: List[float], val_losses: List[float]):
    """Draw the per-epoch training and validation loss curves on one figure."""
    plt.figure(figsize=(10, 6))

    curves = (
        (train_losses, "Train Loss"),
        (val_losses, "Validation Loss"),
    )
    for series, label in curves:
        plt.plot(series, label=label)

    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training History")
    plt.legend()
    plt.grid(True)
    plt.show()

## 5. Translation Utilities

In [23]:
def translate_sentence(
    model: nn.Module,
    sentence: list,
    src_vocab: dict,
    tgt_vocab: dict,
    device: torch.device,
    max_len: int = 50
) -> list:
    """Greedily translate a tokenized sentence with the trained model.

    Args:
        model: Model exposing ``encoder`` and ``decoder`` embeddings, a shared
            batch-first ``rnn`` and an ``fc_out`` projection to target logits.
        sentence: List of source-language tokens.
        src_vocab: Source token-to-index mapping; must contain ``<unk>``.
        tgt_vocab: Target token-to-index mapping; must contain ``<sos>`` and ``<eos>``.
        device: Device on which the input tensors are created.
        max_len: Maximum number of tokens to generate.

    Returns:
        The generated target tokens, without ``<sos>``/``<eos>``. An empty
        input sentence yields an empty list.
    """
    model.eval()

    # Nothing to encode: avoid feeding the LSTM a zero-length sequence.
    if not sentence:
        return []

    # Convert tokens to indices
    src_unk = src_vocab['<unk>']
    src_indices = [src_vocab.get(token, src_unk) for token in sentence]
    src_tensor = torch.tensor(src_indices, dtype=torch.long, device=device).unsqueeze(0)

    eos_idx = tgt_vocab['<eos>']
    prev_token = tgt_vocab['<sos>']
    generated = []

    with torch.no_grad():
        _, (hidden, cell) = model.rnn(model.encoder(src_tensor))

        for _ in range(max_len):
            # Feed only the most recent token: (hidden, cell) already summarises
            # the prefix, so re-feeding earlier tokens would process them twice.
            step_input = torch.tensor([[prev_token]], dtype=torch.long, device=device)
            step_output, (hidden, cell) = model.rnn(model.decoder(step_input), (hidden, cell))

            # Project the LSTM output to vocabulary logits before the argmax;
            # an argmax over the raw hidden units is not a token index.
            logits = model.fc_out(step_output[:, -1, :])
            prev_token = logits.argmax(dim=-1).item()

            if prev_token == eos_idx:
                break
            # <eos> is never stored, so nothing has to be sliced off afterwards
            # and no real token is lost when max_len is reached first.
            generated.append(prev_token)

    # Convert indices back to tokens
    idx_to_token = {idx: token for token, idx in tgt_vocab.items()}
    return [idx_to_token.get(idx, '<unk>') for idx in generated]

## 6. Training and Evaluation

In [25]:
# Configuration
BATCH_SIZE = 32
EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
DROPOUT = 0.5
LEARNING_RATE = 0.001
EPOCHS = 10
SEED = 42

# Seed PyTorch's random number generator so that weight initialisation,
# DataLoader shuffling and dropout are repeatable across "Restart & Run All".
torch.manual_seed(SEED)

# Load data
input_ready = pl.read_ndjson("../data/output/processed.json")
print("Data loaded successfully")
input_ready.head()

Data loaded successfully


it,en
list[str],list[str]
"[""grazie"", ""amico""]","[""thank"", ""buddy""]"
"[""di il""]","[""say""]"
"[""trifosfare"", ""sodio"", … ""sodio""]","[""sodium"", ""triphosphate"", … ""tripolyphosphate""]"
"[""invero"", ""avidare"", … ""ricchezzo""]","[""surely"", ""ardent"", … ""wealth""]"
"[""allegare""]","[""annex""]"


In [26]:
# Build vocabularies
# Flatten each language column (one token list per sentence) into a single token list.
english_tokens, italian_tokens = (
    [tok for sent in input_ready[lang].to_list() for tok in sent]
    for lang in ("en", "it")
)

# build_vocab returns the mapping together with its size, which doubles as the
# embedding/output dimension of the model for that language.
english_vocab, input_dim = build_vocab(english_tokens)
italian_vocab, output_dim = build_vocab(italian_tokens)

print(f"English Vocabulary Size: {input_dim}")
print(f"Italian Vocabulary Size: {output_dim}")

English Vocabulary Size: 130049
Italian Vocabulary Size: 198442


In [27]:
# Split data and create datasets
# Hold out 20% of the sentence pairs for validation (fixed random_state for a stable split).
train_data, val_data = train_test_split(input_ready, test_size=0.2, random_state=42)

# Both splits share the same English (source) and Italian (target) vocabularies.
train_dataset, val_dataset = (
    TranslationDataset(frame, english_vocab, italian_vocab)
    for frame in (train_data, val_data)
)

train_loader, val_loader = create_dataloaders(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    batch_size=BATCH_SIZE,
)
print("Datasets and dataloaders created successfully")

Datasets and dataloaders created successfully


In [None]:
# Initialize and train model
model = Seq2Seq(
    input_dim=input_dim,
    output_dim=output_dim,
    emb_dim=EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=DROPOUT,
)
train_losses, val_losses = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    device=device,
)

# Plot training history
plot_training_history(train_losses, val_losses)

In [None]:
# Save model
model_path = Path("models/translator.pt")
# torch.save does not create missing parent directories, so make sure
# "models/" exists before writing the weights.
model_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), model_path)
print("Model saved successfully")

In [None]:
# Test translation
# Smoke-test the trained model on a short, already tokenized English sentence.
test_sentence = ["hello", "world"]
translated = translate_sentence(
    model=model,
    sentence=test_sentence,
    src_vocab=english_vocab,
    tgt_vocab=italian_vocab,
    device=device,
)
print(f"Input: {' '.join(test_sentence)}")
print(f"Translation: {' '.join(translated)}")