## Sequence to Sequence Learning with Neural Networks



### Implementation Overview

- This notebook is a personal implementation of the seq2seq architecture introduced by Sutskever et al. (2014), focusing on understanding the core concepts and challenges of neural machine translation.

- The approach addresses the sequence transformation problem of converting German sentences (variable length) into English sentences (different variable length) while maintaining semantic meaning throughout the process.

- The implementation uses an encoder-decoder architecture where the encoder compresses source sentences into fixed-size representations and the decoder generates target sequences from these representations. The key insight involves using LSTM networks for both encoding and decoding phases with teacher forcing during training.

#### Dependencies and Setup

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm
import evaluate


# One fixed seed shared by every random number generator the notebook relies on
# (Python's `random` drives teacher forcing, PyTorch drives weight init and shuffling).
seed = 1234

# PyTorch generators (CPU and CUDA) plus deterministic cuDNN behaviour
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Python's built-in RNG and NumPy's global RNG
random.seed(seed)
np.random.seed(seed)

#### Data Loading

In [None]:
# Multi30k German-English sentence pairs, loaded through `datasets.load_dataset`
dataset = datasets.load_dataset("bentrevett/multi30k")

# Inspect the available splits and the layout of a single raw record
print("Dataset structure:", dataset)
print("\nFirst training example:", dataset["train"][0])

# Bind each split to its own name for the rest of the notebook
train_data, valid_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [None]:
# Load the spaCy pipelines whose tokenizers are used for both languages.
# The models must be installed beforehand; download them one at a time:
#   python -m spacy download en_core_web_sm
#   python -m spacy download de_core_news_sm
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

# Sanity check: tokenize one sample sentence per language and print the pieces
test_en = "Hello, world! How are you?"
test_de = "Hallo, Welt! Wie geht es dir?"

print("English tokens:", [token.text for token in en_nlp.tokenizer(test_en)])
print("German tokens:", [token.text for token in de_nlp.tokenizer(test_de)])

#### Data Preprocessing

In [None]:
def preprocess_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    """
    Tokenize one English/German sentence pair and frame it with boundary tokens.

    Args:
        example: mapping with the raw sentences under the keys "en" and "de".
        en_nlp, de_nlp: objects exposing a ``tokenizer(text)`` callable whose
            items have a ``.text`` attribute (spaCy pipelines in this notebook).
        max_length: maximum number of tokens kept per sentence; the limit is
            applied before the boundary tokens are added.
        lower: when truthy, every kept token is lowercased.
        sos_token, eos_token: strings placed at the start / end of each sequence.

    Returns:
        dict with the keys "en_tokens" and "de_tokens", each a list of strings.
    """
    def framed_tokens(nlp, text):
        # Tokenize, truncate, optionally lowercase, then wrap in <sos>/<eos>
        words = [piece.text for piece in nlp.tokenizer(text)][:max_length]
        if lower:
            words = [word.lower() for word in words]
        return [sos_token, *words, eos_token]

    return {
        "en_tokens": framed_tokens(en_nlp, example["en"]),
        "de_tokens": framed_tokens(de_nlp, example["de"]),
    }

# Preprocessing settings shared by all three splits
max_length = 1000  # Should be sufficient for image captions
lower = True       # Lowercasing keeps the vocabulary smaller
sos_token = "<sos>"
eos_token = "<eos>"

# Keyword arguments forwarded to preprocess_example by `Dataset.map`
preprocessing_args = dict(
    en_nlp=en_nlp,
    de_nlp=de_nlp,
    max_length=max_length,
    lower=lower,
    sos_token=sos_token,
    eos_token=eos_token,
)

# Tokenize train, validation and test in that order
train_data, valid_data, test_data = (
    split.map(preprocess_example, fn_kwargs=preprocessing_args)
    for split in (train_data, valid_data, test_data)
)

# Show one record with the new token columns
print("Preprocessed example:", train_data[0])

In [None]:
# Vocabulary settings: tokens seen fewer than `min_freq` times are left out
min_freq = 2
unk_token = "<unk>"  # Stand-in for tokens missing from the vocabulary
pad_token = "<pad>"  # Filler used to bring sequences in a batch to equal length

# Passed as `specials` to both vocabularies below
special_tokens = [unk_token, pad_token, sos_token, eos_token]

print(f"Building vocabularies with min_freq={min_freq}...")

# Vocabularies are built from the training split only, so validation/test
# tokens never influence them.
en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["en_tokens"], min_freq=min_freq, specials=special_tokens
)
de_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["de_tokens"], min_freq=min_freq, specials=special_tokens
)

# Report sizes and the first ten entries of each vocabulary
print(f"English vocabulary size: {len(en_vocab)}")
print(f"German vocabulary size: {len(de_vocab)}")
print(f"Most common English tokens: {en_vocab.get_itos()[:10]}")
print(f"Most common German tokens: {de_vocab.get_itos()[:10]}")

In [None]:
# Look up the indices of the <unk> and <pad> special tokens (read from the English vocabulary)
unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

# Both vocabularies were built with the same `special_tokens` list, so these
# indices are expected to agree. Fail fast if they do not: a single
# unk_index / pad_index pair is used for both languages from here on.
assert en_vocab[unk_token] == de_vocab[unk_token], "Vocab mismatch!"
assert en_vocab[pad_token] == de_vocab[pad_token], "Vocab mismatch!"

# Make lookups of out-of-vocabulary tokens resolve to unk_index
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

print(f"Unknown token index: {unk_index}")
print(f"Padding token index: {pad_index}")

# Demonstrate the fallback with a word that is presumably absent from the training data
test_token = "supercalifragilisticexpialidocious"
print(f"Unknown token '{test_token}' maps to index: {en_vocab[test_token]}")

In [None]:
def convert_to_indices(example, en_vocab, de_vocab):
    """
    Map the token lists of one example to vocabulary indices.

    Args:
        example: mapping holding token lists under "en_tokens" and "de_tokens".
        en_vocab, de_vocab: vocabularies exposing ``lookup_indices(tokens)``.

    Returns:
        dict with the keys "en_ids" and "de_ids", holding whatever the
        respective ``lookup_indices`` call returns.
    """
    return {
        "en_ids": en_vocab.lookup_indices(example["en_tokens"]),
        "de_ids": de_vocab.lookup_indices(example["de_tokens"]),
    }

# Numericalize every split with the vocabularies built above
vocab_args = {"en_vocab": en_vocab, "de_vocab": de_vocab}

train_data, valid_data, test_data = (
    split.map(convert_to_indices, fn_kwargs=vocab_args)
    for split in (train_data, valid_data, test_data)
)

# Return the id columns as PyTorch tensors; the remaining columns stay
# available because output_all_columns=True.
data_type = "torch"
format_columns = ["en_ids", "de_ids"]

train_data, valid_data, test_data = (
    split.with_format(type=data_type, columns=format_columns, output_all_columns=True)
    for split in (train_data, valid_data, test_data)
)

print("Converted example:", train_data[0])

In [None]:
def create_collate_function(pad_index):
    """
    Build the batching callback used by the DataLoaders.

    Sentences in a batch differ in length, so each id column is padded with
    ``pad_index`` up to the longest sequence in that batch.

    Args:
        pad_index: value written into the padded positions.

    Returns:
        A function taking a list of examples (each with 1-D tensors under
        "en_ids" and "de_ids") and returning a dict with the same two keys.
        Each value is the result of ``pad_sequence`` with its default layout,
        i.e. shaped [longest_sequence, batch_size].
    """
    def collate_batch(batch):
        padded = {}
        for column in ("en_ids", "de_ids"):
            sequences = [example[column] for example in batch]
            padded[column] = nn.utils.rnn.pad_sequence(sequences, padding_value=pad_index)
        return padded

    return collate_batch

def create_dataloader(dataset, batch_size, pad_index, shuffle=False):
    """
    Wrap a dataset in a DataLoader that pads every batch with ``pad_index``.

    Args:
        dataset: dataset whose items carry "en_ids" and "de_ids" tensors.
        batch_size: number of examples per batch.
        pad_index: padding value handed to the collate function.
        shuffle: whether the loader reshuffles the data (default False).

    Returns:
        torch.utils.data.DataLoader using the padding collate function.
    """
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=create_collate_function(pad_index),
    )

# Batch size shared by all loaders: a trade-off between memory use and throughput
batch_size = 128

# Only the training data is shuffled; validation and test rely on the
# default shuffle=False so their order stays fixed.
train_loader = create_dataloader(train_data, batch_size, pad_index, shuffle=True)
valid_loader = create_dataloader(valid_data, batch_size, pad_index)
test_loader = create_dataloader(test_data, batch_size, pad_index)

print(f"Created data loaders with batch size {batch_size}")
print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(valid_loader)}")

#### Model Architecture

In [None]:
class MyEncoder(nn.Module):
    """
    Encoder half of the seq2seq model.

    Reads a batch of source (German) token ids and summarizes it as the final
    hidden and cell states of a multi-layer LSTM; those states are the only
    thing handed on to the decoder.

    Layers:
    - embedding: token index -> dense vector of size ``embedding_dim``
    - dropout:   applied to the embedded tokens
    - lstm:      ``n_layers`` stacked LSTM layers of width ``hidden_dim``
                 (``dropout_rate`` is also passed as the LSTM's own dropout)
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout_rate):
        super().__init__()

        # Exposed so the wrapping model can check encoder/decoder compatibility
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout_rate,
        )
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, source_sequence):
        """
        Encode a batch of source sequences.

        Args:
            source_sequence: token ids shaped [seq_len, batch_size].

        Returns:
            (hidden, cell): the LSTM's final hidden and cell states; the
            per-timestep outputs are discarded.
        """
        # [seq_len, batch_size] -> [seq_len, batch_size, embedding_dim]
        token_vectors = self.dropout(self.embedding(source_sequence))

        # Keep only the final states; they serve as the decoder's context
        _, final_states = self.lstm(token_vectors)
        hidden, cell = final_states
        return hidden, cell

print("Encoder implementation complete")

In [None]:
class MyDecoder(nn.Module):
    """
    Decoder half of the seq2seq model.

    Produces the target (English) sentence one token per call: each call takes
    the previous token plus the current LSTM states and returns scores over the
    target vocabulary together with the updated states.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout_rate):
        super().__init__()

        # Exposed for the wrapping model (output size and compatibility checks)
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout_rate,
        )
        self.output_projection = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input_token, hidden_state, cell_state):
        """
        Run a single decoding step.

        Args:
            input_token: previous token ids, shaped [batch_size].
            hidden_state, cell_state: LSTM states carried over from the
                encoder or from the previous decoding step.

        Returns:
            (prediction, new_hidden, new_cell) where prediction holds
            unnormalized scores shaped [batch_size, vocab_size].
        """
        # The LSTM expects a leading sequence axis: [batch_size] -> [1, batch_size]
        step_input = input_token.unsqueeze(0)

        # [1, batch_size] -> [1, batch_size, embedding_dim]
        embedded = self.dropout(self.embedding(step_input))

        previous_states = (hidden_state, cell_state)
        lstm_output, (new_hidden, new_cell) = self.lstm(embedded, previous_states)

        # Drop the length-1 sequence axis before projecting onto the vocabulary
        step_output = lstm_output.squeeze(0)
        prediction = self.output_projection(step_output)

        return prediction, new_hidden, new_cell

print("Decoder implementation complete")

#### Seq2Seq Model


In [None]:
class MySeq2Seq(nn.Module):
    """
    Full sequence-to-sequence model: an encoder feeding a step-wise decoder.

    During the forward pass the decoder's next input is chosen at random per
    timestep: with probability ``teacher_forcing_ratio`` the ground-truth
    target token is fed in (teacher forcing), otherwise the decoder's own
    highest-scoring prediction is used.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        # The encoder's final states are passed straight into the decoder's
        # LSTM, so width and depth of the two must agree.
        assert encoder.hidden_dim == decoder.hidden_dim, "Hidden dimensions must match!"
        assert encoder.n_layers == decoder.n_layers, "Number of layers must match!"

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source_seq, target_seq, teacher_forcing_ratio):
        """
        Score every target position given the source sequence.

        Args:
            source_seq: source token ids, shaped [src_len, batch_size].
            target_seq: target token ids, shaped [tgt_len, batch_size]; row 0
                is used as the decoder's first input (the <sos> tokens).
            teacher_forcing_ratio: probability, per timestep, of feeding the
                ground-truth token instead of the model's own prediction.

        Returns:
            Tensor shaped [tgt_len, batch_size, vocab_size]. Row 0 is never
            written and stays all zeros.
        """
        target_length, batch_size = target_seq.shape[0], target_seq.shape[1]

        # Output buffer, allocated on the model's configured device
        predictions = torch.zeros(
            target_length, batch_size, self.decoder.vocab_size, device=self.device
        )

        # The encoder's final states become the decoder's initial states
        hidden_state, cell_state = self.encoder(source_seq)

        # Decoding starts from the first target row
        decoder_input = target_seq[0, :]

        for timestep in range(1, target_length):
            prediction, hidden_state, cell_state = self.decoder(
                decoder_input, hidden_state, cell_state
            )
            predictions[timestep] = prediction

            # One random draw per timestep decides what is fed in next
            if random.random() < teacher_forcing_ratio:
                decoder_input = target_seq[timestep]
            else:
                decoder_input = prediction.argmax(dim=1)

        return predictions

print("Complete Seq2Seq model implemented")

#### Model Instantiation and Setup


In [None]:
# Model hyperparameters
input_vocab_size = len(de_vocab)    # Source side: German
output_vocab_size = len(en_vocab)   # Target side: English
embedding_dim = 256                 # Size of the token embeddings
hidden_dim = 512                    # Width of every LSTM layer
n_layers = 2                        # Stacked LSTM layers in encoder and decoder
dropout_rate = 0.5                  # Dropout probability used throughout

# Prefer the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Encoder and decoder share every size except the vocabulary they operate on
encoder = MyEncoder(input_vocab_size, embedding_dim, hidden_dim, n_layers, dropout_rate)
decoder = MyDecoder(output_vocab_size, embedding_dim, hidden_dim, n_layers, dropout_rate)

# Assemble the full model and move it to the chosen device
model = MySeq2Seq(encoder, decoder, device).to(device)

print(f"Model created with {sum(p.numel() for p in model.parameters() if p.requires_grad):,} parameters")

In [None]:
def initialize_weights(model):
    """
    Re-draw every trainable parameter of ``model`` from U(-0.08, 0.08).

    The range follows the initialization described in the original seq2seq
    paper. The function iterates ``model.named_parameters()``, so parameters of
    nested submodules are covered as well; parameters with
    ``requires_grad=False`` are left untouched. One line is printed per
    initialized parameter.
    """
    trainable = (
        (name, param)
        for name, param in model.named_parameters()
        if param.requires_grad
    )
    for name, param in trainable:
        nn.init.uniform_(param.data, -0.08, 0.08)
        print(f"Initialized {name}: {param.shape}")

# Initialize model weights.
# initialize_weights already walks every parameter of the module it receives,
# nested submodules included, so it is called exactly once on the root model.
# Routing it through model.apply(...) would invoke it again for each submodule,
# re-drawing every parameter several times and repeating the log lines.
initialize_weights(model)
print("\nWeight initialization complete")

#### Training Setup


In [None]:
# Training components
# Adam over all model parameters, using the optimizer's default settings
optimizer = optim.Adam(model.parameters())  # Adam usually works well
# Cross-entropy over target tokens; positions whose target equals pad_index are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)  # Ignore padding tokens

def train_one_epoch(model, dataloader, optimizer, criterion, clip_value, teacher_forcing_ratio, device):
    """
    Run one optimization pass over ``dataloader``.

    Args:
        model: callable as ``model(source, target, teacher_forcing_ratio)``,
            returning scores shaped [tgt_len, batch_size, vocab_size].
        dataloader: sized iterable of batches with "de_ids" (source) and
            "en_ids" (target) tensors.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss taking (flattened scores, flattened targets).
        clip_value: maximum gradient norm enforced before each step.
        teacher_forcing_ratio: forwarded unchanged to the model.
        device: device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses (sum of batch losses / number of batches).
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        source = batch["de_ids"].to(device)
        target = batch["en_ids"].to(device)

        optimizer.zero_grad()

        scores = model(source, target, teacher_forcing_ratio)

        # Position 0 is the <sos> slot and is never predicted, so it is dropped
        # from both scores and targets before flattening for the loss.
        flat_scores = scores[1:].reshape(-1, scores.shape[-1])
        flat_targets = target[1:].reshape(-1)
        loss = criterion(flat_scores, flat_targets)

        loss.backward()

        # Bound the gradient norm before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(dataloader)

def evaluate_model(model, dataloader, criterion, device):
    """
    Compute the mean per-batch loss over ``dataloader`` without updating the model.

    The model is switched to eval mode, gradients are disabled, and the
    forward pass always uses ``teacher_forcing_ratio=0.0``, so the decoder is
    fed its own predictions throughout.

    Args:
        model: callable as ``model(source, target, teacher_forcing_ratio=...)``.
        dataloader: sized iterable of batches with "de_ids" and "en_ids" tensors.
        criterion: loss taking (flattened scores, flattened targets).
        device: device the batch tensors are moved to.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            source = batch["de_ids"].to(device)
            target = batch["en_ids"].to(device)

            scores = model(source, target, teacher_forcing_ratio=0.0)

            # Skip position 0 (<sos>) in both scores and targets, then flatten
            flat_scores = scores[1:].reshape(-1, scores.shape[-1])
            flat_targets = target[1:].reshape(-1)

            running_loss += criterion(flat_scores, flat_targets).item()

    return running_loss / len(dataloader)

print("Training functions ready")

#### Training Loop

In [None]:
# Training hyperparameters
n_epochs = 10
gradient_clip = 1.0          # Maximum gradient norm
teacher_forcing_ratio = 0.5  # Ground-truth token fed to the decoder half of the time

# Checkpointing: only the weights with the lowest validation loss are kept
best_valid_loss = float('inf')
model_save_path = 'my_seq2seq_model.pt'

print("Starting training...")

for epoch in range(n_epochs):
    print(f"\nEpoch {epoch+1}/{n_epochs}")

    # One optimization pass over the training data, then a validation pass
    train_loss = train_one_epoch(
        model, train_loader, optimizer, criterion,
        gradient_clip, teacher_forcing_ratio, device,
    )
    valid_loss = evaluate_model(model, valid_loader, criterion, device)

    # Overwrite the checkpoint whenever validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_save_path)
        print(f"New best model saved! Validation loss: {valid_loss:.3f}")

    # Perplexity is the exponential of the mean cross-entropy loss
    train_ppl = np.exp(train_loss)
    valid_ppl = np.exp(valid_loss)

    print(f"Train Loss: {train_loss:.3f} | Train Perplexity: {train_ppl:.3f}")
    print(f"Valid Loss: {valid_loss:.3f} | Valid Perplexity: {valid_ppl:.3f}")

print(f"\nTraining complete! Best validation loss: {best_valid_loss:.3f}")

In [None]:
def translate_sentence(
    sentence, model, de_nlp, en_nlp, de_vocab, en_vocab,
    lower, sos_token, eos_token, device, max_length=25
):
    """
    Greedily translate one German sentence into English tokens.

    Args:
        sentence: a raw string (tokenized with ``de_nlp.tokenizer``) or an
            iterable of already-split tokens.
        model: seq2seq model exposing ``encoder`` and ``decoder``; it is put
            into eval mode.
        de_nlp: tokenizer provider for German, only used for string input.
        en_nlp: accepted for interface symmetry; not used by this function.
        de_vocab, en_vocab: source / target vocabularies.
        lower: when truthy, source tokens are lowercased before lookup.
        sos_token, eos_token: boundary token strings.
        device: device the tensors are created on.
        max_length: maximum number of decoding steps (default 25).

    Returns:
        List of target tokens starting with ``sos_token``. It ends with
        ``eos_token`` only if the model produced it within ``max_length`` steps.
    """
    model.eval()

    with torch.no_grad():
        # Source side: tokenize (if needed), normalize case, add boundary tokens
        if isinstance(sentence, str):
            words = [piece.text for piece in de_nlp.tokenizer(sentence)]
        else:
            words = list(sentence)
        if lower:
            words = [word.lower() for word in words]

        source_ids = de_vocab.lookup_indices([sos_token, *words, eos_token])
        # Column vector: [src_len, 1], i.e. a batch holding this one sentence
        src_tensor = torch.LongTensor(source_ids).unsqueeze(-1).to(device)

        # The encoder's final states seed the decoder
        hidden_state, cell_state = model.encoder(src_tensor)

        generated_indices = [en_vocab[sos_token]]
        eos_index = en_vocab[eos_token]

        # Greedy decoding: always feed back the highest-scoring token
        for _ in range(max_length):
            previous = torch.LongTensor([generated_indices[-1]]).to(device)
            prediction, hidden_state, cell_state = model.decoder(
                previous, hidden_state, cell_state
            )

            best_index = prediction.argmax(-1).item()
            generated_indices.append(best_index)

            if best_index == eos_index:
                break

        return en_vocab.lookup_tokens(generated_indices)

print("Translation function ready")

#### Model Evaluation

In [None]:
# Load best model
# map_location remaps the checkpoint's tensors onto the active device, so a
# checkpoint written during a GPU run still loads on a CPU-only machine.
model.load_state_dict(torch.load(model_save_path, map_location=device))
print("Best model loaded")

# Test on a few examples
test_sentences = [
    "Ein Mann sitzt auf einer Bank.",  # A man sits on a bench
    "Zwei Kinder spielen im Park.",    # Two children play in the park
    "Die Katze schläft auf dem Sofa.", # The cat sleeps on the sofa
]

print("\nTesting translation on sample sentences:")
for german_sentence in test_sentences:
    translation = translate_sentence(
        sentence=german_sentence,
        model=model,
        de_nlp=de_nlp,
        en_nlp=en_nlp,
        de_vocab=de_vocab,
        en_vocab=en_vocab,
        lower=lower,
        sos_token=sos_token,
        eos_token=eos_token,
        device=device
    )

    # Clean up translation (remove special tokens).
    # The leading <sos> is always present. The trailing token is <eos> only if
    # decoding finished before max_length; otherwise it is a real word and
    # must be kept.
    translated_words = translation[1:-1] if translation[-1] == eos_token else translation[1:]
    clean_translation = ' '.join(translated_words)

    print(f"German: {german_sentence}")
    print(f"English: {clean_translation}")
    print()

#### BLEU Score Calculation

In [None]:
# Translate every German sentence of the test split (tqdm shows a progress bar)
print("Generating translations for test set...")

test_translations = []
for example in tqdm.tqdm(test_data):
    # Arguments follow translate_sentence's positional order; max_length keeps its default
    translation = translate_sentence(
        example["de"], model, de_nlp, en_nlp, de_vocab, en_vocab,
        lower, sos_token, eos_token, device,
    )
    test_translations.append(translation)

# Prepare data for BLEU calculation
def create_tokenizer_function(nlp, lower):
    """
    Build a ``text -> list of token strings`` callable on top of ``nlp.tokenizer``.

    Args:
        nlp: object exposing ``tokenizer(text)`` whose items have ``.text``.
        lower: when truthy, the returned function lowercases every token.

    Returns:
        A function mapping a string to its list of (optionally lowercased) tokens.
    """
    def tokenize(text):
        pieces = (token.text for token in nlp.tokenizer(text))
        return [piece.lower() for piece in pieces] if lower else list(pieces)

    return tokenize

# Tokenizer handed to the BLEU metric: same spaCy tokenization and casing as the targets
tokenizer = create_tokenizer_function(en_nlp, lower)

# Format predictions and references for BLEU.
# Each translation starts with <sos>. It ends with <eos> only when decoding
# stopped before max_length, so the last token is removed only in that case;
# an unconditional [1:-1] would cut a real word off every hypothesis that hit
# the length limit.
predictions = [
    ' '.join(translation[1:-1] if translation[-1] == eos_token else translation[1:])
    for translation in test_translations
]
references = [[example["en"]] for example in test_data]  # BLEU expects list of references

# Calculate BLEU score
bleu_metric = evaluate.load("bleu")
bleu_results = bleu_metric.compute(
    predictions=predictions,
    references=references,
    tokenizer=tokenizer
)

print(f"\nFinal BLEU Score: {bleu_results['bleu']:.4f}")
print(f"Precision scores: {bleu_results['precisions']}")
print(f"Brevity penalty: {bleu_results['brevity_penalty']:.4f}")

# Show a few example translations
print("\nSample translations:")
for i in range(3):
    print(f"German: {test_data[i]['de']}")
    print(f"Expected: {test_data[i]['en']}")
    print(f"My model: {predictions[i]}")
    print()