In [2]:
import os
import math
import time
import json
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from tqdm import tqdm
from collections import Counter
import re
import random
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
# This module-level handle is read by the model, training and inference cells below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [40]:
# Fetch the NLTK 'punkt_tab' tokenizer data; nltk.word_tokenize is used later
# when building BLEU reference tokens.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/sunil/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
class Vocabulary:
    """
    Bidirectional token <-> index mapping with frequency tracking.

    The special tokens always occupy the first indices (in the order given),
    and their indices are exposed as ``UNK_IDX``, ``PAD_IDX``, ``BOS_IDX`` and
    ``EOS_IDX``. ``special_tokens`` must therefore contain '<unk>', '<pad>',
    '<bos>' and '<eos>'.
    """
    def __init__(self, special_tokens=None):
        if special_tokens is None:
            self.special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']
        else:
            self.special_tokens = special_tokens

        # Initialize word to index and index to word mappings
        self.word2idx = {token: idx for idx, token in enumerate(self.special_tokens)}
        self.idx2word = {idx: token for idx, token in enumerate(self.special_tokens)}
        self.word_count = {}  # For tracking word frequencies

        # Token indices
        self.UNK_IDX = self.word2idx['<unk>']
        self.PAD_IDX = self.word2idx['<pad>']
        self.BOS_IDX = self.word2idx['<bos>']
        self.EOS_IDX = self.word2idx['<eos>']

        # Current vocabulary size
        self.n_words = len(self.special_tokens)

    def add_word(self, word):
        """Register one occurrence of ``word`` and return its index."""
        if word not in self.word2idx:
            self.word2idx[word] = self.n_words
            self.word_count[word] = 1
            self.idx2word[self.n_words] = word
            self.n_words += 1
        else:
            # Also reached for special tokens that appear literally in the
            # text; prune_vocab() makes sure they are never re-indexed.
            self.word_count[word] = self.word_count.get(word, 0) + 1

        return self.word2idx[word]

    def add_words(self, words):
        """Register every word in ``words`` and return the list of indices."""
        return [self.add_word(word) for word in words]

    def prune_vocab(self, min_freq=2, max_size=None):
        """
        Rebuild the mappings so that only frequent words are kept.

        Special tokens always keep their reserved leading indices, even when
        the same strings occur in the corpus and were counted by add_word().

        Args:
            min_freq: Minimum frequency required for keeping a token
            max_size: Maximum vocabulary size (including special tokens)
        """
        reserved = set(self.special_tokens)

        # Rank regular words by frequency. sorted() is stable, so ties keep
        # their first-seen order. Special tokens are excluded here: re-adding
        # them below would move e.g. '<unk>' away from UNK_IDX and leave two
        # idx2word entries for the same token.
        ranked = sorted(
            ((word, count) for word, count in self.word_count.items() if word not in reserved),
            key=lambda item: item[1],
            reverse=True,
        )

        # Apply max_size limit if provided. Clamp at zero so a max_size that
        # is smaller than the number of special tokens does not turn into a
        # negative slice bound (which would keep words instead of dropping them).
        if max_size is not None:
            ranked = ranked[:max(max_size - len(self.special_tokens), 0)]

        # Filter by frequency
        kept_words = [word for word, count in ranked if count >= min_freq]

        # Reset vocabulary to the special tokens only
        self.word2idx = {token: idx for idx, token in enumerate(self.special_tokens)}
        self.idx2word = {idx: token for idx, token in enumerate(self.special_tokens)}
        self.n_words = len(self.special_tokens)

        # Re-index the surviving words directly after the special tokens
        for word in kept_words:
            self.word2idx[word] = self.n_words
            self.idx2word[self.n_words] = word
            self.n_words += 1

        print(f"Pruned vocabulary from {len(self.word_count)} to {self.n_words} tokens")

        # word_count is intentionally left intact for reference

    def __len__(self):
        return self.n_words

    def to_json(self):
        """Return a JSON-serializable dict describing the vocabulary."""
        return {
            'word2idx': self.word2idx,
            # JSON object keys must be strings
            'idx2word': {str(idx): word for idx, word in self.idx2word.items()},
            'word_count': self.word_count,
            'n_words': self.n_words,
            'special_tokens': self.special_tokens
        }

    @classmethod
    def from_json(cls, json_data):
        """Rebuild a vocabulary from the dict produced by to_json()."""
        vocab = cls(special_tokens=json_data['special_tokens'])
        vocab.word2idx = json_data['word2idx']
        # Undo the str() conversion applied to the keys in to_json()
        vocab.idx2word = {int(idx): word for idx, word in json_data['idx2word'].items()}
        vocab.word_count = json_data['word_count']
        vocab.n_words = json_data['n_words']

        # Update special token indices from the loaded mapping
        vocab.UNK_IDX = vocab.word2idx['<unk>']
        vocab.PAD_IDX = vocab.word2idx['<pad>']
        vocab.BOS_IDX = vocab.word2idx['<bos>']
        vocab.EOS_IDX = vocab.word2idx['<eos>']

        return vocab

    def save(self, path):
        """Write the vocabulary to ``path`` as UTF-8 JSON."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(self.to_json(), f, ensure_ascii=False, indent=2)

    @classmethod
    def load(cls, path):
        """Read a vocabulary previously written by save()."""
        with open(path, 'r', encoding='utf-8') as f:
            return cls.from_json(json.load(f))


In [6]:
class SimpleTokenizer:
    """Whitespace tokenizer that maps lower-cased words to vocabulary indices."""

    def __init__(self, vocab, language='en'):
        self.language = language
        self.vocab = vocab

    def tokenize(self, text):
        """Lower-case ``text`` and split it on whitespace."""
        # Word-level splitting only; swap in a richer tokenizer here if needed.
        return text.lower().split()

    def encode(self, text):
        """Return the vocabulary index of every token, using UNK for unknown words."""
        lookup = self.vocab.word2idx
        fallback = self.vocab.UNK_IDX
        return [lookup.get(piece, fallback) for piece in self.tokenize(text)]

    def decode(self, indices):
        """Join the words for ``indices`` with spaces; unknown indices become '<unk>'."""
        id_to_word = self.vocab.idx2word
        return ' '.join(id_to_word.get(index, '<unk>') for index in indices)

In [7]:
class GermanEnglishDataset(Dataset):
    """
    Dataset of German (source) / English (target) sentence pairs.

    Every item is a dict with fixed-length ``max_len`` index tensors under
    'source' and 'target' plus the raw strings under 'source_text' and
    'target_text'. Targets are wrapped in <bos> ... <eos>; sources are not.
    """

    def __init__(self, german_texts, english_texts, src_tokenizer, tgt_tokenizer, 
                 max_len=100, is_train=True):
        self.german_texts = german_texts
        self.english_texts = english_texts
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.max_len = max_len
        self.is_train = is_train

        # Reference to vocabulary objects
        self.src_vocab = src_tokenizer.vocab
        self.tgt_vocab = tgt_tokenizer.vocab

    def __len__(self):
        return len(self.german_texts)

    def _fit_to_max_len(self, token_ids, pad_idx):
        """Truncate or right-pad ``token_ids`` to exactly ``max_len`` entries."""
        token_ids = token_ids[:self.max_len]
        return token_ids + [pad_idx] * (self.max_len - len(token_ids))

    def __getitem__(self, idx):
        german_text = self.german_texts[idx]
        english_text = self.english_texts[idx]

        # Tokenize and encode
        german_tokens = self.src_tokenizer.encode(german_text)
        english_tokens = self.tgt_tokenizer.encode(english_text)

        # Truncate the target words BEFORE adding the markers, keeping two
        # slots free for <bos>/<eos>. Truncating afterwards would cut off
        # <eos> on long sentences, so the model would never be trained to
        # stop on exactly the examples that need it most.
        word_budget = max(self.max_len - 2, 0)
        english_tokens = (
            [self.tgt_vocab.BOS_IDX]
            + english_tokens[:word_budget]
            + [self.tgt_vocab.EOS_IDX]
        )

        # Pad or truncate both sides to the fixed length
        german_tokens = self._fit_to_max_len(german_tokens, self.src_vocab.PAD_IDX)
        english_tokens = self._fit_to_max_len(english_tokens, self.tgt_vocab.PAD_IDX)

        return {
            'source': torch.tensor(german_tokens, dtype=torch.long),
            'target': torch.tensor(english_tokens, dtype=torch.long),
            'source_text': german_text,
            'target_text': english_text
        }

In [8]:
def load_parallel_text(src_file, tgt_file):
    """
    Read two line-aligned UTF-8 files and return their stripped lines.

    Returns a ``(src_texts, tgt_texts)`` tuple of equally long lists; an
    AssertionError is raised when the files differ in line count.
    """
    def _read_stripped_lines(path):
        with open(path, 'r', encoding='utf-8') as handle:
            return [raw_line.strip() for raw_line in handle]

    src_texts = _read_stripped_lines(src_file)
    tgt_texts = _read_stripped_lines(tgt_file)

    assert len(src_texts) == len(tgt_texts), "Source and target files have different number of lines"

    return src_texts, tgt_texts

In [9]:
def build_vocab_from_texts(texts, min_freq=2, max_size=None):
    """
    Count the lower-cased, whitespace-split words of ``texts`` in a fresh
    Vocabulary, prune it with ``min_freq`` / ``max_size`` and return it.
    """
    vocabulary = Vocabulary()

    # First pass: register every word occurrence
    for sentence in texts:
        vocabulary.add_words(sentence.lower().split())

    # Second pass: drop the rare words
    vocabulary.prune_vocab(min_freq=min_freq, max_size=max_size)
    return vocabulary

In [10]:
def prepare_data(src_file, tgt_file, src_vocab=None, tgt_vocab=None, 
                min_freq=2, max_vocab_size=50000, max_len=100, batch_size=64,
                train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, seed=42):
    """
    Load a parallel corpus and turn it into train/validation/test loaders.

    Vocabularies are built from the loaded texts unless they are passed in.
    The sentence pairs are shuffled with ``random`` seeded by ``seed`` and
    split according to the three ratios (which must sum to 1).

    Returns a dict with the three data loaders, both vocabularies and both
    tokenizers.
    """
    src_texts, tgt_texts = load_parallel_text(src_file, tgt_file)
    print(f"Loaded {len(src_texts)} parallel sentences")

    # Only build the vocabularies the caller did not supply
    if src_vocab is None:
        src_vocab = build_vocab_from_texts(src_texts, min_freq=min_freq, max_size=max_vocab_size)
        print(f"Built source vocabulary with {len(src_vocab)} tokens")

    if tgt_vocab is None:
        tgt_vocab = build_vocab_from_texts(tgt_texts, min_freq=min_freq, max_size=max_vocab_size)
        print(f"Built target vocabulary with {len(tgt_vocab)} tokens")

    src_tokenizer = SimpleTokenizer(src_vocab, language='de')
    tgt_tokenizer = SimpleTokenizer(tgt_vocab, language='en')

    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, "Ratios must sum to 1"

    # Seeded shuffle of the pair indices so the split is reproducible
    random.seed(seed)
    order = list(range(len(src_texts)))
    random.shuffle(order)

    # Split boundaries; the test set receives whatever is left over
    train_end = int(len(order) * train_ratio)
    val_end = train_end + int(len(order) * val_ratio)

    def _make_dataset(pair_ids, training):
        return GermanEnglishDataset(
            [src_texts[i] for i in pair_ids],
            [tgt_texts[i] for i in pair_ids],
            src_tokenizer, tgt_tokenizer, max_len=max_len, is_train=training
        )

    train_dataset = _make_dataset(order[:train_end], True)
    val_dataset = _make_dataset(order[train_end:val_end], False)
    test_dataset = _make_dataset(order[val_end:], False)

    # Only the training loader reshuffles every epoch
    loaders = {
        'train_dataloader': DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
        'val_dataloader': DataLoader(val_dataset, batch_size=batch_size),
        'test_dataloader': DataLoader(test_dataset, batch_size=batch_size),
    }

    print(f"Split data into {len(train_dataset)} train, {len(val_dataset)} validation, and {len(test_dataset)} test samples")

    loaders.update({
        'src_vocab': src_vocab,
        'tgt_vocab': tgt_vocab,
        'src_tokenizer': src_tokenizer,
        'tgt_tokenizer': tgt_tokenizer
    })
    return loaders

In [11]:
class PositionalEncoding(nn.Module):
    """
    Adds fixed sinusoidal positional encodings to token embeddings to
    introduce a notion of word order, followed by dropout.

    Args:
        d_model: Embedding size (even or odd)
        max_len: Longest sequence length that can be encoded
        dropout: Dropout probability applied after the addition
    """
    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Positions as a column vector, shape (max_len, 1)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # One frequency per sin/cos pair, shape (ceil(d_model / 2),)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Shape (max_len, ceil(d_model / 2))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)

        # Sine on even indices
        pe[:, 0::2] = torch.sin(angles)

        # Cosine on odd indices. For an odd d_model there is one odd column
        # fewer than even columns, so the last frequency is dropped here
        # (the unsliced assignment raised a shape error in that case).
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Add batch dimension and register as a buffer (not a parameter, but
        # saved in the state dict and moved together with the module)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]

        Raises:
            RuntimeError: if seq_len exceeds the max_len given at construction
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            # Fail with an actionable message instead of an opaque broadcast
            # error (or a silent broadcast when max_len == 1).
            raise RuntimeError(
                f"Sequence length {seq_len} exceeds the positional encoding "
                f"max_len of {self.pe.size(1)}"
            )
        x = x + self.pe[:, :seq_len, :]
        return self.dropout(x)

In [12]:
class MultiHeadAttention(nn.Module):
    """
    Scaled dot-product attention computed in parallel over ``num_heads``
    heads, as described in "Attention is All You Need".
    """
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()

        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        # Width of a single head
        self.d_k = d_model // num_heads

        # Input projections for queries, keys and values
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)

        # Projection applied to the concatenated heads
        self.output = nn.Linear(d_model, d_model)

        # Dropout on the attention weights
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, q, k, v, mask=None):
        """
        Args:
            q: Query tensor, shape [batch_size, seq_len_q, d_model]
            k: Key tensor, shape [batch_size, seq_len_k, d_model]
            v: Value tensor, shape [batch_size, seq_len_v, d_model]
            mask: Optional mask; positions where it equals 0 are blocked.
                A 3-D mask [batch_size, seq_len_q, seq_len_k] gets a head
                dimension inserted, any other rank is used as given.

        Returns:
            Tuple of the projected output [batch_size, seq_len_q, d_model]
            and the attention weights (after dropout)
            [batch_size, num_heads, seq_len_q, seq_len_k].
        """
        n_batch = q.size(0)

        def _to_heads(projected):
            # [batch, seq, d_model] -> [batch, heads, seq, d_k]
            return projected.view(n_batch, -1, self.num_heads, self.d_k).transpose(1, 2)

        queries = _to_heads(self.q_linear(q))
        keys = _to_heads(self.k_linear(k))
        values = _to_heads(self.v_linear(v))

        # Scaled dot-product scores
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            if mask.dim() == 3:
                # Broadcast one mask over all heads
                mask = mask.unsqueeze(1)
            scores = scores.masked_fill(mask == 0, -1e9)

        attn_weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of values, then merge the heads back into d_model
        context = torch.matmul(attn_weights, values)
        context = context.transpose(1, 2).contiguous().view(n_batch, -1, self.d_model)

        return self.output(context), attn_weights

In [13]:
class PositionwiseFeedforward(nn.Module):
    """
    Two-layer feed-forward network applied independently at every position,
    as described in "Attention is All You Need".
    """
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()

        # Expand to d_ff, then project back to d_model
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """
        Args:
            x: Input tensor, shape [batch_size, seq_len, d_model]

        Returns:
            Tensor of the same shape as ``x``
        """
        hidden = F.relu(self.linear1(x))
        hidden = self.dropout(hidden)
        return self.linear2(hidden)

In [14]:
class EncoderLayer(nn.Module):
    """
    One Transformer encoder layer: self-attention followed by a position-wise
    feed-forward network, each wrapped in dropout, a residual connection and
    layer normalization (post-norm).
    """
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()

        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = PositionwiseFeedforward(d_model, d_ff, dropout)

        # One LayerNorm per sub-layer
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Shared dropout applied to each sub-layer output before the residual add
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask=None):
        """
        Args:
            x: Input tensor, shape [batch_size, seq_len, d_model]
            mask: Optional attention mask passed through to the self-attention

        Returns:
            Tensor of the same shape as ``x``
        """
        # Sub-layer 1: self-attention (attention weights are discarded)
        attended, _ = self.self_attn(x, x, x, mask)
        normed = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: position-wise feed-forward
        transformed = self.feed_forward(normed)
        return self.norm2(normed + self.dropout(transformed))

In [15]:
class DecoderLayer(nn.Module):
    """
    One Transformer decoder layer: masked self-attention, cross-attention
    over the encoder output and a position-wise feed-forward network, each
    wrapped in dropout, a residual connection and layer normalization.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()

        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = PositionwiseFeedforward(d_model, d_ff, dropout)

        # One LayerNorm per sub-layer
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        # Shared dropout applied to each sub-layer output before the residual add
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """
        Args:
            x: Target sequence, shape [batch_size, tgt_seq_len, d_model]
            enc_output: Encoder output, shape [batch_size, src_seq_len, d_model]
            src_mask: Optional mask over source positions, used by the cross-attention
            tgt_mask: Optional mask over target positions, used by the self-attention

        Returns:
            Tensor of the same shape as ``x``
        """
        # Sub-layer 1: self-attention over the target prefix
        attended, _ = self.self_attn(x, x, x, tgt_mask)
        state = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: queries from the decoder, keys/values from the encoder
        context, _ = self.cross_attn(state, enc_output, enc_output, src_mask)
        state = self.norm2(state + self.dropout(context))

        # Sub-layer 3: position-wise feed-forward
        transformed = self.feed_forward(state)
        return self.norm3(state + self.dropout(transformed))

In [16]:
class Transformer(nn.Module):
    """
    Complete encoder-decoder Transformer for Neural Machine Translation.

    Args:
        src_vocab_size: Number of source tokens
        tgt_vocab_size: Number of target tokens
        d_model: Embedding / hidden size
        num_heads: Attention heads per layer
        num_encoder_layers: Number of stacked encoder layers
        num_decoder_layers: Number of stacked decoder layers
        d_ff: Inner size of the feed-forward sub-layers
        max_len: Longest sequence the positional encoding supports
        dropout: Dropout probability used throughout
        pad_idx: Index of the padding token (same for source and target)
    """
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, num_heads=8,
                 num_encoder_layers=6, num_decoder_layers=6, d_ff=2048, max_len=100,
                 dropout=0.1, pad_idx=1):
        super(Transformer, self).__init__()

        self.d_model = d_model
        self.pad_idx = pad_idx

        # Embedding layers
        self.src_embedding = nn.Embedding(src_vocab_size, d_model, padding_idx=pad_idx)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model, padding_idx=pad_idx)

        # Positional encoding (shared by encoder and decoder inputs)
        self.positional_encoding = PositionalEncoding(d_model, max_len, dropout)

        # Encoder and decoder layers
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_encoder_layers)
        ])

        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_decoder_layers)
        ])

        # Output projection onto the target vocabulary
        self.output_projection = nn.Linear(d_model, tgt_vocab_size)

        # Initialize parameters
        self._init_parameters()

    def _init_parameters(self):
        """
        Xavier-initialize every weight matrix, then restore the all-zero
        padding rows of both embeddings.
        """
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

        # The loop above also overwrites the embedding rows that
        # nn.Embedding zeroes for padding_idx. Those rows receive no
        # gradient, so they would stay at their random value forever;
        # put the zeros back.
        with torch.no_grad():
            for embedding in (self.src_embedding, self.tgt_embedding):
                if embedding.padding_idx is not None:
                    embedding.weight[embedding.padding_idx].zero_()

    def _create_src_mask(self, src):
        """
        Create padding mask for source sequence (True where a real token is).
        """
        # src: [batch_size, src_seq_len]
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)
        # src_mask: [batch_size, 1, 1, src_seq_len]
        return src_mask

    def _create_tgt_mask(self, tgt):
        """
        Create combined padding and look-ahead mask for target sequence.
        """
        # tgt: [batch_size, tgt_seq_len]
        batch_size, tgt_seq_len = tgt.size()

        # Padding mask
        padding_mask = (tgt != self.pad_idx).unsqueeze(1).unsqueeze(2)
        # padding_mask: [batch_size, 1, 1, tgt_seq_len]

        # Look-ahead mask (lower triangle incl. diagonal is True). It is
        # built on the device of ``tgt`` rather than a notebook-level global,
        # so the model works wherever its inputs live.
        look_ahead_mask = torch.ones(tgt_seq_len, tgt_seq_len, device=tgt.device).triu(diagonal=1).eq(0)
        # look_ahead_mask: [tgt_seq_len, tgt_seq_len]

        # Combine padding and look-ahead masks
        combined_mask = padding_mask & look_ahead_mask
        # combined_mask: [batch_size, 1, tgt_seq_len, tgt_seq_len]

        return combined_mask

    def encode(self, src, src_mask=None):
        """
        Encode source sequence.

        Returns:
            Tuple of the encoder output [batch_size, src_seq_len, d_model]
            and the source mask that was used.
        """
        # src: [batch_size, src_seq_len]
        if src_mask is None:
            src_mask = self._create_src_mask(src)

        # Apply embedding (scaled by sqrt(d_model)) and positional encoding
        src_embedded = self.src_embedding(src) * math.sqrt(self.d_model)
        src_embedded = self.positional_encoding(src_embedded)

        # Apply encoder layers
        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        return enc_output, src_mask

    def decode(self, tgt, enc_output, src_mask=None, tgt_mask=None):
        """
        Decode target sequence given encoded source.

        Returns:
            Logits over the target vocabulary,
            shape [batch_size, tgt_seq_len, tgt_vocab_size].
        """
        # tgt: [batch_size, tgt_seq_len]
        if tgt_mask is None:
            tgt_mask = self._create_tgt_mask(tgt)

        # Apply embedding (scaled by sqrt(d_model)) and positional encoding
        tgt_embedded = self.tgt_embedding(tgt) * math.sqrt(self.d_model)
        tgt_embedded = self.positional_encoding(tgt_embedded)

        # Apply decoder layers
        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        # Apply output projection
        output = self.output_projection(dec_output)

        return output

    def forward(self, src, tgt):
        """
        Forward pass of the Transformer model.

        Args:
            src: Source indices, shape [batch_size, src_seq_len]
            tgt: Target (decoder input) indices, shape [batch_size, tgt_seq_len]

        Returns:
            Logits, shape [batch_size, tgt_seq_len, tgt_vocab_size]
        """
        # Create masks
        src_mask = self._create_src_mask(src)
        tgt_mask = self._create_tgt_mask(tgt)

        # Encode source
        enc_output, src_mask = self.encode(src, src_mask)

        # Decode target
        output = self.decode(tgt, enc_output, src_mask, tgt_mask)

        return output

In [17]:
def beam_search(model, src, tgt_vocab, max_len=100, beam_size=5, device=None):
    """
    Beam search decoding for inference.

    Hypotheses are ranked by their summed token log-probabilities (no length
    normalization). A hypothesis that emits <eos> is stored as finished and
    its beam slot is retired so it is never extended or re-scored.

    Args:
        model: Trained Transformer model (needs ``_create_src_mask``,
            ``encode`` and ``decode``)
        src: Source index tensor, shape [batch_size, src_seq_len]
        tgt_vocab: Target vocabulary
        max_len: Maximum length of a generated sequence, counting <bos>
        beam_size: Number of hypotheses kept per sentence
        device: Device for the beam tensors; defaults to ``src.device``

    Returns:
        List with one translated string per source sentence.
    """
    model.eval()
    if device is None:
        device = src.device

    # Constants
    batch_size = src.size(0)
    PAD_IDX = tgt_vocab.PAD_IDX
    BOS_IDX = tgt_vocab.BOS_IDX
    EOS_IDX = tgt_vocab.EOS_IDX
    NEG_INF = float('-inf')

    # Finished hypotheses per sentence (token lists still start with <bos>)
    completed_seqs = [[] for _ in range(batch_size)]
    completed_scores = [[] for _ in range(batch_size)]

    # Pure inference: keep the encoder pass out of autograd as well
    with torch.no_grad():
        # Encode source once and repeat the result for every beam
        src_mask = model._create_src_mask(src)
        enc_output, src_mask = model.encode(src, src_mask)
        enc_output = enc_output.repeat_interleave(beam_size, dim=0)
        src_mask = src_mask.repeat_interleave(beam_size, dim=0)

        # Every beam starts with <bos>
        beams = torch.full((batch_size * beam_size, 1), BOS_IDX, dtype=torch.long, device=device)

        # Only the first beam of each sentence is live at the start. If all
        # beams started with score 0 they would be identical, top-k would
        # pick the same token from each of them, and the search would
        # degenerate to greedy decoding.
        beam_scores = torch.full((batch_size, beam_size), NEG_INF, device=device)
        beam_scores[:, 0] = 0.0

        # Row offset of each sentence inside the flattened beam tensor
        row_offsets = torch.arange(batch_size, device=device).unsqueeze(1) * beam_size

        for _ in range(max_len - 1):
            logits = model.decode(beams, enc_output, src_mask)

            # Log-probabilities of the next token: [batch_size, beam_size, vocab_size]
            step_log_probs = F.log_softmax(logits[:, -1, :], dim=-1)
            vocab_size = step_log_probs.size(-1)
            step_log_probs = step_log_probs.view(batch_size, beam_size, vocab_size)

            # Cumulative scores of all continuations; retired beams stay at -inf
            candidate_scores = (beam_scores.unsqueeze(-1) + step_log_probs).view(batch_size, -1)

            # Best beam_size continuations per sentence
            topk_scores, topk_indices = candidate_scores.topk(beam_size, dim=1, largest=True, sorted=True)
            beam_indices = topk_indices // vocab_size   # originating beam
            token_indices = topk_indices % vocab_size   # chosen token

            # Sequences the selected continuations extend: [batch_size*beam_size, cur_len]
            parents = beams[(row_offsets + beam_indices).view(-1)]

            alive = torch.isfinite(topk_scores)
            finished = alive & token_indices.eq(EOS_IDX)

            # Store hypotheses that just produced <eos>
            for batch_idx, beam_idx in finished.nonzero().tolist():
                completed_seqs[batch_idx].append(parents[batch_idx * beam_size + beam_idx].tolist())
                completed_scores[batch_idx].append(topk_scores[batch_idx, beam_idx].item())

            # Retired slots (finished now, or dead already) get a <pad>
            # placeholder to keep the tensor rectangular and a -inf score so
            # they can never be selected again.
            retired = finished | ~alive
            next_tokens = token_indices.masked_fill(retired, PAD_IDX)
            beams = torch.cat([parents, next_tokens.view(-1, 1)], dim=1)
            beam_scores = topk_scores.masked_fill(retired, NEG_INF)

            # A sentence is done once it has beam_size finished hypotheses,
            # or once its best finished score is at least the best live
            # score (scores only decrease as tokens are added, so no live
            # beam can overtake it any more).
            best_live = beam_scores.max(dim=1).values.tolist()
            if all(
                len(completed_scores[b]) >= beam_size
                or (completed_scores[b] and max(completed_scores[b]) >= best_live[b])
                for b in range(batch_size)
            ):
                break

    # Select the best hypothesis of each sentence
    results = []
    for batch_idx in range(batch_size):
        seqs = list(completed_seqs[batch_idx])
        scores = list(completed_scores[batch_idx])

        # Too few finished hypotheses: let the still-live beams compete too
        if len(seqs) < beam_size:
            for beam_idx in range(beam_size):
                score = beam_scores[batch_idx, beam_idx].item()
                if score == NEG_INF:
                    continue  # retired slot, holds no real hypothesis
                seqs.append(beams[batch_idx * beam_size + beam_idx].tolist())
                scores.append(score)

        if seqs:
            best_seq = seqs[max(range(len(scores)), key=scores.__getitem__)]
        else:
            best_seq = []

        # Strip the leading <bos> and anything from <eos> onwards
        if best_seq and best_seq[0] == BOS_IDX:
            best_seq = best_seq[1:]
        if EOS_IDX in best_seq:
            best_seq = best_seq[:best_seq.index(EOS_IDX)]

        results.append(best_seq)

    # Convert token indices to words
    translated_sentences = []
    for seq in results:
        words = [tgt_vocab.idx2word.get(idx, '<unk>') for idx in seq]
        translated_sentences.append(" ".join(words))

    return translated_sentences

In [18]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    trainable_total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

In [19]:
def initialize_weights(m):
    """
    Xavier-initialize the ``weight`` of module ``m`` when it is a tensor with
    more than one dimension; any other module is left untouched.

    Intended for use with ``model.apply(initialize_weights)``, so it has to
    tolerate every kind of sub-module, including ones whose ``weight``
    attribute exists but is ``None`` (e.g. ``nn.LayerNorm`` built with
    ``elementwise_affine=False``).
    """
    weight = getattr(m, 'weight', None)
    if isinstance(weight, torch.Tensor) and weight.dim() > 1:
        nn.init.xavier_uniform_(weight)

In [20]:
def train_epoch(model, dataloader, optimizer, criterion, clip=1.0):
    """
    Train the model for one epoch with teacher forcing.

    Args:
        model: Transformer model
        dataloader: Training data loader yielding dicts with 'source' and
            'target' index tensors
        optimizer: Optimizer
        criterion: Loss function taking (logits [N, vocab], targets [N])
        clip: Maximum gradient norm used for clipping

    Returns:
        Mean of the per-batch losses over the epoch.
    """
    model.train()
    epoch_loss = 0

    progress_bar = tqdm(dataloader, desc="Training", leave=False)

    # Count batches explicitly: tqdm only refreshes its ``n`` attribute at
    # display intervals, so it lags behind the real batch count and cannot
    # serve as the divisor of the running average.
    for step, batch in enumerate(progress_bar, start=1):
        # Get source and target sequences
        src = batch['source'].to(device)
        tgt = batch['target'].to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass - teacher forcing
        # Input: all but the last token
        # Target: all but the first token (<bos>)
        output = model(src, tgt[:, :-1])

        # Flatten logits and targets for the loss computation
        output_dim = output.shape[-1]
        output = output.contiguous().view(-1, output_dim)
        tgt = tgt[:, 1:].contiguous().view(-1)

        # Compute loss
        loss = criterion(output, tgt)

        # Backward pass
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        # Update parameters
        optimizer.step()

        # Show the running mean loss in the progress bar
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=epoch_loss / step)

    return epoch_loss / len(dataloader)


In [21]:
def evaluate(model, dataloader, criterion):
    """
    Compute the mean per-batch loss on validation or test data, using
    teacher forcing and no gradient tracking.

    Args:
        model: Transformer model
        dataloader: Validation/Test data loader
        criterion: Loss function
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            src = batch['source'].to(device)
            full_target = batch['target'].to(device)

            # Decoder sees everything but the last token and is scored
            # against everything but the first one
            decoder_input = full_target[:, :-1]
            labels = full_target[:, 1:].contiguous().view(-1)

            logits = model(src, decoder_input)
            logits = logits.contiguous().view(-1, logits.shape[-1])

            running_loss += criterion(logits, labels).item()

    return running_loss / len(dataloader)

In [22]:
def translate(model, src_tokenizer, tgt_vocab, src_text, max_len=100, beam_size=5):
    """
    Translate a single source sentence (German to English) with beam search.

    Args:
        model: Trained Transformer model
        src_tokenizer: Source tokenizer (German); its ``encode`` method is
            used to turn the text into token ids
        tgt_vocab: Target vocabulary (English)
        src_text: Source text in German
        max_len: Maximum length of generated sequence
        beam_size: Beam size for beam search

    Returns:
        The first element of whatever ``beam_search`` returns
        (presumably the best-scoring translation; confirm against ``beam_search``).
    """
    model.eval()

    # Encode the sentence and wrap it in a list so the tensor is a batch of one
    token_ids = src_tokenizer.encode(src_text)
    src_batch = torch.tensor([token_ids], dtype=torch.long).to(device)

    candidates = beam_search(model, src_batch, tgt_vocab, max_len, beam_size, device)
    return candidates[0]

In [23]:
def calculate_bleu(model, test_dataloader, src_tokenizer, tgt_vocab, max_len=100, beam_size=5):
    """
    Compute a corpus-level BLEU score for the model over a data loader.

    Each batch must provide raw strings under 'source_text' and 'target_text'.
    Every source sentence is translated individually with ``translate``; both
    the reference and the translation are lower-cased and split with
    ``nltk.word_tokenize`` before scoring.

    Args:
        model: Trained Transformer model
        test_dataloader: Test data loader
        src_tokenizer: Source tokenizer (German)
        tgt_vocab: Target vocabulary (English)
        max_len: Maximum length of generated sequence
        beam_size: Beam size for beam search

    Returns:
        The value of ``corpus_bleu`` with ``SmoothingFunction().method1``.
    """
    model.eval()
    references, hypotheses = [], []

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Calculating BLEU", leave=False):
            sentence_pairs = zip(batch['source_text'], batch['target_text'])

            for src_text, ref_text in sentence_pairs:
                # corpus_bleu takes a list of references per hypothesis; there is one here
                references.append([nltk.word_tokenize(ref_text.lower())])

                translated = translate(model, src_tokenizer, tgt_vocab, src_text, max_len, beam_size)
                hypotheses.append(nltk.word_tokenize(translated.lower()))

    return corpus_bleu(
        references,
        hypotheses,
        smoothing_function=SmoothingFunction().method1,
    )

In [24]:
def train_model(train_dataloader, valid_dataloader, src_vocab_size, tgt_vocab_size, 
                d_model=512, num_heads=8, num_encoder_layers=6, num_decoder_layers=6, 
                d_ff=2048, max_len=100, dropout=0.1, lr=0.0005, n_epochs=10, 
                clip=1.0, warmup_steps=4000, pad_idx=1):
    """
    Build, train and validate the Transformer model.

    Side effects: writes the weights with the lowest validation loss to
    'best_model.pt' and the loss curves to 'training_loss.png' in the current
    working directory, and prints per-epoch statistics.

    Args:
        train_dataloader: Training data loader
        valid_dataloader: Validation data loader
        src_vocab_size: Size of source vocabulary
        tgt_vocab_size: Size of target vocabulary
        d_model: Dimension of model
        num_heads: Number of attention heads
        num_encoder_layers: Number of encoder layers
        num_decoder_layers: Number of decoder layers
        d_ff: Dimension of feed-forward network
        max_len: Maximum sequence length
        dropout: Dropout rate
        lr: Peak learning rate (reached at the end of warm-up)
        n_epochs: Number of training epochs
        clip: Gradient clipping value
        warmup_steps: Number of optimizer steps (batches) of linear warm-up
        pad_idx: Padding index (passed to the model and ignored by the loss)

    Returns:
        Tuple ``(model, train_losses, valid_losses)``. ``model`` holds the
        weights after the final epoch (not necessarily the best ones), and the
        two lists hold one loss value per epoch.
    """
    # Initialize model
    model = Transformer(
        src_vocab_size=src_vocab_size,
        tgt_vocab_size=tgt_vocab_size,
        d_model=d_model,
        num_heads=num_heads,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        d_ff=d_ff,
        max_len=max_len,
        dropout=dropout,
        pad_idx=pad_idx
    ).to(device)

    print(f"The model has {count_parameters(model):,} trainable parameters")

    # Initialize weights
    model.apply(initialize_weights)

    # Define optimizer and learning rate scheduler
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)

    # The scheduler is stepped once per epoch (see the loop below), whereas
    # warmup_steps counts optimizer steps. Convert between the two so the
    # schedule is expressed in one unit.
    steps_per_epoch = max(1, len(train_dataloader))
    total_steps = n_epochs * steps_per_epoch
    decay_steps = max(1, total_steps - warmup_steps)

    def lr_lambda(epoch):
        # LambdaLR passes the number of scheduler.step() calls made so far,
        # i.e. the zero-based index of the epoch about to be trained. The
        # schedule is evaluated at the epoch's midpoint so that neither the
        # first epoch (warm-up from 0) nor the last one (cosine reaching 0)
        # runs with a learning rate of exactly zero.
        step = (epoch + 0.5) * steps_per_epoch
        # Linear warmup for warmup_steps steps
        if step < warmup_steps:
            return step / warmup_steps
        # Cosine decay from 1 to 0 over the remaining steps; progress is
        # clamped so the periodic cosine can never rise again.
        progress = min(1.0, (step - warmup_steps) / decay_steps)
        return 0.5 * (1.0 + math.cos(math.pi * progress))

    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    # Define loss function (ignore padding index)
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    # Track best validation loss
    best_valid_loss = float('inf')

    # Training and validation losses history
    train_losses = []
    valid_losses = []

    for epoch in range(n_epochs):
        print(f"Epoch {epoch+1}/{n_epochs}")

        # Train model for one epoch
        start_time = time.time()
        train_loss = train_epoch(model, train_dataloader, optimizer, criterion, clip)
        train_losses.append(train_loss)

        # Evaluate on validation set
        valid_loss = evaluate(model, valid_dataloader, criterion)
        valid_losses.append(valid_loss)

        # Update learning rate for the next epoch
        scheduler.step()

        # Save model if validation loss improves
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'best_model.pt')
            print("* New best model saved *")

        # Print epoch stats (the timing covers training, validation and checkpointing)
        end_time = time.time()
        epoch_mins, epoch_secs = divmod(end_time - start_time, 60)
        # divmod on floats yields a float quotient; show whole minutes
        print(f"Time: {int(epoch_mins)}m {epoch_secs:.2f}s | Train Loss: {train_loss:.4f} | Val Loss: {valid_loss:.4f}")

    # Plot training and validation losses
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Train Loss')
    plt.plot(valid_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Losses')
    plt.legend()
    plt.grid(True)
    plt.savefig('training_loss.png')
    plt.close()

    return model, train_losses, valid_losses

In [27]:
def create_sample_dataset():
    """
    Write a ten-sentence German/English parallel corpus for demonstration runs.

    Creates the 'data' directory if it does not exist and writes one sentence
    per line (UTF-8) to 'data/sample.de' and 'data/sample.en', overwriting any
    existing files. Line i of both files is a translation pair.

    Returns:
        Tuple of (German file path, English file path).
    """
    # (German, English) pairs, kept side by side so the alignment is obvious
    sentence_pairs = [
        ("Ich gehe zur Schule.",
         "I am going to school."),
        ("Obwohl es regnete, ging sie ohne Regenschirm spazieren.",
         "Although it was raining, she went for a walk without an umbrella."),
        ("Der Klimawandel ist ein globales Problem.",
         "Climate change is a global problem."),
        ("Berlin ist die Hauptstadt von Deutschland.",
         "Berlin is the capital of Germany."),
        ("Das Buch, das auf dem Tisch liegt, gehört meinem Bruder.",
         "The book that is on the table belongs to my brother."),
        ("Sie arbeitet als Ärztin in einem Krankenhaus.",
         "She works as a doctor in a hospital."),
        ("Er hat gestern seinen Geburtstag gefeiert.",
         "He celebrated his birthday yesterday."),
        ("Wir sollten mehr Obst und Gemüse essen.",
         "We should eat more fruit and vegetables."),
        ("Die Kinder spielen im Park.",
         "The children are playing in the park."),
        ("Ich lerne seit zwei Jahren Deutsch.",
         "I have been learning German for two years."),
    ]
    de_path, en_path = 'data/sample.de', 'data/sample.en'

    os.makedirs('data', exist_ok=True)

    # Column 0 of each pair goes to the German file, column 1 to the English file
    for path, column in ((de_path, 0), (en_path, 1)):
        with open(path, 'w', encoding='utf-8') as f:
            f.writelines(pair[column] + "\n" for pair in sentence_pairs)

    print("Created sample dataset in data/sample.de and data/sample.en")

    return de_path, en_path

In [25]:
# Expected locations of the parallel training corpus (.de = German source, .en = English target)
src_file = 'data/train.de'
tgt_file = 'data/train.en'

In [28]:
# Fall back to the built-in ten-sentence sample corpus when either training file is missing;
# src_file/tgt_file are then re-pointed at the generated sample files.
if not (os.path.exists(src_file) and os.path.exists(tgt_file)):
    print("Training data not found. Creating sample dataset.")
    src_file, tgt_file = create_sample_dataset()

Training data not found. Creating sample dataset.
Created sample dataset in data/sample.de and data/sample.en


In [29]:
# Data-preparation settings passed to prepare_data (max_len is also passed to train_model)
batch_size = 64
max_len = 100
# NOTE(review): on the ten-sentence sample corpus the recorded prepare_data output shows the
# vocabularies pruned from 60/61 to 9/12 tokens, presumably by this threshold, and the example
# translations consist mostly of <unk>. Consider min_freq = 1 for the demo corpus.
min_freq = 2
max_vocab_size = 50000

In [30]:
# Load the parallel corpus and build everything needed downstream; the returned dict is
# read below for the data loaders, vocabularies and tokenizers.
data = prepare_data(
    src_file=src_file, 
    tgt_file=tgt_file,
    min_freq=min_freq,
    max_vocab_size=max_vocab_size,
    max_len=max_len,
    batch_size=batch_size
)

Loaded 10 parallel sentences
Pruned vocabulary from 60 to 9 tokens
Built source vocabulary with 9 tokens
Pruned vocabulary from 61 to 12 tokens
Built target vocabulary with 12 tokens
Split data into 8 train, 1 validation, and 1 test samples


In [31]:
# Unpack the objects returned by prepare_data into notebook-level names
train_dataloader = data['train_dataloader']
val_dataloader = data['val_dataloader']
test_dataloader = data['test_dataloader']
src_vocab = data['src_vocab']
tgt_vocab = data['tgt_vocab']
src_tokenizer = data['src_tokenizer']
tgt_tokenizer = data['tgt_tokenizer']

In [32]:
# Write both vocabularies to the models/ directory (created here if missing)
os.makedirs('models', exist_ok=True)
src_vocab.save('models/vocab.de.json')
tgt_vocab.save('models/vocab.en.json')

In [33]:
# Transformer architecture hyperparameters (identical to train_model's default values)
d_model = 512
num_heads = 8
num_encoder_layers = 6
num_decoder_layers = 6
d_ff = 2048
dropout = 0.1

In [34]:
# Optimisation settings (identical to train_model's default values)
lr = 0.0005
n_epochs = 10
clip = 1.0
# NOTE(review): the recorded sample run has 8 training sentences, i.e. presumably one batch
# per epoch at batch_size 64, so 10 epochs end long before 4000 warm-up steps and the
# learning rate stays a small fraction of `lr` throughout. Lower this for small corpora.
warmup_steps = 4000

In [37]:
# Train with the settings defined in the cells above. Arguments are passed by keyword
# so the call does not depend on train_model's parameter order.
model, train_losses, valid_losses = train_model(
    train_dataloader=train_dataloader,
    valid_dataloader=val_dataloader,
    src_vocab_size=len(src_vocab),
    tgt_vocab_size=len(tgt_vocab),
    d_model=d_model,
    num_heads=num_heads,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    d_ff=d_ff,
    max_len=max_len,
    dropout=dropout,
    lr=lr,
    n_epochs=n_epochs,
    clip=clip,
    warmup_steps=warmup_steps,
    pad_idx=src_vocab.PAD_IDX,
)

The model has 44,155,404 trainable parameters
Epoch 1/10


                                                                                  

* New best model saved *
Time: 0.0m 0.38s | Train Loss: 2.7949 | Val Loss: 3.1422
Epoch 2/10


                                                                                  

* New best model saved *
Time: 0.0m 0.38s | Train Loss: 2.7894 | Val Loss: 3.1140
Epoch 3/10


                                                                                  

* New best model saved *
Time: 0.0m 0.36s | Train Loss: 2.8140 | Val Loss: 3.0584
Epoch 4/10


                                                                                  

* New best model saved *
Time: 0.0m 0.36s | Train Loss: 2.6038 | Val Loss: 2.9771
Epoch 5/10


                                                                                  

* New best model saved *
Time: 0.0m 0.36s | Train Loss: 2.6476 | Val Loss: 2.8729
Epoch 6/10


                                                                                  

* New best model saved *
Time: 0.0m 0.36s | Train Loss: 2.5282 | Val Loss: 2.7479
Epoch 7/10


                                                                                  

* New best model saved *
Time: 0.0m 0.37s | Train Loss: 2.3646 | Val Loss: 2.6087
Epoch 8/10


                                                                                  

* New best model saved *
Time: 0.0m 0.37s | Train Loss: 2.5238 | Val Loss: 2.4605
Epoch 9/10


                                                                                  

* New best model saved *
Time: 0.0m 0.38s | Train Loss: 2.2681 | Val Loss: 2.3120
Epoch 10/10


                                                                                  

* New best model saved *
Time: 0.0m 0.36s | Train Loss: 2.1335 | Val Loss: 2.1738


In [38]:
# Save the weights from the last epoch; train_model separately writes the weights with the
# lowest validation loss to 'best_model.pt' in the working directory.
torch.save(model.state_dict(), 'models/final_model.pt')

In [42]:
# Corpus-level BLEU over the test data loader, using calculate_bleu's default
# max_len (100) and beam_size (5)
print("Calculating BLEU score on test data...")
bleu_score = calculate_bleu(model, test_dataloader, src_tokenizer, tgt_vocab)
print(f"BLEU score: {bleu_score:.4f}")

Calculating BLEU score on test data...


                                                                                  

BLEU score: 0.0000




In [44]:
print("\nExample translations:")
# German sentences to translate. These are the first five sentences of the corpus written by
# create_sample_dataset, so they are not necessarily held out when the sample corpus is used.
test_sentences = [
    "Ich gehe zur Schule.",
    "Obwohl es regnete, ging sie ohne Regenschirm spazieren.",
    "Der Klimawandel ist ein globales Problem.",
    "Berlin ist die Hauptstadt von Deutschland.",
    "Das Buch, das auf dem Tisch liegt, gehört meinem Bruder."
]


Example translations:


In [46]:
# Translate each example with translate's default max_len and beam_size and print the
# source sentence next to the model output
for sent in test_sentences:
    translated = translate(model, src_tokenizer, tgt_vocab, sent)
    print(f"DE: {sent}")
    print(f"EN: {translated}")
    print("-" * 50)

print("=" * 80)
print("Neural Machine Translation model training and evaluation complete!")

DE: Ich gehe zur Schule.
EN: <unk> <unk> <unk> <unk> she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she she
--------------------------------------------------
DE: Obwohl es regnete, ging sie ohne Regenschirm spazieren.
EN: <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <u