In [123]:
import os
import re
import nltk
import gzip
import time
import spacy
import random
import zipfile
import unicodedata
import numpy as np
import pandas as pd
import urllib.request

from tqdm import tqdm
from collections import Counter
from typing import List, Dict, Tuple, Optional, Iterator
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [4]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x1143e0270>

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

## Data Preprocessing

In [6]:
MULTI30K_URL = "https://github.com/multi30k/dataset/raw/master/data/task1/raw"

TRAIN_FILES = {
    'de': "train.de.gz",
    'en': "train.en.gz"
}
VAL_FILES = {
    'de': "val.de.gz",
    'en': "val.en.gz"
}
TEST_FILES = {
    'de': "test_2016_flickr.de.gz",
    'en': "test_2016_flickr.en.gz"
}

In [7]:
DEFAULT_DATA_DIR = "data/multi30k"
SPACY_DE_MODEL = "de_core_news_sm"
SPACY_EN_MODEL = "en_core_web_sm"

### 1. Downloading the datasets from Multi30K

In [8]:
def download_and_extract_data(data_dir):
    os.makedirs(data_dir, exist_ok=True)
    
    for split, files in zip(["train", "val", "test"], [TRAIN_FILES, VAL_FILES, TEST_FILES]):
        for lang, filename in files.items():
            url = f"{MULTI30K_URL}/{filename}"
            output_path = os.path.join(data_dir, filename)

            # Skip if file already exists
            if os.path.exists(output_path.replace('.gz', '')):
                print("File already exists")
                continue

            # Download the file to the specified directory
            urllib.request.urlretrieve(url, output_path)
            with gzip.open(output_path, 'rb') as f_in:
                with open(output_path.replace('.gz', ''), 'wb') as f_out:
                    f_out.write(f_in.read())
            
            # Remove .gz file
            os.remove(output_path)

In [9]:
download_and_extract_data(DEFAULT_DATA_DIR)

File already exists
File already exists
File already exists
File already exists
File already exists
File already exists


### 2. Loading the Spacy models

In [10]:
def load_spacy_models():
    de_nlp = spacy.load(SPACY_DE_MODEL)
    en_nlp = spacy.load(SPACY_EN_MODEL)
    
    return de_nlp, en_nlp

In [11]:
de_nlp, en_nlp = load_spacy_models()

### 3. Reading and tokenizing the data

In [12]:
PAD_TOKEN = "<pad>"
SOS_TOKEN = "<sos>"  
EOS_TOKEN = "<eos>" 
UNK_TOKEN = "<unk>" 

In [13]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    
    # Normalize unicode
    text = unicodedata.normalize('NFKD', text)
    
    # Space contraction
    text = re.sub(r'\s+', ' ', text)
    
    # Trim the sentence
    text = text.strip()
    
    return text

In [14]:
def tokenize_sentence(sentence, spacy_model, max_length = None):
    # Preprocess the sentence
    sentence = preprocess_text(sentence)
    
    # Tokenize
    tokens = [token.text for token in spacy_model(sentence)]
    
    # Truncate if necessary
    if max_length is not None and len(tokens) > max_length - 2:
        tokens = tokens[:max_length - 2]
    
    # Add SOS and EOS tokens
    tokens = [SOS_TOKEN] + tokens + [EOS_TOKEN]
    
    return tokens

In [15]:
def read_and_tokenize_data(data_dir, split, de_nlp, en_nlp, max_length = None):
    if split == 'train':
        de_path = os.path.join(data_dir, TRAIN_FILES['de'].replace('.gz', ''))
        en_path = os.path.join(data_dir, TRAIN_FILES['en'].replace('.gz', ''))
    elif split == 'val':
        de_path = os.path.join(data_dir, VAL_FILES['de'].replace('.gz', ''))
        en_path = os.path.join(data_dir, VAL_FILES['en'].replace('.gz', ''))
    elif split == 'test':
        de_path = os.path.join(data_dir, TEST_FILES['de'].replace('.gz', ''))
        en_path = os.path.join(data_dir, TEST_FILES['en'].replace('.gz', ''))
    else:
        print(f"Invalid split: {split}")
    
    # Read files
    with open(de_path, 'r', encoding='utf-8') as f:
        de_sentences = f.readlines()
    
    with open(en_path, 'r', encoding='utf-8') as f:
        en_sentences = f.readlines()
    
    
    # Tokenize sentences
    tokenized_de = []
    tokenized_en = []
    
    for de_sent, en_sent in tqdm(zip(de_sentences, en_sentences), total=len(de_sentences)):
        de_tokens = tokenize_sentence(de_sent, de_nlp, max_length)
        en_tokens = tokenize_sentence(en_sent, en_nlp, max_length)
        
        tokenized_de.append(de_tokens)
        tokenized_en.append(en_tokens)
    
    return tokenized_de, tokenized_en

In [16]:
tokenized_de_train, tokenized_en_train = read_and_tokenize_data(DEFAULT_DATA_DIR, 'train', de_nlp, en_nlp, 50)
tokenized_de_val, tokenized_en_val = read_and_tokenize_data(DEFAULT_DATA_DIR, 'val', de_nlp, en_nlp, 50)
tokenized_de_test, tokenized_en_test = read_and_tokenize_data(DEFAULT_DATA_DIR, 'test', de_nlp, en_nlp, 50)

100%|████████████████████████████████████| 29000/29000 [03:17<00:00, 147.09it/s]
100%|██████████████████████████████████████| 1014/1014 [00:06<00:00, 157.51it/s]
100%|██████████████████████████████████████| 1000/1000 [00:06<00:00, 159.66it/s]


### 4. Build Vocabularies

In [17]:
class Vocabulary:
    def __init__(self, language, min_freq = 2):
        self.language = language
        self.min_freq = min_freq
        self.word2idx = {}
        self.idx2word = {}
        self.word_freq = Counter()
        self.specials = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN]

        for token in self.specials:
            self.add_token(token)

    # Token and indices updation
    def add_token(self, token):
        if token not in self.word2idx:
            self.word2idx[token] = len(self.word2idx)
            self.idx2word[len(self.idx2word)] = token
        return self.word2idx[token]

    # Counter updation
    def add_tokens(self, tokens):
        self.word_freq.update(tokens)

    # Main build
    def build(self):
        words = [word for word, freq in self.word_freq.items() if freq >= self.min_freq]

        for word in words:
            self.add_token(word)
    
    def __len__(self):
        return len(self.word2idx)

    # Get token's index
    def token_to_idx(self, token):
        return self.word2idx.get(token, self.word2idx[UNK_TOKEN])

    # Get list of indices for tokens
    def tokens_to_indices(self, tokens):
        return [self.token_to_idx(token) for token in tokens]

    # Get index's respective token
    def idx_to_token(self, idx):
        return self.idx2word.get(idx, UNK_TOKEN)

    # Get list of tokens for incides
    def indices_to_tokens(self, indices):
        return [self.idx_to_token(idx) for idx in indices]

In [18]:
def build_vocabularies(tokenized_de, tokenized_en, min_freq = 2):
    # Create vocabulary object
    de_vocab = Vocabulary(language='de', min_freq=min_freq)
    en_vocab = Vocabulary(language='en', min_freq=min_freq)
    
    # Add tokens
    for tokens in tokenized_de:
        de_vocab.add_tokens(tokens)
    
    for tokens in tokenized_en:
        en_vocab.add_tokens(tokens)
    
    # Build vocabularies
    de_vocab.build()
    en_vocab.build()
    
    return de_vocab, en_vocab

In [19]:
de_vocab, en_vocab = build_vocabularies(tokenized_de_train, tokenized_en_train, min_freq = 2)

In [20]:
src_pad_idx = de_vocab.token_to_idx(PAD_TOKEN)
trg_pad_idx = en_vocab.token_to_idx(PAD_TOKEN)
trg_sos_idx = en_vocab.token_to_idx(SOS_TOKEN)

### 5. Convert tokens to indices

In [21]:
def convert_to_indices(tokenized_sentences, vocab):
    return [vocab.tokens_to_indices(tokens) for tokens in tokenized_sentences]

In [22]:
de_indices_train = convert_to_indices(tokenized_de_train, de_vocab)
en_indices_train = convert_to_indices(tokenized_en_train, en_vocab)
de_indices_val = convert_to_indices(tokenized_de_val, de_vocab)
en_indices_val = convert_to_indices(tokenized_en_val, en_vocab)
de_indices_test = convert_to_indices(tokenized_de_test, de_vocab)
en_indices_test = convert_to_indices(tokenized_en_test, en_vocab)

### 6. Creating Data Loaders

In [23]:
class TranslationDataset(Dataset):
    def __init__(self, source_sentences, target_sentences, source_vocab, target_vocab):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
    
    def __len__(self):
        return len(self.source_sentences)
    
    def __getitem__(self, idx):
        source = torch.tensor(self.source_sentences[idx], dtype=torch.long)
        target = torch.tensor(self.target_sentences[idx], dtype=torch.long)
        return source, target

In [24]:
def collate_fn(batch, pad_idx):
    # Sort batch by source length (descending)
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    
    # Separate source and target sequences
    source_seqs, target_seqs = zip(*batch)
    
    # Get lengths
    source_lengths = [len(seq) for seq in source_seqs]
    target_lengths = [len(seq) for seq in target_seqs]
    
    # Pad sequences
    padded_source = torch.nn.utils.rnn.pad_sequence(
        source_seqs, batch_first=True, padding_value=pad_idx
    )
    padded_target = torch.nn.utils.rnn.pad_sequence(
        target_seqs, batch_first=True, padding_value=pad_idx
    )
    
    # Convert lengths to tensor
    source_lengths = torch.tensor(source_lengths, dtype=torch.long)
    target_lengths = torch.tensor(target_lengths, dtype=torch.long)
    
    return padded_source, padded_target, source_lengths, target_lengths

In [25]:
def create_data_loaders(de_indices_train, en_indices_train, de_indices_val, 
                        en_indices_val, de_indices_test, en_indices_test, 
                        de_vocab: Vocabulary, en_vocab: Vocabulary,
                        batch_size = 64, shuffle = True):

    # Create datasets
    train_dataset = TranslationDataset(de_indices_train, en_indices_train, de_vocab, en_vocab)
    val_dataset = TranslationDataset(de_indices_val, en_indices_val, de_vocab, en_vocab)
    test_dataset = TranslationDataset(de_indices_test, en_indices_test, de_vocab, en_vocab)
    
    # Create data loaders
    train_loader = DataLoader(train_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=lambda batch: collate_fn(batch, de_vocab.token_to_idx(PAD_TOKEN))
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=lambda batch: collate_fn(batch, de_vocab.token_to_idx(PAD_TOKEN))
    )
    
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=lambda batch: collate_fn(batch, de_vocab.token_to_idx(PAD_TOKEN))
    )
    
    return train_loader, val_loader, test_loader


In [26]:
train_loader, val_loader, test_loader = create_data_loaders(
        de_indices_train, en_indices_train,
        de_indices_val, en_indices_val,
        de_indices_test, en_indices_test,
        de_vocab, en_vocab, batch_size = 64)

### 7. Create Embeddings

In [27]:
def prepare_embeddings(de_vocab, en_vocab, embedding_dim = 256):
    # Initialize embedding matrices with random values
    de_embeddings = torch.randn(len(de_vocab), embedding_dim)
    en_embeddings = torch.randn(len(en_vocab), embedding_dim)
    
    # Set padding token embedding to zeros
    de_embeddings[de_vocab.token_to_idx(PAD_TOKEN)] = torch.zeros(embedding_dim)
    en_embeddings[en_vocab.token_to_idx(PAD_TOKEN)] = torch.zeros(embedding_dim)
    
    return de_embeddings, en_embeddings

In [28]:
de_embeddings, en_embeddings = prepare_embeddings(de_vocab, en_vocab, embedding_dim = 256)

In [29]:
de_embeddings

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.6855,  0.5636, -1.5072,  ...,  0.4232, -0.3389,  0.5180],
        [-1.3638,  0.1930, -0.6103,  ..., -1.6034, -0.4298,  0.5762],
        ...,
        [ 0.2886, -1.2949,  0.2749,  ...,  0.2017,  1.0070,  1.6758],
        [-0.0930,  0.4567,  1.8814,  ..., -0.4820, -0.3035,  0.5588],
        [ 0.8588,  0.8981,  0.3383,  ...,  0.8831,  0.5583,  2.0298]])

### 8. Save processed data

In [30]:
def save_data(de_vocab, en_vocab, de_embeddings, en_embeddings, output_dir = DEFAULT_DATA_DIR + "data/processed_data"):
    os.makedirs(output_dir, exist_ok=True)
    
    # Save vocabularies
    torch.save(de_vocab, os.path.join(output_dir, "de_vocab.pt"))
    torch.save(en_vocab, os.path.join(output_dir, "en_vocab.pt"))
    
    # Save embeddings
    torch.save(de_embeddings, os.path.join(output_dir, "de_embeddings.pt"))
    torch.save(en_embeddings, os.path.join(output_dir, "en_embeddings.pt"))

In [31]:
save_data(de_vocab, en_vocab, de_embeddings, en_embeddings, "data/processed_data")

## Model Development

### 1. Encoder

In [92]:
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, embedding_size, enc_hidden_size, 
        dec_hidden_size, num_layers, dropout_p, embedding_weights, pad_idx):
        super(EncoderLSTM, self).__init__()
        
        self.enc_hidden_size = enc_hidden_size
        self.dec_hidden_size = dec_hidden_size
        self.num_layers = num_layers
        
        self.dropout_layer = nn.Dropout(dropout_p)
        
        self.embedding = nn.Embedding.from_pretrained(embedding_weights, freeze=False, padding_idx=pad_idx)
        
        # LSTM layer (bidirectional)
        self.lstm = nn.LSTM(embedding_size, enc_hidden_size, num_layers, 
            dropout=dropout_p if num_layers > 1 else 0, bidirectional=True, batch_first=True)
        
        # Fully connected layers to transform concatenated hidden/cell states to decoder's hidden size
        self.fc_hidden = nn.Linear(enc_hidden_size * 2, dec_hidden_size)
        self.fc_cell = nn.Linear(enc_hidden_size * 2, dec_hidden_size)

    def forward(self, src_tokens):
        # Embedded shape: (batch_size, src_seq_len, embedding_size)
        embedded = self.dropout_layer(self.embedding(src_tokens))
        
        encoder_outputs, (final_hidden, final_cell) = self.lstm(embedded)

        # combined_h shape: (batch_size, enc_hidden_size * 2)
        combined_h = torch.cat((final_hidden[-2,:,:], final_hidden[-1,:,:]), dim=1)
        # combined_c shape: (batch_size, enc_hidden_size * 2)
        combined_c = torch.cat((final_cell[-2,:,:], final_cell[-1,:,:]), dim=1)
        
        # hidden shape: (batch_size, dec_hidden_size)
        hidden = torch.tanh(self.fc_hidden(combined_h))
        # cell shape: (batch_size, dec_hidden_size)
        cell = torch.tanh(self.fc_cell(combined_c))
        
        return encoder_outputs, hidden, cell

In [93]:
class Attention(nn.Module):
    def __init__(self, enc_hidden_size_bi, dec_hidden_size, attention_dim):
        super(Attention, self).__init__()

        self.attn_W = nn.Linear(enc_hidden_size_bi + dec_hidden_size, attention_dim)
        self.attn_v = nn.Linear(attention_dim, 1, bias=False)

    def forward(self, decoder_hidden_prev, encoder_outputs, src_mask):
        batch_size = encoder_outputs.shape[0]
        src_seq_len = encoder_outputs.shape[1]

        decoder_hidden_prev_repeated = decoder_hidden_prev.unsqueeze(1).repeat(1, src_seq_len, 1)

        concat_for_energy = torch.cat((decoder_hidden_prev_repeated, encoder_outputs), dim=2)

        # energy shape: (batch_size, src_seq_len, attention_dim)
        energy = torch.tanh(self.attn_W(concat_for_energy))
        
        # attention_scores shape: (batch_size, src_seq_len)
        attention_scores = self.attn_v(energy).squeeze(2)
        attention_scores = attention_scores.masked_fill(src_mask == 0, -1e10)

        # attention_weights shape: (batch_size, src_seq_len)
        attention_weights = torch.softmax(attention_scores, dim=1)

        # context_vector shape: (batch_size, 1, enc_hidden_size_bi)
        context_vector = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)
        
        return context_vector, attention_weights


In [94]:
class DecoderLSTM(nn.Module):
    def __init__(self, output_size, embedding_size, enc_hidden_size_bi, dec_hidden_size, 
        num_layers, dropout_p, embedding_weights, pad_idx, attention_module):
        super(DecoderLSTM, self).__init__()
        self.dec_hidden_size = dec_hidden_size
        self.num_layers = num_layers
        self.attention = attention_module 
        self.output_size = output_size
        self.dropout_layer = nn.Dropout(dropout_p)

        self.embedding = nn.Embedding.from_pretrained(embedding_weights, freeze=False, padding_idx=pad_idx)
        
        # LSTM layer
        self.lstm = nn.LSTM(embedding_size + enc_hidden_size_bi, dec_hidden_size, num_layers, 
                            dropout=dropout_p if num_layers > 1 else 0, batch_first=True)

        self.fc_out = nn.Linear(dec_hidden_size + enc_hidden_size_bi + embedding_size, output_size)

    def forward(self, trg_token, prev_hidden, prev_cell, encoder_outputs, src_mask):
        trg_token_seq = trg_token.unsqueeze(1)
        
        # Embedded shape: (batch_size, 1, embedding_size)
        embedded = self.dropout_layer(self.embedding(trg_token_seq))
        
        prev_hidden_top_layer = prev_hidden[-1,:,:]
        
        # context_vector shape: (batch_size, 1, enc_hidden_size_bi)
        # attention_weights shape: (batch_size, src_seq_len)
        context_vector, attention_weights = self.attention(prev_hidden_top_layer, encoder_outputs, src_mask)
        
        # lstm_input shape: (batch_size, 1, embedding_size + enc_hidden_size_bi)
        lstm_input = torch.cat((embedded, context_vector), dim=2)
        
        # new_hidden shape: (num_layers, batch_size, dec_hidden_size)
        # new_cell shape: (num_layers, batch_size, dec_hidden_size)
        lstm_output, (new_hidden, new_cell) = self.lstm(lstm_input, (prev_hidden, prev_cell))
        
        # pred_input_concat shape: (batch_size, dec_hidden_size + enc_hidden_size_bi + embedding_size)
        pred_input_concat = torch.cat((lstm_output.squeeze(1), context_vector.squeeze(1), embedded.squeeze(1)), dim=1)
        
        # Prediction shape: (batch_size, output_size)
        prediction = self.fc_out(pred_input_concat)
        
        return prediction, new_hidden, new_cell, attention_weights

In [95]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src_tokens, trg_tokens, src_pad_idx, teacher_forcing_ratio=0.5):
        if isinstance(teacher_forcing_ratio, (tuple, list)):
            teacher_forcing_ratio = teacher_forcing_ratio[0]
        teacher_forcing_ratio = float(teacher_forcing_ratio)
        
        batch_size = src_tokens.shape[0]
        trg_seq_len = trg_tokens.shape[1]
        trg_vocab_size = self.decoder.output_size 

        # outputs shape: (batch_size, trg_seq_len, trg_vocab_size)
        outputs = torch.zeros(batch_size, trg_seq_len, trg_vocab_size).to(self.device)

        # src_mask shape: (batch_size, src_seq_len)
        src_mask = (src_tokens != src_pad_idx)
        
        # encoder_outputs shape: (batch_size, src_seq_len, enc_hidden_size * 2)
        # hidden shape: (batch_size, dec_hidden_size)
        # cell shape: (batch_size, dec_hidden_size)
        encoder_outputs, hidden, cell = self.encoder(src_tokens)
        
        hidden = hidden.unsqueeze(0).repeat(self.decoder.num_layers, 1, 1)
        cell = cell.unsqueeze(0).repeat(self.decoder.num_layers, 1, 1)
        
        current_trg_token = trg_tokens[:, 0]
        
        
        for t in range(trg_seq_len - 1): 
            decoder_prediction, hidden, cell, _ = self.decoder(current_trg_token, hidden, cell, encoder_outputs, src_mask)

            outputs[:, t+1, :] = decoder_prediction
            use_teacher_forcing = random.random() < teacher_forcing_ratio
            
            if use_teacher_forcing:
                current_trg_token = trg_tokens[:, t+1]
            else:
                top1_token_idx = decoder_prediction.argmax(1)
                current_trg_token = top1_token_idx
                
        return outputs

In [96]:
embedding_dim = 256
enc_hidden_dim = 512
dec_hidden_dim = 512
lstm_num_layers = 2
dropout_rate = 0.3
attention_internal_dim = 128

In [97]:
encoder = EncoderLSTM(input_size=len(de_vocab), embedding_size=embedding_dim,
        enc_hidden_size=enc_hidden_dim, dec_hidden_size=dec_hidden_dim,
        num_layers=lstm_num_layers, dropout_p=dropout_rate,
        embedding_weights=de_embeddings, pad_idx=src_pad_idx).to(device)

attention_module = Attention(enc_hidden_size_bi=enc_hidden_dim * 2,
        dec_hidden_size=dec_hidden_dim, attention_dim=attention_internal_dim).to(device)

decoder = DecoderLSTM(output_size=len(en_vocab), embedding_size=embedding_dim,
        enc_hidden_size_bi=enc_hidden_dim * 2, dec_hidden_size=dec_hidden_dim,
        num_layers=lstm_num_layers, dropout_p=dropout_rate,
        embedding_weights=en_embeddings, pad_idx=trg_pad_idx,
        attention_module=attention_module).to(device)

In [98]:
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name and param.dim() > 1:
            nn.init.xavier_uniform_(param.data)
        elif 'bias' in name: 
            nn.init.constant_(param.data, 0)

In [99]:
model = Seq2Seq(encoder, decoder, device).to(device)
model.apply(init_weights)

Seq2Seq(
  (encoder): EncoderLSTM(
    (dropout_layer): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(7853, 256, padding_idx=0)
    (lstm): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
    (fc_hidden): Linear(in_features=1024, out_features=512, bias=True)
    (fc_cell): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): DecoderLSTM(
    (attention): Attention(
      (attn_W): Linear(in_features=1536, out_features=128, bias=True)
      (attn_v): Linear(in_features=128, out_features=1, bias=False)
    )
    (dropout_layer): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(5892, 256, padding_idx=0)
    (lstm): LSTM(1280, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc_out): Linear(in_features=1792, out_features=5892, bias=True)
  )
)

In [100]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

## Training and Evaluations

In [124]:
start_epoch = 0
best_valid_loss = float('inf')
best_bleu = 0.0
learning_rate = 0.001
n_epochs = 10
gradient_clip_value = 1.0,
teacher_forcing_ratio_train = 0.5,

In [125]:
def train(model, iterator, optimizer, criterion, clip,
    device, src_pad_idx, teacher_forcing_ratio = 0.5):
    if isinstance(teacher_forcing_ratio, (tuple, list)):
            teacher_forcing_ratio = teacher_forcing_ratio[0]
    teacher_forcing_ratio = float(teacher_forcing_ratio)

    if isinstance(clip, (tuple, list)):
            clip = clip[0]
    clip = float(clip)

    model.train()
    epoch_loss = 0
    progress_bar = tqdm(iterator, desc="Training", leave=False)
    
    for batch in progress_bar:
        src, trg, _, _ = batch
        
        src = src.to(device) 
        trg = trg.to(device)
        
        optimizer.zero_grad()
        
        output = model(src, trg, src_pad_idx, teacher_forcing_ratio)
        
        output_for_loss = output[:, 1:].reshape(-1, output.shape[-1])
        trg_for_loss = trg[:, 1:].reshape(-1)
        
        loss = criterion(output_for_loss, trg_for_loss)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        progress_bar.set_postfix(loss=loss.item())
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [126]:
def evaluate(model, iterator, criterion, device, src_pad_idx, en_vocab):
    model.eval()
    epoch_loss = 0
    references_corpus = []
    hypotheses_corpus = []
    
    with torch.no_grad():
        for batch in tqdm(iterator, leave=False):
            src, trg, _, _ = batch
            src = src.to(device)
            trg = trg.to(device)
            
            output = model(src, trg, src_pad_idx, 0) 
            
            output_for_loss = output[:, 1:].reshape(-1, output.shape[-1])
            trg_for_loss = trg[:, 1:].reshape(-1)
            
            loss = criterion(output_for_loss, trg_for_loss)
            epoch_loss += loss.item()
            
            # For BLEU score:
            predictions_indices = output[:, 1:].argmax(2)
            
            for i in range(trg.shape[0]): 
                ref_tokens_indices = trg[i, 1:].tolist()
                ref_tokens = []
                for token_idx in ref_tokens_indices:
                    if token_idx == en_vocab.token_to_idx(EOS_TOKEN) or token_idx == en_vocab.token_to_idx(PAD_TOKEN):
                        break
                    ref_tokens.append(en_vocab.idx_to_token(token_idx))
                references_corpus.append([ref_tokens])

                hyp_tokens_indices = predictions_indices[i].tolist()
                hyp_tokens = []
                for token_idx in hyp_tokens_indices:
                    if token_idx == en_vocab.token_to_idx(EOS_TOKEN) or token_idx == en_vocab.token_to_idx(PAD_TOKEN):
                        break
                    hyp_tokens.append(en_vocab.idx_to_token(token_idx))
                hypotheses_corpus.append(hyp_tokens)

    bleu_score = 0.0
    if references_corpus and hypotheses_corpus:
        smoothing_function = SmoothingFunction().method1
        bleu_score = corpus_bleu(references_corpus, hypotheses_corpus, smoothing_function=smoothing_function)
            
    return epoch_loss / len(iterator), bleu_score

In [127]:
def translate_sentence(model, sentence_str, src_vocab, trg_vocab,
    src_spacy_model, device, src_pad_idx, max_output_len = 50):
    model.eval()
    
    src_tokens_with_sos_eos = tokenize_sentence(sentence_str, src_spacy_model, max_length=max_output_len)
    src_indices = src_vocab.tokens_to_indices(src_tokens_with_sos_eos)
    src_tensor = torch.LongTensor(src_indices).unsqueeze(0).to(device)

    src_mask = (src_tensor != src_pad_idx)

    with torch.no_grad():
        encoder_outputs, hidden, cell = model.encoder(src_tensor)

    hidden = hidden.unsqueeze(0).repeat(model.decoder.num_layers, 1, 1) 
    cell = cell.unsqueeze(0).repeat(model.decoder.num_layers, 1, 1)

    trg_indices = [trg_vocab.token_to_idx(SOS_TOKEN)]
    all_attentions = []

    for _ in range(max_output_len):
        current_trg_token_tensor = torch.LongTensor([trg_indices[-1]]).to(device)
        
        with torch.no_grad():
            prediction, hidden, cell, attention_weights = model.decoder(
                current_trg_token_tensor, hidden, cell, encoder_outputs, src_mask
            )
        
        if attention_weights is not None:
             all_attentions.append(attention_weights.squeeze(0)) 

        predicted_token_idx = prediction.argmax(1).item()
        trg_indices.append(predicted_token_idx)

        if predicted_token_idx == trg_vocab.token_to_idx(EOS_TOKEN):
            break
            
    translated_tokens = trg_vocab.indices_to_tokens(trg_indices[1:])

    attentions_tensor = None
    if all_attentions:
        attentions_tensor = torch.stack(all_attentions, dim=0)

    return translated_tokens, attentions_tensor

In [128]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    mins = int(elapsed_time / 60)
    secs = int(elapsed_time - (mins * 60))
    return mins, secs

In [129]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [130]:
count_parameters(model)

30558468

In [131]:
def save_model(model, optimizer, epoch, loss, bleu,
    model_dir = "models", filename = "seq2seq_model.pt"):
    os.makedirs(model_dir, exist_ok=True)
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch, 'loss': loss, 'bleu': bleu
    }
    torch.save(checkpoint, os.path.join(model_dir, filename))

In [132]:
def load_model(model, optimizer, model_dir = "models",
    filename = "seq2seq_model.pt", device = torch.device('cpu')):
    checkpoint_path = os.path.join(model_dir, filename)
    if not os.path.exists(checkpoint_path):
        print(f"No checkpoint found at {checkpoint_path}. Starting from scratch.")
        return model, optimizer, 0, float('inf'), 0.0
    
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    if optimizer is not None and 'optimizer_state_dict' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    
    start_epoch = checkpoint.get('epoch', -1) + 1
    best_valid_loss = checkpoint.get('loss', float('inf'))
    best_bleu = checkpoint.get('bleu', 0.0)
    
    print(f"Model loaded from {checkpoint_path}.")
    return model, optimizer, start_epoch, best_valid_loss, best_bleu

In [133]:
model_dir = "models"

for epoch in range(start_epoch, n_epochs):
    epoch_start_time = time.time()
    
    train_loss = train(model, train_loader, optimizer, criterion, gradient_clip_value, 
                       device, src_pad_idx, teacher_forcing_ratio_train)
    valid_loss, valid_bleu = evaluate(model, val_loader, criterion, device, src_pad_idx, en_vocab)
    
    epoch_mins, epoch_secs = epoch_time(epoch_start_time, time.time())
    
    print(f"Epoch: {epoch+1:02}/{n_epochs} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"Train Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f} | Val. BLEU: {valid_bleu*100:.2f}%")

    save_model(model, optimizer, epoch, valid_loss, valid_bleu, model_dir, "seq2seq_model_checkpoint.pt")
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        save_model(model, optimizer, epoch, valid_loss, valid_bleu, model_dir, "seq2seq_model_best_loss.pt")
        print(f"New best validation loss: {best_valid_loss:.3f}. Model saved as best_loss.")
    if valid_bleu > best_bleu:
        best_bleu = valid_bleu
        save_model(model, optimizer, epoch, valid_loss, valid_bleu, model_dir, "seq2seq_model_best_bleu.pt")
        print(f"New best validation BLEU: {best_bleu*100:.2f}%. Model saved as best_bleu.")

                                                                                

Epoch: 01/10 | Time: 9m 16s
Train Loss: 3.293 | Val. Loss: 3.554 | Val. BLEU: 22.38%
New best validation loss: 3.554. Model saved as best_loss.
New best validation BLEU: 22.38%. Model saved as best_bleu.


                                                                                

Epoch: 02/10 | Time: 9m 29s
Train Loss: 2.617 | Val. Loss: 3.316 | Val. BLEU: 27.97%
New best validation loss: 3.316. Model saved as best_loss.
New best validation BLEU: 27.97%. Model saved as best_bleu.


                                                                                

Epoch: 03/10 | Time: 9m 38s
Train Loss: 2.197 | Val. Loss: 3.222 | Val. BLEU: 29.27%
New best validation loss: 3.222. Model saved as best_loss.
New best validation BLEU: 29.27%. Model saved as best_bleu.


                                                                                

Epoch: 04/10 | Time: 9m 13s
Train Loss: 1.894 | Val. Loss: 3.270 | Val. BLEU: 31.19%
New best validation BLEU: 31.19%. Model saved as best_bleu.


                                                                                

Epoch: 05/10 | Time: 9m 16s
Train Loss: 1.642 | Val. Loss: 3.245 | Val. BLEU: 32.03%
New best validation BLEU: 32.03%. Model saved as best_bleu.


                                                                                

Epoch: 06/10 | Time: 9m 36s
Train Loss: 1.469 | Val. Loss: 3.364 | Val. BLEU: 32.63%
New best validation BLEU: 32.63%. Model saved as best_bleu.


                                                                                

Epoch: 07/10 | Time: 10m 53s
Train Loss: 1.313 | Val. Loss: 3.431 | Val. BLEU: 31.76%


                                                                                

Epoch: 08/10 | Time: 10m 20s
Train Loss: 1.170 | Val. Loss: 3.533 | Val. BLEU: 32.63%


                                                                                

Epoch: 09/10 | Time: 9m 18s
Train Loss: 1.055 | Val. Loss: 3.672 | Val. BLEU: 32.01%


                                                                                

Epoch: 10/10 | Time: 9m 21s
Train Loss: 0.976 | Val. Loss: 3.641 | Val. BLEU: 31.10%


In [134]:
print("Evaluating model on test set...")

model_to_test, _, _, _, _ = load_model(model, None, model_dir, "seq2seq_model_best_bleu.pt", device)
test_loss, test_bleu = evaluate(model_to_test, test_loader, criterion, device, src_pad_idx, en_vocab)
print(f"Test Loss: {test_loss:.3f} | Test BLEU: {test_bleu*100:.2f}%")

print("Translating example sentences from test set...")
num_examples_to_translate = min(3, len(de_indices_test))
for i in range(num_examples_to_translate):
    src_example_indices = de_indices_test[i]
    
    # Reconstruct sentence string from indices (excluding SOS/EOS/PAD for display)
    src_example_tokens = [de_vocab.idx_to_token(idx) for idx in src_example_indices 
                          if idx not in [de_vocab.token_to_idx(SOS_TOKEN), de_vocab.token_to_idx(EOS_TOKEN), src_pad_idx]]
    src_example_sentence = " ".join(src_example_tokens)

    trg_example_indices = en_indices_test[i]
    trg_example_tokens = [en_vocab.idx_to_token(idx) for idx in trg_example_indices
                          if idx not in [trg_sos_idx, en_vocab.token_to_idx(EOS_TOKEN), trg_pad_idx]]
    trg_example_sentence = " ".join(trg_example_tokens)

    translated_tokens, _ = translate_sentence(
        model_to_test, src_example_sentence, de_vocab, en_vocab, de_nlp, device, src_pad_idx, max_output_len = 50)
    translated_sentence = " ".join(translated_tokens)
    
    print(f"Example {i+1}:")
    print(f"  Source:    {src_example_sentence}")
    print(f"  Target:    {trg_example_sentence}")
    print(f"  Predicted: {translated_sentence}")
    print("-" * 30)

Evaluating model on test set...
Model loaded from models/seq2seq_model_best_bleu.pt.


                                                                                

Test Loss: 3.360 | Test BLEU: 32.86%
Translating example sentences from test set...
Example 1:
  Source:    ein mann mit einem orangefarbenen hut , der etwas <unk> .
  Target:    a man in an orange hat starring at something .
  Predicted: a man in an orange hat is <unk> <unk> <unk> <unk> . <eos>
------------------------------
Example 2:
  Source:    ein boston terrier läuft über <unk> gras vor einem weißen zaun .
  Target:    a boston terrier is running on lush green grass in front of a white fence .
  Predicted: a light brown dog runs through a dry grass in grass in a white fence . <eos>
------------------------------
Example 3:
  Source:    ein mädchen in einem karateanzug bricht ein brett mit einem tritt .
  Target:    a girl in karate uniform breaking a stick with a front kick .
  Predicted: a girl in a karate shirt is a a board with a volleyball . <eos>
------------------------------
