In [None]:
!pip install datasets sacrebleu

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import math
import time
import numpy as np
import pandas as pd
import re
import os
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import DataLoader, Dataset
from collections import Counter
import unicodedata
from sacrebleu.metrics import BLEU
from tqdm.auto import tqdm
from torch.cuda.amp import GradScaler, autocast

In [5]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cuda


In [6]:
# Set random seed for reproducibility
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [7]:
# Constants for special tokens
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<sos>', '<eos>']

In [8]:
# Language constants
SRC_LANGUAGE = 'en'
TRG_LANGUAGE = 'it'

In [9]:
class CustomTokenizer:
    def __init__(self, texts, max_vocab_size=50000, language='en'):
        print(f"\nInitializing {language} tokenizer...")
        self.max_vocab_size = max_vocab_size
        self.language = language
        self.word2idx = {'<unk>': UNK_IDX, '<pad>': PAD_IDX, '<sos>': SOS_IDX, '<eos>': EOS_IDX}
        self.idx2word = {v: k for k, v in self.word2idx.items()}
        self.vocab_size = len(special_symbols)

        print(f"Building vocabulary for {language}...")
        # Build vocabulary
        self.word_freq = Counter()
        for i, text in enumerate(texts):
            if i % 10000 == 0:
                print(f"Processing text {i}/{len(texts)}")

            # Apply language-specific normalization
            if language == 'it':
                text = unicodedata.normalize('NFKC', text)
            else:
                text = text.lower()

            words = text.split()
            self.word_freq.update(words)

        # Add most common words to vocabulary
        for word, freq in self.word_freq.most_common(max_vocab_size - len(special_symbols)):
            if word not in self.word2idx:
                self.word2idx[word] = self.vocab_size
                self.idx2word[self.vocab_size] = word
                self.vocab_size += 1

    def print_vocab_info(self):
        """Prints vocabulary size and sample of most frequent words."""
        print(f"Vocabulary size for {self.language}: {self.vocab_size}")
        print(f"Sample of most frequent words in {self.language}:")
        for word, freq in list(self.word_freq.most_common(10)):
            print(f"  {word}: {freq}")

    def encode(self, text):
        # Apply language-specific normalization
        if self.language == 'it':
            text = unicodedata.normalize('NFKC', text)
        else:
            text = text.lower()
        words = text.split()
        return [SOS_IDX] + [self.word2idx.get(word, UNK_IDX) for word in words] + [EOS_IDX]

    def decode(self, indices):
        return ' '.join([self.idx2word.get(idx, '<unk>') for idx in indices if idx not in [PAD_IDX, SOS_IDX, EOS_IDX]]) # Corrected the indentation here to align with encode function

In [10]:
# Load the English-Italian dataset
print("Loading dataset...")
dataset = load_dataset("opus100", "en-it", split={'train':'train[:10000]', 'validation':'validation', 'test':'test'})
print(f"Dataset loaded successfully!")
print(f"Train size: {len(dataset['train'])}")
print(f"Validation size: {len(dataset['validation'])}")
print(f"Test size: {len(dataset['test'])}")

Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/223k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/91.7M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/220k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset loaded successfully!
Train size: 10000
Validation size: 2000
Test size: 2000


In [11]:
# Print some examples
print("\nExample translations from dataset:")
for i in range(3):
    example = dataset['train'][i]
    print(f"\nExample {i+1}:")
    print(f"English: {example['translation']['en']}")
    print(f"Italian: {example['translation']['it']}")

print("\nCreating tokenizers...")
src_texts = [example['translation']['en'] for example in dataset['train']]
trg_texts = [example['translation']['it'] for example in dataset['train']]

print(f"Total English texts: {len(src_texts)}")
print(f"Total Italian texts: {len(trg_texts)}")

src_tokenizer = CustomTokenizer(src_texts, language='en')
trg_tokenizer = CustomTokenizer(trg_texts, language='it')


Example translations from dataset:

Example 1:
English: - Thanks, buddy.
Italian: - Grazie, amico.

Example 2:
English: Say it.
Italian: Dillo.

Example 3:
English: Sodium triphosphate (sodium tripolyphosphates)
Italian: Trifosfato di sodio (tripolifosfato di sodio)

Creating tokenizers...
Total English texts: 10000
Total Italian texts: 10000

Initializing en tokenizer...
Building vocabulary for en...
Processing text 0/10000

Initializing it tokenizer...
Building vocabulary for it...
Processing text 0/10000


In [12]:
# Add tokenization examples
print("\nTokenization examples:")
for i in range(3):
    example = dataset['train'][i]
    en_text = example['translation']['en']
    it_text = example['translation']['it']

    en_tokens = src_tokenizer.encode(en_text)
    it_tokens = trg_tokenizer.encode(it_text)

    print(f"\nExample {i+1}:")
    print(f"English: {en_text}")
    print(f"Tokenized English: {en_tokens}")
    print(f"Decoded English: {src_tokenizer.decode(en_tokens)}")
    print(f"italian: {it_text}")
    print(f"Tokenized Italian: {it_tokens}")
    print(f"Decoded Italian: {trg_tokenizer.decode(it_tokens)}")


Tokenization examples:

Example 1:
English: - Thanks, buddy.
Tokenized English: [2, 12, 1208, 2046, 3]
Decoded English: - thanks, buddy.
italian: - Grazie, amico.
Tokenized Italian: [2, 6, 396, 693, 3]
Decoded Italian: - Grazie, amico.

Example 2:
English: Say it.
Tokenized English: [2, 146, 63, 3]
Decoded English: say it.
italian: Dillo.
Tokenized Italian: [2, 6844, 3]
Decoded Italian: Dillo.

Example 3:
English: Sodium triphosphate (sodium tripolyphosphates)
Tokenized English: [2, 2047, 5988, 5989, 5990, 3]
Decoded English: sodium triphosphate (sodium tripolyphosphates)
italian: Trifosfato di sodio (tripolifosfato di sodio)
Tokenized Italian: [2, 6845, 4, 2745, 6846, 4, 6847, 3]
Decoded Italian: Trifosfato di sodio (tripolifosfato di sodio)


In [13]:
class TranslationDataset(Dataset):
    def __init__(self, dataset_split, src_tokenizer, trg_tokenizer, max_len=128):
        print(f"\nCreating dataset with {len(dataset_split)} examples...")
        self.examples = dataset_split
        self.src_tokenizer = src_tokenizer
        self.trg_tokenizer = trg_tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]
        src_text = example['translation']['en']
        trg_text = example['translation']['it']

        src_tokens = self.src_tokenizer.encode(src_text)[:self.max_len]
        trg_tokens = self.trg_tokenizer.encode(trg_text)[:self.max_len]

        return torch.tensor(src_tokens), torch.tensor(trg_tokens)

In [14]:
def collate_fn(batch):
    """
    Custom collate function for batching sequences of different lengths.
    Pads sequences to the maximum length in the batch.
    """
    src_batch, trg_batch = [], []
    for src_sample, trg_sample in batch:
        src_batch.append(src_sample)
        trg_batch.append(trg_sample)

    # Pad sequences to the maximum length in the batch
    src_batch = nn.utils.rnn.pad_sequence(src_batch, padding_value=PAD_IDX, batch_first=True)
    trg_batch = nn.utils.rnn.pad_sequence(trg_batch, padding_value=PAD_IDX, batch_first=True)

    return src_batch, trg_batch

In [31]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, attn_variant, device):
        super().__init__()

        assert hid_dim % n_heads == 0

        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        self.attn_variant = attn_variant
        self.device = device

        # Initialize layers based on attention variant
        if attn_variant == 'multiplicative':
            self.W = nn.Linear(self.head_dim, self.head_dim)
        elif attn_variant == 'additive':
            self.Wa = nn.Linear(self.head_dim, self.head_dim)
            self.Ua = nn.Linear(self.head_dim, self.head_dim)
            self.V = nn.Linear(self.head_dim, 1)
        # General attention doesn't need additional parameters

        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)

        self.fc_o = nn.Linear(hid_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[0]

        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)

        # Split into heads
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        # Calculate attention scores based on variant
        energy = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        if mask is not None:
            # Change the fill value to a smaller value within float16 range
            energy = energy.masked_fill(mask == 0, -65504.0)  # Use a value close to the minimum representable by float16

        attention = torch.softmax(energy, dim=-1)
        attention = self.dropout(attention)

        x = torch.matmul(attention, V)
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch_size, -1, self.hid_dim)
        x = self.fc_o(x)

        return x, attention

In [32]:
class EncoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, attn_variant, device):
        super().__init__()
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, attn_variant, device)
        self.positionwise_feedforward = nn.Sequential(
            nn.Linear(hid_dim, pf_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(pf_dim, hid_dim)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        _src, _ = self.self_attention(src, src, src, src_mask)
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        _src = self.positionwise_feedforward(src)
        src = self.ff_layer_norm(src + self.dropout(_src))
        return src

In [33]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, attn_variant, device, max_length=500):
        super().__init__()
        self.device = device
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        self.layers = nn.ModuleList([
            EncoderLayer(hid_dim, n_heads, pf_dim, dropout, attn_variant, device)
            for _ in range(n_layers)
        ])
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, src, src_mask):
        batch_size = src.shape[0]
        src_len = src.shape[1]
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))

        for layer in self.layers:
            src = layer(src, src_mask)
        return src

In [34]:
class DecoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, attn_variant, device):
        super().__init__()
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, attn_variant, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, attn_variant, device)
        self.positionwise_feedforward = nn.Sequential(
            nn.Linear(hid_dim, pf_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(pf_dim, hid_dim)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
        _trg = self.positionwise_feedforward(trg)
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        return trg, attention

In [35]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, attn_variant, device, max_length=500):
        super().__init__()
        self.device = device
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        self.layers = nn.ModuleList([
            DecoderLayer(hid_dim, n_heads, pf_dim, dropout, attn_variant, device)
            for _ in range(n_layers)
        ])
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))

        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)

        output = self.fc_out(trg)
        return output, attention

In [36]:
class Seq2SeqTransformer(nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()
        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        return output, attention

In [None]:
# Gradient accumulation steps
ACCUMULATION_STEPS = 4

# Mixed precision training
scaler = GradScaler()

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    total_batches = len(iterator)

    # Create progress bar
    pbar = tqdm(iterator, total=total_batches, desc='Training', bar_format='{l_bar}{bar:30}{r_bar}')

    optimizer.zero_grad()

    for i, (src, trg) in enumerate(pbar):
        src = src.to(device)
        trg = trg.to(device)

        with autocast():
            output, _ = model(src, trg[:,:-1])

            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)

            loss = criterion(output, trg)

        scaler.scale(loss).backward()

        # Perform optimization step after accumulating enough gradients
        if (i + 1) % ACCUMULATION_STEPS == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        epoch_loss += loss.item() / ACCUMULATION_STEPS

        # Update progress bar description
        pbar.set_postfix({
            'loss': f'{epoch_loss / (i + 1):.4f}',
            'ppl': f'{math.exp(epoch_loss / (i + 1)):.2f}'
        })

    pbar.close()
    return epoch_loss / total_batches

In [37]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    total_batches = len(iterator)

    # Create progress bar
    pbar = tqdm(iterator, total=total_batches, desc='Evaluating', bar_format='{l_bar}{bar:30}{r_bar}')

    with torch.no_grad():
        for src, trg in pbar:
            src = src.to(device)
            trg = trg.to(device)

            with autocast():
                output, _ = model(src, trg[:,:-1])

            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

            # Update progress bar description
            pbar.set_postfix({
                'loss': f'{loss.item():.4f}',
            'ppl': f'{math.exp(loss.item()):.2f}'
        })

    pbar.close()
    return epoch_loss / total_batches

In [38]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [39]:
def visualize_attention(model, src_text, trg_text, src_tokenizer, trg_tokenizer, device, max_length=128):
    """
    Visualize attention weights for a given source and target text pair.
    Shows the attention map from the last decoder layer's first head.
    """
    model.eval()
    with torch.no_grad():
        # Tokenize and encode texts
        src_tokens = torch.tensor([src_tokenizer.encode(src_text)]).to(device)
        trg_tokens = torch.tensor([trg_tokenizer.encode(trg_text)]).to(device)

        # Forward pass through the model
        output, attention_weights = model(src_tokens, trg_tokens[:,:-1])

        # Get the last layer's attention weights (shape: [batch_size, n_heads, tgt_len, src_len])
        last_layer_attention = attention_weights[-1]

        # Get first head's attention from first batch
        attention = last_layer_attention[0, 0].cpu().numpy()

        # Get tokens for visualization
        src_tokens_list = src_text.split()
        trg_tokens_list = trg_text.split()

        # Get actual sequence lengths
        src_len = len(src_tokens_list)
        trg_len = len(trg_tokens_list)

        # Extract relevant part of attention matrix
        attention_matrix = attention[:trg_len, :src_len]

        # Create figure with larger size
        plt.figure(figsize=(12, 8))

        # Create heatmap with improved visibility
        sns.heatmap(
            attention_matrix,
            xticklabels=src_tokens_list,
            yticklabels=trg_tokens_list,
            cmap='viridis',
            annot=True,
            fmt='.2f',
            square=True,
            cbar_kws={'label': 'Attention Weight'}
        )

        # Rotate x-axis labels for better readability
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)

        plt.title(f'Attention Weights Visualization\n{model.attention_type} Attention', pad=20)
        plt.xlabel('Source Text (English)', labelpad=10)
        plt.ylabel('Target Text (French)', labelpad=10)

        # Adjust layout to prevent label cutoff
        plt.tight_layout()

        # Save with high quality
        filename = f'attention_map_{model.attention_type}_{src_text[:20].replace(" ", "_")}.png'
        plt.savefig(filename, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"Saved attention map to: {filename}")

        # Print attention weights for verification
        print("\nAttention Matrix Shape:", attention_matrix.shape)
        print("Attention Weights:")
        for i, trg_token in enumerate(trg_tokens_list):
            print(f"{trg_token:>20}: ", end="")
            for j, src_token in enumerate(src_tokens_list):
                print(f"{src_token}({attention_matrix[i,j]:.2f}) ", end="")
            print()

In [40]:
def calculate_bleu(model, data_loader, src_tokenizer, trg_tokenizer):
    """
    Calculate BLEU score for the model predictions.
    """
    model.eval()
    bleu = BLEU()
    predictions = []
    references = []

    with torch.no_grad():
        for src, trg in data_loader:
            src = src.to(device)
            trg = trg.to(device)

            with autocast():
                output, _ = model(src, trg[:,:-1])

            # Convert predictions to text
            pred_tokens = output.argmax(dim=-1)
            for pred, ref in zip(pred_tokens, trg):
                pred_text = trg_tokenizer.decode(pred.cpu().numpy())
                ref_text = trg_tokenizer.decode(ref.cpu().numpy())
                predictions.append(pred_text)
                references.append([ref_text])

    return bleu.corpus_score(predictions, references).score

In [41]:
def translate_sentence(model, sentence, src_tokenizer, trg_tokenizer, device, max_length=128):
    """
    Translate a single English sentence to italian.
    """
    model.eval()

    # Tokenize and encode the source sentence
    src_tokens = torch.tensor([src_tokenizer.encode(sentence)]).to(device)

    # Initialize target sequence with <sos>
    trg_tokens = torch.tensor([[SOS_IDX]]).to(device)

    with torch.no_grad():
        for _ in range(max_length):
            with autocast():
                # Get model prediction
                output, _ = model(src_tokens, trg_tokens)

            # Get the next token prediction
            pred_token = output.argmax(2)[:, -1].item()

            # Add predicted token to target sequence
            trg_tokens = torch.cat([trg_tokens, torch.tensor([[pred_token]]).to(device)], dim=1)

            # Stop if <eos> is predicted
            if pred_token == EOS_IDX:
                break

    # Convert tokens back to text
    translated_text = trg_tokenizer.decode(trg_tokens.squeeze().cpu().numpy())
    return translated_text

def evaluate_translations(model, test_loader, src_tokenizer, trg_tokenizer, device, num_examples=5):
    """
    Evaluate model translations on test set examples.
    """
    model.eval()
    translations = []
    print("\nEvaluating translations on test set examples:")
    with torch.no_grad():
        for src, trg in test_loader:
            if len(translations) >= num_examples:
                break

            src = src.to(device)
            trg = trg.to(device)

            # Get source and target texts
            for i in range(src.size(0)):
                if len(translations) >= num_examples:
                    break

                src_text = src_tokenizer.decode(src[i].cpu().numpy())
                true_text = trg_tokenizer.decode(trg[i].cpu().numpy())

                # Get model translation
                pred_text = translate_sentence(model, src_text, src_tokenizer, trg_tokenizer, device)

                translations.append({
                    'source': src_text,
                    'target': true_text,
                    'prediction': pred_text
                })

    return translations

In [42]:
def test_custom_translations(model, src_tokenizer, trg_tokenizer, device):
    """
    Test model on custom English sentences.
    """
    test_sentences = [
        "How are you?",
        "What is your name?",
        "I love learning new languages.",
        "The weather is beautiful today.",
        "Thank you very much."
    ]

    print("\nTesting custom translations:")
    for sentence in test_sentences:
        translation = translate_sentence(model, sentence, src_tokenizer, trg_tokenizer, device)
        print(f"\nEnglish: {sentence}")
        print(f"Italian: {translation}")

In [43]:
if __name__ == "__main__":
    print("\nCreating datasets and dataloaders...")
    # Create datasets
    train_dataset = TranslationDataset(dataset['train'], src_tokenizer, trg_tokenizer)
    valid_dataset = TranslationDataset(dataset['validation'], src_tokenizer, trg_tokenizer)
    test_dataset = TranslationDataset(dataset['test'], src_tokenizer, trg_tokenizer)

    # Create data loaders
    BATCH_SIZE = 8  # Reduced batch size to 8
    print(f"\nCreating dataloaders with batch size {BATCH_SIZE}")
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

    print(f"Number of training batches: {len(train_loader)}")
    print(f"Number of validation batches: {len(valid_loader)}")
    print(f"Number of test batches: {len(test_loader)}")

    # Model hyperparameters
    print("\nInitializing model hyperparameters...")
    INPUT_DIM = src_tokenizer.vocab_size
    OUTPUT_DIM = trg_tokenizer.vocab_size
    HID_DIM = 32  # Further reduced hidden dimension
    ENC_LAYERS = 1  # Reduced number of encoder layers
    DEC_LAYERS = 1  # Reduced number of decoder layers
    ENC_HEADS = 2  # Reduced number of encoder heads
    DEC_HEADS = 2  # Reduced number of decoder heads
    ENC_PF_DIM = 64  # Further reduced position-wise feedforward dimension
    DEC_PF_DIM = 64  # Further reduced position-wise feedforward dimension
    ENC_DROPOUT = 0.1
    DEC_DROPOUT = 0.1

    print(f"Input dimension: {INPUT_DIM}")
    print(f"Output dimension: {OUTPUT_DIM}")

    # Training hyperparameters
    N_EPOCHS = 3
    CLIP = 1
    LEARNING_RATE = 0.0001

    print(f"\nTraining hyperparameters:")
    print(f"Number of epochs: {N_EPOCHS}")
    print(f"Gradient clipping: {CLIP}")
    print(f"Learning rate: {LEARNING_RATE}")

    # Train for each attention variant
    attention_variants = ['multiplicative', 'general', 'additive']

    # Create results table
    results_table = {
        'Attention Variant': [],
        'Training Loss': [],
        'Training PPL': [],
        'Validation Loss': [],
        'Validation PPL': [],
        'BLEU Score': [],
        'Training Time': []
    }

    # Phase 1: Training
    print("\n=== Training Phase ===")
    for attn_variant in attention_variants:
        print(f"\nTraining with {attn_variant} attention...")
        start_training_time = time.time()

        print("Initializing encoder and decoder...")
        enc = Encoder(INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM, ENC_DROPOUT, attn_variant, device)
        dec = Decoder(OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM, DEC_DROPOUT, attn_variant, device)

        print("Creating Seq2SeqTransformer model...")
        model = Seq2SeqTransformer(enc, dec, PAD_IDX, PAD_IDX, device).to(device)
        print(f"Model parameters: {sum(p.numel() for p in model.parameters())}")

        optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
        criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

        best_valid_loss = float('inf')
        train_losses = []
        valid_losses = []

        print("\nStarting training...")
        for epoch in range(N_EPOCHS):
            print(f"\nEpoch {epoch+1}/{N_EPOCHS}")

            print("Training...")
            try:
                train_loss = train(model, train_loader, optimizer, criterion, CLIP)
            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('Out of memory error caught during training. Freeing up memory and retrying...')
                    torch.cuda.empty_cache()
                    continue
                else:
                    raise e

            print("Evaluating...")
            valid_loss = evaluate(model, valid_loader, criterion)

            train_losses.append(train_loss)
            valid_losses.append(valid_loss)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                print(f"New best validation loss: {valid_loss:.4f}")
                print(f"Saving model to en-fr-transformer-{attn_variant}.pt")
                torch.save(model.state_dict(), f'en-fr-transformer-{attn_variant}.pt')

            print(f'Epoch: {epoch+1:02}')
            print(f'Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
            print(f'Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')

        # Calculate final metrics
        training_time = time.time() - start_training_time
        bleu_score = calculate_bleu(model, test_loader, src_tokenizer, trg_tokenizer)

        # Store results
        results_table['Attention Variant'].append(attn_variant)
        results_table['Training Loss'].append(f"{train_losses[-1]:.3f}")
        results_table['Training PPL'].append(f"{math.exp(train_losses[-1]):.3f}")
        results_table['Validation Loss'].append(f"{valid_losses[-1]:.3f}")
        results_table['Validation PPL'].append(f"{math.exp(valid_losses[-1]):.3f}")
        results_table['BLEU Score'].append(f"{bleu_score:.2f}")
        results_table['Training Time'].append(f"{training_time/60:.1f}m")

        # Plot training curves
        plt.figure(figsize=(10, 6))
        plt.plot(train_losses, label='Train Loss')
        plt.plot(valid_losses, label='Valid Loss')
        plt.title(f'Training and Validation Losses ({attn_variant} Attention)')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.savefig(f'loss_plot_{attn_variant}.png')
        plt.close()

    # Print training results table
    results_df = pd.DataFrame(results_table)
    print("\nTraining Results:")
    print(results_df.to_string(index=False))
    results_df.to_csv('attention_results.csv', index=False)


Creating datasets and dataloaders...

Creating dataset with 10000 examples...

Creating dataset with 2000 examples...

Creating dataset with 2000 examples...

Creating dataloaders with batch size 8
Number of training batches: 1250
Number of validation batches: 250
Number of test batches: 250

Initializing model hyperparameters...
Input dimension: 18423
Output dimension: 24670

Training hyperparameters:
Number of epochs: 3
Gradient clipping: 1
Learning rate: 0.0001

=== Training Phase ===

Training with multiplicative attention...
Initializing encoder and decoder...
Creating Seq2SeqTransformer model...
Model parameters: 2247278

Starting training...

Epoch 1/3
Training...


Training:   0%|                              | 0/1250 [00:00<?, ?it/s]

  with autocast():


Evaluating...


Evaluating:   0%|                              | 0/250 [00:00<?, ?it/s]

  with autocast():


New best validation loss: 9.4119
Saving model to en-fr-transformer-multiplicative.pt
Epoch: 01
Train Loss: 2.482 | Train PPL:  11.969
Val. Loss: 9.412 | Val. PPL: 12232.786

Epoch 2/3
Training...


Training:   0%|                              | 0/1250 [00:00<?, ?it/s]

Evaluating...


Evaluating:   0%|                              | 0/250 [00:00<?, ?it/s]

New best validation loss: 8.9219
Saving model to en-fr-transformer-multiplicative.pt
Epoch: 02
Train Loss: 2.263 | Train PPL:   9.612
Val. Loss: 8.922 | Val. PPL: 7494.596

Epoch 3/3
Training...


Training:   0%|                              | 0/1250 [00:00<?, ?it/s]

Evaluating...


Evaluating:   0%|                              | 0/250 [00:00<?, ?it/s]

New best validation loss: 8.5088
Saving model to en-fr-transformer-multiplicative.pt
Epoch: 03
Train Loss: 2.109 | Train PPL:   8.236
Val. Loss: 8.509 | Val. PPL: 4958.427

Training with general attention...
Initializing encoder and decoder...
Creating Seq2SeqTransformer model...
Model parameters: 2246462

Starting training...

Epoch 1/3
Training...


Training:   0%|                              | 0/1250 [00:00<?, ?it/s]

Evaluating...


Evaluating:   0%|                              | 0/250 [00:00<?, ?it/s]

New best validation loss: 9.5355
Saving model to en-fr-transformer-general.pt
Epoch: 01
Train Loss: 2.464 | Train PPL:  11.756
Val. Loss: 9.535 | Val. PPL: 13842.083

Epoch 2/3
Training...


Training:   0%|                              | 0/1250 [00:00<?, ?it/s]

Evaluating...


Evaluating:   0%|                              | 0/250 [00:00<?, ?it/s]

New best validation loss: 9.0329
Saving model to en-fr-transformer-general.pt
Epoch: 02
Train Loss: 2.242 | Train PPL:   9.411
Val. Loss: 9.033 | Val. PPL: 8374.423

Epoch 3/3
Training...


Training:   0%|                              | 0/1250 [00:00<?, ?it/s]

Evaluating...


Evaluating:   0%|                              | 0/250 [00:00<?, ?it/s]

New best validation loss: 8.6384
Saving model to en-fr-transformer-general.pt
Epoch: 03
Train Loss: 2.090 | Train PPL:   8.088
Val. Loss: 8.638 | Val. PPL: 5644.239

Training with additive attention...
Initializing encoder and decoder...
Creating Seq2SeqTransformer model...
Model parameters: 2248145

Starting training...

Epoch 1/3
Training...


Training:   0%|                              | 0/1250 [00:00<?, ?it/s]

Evaluating...


Evaluating:   0%|                              | 0/250 [00:00<?, ?it/s]

New best validation loss: 9.5774
Saving model to en-fr-transformer-additive.pt
Epoch: 01
Train Loss: 2.476 | Train PPL:  11.889
Val. Loss: 9.577 | Val. PPL: 14435.381

Epoch 2/3
Training...


Training:   0%|                              | 0/1250 [00:00<?, ?it/s]

Evaluating...


Evaluating:   0%|                              | 0/250 [00:00<?, ?it/s]

New best validation loss: 9.0572
Saving model to en-fr-transformer-additive.pt
Epoch: 02
Train Loss: 2.257 | Train PPL:   9.558
Val. Loss: 9.057 | Val. PPL: 8579.985

Epoch 3/3
Training...


Training:   0%|                              | 0/1250 [00:00<?, ?it/s]

Evaluating...


Evaluating:   0%|                              | 0/250 [00:00<?, ?it/s]

New best validation loss: 8.6340
Saving model to en-fr-transformer-additive.pt
Epoch: 03
Train Loss: 2.098 | Train PPL:   8.146
Val. Loss: 8.634 | Val. PPL: 5619.336

Training Results:
Attention Variant Training Loss Training PPL Validation Loss Validation PPL BLEU Score Training Time
   multiplicative         2.109        8.236           8.509       4958.427       0.00          1.1m
          general         2.090        8.088           8.638       5644.239       0.00          1.1m
         additive         2.098        8.146           8.634       5619.336       0.00          1.1m


In [48]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

In [49]:
# Suppress font and other warnings
warnings.filterwarnings("ignore", "Glyph.*")
warnings.filterwarnings("ignore", "Matplotlib currently does not support Gujarati natively.*")

# Define constants for special tokens
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3

In [50]:
# Define the CustomTokenizer class
class CustomTokenizer:
    def __init__(self, texts, max_vocab_size=50000, language='en'):
        print(f"\nInitializing {language} tokenizer...")
        self.max_vocab_size = max_vocab_size
        self.language = language
        self.word2idx = {'<unk>': UNK_IDX, '<pad>': PAD_IDX, '<sos>': SOS_IDX, '<eos>': EOS_IDX}
        self.idx2word = {v: k for k, v in self.word2idx.items()}
        self.vocab_size = len(self.word2idx)

        print(f"Building vocabulary for {language}...")
        word_freq = Counter()
        for i, text in enumerate(texts):
            if i % 10000 == 0:
                print(f"Processing text {i}/{len(texts)}")
            words = text.lower().split() if language == 'en' else unicodedata.normalize('NFKC', text).split()
            word_freq.update(words)

        for word, _ in word_freq.most_common(max_vocab_size - len(self.word2idx)):
            self.word2idx[word] = self.vocab_size
            self.idx2word[self.vocab_size] = word
            self.vocab_size += 1

    def encode(self, text):
        words = text.lower().split() if self.language == 'en' else unicodedata.normalize('NFKC', text).split()
        return [SOS_IDX] + [self.word2idx.get(word, UNK_IDX) for word in words] + [EOS_IDX]

    def decode(self, indices):
        return ' '.join([self.idx2word.get(idx, '<unk>') for idx in indices if idx not in {PAD_IDX, SOS_IDX, EOS_IDX}])

In [51]:
# Define the MultiHeadAttentionLayer
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, attn_variant, device):
        super().__init__()

        assert hid_dim % n_heads == 0

        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        self.attn_variant = attn_variant
        self.device = device

        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)

        self.fc_o = nn.Linear(hid_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[0]

        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)

        # Split into heads
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        # Calculate attention scores based on variant
        energy = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)

        attention = torch.softmax(energy, dim=-1)
        attention = self.dropout(attention)

        x = torch.matmul(attention, V)
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch_size, -1, self.hid_dim)
        x = self.fc_o(x)

        return x, attention

In [52]:
# Define the EncoderLayer
class EncoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, attn_variant, device):
        super().__init__()
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, attn_variant, device)
        self.positionwise_feedforward = nn.Sequential(
            nn.Linear(hid_dim, pf_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(pf_dim, hid_dim)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        _src, _ = self.self_attention(src, src, src, src_mask)
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        _src = self.positionwise_feedforward(src)
        src = self.ff_layer_norm(src + self.dropout(_src))
        return src

In [53]:
# Define the Encoder
class Encoder(nn.Module):
    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, attn_variant, device, max_length=500):
        super().__init__()
        self.device = device
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        self.layers = nn.ModuleList([
            EncoderLayer(hid_dim, n_heads, pf_dim, dropout, attn_variant, device)
            for _ in range(n_layers)
        ])
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, src, src_mask):
        batch_size = src.shape[0]
        src_len = src.shape[1]
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))

        for layer in self.layers:
            src = layer(src, src_mask)
        return src

In [54]:
# Define the DecoderLayer
class DecoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, attn_variant, device):
        super().__init__()
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, attn_variant, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, attn_variant, device)
        self.positionwise_feedforward = nn.Sequential(
            nn.Linear(hid_dim, pf_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(pf_dim, hid_dim)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
        _trg = self.positionwise_feedforward(trg)
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        return trg, attention

In [55]:
# Define the Decoder
class Decoder(nn.Module):
    def __init__(self, output_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, attn_variant, device, max_length=500):
        super().__init__()
        self.device = device
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        self.layers = nn.ModuleList([
            DecoderLayer(hid_dim, n_heads, pf_dim, dropout, attn_variant, device)
            for _ in range(n_layers)
        ])
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))

        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)

        output = self.fc_out(trg)
        return output, attention

In [56]:
# Define the Seq2SeqTransformer
class Seq2SeqTransformer(nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()
        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        return output, attention

In [57]:
def translate_sentence(model, sentence, src_tokenizer, trg_tokenizer, device, max_length=128):
    """
    Translate a single English sentence to the target language.
    """
    model.eval()

    # Tokenize and encode the source sentence
    src_tokens = torch.tensor([src_tokenizer.encode(sentence)]).to(device)

    # Initialize target sequence with <sos>
    trg_tokens = torch.tensor([[SOS_IDX]]).to(device)

    with torch.no_grad():
        for _ in range(max_length):
            with torch.cuda.amp.autocast():
                # Get model prediction
                output, _ = model(src_tokens, trg_tokens)

            # Get the next token prediction
            pred_token = output.argmax(2)[:, -1].item()

            # Add predicted token to target sequence
            trg_tokens = torch.cat([trg_tokens, torch.tensor([[pred_token]]).to(device)], dim=1)

            # Stop if <eos> is predicted
            if pred_token == EOS_IDX:
                break

    # Convert tokens back to text
    translated_text = trg_tokenizer.decode(trg_tokens.squeeze().cpu().numpy())
    return translated_text

In [58]:
def evaluate_attention_maps():
    """
    Evaluate attention maps for trained models with detailed debugging
    """
    # Test pair for visualization
    test_pairs = [
    ("The time now is 05:08 .", "Adesso sono le 05:08 ."),
    ("Hello, is there anyone?", "Ciao, c'è qualcuno?")
]

    # Model hyperparameters (must match training)
    INPUT_DIM = src_tokenizer.vocab_size
    OUTPUT_DIM = trg_tokenizer.vocab_size
    HID_DIM = 32
    ENC_LAYERS = 1
    DEC_LAYERS = 1
    ENC_HEADS = 2
    DEC_HEADS = 2
    ENC_PF_DIM = 64
    DEC_PF_DIM = 64
    ENC_DROPOUT = 0.1
    DEC_DROPOUT = 0.1

    print("\n=== Attention Visualization ===")

    for attn_variant, model_path in [('multiplicative', 'en-de-transformer-multiplicative.pt'),
                                     ('general', 'en-fr-transformer-general.pt')]:

        if not os.path.exists(model_path):
            print(f"\nModel {model_path} not found. Skipping.")
            continue

        print(f"\nEvaluating {attn_variant} attention model:")

        # Initialize model
        enc = Encoder(INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM, ENC_DROPOUT, attn_variant, device)
        dec = Decoder(OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM, DEC_DROPOUT, attn_variant, device)
        model = Seq2SeqTransformer(enc, dec, PAD_IDX, PAD_IDX, device).to(device)

        # Load model
        model.load_state_dict(torch.load(model_path))
        model.eval()

        print("\nGenerating visualizations...")
        for src_text, trg_text in test_pairs:
            print(f"\nProcessing pair:")
            print(f"English: {src_text}")
            print(f"Italian: {trg_text}")

            # Generate attention visualization
            with torch.no_grad():
                # Tokenize
                src_tokens = torch.tensor([src_tokenizer.encode(src_text)]).to(device)
                trg_tokens = torch.tensor([trg_tokenizer.encode(trg_text)]).to(device)

                # Get model output and attention
                output, attention_weights = model(src_tokens, trg_tokens[:,:-1])

                # Get last layer attention
                if isinstance(attention_weights, list):
                    last_layer_attention = attention_weights[-1]
                else:
                    last_layer_attention = attention_weights

                # Get first head's attention from first batch
                attention = last_layer_attention[0, 0].cpu().numpy()

                # Get tokens
                src_tokens_list = src_tokenizer.encode(src_text)
                trg_tokens_list = trg_tokenizer.encode(trg_text)

                # Print raw tokens for debugging
                print("\nRaw tokens:")
                print("Source tokens:", src_tokens_list)
                print("Target tokens:", trg_tokens_list)

                # Convert token IDs to text
                src_tokens_text = [src_tokenizer.decode([token]) for token in src_tokens_list]
                trg_tokens_text = [trg_tokenizer.decode([token]) for token in trg_tokens_list]

                print("\nDecoded tokens before filtering:")
                print("Source tokens:", src_tokens_text)
                print("Target tokens:", trg_tokens_text)

                # Remove special tokens
                src_tokens_text = [t for t in src_tokens_text if t not in ['<pad>', '<sos>', '<eos>', '']]
                trg_tokens_text = [t for t in trg_tokens_text if t not in ['<pad>', '<sos>', '<eos>', '']]

                print("\nTokens after filtering:")
                print("Source tokens:", src_tokens_text)
                print("Target tokens:", trg_tokens_text)

                print("\nAttention shape:", attention.shape)

                # Create visualization
                plt.figure(figsize=(12, 8))

                # Create heatmap
                sns.heatmap(
                    attention,  # Use full attention matrix
                    xticklabels=src_tokens_text,
                    yticklabels=trg_tokens_text,
                    cmap='viridis',
                    annot=True,
                    fmt='.2f',
                    square=True,
                    cbar_kws={'label': 'Attention Weight'}
                )

                # Adjust labels
                plt.xticks(rotation=45, ha='right')
                plt.yticks(rotation=0)

                plt.title(f'Attention Weights Visualization\n{attn_variant.capitalize()} Attention', pad=20)
                plt.xlabel('Source Text (English)', labelpad=10)
                plt.ylabel('Target Text (Italian)', labelpad=10)

                plt.tight_layout()

                # Save plot
                filename = f'attention_map_{attn_variant}_{src_text[:20].replace(" ", "_")}.png'
                plt.savefig(filename, dpi=300, bbox_inches='tight')
                plt.close()

                print(f"\nSaved attention map to: {filename}")

                # Print attention weights
                print("\nAttention Weights:")
                for i in range(min(len(trg_tokens_text), attention.shape[0])):
                    print(f"{trg_tokens_text[i]:>20}: ", end="")
                    for j in range(min(len(src_tokens_text), attention.shape[1])):
                        print(f"{src_tokens_text[j]}({attention[i,j]:.2f}) ", end="")
                    print()

        print("\nTesting translations...")
        test_sentences = [
            "Hey, i am Rida ?",
            "I am currently studing.",
            "This Degree is very difficult.",
            "CAn you tell me about yourself ?",
            "Thank you so much."
        ]

        for text in test_sentences:
            translated = translate_sentence(model, text, src_tokenizer, trg_tokenizer, device, max_length=50)
            print(f"\nEnglish: {text}")
            print(f"French: {translated}")

        print("\n" + "="*50)

    print("\nEvaluation complete! Check the generated visualizations and translation results.")

# Run the evaluation
evaluate_attention_maps()


=== Attention Visualization ===

Model en-de-transformer-multiplicative.pt not found. Skipping.

Evaluating general attention model:

Generating visualizations...

Processing pair:
English: The time now is 05:08 .
Italian: Adesso sono le 05:08 .

Raw tokens:
Source tokens: [2, 4, 110, 126, 13, 0, 104, 3]
Target tokens: [2, 1127, 23, 19, 0, 84, 3]

Decoded tokens before filtering:
Source tokens: ['', 'the', 'time', 'now', 'is', '<unk>', '.', '']
Target tokens: ['', 'Adesso', 'sono', 'le', '<unk>', '.', '']

Tokens after filtering:
Source tokens: ['the', 'time', 'now', 'is', '<unk>', '.']
Target tokens: ['Adesso', 'sono', 'le', '<unk>', '.']

Attention shape: (6, 8)


  model.load_state_dict(torch.load(model_path))



Saved attention map to: attention_map_general_The_time_now_is_05:0.png

Attention Weights:
              Adesso: the(0.13) time(0.14) now(0.12) is(0.10) <unk>(0.12) .(0.13) 
                sono: the(0.12) time(0.13) now(0.09) is(0.16) <unk>(0.12) .(0.12) 
                  le: the(0.12) time(0.13) now(0.12) is(0.12) <unk>(0.14) .(0.15) 
               <unk>: the(0.13) time(0.14) now(0.15) is(0.10) <unk>(0.12) .(0.11) 
                   .: the(0.13) time(0.15) now(0.14) is(0.08) <unk>(0.12) .(0.12) 

Processing pair:
English: Hello, is there anyone?
Italian: Ciao, c'è qualcuno?

Raw tokens:
Source tokens: [2, 732, 13, 73, 0, 3]
Target tokens: [2, 652, 371, 22151, 3]

Decoded tokens before filtering:
Source tokens: ['', 'hello,', 'is', 'there', '<unk>', '']
Target tokens: ['', 'Ciao,', "c'è", 'qualcuno?', '']

Tokens after filtering:
Source tokens: ['hello,', 'is', 'there', '<unk>']
Target tokens: ['Ciao,', "c'è", 'qualcuno?']

Attention shape: (4, 6)

Saved attention map to: attentio

  with torch.cuda.amp.autocast():
