<a href="https://colab.research.google.com/github/TamBui1706/DeepLearningCourse/blob/main/Week06/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Setup and Installation


In [None]:
# Install necessary packages
!pip install torch transformers datasets sacrebleu sentencepiece gradio matplotlib pandas sacremoses tqdm
!pip install --upgrade datasets transformers fsspec
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer
from collections import Counter
import random
import time
import gradio as gr
from tqdm.notebook import tqdm

# Set seed for reproducibility
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
# Ask cuDNN for deterministic kernels and switch off its autotuner; with
# benchmarking enabled the selected algorithm may differ from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m638.5 kB/s[0m eta [36m0:00:00[0m
Collecting gradio
  Downloading gradio-5.30.0-py3-none-any.whl.metadata (16 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting n

# 2. Data Preparation

In [None]:
# Fetch the English-Vietnamese pair of the OPUS-100 corpus
dataset = load_dataset("opus100", "en-vi")
print(f"Dataset loaded: {dataset}")

# Show the first three sentence pairs of the training split
print("\nSample data:")
train_split = dataset['train']
for sample_idx in range(3):
    sentence_pair = train_split[sample_idx]['translation']
    print(f"EN: {sentence_pair['en']}")
    print(f"VI: {sentence_pair['vi']}")
    print()

# Report the number of examples in each split
validation_split = dataset['validation']
test_split = dataset['test']
print(f"Train size: {len(train_split)}")
print(f"Validation size: {len(validation_split)}")
print(f"Test size: {len(test_split)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/137k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/59.0M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/138k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset loaded: DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

Sample data:
EN: What is it?
VI: Cái gì đó?

EN: I thought we would go to the children's home.
VI: Con nghĩ chúng ta nên đến mái ấm.

EN: Is there something you want to tell your husband?
VI: Có điều gì cô muốn nói với chồng mình không?

Train size: 1000000
Validation size: 2000
Test size: 2000


# 3. Tokenization and Vocabulary


In [None]:
# Both tokenizers are separate instances loaded from the same pretrained
# English->Vietnamese checkpoint.
MT_CHECKPOINT = "Helsinki-NLP/opus-mt-en-vi"
en_tokenizer = AutoTokenizer.from_pretrained(MT_CHECKPOINT)
vi_tokenizer = AutoTokenizer.from_pretrained(MT_CHECKPOINT)

# Prepare dataset
class TranslationDataset(Dataset):
    """Map-style dataset that tokenizes English-Vietnamese pairs on access.

    Args:
        data: Indexable collection whose items look like
            ``{'translation': {'en': str, 'vi': str}}``.
        max_len: Every sequence is truncated / padded to exactly this many
            token ids.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (English),
    ``labels`` (Vietnamese token ids), plus the raw ``en_text`` / ``vi_text``.
    Tokenization relies on the module-level ``en_tokenizer`` and
    ``vi_tokenizer``.
    """

    def __init__(self, data, max_len=100):
        self.data = data
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        pair = self.data[idx]['translation']
        en_text = pair['en']
        vi_text = pair['vi']

        # Same padding/truncation policy for both languages.
        encode_kwargs = dict(max_length=self.max_len, truncation=True,
                             return_tensors="pt", padding="max_length")

        en_tokens = en_tokenizer(en_text, **encode_kwargs)
        # The checkpoint ships distinct source.spm / target.spm models.
        # `text_target=` routes the Vietnamese text through the tokenizer's
        # target-side path instead of encoding it as if it were source text.
        vi_tokens = vi_tokenizer(text_target=vi_text, **encode_kwargs)

        # squeeze(0) drops only the batch axis added by return_tensors="pt".
        return {
            'input_ids': en_tokens['input_ids'].squeeze(0),
            'attention_mask': en_tokens['attention_mask'].squeeze(0),
            'labels': vi_tokens['input_ids'].squeeze(0),
            'en_text': en_text,
            'vi_text': vi_text
        }

# Create smaller datasets to fit within 3-5 hours training time
def create_subset(dataset, size):
    """Return a random sample of distinct rows from ``dataset``.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices)``.
        size: Requested number of rows. Values larger than the dataset are
            clamped to ``len(dataset)``, so asking for too many rows yields
            the whole dataset in random order rather than an error.

    Returns:
        Whatever ``dataset.select`` returns for the sampled indices.
    """
    total_rows = len(dataset)
    # Sampling without replacement cannot draw more rows than exist.
    sample_size = min(size, total_rows)
    indices = np.random.choice(total_rows, size=sample_size, replace=False)
    return dataset.select(indices)

# Number of examples drawn from each split
train_size = 50000  # Adjust based on available compute
val_size = 1000
test_size = 1000

# Sample the subsets in train -> validation -> test order
train_subset, val_subset, test_subset = (
    create_subset(dataset[split_name], n_rows)
    for split_name, n_rows in (
        ('train', train_size),
        ('validation', val_size),
        ('test', test_size),
    )
)

# Wrap every subset so that items are tokenized on access
train_dataset, val_dataset, test_dataset = (
    TranslationDataset(subset)
    for subset in (train_subset, val_subset, test_subset)
)

# Create data loaders
def create_dataloaders(batch_size=32):
    """Build the (train, validation, test) loaders over the module-level datasets.

    Only the training loader shuffles; all three share ``batch_size``.
    """
    return (
        DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
        DataLoader(val_dataset, batch_size=batch_size),
        DataLoader(test_dataset, batch_size=batch_size),
    )

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/756k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

# 4. Model Implementation - Basic RNN

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class BasicRNNModel(nn.Module):
    """Embedding -> (stacked) GRU -> linear projection applied at every position.

    The model produces one prediction per *source* position; it has no
    separate decoder.

    Args:
        input_dim: Size of the source vocabulary.
        emb_dim: Embedding width.
        hidden_dim: GRU hidden size.
        output_dim: Size of the target vocabulary.
        dropout: Dropout probability for the embeddings and, when
            ``n_layers > 1``, between GRU layers.
        n_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim, output_dim, dropout=0.5, n_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        # nn.GRU only applies dropout between stacked layers; with a single
        # layer a non-zero value does nothing and PyTorch emits a warning.
        rnn_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = nn.GRU(emb_dim, hidden_dim, n_layers, dropout=rnn_dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask=None):
        """Return ``(predictions, hidden)`` for a batch of source token ids.

        Args:
            src: ``[batch_size, src_len]`` token ids.
            src_mask: Optional ``[batch_size, src_len]`` mask multiplied into
                the embeddings (zero entries blank out that position).

        Returns:
            predictions ``[batch_size, src_len, output_dim]`` and the final
            GRU hidden state.
        """
        embedded = self.dropout(self.embedding(src))  # [batch_size, src_len, emb_dim]

        # Zero the embeddings of masked positions
        if src_mask is not None:
            embedded = embedded * src_mask.unsqueeze(-1)

        outputs, hidden = self.rnn(embedded)  # outputs: [batch_size, src_len, hidden_dim]

        predictions = self.fc_out(outputs)  # [batch_size, src_len, output_dim]

        return predictions, hidden

class EncoderDecoder(nn.Module):
    """Wrapper that builds padding/causal masks and chains an encoder with a decoder.

    The encoder is called as ``encoder(src, src_mask)`` and the decoder as
    ``decoder(trg, enc_src, src_mask, trg_mask)``; the decoder must return a
    pair ``(output, attention)``.
    """

    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder
        self.src_pad_idx, self.trg_pad_idx = src_pad_idx, trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        """Boolean mask ``[batch_size, 1, 1, src_len]``: True where ``src`` is not padding."""
        return (src != self.src_pad_idx)[:, None, None, :]

    def make_trg_mask(self, trg):
        """Boolean mask ``[batch_size, 1, trg_len, trg_len]``: non-pad AND not-in-the-future."""
        # [batch_size, 1, 1, trg_len] - True for real (non-pad) tokens
        not_padding = (trg != self.trg_pad_idx)[:, None, None, :]

        # [trg_len, trg_len] - lower triangle, so position i sees only j <= i
        steps = trg.shape[1]
        causal = torch.ones((steps, steps), device=self.device).tril().bool()

        # Broadcasting combines both into [batch_size, 1, trg_len, trg_len]
        return not_padding & causal

    def forward(self, src, trg):
        """Encode ``src`` and decode ``trg``; both are ``[batch_size, seq_len]`` id tensors."""
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        memory = self.encoder(src, src_mask)
        output, attention = self.decoder(trg, memory, src_mask, trg_mask)

        return output, attention

# 5. Model Implementation - Transformer

In [None]:
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch-first sequence.

    Args:
        d_model: Feature width of the inputs (even or odd).
        max_len: Longest sequence length for which encodings are precomputed.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so trim the angles to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Stored as a buffer with a leading batch axis: [1, max_len, d_model]
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x + pe`` for ``x`` of shape ``[batch_size, seq_len, d_model]``."""
        x = x + self.pe[:, :x.size(1)]
        return x

class TransformerModel(nn.Module):
    """Translation model: token embeddings + positional encoding around ``nn.Transformer``.

    Padding ids are read from the module-level ``en_tokenizer`` (source) and
    ``vi_tokenizer`` (target) when the model is constructed.
    """

    def __init__(self, input_dim, output_dim, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
        super().__init__()

        # Plain (non-module) attributes
        self.d_model = d_model
        self.src_pad_idx = en_tokenizer.pad_token_id
        self.trg_pad_idx = vi_tokenizer.pad_token_id

        # Separate embedding tables for source and target vocabularies
        self.encoder_embedding = nn.Embedding(input_dim, d_model)
        self.decoder_embedding = nn.Embedding(output_dim, d_model)

        # One positional encoding shared by both sides
        self.positional_encoding = PositionalEncoding(d_model)

        # Batch-first encoder-decoder stack
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )

        # Projection from model width to target-vocabulary logits
        self.fc_out = nn.Linear(d_model, output_dim)

    def make_src_mask(self, src):
        """``[batch_size, src_len]`` boolean mask, True where ``src`` is not padding."""
        return src != self.src_pad_idx

    def make_tgt_mask(self, tgt):
        """``[batch_size, tgt_len]`` boolean mask, True where ``tgt`` is not padding."""
        return tgt != self.trg_pad_idx

    def forward(self, src, tgt):
        """Return logits ``[batch_size, tgt_len, output_dim]`` for id tensors ``src`` / ``tgt``."""
        # nn.Transformer expects True at positions to IGNORE, hence the inversions.
        src_padding = ~self.make_src_mask(src)
        tgt_padding = ~self.make_tgt_mask(tgt)

        # Strict upper triangle: position i may not attend to any j > i
        steps = tgt.size(1)
        future_block = torch.ones((steps, steps), device=src.device).triu(diagonal=1).bool()

        # Scale embeddings by sqrt(d_model) before adding positions
        scale = math.sqrt(self.d_model)
        src_embedded = self.positional_encoding(self.encoder_embedding(src) * scale)
        tgt_embedded = self.positional_encoding(self.decoder_embedding(tgt) * scale)

        decoded = self.transformer(
            src=src_embedded,
            tgt=tgt_embedded,
            tgt_mask=future_block,
            src_key_padding_mask=src_padding,
            tgt_key_padding_mask=tgt_padding,
            memory_key_padding_mask=src_padding,
        )

        return self.fc_out(decoded)

# 6. Training Functions

In [None]:
from sacrebleu.metrics import BLEU

def calculate_bleu(target, prediction):
    """Score one hypothesis string against one reference string with sacreBLEU.

    The pair is treated as a one-sentence corpus; the numeric ``score`` of the
    result is returned.
    """
    scorer = BLEU()
    hypotheses = [prediction]
    references = [[target]]
    result = scorer.corpus_score(hypotheses, references)
    return result.score

def translate_sentence(model, sentence, src_tokenizer, tgt_tokenizer, device, max_len=100):
    """Translate a single sentence and return the decoded string.

    Args:
        model: A ``TransformerModel`` (decoded greedily, token by token) or
            any other model called as ``model(src)`` that returns
            ``(logits, _)`` (decoded by a per-position argmax).
        sentence: Source text.
        src_tokenizer: Tokenizer used to encode ``sentence``.
        tgt_tokenizer: Tokenizer used to decode the predicted ids.
        device: Device on which the input tensors are placed.
        max_len: Maximum source length and maximum number of generated tokens.

    Returns:
        The translation with special tokens removed.

    Raises:
        ValueError: If ``tgt_tokenizer`` defines neither a BOS nor an EOS
            token to start greedy decoding from.

    Note:
        The model is switched to eval mode and left that way.
    """
    model.eval()

    # Tokenize the source sentence
    encoded = src_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=max_len)
    src_tensor = encoded["input_ids"].to(device)

    with torch.no_grad():
        if isinstance(model, TransformerModel):
            # Tokenizers without a BOS token report bos_token_id as None,
            # which cannot be turned into a tensor; fall back to EOS as the
            # start symbol in that case.
            # NOTE(review): the start symbol should match whatever the
            # training loop feeds as the first decoder input - confirm.
            start_id = tgt_tokenizer.bos_token_id
            if start_id is None:
                start_id = tgt_tokenizer.eos_token_id
            if start_id is None:
                raise ValueError("tgt_tokenizer defines neither a BOS nor an EOS token to start decoding from")

            trg_indexes = [start_id]
            for _ in range(max_len):
                trg_tensor = torch.tensor([trg_indexes], dtype=torch.long, device=device)
                output = model(src_tensor, trg_tensor)

                # Greedy choice for the newest position
                pred_token = output[0, -1, :].argmax().item()
                trg_indexes.append(pred_token)

                if pred_token == tgt_tokenizer.eos_token_id:
                    break

            # Drop the start symbol and strip special tokens, matching the
            # RNN branch below.
            return tgt_tokenizer.decode(trg_indexes[1:], skip_special_tokens=True)

        # BasicRNNModel: a single forward pass, argmax at every position
        output, _ = model(src_tensor)
        pred_tokens = output.argmax(2)
        return tgt_tokenizer.decode(pred_tokens[0], skip_special_tokens=True)

def train_epoch(model, dataloader, optimizer, criterion, device, clip=1.0):
    """Run one optimisation pass over ``dataloader``; return the mean batch loss.

    ``TransformerModel`` instances are trained with teacher forcing (decoder
    input is the target without its last token, the loss target is the target
    without its first token). Any other model is called as ``model(src)`` and
    its logits are compared with the unshifted target. Gradients are clipped
    to a max norm of ``clip`` before each optimizer step.
    """
    model.train()
    running_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        src = batch['input_ids'].to(device)
        tgt = batch['labels'].to(device)

        optimizer.zero_grad()

        if isinstance(model, TransformerModel):
            # Teacher forcing: feed tokens [0, n-1), predict tokens [1, n)
            logits = model(src, tgt[:, :-1])
            gold = tgt[:, 1:]
        else:
            # BasicRNNModel: one prediction per source position
            logits, _ = model(src)
            gold = tgt

        # Flatten to [N, vocab] logits and [N] targets for the criterion
        vocab_size = logits.shape[-1]
        flat_logits = logits.contiguous().view(-1, vocab_size)
        flat_gold = gold.contiguous().view(-1)
        loss = criterion(flat_logits, flat_gold)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device, max_bleu_samples=10):
    """Compute the mean loss and a sampled BLEU score over ``dataloader``.

    Args:
        model: ``TransformerModel`` (evaluated with teacher forcing) or a
            model called as ``model(src)`` returning ``(logits, _)``.
        dataloader: Yields dicts with ``input_ids``, ``labels``, ``en_text``
            and ``vi_text``.
        criterion: Loss taking ``[N, vocab]`` logits and ``[N]`` targets.
        device: Device for the input tensors.
        max_bleu_samples: BLEU is computed on the first sentence of each of
            the first ``max_bleu_samples`` batches only, to keep evaluation
            cheap.

    Returns:
        ``(mean_loss, mean_bleu)``. For an empty dataloader the loss is
        ``nan`` and BLEU is ``0.0``.
    """
    model.eval()
    is_transformer = isinstance(model, TransformerModel)
    total_loss = 0.0
    n_batches = 0
    bleu_scores = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            src = batch['input_ids'].to(device)
            tgt = batch['labels'].to(device)

            if is_transformer:
                # Teacher forcing: feed tokens [0, n-1), score tokens [1, n)
                output = model(src, tgt[:, :-1])
                gold = tgt[:, 1:]
            else:
                # NOTE: the RNN emits one prediction per *source* position, so
                # this only lines up with the target because both sides are
                # padded to the same length. Kept as in the training loop.
                output, _ = model(src)
                gold = tgt

            loss = criterion(
                output.contiguous().view(-1, output.shape[-1]),
                gold.contiguous().view(-1),
            )
            total_loss += loss.item()
            n_batches += 1

            # BLEU on the first sentence of the batch, for a few batches only
            if len(bleu_scores) < max_bleu_samples:
                src_text_sample = batch['en_text'][0]
                tgt_text_sample = batch['vi_text'][0]
                pred_text_sample = translate_sentence(
                    model, src_text_sample, en_tokenizer, vi_tokenizer, device
                )
                bleu_scores.append(calculate_bleu(tgt_text_sample, pred_text_sample))

    # An empty dataloader leaves nothing to average.
    avg_loss = total_loss / n_batches if n_batches else float('nan')
    avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0

    return avg_loss, avg_bleu

# 7. Training with Multiple Hyperparameter Configurations


In [None]:
def train_and_evaluate_model(model_type, hyperparams, train_loader, val_loader, test_loader, device, epochs=5):
    """Train one model configuration, keep its best checkpoint and test it.

    Args:
        model_type: ``'rnn'`` builds a ``BasicRNNModel``; any other value
            builds a ``TransformerModel``.
        hyperparams: Dict of settings. ``lr`` and ``optimizer`` (``'adam'``,
            ``'sgd'``, anything else -> AdamW) are read here; the remaining
            keys configure the model, each with a default.
        train_loader, val_loader, test_loader: Data loaders for each split.
        device: Device the model is moved to.
        epochs: Number of training epochs.

    Returns:
        Dict with ``model``, per-epoch ``train_losses`` / ``val_losses`` /
        ``bleu_scores``, and the final ``test_loss`` / ``test_bleu``.

    Side effects:
        Writes ``{model_type}_best_model.pt`` whenever the validation loss
        improves, and reloads it before the test evaluation.
    """
    lr = hyperparams.get('lr', 0.001)
    optimizer_name = hyperparams.get('optimizer', 'adam').lower()

    # Initialize model based on model_type
    if model_type == 'rnn':
        model = BasicRNNModel(
            input_dim=len(en_tokenizer),
            emb_dim=hyperparams.get('emb_dim', 256),
            hidden_dim=hyperparams.get('hidden_dim', 512),
            output_dim=len(vi_tokenizer),
            dropout=hyperparams.get('dropout', 0.5),
            n_layers=hyperparams.get('n_layers', 2)
        ).to(device)
    else:  # transformer
        model = TransformerModel(
            input_dim=len(en_tokenizer),
            output_dim=len(vi_tokenizer),
            d_model=hyperparams.get('d_model', 512),
            nhead=hyperparams.get('nhead', 8),
            num_encoder_layers=hyperparams.get('num_encoder_layers', 3),
            num_decoder_layers=hyperparams.get('num_decoder_layers', 3),
            dim_feedforward=hyperparams.get('dim_feedforward', 1024),
            dropout=hyperparams.get('dropout', 0.1)
        ).to(device)

    # Choose optimizer (unknown names fall back to AdamW)
    if optimizer_name == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    elif optimizer_name == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    else:
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    # Padding positions in the target do not contribute to the loss
    criterion = nn.CrossEntropyLoss(ignore_index=vi_tokenizer.pad_token_id)

    train_losses = []
    val_losses = []
    bleu_scores = []
    best_val_loss = float('inf')
    checkpoint_path = f'{model_type}_best_model.pt'
    # The checkpoint file name is shared by every run of this model type, so
    # remember whether THIS run wrote it before loading it back.
    checkpoint_saved = False

    for epoch in range(epochs):
        start_time = time.time()

        # Train
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        train_losses.append(train_loss)

        # Validate
        val_loss, bleu = evaluate(model, val_loader, criterion, device)
        val_losses.append(val_loss)
        bleu_scores.append(bleu)

        # Wall-clock time for train + validation
        epoch_mins, epoch_secs = divmod(time.time() - start_time, 60)

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True

        # divmod on floats yields float minutes; print them as a whole number
        print(f'Epoch: {epoch+1:02} | Time: {int(epoch_mins)}m {epoch_secs:.2f}s')
        print(f'\tTrain Loss: {train_loss:.3f}')
        print(f'\tVal. Loss: {val_loss:.3f}, BLEU: {bleu:.2f}')

    # Restore the best weights of this run (if any were saved) for testing.
    # Without this guard a stale file from an earlier run could be loaded.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    test_loss, test_bleu = evaluate(model, test_loader, criterion, device)
    print(f'Test Loss: {test_loss:.3f}, Test BLEU: {test_bleu:.2f}')

    return {
        'model': model,
        'train_losses': train_losses,
        'val_losses': val_losses,
        'bleu_scores': bleu_scores,
        'test_loss': test_loss,
        'test_bleu': test_bleu
    }

def run_hyperparameter_experiments(model_type, hyperparams_list, device, epochs=5):
    """Train and evaluate ``model_type`` once per entry of ``hyperparams_list``.

    Data loaders are rebuilt for every configuration using its ``batch_size``
    (default 32). Returns the list of result dicts from
    ``train_and_evaluate_model``, each extended with a ``hyperparams`` key.
    """
    banner = '=' * 50
    total = len(hyperparams_list)
    results = []

    for run_number, hyperparams in enumerate(hyperparams_list, start=1):
        print(f"\n{banner}")
        print(f"Experiment {run_number}/{total}: {model_type.upper()} with {hyperparams}")
        print(f"{banner}\n")

        # Loaders depend on the batch size of this configuration
        loaders = create_dataloaders(hyperparams.get('batch_size', 32))
        train_loader, val_loader, test_loader = loaders

        outcome = train_and_evaluate_model(
            model_type, hyperparams, train_loader, val_loader, test_loader, device, epochs
        )

        # Keep the configuration next to its metrics
        outcome['hyperparams'] = hyperparams
        results.append(outcome)

    return results

def plot_training_curves(results, model_type):
    """Plot loss and BLEU curves for every experiment and save the figure.

    Args:
        results: List of result dicts containing ``hyperparams``,
            ``train_losses``, ``val_losses`` and ``bleu_scores``.
        model_type: Used in the panel titles and in the output file name
            ``{model_type}_training_curves.png``.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    loss_ax, bleu_ax = axes

    for i, result in enumerate(results):
        hyperparams = result['hyperparams']
        # Configurations may omit these keys; fall back to the same defaults
        # the training code uses instead of raising KeyError.
        lr = hyperparams.get('lr', 0.001)
        batch_size = hyperparams.get('batch_size', 32)
        label = f"Exp {i+1}: lr={lr}, bs={batch_size}"

        # Plot training and validation loss
        loss_ax.plot(result['train_losses'], label=f"{label} (train)")
        loss_ax.plot(result['val_losses'], linestyle='--', label=f"{label} (val)")

        # Plot BLEU scores
        bleu_ax.plot(result['bleu_scores'], label=label)

    loss_ax.set_title(f'{model_type.upper()} - Loss Curves')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()
    loss_ax.grid(True)

    bleu_ax.set_title(f'{model_type.upper()} - BLEU Scores')
    bleu_ax.set_xlabel('Epochs')
    bleu_ax.set_ylabel('BLEU')
    bleu_ax.legend()
    bleu_ax.grid(True)

    plt.tight_layout()
    plt.savefig(f'{model_type}_training_curves.png')
    plt.show()

def calculate_statistics(results):
    """Print and return mean / standard deviation of test loss and test BLEU.

    ``results`` is a list of dicts with ``test_loss`` and ``test_bleu``. The
    returned dict has the keys ``test_loss_mean``, ``test_loss_std``,
    ``test_bleu_mean`` and ``test_bleu_std``.
    """
    stats = {}
    for metric in ('test_loss', 'test_bleu'):
        values = [entry[metric] for entry in results]
        stats[f'{metric}_mean'] = np.mean(values)
        stats[f'{metric}_std'] = np.std(values)

    loss_mean, loss_std = stats['test_loss_mean'], stats['test_loss_std']
    bleu_mean, bleu_std = stats['test_bleu_mean'], stats['test_bleu_std']
    print(f"Test Loss: {loss_mean:.4f} ± {loss_std:.4f}")
    print(f"Test BLEU: {bleu_mean:.4f} ± {bleu_std:.4f}")

    return stats

# 8. Run Experiments


In [None]:
# Hyperparameter grid for the RNN model: one row per experiment,
# columns in the order given by _RNN_FIELDS.
_RNN_FIELDS = ('batch_size', 'lr', 'emb_dim', 'hidden_dim', 'dropout', 'n_layers', 'optimizer')
rnn_hyperparams_list = [
    dict(zip(_RNN_FIELDS, row))
    for row in (
        (64, 0.001, 256, 512, 0.3, 2, 'adam'),
        (32, 0.0005, 256, 512, 0.5, 2, 'adam'),
        (64, 0.001, 128, 256, 0.3, 1, 'adam'),
        (32, 0.0001, 512, 1024, 0.5, 3, 'adamw'),
        (64, 0.002, 256, 512, 0.4, 2, 'sgd'),
    )
]

# Hyperparameter grid for the Transformer model, laid out the same way.
_TRANSFORMER_FIELDS = ('batch_size', 'lr', 'd_model', 'nhead', 'num_encoder_layers',
                       'num_decoder_layers', 'dim_feedforward', 'dropout', 'optimizer')
transformer_hyperparams_list = [
    dict(zip(_TRANSFORMER_FIELDS, row))
    for row in (
        (32, 0.0001, 256, 8, 3, 3, 1024, 0.1, 'adam'),
        (64, 0.0005, 256, 4, 3, 3, 1024, 0.1, 'adam'),
        (32, 0.0001, 512, 8, 2, 2, 2048, 0.2, 'adamw'),
        (64, 0.0002, 256, 4, 4, 4, 1024, 0.1, 'adam'),
        (32, 0.0005, 384, 6, 3, 3, 1536, 0.15, 'adamw'),
    )
]

# Number of epochs to train
n_epochs = 5  # Adjust based on available time

# Train, plot and summarise every RNN configuration
print("\nRunning RNN experiments...")
rnn_results = run_hyperparameter_experiments('rnn', rnn_hyperparams_list, device, n_epochs)
plot_training_curves(rnn_results, 'rnn')
rnn_stats = calculate_statistics(rnn_results)

# Same pipeline for the Transformer configurations
print("\nRunning Transformer experiments...")
transformer_results = run_hyperparameter_experiments('transformer', transformer_hyperparams_list, device, n_epochs)
plot_training_curves(transformer_results, 'transformer')
transformer_stats = calculate_statistics(transformer_results)

# Pick, per model family, the configuration with the highest test BLEU
rnn_test_bleus = [run['test_bleu'] for run in rnn_results]
transformer_test_bleus = [run['test_bleu'] for run in transformer_results]
best_rnn_idx = np.argmax(rnn_test_bleus)
best_transformer_idx = np.argmax(transformer_test_bleus)

best_rnn_run = rnn_results[best_rnn_idx]
best_transformer_run = transformer_results[best_transformer_idx]
best_rnn_model = best_rnn_run['model']
best_transformer_model = best_transformer_run['model']

print(f"\nBest RNN model (configuration {best_rnn_idx+1}): Test BLEU = {best_rnn_run['test_bleu']:.4f}")
print(f"Best Transformer model (configuration {best_transformer_idx+1}): Test BLEU = {best_transformer_run['test_bleu']:.4f}")


Running RNN experiments...

Experiment 1/5: RNN with {'batch_size': 64, 'lr': 0.001, 'emb_dim': 256, 'hidden_dim': 512, 'dropout': 0.3, 'n_layers': 2, 'optimizer': 'adam'}



Training:   0%|          | 0/782 [00:00<?, ?it/s]

KeyboardInterrupt: 

# 9. Gradio Demo Interface

In [None]:
import gradio as gr

def translate(text, model_choice):
    """Gradio callback: translate ``text`` with the selected best model.

    ``"RNN"`` selects the best RNN model; any other choice selects the best
    Transformer model.
    """
    chosen_model = best_rnn_model if model_choice == "RNN" else best_transformer_model
    return translate_sentence(chosen_model, text, en_tokenizer, vi_tokenizer, device)

# Widgets: free-text English input plus a model selector
demo_inputs = [
    gr.Textbox(lines=4, placeholder="Enter English text here..."),
    gr.Radio(["RNN", "Transformer"], label="Model"),
]
demo_output = gr.Textbox(label="Vietnamese Translation")

# Pre-filled (text, model) pairs shown below the form
demo_examples = [
    ["Hello, how are you today?", "Transformer"],
    ["I love learning about artificial intelligence and machine learning.", "RNN"],
    ["The weather is beautiful outside.", "Transformer"],
]

demo = gr.Interface(
    fn=translate,
    inputs=demo_inputs,
    outputs=demo_output,
    title="English to Vietnamese Translation",
    description="Translate English text to Vietnamese using neural machine translation models",
    examples=demo_examples,
)

# share=True requests a public link in addition to the local server
demo.launch(share=True)

# 10. Sample Translation and Evaluation

In [None]:
# Test translation examples with best models
test_sentences = [
    "Hello, how are you today?",
    "I love learning about artificial intelligence.",
    "The weather is beautiful outside.",
    "Can you help me find the nearest restaurant?",
    "What time does the movie start?"
]

COLUMN_WIDTH = 40
COLUMN_SEPARATOR = " | "
# Three padded columns plus two separators, so the rulers span the full row
TABLE_WIDTH = 3 * COLUMN_WIDTH + 2 * len(COLUMN_SEPARATOR)


def _fit_column(text, width=COLUMN_WIDTH):
    """Pad ``text`` to ``width`` characters, shortening longer text with '...'."""
    if len(text) > width:
        text = text[:width - 3] + '...'
    return f"{text:{width}s}"


print("Sample translations from best models:\n")
print("=" * TABLE_WIDTH)
print(COLUMN_SEPARATOR.join(_fit_column(title) for title in ('English', 'RNN Translation', 'Transformer Translation')))
print("=" * TABLE_WIDTH)

for sent in test_sentences:
    rnn_translation = translate_sentence(best_rnn_model, sent, en_tokenizer, vi_tokenizer, device)
    transformer_translation = translate_sentence(best_transformer_model, sent, en_tokenizer, vi_tokenizer, device)

    print(COLUMN_SEPARATOR.join(_fit_column(cell) for cell in (sent, rnn_translation, transformer_translation)))

print("=" * TABLE_WIDTH)