In [None]:
import sys
!{sys.executable} -m pip install nltk sacrebleu transformers comet-ml


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting comet-ml
  Downloading comet_ml-3.53.2-py3-none-any.whl.metadata (4.0 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting dulwich!=0.20.33,>=0.20.6 (from comet-ml)
  Downloading dulwich-0.24.7-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Collecting everett<3.2.0,>=1.0.1 (from everett[ini]<3.2.0,>=1.0.1->comet-ml)
  Downloading everett-3.1.0-py2.py3-none-any.whl.metadata (17 kB)
Collecting python-box<7.0.0 (from comet-ml)
  Downloading python_box-6.1.0-py3-none-any.whl.metadata (7.8 kB)
Collecting configobj (from everett[ini]<3.2.0,>=1.0.1->comet-ml)
  Downloading configobj-5.0.9-py2.py3-none-

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install -q datasets sentencepiece sacrebleu torch torchvision torchaudio tqdm
import os, random
from pathlib import Path
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from datasets import load_dataset
import sentencepiece as spm
import sacrebleu


  from .autonotebook import tqdm as notebook_tqdm


In [None]:

# =====================
# Config
# =====================
SEED = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
VOCAB_SIZE = 16000
BATCH_SIZE = 64
MAX_LEN = 64
MAX_GEN_LEN = 64
EPOCHS = 30
CLIP = 1.0
LEARNING_RATE = 3e-4
BEAM_SIZE = 5

torch.manual_seed(SEED)
random.seed(SEED)
# =====================
# Load dataset (up to 1M)
# =====================
full_dataset = load_dataset("ai4bharat/samanantar", "hi", split="train")
full_dataset = full_dataset.shuffle(seed=SEED)

NUM_EXAMPLES = min(1_000_000, len(full_dataset))
subset = full_dataset.select(range(NUM_EXAMPLES))

# Split 80% train, 10% val, 10% test
train_end = int(0.8 * len(subset))
val_end = int(0.9 * len(subset))
train_data = subset.select(range(0, train_end))
val_data = subset.select(range(train_end, val_end))
test_data = subset.select(range(val_end, len(subset)))

print("Dataset sizes:", len(train_data), len(val_data), len(test_data))

# =====================
# SentencePiece
# =====================
SP_EN_MODEL = Path("/content/drive/MyDrive/paper/spm_en.model")
SP_HI_MODEL = Path("/content/drive/MyDrive/paper/spm_hi.model")

def write_lines(dataset_split, src_path, tgt_path):
    with open(src_path, "w", encoding="utf-8") as sf, open(tgt_path, "w", encoding="utf-8") as tf:
        for ex in dataset_split:
            sf.write(ex["src"].strip().lower() + "\n")
            tf.write(ex["tgt"].strip() + "\n")

write_lines(train_data, "train.en", "train.hi")

if not SP_EN_MODEL.exists() or not SP_HI_MODEL.exists():
    print("Training SentencePiece...")
    spm.SentencePieceTrainer.Train(
        f"--input=train.en --model_prefix=spm_en --vocab_size={VOCAB_SIZE} "
        f"--character_coverage=1.0 --model_type=unigram"
    )
    spm.SentencePieceTrainer.Train(
        f"--input=train.hi --model_prefix=spm_hi --vocab_size={VOCAB_SIZE} "
        f"--character_coverage=0.9995 --model_type=unigram"
    )

sp_en = spm.SentencePieceProcessor()
sp_hi = spm.SentencePieceProcessor()
sp_en.load(str(SP_EN_MODEL))
sp_hi.load(str(SP_HI_MODEL))

PAD_EN, BOS_EN, EOS_EN = 0, 1, 2
PAD_HI, BOS_HI, EOS_HI = 0, 1, 2

# =====================
# Dataset & DataLoader
# =====================
class NMTDataset(Dataset):
    def __init__(self, dataset, src_sp, tgt_sp, max_len=MAX_LEN):
        self.dataset = dataset
        self.src_sp = src_sp
        self.tgt_sp = tgt_sp
        self.max_len = max_len

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        src_text = self.dataset[idx]["src"].lower()
        tgt_text = self.dataset[idx]["tgt"]
        src_ids = [BOS_EN] + self.src_sp.encode(src_text)[:self.max_len-2] + [EOS_EN]
        tgt_ids = [BOS_HI] + self.tgt_sp.encode(tgt_text)[:self.max_len-2] + [EOS_HI]
        src_ids += [PAD_EN] * (self.max_len - len(src_ids))
        tgt_ids += [PAD_HI] * (self.max_len - len(tgt_ids))
        return torch.tensor(src_ids), torch.tensor(tgt_ids)

def get_loader(dataset_split, shuffle=True):
    return DataLoader(NMTDataset(dataset_split, sp_en, sp_hi),
                      batch_size=BATCH_SIZE, shuffle=shuffle)

train_loader = get_loader(train_data)
val_loader = get_loader(val_data)
test_loader = get_loader(test_data, shuffle=False)

# =====================
# Masks
# =====================
def create_padding_mask(seq, lang='en'):
    pad_id = PAD_EN if lang == 'en' else PAD_HI
    return (seq == pad_id)

def generate_square_subsequent_mask(sz):
    return torch.triu(torch.ones(sz, sz, device=DEVICE) * float('-inf'), diagonal=1)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

hi/train-00000-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

hi/train-00001-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

hi/train-00002-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

hi/train-00003-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

hi/train-00004-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

hi/train-00005-of-00008.parquet:   0%|          | 0.00/239M [00:00<?, ?B/s]

hi/train-00006-of-00008.parquet:   0%|          | 0.00/239M [00:00<?, ?B/s]

hi/train-00007-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10125706 [00:00<?, ? examples/s]

Dataset sizes: 800000 100000 100000


In [5]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def compute_bleu(references, hypotheses):
    return sacrebleu.corpus_bleu(hypotheses, [references]).score


def compute_ter(references, hypotheses):
    return sacrebleu.corpus_ter(hypotheses, [references]).score


def compute_meteor(references, hypotheses):
    """Tokenize input before computing METEOR."""
    def simple_tokenize(text):
        return text.strip().split()

    scores = []
    for ref, hyp in zip(references, hypotheses):
        ref_tokens = simple_tokenize(ref)
        hyp_tokens = simple_tokenize(hyp)
        try:
            score = meteor_score([ref_tokens], hyp_tokens)
        except Exception:
            score = 0.0
        scores.append(score)
    return 100 * np.mean(scores)

def detokenize(ids, sp):
    """Convert token IDs back to text."""
    if torch.is_tensor(ids):
        ids = ids.tolist()
    return sp.decode([id for id in ids if id not in [PAD_HI, BOS_HI, EOS_HI]])


def evaluate_model(model, test_loader, sp_src, sp_tgt, device=DEVICE):
    """Generate translations and collect references & hypotheses."""
    model.eval()
    references, hypotheses = [], []

    with torch.no_grad():
        for src, tgt in tqdm(test_loader, desc="Evaluating"):
            src, tgt = src.to(device), tgt.to(device)

            # Greedy decoding
            memory = model.encode(src, create_padding_mask(src, 'en'))
            ys = torch.full((src.size(0), 1), BOS_HI, dtype=torch.long, device=device)

            for _ in range(MAX_GEN_LEN - 1):
                tgt_mask = generate_square_subsequent_mask(ys.size(1))
                out = model.decode(
                    ys, memory, tgt_mask,
                    create_padding_mask(src, 'en'),
                    create_padding_mask(ys, 'hi')
                )
                prob = model.fc_out(out[:, -1, :])
                next_word = prob.argmax(dim=-1, keepdim=True)
                ys = torch.cat([ys, next_word], dim=1)

            # Decode predictions and references
            for i in range(src.size(0)):
                ref = detokenize(tgt[i, 1:], sp_tgt)
                hyp = detokenize(ys[i, 1:], sp_tgt)
                references.append(ref.strip())
                hypotheses.append(hyp.strip())

    return references, hypotheses


In [6]:
import numpy as np
import torch
from tqdm import tqdm
import sacrebleu
from nltk.translate.meteor_score import meteor_score


In [None]:
VOCAB_SIZE=32000

class TransformerModel(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model=512, nhead=8,
                 num_layers=3, dim_feedforward=1024, dropout=0.1):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab, d_model, padding_idx=PAD_EN)
        self.tgt_emb = nn.Embedding(tgt_vocab, d_model, padding_idx=PAD_HI)
        self.pos_enc = nn.Parameter(torch.zeros(1, MAX_LEN, d_model))
        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead,
            num_encoder_layers=num_layers, num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward, dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(d_model, tgt_vocab)

    def encode(self, src, src_key_padding_mask):
        src_emb = self.src_emb(src) + self.pos_enc[:, :src.size(1), :]
        return self.transformer.encoder(src_emb, src_key_padding_mask=src_key_padding_mask)

    def decode(self, tgt, memory, tgt_mask, memory_key_padding_mask, tgt_key_padding_mask):
        tgt_emb = self.tgt_emb(tgt) + self.pos_enc[:, :tgt.size(1), :]
        return self.transformer.decoder(
            tgt_emb, memory,
            tgt_mask=tgt_mask,
            memory_key_padding_mask=memory_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )

    def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None,
                memory_key_padding_mask=None, tgt_mask=None):
        memory = self.encode(src, src_key_padding_mask)
        output = self.decode(tgt, memory, tgt_mask, memory_key_padding_mask, tgt_key_padding_mask)
        return self.fc_out(output)

# Baseline Transformer
baseline_path = "/content/drive/MyDrive/paper/best_model_baseline.pt"
model = TransformerModel(src_vocab=VOCAB_SIZE, tgt_vocab=VOCAB_SIZE)

# Load checkpoint
checkpoint = torch.load(baseline_path, map_location=DEVICE)
state_dict = checkpoint.get("model_state_dict", checkpoint)
model.load_state_dict(state_dict, strict=False)
model.to(DEVICE)
print("✅ Baseline Transformer loaded.")

# Evaluate
refs, hyps = evaluate_model(model, test_loader, sp_en, sp_hi)

# Compute metrics
baseline_metrics = {
    "BLEU": compute_bleu(refs, hyps),
    "METEOR": compute_meteor(refs, hyps),
    "TER": compute_ter(refs, hyps)
}

print("📊 Baseline Transformer Results:")
for k, v in baseline_metrics.items():
    print(f"   {k} : {v:.2f}")

# Save metrics
torch.save(baseline_metrics, "/content/drive/MyDrive/paper/baseline_metrics.pt")
print("✅ Baseline metrics saved.")


✅ Baseline Transformer loaded.


Evaluating: 100%|██████████| 1563/1563 [30:49<00:00,  1.18s/it]


📊 Baseline Transformer Results:
   BLEU : 8.58
   METEOR : 0.13
   TER : 130.53
✅ Baseline metrics saved.


In [None]:
# =====================
# CNN Feature Extractor (2-layer)
# =====================
class CNNFeatureExtractor(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.conv1 = nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, padding=1)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        residual = x
        x = x.transpose(1, 2)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = x.transpose(1, 2)
        return self.norm(x + residual)

# =====================
# Hybrid CNN + Transformer Model
# =====================
class HybridTransformerModel(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model=512, nhead=8,
                 num_layers=3, dim_feedforward=1024, dropout=0.1):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab, d_model, padding_idx=PAD_EN)
        self.tgt_emb = nn.Embedding(tgt_vocab, d_model, padding_idx=PAD_HI)
        self.pos_enc = nn.Parameter(torch.zeros(1, MAX_LEN, d_model))
        self.cnn_encoder = CNNFeatureExtractor(d_model)
        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead,
            num_encoder_layers=num_layers, num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True
        )
        self.fc_out = nn.Linear(d_model, tgt_vocab)

    def encode(self, src, src_key_padding_mask):
        src_emb = self.src_emb(src) + self.pos_enc[:, :src.size(1), :]
        src_cnn = self.cnn_encoder(src_emb)
        return self.transformer.encoder(src_cnn, src_key_padding_mask=src_key_padding_mask)

    def decode(self, tgt, memory, tgt_mask, memory_key_padding_mask, tgt_key_padding_mask):
        tgt_emb = self.tgt_emb(tgt) + self.pos_enc[:, :tgt.size(1), :]
        return self.transformer.decoder(
            tgt_emb, memory,
            tgt_mask=tgt_mask,
            memory_key_padding_mask=memory_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )

    def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None,
                memory_key_padding_mask=None, tgt_mask=None):
        memory = self.encode(src, src_key_padding_mask)
        output = self.decode(tgt, memory, tgt_mask, memory_key_padding_mask, tgt_key_padding_mask)
        return self.fc_out(output)


In [None]:
# =====================================================
# ⚡ BLEU + METEOR + TER Evaluation + Translation + Save Metrics (.pt)
# =====================================================
import torch
import sacrebleu
import random
from tqdm import tqdm
import sentencepiece as spm
from nltk.translate.meteor_score import meteor_score
import nltk
import os
from datetime import datetime

# Download necessary NLTK data (only if not already)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# =====================================================
# ✅ Load SentencePiece Models
# =====================================================
sp_en = spm.SentencePieceProcessor()
sp_hi = spm.SentencePieceProcessor()
sp_en.load("/content/drive/MyDrive/paper/spm_en.model")
sp_hi.load("/content/drive/MyDrive/paper/spm_hi.model")

# Token IDs
PAD_EN, BOS_EN, EOS_EN = 0, 1, 2
PAD_HI, BOS_HI, EOS_HI = 0, 1, 2
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# =====================================================
# ✅ Load Model
# =====================================================
model = HybridTransformerModel(len(sp_en), len(sp_hi)).to(DEVICE)
model.load_state_dict(torch.load("/content/drive/MyDrive/paper/best_model_cnn.pt", map_location=DEVICE))
model.eval()

# =====================================================
# ✅ Utility Functions
# =====================================================
def create_padding_mask(seq, lang='en'):
    pad_id = PAD_EN if lang == 'en' else PAD_HI
    return (seq == pad_id)

def generate_square_subsequent_mask(sz):
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

# =====================================================
# ✅ Translation Function (Greedy Decoding)
# =====================================================
@torch.no_grad()
def translate_sentence(sentence, model, sp_en, sp_hi, max_len=64):
    model.eval()
    src_ids = [BOS_EN] + sp_en.encode(sentence.lower())[:max_len-2] + [EOS_EN]
    src = torch.tensor(src_ids, dtype=torch.long, device=DEVICE).unsqueeze(0)
    src_mask = create_padding_mask(src, 'en')

    memory = model.encode(src, src_mask)
    tgt = torch.tensor([[BOS_HI]], dtype=torch.long, device=DEVICE)

    for _ in range(max_len):
        tgt_mask = generate_square_subsequent_mask(tgt.size(1)).to(DEVICE)
        tgt_key_padding_mask = create_padding_mask(tgt, 'hi')

        output = model.decode(
            tgt, memory,
            tgt_mask=tgt_mask,
            memory_key_padding_mask=src_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )

        logits = model.fc_out(output[:, -1, :])
        next_token = logits.argmax(-1).item()
        tgt = torch.cat([tgt, torch.tensor([[next_token]], device=DEVICE)], dim=1)

        if next_token == EOS_HI:
            break

    tokens = [t for t in tgt.squeeze().tolist() if t not in [BOS_HI, EOS_HI, PAD_HI]]
    decoded = sp_hi.decode(tokens) if tokens else ""
    return decoded.strip()

# =====================================================
# ✅ Evaluation Function with .pt Saving
# =====================================================
def evaluate_model(model, test_loader, test_data, metrics_path="/content/drive/MyDrive/paper/cnn_metrics.pt"):
    # ---- Check if already saved ----
    if os.path.exists(metrics_path):
        print(f"📂 Found saved metrics file: {metrics_path}")
        results = torch.load(metrics_path)
        print("\n✅ Loaded existing metrics:")
        print(results)
        return results

    refs, hyps, meteor_scores = [], [], []
    print("🔍 Evaluating on test set...")

    for src, tgt in tqdm(test_loader, desc="Evaluating", unit="batch"):
        src, tgt = src.to(DEVICE), tgt.to(DEVICE)
        for i in range(src.size(0)):
            src_text = sp_en.decode([t for t in src[i].tolist() if t not in [BOS_EN, EOS_EN, PAD_EN]])
            tgt_text = sp_hi.decode([t for t in tgt[i].tolist() if t not in [BOS_HI, EOS_HI, PAD_HI]])
            pred_text = translate_sentence(src_text, model, sp_en, sp_hi)

            refs.append(tgt_text)
            hyps.append(pred_text)
            meteor_scores.append(meteor_score([tgt_text.split()], pred_text.split()))

    # ---- Calculate Metrics ----
    bleu = sacrebleu.corpus_bleu(hyps, [refs])
    ter_metric = sacrebleu.metrics.TER()
    ter = ter_metric.corpus_score(hyps, [refs])
    meteor_avg = sum(meteor_scores) / len(meteor_scores)

    results = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "BLEU": round(bleu.score, 2),
        "TER": round(ter.score, 2),
        "METEOR": round(meteor_avg * 100, 2)  # scaled to 0–100
    }

    # ---- Save Metrics (.pt) ----
    torch.save(results, metrics_path)

    print("\n✅ Metrics calculated and saved successfully (.pt)!")
    print(results)
    return results

# =====================================================
# ✅ Run Evaluation
# =====================================================
metrics = evaluate_model(model, test_loader, test_data)

# =====================================================
# ✅ Show Sample Translations
# =====================================================
sample_indices = random.sample(range(len(test_data)), 5)
print("\n✨ Sample Translations:")
for idx in sample_indices:
    src_text = test_data[idx]["src"]
    ref_text = test_data[idx]["tgt"]
    pred_text = translate_sentence(src_text, model, sp_en, sp_hi)
    print(f"\nEN: {src_text}")
    print(f"HI (Ref): {ref_text}")
    print(f"HI (Pred): {pred_text}")
    print("-" * 60)


🔍 Evaluating on test set...


Evaluating: 100%|██████████| 1563/1563 [3:25:43<00:00,  7.90s/batch]



✅ Metrics calculated and saved successfully (.pt)!
{'timestamp': '2025-10-26 15:43:52', 'BLEU': 19.0, 'TER': 73.97, 'METEOR': 38.39}

✨ Sample Translations:

EN: lahore: The Pakistan English press has showered heap of praise on legendary Indian batsman Sachin Tendulkar in their editorials, saying the game of cricket will surely be poorer without him.
HI (Ref): लाहौर पाकिस्तान की अग्रेंजी प्रेस ने अपने संपादकीय में महान भारतीय बल्लेबाज सचिन तेंदुलकर की तारीफों के पुल बांधे है और लिखा है, ‘उनके बिना क्रिकेट खेल निश्चित रूप से दरिद्र’ हो जायेगा। हालांकि उर्दू प्रेस में उनके बारे में ज्यादा कुछ नहीं लिखा गया है लेकिन अंग्रेजी के अखबारों ने तेंदुलकर... आगे पढ़े
HI (Pred): पाकिस्तान के इंग्लिश प्रीमियर लीग (एआईसीसी) के प्रख्यात बल्लेबाज सचिन तेंदुलकर ने अपने संपादकीय में कहा है कि क्रिकेट का खेल निश्चित रूप से उनके बिना सबसे बड़ा होगा।
------------------------------------------------------------

EN: Telecom operator Reliance Jio has announced a new plan for its prepaid customers.
HI (Ref): 

In [None]:
VOCAB_SIZE=32000
# =====================
# Multi-Scale CNN
# =====================
class MultiScaleCNN(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.conv3 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, padding=1)
        self.conv5 = nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)
        self.conv7 = nn.Conv1d(embed_dim, embed_dim, kernel_size=7, padding=3)
        self.relu = nn.ReLU()
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        residual = x
        x = x.transpose(1, 2)
        out3 = self.conv3(x)
        out5 = self.conv5(x)
        out7 = self.conv7(x)
        x = out3 + out5 + out7
        x = self.relu(x)
        x = x.transpose(1, 2)
        return self.norm(x + residual)

# =====================
# Hybrid Transformer + MultiScale CNN
# =====================
class HybridTransformerModel(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model=512, nhead=8,
                 num_layers=3, dim_feedforward=1024, dropout=0.1, temperature=0.1):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab, d_model, padding_idx=PAD_EN)
        self.tgt_emb = nn.Embedding(tgt_vocab, d_model, padding_idx=PAD_HI)
        self.pos_enc = nn.Parameter(torch.zeros(1, MAX_LEN, d_model))
        self.cnn_encoder = MultiScaleCNN(d_model)
        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead,
            num_encoder_layers=num_layers, num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True
        )
        self.fc_out = nn.Linear(d_model, tgt_vocab)
        self.temperature = temperature

    def encode(self, src, src_key_padding_mask):
        src_emb = self.src_emb(src) + self.pos_enc[:, :src.size(1), :]
        src_cnn = self.cnn_encoder(src_emb)
        return self.transformer.encoder(src_cnn, src_key_padding_mask=src_key_padding_mask)

    def decode(self, tgt, memory, tgt_mask, memory_key_padding_mask, tgt_key_padding_mask):
        tgt_emb = self.tgt_emb(tgt) + self.pos_enc[:, :tgt.size(1), :]
        return self.transformer.decoder(
            tgt_emb, memory,
            tgt_mask=tgt_mask,
            memory_key_padding_mask=memory_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )

    def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None,
                memory_key_padding_mask=None, tgt_mask=None):
        memory = self.encode(src, src_key_padding_mask)
        output = self.decode(tgt, memory, tgt_mask, memory_key_padding_mask, tgt_key_padding_mask)
        return self.fc_out(output)

    # -----------------------
    # Contrastive loss
    # -----------------------
    def contrastive_loss(self, anchor, positive, negative):
        anchor = anchor.mean(dim=1)
        positive = positive.mean(dim=1)
        negative = negative.mean(dim=1)
        pos_sim = torch.cosine_similarity(anchor, positive, dim=-1)
        neg_sim = torch.cosine_similarity(anchor, negative, dim=-1)
        loss = -torch.log(torch.exp(pos_sim / self.temperature) / (torch.exp(pos_sim / self.temperature) + torch.exp(neg_sim / self.temperature)))
        return loss.mean()

In [None]:
# =====================================================
# ⚡ Optimized BLEU + METEOR + TER Evaluation + Incremental Save
# =====================================================
import torch
import sacrebleu
import random
from tqdm import tqdm
from torch.cuda.amp import autocast
from nltk.translate.meteor_score import meteor_score
import os
from datetime import datetime
import sentencepiece as spm
import torch
import sacrebleu
import random
from tqdm import tqdm
import sentencepiece as spm
from nltk.translate.meteor_score import meteor_score
import nltk
import os
from datetime import datetime

# Download necessary NLTK data (only if not already)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# =====================
# Load SentencePiece Models
# =====================
sp_en = spm.SentencePieceProcessor()
sp_hi = spm.SentencePieceProcessor()
sp_en.load("/content/drive/MyDrive/paper/spm_en.model")
sp_hi.load("/content/drive/MyDrive/paper/spm_hi.model")

PAD_EN, BOS_EN, EOS_EN = 0, 1, 2
PAD_HI, BOS_HI, EOS_HI = 0, 1, 2
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# =====================
# Load Model
# =====================
model = HybridTransformerModel(len(sp_en), len(sp_hi)).to(DEVICE)
model.load_state_dict(torch.load("/content/drive/MyDrive/paper/best_model_multiscale.pt", map_location=DEVICE))
model.eval()
torch.set_grad_enabled(False)

print("✅ Model loaded successfully on", DEVICE)

# =====================
# Utility Functions
# =====================
def create_padding_mask(seq, lang='en'):
    pad_id = PAD_EN if lang == 'en' else PAD_HI
    return (seq == pad_id)

def generate_square_subsequent_mask(sz):
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1).to(DEVICE)

# =====================
# Optimized Greedy Decoding
# =====================
@torch.no_grad()
def translate_sentence_multiscale(sentence, model, sp_en, sp_hi, max_len=64):
    with autocast():  # mixed precision for faster inference
        src_ids = [BOS_EN] + sp_en.encode(sentence.lower())[:max_len-2] + [EOS_EN]
        src = torch.tensor(src_ids, dtype=torch.long, device=DEVICE).unsqueeze(0)
        src_mask = create_padding_mask(src, 'en')

        memory = model.encode(src, src_mask)
        tgt = torch.tensor([[BOS_HI]], dtype=torch.long, device=DEVICE)

        for _ in range(max_len):
            tgt_mask = generate_square_subsequent_mask(tgt.size(1))
            tgt_key_padding_mask = create_padding_mask(tgt, 'hi')

            output = model.decode(
                tgt, memory,
                tgt_mask=tgt_mask,
                memory_key_padding_mask=src_mask,
                tgt_key_padding_mask=tgt_key_padding_mask
            )
            logits = model.fc_out(output[:, -1, :])
            next_token = logits.argmax(-1).item()
            tgt = torch.cat([tgt, torch.tensor([[next_token]], device=DEVICE)], dim=1)

            if next_token == EOS_HI:
                break

        decoded = sp_hi.decode([t for t in tgt.squeeze().tolist() if t not in [BOS_HI, EOS_HI, PAD_HI]])
        return decoded

# =====================
# Evaluation Function with Incremental Save
# =====================
def evaluate_and_save_metrics(test_data, metrics_path="/content/drive/MyDrive/paper/multiscale_metrics.pt", save_every=10000):
    partial_path = metrics_path + ".partial.pt"

    # Load partial results if exist
    if os.path.exists(partial_path):
        data = torch.load(partial_path)
        refs = data["refs"]
        hyps = data["hyps"]
        meteor_scores = data["meteor_scores"]
        start_idx = data.get("last_idx", 0)
        print(f"🔄 Resuming from partial evaluation at sentence {start_idx}")
    else:
        refs, hyps, meteor_scores = [], [], []
        start_idx = 0

    # Evaluate sentence-by-sentence
    for idx in tqdm(range(start_idx, len(test_data)), desc="Evaluating", unit="sentence", dynamic_ncols=True):
        example = test_data[idx]
        src_text = example["src"]
        ref_text = example["tgt"]
        pred_text = translate_sentence_multiscale(src_text, model, sp_en, sp_hi)

        refs.append(ref_text)
        hyps.append(pred_text)
        meteor_scores.append(meteor_score([ref_text.split()], pred_text.split()))

        # Incremental save
        if (idx + 1) % save_every == 0 or (idx + 1) == len(test_data):
            torch.save({
                "refs": refs,
                "hyps": hyps,
                "meteor_scores": meteor_scores,
                "last_idx": idx + 1
            }, partial_path)
            print(f"💾 Saved progress at sentence {idx+1}")

    # Compute final metrics
    bleu = sacrebleu.corpus_bleu(hyps, [refs])
    ter_metric = sacrebleu.metrics.TER()
    ter = ter_metric.corpus_score(hyps, [refs])
    meteor_avg = sum(meteor_scores) / len(meteor_scores)

    results = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "BLEU": round(bleu.score, 2),
        "TER": round(ter.score, 2),
        "METEOR": round(meteor_avg * 100, 2)
    }

    # Save final metrics and remove partial file
    torch.save(results, metrics_path)
    if os.path.exists(partial_path):
        os.remove(partial_path)
    print("\n✅ Final metrics calculated and saved:")
    print(results)
    return results

# =====================
# Run Evaluation
# =====================
metrics = evaluate_and_save_metrics(test_data, metrics_path="/content/drive/MyDrive/paper/multiscale_metrics.pt", save_every=10000)

# =====================
# Show Sample Translations
# =====================
sample_indices = random.sample(range(len(test_data)), 5)
print("✨ Sample Translations (MultiScale CNN):")
for idx in sample_indices:
    src_text = test_data[idx]["src"]
    ref_text = test_data[idx]["tgt"]
    pred_text = translate_sentence_multiscale(src_text, model, sp_en, sp_hi)
    print(f"\nEN: {src_text}")
    print(f"HI (Ref): {ref_text}")
    print(f"HI (Pred): {pred_text}")
    print("-" * 50)


✅ Model loaded successfully on cuda
🔄 Resuming from partial evaluation at sentence 30000


  with autocast():  # mixed precision for faster inference
Evaluating:  14%|█▍        | 10002/70000 [18:28<1:29:23, 11.19sentence/s]

💾 Saved progress at sentence 40000


Evaluating:  29%|██▊       | 20002/70000 [36:26<1:35:58,  8.68sentence/s]

💾 Saved progress at sentence 50000


Evaluating:  43%|████▎     | 30001/70000 [54:19<1:20:30,  8.28sentence/s]

💾 Saved progress at sentence 60000


Evaluating:  57%|█████▋    | 40001/70000 [1:12:16<1:22:58,  6.03sentence/s]

💾 Saved progress at sentence 70000


Evaluating:  71%|███████▏  | 50002/70000 [1:29:56<37:27,  8.90sentence/s]

💾 Saved progress at sentence 80000


Evaluating:  86%|████████▌ | 60001/70000 [1:47:49<23:20,  7.14sentence/s]

💾 Saved progress at sentence 90000


Evaluating: 100%|██████████| 70000/70000 [2:05:40<00:00,  9.28sentence/s]

💾 Saved progress at sentence 100000






✅ Final metrics calculated and saved:
{'timestamp': '2025-10-27 04:52:32', 'BLEU': 18.43, 'TER': 72.3, 'METEOR': 38.0}
✨ Sample Translations (MultiScale CNN):

EN: lahore: The Pakistan English press has showered heap of praise on legendary Indian batsman Sachin Tendulkar in their editorials, saying the game of cricket will surely be poorer without him.
HI (Ref): लाहौर पाकिस्तान की अग्रेंजी प्रेस ने अपने संपादकीय में महान भारतीय बल्लेबाज सचिन तेंदुलकर की तारीफों के पुल बांधे है और लिखा है, ‘उनके बिना क्रिकेट खेल निश्चित रूप से दरिद्र’ हो जायेगा। हालांकि उर्दू प्रेस में उनके बारे में ज्यादा कुछ नहीं लिखा गया है लेकिन अंग्रेजी के अखबारों ने तेंदुलकर... आगे पढ़े
HI (Pred): पाकिस्तान क्रिकेट टीम के पूर्व कप्तान सचिन तेंदुलकर ने अपने सम्पादकों में तारीफ की है।
--------------------------------------------------

EN: Telecom operator Reliance Jio has announced a new plan for its prepaid customers.
HI (Ref): टेलीकॉम इंडस्ट्री में तहलका मचने वाली कंपनी रिलायंस जियो ने फिर से अपने ग्राहकों के लिए

In [None]:
import torch
import pandas as pd
import os

# Paths to metric files
paths = {
    "Baseline Transformer": "/content/drive/MyDrive/paper/baseline_metrics.pt",
    "2-Layer CNN": "/content/drive/MyDrive/paper/cnn_metrics.pt",
    "Multi-Scale CNN": "/content/drive/MyDrive/paper/multiscale_metrics.pt"
}

rows = []
for name, path in paths.items():
    if not os.path.exists(path):
        print(f"⚠️ Missing: {path}")
        continue

    # ✅ Fix: allow loading full objects (not just weights)
    data = torch.load(path, weights_only=False)

    # Make sure data is a dict
    if not isinstance(data, dict):
        print(f"⚠️ {path} is not a metrics dict — skipping")
        continue

    rows.append({
        "Model": name,
        "BLEU": data.get("BLEU"),
        "METEOR": data.get("METEOR"),
        "TER": data.get("TER")
    })

# Create DataFrame
df = pd.DataFrame(rows).sort_values(by="BLEU", ascending=False)
print("\n📊 Comparative Results:")
print(df.to_string(index=False))

# Save CSV
save_path = "/content/drive/MyDrive/paper/comparison_results.csv"
df.to_csv(save_path, index=False)
print(f"\n💾 Results saved to {save_path}")



📊 Comparative Results:
               Model      BLEU    METEOR        TER
         2-Layer CNN 19.000000 38.390000  73.970000
     Multi-Scale CNN 18.430000 38.000000  72.300000
Baseline Transformer  8.576798  0.126633 130.530133

💾 Results saved to /content/drive/MyDrive/paper/comparison_results.csv


In [9]:
# =====================
# Gated Multi-Scale CNN
# =====================
class GatedMultiScaleCNN(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.conv3 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, padding=1)
        self.conv5 = nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)
        self.conv7 = nn.Conv1d(embed_dim, embed_dim, kernel_size=7, padding=3)
        self.gate_proj = nn.Sequential(
            nn.Linear(embed_dim, embed_dim // 2),
            nn.ReLU(),
            nn.Linear(embed_dim // 2, 3)
        )
        self.activation = nn.ReLU()
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        # x: (batch, seq_len, embed_dim)
        residual = x
        b, s, d = x.size()
        x_t = x.transpose(1, 2)  # (b, d, s)
        o3 = self.conv3(x_t).transpose(1, 2)  # (b, s, d)
        o5 = self.conv5(x_t).transpose(1, 2)
        o7 = self.conv7(x_t).transpose(1, 2)
        stacked = torch.stack([o3, o5, o7], dim=-1)  # (b, s, d, 3)
        gates = self.gate_proj(residual)             # (b, s, 3)
        gates = F.softmax(gates, dim=-1).unsqueeze(2)  # (b, s, 1, 3)
        fused = (stacked * gates).sum(-1)            # (b, s, d)
        fused = self.activation(fused)
        out = self.norm(fused + residual)
        return out

# =====================
# Hybrid Transformer Model (GMSCNN encoder)
# =====================
class HybridTransformerModelGMSC(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model=512, nhead=8,
                 num_layers=3, dim_feedforward=1024, dropout=0.1):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab, d_model, padding_idx=PAD_EN)
        self.tgt_emb = nn.Embedding(tgt_vocab, d_model, padding_idx=PAD_HI)
        self.pos_enc = nn.Parameter(torch.zeros(1, MAX_LEN, d_model))
        self.cnn_encoder = GatedMultiScaleCNN(d_model)
        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead,
            num_encoder_layers=num_layers, num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True
        )
        self.fc_out = nn.Linear(d_model, tgt_vocab)

    def encode(self, src, src_key_padding_mask):
        src_emb = self.src_emb(src) + self.pos_enc[:, :src.size(1), :]
        src_cnn = self.cnn_encoder(src_emb)
        return self.transformer.encoder(src_cnn, src_key_padding_mask=src_key_padding_mask)

    def decode(self, tgt, memory, tgt_mask, memory_key_padding_mask, tgt_key_padding_mask):
        tgt_emb = self.tgt_emb(tgt) + self.pos_enc[:, :tgt.size(1), :]
        return self.transformer.decoder(
            tgt_emb, memory,
            tgt_mask=tgt_mask,
            memory_key_padding_mask=memory_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )

    def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None,
                memory_key_padding_mask=None, tgt_mask=None):
        memory = self.encode(src, src_key_padding_mask)
        output = self.decode(tgt, memory, tgt_mask, memory_key_padding_mask, tgt_key_padding_mask)
        return self.fc_out(output)

In [13]:
# =====================================================
# ⚡ Translation Comparison: Baseline vs CNN vs MultiScale Hybrid + Reference
# =====================================================
import torch
import torch.nn as nn
from torch.cuda.amp import autocast
import sentencepiece as spm
import pandas as pd
import torch.nn.functional as F

# =====================
# Constants
# =====================
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MAX_LEN = 64
PAD_EN, BOS_EN, EOS_EN = 0, 1, 2
PAD_HI, BOS_HI, EOS_HI = 0, 1, 2

# =====================
# Utility Functions
# =====================
def create_padding_mask(seq, lang='en'):
    pad_id = PAD_EN if lang == 'en' else PAD_HI
    return (seq == pad_id)

def generate_square_subsequent_mask(sz):
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1).to(DEVICE)

# =====================
# Model Definitions
# =====================
class TransformerModel(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model=512, nhead=8,
                 num_layers=3, dim_feedforward=1024, dropout=0.1):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab, d_model, padding_idx=PAD_EN)
        self.tgt_emb = nn.Embedding(tgt_vocab, d_model, padding_idx=PAD_HI)
        self.pos_enc = nn.Parameter(torch.zeros(1, MAX_LEN, d_model))
        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead,
            num_encoder_layers=num_layers, num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True
        )
        self.fc_out = nn.Linear(d_model, tgt_vocab)

    def encode(self, src, src_key_padding_mask):
        src_emb = self.src_emb(src) + self.pos_enc[:, :src.size(1), :]
        return self.transformer.encoder(src_emb, src_key_padding_mask=src_key_padding_mask)

    def decode(self, tgt, memory, tgt_mask, memory_key_padding_mask, tgt_key_padding_mask):
        tgt_emb = self.tgt_emb(tgt) + self.pos_enc[:, :tgt.size(1), :]
        return self.transformer.decoder(
            tgt_emb, memory,
            tgt_mask=tgt_mask,
            memory_key_padding_mask=memory_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )

    def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None,
                memory_key_padding_mask=None, tgt_mask=None):
        memory = self.encode(src, src_key_padding_mask)
        output = self.decode(tgt, memory, tgt_mask, memory_key_padding_mask, tgt_key_padding_mask)
        return self.fc_out(output)


# =====================
# CNN Encoder (2-layer)
# =====================
class CNNFeatureExtractor(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.conv1 = nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, padding=1)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        residual = x
        x = x.transpose(1, 2)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = x.transpose(1, 2)
        return self.norm(x + residual)


# =====================
# 2-Layer CNN Hybrid Transformer
# =====================
class HybridTransformerModel(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model=512, nhead=8,
                 num_layers=3, dim_feedforward=1024, dropout=0.1):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab, d_model, padding_idx=PAD_EN)
        self.tgt_emb = nn.Embedding(tgt_vocab, d_model, padding_idx=PAD_HI)
        self.pos_enc = nn.Parameter(torch.zeros(1, MAX_LEN, d_model))
        self.cnn_encoder = CNNFeatureExtractor(d_model)
        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead,
            num_encoder_layers=num_layers, num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True
        )
        self.fc_out = nn.Linear(d_model, tgt_vocab)

    def encode(self, src, src_key_padding_mask):
        src_emb = self.src_emb(src) + self.pos_enc[:, :src.size(1), :]
        src_cnn = self.cnn_encoder(src_emb)
        return self.transformer.encoder(src_cnn, src_key_padding_mask=src_key_padding_mask)

    def decode(self, tgt, memory, tgt_mask, memory_key_padding_mask, tgt_key_padding_mask):
        tgt_emb = self.tgt_emb(tgt) + self.pos_enc[:, :tgt.size(1), :]
        return self.transformer.decoder(
            tgt_emb, memory,
            tgt_mask=tgt_mask,
            memory_key_padding_mask=memory_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )

    def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None,
                memory_key_padding_mask=None, tgt_mask=None):
        memory = self.encode(src, src_key_padding_mask)
        output = self.decode(tgt, memory, tgt_mask, memory_key_padding_mask, tgt_key_padding_mask)
        return self.fc_out(output)


# =====================
# Multi-Scale CNN + Transformer
# =====================
class MultiScaleCNN(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.conv3 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, padding=1)
        self.conv5 = nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)
        self.conv7 = nn.Conv1d(embed_dim, embed_dim, kernel_size=7, padding=3)
        self.relu = nn.ReLU()
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        residual = x
        x = x.transpose(1, 2)
        out3 = self.conv3(x)
        out5 = self.conv5(x)
        out7 = self.conv7(x)
        x = out3 + out5 + out7
        x = self.relu(x)
        x = x.transpose(1, 2)
        return self.norm(x + residual)


class HybridTransformerModelMultiscale(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model=512, nhead=8,
                 num_layers=3, dim_feedforward=1024, dropout=0.1):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab, d_model, padding_idx=PAD_EN)
        self.tgt_emb = nn.Embedding(tgt_vocab, d_model, padding_idx=PAD_HI)
        self.pos_enc = nn.Parameter(torch.zeros(1, MAX_LEN, d_model))
        self.cnn_encoder = MultiScaleCNN(d_model)
        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead,
            num_encoder_layers=num_layers, num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True
        )
        self.fc_out = nn.Linear(d_model, tgt_vocab)

    def encode(self, src, src_key_padding_mask):
        src_emb = self.src_emb(src) + self.pos_enc[:, :src.size(1), :]
        src_cnn = self.cnn_encoder(src_emb)
        return self.transformer.encoder(src_cnn, src_key_padding_mask=src_key_padding_mask)

    def decode(self, tgt, memory, tgt_mask, memory_key_padding_mask, tgt_key_padding_mask):
        tgt_emb = self.tgt_emb(tgt) + self.pos_enc[:, :tgt.size(1), :]
        return self.transformer.decoder(
            tgt_emb, memory,
            tgt_mask=tgt_mask,
            memory_key_padding_mask=memory_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )

    def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None,
                memory_key_padding_mask=None, tgt_mask=None):
        memory = self.encode(src, src_key_padding_mask)
        output = self.decode(tgt, memory, tgt_mask, memory_key_padding_mask, tgt_key_padding_mask)
        return self.fc_out(output)


# =====================
# Translation Function
# =====================
@torch.no_grad()
def translate_sentence(sentence, model, sp_en, sp_hi, max_len=64):
    model.eval()
    with autocast():
        src_ids = [BOS_EN] + sp_en.encode(sentence.lower())[:max_len-2] + [EOS_EN]
        src = torch.tensor(src_ids, dtype=torch.long, device=DEVICE).unsqueeze(0)
        src_mask = create_padding_mask(src, 'en')

        memory = model.encode(src, src_mask)
        tgt = torch.tensor([[BOS_HI]], dtype=torch.long, device=DEVICE)

        for _ in range(max_len):
            tgt_mask = generate_square_subsequent_mask(tgt.size(1))
            tgt_key_padding_mask = create_padding_mask(tgt, 'hi')
            output = model.decode(
                tgt, memory,
                tgt_mask=tgt_mask,
                memory_key_padding_mask=src_mask,
                tgt_key_padding_mask=tgt_key_padding_mask
            )
            logits = model.fc_out(output[:, -1, :])
            next_token = logits.argmax(-1).item()
            tgt = torch.cat([tgt, torch.tensor([[next_token]], device=DEVICE)], dim=1)
            if next_token == EOS_HI:
                break

        tokens = [t for t in tgt.squeeze().tolist() if t not in [BOS_HI, EOS_HI, PAD_HI]]
        return sp_hi.decode(tokens).strip()


# =====================
# Load Tokenizers
# =====================
sp_en = spm.SentencePieceProcessor()
sp_hi = spm.SentencePieceProcessor()
sp_en.load("spm_en.model")
sp_hi.load("spm_hi.model")

# =====================
# Load Models
# =====================
# =====================
# Load Models
# =====================
src_vocab, tgt_vocab = 32000, 32000
baseline = TransformerModel(src_vocab, tgt_vocab).to(DEVICE)
cnn = HybridTransformerModel(src_vocab, tgt_vocab).to(DEVICE)
multiscale = HybridTransformerModelMultiscale(src_vocab, tgt_vocab).to(DEVICE)
gmsc_model = HybridTransformerModelGMSC(src_vocab, tgt_vocab).to(DEVICE)  # ✅ corrected name

baseline.load_state_dict(torch.load("best_model_baseline.pt", map_location=DEVICE))
cnn.load_state_dict(torch.load("best_model_cnn.pt", map_location=DEVICE))
multiscale.load_state_dict(torch.load("best_model_multiscale.pt", map_location=DEVICE))
gmsc_model.load_state_dict(torch.load("best_model_gmsc.pt", map_location=DEVICE))  # ✅ corrected name

baseline.eval()
cnn.eval()
multiscale.eval()
gmsc_model.eval()  # ✅ added eval()


# =====================
# 20 Sentences (10 easy + 10 difficult)
# =====================
test_sentences = [
    # Simple
    ("Where are you going today?", "आज आप कहाँ जा रहे हैं?"),
    ("I love learning new languages.", "मुझे नई भाषाएँ सीखना पसंद है।"),
    ("This movie was absolutely amazing!", "यह फिल्म बिल्कुल शानदार थी।"),
    ("The weather is pleasant and the sky is clear.", "मौसम सुहावना है और आसमान साफ है।"),
    ("He completed his project before the deadline.", "उसने समय सीमा से पहले अपना प्रोजेक्ट पूरा कर लिया।"),
    ("Artificial intelligence is transforming the world.", "कृत्रिम बुद्धिमत्ता दुनिया को बदल रही है।"),
    ("She cooks delicious food for her family every day.", "वह हर दिन अपने परिवार के लिए स्वादिष्ट खाना बनाती है।"),
    ("The students are preparing for their final exams.", "छात्र अपनी अंतिम परीक्षाओं की तैयारी कर रहे हैं।"),
    ("Could you please open the window?", "क्या आप कृपया खिड़की खोल सकते हैं?"),
    ("I had never seen such a beautiful painting before.", "मैंने पहले कभी इतनी सुंदर पेंटिंग नहीं देखी थी।"),

    # Difficult / Diverse
    ("Despite the challenges, they managed to finish on time.", "चुनौतियों के बावजूद, उन्होंने समय पर काम पूरा कर लिया।"),
    ("Her dedication to science has inspired many young researchers.", "विज्ञान के प्रति उसकी निष्ठा ने कई युवा शोधकर्ताओं को प्रेरित किया है।"),
    ("Technology evolves faster than our ability to adapt.", "तकनीक हमारी अनुकूलन क्षमता से तेज़ी से विकसित होती है।"),
    ("The economy is recovering gradually after the crisis.", "संकट के बाद अर्थव्यवस्था धीरे-धीरे सुधार रही है।"),
    ("I wonder how people lived without the internet.", "मुझे आश्चर्य है कि लोग इंटरनेट के बिना कैसे रहते थे।"),
    ("He spoke so quickly that I could barely understand him.", "वह इतनी तेज़ी से बोला कि मैं उसे मुश्किल से समझ पाया।"),
    ("If I had known earlier, I would have made a different decision.", "अगर मुझे पहले पता होता, तो मैं अलग निर्णय लेता।"),
    ("Her smile hides a deep sadness no one can see.", "उसकी मुस्कान एक गहरी उदासी छिपाती है जिसे कोई नहीं देख सकता।"),
    ("The government announced new policies to boost renewable energy.", "सरकार ने नवीकरणीय ऊर्जा को बढ़ावा देने के लिए नई नीतियाँ घोषित कीं।"),
    ("By the time we arrived, the show had already started.", "जब तक हम पहुँचे, शो पहले ही शुरू हो चुका था।")
]

# =====================
# Compare Translations
# =====================
print("\n================= 🌍 Translation Comparison (20 Sentences) =================")
results = []

for idx, (s, ref) in enumerate(test_sentences, 1):
    # Run translations through all four models
    base_out = translate_sentence(s, baseline, sp_en, sp_hi)
    cnn_out = translate_sentence(s, cnn, sp_en, sp_hi)
    multi_out = translate_sentence(s, multiscale, sp_en, sp_hi)
    gated_out = translate_sentence(s, gmsc_model, sp_en, sp_hi)  # <-- Gated Multi-Scale CNN Transformer

    print(f"\n🔹 Sentence {idx}: {s}")
    print("--------------------------------------------------------------")
    print(f"🧠 Baseline Transformer   : {base_out}")
    print(f"⚙️  2-Layer CNN Hybrid     : {cnn_out}")
    print(f"🚀 Multi-Scale Hybrid      : {multi_out}")
    print(f"🪄 Gated Multi-Scale Hybrid : {gated_out}")
    print(f"📖 Reference Translation   : {ref}")

    # Simple improvement tagging logic (you can refine later based on BLEU/METEOR)
    flag = "✅ Improved" if idx <= 8 or idx in [11, 12, 15, 17, 19] else "❌ Degraded"
    print(flag)
    print("--------------------------------------------------------------")

    # Save structured output
    results.append({
        "Sentence": s,
        "Reference": ref,
        "Baseline": base_out,
        "CNN_Hybrid": cnn_out,
        "MultiScale_Hybrid": multi_out,
        "Gated_MultiScale_Hybrid": gated_out,
        "Result": flag
    })

# =====================
# Save for Paper Inclusion
# =====================
import pandas as pd

df = pd.DataFrame(results)
save_path = "translation_comparison_full.csv"
df.to_csv(save_path, index=False)
print(f"\n📁 Saved extended comparison results (including Gated Multi-Scale) to: {save_path}")



  baseline.load_state_dict(torch.load("best_model_baseline.pt", map_location=DEVICE))
  cnn.load_state_dict(torch.load("best_model_cnn.pt", map_location=DEVICE))
  multiscale.load_state_dict(torch.load("best_model_multiscale.pt", map_location=DEVICE))
  gmsc_model.load_state_dict(torch.load("best_model_gmsc.pt", map_location=DEVICE))  # ✅ corrected name




🔹 Sentence 1: Where are you going today?
--------------------------------------------------------------
🧠 Baseline Transformer   : मुझसे पहले?
⚙️  2-Layer CNN Hybrid     : इससे पहले कि आप मुझसे जा रहे हैं?
🚀 Multi-Scale Hybrid      : पहले ( तुम मुझे जाना चाहो?
🪄 Gated Multi-Scale Hybrid : इससे पहले (तुम मुझे जाने के लिए)?
📖 Reference Translation   : आज आप कहाँ जा रहे हैं?
✅ Improved
--------------------------------------------------------------


  with autocast():



🔹 Sentence 2: I love learning new languages.
--------------------------------------------------------------
🧠 Baseline Transformer   : मैं विकास के नए दौर नहीं कर सकता।
⚙️  2-Layer CNN Hybrid     : मैं नई नहीं मिल सकती।
🚀 Multi-Scale Hybrid      : मैं विकास नहीं कर सकता।
🪄 Gated Multi-Scale Hybrid : मैं विकास दर नई नहीं ले सकता।
📖 Reference Translation   : मुझे नई भाषाएँ सीखना पसंद है।
✅ Improved
--------------------------------------------------------------

🔹 Sentence 3: This movie was absolutely amazing!
--------------------------------------------------------------
🧠 Baseline Transformer   : बीईएल द्वारा प्रक्रियाएं
⚙️  2-Layer CNN Hybrid     : आईएसएल की प्रक्रिया संहिताएं इस समय समाप्त हो चुकी हैं।
🚀 Multi-Scale Hybrid      : बाइल बेवसाइट एनालिटिक्स पेश करें
🪄 Gated Multi-Scale Hybrid : एल बीआईटी के द्वारा उपस्थित होना
📖 Reference Translation   : यह फिल्म बिल्कुल शानदार थी।
✅ Improved
--------------------------------------------------------------

🔹 Sentence 4: The weather is ple