In [None]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

CUDA available: True
GPU: Tesla T4


In [None]:
!pip install -q transformers datasets sentencepiece sacrebleu accelerate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/100.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    MarianMTModel, MarianTokenizer,
    Trainer, TrainingArguments,
    DataCollatorForSeq2Seq
)
import sacrebleu

In [None]:
dataset = load_dataset("ai4bharat/samanantar", "hi")

# Reduce size for feasibility (paper-friendly)
dataset = dataset["train"].shuffle(seed=42).select(range(10000))
dataset = dataset.train_test_split(test_size=0.2)

dataset

README.md: 0.00B [00:00, ?B/s]

hi/train-00000-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

hi/train-00001-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

hi/train-00002-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

hi/train-00003-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

hi/train-00004-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

hi/train-00005-of-00008.parquet:   0%|          | 0.00/239M [00:00<?, ?B/s]

hi/train-00006-of-00008.parquet:   0%|          | 0.00/239M [00:00<?, ?B/s]

hi/train-00007-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10125706 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['idx', 'src', 'tgt'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['idx', 'src', 'tgt'],
        num_rows: 2000
    })
})

In [None]:
#Load Marian MT
model_name = "Helsinki-NLP/opus-mt-en-hi"

tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

model.to("cuda")

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(61950, 512, padding_idx=61949)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(61950, 512, padding_idx=61949)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [None]:
# Tokenization
MAX_LEN = 128

def preprocess(batch):
    inputs = tokenizer(
        batch["src"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["tgt"],
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN
        )

    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names
)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]



Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
training_args = TrainingArguments(
    output_dir="./marian_en_hi",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    fp16=True,
    save_strategy="no",
    logging_steps=100,
    report_to="none"
)

In [None]:
# Trainer setup
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(


In [None]:
# train
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Step,Training Loss
100,1.2408
200,0.7772
300,0.7306
400,0.6771
500,0.6814
600,0.6225
700,0.6536


TrainOutput(global_step=750, training_loss=0.7596949106852213, metrics={'train_runtime': 190.1351, 'train_samples_per_second': 126.226, 'train_steps_per_second': 3.945, 'total_flos': 813560758272000.0, 'train_loss': 0.7596949106852213, 'epoch': 3.0})

In [None]:
# bleu computation
def compute_bleu(model, tokenizer, dataset, sample_size=200):
    model.eval()
    preds, refs = [], []

    sample = dataset.shuffle(seed=42).select(range(sample_size))

    for ex in sample:
        inputs = tokenizer(
            ex["src"],
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LEN
        ).to(model.device)

        with torch.no_grad():
            generated = model.generate(**inputs, max_length=MAX_LEN)

        pred = tokenizer.decode(generated[0], skip_special_tokens=True)
        preds.append(pred)
        refs.append([ex["tgt"]])

    return sacrebleu.corpus_bleu(preds, refs).score


bleu_marian = compute_bleu(model, tokenizer, dataset["test"])
print("MarianMT BLEU Score:", bleu_marian)

MarianMT BLEU Score: 10.629422065531916


In [None]:
#IEEE table
results_df = {
    "Model": ["MarianMT (Transformer)"],
    "Dataset": ["Samanantar"],
    "Language Pair": ["EN-HI"],
    "BLEU Score": [bleu_marian]
}

results_df

{'Model': ['MarianMT (Transformer)'],
 'Dataset': ['Samanantar'],
 'Language Pair': ['EN-HI'],
 'BLEU Score': [10.629422065531916]}

In [None]:
# mBART MODEL

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model_name = "facebook/mbart-large-50-many-to-many-mmt"

tokenizer_mbart = MBart50TokenizerFast.from_pretrained(
    model_name,
    src_lang="en_XX",
    tgt_lang="hi_IN"
)

model_mbart = MBartForConditionalGeneration.from_pretrained(model_name)
model_mbart.to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        

In [None]:
# tokenize MBart
MAX_LEN = 128

def preprocess_mbart(batch):
    inputs = tokenizer_mbart(
        batch["src"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN
    )

    with tokenizer_mbart.as_target_tokenizer():
        labels = tokenizer_mbart(
            batch["tgt"],
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN
        )

    inputs["labels"] = labels["input_ids"]
    return inputs


tokenized_dataset_mbart = dataset.map(
    preprocess_mbart,
    batched=True,
    remove_columns=dataset["train"].column_names
)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]



Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
#training args
training_args_mbart = TrainingArguments(
    output_dir="./mbart_en_hi",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    fp16=True,
    save_strategy="no",
    logging_steps=100,
    report_to="none"
)

In [None]:
#trainer setup
data_collator_mbart = DataCollatorForSeq2Seq(
    tokenizer=tokenizer_mbart,
    model=model_mbart
)

trainer_mbart = Trainer(
    model=model_mbart,
    args=training_args_mbart,
    train_dataset=tokenized_dataset_mbart["train"],
    tokenizer=tokenizer_mbart,
    data_collator=data_collator_mbart
)

  trainer_mbart = Trainer(


In [None]:
#train
trainer_mbart.train()

Step,Training Loss
100,2.5741
200,0.5165
300,0.5121
400,0.4826
500,0.472
600,0.3289
700,0.3305
800,0.3342
900,0.3236
1000,0.3445


TrainOutput(global_step=1500, training_loss=0.4933709945678711, metrics={'train_runtime': 1308.5994, 'train_samples_per_second': 18.34, 'train_steps_per_second': 1.146, 'total_flos': 6501389303808000.0, 'train_loss': 0.4933709945678711, 'epoch': 3.0})

In [None]:
#Bleu eval
def compute_bleu_mbart(model, tokenizer, dataset, sample_size=200):
    model.eval()
    preds, refs = [], []

    sample = dataset.shuffle(seed=42).select(range(sample_size))

    for ex in sample:
        inputs = tokenizer(
            ex["src"],
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LEN
        ).to(model.device)

        with torch.no_grad():
            generated = model.generate(
                **inputs,
                max_length=MAX_LEN,
                num_beams=4
            )

        pred = tokenizer.decode(generated[0], skip_special_tokens=True)
        preds.append(pred)
        refs.append([ex["tgt"]])

    return sacrebleu.corpus_bleu(preds, refs).score


bleu_mbart = compute_bleu_mbart(model_mbart, tokenizer_mbart, dataset["test"])
print("mBART BLEU Score:", bleu_mbart)

mBART BLEU Score: 11.328360454400997


In [None]:
#bleu tabulation
final_results = {
    "Model": ["LSTM Seq2Seq", "MarianMT", "mBART"],
    "Framework": ["TensorFlow", "Transformer", "Transformer"],
    "BLEU Score": [0.33, 10.63, bleu_mbart]
}

final_results

{'Model': ['LSTM Seq2Seq', 'MarianMT', 'mBART'],
 'Framework': ['TensorFlow', 'Transformer', 'Transformer'],
 'BLEU Score': [0.33, 10.63, 11.328360454400997]}