In [None]:
!pip install transformers datasets sacrebleu sentencepiece accelerate evaluate


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used by the following
# cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
# Locations of the parallel Hindi (.hi) / Bhojpuri (.bho) files on the mounted Drive.
# These are example paths: change `dataset_root` (or the individual paths) to match
# your own folder structure.
dataset_root = "/content/drive/MyDrive/dataset"

train_src_path = f"{dataset_root}/train/hi-bho/train.hi"
train_tgt_path = f"{dataset_root}/train/hi-bho/train.bho"
val_src_path = f"{dataset_root}/dev/dev.hi"
val_tgt_path = f"{dataset_root}/dev/dev.bho"


In [None]:
from datasets import Dataset

def load_parallel_data(src_path, tgt_path):
    """Load a line-aligned Hindi/Bhojpuri corpus into a `Dataset`.

    Args:
        src_path: Path to the UTF-8 source file (stored under the "hi" key),
            one sentence per line.
        tgt_path: Path to the UTF-8 target file (stored under the "bho" key);
            line i is paired with line i of `src_path`.

    Returns:
        A `Dataset` with a single "translation" column whose entries are
        {"hi": <source sentence>, "bho": <target sentence>} dicts, with
        surrounding whitespace stripped from every sentence.

    Raises:
        ValueError: If the two files do not contain the same number of lines.
    """
    with open(src_path, "r", encoding="utf-8") as f_src, open(tgt_path, "r", encoding="utf-8") as f_tgt:
        src_lines = [line.strip() for line in f_src]
        tgt_lines = [line.strip() for line in f_tgt]

    # zip() would silently drop the extra lines of the longer file, which hides a
    # misaligned corpus; fail loudly instead.
    if len(src_lines) != len(tgt_lines):
        raise ValueError(
            f"Parallel files are not aligned: {src_path} has {len(src_lines)} lines "
            f"but {tgt_path} has {len(tgt_lines)} lines"
        )

    return Dataset.from_dict({"translation": [{"hi": s, "bho": t} for s, t in zip(src_lines, tgt_lines)]})

# Build the training and validation datasets from the two parallel file pairs.
train_dataset = load_parallel_data(train_src_path, train_tgt_path)
val_dataset = load_parallel_data(val_src_path, val_tgt_path)

# Sanity check: show the first training pair and the size of each split.
print(train_dataset[0])
print(f"Train size: {len(train_dataset)} | Validation size: {len(val_dataset)}")


{'translation': {'bho': 'जरूर दालि में कुछ करिया बा।', 'hi': 'जरुर दल में कुछ काला है'}}
Train size: 15391 | Validation size: 810


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Pretrained checkpoint that is fine-tuned further down in this notebook.
model_name = "facebook/nllb-200-distilled-600M"

# The tokenizer is configured with the source/target language codes for this task
# (Hindi source, Bhojpuri target, matching the "hi"/"bho" keys of the dataset).
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="hin_Deva", tgt_lang="bho_Deva")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: A batch from the dataset; `examples["translation"]` is a list
            of {"hi": ..., "bho": ...} dicts.

    Returns:
        The tokenizer output for the Hindi inputs with the Bhojpuri sentences
        passed as `text_target`, truncated and padded to 128 tokens.
    """
    # NOTE(review): with padding="max_length" the target side is padded here as
    # well, presumably with the pad token id rather than -100, so padding positions
    # may count toward the loss - confirm. If this is changed so labels carry -100,
    # compute_metrics must mask -100 in the labels before decoding them.
    pairs = examples["translation"]
    source_texts = [pair["hi"] for pair in pairs]
    target_texts = [pair["bho"] for pair in pairs]
    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

# Tokenize both splits; batched=True hands preprocess_function a batch of
# examples per call instead of one example at a time.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/15391 [00:00<?, ? examples/s]

Map:   0%|          | 0/810 [00:00<?, ? examples/s]

In [None]:
import evaluate
import numpy as np

# sacreBLEU metric object used by compute_metrics.
bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    """Decode predicted and reference token ids and score them with sacreBLEU.

    Args:
        eval_preds: A (predictions, labels) pair of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the token ids.

    Returns:
        A dict {"bleu": <corpus-level sacreBLEU score>}.
    """
    preds, labels = eval_preds
    # Keep only the token ids if the predictions come wrapped in a tuple.
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 is the "ignore" sentinel, not a vocabulary id, so it cannot be decoded.
    # Replace it with the pad id (dropped by skip_special_tokens) in BOTH arrays;
    # previously only the predictions were cleaned, so labels containing -100
    # would have failed in batch_decode.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # sacreBLEU expects a list of references per prediction, hence the [[...]] shape.
    bleu_score = bleu.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])
    return {"bleu": bleu_score["score"]}


Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import os


# Opt out of Weights & Biases logging via its environment switch
# (report_to="none" below likewise requests no reporting integrations).
os.environ["WANDB_DISABLED"] = "true"

batch_size = 4
args = Seq2SeqTrainingArguments(
    # --- Output / checkpointing ---
    output_dir="./results",
    save_strategy="epoch",
    save_total_limit=1,
    # --- Logging / evaluation ---
    # No evaluation during training; trainer.evaluate() is called explicitly later.
    eval_strategy="no",
    logging_steps=200,
    report_to="none",
    predict_with_generate=True,
    # --- Batching ---
    # 4 sequences per device x 4 accumulation steps per optimizer update.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    dataloader_num_workers=2,
    # --- Optimisation ---
    num_train_epochs=1,
    learning_rate=3e-5,
    weight_decay=0.01,
    optim="adafactor",
    max_grad_norm=0.3,
    # --- Precision / memory ---
    fp16=True,
    gradient_checkpointing=True,
)

# Collator that assembles tokenized examples into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # The `tokenizer=` keyword of the trainer is deprecated in favour of
    # `processing_class=`; pass the tokenizer under the new name.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune for the number of epochs configured in `args`.
trainer.train()

  trainer = Seq2SeqTrainer(
`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`...


Step,Training Loss
200,6.2699
400,2.5722
600,0.8556
800,0.4685




TrainOutput(global_step=962, training_loss=2.1861517226125993, metrics={'train_runtime': 1492.5723, 'train_samples_per_second': 10.312, 'train_steps_per_second': 0.645, 'total_flos': 4169238366388224.0, 'train_loss': 2.1861517226125993, 'epoch': 1.0})

In [None]:
# Evaluate the fine-tuned model on the eval dataset given to the trainer
# (tokenized_val); the result includes the BLEU value from compute_metrics.
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.4149027168750763, 'eval_bleu': 26.307251856762022, 'eval_runtime': 263.7891, 'eval_samples_per_second': 3.071, 'eval_steps_per_second': 0.77, 'epoch': 1.0}


In [None]:
# Reload the original, not fine-tuned checkpoint to measure a baseline.
# NOTE: this rebinds `model` and `tokenizer`, so from here on those names refer
# to the baseline, not to the fine-tuned model held by `trainer`.
model_name = "facebook/nllb-200-distilled-600M"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda")
# Configure the same language codes as for fine-tuning. Without src_lang the
# NLLB tokenizer falls back to its default source language, so the Hindi inputs
# would be tagged with the wrong language and the baseline would be mis-measured.
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="hin_Deva", tgt_lang="bho_Deva")

In [None]:

# Dev-set files used for the baseline comparison.
src_file = "/content/drive/MyDrive/dataset/dev/dev.hi"
tgt_file = "/content/drive/MyDrive/dataset/dev/dev.bho"

# Only the first `subset_size` sentence pairs are translated and scored.
subset_size = 200

with open(src_file, "r", encoding="utf-8") as src_handle:
    src_sentences = [raw.strip() for raw in src_handle][:subset_size]

with open(tgt_file, "r", encoding="utf-8") as ref_handle:
    ref_sentences = [raw.strip() for raw in ref_handle][:subset_size]


In [None]:
# NLLB needs to be told both languages explicitly: the tokenizer tags the input
# with `src_lang`, and the target language is selected by forcing its language
# code as the first generated token. Without these the model is neither told the
# input is Hindi nor asked to produce Bhojpuri.
tokenizer.src_lang = "hin_Deva"
bho_token_id = tokenizer.convert_tokens_to_ids("bho_Deva")

preds = []
for text in src_sentences:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to("cuda")
    outputs = model.generate(
        **inputs,
        forced_bos_token_id=bho_token_id,
        num_beams=5,
        max_length=128,
    )
    # One input sentence per call, so the single best hypothesis is outputs[0].
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    preds.append(translation)




In [None]:
from sacrebleu import corpus_bleu

# Store the score under its own name. `bleu` already refers to the metric object
# that compute_metrics calls; overwriting it with a score object would make any
# later trainer.evaluate() fail.
baseline_bleu = corpus_bleu(preds, [ref_sentences])
print(f"Baseline BLEU (original NLLB model): {baseline_bleu.score:.2f}")

Baseline BLEU (original NLLB model): 7.63
