In [None]:

!pip -q install -U "transformers>=4.44.0" "datasets>=2.19.0" "peft>=0.10.0" \
               "accelerate>=0.30.0" "evaluate>=0.4.2" "sacrebleu>=2.4.0" \
               "sentencepiece>=0.1.99" "bitsandbytes>=0.43.0"

import os, json, random, unicodedata, math
import numpy as np
import torch
from dataclasses import dataclass
from typing import List, Dict

from datasets import Dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM,
                          DataCollatorForSeq2Seq,
                          Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import evaluate

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# One seed feeds every random number generator the notebook touches.
seed = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(seed)
if device == "cuda":
    torch.cuda.manual_seed_all(seed)

# Language codes passed to the tokenizer for each side of the pair.
LANG_SRC_HI = "hin_Deva"
LANG_TGT_MAG = "mag_Deva"

# Checkpoint that is loaded everywhere, and the token cap used when encoding text.
BASE_MODEL = "facebook/nllb-200-distilled-600M"
MAX_LEN = 192


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:

# Colab-only helper that opens a browser file picker.
from google.colab import files
print("Please choose hi.txt (Hindi lines) and mag.txt (Magahi lines) ...")
# The result is checked below by file name, so both files must be selected in this one dialog.
uploaded = files.upload()

# Fail fast if either side of the parallel corpus was not uploaded under the expected name.
assert 'hi.txt' in uploaded and 'mag.txt' in uploaded, "Please upload hi.txt and mag.txt"

def read_lines(path):
    """Read a UTF-8 text file and return its non-empty lines, cleaned up.

    Each line is stripped of surrounding whitespace and normalised to Unicode
    NFC. Lines that are empty after stripping are dropped, so the position of
    a returned line does not necessarily match its line number in the file.

    Args:
        path: Path of the text file to read.

    Returns:
        A list of non-empty, NFC-normalised strings in file order.
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but also swallows a leading
    # byte-order mark. With plain 'utf-8' the BOM survives strip() (it is not
    # whitespace) and ends up glued to the first sentence of the corpus.
    with open(path, 'r', encoding='utf-8-sig') as f:
        # Iterate the file lazily instead of materialising readlines() first.
        cleaned = (unicodedata.normalize('NFC', raw.strip()) for raw in f)
        return [line for line in cleaned if line]

# Load both sides of the parallel corpus from the files just uploaded.
hi_lines, mag_lines = read_lines('hi.txt'), read_lines('mag.txt')

# NOTE(review): read_lines drops blank lines per file, so equal counts do not
# prove the two files are still aligned line-by-line -- confirm the inputs
# contain no blank lines, or only at matching positions.
assert len(hi_lines) == len(mag_lines), f"Line count mismatch: {len(hi_lines)} vs {len(mag_lines)}"

print(f"Loaded {len(hi_lines)} parallel pairs.")


Please choose hi.txt (Hindi lines) and mag.txt (Magahi lines) ...


Saving hi.txt to hi.txt
Saving mag.txt to mag.txt
Loaded 1000 parallel pairs.


In [None]:

# Build the cleaned list of {"hi", "mag"} pairs: skip empties, skip exact
# duplicate pairs, and keep only pairs whose word-count ratio is plausible.
pairs, seen = [], set()
for hi_raw, mag_raw in zip(hi_lines, mag_lines):
    hi_text = hi_raw.strip()
    mag_text = mag_raw.strip()

    # Either side empty -> nothing to learn from this pair.
    if not (hi_text and mag_text):
        continue

    # Only the first occurrence of an identical (hi, mag) pair is considered.
    pair_key = (hi_text, mag_text)
    if pair_key in seen:
        continue
    seen.add(pair_key)

    # The tiny epsilon keeps the division defined even for zero-word strings.
    word_ratio = (len(hi_text.split()) + 1e-6) / (len(mag_text.split()) + 1e-6)
    if word_ratio < 0.5 or word_ratio > 2.0:
        continue

    pairs.append({"hi": hi_text, "mag": mag_text})

print(f"Kept {len(pairs)} pairs after filtering.")


# ---- Train / dev / test split ----
n = len(pairs)

# Hold out 10% of the corpus (but at least 100 pairs) for each of dev and test.
# The n // 3 cap matters only for small corpora: without it the two fixed
# 100-pair hold-outs could exceed the corpus, leaving a single training pair
# and a truncated test set.
n_dev = min(max(100, round(0.1 * n)), n // 3)
n_test = n_dev
n_train = n - n_dev - n_test

# Shuffle with a private, freshly seeded generator rather than the global one.
# Re-running this cell then always reproduces the same split, no matter how
# many other cells have consumed random numbers in the meantime.
random.Random(seed).shuffle(pairs)
train_pairs = pairs[:n_train]
dev_pairs   = pairs[n_train:n_train + n_dev]
test_pairs  = pairs[n_train + n_dev:]

print(f"Train: {len(train_pairs)} | Dev: {len(dev_pairs)} | Test: {len(test_pairs)}")


Kept 991 pairs after filtering.
Train: 791 | Dev: 100 | Test: 100


In [None]:

# Optional monolingual data for back-translation. This step is best-effort:
# any failure (upload cancelled, not running in Colab, unreadable file) leaves
# both lists empty so the rest of the notebook still runs on gold data only.
mono_mag, mono_hi = [], []
try:
    print("If you have monolingual files, select them now (mono_mag.txt and/or mono_hi.txt).")
    more = files.upload()
    if 'mono_mag.txt' in more:
        mono_mag = read_lines('mono_mag.txt')
    if 'mono_hi.txt' in more:
        mono_hi = read_lines('mono_hi.txt')
except Exception as e:
    # Stay best-effort, but say why the data is missing instead of hiding it.
    mono_mag, mono_hi = [], []
    print(f"Monolingual upload skipped ({type(e).__name__}: {e}); continuing without monolingual data.")

print(f"Monolingual Magahi: {len(mono_mag)} | Monolingual Hindi: {len(mono_hi)}")


If you have monolingual files, select them now (mono_mag.txt and/or mono_hi.txt).


Saving mono_hi.txt to mono_hi.txt
Saving mono_mag.txt to mono_mag.txt
Monolingual Magahi: 15000 | Monolingual Hindi: 15000


In [None]:
# Sanity check: number of monolingual Magahi lines currently in memory.
print(len(mono_mag))

15000


In [None]:

# BitsAndBytesConfig replaces the deprecated `load_in_8bit=` keyword of
# from_pretrained (the deprecation warning shows up in this cell's old output).
from transformers import BitsAndBytesConfig

# 8-bit quantisation is attempted only when a GPU is present.
load_in_8bit = device == "cuda"

tok = AutoTokenizer.from_pretrained(BASE_MODEL)
tok.src_lang = LANG_SRC_HI

base_model = None
if load_in_8bit:
    # Only the quantised attempt is guarded. A failure of the plain load below
    # should surface as-is rather than be reported as an "8-bit" problem and
    # retried with identical arguments.
    try:
        base_model = AutoModelForSeq2SeqLM.from_pretrained(
            BASE_MODEL,
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
            device_map="auto",
        )
    except Exception as e:
        print("8-bit load failed, falling back to full precision.", e)
        # Later cells read this flag to decide how to load the model again.
        load_in_8bit = False

if base_model is None:
    base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)

print("Loaded base model OK.", "8-bit" if load_in_8bit else "fp16/fp32")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Loaded base model OK. 8-bit


In [None]:

import unicodedata
from typing import List, Dict
import torch

def chunk(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[offset:offset + n] for offset in range(0, len(lst), n))

def generate_bt(monolingual_lines: List[str], src_lang_code: str, forced_bos_code: str,
                model, tokenizer, batch_size=16, max_new_tokens=128, num_beams=1,
                max_len=192) -> List[str]:
    """Back-translate monolingual sentences with ``model``.

    Args:
        monolingual_lines: Sentences written in the language ``src_lang_code``.
        src_lang_code: Language code assigned to ``tokenizer.src_lang`` while
            the inputs are encoded.
        forced_bos_code: Language-code token forced as the first generated
            token, i.e. the language to translate into.
        model: Model exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``.
        batch_size: Number of sentences sent to ``generate`` per call.
        max_new_tokens: Generation length limit per sentence.
        num_beams: Beam width passed to ``generate``.
        max_len: Inputs are truncated to this many tokens.

    Returns:
        One stripped, NFC-normalised translation per input line, in order.
        An empty input yields an empty list.

    Raises:
        ValueError: If ``forced_bos_code`` is not a token the tokenizer knows.
    """
    if not monolingual_lines:
        return []

    forced_bos = tokenizer.convert_tokens_to_ids(forced_bos_code)
    # An unknown code maps to the <unk> id; generating with that as forced BOS
    # would silently produce output in an arbitrary language.
    unk_id = getattr(tokenizer, "unk_token_id", None)
    if forced_bos is None or (unk_id is not None and forced_bos == unk_id):
        raise ValueError(f"Unknown target language code: {forced_bos_code!r}")

    model.eval()
    outs = []

    # The tokenizer is shared with the caller, so remember its source language
    # and put it back afterwards instead of leaking the change.
    previous_src_lang = getattr(tokenizer, "src_lang", None)
    tokenizer.src_lang = src_lang_code
    try:
        for batch in chunk(monolingual_lines, batch_size):
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True,
                               max_length=max_len).to(model.device)

            with torch.no_grad():
                gen = model.generate(
                    **inputs,
                    forced_bos_token_id=forced_bos,
                    max_new_tokens=max_new_tokens,
                    num_beams=num_beams
                )

            outs.extend(tokenizer.batch_decode(gen, skip_special_tokens=True))
    finally:
        if previous_src_lang is not None:
            tokenizer.src_lang = previous_src_lang

    return [unicodedata.normalize('NFC', x.strip()) for x in outs]

def simple_pair_filter(src_list: List[str], tgt_list: List[str],
                       min_ratio=0.5, max_ratio=2.0) -> List[Dict[str, str]]:
    """Keep plausible (source, target) pairs and return them as dicts.

    Pairs are walked in lockstep. A pair is discarded when either side is
    empty, when the identical pair was already seen, when source and target
    are the same string, or when the source/target word-count ratio falls
    outside ``[min_ratio, max_ratio]``.

    Returns:
        A list of ``{"src": ..., "tgt": ...}`` dicts in input order.
    """
    kept = []
    visited = set()

    for pair in zip(src_list, tgt_list):
        source, target = pair
        if not (source and target):
            continue

        # A pair counts as seen even if a later check rejects it.
        if pair in visited:
            continue
        visited.add(pair)

        # An unchanged sentence carries no translation signal.
        if source == target:
            continue

        # The epsilon keeps the ratio defined when a side has zero words.
        ratio = (len(source.split()) + 1e-6) / (len(target.split()) + 1e-6)
        if ratio < min_ratio or ratio > max_ratio:
            continue

        kept.append({"src": source, "tgt": target})

    return kept

In [None]:

def _as_src_tgt(pair_list, src_key, tgt_key):
    """Re-key {"hi", "mag"} pairs into the {"src", "tgt"} layout used for training."""
    return [{"src": p[src_key], "tgt": p[tgt_key]} for p in pair_list]


# ---- Gold (human) training pairs, one list per direction ----
gold_hi2mag = _as_src_tgt(train_pairs, "hi", "mag")
gold_mag2hi = _as_src_tgt(train_pairs, "mag", "hi")

# ---- Monolingual data to back-translate ----
# A monolingual sentence identical to a dev/test sentence would put held-out
# text into the training set and inflate the scores, so those are dropped
# before the size cap is applied.
held_out_pairs = dev_pairs + test_pairs
held_out_mag = {p["mag"] for p in held_out_pairs}
held_out_hi = {p["hi"] for p in held_out_pairs}

MAX_MONO = 5000
mono_mag_sm = [s for s in mono_mag if s not in held_out_mag][:MAX_MONO]
mono_hi_sm = [s for s in mono_hi if s not in held_out_hi][:MAX_MONO]

# ---- Back-translation: the real sentence is always the training target ----
synthetic_hi2mag = []
if mono_mag_sm:
    print(f"Back-translating {len(mono_mag_sm)} Magahi sentences into Hindi ...")
    synth_hi = generate_bt(mono_mag_sm, src_lang_code=LANG_TGT_MAG, forced_bos_code=LANG_SRC_HI,
                           model=base_model, tokenizer=tok, batch_size=32)
    synthetic_hi2mag = simple_pair_filter(synth_hi, mono_mag_sm)
    print(f"Synthetic pairs for hi2mag: {len(synthetic_hi2mag)}")

synthetic_mag2hi = []
if mono_hi_sm:
    print(f"Back-translating {len(mono_hi_sm)} Hindi sentences into Magahi ...")
    synth_mag = generate_bt(mono_hi_sm, src_lang_code=LANG_SRC_HI, forced_bos_code=LANG_TGT_MAG,
                            model=base_model, tokenizer=tok, batch_size=32)
    synthetic_mag2hi = simple_pair_filter(synth_mag, mono_hi_sm)
    print(f"Synthetic pairs for mag2hi: {len(synthetic_mag2hi)}")

# ---- Mix gold and synthetic data ----
# Gold pairs are repeated so they are not drowned out by the synthetic ones.
GOLD_UPSAMPLE = 3
gold_hi2mag_up = gold_hi2mag * GOLD_UPSAMPLE
gold_mag2hi_up = gold_mag2hi * GOLD_UPSAMPLE

MAX_SYNTH = 5000
synthetic_hi2mag = synthetic_hi2mag[:MAX_SYNTH]
synthetic_mag2hi = synthetic_mag2hi[:MAX_SYNTH]

train_hi2mag = gold_hi2mag_up + synthetic_hi2mag
train_mag2hi = gold_mag2hi_up + synthetic_mag2hi

print(f"Train hi2mag size: {len(train_hi2mag)}  (gold x{GOLD_UPSAMPLE} + synth {len(synthetic_hi2mag)})")
print(f"Train mag2hi size: {len(train_mag2hi)}  (gold x{GOLD_UPSAMPLE} + synth {len(synthetic_mag2hi)})")

# ---- Held-out sets: gold pairs only, never synthetic ----
dev_hi2mag  = _as_src_tgt(dev_pairs, "hi", "mag")
test_hi2mag = _as_src_tgt(test_pairs, "hi", "mag")

dev_mag2hi  = _as_src_tgt(dev_pairs, "mag", "hi")
test_mag2hi = _as_src_tgt(test_pairs, "mag", "hi")


hello1
hello2
Synthetic pairs for hi2mag: 4211
Synthetic pairs for mag2hi: 4705
Train hi2mag size: 6584  (gold x3 + synth 4211)
Train mag2hi size: 7078  (gold x3 + synth 4705)


In [None]:

def make_hf_dataset(train_list, dev_list, test_list):
    """Wrap three lists of records into a DatasetDict.

    The splits are named "train", "validation" and "test", in that order.
    """
    named_splits = zip(("train", "validation", "test"),
                       (train_list, dev_list, test_list))
    return DatasetDict({name: Dataset.from_list(rows) for name, rows in named_splits})

def preprocess_function(batch, tokenizer=None, src_lang=None, tgt_lang=None, max_length=None):
    """Tokenize a batch of {"src", "tgt"} examples for seq2seq training.

    The previous version read module-level names ``tokenizer`` and ``src_lang``
    that this notebook never defines, so calling it raised NameError. It also
    never set the target language before encoding the labels. Everything it
    needs is now a parameter with a notebook-level default.

    Args:
        batch: Mapping with parallel lists under "src" and "tgt".
        tokenizer: Tokenizer to use; defaults to the notebook-level ``tok``.
        src_lang: Source language code; defaults to ``LANG_SRC_HI``.
        tgt_lang: Target language code; defaults to ``LANG_TGT_MAG``.
        max_length: Truncation length for both sides; defaults to ``MAX_LEN``.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels". Nothing is
        padded here; padding is left to the data collator.
    """
    tokenizer = tok if tokenizer is None else tokenizer
    src_lang = LANG_SRC_HI if src_lang is None else src_lang
    tgt_lang = LANG_TGT_MAG if tgt_lang is None else tgt_lang
    max_length = MAX_LEN if max_length is None else max_length

    # Both language codes must be set before encoding: the tokenizer uses them
    # to pick the language tag for the inputs and for the labels.
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    # `text_target` encodes the targets in target mode and returns them as
    # "labels". The notebook pins transformers>=4.44, which supports it, so the
    # deprecated as_target_tokenizer() fallback is gone.
    encoded = tokenizer(
        batch["src"],
        text_target=batch["tgt"],
        max_length=max_length,
        truncation=True,
        padding=False
    )

    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": encoded["labels"]
    }

# Metric objects are loaded once here and reused by compute_metrics on every evaluation.
metric_chrf = evaluate.load("chrf")
metric_bleu = evaluate.load("sacrebleu")


def build_collator(tokenizer, model):
    """Create the padding collator used for seq2seq batches.

    Args:
        tokenizer: Tokenizer used to pad the inputs.
        model: Model being trained. It is now actually handed to the collator;
            before, the argument was accepted but silently dropped.

    Returns:
        A ``DataCollatorForSeq2Seq`` that pads labels with -100 and, on GPU,
        pads sequence lengths to a multiple of 8.
    """
    return DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        padding=True,
        label_pad_token_id=-100,
        pad_to_multiple_of=8 if device == "cuda" else None,
        return_tensors="pt"
    )
def compute_metrics_builder(tokenizer):
    """Build the ``compute_metrics`` callback for the trainer.

    Args:
        tokenizer: Tokenizer used to decode predicted and reference token ids.

    Returns:
        A function that takes ``(predictions, labels)`` and returns
        ``{"chrf": ..., "sacrebleu": ...}``.
    """
    pad_id = tokenizer.pad_token_id

    def compute_metrics(eval_pred):
        preds, labels = eval_pred
        if isinstance(preds, tuple):
            preds = preds[0]

        # -100 is a loss-masking marker, not a vocabulary id. It can appear in
        # the predictions as well as the labels, and decoding it fails, so both
        # arrays get it swapped for the pad id before decoding.
        preds = np.where(np.asarray(preds) != -100, preds, pad_id)
        labels = np.where(np.asarray(labels) != -100, labels, pad_id)

        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Both metrics take a list of references per prediction.
        refs = [[l] for l in decoded_labels]

        chrf = metric_chrf.compute(predictions=decoded_preds, references=refs)["score"]
        bleu = metric_bleu.compute(predictions=decoded_preds, references=refs)["score"]
        return {"chrf": chrf, "sacrebleu": bleu}
    return compute_metrics


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
def train_direction(
    direction="hi2mag",
    train_data=None,
    dev_data=None,
    test_data=None,
    learning_rate=1e-4,
    epochs=4,
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.05
):
    """Fine-tune a LoRA adapter on top of BASE_MODEL for one direction.

    Args:
        direction: "hi2mag" (Hindi -> Magahi) or "mag2hi" (Magahi -> Hindi).
        train_data: List of {"src", "tgt"} dicts used for training.
        dev_data: List of {"src", "tgt"} dicts evaluated after every epoch;
            the checkpoint with the best chrF on it is kept.
        test_data: List of {"src", "tgt"} dicts scored once after training.
        learning_rate: Optimiser learning rate.
        epochs: Maximum number of training epochs (early stopping may cut
            this short).
        lora_r: LoRA rank.
        lora_alpha: LoRA scaling factor.
        lora_dropout: Dropout applied inside the LoRA layers.

    Returns:
        ``(out_dir, trainer)``: the directory the adapter and tokenizer were
        saved to, and the trainer used.

    Raises:
        AssertionError: If ``direction`` is not one of the two supported values.
        ValueError: If any of the three datasets is missing.
    """
    import inspect
    from transformers import BitsAndBytesConfig

    assert direction in ["hi2mag", "mag2hi"]
    if train_data is None or dev_data is None or test_data is None:
        raise ValueError("train_data, dev_data and test_data are all required")

    src_lang = LANG_SRC_HI if direction == "hi2mag" else LANG_TGT_MAG
    tgt_lang = LANG_TGT_MAG if direction == "hi2mag" else LANG_SRC_HI

    # ---- Base model ----
    # Follow the notebook-level `load_in_8bit` flag (False on CPU or after the
    # earlier 8-bit load failed) instead of always quantising.
    if load_in_8bit:
        bnb_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False
        )
        model = AutoModelForSeq2SeqLM.from_pretrained(
            BASE_MODEL,
            quantization_config=bnb_config,
            device_map="auto"
        )
    else:
        # The Trainer moves an unquantised model to the training device itself.
        model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)
    model.config.use_cache = False

    # ---- Tokenizer and forced target-language token ----
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    tgt_lang_token = tokenizer.convert_tokens_to_ids(tgt_lang)
    model.config.forced_bos_token_id = tgt_lang_token
    # generate() takes its settings from generation_config, so the forced BOS
    # is set there as well; the model config alone may be ignored.
    if getattr(model, "generation_config", None) is not None:
        model.generation_config.forced_bos_token_id = tgt_lang_token

    # ---- LoRA adapter on the attention query/value projections ----
    lora_cfg = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=["q_proj", "v_proj"],
        inference_mode=False,
    )

    model = get_peft_model(model, lora_cfg)
    model.train()

    # ---- Data ----
    ds = make_hf_dataset(train_data, dev_data, test_data)

    def preprocess_function(batch):
        tokenizer.src_lang = src_lang
        tokenizer.tgt_lang = tgt_lang

        # `text_target` replaces the deprecated as_target_tokenizer() context
        # manager and returns the encoded targets under "labels".
        enc = tokenizer(
            batch["src"],
            text_target=batch["tgt"],
            max_length=MAX_LEN,
            truncation=True,
            padding=False,
        )

        return {
            "input_ids": enc["input_ids"],
            "attention_mask": enc["attention_mask"],
            "labels": enc["labels"],
        }

    ds = ds.map(
        preprocess_function,
        batched=True,
        remove_columns=list(ds["train"].column_names),
        desc=f"Tokenizing ({src_lang} → {tgt_lang})",
    )

    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        padding=True,
        label_pad_token_id=-100,
        pad_to_multiple_of=8 if device == "cuda" else None,
        return_tensors="pt",
    )

    compute_metrics = compute_metrics_builder(tokenizer)

    # ---- Training arguments ----
    out_dir = f"nllb_magahi_lora_{direction}"
    args = Seq2SeqTrainingArguments(
        output_dir=out_dir,
        learning_rate=learning_rate,
        auto_find_batch_size=True,
        per_device_eval_batch_size=16,
        num_train_epochs=epochs,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        predict_with_generate=True,
        generation_max_length=128,
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="chrf",
        greater_is_better=True,
        fp16=(device == "cuda"),
        gradient_checkpointing=False,
        label_smoothing_factor=0.0,
        report_to=[],
        remove_unused_columns=True,
    )

    class CustomSeq2SeqTrainer(Seq2SeqTrainer):
        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            # If both decoder inputs are present, keep whichever is actually
            # set so the model is not handed both at once.
            if "decoder_input_ids" in inputs and "decoder_inputs_embeds" in inputs:
                if inputs.get("decoder_input_ids") is not None:
                    inputs.pop("decoder_inputs_embeds", None)
                elif inputs.get("decoder_inputs_embeds") is not None:
                    inputs.pop("decoder_input_ids", None)

            # Older Trainer versions allowed by the notebook's pin have no
            # `num_items_in_batch` parameter, so forward it only when given.
            if num_items_in_batch is None:
                return super().compute_loss(model, inputs, return_outputs)
            return super().compute_loss(
                model, inputs, return_outputs, num_items_in_batch=num_items_in_batch
            )

    # Newer Trainer versions call the tokenizer argument `processing_class`;
    # use whichever name the installed version accepts.
    trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = CustomSeq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=ds["train"],
        eval_dataset=ds["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
        **{tokenizer_kwarg: tokenizer},
    )

    # ---- Train ----
    trainer.train()
    print("Best:", trainer.state.best_metric)

    # ---- Eval & save ----
    # predict() scores the test split without firing the evaluation callbacks.
    # evaluate() with a "test" prefix made the early-stopping callback look for
    # `eval_chrf`, not find it, and print a misleading warning.
    test_metrics = trainer.predict(ds["test"], metric_key_prefix="test").metrics
    print("Test:", test_metrics)

    os.makedirs(out_dir, exist_ok=True)
    trainer.model.save_pretrained(out_dir)
    tokenizer.save_pretrained(out_dir)
    print(f"Saved LoRA adapter & tokenizer to {out_dir}")

    return out_dir, trainer

In [None]:
# Train the Hindi -> Magahi adapter; keeps the output directory and the trainer for later cells.
hi2mag_out, hi2mag_trainer = train_direction(
    direction="hi2mag",
    train_data=train_hi2mag,
    dev_data=dev_hi2mag,
    test_data=test_hi2mag
)


Tokenizing (hin_Deva → mag_Deva):   0%|          | 0/6584 [00:00<?, ? examples/s]



Tokenizing (hin_Deva → mag_Deva):   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing (hin_Deva → mag_Deva):   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = CustomSeq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Chrf,Sacrebleu
1,2.0735,1.585296,61.34744,28.478194
2,2.0596,1.441308,62.125415,29.492565
3,1.9194,1.355984,62.928886,31.49293
4,1.9852,1.33822,63.185123,31.536557




Best: 63.18512315360494




early stopping required metric_for_best_model, but did not find eval_chrf so early stopping is disabled


Test: {'test_loss': 1.268302083015442, 'test_chrf': 65.09623057196747, 'test_sacrebleu': 37.238947632905, 'test_runtime': 37.9164, 'test_samples_per_second': 2.637, 'test_steps_per_second': 0.185, 'epoch': 4.0}
Saved LoRA adapter & tokenizer to nllb_magahi_lora_hi2mag


In [None]:

def load_adapter(direction, adapter_dir):
    """Load BASE_MODEL with a saved LoRA adapter attached, ready for inference.

    Args:
        direction: Translation direction the adapter was trained for. It is
            not used here and is kept only so existing calls keep working.
        adapter_dir: Directory holding the saved adapter and tokenizer.

    Returns:
        ``(model, tokenizer)`` with the model in eval mode.
    """
    from peft import PeftModel
    from transformers import BitsAndBytesConfig

    if load_in_8bit:
        # quantization_config replaces the deprecated `load_in_8bit=` keyword.
        # prepare_model_for_kbit_training is dropped on purpose: it prepares a
        # model for training, and this model is only used for generation.
        model = AutoModelForSeq2SeqLM.from_pretrained(
            BASE_MODEL,
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
            device_map="auto",
        )
    else:
        model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)

    model = PeftModel.from_pretrained(model, adapter_dir)
    model.eval()
    # A stray `hr` token after this call used to make the whole cell a syntax error.
    tokenizer = AutoTokenizer.from_pretrained(adapter_dir)
    return model, tokenizer

def translate_text(texts: List[str], direction="hi2mag", model=None, tokenizer=None, max_new_tokens=128, num_beams=4):
    """Translate sentences in the given direction.

    Args:
        texts: Sentences to translate. A single string is treated as a
            one-sentence list.
        direction: "hi2mag" (Hindi -> Magahi) or "mag2hi" (Magahi -> Hindi).
        model: Model exposing ``device`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``; its ``src_lang`` is set to the
            source language of ``direction``.
        max_new_tokens: Generation length limit per sentence.
        num_beams: Beam width passed to ``generate``.

    Returns:
        One decoded translation per input sentence, in order. An empty input
        yields an empty list.

    Raises:
        AssertionError: If ``direction`` is not one of the two supported values.
        ValueError: If there is text to translate but no model or tokenizer.
    """
    assert direction in ["hi2mag", "mag2hi"]

    # A bare string would otherwise be silently handled as one sentence by the
    # tokenizer while callers expect list semantics; make that explicit.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    # Nothing to translate: skip the tokenizer, which cannot encode an empty batch.
    if not texts:
        return []
    if model is None or tokenizer is None:
        raise ValueError("translate_text requires both a model and a tokenizer")

    src_lang = LANG_SRC_HI if direction == "hi2mag" else LANG_TGT_MAG
    tgt_lang = LANG_TGT_MAG if direction == "hi2mag" else LANG_SRC_HI

    tokenizer.src_lang = src_lang
    # Forcing the target-language token first makes the model write in that language.
    forced_bos = tokenizer.convert_tokens_to_ids(tgt_lang)

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LEN)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        gen = model.generate(
            **inputs,
            forced_bos_token_id=forced_bos,
            num_beams=num_beams,
            max_new_tokens=max_new_tokens
        )

    return tokenizer.batch_decode(gen, skip_special_tokens=True)


In [None]:

# Reload the trained Hindi -> Magahi adapter and translate two sample sentences.
model_hi2mag, tok_hi2mag = load_adapter("hi2mag", hi2mag_out)

samples_hi = [
    "यह निर्णय सार्वजनिक स्वास्थ्य के लिए महत्वपूर्ण है।",
    "कृपया कल सुबह 10 बजे बैठक में शामिल हों।",
]
pred_mag = translate_text(samples_hi, "hi2mag", model_hi2mag, tok_hi2mag)

# Print each source sentence next to its translation, separated by a rule.
separator = "-" * 60
for source_sentence, translation in zip(samples_hi, pred_mag):
    print("HI:", source_sentence)
    print("MAG:", translation)
    print(separator)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


HI: यह निर्णय सार्वजनिक स्वास्थ्य के लिए महत्वपूर्ण है।
MAG: ई फैसला पब्लिक हेल्थ ला महत्वपूर्ण हे।
------------------------------------------------------------
HI: कृपया कल सुबह 10 बजे बैठक में शामिल हों।
MAG: कृप्या कल सबेरे दस बजे बइठक में शामिल हो।
------------------------------------------------------------
