In [18]:
# Installing Dependencies
!pip install -q transformers peft accelerate datasets sacrebleu sentencepiece

import torch, os, re
from transformers import (
    AutoModelForSeq2SeqLM, AutoTokenizer,
    Seq2SeqTrainer, Seq2SeqTrainingArguments
)
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset
from sacrebleu import corpus_bleu
from google.colab import files

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Running on:", device)

Running on: cuda


In [2]:
# Uploading parallel data files.

print("Please upload your bilingual files (e.g., train.en, train.as, valid.en, valid.as)")
uploaded = files.upload()

# Detect English↔Khasi file pairs automatically
all_files = list(uploaded.keys())
print("\nUploaded files:", all_files)

def find_file(pattern):
    for f in all_files:
        if re.search(pattern, f, re.IGNORECASE):
            return f
    return None

train_en = find_file(r"train.*\.en")
train_as = find_file(r"train.*\.(as|asm|txt)")
valid_en = find_file(r"valid.*\.en")
valid_as = find_file(r"valid.*\.(as|asm|txt)")

if not (train_en and train_as):
    raise ValueError("Could not find both English and assamese training files!")

Please upload your bilingual files (e.g., train.en, train.as, valid.en, valid.as)


Saving valid.en.txt to valid.en.txt
Saving valid.as.txt to valid.as.txt
Saving train.en.txt to train.en.txt
Saving train.as.txt to train.as.txt

Uploaded files: ['valid.en.txt', 'valid.as.txt', 'train.en.txt', 'train.as.txt']


In [3]:
# Reading data

def read_parallel(src, tgt, limit=None):
    with open(src, "r", encoding="utf-8") as f1, open(tgt, "r", encoding="utf-8") as f2:
        src_lines = [l.strip() for l in f1]
        tgt_lines = [l.strip() for l in f2]
    if limit:
        src_lines, tgt_lines = src_lines[:limit], tgt_lines[:limit]
    return src_lines, tgt_lines

src_train, tgt_train = read_parallel(train_en, train_as, limit=500)

if valid_en and valid_as:
    src_valid, tgt_valid = read_parallel(valid_en, valid_as, limit=250)
else:
    split = int(0.9 * len(src_train))
    src_valid, tgt_valid = src_train[split:], tgt_train[split:]
    src_train, tgt_train = src_train[:split], tgt_train[:split]

print(f"\nLoaded {len(src_train)} training and {len(src_valid)} validation pairs.\n")
print(" Sample data preview:")
for i in range(3):
    print(f"EN: {src_train[i]}\nASM: {tgt_train[i]}\n")


Loaded 500 training and 250 validation pairs.

 Sample data preview:
EN: Similarly there is a great need for value addition of soil .
ASM: Similarly there is a great need for value addition of soil .

EN: Lemon tanga also has a good market for sorbet pickles etc .
ASM: Lemon tanga also has a good market for sorbet pickles etc .

EN: These crops are very much available in Assam but Assamese farmers have not been benefited due to lack of value addition .
ASM: These crops are very much available in Assam but Assamese farmers have not been benefited due to lack of value addition .



In [4]:
# Creating dataset

train_ds = Dataset.from_dict({"src_text": src_train, "tgt_text": tgt_train})
valid_ds = Dataset.from_dict({"src_text": src_valid, "tgt_text": tgt_valid})


In [5]:
# Load Open Multilingual Model (NLLB-200)

model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Define closest available language codes
src_lang = "eng_Latn"
tgt_lang = "asm-Beng"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [6]:
# APPLY LORA (lightweight fine-tuning)

lora_cfg = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "fc1", "fc2"  # fully connected layers in Transformer blocks
    ]
)
model = get_peft_model(model, lora_cfg)


In [7]:
# Tokenization

def preprocess(batch):
    src = [f"{src_lang} {t}" for t in batch["src_text"]]
    tgt = [f"{tgt_lang} {t}" for t in batch["tgt_text"]]
    model_inputs = tokenizer(src, truncation=True, padding="max_length", max_length=128)
    labels = tokenizer(tgt, truncation=True, padding="max_length", max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_ds = train_ds.map(preprocess, batched=True)
valid_ds = valid_ds.map(preprocess, batched=True)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [8]:
# Traning Configuration
args = Seq2SeqTrainingArguments(
    output_dir="./checkpoints_asm",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    num_train_epochs=3,
    save_total_limit=1,
    fp16=torch.cuda.is_available(),
    logging_steps=20,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    tokenizer=tokenizer
)

  trainer = Seq2SeqTrainer(


In [9]:
# Training LoRA Model
trainer.train()


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maadij-mca24[0m ([33maadij-mca24-national-institute-of-technology-patna[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
20,11.0263
40,10.5314
60,9.9117
80,9.4687
100,9.0757
120,8.3833
140,7.9274
160,7.6204
180,7.5777


TrainOutput(global_step=189, training_loss=8.981181593799086, metrics={'train_runtime': 110.2336, 'train_samples_per_second': 13.607, 'train_steps_per_second': 1.715, 'total_flos': 410635468800000.0, 'train_loss': 8.981181593799086, 'epoch': 3.0})

In [10]:
# Save The LoRA-Adapted Model

save_dir = "./khasi_lora_model"

# Save adapter + tokenizer
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"LoRA fine-tuned model saved to: {save_dir}")


LoRA fine-tuned model saved to: ./khasi_lora_model


In [19]:
#BLEU Score Evaluation

from google.colab import files
from sacrebleu import corpus_bleu

print("Please upload test source (English) and reference (assamese) files:")
uploaded = files.upload()

all_files = list(uploaded.keys())
print("\nUploaded files:", all_files)

def find_file(pattern):
    for f in all_files:
        if re.search(pattern, f, re.IGNORECASE):
            return f
    return None
# Updated regex to handle potential ' (number).txt' suffixes added by Colab
test_en = find_file(r"test\.en(\s\(\d+\))?\.txt$")
test_kha = find_file(r"test\.(asm|as|txt)(\s\(\d+\))?\.txt$")

# Assign src_file and ref_file here, before their first use
src_file =test_en
ref_file=test_kha

print(f"\nUsing Source file: {src_file}")
print(f"Using Reference file: {ref_file}")




# READ DATA

with open(src_file, "r", encoding="utf-8") as f:
    src_sentences = [line.strip() for line in f if line.strip()]

with open(ref_file, "r", encoding="utf-8") as f:
    ref_sentences = [line.strip() for line in f if line.strip()]

print(f"\nLoaded {len(src_sentences)} source and {len(ref_sentences)} reference lines.")

# Ensure lengths match
if len(src_sentences) != len(ref_sentences):
    min_len = min(len(src_sentences), len(ref_sentences))
    print(f"Trimming to {min_len} lines for comparison.")
    src_sentences = src_sentences[:min_len]
    ref_sentences = ref_sentences[:min_len]

#GENERATE TRANSLATIONS USING TRAINED MODEL

print("\nGenerating translations...")
forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
translations = []

for i, text in enumerate(src_sentences):
    inputs = tokenizer(f"{src_lang} {text}", return_tensors="pt", truncation=True,
                       padding=True, max_length=128).to(device)
    outputs = model.generate(
        **inputs,
        forced_bos_token_id=forced_bos_token_id,
        num_beams=4,
        max_length=128
    )
    trans = tokenizer.decode(outputs[0], skip_special_tokens=True)
    translations.append(trans)
    if (i + 1) % 20 == 0:
        print(f"Translated {i+1}/{len(src_sentences)} sentences...")

# SAVE TRANSLATIONS
output_file = "translations_asm.txt"
with open(output_file, "w", encoding="utf-8") as f:
    for line in translations:
        f.write(line + "\n")

print(f"\nTranslations saved to {output_file}")

# COMPUTE BLEU SCORE

refs = [ref_sentences]
hyps = translations

bleu = corpus_bleu(hyps, refs)
print(f"\nBLEU Score: {bleu.score:.2f}")

# Show few examples

print("\nSample Comparison (first 5 lines):")
for i in range(min(5, len(hyps))):
    print(f"\nEN: {src_sentences[i]}")
    print(f"REF: {ref_sentences[i]}")
    print(f"HYP: {hyps[i]}")

Please upload test source (English) and reference (assamese) files:


Saving test.en.txt to test.en.txt
Saving test.as.txt to test.as.txt

Uploaded files: ['test.en.txt', 'test.as.txt']

Using Source file: test.en.txt
Using Reference file: test.as.txt

Loaded 250 source and 250 reference lines.

Generating translations...
Translated 20/250 sentences...
Translated 40/250 sentences...
Translated 60/250 sentences...
Translated 80/250 sentences...
Translated 100/250 sentences...
Translated 120/250 sentences...
Translated 140/250 sentences...
Translated 160/250 sentences...
Translated 180/250 sentences...
Translated 200/250 sentences...
Translated 220/250 sentences...
Translated 240/250 sentences...





Translations saved to translations_asm.txt

BLEU Score: 0.14

Sample Comparison (first 5 lines):

EN: Assam children join Delhi Dynamos FC strong defender Gaurav Barai .
REF: দিল্লী ডায়নামোছ এফ চিত যোগ দিলে অসম সন্তান শক্তিশালী ডিফেণ্ডাৰ গৌৰৱ বৰাই ।
HYP: Assam children join Delhi Dynamos FC బలమైన డిఫెండర్ Gaurav Barai .

EN: I find no merits in the petition the same is accordingly dismissed .
REF: মই আৱেদনখনত কোনো গুণাগুণ বিচাৰি নাপাওঁ সেই অনুসৰি ইয়াক খাৰিজ কৰা হৈছে ।
HYP: I find no merits in the petition the same is accordingly dismissed

EN: In our opinion he took a correct view in the matter .
REF: আমাৰ মতে তেওঁ বিষয়টোত সঠিক দৃষ্টিভংগী লৈছিল ।
HYP: In our opinion he took a correct view in the matter .

EN: And it was about the sixth hour , and there was a darkness over all the earth until the ninth hour .
REF: তেতিয়া বাৰ মান বজাৰ সময় হ ল আৰু তিনি বজালৈকে গোটেইখন দেশৰ ওপৰত আন্ধাৰ হল ;
HYP: And it was about the sixth hour , and there was darkness over all the earth until the nint