7. Code And Output

In [None]:
# Installing Dependencies
!pip install -q transformers peft accelerate datasets sacrebleu sentencepiece

import torch, os, re
from transformers import (
    AutoModelForSeq2SeqLM, AutoTokenizer,
    Seq2SeqTrainer, Seq2SeqTrainingArguments
)
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset
from sacrebleu import corpus_bleu
from google.colab import files

# Follow the actual hardware instead of hard-coding "cpu": later cells enable
# fp16 whenever CUDA is available and move inference inputs to `device`, so the
# two must agree.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Running on:", device)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hRunning on: cpu


In [None]:
# Ask the user for the parallel corpus files and record the uploaded names.

print("Please upload your bilingual files (e.g., train.en, train.as, valid.en, valid.as)")
uploaded = files.upload()

# Detect English↔Assamese file pairs automatically
all_files = [*uploaded.keys()]
print("\nUploaded files:", all_files)

def find_file(pattern):
    """Return the first name in ``all_files`` matching ``pattern``, else ``None``.

    ``pattern`` is a regular expression searched anywhere in each name,
    ignoring case; names are tried in the order they appear in ``all_files``.
    """
    hits = (name for name in all_files if re.search(pattern, name, re.IGNORECASE))
    return next(hits, None)

# The Assamese patterns deliberately have no bare ".txt" alternative: names such
# as "train.en.txt" also end in ".txt", so that alternative could return the
# English file as the Assamese side depending on upload order.
train_en = find_file(r"train.*\.en")
train_as = find_file(r"train.*\.(as|asm)")
valid_en = find_file(r"valid.*\.en")
valid_as = find_file(r"valid.*\.(as|asm)")

if not (train_en and train_as):
    raise ValueError("Could not find both English and assamese training files!")

# Guard against one file being selected for both sides of a pair.
if train_en == train_as or (valid_en and valid_en == valid_as):
    raise ValueError("The same file was detected as both the English and the Assamese side!")

Please upload your bilingual files (e.g., train.en, train.as, valid.en, valid.as)


Saving train.as.txt to train.as.txt
Saving train.en.txt to train.en.txt
Saving valid.as.txt to valid.as.txt
Saving valid.en.txt to valid.en.txt

Uploaded files: ['train.as.txt', 'train.en.txt', 'valid.as.txt', 'valid.en.txt']


In [None]:
# Reading data

def read_parallel(src, tgt, limit=None):
    """Read two line-aligned UTF-8 text files into parallel lists.

    Args:
        src: Path of the source-side file.
        tgt: Path of the target-side file; line i is paired with line i of ``src``.
        limit: Optional maximum number of pairs to keep. A falsy value
            (``None`` or ``0``) keeps every pair.

    Returns:
        A ``(src_lines, tgt_lines)`` tuple of equally long lists whose entries
        are the whitespace-stripped lines of each file.

    Raises:
        ValueError: If the two files do not have the same number of lines.
            Pairing them anyway would silently misalign the corpus.
    """
    with open(src, "r", encoding="utf-8") as f1, open(tgt, "r", encoding="utf-8") as f2:
        src_lines = [line.strip() for line in f1]
        tgt_lines = [line.strip() for line in f2]

    if len(src_lines) != len(tgt_lines):
        raise ValueError(
            f"Parallel files are not aligned: {src} has {len(src_lines)} lines "
            f"but {tgt} has {len(tgt_lines)} lines."
        )

    if limit:
        src_lines, tgt_lines = src_lines[:limit], tgt_lines[:limit]
    return src_lines, tgt_lines

# Subset sizes and preview length (values carried over from the original cell).
TRAIN_LIMIT = 500
VALID_LIMIT = 250
PREVIEW_COUNT = 3

src_train, tgt_train = read_parallel(train_en, train_as, limit=TRAIN_LIMIT)

if valid_en and valid_as:
    src_valid, tgt_valid = read_parallel(valid_en, valid_as, limit=VALID_LIMIT)
else:
    # No validation files were uploaded: hold out the last 10% of the training pairs.
    split = int(0.9 * len(src_train))
    src_valid, tgt_valid = src_train[split:], tgt_train[split:]
    src_train, tgt_train = src_train[:split], tgt_train[:split]

print(f"\nLoaded {len(src_train)} training and {len(src_valid)} validation pairs.\n")
print(" Sample data preview:")
# Slicing (rather than indexing 0..2) keeps the preview safe when fewer than
# PREVIEW_COUNT pairs were loaded.
for en_line, asm_line in zip(src_train[:PREVIEW_COUNT], tgt_train[:PREVIEW_COUNT]):
    print(f"EN: {en_line}\nASM: {asm_line}\n")


Loaded 500 training and 250 validation pairs.

 Sample data preview:
EN: Similarly there is a great need for value addition of soil .
ASM: ঠিক একেধৰণে মাটিকঁঠালৰ মূল্য সংযোজনৰ যথেষ্ট আৱশ্যক আছে ।

EN: Lemon tanga also has a good market for sorbet pickles etc .
ASM: নেমুটেঙাৰ চৰবত আচাৰ আদিৰো ভাল বজাৰ আছে ।

EN: These crops are very much available in Assam but Assamese farmers have not been benefited due to lack of value addition .
ASM: এই শস্যকেইবিধ অসমত যথেষ্ট হয় কিন্তু মূল্য সংযোজনৰ অভাৱত অসমীয়া কৃষকসকল লাভৱান হ’ব পৰা নাই ।



In [None]:
# Creating dataset

# Wrap each (source, target) list pair in a Dataset with "src_text"/"tgt_text" columns.
train_ds, valid_ds = (
    Dataset.from_dict({"src_text": sources, "tgt_text": targets})
    for sources, targets in ((src_train, tgt_train), (src_valid, tgt_valid))
)


In [None]:
# Load Open Multilingual Model (NLLB-200)

model_name = "facebook/nllb-200-distilled-600M"

# NLLB language codes are written "<language>_<Script>" with an underscore.
# The previous "asm-Beng" (hyphen) is not a token in the vocabulary, so looking
# it up for forced_bos_token_id yields the unknown-token id instead of Assamese.
src_lang = "eng_Latn"
tgt_lang = "asm_Beng"

# Telling the tokenizer both languages lets it add the language tokens itself.
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang=src_lang, tgt_lang=tgt_lang)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
# APPLY LORA (lightweight fine-tuning)

# q/k/v/o projections plus fc1/fc2 (the fully connected layers in Transformer blocks).
lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj", "fc1", "fc2"]

lora_cfg = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=lora_targets,
)

# Wrap the base model so that LoRA adapters are attached to the listed modules.
model = get_peft_model(model, lora_cfg)


In [None]:
# Tokenization

def preprocess(batch, max_length=128):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        batch: Mapping with ``"src_text"`` and ``"tgt_text"`` lists of strings.
        max_length: Length every source and label sequence is truncated or
            padded to.

    Returns:
        The tokenizer output for the source texts with an added ``"labels"``
        entry holding the target token ids, where padding positions are
        replaced by -100.
    """
    # Language codes are passed through the tokenizer's language settings rather
    # than pasted in front of the text, where they would become ordinary content.
    # NLLB codes use an underscore; tolerate a hyphenated spelling.
    tokenizer.src_lang = src_lang.replace("-", "_")
    tokenizer.tgt_lang = tgt_lang.replace("-", "_")

    model_inputs = tokenizer(
        batch["src_text"],
        text_target=batch["tgt_text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # -100 is the index the loss ignores; without this the model is trained
    # mostly to predict padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs

# Run the batched tokenization over both splits.
train_ds, valid_ds = (
    split_ds.map(preprocess, batched=True) for split_ds in (train_ds, valid_ds)
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [None]:
# Training configuration
args = Seq2SeqTrainingArguments(
    output_dir="./checkpoints_asm",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,  # 4 x 2 = 8 examples per optimizer step per device
    learning_rate=1e-4,
    num_train_epochs=3,
    save_total_limit=1,
    fp16=torch.cuda.is_available(),
    logging_steps=20,
    predict_with_generate=True,
    # Disable external experiment trackers: the recorded run stopped at an
    # interactive wandb API-key prompt, which blocks an unattended Run All.
    report_to="none",
)

trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
)

# NOTE(review): the recorded run emitted a warning at this constructor call,
# presumably the deprecation of `tokenizer=` in favour of `processing_class=`
# -- confirm. Try the new keyword first and fall back for older versions, which
# reject the unknown keyword with a TypeError before doing any work.
try:
    trainer = Seq2SeqTrainer(processing_class=tokenizer, **trainer_kwargs)
except TypeError:
    trainer = Seq2SeqTrainer(tokenizer=tokenizer, **trainer_kwargs)

  trainer = Seq2SeqTrainer(


In [None]:
# Training LoRA Model
# Runs the fine-tuning loop configured on `trainer`. As the last expression of
# the cell, the value returned by train() is shown as the cell output.
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msiddhid-mca24[0m ([33msiddhid-mca24-nit-patna[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
20,10.7855
40,10.1956
60,9.5866
80,9.1519
100,9.042
120,8.4933
140,8.0054
160,7.7461
180,7.6599


TrainOutput(global_step=189, training_loss=8.89561817633412, metrics={'train_runtime': 14184.23, 'train_samples_per_second': 0.106, 'train_steps_per_second': 0.013, 'total_flos': 410635468800000.0, 'train_loss': 8.89561817633412, 'epoch': 3.0})

In [None]:
# Save The LoRA-Adapted Model

save_dir = "./asm_lora_model"

# Save adapter + tokenizer into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

print(f"LoRA fine-tuned model saved to: {save_dir}")

LoRA fine-tuned model saved to: ./asm_lora_model


In [None]:
# Code to reload the tuned model so we don't have to tune it again and again
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# from peft import PeftModel

# # Define paths
# base_model_name = "facebook/nllb-200-distilled-600M"   # must be the same base model that was fine-tuned above
# lora_model_path = "./asm_lora_model"          # path where you saved your fine-tuned model

# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained(lora_model_path)

# # Load the base pretrained model (same as you used before fine-tuning)
# base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name)

# # Load the LoRA adapter weights on top of the base model
# model = PeftModel.from_pretrained(base_model, lora_model_path)

# # Set model to evaluation mode
# model.eval()

# print("LoRA fine-tuned model and tokenizer reloaded successfully!")

In [None]:
# BLEU SCORE EVALUATION

from google.colab import files
from sacrebleu import corpus_bleu
import re

print("Please upload test source (English) and reference (Assamese) files:")
uploaded = files.upload()

# Names of the files that were just uploaded, used for pattern matching below.
all_files = [*uploaded.keys()]
print("\nUploaded files:", all_files)

# --- Identify the correct test files automatically ---
def find_file(pattern):
    """Return the first name in ``all_files`` matching ``pattern``, else ``None``.

    ``pattern`` is a regular expression searched anywhere in each name,
    ignoring case. This redefines the identical helper from the training-data
    upload cell so the evaluation cell can run on its own.
    """
    hits = (name for name in all_files if re.search(pattern, name, re.IGNORECASE))
    return next(hits, None)

# No bare ".txt" alternative for the reference: "test.en.txt" also ends in
# ".txt" and could otherwise be selected as the Assamese file.
test_en = find_file(r"test.*\.en")
test_as = find_file(r"test.*\.(asm|as)")

# Fail with a clear message instead of letting open(None) raise later.
if not (test_en and test_as) or test_en == test_as:
    raise ValueError(
        "Could not find distinct English and Assamese test files "
        f"among the uploads: {all_files}"
    )

src_file = test_en
ref_file = test_as

print(f"\nUsing Source file: {src_file}")
print(f"Using Reference file: {ref_file}")

# READ TEST DATA
# Keep blank lines while reading: dropping them per file would shift every
# following line and pair sources with the wrong references.
with open(src_file, "r", encoding="utf-8") as f:
    src_raw = [line.strip() for line in f]

with open(ref_file, "r", encoding="utf-8") as f:
    ref_raw = [line.strip() for line in f]

print(f"\nLoaded {len(src_raw)} source and {len(ref_raw)} reference lines.")

# Ensure equal length
if len(src_raw) != len(ref_raw):
    min_len = min(len(src_raw), len(ref_raw))
    print(f"Trimming to {min_len} lines for comparison.")

# zip() stops at the shorter file (the trimming announced above). Pairs with an
# empty side are then dropped together so the two lists stay aligned.
pairs = [(s, r) for s, r in zip(src_raw, ref_raw) if s and r]
src_sentences = [s for s, _ in pairs]
ref_sentences = [r for _, r in pairs]

skipped = min(len(src_raw), len(ref_raw)) - len(pairs)
if skipped:
    print(f"Skipped {skipped} pairs with an empty source or reference line.")

# INFERENCE (TRANSLATION GENERATION)

print("\nGenerating translations using fine-tuned LoRA model...")

# NLLB language codes use an underscore; tolerate a hyphenated spelling.
src_code = src_lang.replace("-", "_")
tgt_code = tgt_lang.replace("-", "_")

# The source language is set on the tokenizer, not prepended to the text.
tokenizer.src_lang = src_code

# The first generated token must be the target-language token. An unknown code
# maps to the unk id, which would silently disable the language forcing.
forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_code)
if forced_bos_token_id == tokenizer.unk_token_id:
    raise ValueError(f"Unknown target language code for this tokenizer: {tgt_code!r}")

# Switch off dropout: the model is still in training mode after fine-tuning.
model.eval()
# Send inputs to wherever the model's weights actually are.
model_device = next(model.parameters()).device

translations = []

for i, text in enumerate(src_sentences):
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=128
    ).to(model_device)
    with torch.no_grad():  # disable gradient tracking for faster inference
        outputs = model.generate(
            **inputs,
            forced_bos_token_id=forced_bos_token_id,
            num_beams=4,
            max_length=128,
            early_stopping=True
        )
    translations.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

    if (i + 1) % 20 == 0:
        print(f"Translated {i+1}/{len(src_sentences)} sentences...")

# SAVE TRANSLATIONS

output_file = "translations_asm.txt"
# One hypothesis per line, in the same order as the source sentences.
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in translations)

print(f"\nTranslations saved to: {output_file}")

# COMPUTE BLEU SCORE
hyps = translations
refs = [ref_sentences]  # a list holding the single reference stream

bleu = corpus_bleu(hyps, refs)
print(f"\nFinal BLEU Score: {bleu.score:.2f}")

# SHOW SAMPLE COMPARISON
print("\nSample Comparison (first 5 lines):")
for en_line, ref_line, hyp_line in zip(src_sentences[:5], ref_sentences[:5], hyps[:5]):
    print(f"\nEN:  {en_line}")
    print(f"REF: {ref_line}")
    print(f"HYP: {hyp_line}")


Please upload test source (English) and reference (Assamese) files:


Saving test.as.txt to test.as.txt
Saving test.en.txt to test.en.txt

Uploaded files: ['test.as.txt', 'test.en.txt']

Using Source file: test.en.txt
Using Reference file: test.as.txt

Loaded 250 source and 250 reference lines.

Generating translations using fine-tuned LoRA model...
Translated 20/250 sentences...
Translated 40/250 sentences...
Translated 60/250 sentences...
Translated 80/250 sentences...
Translated 100/250 sentences...
Translated 120/250 sentences...
Translated 140/250 sentences...
Translated 160/250 sentences...
Translated 180/250 sentences...
Translated 200/250 sentences...
Translated 220/250 sentences...
Translated 240/250 sentences...

Translations saved to: translations_asm.txt

Final BLEU Score: 0.73

🔍 Sample Comparison (first 5 lines):

EN:  Assam children join Delhi Dynamos FC strong defender Gaurav Barai .
REF: দিল্লী ডায়নামোছ এফ চিত যোগ দিলে অসম সন্তান শক্তিশালী ডিফেণ্ডাৰ গৌৰৱ বৰাই ।
HYP: ️ Assam children joined Delhi Dynamos FC strong defender Gaurav Barai

E