In [1]:
from datasets import load_dataset
import evaluate

dataset = load_dataset("parquet", data_files={
    "train": "data/iamTangsang_dataset/train-00000-of-00001.parquet",
    "validation": "data/iamTangsang_dataset/validation-00000-of-00001.parquet", 
    "test": "data/iamTangsang_dataset/test-00000-of-00001.parquet"
    })

dataset["train"] = dataset["train"].select(range(10000))       
dataset["validation"] = dataset["validation"].select(range(2000))
dataset["test"] = dataset["test"].select(range(1000)) 

dataset = dataset.filter(lambda x: x["source"].strip() != "" and x["target"].strip() != "")

print(dataset["train"][0])

  from .autonotebook import tqdm as notebook_tqdm


{'source': '"कुनै पनि अन्य सरकारी एजेन्सीले यो जानकारी प्रयोग गर्न सक्दैन, केन्द्रीय सरकार अन्तर्गतका कसैले कुनै पनि हालतमा यो जानकारी पाउँदैनन् र राज्यका अधिकारीहरूमा पनि स्वास्थ्य अधिकारीहरूले मात्र यसलाई प्रयोग गर्न सक्दछन्," उनले भने।', 'target': '"No other government agency can use this information, no one in the commonwealth government at all, and in state authorities, only the health officer can use it.'}


In [2]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small", legacy=False)

def preprocess(examples):
    inputs = ["translate Nepali to English: " + ex for ex in examples["source"]]
    targets = examples["target"]

    model_inputs = tokenizer(inputs, max_length=300, truncation=True, padding="longest")

    labels = tokenizer(targets, max_length=300, truncation=True, padding="longest")

    model_inputs["labels"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in labels["input_ids"]
    ]
    return model_inputs



In [3]:
print("Tokenizer vocab size:", tokenizer.vocab_size)


lengths = [len(tokenizer("translate Nepali to English: " + ex)["input_ids"])
           for ex in dataset["train"]["source"][:1000]]

print("Max length in batch of 1000:", max(lengths))

Tokenizer vocab size: 250100
Max length in batch of 1000: 272


In [4]:
tokenized_dataset = dataset.map(preprocess, batched=True, num_proc=4)
tokenized_dataset.set_format(type="torch")
print(tokenized_dataset["train"][0].keys())

dict_keys(['source', 'target', 'input_ids', 'attention_mask', 'labels'])


In [5]:
def find_invalid_tokens(dataset_split, name=""):
    invalid_count = 0
    for i, ex in enumerate(dataset_split):
        for token in ex["labels"]:
            if token != -100 and token >= tokenizer.vocab_size:
                print(f"❌ Invalid token ID at index {i} in {name} set: {token}")
                print("Target text:", dataset["validation"][i]["target"])
                invalid_count += 1
    print(f"\nTotal invalid tokens in {name} set:", invalid_count)

find_invalid_tokens(tokenized_dataset["validation"], name="validation")
find_invalid_tokens(tokenized_dataset["train"], name="train")
find_invalid_tokens(tokenized_dataset["test"], name="test")



Total invalid tokens in validation set: 0

Total invalid tokens in train set: 0

Total invalid tokens in test set: 0


In [6]:
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")



In [8]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5-npi-en",
    eval_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    generation_max_length=300,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=False, 
    use_cpu=True
)

In [9]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [10]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [11]:
trainer.train()

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
 33%|███▎      | 750/2250 [15:22<26:15,  1.05s/it]  

{'loss': 7.2881, 'grad_norm': 6.854383945465088, 'learning_rate': 0.00013333333333333334, 'epoch': 1.0}


                                                  
 33%|███▎      | 750/2250 [15:37<26:15,  1.05s/it]

{'eval_loss': 3.4174652099609375, 'eval_runtime': 15.5652, 'eval_samples_per_second': 12.849, 'eval_steps_per_second': 6.425, 'epoch': 1.0}


 67%|██████▋   | 1500/2250 [29:30<13:37,  1.09s/it] 

{'loss': 3.7461, 'grad_norm': 6.9619598388671875, 'learning_rate': 6.666666666666667e-05, 'epoch': 2.0}


                                                   
 67%|██████▋   | 1500/2250 [29:45<13:37,  1.09s/it]

{'eval_loss': 3.1306755542755127, 'eval_runtime': 15.4121, 'eval_samples_per_second': 12.977, 'eval_steps_per_second': 6.488, 'epoch': 2.0}


100%|██████████| 2250/2250 [43:42<00:00,  1.18s/it]  

{'loss': 3.3156, 'grad_norm': 7.015842437744141, 'learning_rate': 0.0, 'epoch': 3.0}


                                                   
100%|██████████| 2250/2250 [43:57<00:00,  1.17s/it]

{'eval_loss': 3.0383048057556152, 'eval_runtime': 15.4603, 'eval_samples_per_second': 12.936, 'eval_steps_per_second': 6.468, 'epoch': 3.0}
{'train_runtime': 2637.7969, 'train_samples_per_second': 1.706, 'train_steps_per_second': 0.853, 'train_loss': 4.783296440972222, 'epoch': 3.0}





TrainOutput(global_step=2250, training_loss=4.783296440972222, metrics={'train_runtime': 2637.7969, 'train_samples_per_second': 1.706, 'train_steps_per_second': 0.853, 'total_flos': 944662958469120.0, 'train_loss': 4.783296440972222, 'epoch': 3.0})

In [12]:
trainer.save_model("./mt5-npi-en")
tokenizer.save_pretrained("./mt5-npi-en")

('./mt5-npi-en/tokenizer_config.json',
 './mt5-npi-en/special_tokens_map.json',
 './mt5-npi-en/spiece.model',
 './mt5-npi-en/added_tokens.json')

In [13]:
test_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print("Test Evaluation:", test_results)


100%|██████████| 100/100 [00:17<00:00,  5.68it/s]

Test Evaluation: {'eval_loss': 3.1013731956481934, 'eval_runtime': 17.812, 'eval_samples_per_second': 11.228, 'eval_steps_per_second': 5.614, 'epoch': 3.0}





In [16]:
from tqdm import tqdm
import torch

# For single-GPU CPU/accelerator inference
model.eval()

batch_size = 8  # You can tune this
sources = dataset["test"]["source"]
references = dataset["test"]["target"]

predictions = []

for i in tqdm(range(0, len(sources), batch_size), desc="Generating translations"):
    batch_src = sources[i:i + batch_size]
    batch_inputs = ["translate Nepali to English: " + s for s in batch_src]
    
    inputs = tokenizer(
        batch_inputs,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=300
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=300,
            num_beams=4
        )
    
    batch_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predictions.extend(batch_preds)

# Print a few samples to check translation quality
for i in range(10):
    print(f"\nSource    : {sources[i]}")
    print(f"Reference : {references[i]}")
    print(f"Predicted : {predictions[i]}")

bleu_score = bleu.compute(
    predictions=[p.strip() for p in predictions],
    references=[[r.strip()] for r in references]
)
print("Test BLEU:", bleu_score["bleu"])



Generating translations: 100%|██████████| 25/25 [02:52<00:00,  6.91s/it]


Source    : यसबाट युवकहरु पनि पीडित भएको देखिन्छन ।
Reference : Young people also suffer from it.
Predicted : It has also been damaged by youths.

Source    : बजाऊने लिस्ट
Reference : Playlist
Predicted : It is a new folder

Source    : त्यहाँ केही नियमहरू छन् जुन तपाईंले पछ्याउनु पर्छः
Reference : There are a few rules that you should follow:
Predicted : He doesn't know what you need to follow.

Source    : सबै गीतहरु उनी आफैंले लेखेका हुन् ।
Reference : All songs were written by himself.
Predicted : He wrote a song.

Source    : मेरो एउटा जर्मन साथी छ ।
Reference : I had a German friend.
Predicted : He is a foreigner.

Source    : तपाईँ वास्तवमै फाइल मेट्न चाहनुहुन्छ?

Reference : Do you really want to delete file ?

Predicted : It is a mistake for you?

Source    : यस कुरालाई लिएर आक्रोशित हुनुपर्ने कुनै आवश्यकता छैन ।
Reference : There is no need to get upset about this.
Predicted : It is not necessary to remove this problem.

Source    : पुरुष र महिला दुबैले यसको लाभ उठाउन सक्छन्


