In [1]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"

from datasets import DatasetDict, Dataset
import random
from transformers import is_torch_available, is_tf_available
print("Torch:", is_torch_available())
print("TF:", is_tf_available())


# Dataset FilePaths
src_file = "../dataset/MultiHPLT.en-ne.ne"
tgt_file = "../dataset/MultiHPLT.en-ne.en"

# Load the full dataset
with open(src_file, encoding="utf-8") as f:
    nepali = [line.strip() for line in f]

with open(tgt_file, encoding="utf-8") as f:
    english = [line.strip() for line in f]

print("Nepali lines:", len(nepali))
print("English lines:", len(english))

# Pair and shuffle
min_len = min(len(nepali), len(english))
data = list(zip(nepali[:min_len], english[:min_len]))
random.seed(42)
random.shuffle(data)


# Split into 80% train, 15% val and 5% test
total = len(data)
train_data = data[:int(0.8 * total)]
print(f"Sample item in pairs: {train_data[0]}")
print(f"Length of train_data: {len(train_data)}")

val_data = data[int(0.8 * total):int(0.95 * total)]
test_data = data[int(0.95 * total)]


# Convertting to Hugging Face datasets
def to_dataset(pairs):
    cleaned = []
    for i, pair in enumerate(pairs):
        if isinstance(pair, tuple) and len(pair) == 2:
            cleaned.append({"ne": pair[0], "en": pair[1]})
        else:
            print(f"❌ Skipping invalid pair at index {i}: {pair}")
    return Dataset.from_dict({"translation": cleaned})


raw_datasets  = DatasetDict({
    "train": to_dataset(train_data),
    "val": to_dataset(val_data),
    "test": to_dataset(test_data)
})



  from .autonotebook import tqdm as notebook_tqdm


Torch: True
TF: False
Nepali lines: 317120
English lines: 317120
Sample item in pairs: ('कार्यशाला प्लानिङ घाम र हावाको दिशामा आधारित थियो, सम्पूर्ण निर्माताले सौर्य ऊर्जाको पूर्ण उपयोग गर्न सक्थे।M&Z Furniture ले तापक्रम र सापेक्षिक आर्द्रता कायम राख्न भूमिगत पानीको परिसंचरण अपनायो, र धुलो सङ्कलन प्रणालीहरूद्वारा धुलो-रहित वातावरण राख्यो जसले फिल्टरहरू मार्फत हावालाई निरन्तर बाहिर धकेल्छ र सफा हावामा पुन: प्रयोग गर्दछ।', 'Climate-Controlled & Dust-Free Environment Workshop planing was based on sunshine and wind direction, the whole manufacturer could make full use of solar energy. M&Z Furniture adopted underground water circulation to keep temperature and relative humidity, and kept dust-free environment by dust collection systems that constantly push air out through filters and recycle in clean air.')
Length of train_data: 253696
❌ Skipping invalid pair at index 0: PTC हीटिंग कार्य, कम तापक्रममा, PTC इलेक्ट्रिक हीटर हीटिंग, चिसो क्षेत्रमा उत्पादनहरू पनि प्रयोग गर्न सक्षम हुनेछ भनेर स

In [2]:
# Preparing model
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
device = torch.device("cpu")

model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# Preprocessing
def preprocess(example):
    inputs = ["translate Nepali to English: " + ex["ne"] for ex in example["translation"]]
    targets = [ex["en"] for ex in example["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, padding="max_length", truncation = True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, padding="max_length", truncation = True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize
tokenized_datasets = raw_datasets.map(preprocess, batched=True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 253696/253696 [00:47<00:00, 5358.39 examples/s]
Map: 100%|██████████| 47568/47568 [00:08<00:00, 5310.56 examples/s]


In [5]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq



training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-multihplt",
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    predict_with_generate=True,
    logging_dir='./logs',
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model),
)

trainer.train()


  0%|          | 0/158560 [02:47<?, ?it/s]
  0%|          | 8/158560 [00:45<252:43:44,  5.74s/it]

KeyboardInterrupt: 