In [1]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"

from datasets import DatasetDict, Dataset
import random
from transformers import is_torch_available, is_tf_available
# Report which backends transformers detects in this environment
print("Torch:", is_torch_available())
print("TF:", is_tf_available())


# Locations of the two sides of the parallel corpus
src_file = "../dataset/MultiHPLT.en-ne.ne"
tgt_file = "../dataset/MultiHPLT.en-ne.en"

# Read each side fully into memory, one whitespace-stripped entry per line
with open(src_file, encoding="utf-8") as f:
    nepali = list(map(str.strip, f))

with open(tgt_file, encoding="utf-8") as f:
    english = list(map(str.strip, f))

print("Nepali lines:", len(nepali))
print("English lines:", len(english))

# Pair line i of one file with line i of the other, up to the shorter file,
# then shuffle the pairs in place with a fixed seed
min_len = min(map(len, (nepali, english)))
data = [(nepali[i], english[i]) for i in range(min_len)]
random.seed(42)
random.shuffle(data)


# Split into 80% train, 15% val and 5% test
total = len(data)
train_end = int(0.8 * total)   # first index past the training split
val_end = int(0.95 * total)    # first index past the validation split

train_data = data[:train_end]
print(f"Sample item in pairs: {train_data[0]}")
print(f"Length of train_data: {len(train_data)}")

val_data = data[train_end:val_end]
# Slice, don't index: `data[val_end]` would yield a single (ne, en) tuple
# instead of a list of pairs, which leaves the test split empty downstream.
test_data = data[val_end:]

# The three splits must partition the shuffled data exactly
assert len(train_data) + len(val_data) + len(test_data) == total


# Convert to Hugging Face datasets
def to_dataset(pairs):
    """Build a ``Dataset`` with a single "translation" column from pairs.

    Every element of ``pairs`` that is a tuple of length 2 becomes a dict
    with its first item under "ne" and its second item under "en". Any
    other element is reported on stdout and left out.

    Args:
        pairs: Iterable of candidate 2-tuples.

    Returns:
        The result of ``Dataset.from_dict`` on the collected records.
    """
    records = []
    for index, candidate in enumerate(pairs):
        well_formed = isinstance(candidate, tuple) and len(candidate) == 2
        if not well_formed:
            print(f"‚ùå Skipping invalid pair at index {index}: {candidate}")
            continue
        first, second = candidate
        records.append({"ne": first, "en": second})
    return Dataset.from_dict({"translation": records})


# Convert each split (in train, val, test order) and bundle them by name
raw_datasets = DatasetDict(
    {
        split_name: to_dataset(split_pairs)
        for split_name, split_pairs in (
            ("train", train_data),
            ("val", val_data),
            ("test", test_data),
        )
    }
)



  from .autonotebook import tqdm as notebook_tqdm


Torch: True
TF: False
Nepali lines: 317120
English lines: 317120
Sample item in pairs: ('‡§ï‡§æ‡§∞‡•ç‡§Ø‡§∂‡§æ‡§≤‡§æ ‡§™‡•ç‡§≤‡§æ‡§®‡§ø‡§ô ‡§ò‡§æ‡§Æ ‡§∞ ‡§π‡§æ‡§µ‡§æ‡§ï‡•ã ‡§¶‡§ø‡§∂‡§æ‡§Æ‡§æ ‡§Ü‡§ß‡§æ‡§∞‡§ø‡§§ ‡§•‡§ø‡§Ø‡•ã, ‡§∏‡§Æ‡•ç‡§™‡•Ç‡§∞‡•ç‡§£ ‡§®‡§ø‡§∞‡•ç‡§Æ‡§æ‡§§‡§æ‡§≤‡•á ‡§∏‡•å‡§∞‡•ç‡§Ø ‡§ä‡§∞‡•ç‡§ú‡§æ‡§ï‡•ã ‡§™‡•Ç‡§∞‡•ç‡§£ ‡§â‡§™‡§Ø‡•ã‡§ó ‡§ó‡§∞‡•ç‡§® ‡§∏‡§ï‡•ç‡§•‡•á‡•§M&Z Furniture ‡§≤‡•á ‡§§‡§æ‡§™‡§ï‡•ç‡§∞‡§Æ ‡§∞ ‡§∏‡§æ‡§™‡•á‡§ï‡•ç‡§∑‡§ø‡§ï ‡§Ü‡§∞‡•ç‡§¶‡•ç‡§∞‡§§‡§æ ‡§ï‡§æ‡§Ø‡§Æ ‡§∞‡§æ‡§ñ‡•ç‡§® ‡§≠‡•Ç‡§Æ‡§ø‡§ó‡§§ ‡§™‡§æ‡§®‡•Ä‡§ï‡•ã ‡§™‡§∞‡§ø‡§∏‡§Ç‡§ö‡§∞‡§£ ‡§Ö‡§™‡§®‡§æ‡§Ø‡•ã, ‡§∞ ‡§ß‡•Å‡§≤‡•ã ‡§∏‡§ô‡•ç‡§ï‡§≤‡§® ‡§™‡•ç‡§∞‡§£‡§æ‡§≤‡•Ä‡§π‡§∞‡•Ç‡§¶‡•ç‡§µ‡§æ‡§∞‡§æ ‡§ß‡•Å‡§≤‡•ã-‡§∞‡§π‡§ø‡§§ ‡§µ‡§æ‡§§‡§æ‡§µ‡§∞‡§£ ‡§∞‡§æ‡§ñ‡•ç‡§Ø‡•ã ‡§ú‡§∏‡§≤‡•á ‡§´‡§ø‡§≤‡•ç‡§ü‡§∞‡§π‡§∞‡•Ç ‡§Æ‡§æ‡§∞‡•ç‡§´‡§§ ‡§π‡§æ‡§µ‡§æ‡§≤‡§æ‡§à ‡§®‡§ø‡§∞‡§®‡•ç‡§§‡§∞ ‡§¨‡§æ‡§π‡§ø‡§∞ ‡§ß‡§ï‡•á‡§≤‡•ç‡§õ ‡§∞ ‡§∏‡§´‡§æ ‡§π‡§æ‡§µ‡§æ‡§Æ‡§æ ‡§™‡•Å‡§®: ‡§™‡•ç‡§∞‡§Ø‡•ã‡§ó ‡§ó‡§∞‡•ç‡§¶‡§õ‡•§', 'Climate-Controlled & Dust-Free 

In [2]:
# Preparing model
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
# NOTE(review): the t5-base vocabulary was presumably built for a handful of
# European languages; confirm that Devanagari input is not mostly mapped to
# the unknown token before committing to a long training run.
model_name = "t5-base"
device = torch.device("cpu")

# Tokenizer and model weights come from the same pretrained checkpoint
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

# Preprocessing
def preprocess(example):
    """Tokenize one batch of translation pairs for seq2seq fine-tuning.

    Args:
        example: A batch whose "translation" entry is a list of dicts, each
            holding a source string under "ne" and a target string under "en".

    Returns:
        The tokenizer output for the prompted source strings, with a "labels"
        entry holding the target token ids. Sources and targets are both
        truncated/padded to 128 tokens; padding positions in the labels are
        set to -100 so they are excluded from the loss.
    """
    inputs = ["translate Nepali to English: " + ex["ne"] for ex in example["translation"]]
    targets = [ex["en"] for ex in example["translation"]]

    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager and places the target ids under "labels" in the same call.
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    # Labels were padded to a fixed length with the pad id; without masking,
    # every padding position would be scored as a real target token.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs

# Run `preprocess` over every split, passing it batches of examples
tokenized_datasets = raw_datasets.map(
    function=preprocess,
    batched=True,
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 253696/253696 [00:47<00:00, 5358.39 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 47568/47568 [00:08<00:00, 5310.56 examples/s]


In [5]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq



training_args = Seq2SeqTrainingArguments(
    # Where outputs and logs are written, and how many checkpoints are kept
    output_dir="./t5-multihplt",
    logging_dir="./logs",
    save_total_limit=2,
    # Optimisation settings
    num_train_epochs=5,
    learning_rate=3e-4,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation settings
    evaluation_strategy="epoch",
    predict_with_generate=True,
)

# Wire the model, tokenizer, collator and the train/val splits into a trainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
)

# Start fine-tuning
trainer.train()


  0%|          | 0/158560 [02:47<?, ?it/s]
  0%|          | 8/158560 [00:45<252:43:44,  5.74s/it]

KeyboardInterrupt: 