In [1]:
%pip install transformers sentencepiece datasets evaluate accelerate

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import Dataset
import numpy as np
import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate

In [3]:
# Load pre-trained model and tokenizer
model_name = "google/mt5-base"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)
model.generation_config.max_new_tokens = 500

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
# Test pre-trained model
input_ids = tokenizer('I will buy some <extra_id_0> for <extra_id_1>', return_tensors='pt').input_ids
outputs = model.generate(input_ids)
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output_text)

<extra_id_0> clothes <extra_id_1> you.


In [5]:
# Prepare dataset
prefix = "translate Nepali to English: "
src_lang = "ne_NP"
tgt_lang = "en_XX"
with open("../dataset/train_raw/train.ne_NP", "r", encoding='utf-8') as f:
    train_ne = [l.strip() for l in f.readlines()]
with open("../dataset/train_raw/train.en_XX", "r", encoding='utf-8') as f:
    train_en = [l.strip() for l in f.readlines()]
with open("../dataset/test_raw/test.ne_NP", "r", encoding='utf-8') as f:
    test_ne = [l.strip() for l in f.readlines()]
with open("../dataset/test_raw/test.en_XX", "r", encoding='utf-8') as f:
    test_en = [l.strip() for l in f.readlines()]
train_dataset_dict = {
    "ne_NP": train_ne,
    "en_XX": train_en
}
test_dataset_dict = {
    "ne_NP": test_ne,
    "en_XX": test_en
}
train_dataset = Dataset.from_dict(train_dataset_dict)
test_dataset = Dataset.from_dict(test_dataset_dict)

def preprocess_function(examples):
    return tokenizer([prefix + example for example in examples[src_lang]], text_target=examples[tgt_lang], max_length=128, truncation=True)

tokenized_train_inputs = train_dataset.map(preprocess_function, batched=True, remove_columns=[src_lang, tgt_lang])
tokenized_test_inputs = test_dataset.map(preprocess_function, batched=True, remove_columns=[src_lang, tgt_lang])

Map:   0%|          | 0/107770 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

In [9]:
print(len(train_dataset))
print(tokenized_train_inputs[0])
print(tokenizer.decode(tokenized_train_inputs[0]["input_ids"][2]))
print(tokenized_train_inputs.features)
print(tokenizer.decode(tokenized_train_inputs[0]["input_ids"], skip_special_tokens=True))
print(tokenizer.decode(tokenized_train_inputs[0]["labels"], skip_special_tokens=True))

107770
{'input_ids': [37194, 36403, 266, 288, 5413, 267, 2745, 3863, 69879, 57028, 135620, 14625, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [76978, 262, 1361, 259, 84984, 1]}
i
{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}
translate Nepali to English: पपुआ न्यू गिनि
Papua New Guinea


In [7]:
# Set up collator and metrics
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)
metric = evaluate.load("bleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["bleu"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="mt5_ne_en",
    evaluation_strategy="epoch",
    logging_steps=2500,
    save_steps=2500,
    # evaluation_strategy="steps",
    # eval_steps=1000,
    # learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    # fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_inputs,
    eval_dataset=tokenized_test_inputs,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(torch.optim.Adam(model.parameters(), lr=0.001), None)
)

trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,2.308,4.993924,0.0136,70.1868
2,1.8547,4.903048,0.0096,117.5435
3,1.5437,4.939808,0.0126,120.67
4,1.2751,4.941507,0.0232,60.7095
5,1.039,5.096067,0.033,47.251
6,0.8585,5.212707,0.0287,56.9071
7,0.6884,5.680346,0.0294,54.8073


In [11]:
trainer.train("mt5_ne_en/checkpoint-100000")

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
8,0.5427,5.82731,0.0403,41.5632
9,0.4267,6.122791,0.0418,40.3755
10,0.3383,6.583009,0.0426,40.3775


TrainOutput(global_step=134720, training_loss=0.10729281409619257, metrics={'train_runtime': 6323.1393, 'train_samples_per_second': 170.437, 'train_steps_per_second': 21.306, 'total_flos': 1.001213964795433e+17, 'train_loss': 0.10729281409619257, 'epoch': 10.0})

In [12]:
trainer.save_model("mt5_ne_en_final")

In [13]:
finetuned_model = MT5ForConditionalGeneration.from_pretrained("mt5_ne_en_final")

In [14]:
for i in range(0,10):
    finetune_input = tokenizer.decode(tokenized_train_inputs[i]["input_ids"], skip_special_tokens=True)
    print("Input: ", finetune_input)
    print("Expected: ", tokenizer.decode(tokenized_train_inputs[i]["labels"], skip_special_tokens=True))
    input_ids = tokenizer(finetune_input, return_tensors='pt').input_ids
    outputs = finetuned_model.generate(input_ids)
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Prediction: ", output_text)


Input:  translate Nepali to English: पपुआ न्यू गिनि
Expected:  Papua New Guinea
Prediction:  Papua New Guinea
Input:  translate Nepali to English: "% 1" छवि लोड गर्न असफल
Expected:  Failed to load image "%1"
Prediction:  Failed to load image "%1".
Input:  translate Nepali to English: दुईवटा नम्बरहरूको गुणनफल प्रदर्शित भयो। बराबरी साइनको दायाँपट्टि, वस्तुको उत्तर दिनुहोस्। तपाईँको उत्तरलाई परिमार्जन गर्न बायाँ र दायाँ बाँणहरू प्रयोग गर्नुहोस् र तपाईँले गरेको सही भयो भएन जाँच्न 'प्रविष्ट गर्नुहोस्' कुञ्जी थिच्नुहोस्। यदि सही छैन भने, पुन प्रयास गर्नुहोस्।
Expected:  A multiplication of two numbers is displayed. At the right of the equals sign, give the answer, the product. Use the left and right arrows to modify your answer and press the Enter key to check if you've got it right. If not, just try again.
Prediction:  A multiplication of two numbers is displayed. At the right of the equals sign, give the answer, the product. Use the left and right arrows to modify your answer and press the

In [15]:
def translate(nepali):
    input_ids = tokenizer(prefix+nepali, return_tensors='pt').input_ids
    outputs = finetuned_model.generate(input_ids)
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return output_text


In [16]:
print(translate("फाइलमा प्रयोगकर्ताहरूको सूची बचत गर्नुहोस्"))

Save the list of users to a file


In [17]:
# Test fine-tuned model
input_ids = tokenizer('translate Nepali to English: प्याच गरिने फाइल/डाइरेक्टरी', return_tensors='pt').input_ids
outputs = finetuned_model.generate(input_ids)
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output_text)

File/Directory to patch:


In [18]:
print(translate("नमस्ते, मेरो नाम क्लेमेन्ट हो।"))

host name is a greeting.


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# output_dir = '/content/drive/MyDrive/translation_model'
# model.save_pretrained(output_dir)

Mounted at /content/drive
