In [1]:
# Install the third-party packages for this notebook into the kernel's environment.
# NOTE(review): versions are unpinned, so a fresh install may pull releases with
# different APIs or defaults — consider pinning for reproducibility.
%pip install transformers sentencepiece datasets evaluate accelerate

Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset, Dataset
import numpy as np
import torch
# from torch import optim
# from torch.nn import functional as F
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq #, AdamW
import evaluate
# from tqdm.notebook import tqdm
# import sentencepiece as spm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fetch the pretrained mT5 checkpoint: the tokenizer and the seq2seq model
# are both resolved from the same checkpoint identifier.
model_name = "google/mt5-base"
tokenizer = MT5Tokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
model = MT5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
# Sanity-check the pretrained checkpoint: feed it a prompt containing two
# sentinel tokens and print whatever it generates for them.
encoded_prompt = tokenizer(
    'I will buy some <extra_id_0> for <extra_id_1>',
    return_tensors='pt',
)
input_ids = encoded_prompt.input_ids
outputs = model.generate(input_ids)
first_sequence = outputs[0]
output_text = tokenizer.decode(first_sequence, skip_special_tokens=True)
print(output_text)

<extra_id_0> clothes <extra_id_1> you.


In [4]:
# Prepare dataset
# Task prefix prepended to every source sentence, and the column names used
# for the source (Nepali) and target (English) sides of the parallel corpus.
prefix = "translate Nepali to English: "
src_lang = "ne_NP"
tgt_lang = "en_XX"


def _read_lines(path):
    """Return every line of the text file at ``path`` with surrounding whitespace stripped.

    The file is decoded explicitly as UTF-8 so reading the Devanagari text does
    not depend on the platform's default encoding.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f]


train_ne = _read_lines("../dataset/train_raw/train.ne_NP")
train_en = _read_lines("../dataset/train_raw/train.en_XX")
test_ne = _read_lines("../dataset/test_raw/test.ne_NP")
test_en = _read_lines("../dataset/test_raw/test.en_XX")

# The two files of a split are aligned line by line; fail early with a clear
# message if they are not the same length.
for _split, _src_lines, _tgt_lines in (
    ("train", train_ne, train_en),
    ("test", test_ne, test_en),
):
    if len(_src_lines) != len(_tgt_lines):
        raise ValueError(
            f"{_split} split is not parallel: {len(_src_lines)} {src_lang} lines "
            f"vs {len(_tgt_lines)} {tgt_lang} lines"
        )

# Column names are taken from src_lang / tgt_lang so they always agree with
# what the preprocessing step looks up.
train_dataset_dict = {
    src_lang: train_ne,
    tgt_lang: train_en,
}
test_dataset_dict = {
    src_lang: test_ne,
    tgt_lang: test_en,
}
train_dataset = Dataset.from_dict(train_dataset_dict)
test_dataset = Dataset.from_dict(test_dataset_dict)

def preprocess_function(examples):
    """Tokenize one batch of sentence pairs.

    Every source sentence (column ``src_lang``) gets the task ``prefix``
    prepended; the target sentences (column ``tgt_lang``) are passed as
    ``text_target``. Truncation is enabled with a maximum length of 128.
    """
    prefixed_sources = []
    for sentence in examples[src_lang]:
        prefixed_sources.append(prefix + sentence)

    targets = examples[tgt_lang]
    return tokenizer(
        prefixed_sources,
        text_target=targets,
        max_length=128,
        truncation=True,
    )

# Tokenize both splits in batches (train first, then test) and drop the raw
# text columns so only the tokenizer's outputs remain.
tokenized_train_inputs, tokenized_test_inputs = (
    split.map(
        preprocess_function,
        batched=True,
        remove_columns=[src_lang, tgt_lang],
    )
    for split in (train_dataset, test_dataset)
)

Map: 100%|██████████| 163625/163625 [00:36<00:00, 4532.41 examples/s]
Map: 100%|██████████| 1012/1012 [00:00<00:00, 1832.71 examples/s]


In [5]:
# Show the tokenized training set and its schema, then round-trip the first
# example's inputs and labels back to text as a spot check.
print(tokenized_train_inputs)
print(tokenized_train_inputs.features)
first_example = tokenized_train_inputs[0]
for field in ("input_ids", "labels"):
    print(tokenizer.decode(first_example[field], skip_special_tokens=True))

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 163625
})
{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}
translate Nepali to English: प्याच गरिने फाइल/डाइरेक्टरी
File/Directory to patch


In [6]:
# Set up collator and metrics
# Pass the model object (not the checkpoint name string): the collator only
# uses `model` if it exposes `prepare_decoder_input_ids_from_labels`, which a
# plain string never does.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
metric = evaluate.load("bleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded texts.

    Returns a pair: the stripped predictions as a flat list, and the stripped
    labels with each one wrapped in its own single-element list.
    """
    cleaned_preds = list(map(str.strip, preds))
    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])
    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: a ``(preds, labels)`` pair of token-id arrays. ``preds``
            may also be a tuple, in which case its first element is used.

    Returns:
        dict with ``"bleu"`` (the ``bleu`` field of the metric result) and
        ``"gen_len"`` (mean number of non-pad tokens per prediction), both
        rounded to 4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding / ignore marker, not a vocabulary id, so it cannot be
    # decoded. Swap it for the pad token in predictions as well as labels.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["bleu"]}

    # Counted after the -100 replacement so padding is never counted as
    # generated tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [8]:
# Hyperparameters for fine-tuning. Evaluation runs once per epoch and uses
# generate() (predict_with_generate) so BLEU is computed on generated text.
training_args = Seq2SeqTrainingArguments(
    output_dir="mt5_ne_en",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    # Without this, evaluation falls back to the model config's default
    # generation length. References are tokenized with max_length=128, so the
    # hypotheses are allowed the same budget instead of being cut short.
    generation_max_length=128,
    # fp16=True,  # NOTE(review): left disabled by the author — confirm mixed
    #             # precision is stable for this model before enabling.
)

# Assemble the trainer from the pieces built above, then start fine-tuning.
trainer = Seq2SeqTrainer(
    # what to train and how
    model=model,
    args=training_args,
    # batching / tokenization
    tokenizer=tokenizer,
    data_collator=data_collator,
    # data
    train_dataset=tokenized_train_inputs,
    eval_dataset=tokenized_test_inputs,
    # evaluation
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 1/20454 [00:52<296:46:52, 52.24s/it]