In [None]:
import os
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer
import evaluate
from datasets import Dataset

1. Import and read dataset

In [None]:
data_path = '/content/clean_vie.txt'
def load_data_from_file(file_path):
    """Read a tab-separated English/Vietnamese parallel corpus.

    Each non-blank line must hold the English sentence, a tab, then the
    Vietnamese sentence. Any further tab-separated columns on the line are
    ignored. Blank lines are skipped.

    Args:
        file_path: Path to the UTF-8 text file (a leading BOM is tolerated).

    Returns:
        dict: ``{"en": [...], "vi": [...]}`` with the two lists aligned by
        position. Both lists are empty when the file has no sentence pairs.

    Raises:
        ValueError: If a non-blank line has fewer than two tab-separated
            fields; the message names the offending line number.
    """
    en_texts, vi_texts = [], []

    # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM,
    # which would otherwise end up glued to the first English sentence.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Tolerate empty lines (e.g. a trailing newline at end of file).
                continue

            fields = line.split("\t")
            if len(fields) < 2:
                raise ValueError(
                    f"{file_path}: line {line_no} does not contain a "
                    f"tab-separated sentence pair: {line!r}"
                )

            en_texts.append(fields[0])
            vi_texts.append(fields[1])

    return {"en": en_texts, "vi": vi_texts}

# Read the corpus at `data_path` into a dict with parallel "en" and "vi" lists.
data = load_data_from_file(data_path)

# Wrap the dict in a `datasets.Dataset`, giving one column per dict key.
ds = Dataset.from_dict(data)

2. Tokenize

In [None]:
# Checkpoint identifier; the same name is used later to load the model weights.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

3. Encoding, Pre-processing

In [None]:
MAX_LEN = 75

# mBART-50 language codes for this translation direction. The tokenizer
# prepends the matching code to every sequence, so they must be set before
# tokenizing: the source side uses `src_lang`, the target side `tgt_lang`.
SRC_LANG = "en_XX"
TGT_LANG = "vi_VN"
tokenizer.src_lang = SRC_LANG
tokenizer.tgt_lang = TGT_LANG

def preprocess_function(examples):
    """Tokenize a batch of English/Vietnamese sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with an "en" list (source sentences) and a
            "vi" list (target sentences) of equal length.

    Returns:
        dict: ``input_ids`` and ``attention_mask`` for the source sentences and
        ``labels`` for the target sentences. Sequences are truncated to
        ``MAX_LEN`` tokens and returned unpadded as plain lists.
    """
    # `text_target` tokenizes the Vietnamese side in target mode, so the labels
    # carry the target-language prefix rather than the source one.
    encoded = tokenizer(
        examples["en"],
        text_target=examples["vi"],
        max_length=MAX_LEN,
        truncation=True,
    )

    # No padding here on purpose: a padding collator such as
    # DataCollatorForSeq2Seq pads each batch to its longest member, extends the
    # attention mask accordingly and fills label padding with -100. Padding
    # here without returning the mask would make the model attend to pad tokens.
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": encoded["labels"],
    }

preprocessed_ds = ds.map(preprocess_function, batched=True)

4. Load mBART50 model

In [None]:
# Load the pretrained seq2seq weights for `model_name`; this is the model that gets fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

5. Evaluation

In [None]:
# SacreBLEU scorer loaded through the `evaluate` library; `compute_metrics` calls it.
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape references for the BLEU metric.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        tuple: ``(preds, labels)`` where ``preds`` is a list of stripped
        strings and ``labels`` is a list of one-element lists, each holding
        the stripped reference for the prediction at the same index.
    """
    stripped_preds = []
    for pred in preds:
        stripped_preds.append(pred.strip())

    # Each prediction gets a list of references; here there is exactly one.
    wrapped_labels = []
    for label in labels:
        wrapped_labels.append([label.strip()])

    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Score generated translations against the references with SacreBLEU.

    Args:
        eval_preds: Pair ``(predictions, labels)`` of token-id arrays. If
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        dict: ``{"bleu": score}`` where ``score`` is the ``"score"`` entry of
        the metric result.
    """
    generated, references = eval_preds
    if isinstance(generated, tuple):
        generated = generated[0]

    def _to_text(token_ids):
        # -100 is a masking sentinel, not a vocabulary id; replace it with the
        # pad token so decoding works and the padding is skipped as special.
        restored = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        return tokenizer.batch_decode(
            restored,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

    hypotheses, targets = postprocess_text(_to_text(generated), _to_text(references))

    bleu = metric.compute(predictions=hypotheses, references=targets)
    return {"bleu": bleu["score"]}

6. Trainer

In [None]:
import os

# Disable the Weights & Biases integration before the trainer is created.
os.environ["WANDB_DISABLED"] = "true"

training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./en-vi-mbart50",
    logging_dir="logs",
    # Training length and batch sizes.
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Logging, evaluation and checkpointing all share a 1000-step cadence.
    logging_steps=1000,
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    # Keep at most three checkpoints and reload the best one after training.
    save_total_limit=3,
    load_best_model_at_end=True,
    # Evaluate by generating text so `compute_metrics` receives token ids.
    predict_with_generate=True,
)

# Pads inputs and labels per batch.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# NOTE(review): `eval_dataset` is the same dataset as `train_dataset`, so the
# reported BLEU is measured on training data and each evaluation generates over
# the whole corpus. A held-out split would be more meaningful, but it changes
# the number of training steps and therefore the checkpoint directory names.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=preprocessed_ds,
    eval_dataset=preprocessed_ds,
    compute_metrics=compute_metrics,
)


In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Persist the model currently held by the trainer (training was configured with
# load_best_model_at_end=True) next to the tokenizer, so this directory can be
# loaded on its own with `from_pretrained` instead of only via a numbered checkpoint.
trainer.save_model("/content/en-vi-mbart50")
tokenizer.save_pretrained("/content/en-vi-mbart50")

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Fine-tuned weights come from a training checkpoint; the tokenizer was saved
# separately into the parent output directory.
model_path = "/content/en-vi-mbart50/checkpoint-1848"

model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

tokenizer = AutoTokenizer.from_pretrained("/content/en-vi-mbart50")

# The two-letter codes the pipeline would otherwise parse from the task name
# ("en", "vi") are not mBART-50 language codes, so pass the real codes
# explicitly. They select the source-language prefix on the input and the
# target-language token that generation is forced to start with.
translator = pipeline(
    "translation_en_to_vi",
    model=model,
    tokenizer=tokenizer,
    src_lang="en_XX",
    tgt_lang="vi_VN",
)

text = "What are you doing"
translated_text = translator(text, num_beams=5)
print(translated_text[0]['translation_text'])

In [1]:
import numpy as np