In [12]:
import warnings
warnings.filterwarnings('ignore')

In [13]:
import numpy as np
import evaluate

In [36]:
from transformers import MBart50TokenizerFast,MBartForConditionalGeneration,Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer, EarlyStoppingCallback
from datasets import load_dataset

In [37]:
raw_datasets = load_dataset("Helsinki-NLP/opus-100", "en-id")
model_mbart = 'facebook/mbart-large-50-one-to-many-mmt'
display(raw_datasets)

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

In [38]:
tokenizer = MBart50TokenizerFast.from_pretrained(model_mbart,src_lang="en_XX",tgt_lang = "id_ID")

In [None]:
import utils
from importlib import reload

utils = reload(utils)

In [49]:
prefix = ""
source_lang = "en"
target_lang = "id"

max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [52]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

train_tokenized_dataset = tokenized_datasets["train"]
eval_tokenized_dataset = tokenized_datasets["validation"]
print(f"Number of train tokenized data: {len(train_tokenized_dataset)}")
print(f"Number of eval tokenized data: {len(eval_tokenized_dataset)}")

small_train_dataset_slicer = int(
    utils.DATASET_TRAIN_VAL_SPLIT_PERCENTAGE * len(train_tokenized_dataset)
)
small_eval_dataset_slicer = int(
    utils.DATASET_TRAIN_VAL_SPLIT_PERCENTAGE * len(eval_tokenized_dataset)
)

small_train_dataset = train_tokenized_dataset.shuffle(seed=utils.SEED).select(
    range(small_train_dataset_slicer)
)

small_eval_dataset = eval_tokenized_dataset.shuffle(seed=utils.SEED).select(
    range(small_eval_dataset_slicer)
)

print(len(small_train_dataset))
print(len(small_eval_dataset))

Number of train tokenized data: 1000000
Number of eval tokenized data: 2000
1000
2


In [41]:
model = MBartForConditionalGeneration.from_pretrained(model_mbart).cuda()

In [42]:
hyperparameters = {
    'learning_rate': 1e-5,
    'batch_size': 8,
    'num_epochs': 5
}

args = Seq2SeqTrainingArguments(
    f"{utils.MODEL_NAME}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=hyperparameters["learning_rate"],
    per_device_train_batch_size=hyperparameters["batch_size"],
    per_device_eval_batch_size=hyperparameters["batch_size"],
    weight_decay=0.01,
    save_total_limit=hyperparameters["num_epochs"],
    num_train_epochs=hyperparameters["num_epochs"],
    predict_with_generate=True,
    load_best_model_at_end=True,
)

In [43]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [44]:
metric = evaluate.load("sacrebleu")
meteor = evaluate.load('meteor')

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(
        decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds,
                            references=decoded_labels)
    meteor_result = meteor.compute(
        predictions=decoded_preds, references=decoded_labels)
    prediction_lens = [np.count_nonzero(
        pred != tokenizer.pad_token_id) for pred in preds]
    result = {'bleu': result['score']}
    result["gen_len"] = np.mean(prediction_lens)
    result["meteor"] = meteor_result["meteor"]
    result = {k: round(v, 4) for k, v in result.items()}
    return result

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [45]:
if TRAIN:
    trainer = Seq2SeqTrainer(
        model,
        args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )
    trainer.train()
    trainer.save_model('opus-mt-en-id-finetuned-en-to-id')

In [46]:
src_text =  "My Wonderful Family. I live in a house near the mountains. I have two brothers and one sister, and I was born last. My father teaches mathematics, and my mother is a nurse at a big hospital. My brothers are very smart and work hard in school. My sister is a nervous girl, but she is very kind. My grandmother also lives with us. She came from Italy when I was two years old. She has grown old, but she is still very strong. She cooks the best food! My family is very important to me. We do lots of things together. My brothers and I like to go on long walks in the mountains. My sister likes to cook with my grandmother. On the weekends we all play board games together. We laugh and always have a good time. I love my family very much."


src_text = src_text.replace("!",".")
sentences = [sentence+". " for sentence in src_text.split(".") if len(sentence) > 0]
print(sentences)

translation_results = ""
for sentence in sentences:
    model_inputs = tokenizer(sentence, return_tensors="pt")

    generated_tokens = model.generate(
        **model_inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id["id_ID"],
    )

    translation = tokenizer.batch_decode(
        generated_tokens, skip_special_tokens=True)
    translation_results += translation[0].strip()

print()
print(translation_results)

['My Wonderful Family. ', ' I live in a house near the mountains. ', ' I have two brothers and one sister, and I was born last. ', ' My father teaches mathematics, and my mother is a nurse at a big hospital. ', ' My brothers are very smart and work hard in school. ', ' My sister is a nervous girl, but she is very kind. ', ' My grandmother also lives with us. ', ' She came from Italy when I was two years old. ', ' She has grown old, but she is still very strong. ', ' She cooks the best food. ', ' My family is very important to me. ', ' We do lots of things together. ', ' My brothers and I like to go on long walks in the mountains. ', ' My sister likes to cook with my grandmother. ', ' On the weekends we all play board games together. ', ' We laugh and always have a good time. ', ' I love my family very much. ']


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)