In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import evaluate

In [3]:
from transformers import MBart50TokenizerFast,MBartForConditionalGeneration,Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer
from datasets import load_dataset, load_metric

In [4]:
# Hub identifier of the pretrained mBART-50 one-to-many checkpoint used below.
model_mbart = 'facebook/mbart-large-50-one-to-many-mmt'
# "en-id" configuration of the OPUS-100 corpus.
raw_datasets = load_dataset(path="Helsinki-NLP/opus-100", name="en-id")

In [5]:
# Tokenizer for the checkpoint, configured with en_XX as source and id_ID as target language code.
tokenizer = MBart50TokenizerFast.from_pretrained(
    model_mbart,
    src_lang="en_XX",
    tgt_lang="id_ID",
)

In [6]:
# Keys used to pick the source and target text out of each "translation" record.
source_lang, target_lang = "en", "id"

def preprocess(data, max_length=None):
  """Tokenize a batch of translation pairs for seq2seq training.

  Args:
    data: Batch containing a "translation" column; every entry is a dict
      holding the text under the `source_lang` and `target_lang` keys.
    max_length: Optional cap on the number of tokens kept per source and per
      target sequence. With the default (None) the tokenizer's own maximum
      length is used for truncation. A smaller value (e.g. passed through
      `map(..., fn_kwargs={"max_length": 128})`) shortens padded batches.

  Returns:
    The tokenizer output for the source sentences, with a "labels" entry
    holding the token ids of the target sentences.
  """
  inputs = [pair[source_lang] for pair in data["translation"]]
  targets = [pair[target_lang] for pair in data["translation"]]
  # `text_target` encodes the targets in target-language mode and stores their
  # ids under "labels"; it replaces the deprecated `as_target_tokenizer()`
  # context manager.
  return tokenizer(
    inputs,
    text_target=targets,
    truncation=True,
    max_length=max_length,
  )

# Run the tokenization over the loaded datasets, handing `preprocess` whole batches at a time.
tokenized_datasets = raw_datasets.map(function=preprocess, batched=True)

In [7]:
# Fixed-seed shuffle, then keep the first 1000 examples of the train and test splits.
small_train_dataset = (
    tokenized_datasets["train"]
    .shuffle(seed=42)
    .select(range(1000))
)
small_eval_dataset = (
    tokenized_datasets["test"]
    .shuffle(seed=42)
    .select(range(1000))
)

In [8]:
# Load the pretrained checkpoint and move it to the CUDA device in one step.
model = MBartForConditionalGeneration.from_pretrained(model_mbart).cuda()

In [13]:
# Training configuration.
# A per-device training batch of 8 ran out of GPU memory with this checkpoint
# ("CUDA error: out of memory" from trainer.train()). Each forward/backward
# pass therefore handles 2 examples and gradients are accumulated over 4 passes,
# which keeps the effective batch size at 8 while lowering peak activation memory.
# NOTE(review): if memory is still insufficient, consider a smaller tokenization
# max_length or a lighter optimizer — confirm against the available GPU.
args = Seq2SeqTrainingArguments(
    "mbart-large-50-one-to-many-mmt-finetuned-en-to-id",  # output directory
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,  # evaluate on generated text so BLEU/METEOR can be computed
)

In [14]:
# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [15]:
# Evaluation metrics loaded through the `evaluate` library: SacreBLEU and METEOR.
metric = evaluate.load(path="sacrebleu")
meteor = evaluate.load(path="meteor")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` where ``preds`` is a list of stripped strings
        and ``labels`` is a list of one-element lists, each wrapping a stripped
        reference string.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # Each reference is wrapped in its own list (one reference per prediction).
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute BLEU, METEOR and mean generated length for an evaluation run.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. If ``preds``
            is a tuple, only its first element is used.

    Returns:
        Dict with the keys "bleu", "gen_len" and "meteor", each value rounded
        to 4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a real token id, so it cannot be
    # decoded. Labels always carry it; predictions can carry it as well when
    # generated batches of different lengths are padded together. Swap it for
    # the pad token in both arrays before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu_result = metric.compute(
        predictions=decoded_preds, references=decoded_labels)
    meteor_result = meteor.compute(
        predictions=decoded_preds, references=decoded_labels)

    # Length of each prediction, counted as its number of non-pad tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]

    result = {
        "bleu": bleu_result["score"],
        "gen_len": np.mean(prediction_lens),
        "meteor": meteor_result["meteor"],
    }
    return {k: round(v, 4) for k, v in result.items()}

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [16]:
# Assemble the trainer from the model, training arguments, data subsets,
# collator, tokenizer and metric function, then start fine-tuning.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Write the trainer's model to a local directory.
trainer.save_model(output_dir='opus-mt-en-id-finetuned-en-to-id')

In [None]:
# Translate a sample English sentence with the fine-tuned checkpoint.
src_text = ["I hope we all passed NLP. Hendrik is the best lecturer in Calvin Institute of Technology!!!"]
# Load from the directory that `trainer.save_model(...)` writes to, so this cell
# works after a fresh run of the notebook. A plain relative name also avoids
# backslash separators, which are Windows-only and act as escape characters in
# normal string literals.
model_path = 'opus-mt-en-id-finetuned-en-to-id'

tokenizer = MBart50TokenizerFast.from_pretrained(model_path, src_lang="en_XX")
model = MBartForConditionalGeneration.from_pretrained(model_path)

model_inputs = tokenizer(src_text, return_tensors="pt")
generated_tokens = model.generate(
    **model_inputs,
    # Force the first generated token to be the Indonesian language code.
    forced_bos_token_id=tokenizer.lang_code_to_id["id_ID"],
    max_new_tokens=360,
)
translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

print(translation)