In [None]:
import os
os.environ["WANDB_DISABLED"]="true"

In [1]:
# ! pip install datasets transformers sacrebleu torch sentencepiece transformers[sentencepiece]

Make sure your version of Transformers is at least 4.11.0 since the functionality was introduced in that version

In [2]:
import transformers
print(transformers.__version__)

4.15.0


### Loading the dataset

In [4]:
from datasets import load_dataset, load_metric
raw_datasets = load_dataset("wmt16", "ro-en")
metric = load_metric("sacrebleu")

Reusing dataset wmt16 (/home/gabriel/.cache/huggingface/datasets/wmt16/ro-en/1.0.0/af3c5d746b307726d0de73ebe7f10545361b9cb6f75c83a1734c000e48b6264f)


  0%|          | 0/3 [00:00<?, ?it/s]

The dataset object itself is [datasetdict](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasetdict), which contains one key for the training, validation and test set:

In [5]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 610320
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1999
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1999
    })
})

In [6]:
raw_datasets["train"][0]

{'translation': {'en': 'Membership of Parliament: see Minutes',
  'ro': 'Componenţa Parlamentului: a se vedea procesul-verbal'}}

In [7]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML
def show_random_elements(dataset, num_examples=5):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))
show_random_elements(raw_datasets["train"])

Unnamed: 0,translation
0,"{'en': 'For all these reasons, we voted against the report.', 'ro': 'Din aceste motive, noi am votat împotriva acestui raport.'}"
1,"{'en': 'What is scandalous, however, is the fact that the negotiations in the WTO are still dragging on, and that alone makes it clear that this form of trade facilitation is not suitable to be used as an emergency aid measure.', 'ro': 'Cu adevărat scandaloasă este însă tergiversarea negocierilor din cadrul OMC, care indică clar faptul că această formă de facilitare comercială nu este adecvată ca instrument de acordare a unui ajutor de urgență.'}"
2,"{'en': '""Unfortunately, last Saturday we saw an example of how things should not go,"" the NATO chief said.', 'ro': '""Din nefericire, sâmbăta trecută am asistat la un exemplu despre cum nu trebuie să meargă lucrurile"", a afirmat şeful NATO.'}"
3,"{'en': 'China is probably best placed to stand up for Chinese minorities, but it would first have to appreciate its own minorities: the Tibetans and the Uighurs.', 'ro': 'China este probabil în cea mai bună postură pentru a apăra minorităţile chineze, însă ar trebui mai întâi să-şi poată aprecia propriile minorităţi: tibetanii şi uigurii.'}"
4,"{'en': 'I am sure that this is going to work much better than what we had in the past.', 'ro': 'Sunt convinsă că toate acestea vor funcţiona mult mai bine decât au funcţionat în trecut.'}"


# Preprocessing the data

In [9]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ro")

In [10]:
with tokenizer.as_target_tokenizer():
    print(tokenizer(["Hello, this one sentence!", "This is another sentence."]))

{'input_ids': [[10334, 1204, 3, 15, 8915, 27, 452, 59, 29579, 581, 23, 0], [235, 1705, 11, 32, 8, 1205, 5305, 59, 29579, 581, 2, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [11]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "en"
target_lang = "ro"
def preprocess_function(examples):
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[393, 4462, 14, 1137, 53, 216, 28636, 0], [24385, 14, 28636, 14, 4646, 4622, 53, 216, 28636, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[42140, 494, 1750, 53, 8, 59, 903, 3543, 9, 15202, 0], [36199, 6612, 9, 15202, 122, 568, 35788, 21549, 53, 8, 59, 903, 3543, 9, 15202, 0]]}

In [13]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/611 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

## Fine-tuning the model

In [24]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-it")

https://huggingface.co/Helsinki-NLP/opus-mt-en-it/resolve/main/config.json not found in cache or force_download set to True, downloading to /home/gabriel/.cache/huggingface/transformers/tmp3vhjylp2


Downloading:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

storing https://huggingface.co/Helsinki-NLP/opus-mt-en-it/resolve/main/config.json in cache at /home/gabriel/.cache/huggingface/transformers/c8e8f32034184208868243df38421204a3616f39385239385e06bcbfd876ee03.fd3818756242be8e05effce77b1609dba7d808f97edbbf5fc654365ab6a94919
creating metadata file for /home/gabriel/.cache/huggingface/transformers/c8e8f32034184208868243df38421204a3616f39385239385e06bcbfd876ee03.fd3818756242be8e05effce77b1609dba7d808f97edbbf5fc654365ab6a94919
loading configuration file https://huggingface.co/Helsinki-NLP/opus-mt-en-it/resolve/main/config.json from cache at /home/gabriel/.cache/huggingface/transformers/c8e8f32034184208868243df38421204a3616f39385239385e06bcbfd876ee03.fd3818756242be8e05effce77b1609dba7d808f97edbbf5fc654365ab6a94919
Model config MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-en-it",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectu

Downloading:   0%|          | 0.00/327M [00:00<?, ?B/s]

storing https://huggingface.co/Helsinki-NLP/opus-mt-en-it/resolve/main/pytorch_model.bin in cache at /home/gabriel/.cache/huggingface/transformers/9f3f7f8a1db4960f6bc1e56c2371c0a0d660a592887ac11f257cbb58d1502ecf.e7b48d502e6626706d68234549958f2446a59bbdce4a3c9dc96f1cb1e05c393e
creating metadata file for /home/gabriel/.cache/huggingface/transformers/9f3f7f8a1db4960f6bc1e56c2371c0a0d660a592887ac11f257cbb58d1502ecf.e7b48d502e6626706d68234549958f2446a59bbdce4a3c9dc96f1cb1e05c393e
loading weights file https://huggingface.co/Helsinki-NLP/opus-mt-en-it/resolve/main/pytorch_model.bin from cache at /home/gabriel/.cache/huggingface/transformers/9f3f7f8a1db4960f6bc1e56c2371c0a0d660a592887ac11f257cbb58d1502ecf.e7b48d502e6626706d68234549958f2446a59bbdce4a3c9dc96f1cb1e05c393e
All model checkpoint weights were used when initializing MarianMTModel.

All the weights of MarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-it.
If your task is similar to the task the model of

In [29]:
batch_size = 16
model_name = "Helsinki-NLP/opus-mt-en-it".split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    predict_with_generate=True    
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [30]:
# from google.colab import drive
# drive.mount('/content/drive')

In [31]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [32]:
import numpy as np
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [33]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation.
***** Running training *****
  Num examples = 610320
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 762900


Epoch,Training Loss,Validation Loss


Saving model checkpoint to opus-mt-en-it-finetuned-en-to-ro/checkpoint-500
Configuration saved in opus-mt-en-it-finetuned-en-to-ro/checkpoint-500/config.json
Model weights saved in opus-mt-en-it-finetuned-en-to-ro/checkpoint-500/pytorch_model.bin
tokenizer config file saved in opus-mt-en-it-finetuned-en-to-ro/checkpoint-500/tokenizer_config.json
Special tokens file saved in opus-mt-en-it-finetuned-en-to-ro/checkpoint-500/special_tokens_map.json
Saving model checkpoint to opus-mt-en-it-finetuned-en-to-ro/checkpoint-1000
Configuration saved in opus-mt-en-it-finetuned-en-to-ro/checkpoint-1000/config.json
Model weights saved in opus-mt-en-it-finetuned-en-to-ro/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in opus-mt-en-it-finetuned-en-to-ro/checkpoint-1000/tokenizer_config.json
Special tokens file saved in opus-mt-en-it-finetuned-en-to-ro/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to opus-mt-en-it-finetuned-en-to-ro/checkpoint-1500
Configuration saved i

In [None]:
import os
for dirname, _, filenames in os.walk('opus-mt-en-ro-finetuned-en-to-ro'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

opus-mt-en-ro-finetuned-en-to-ro/checkpoint-37000/vocab.json
opus-mt-en-ro-finetuned-en-to-ro/checkpoint-37000/rng_state.pth
opus-mt-en-ro-finetuned-en-to-ro/checkpoint-37000/config.json
opus-mt-en-ro-finetuned-en-to-ro/checkpoint-37000/tokenizer_config.json
opus-mt-en-ro-finetuned-en-to-ro/checkpoint-37000/trainer_state.json
opus-mt-en-ro-finetuned-en-to-ro/checkpoint-37000/scheduler.pt
opus-mt-en-ro-finetuned-en-to-ro/checkpoint-37000/special_tokens_map.json
opus-mt-en-ro-finetuned-en-to-ro/checkpoint-37000/pytorch_model.bin
opus-mt-en-ro-finetuned-en-to-ro/checkpoint-37000/target.spm
opus-mt-en-ro-finetuned-en-to-ro/checkpoint-37000/source.spm
opus-mt-en-ro-finetuned-en-to-ro/checkpoint-37000/training_args.bin
opus-mt-en-ro-finetuned-en-to-ro/checkpoint-37000/optimizer.pt
opus-mt-en-ro-finetuned-en-to-ro/runs/Sep30_03-49-58_a84265f68a25/events.out.tfevents.1632973810.a84265f68a25.217.0
opus-mt-en-ro-finetuned-en-to-ro/runs/Sep30_03-49-58_a84265f68a25/1632973810.56194/events.out.tfev

Our fine tuned model already saved under *opus-mt-en-ro-finetuned-en-to-ro/checkpoint-38000*

Load the model and translate some text from english to romanian

In [None]:
from transformers import MarianMTModel, MarianTokenizer
src_text = ['My name is Sarah and I live in London']

model_name = 'opus-mt-en-ro-finetuned-en-to-ro/checkpoint-38000'
tokenizer = MarianTokenizer.from_pretrained(model_name)
print(tokenizer.supported_language_codes)

model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
[tokenizer.decode(t, skip_special_tokens=True) for t in translated]

Didn't find file opus-mt-en-ro-finetuned-en-to-ro/checkpoint-38000/added_tokens.json. We won't load it.
Didn't find file opus-mt-en-ro-finetuned-en-to-ro/checkpoint-38000/tokenizer.json. We won't load it.
loading file opus-mt-en-ro-finetuned-en-to-ro/checkpoint-38000/source.spm
loading file opus-mt-en-ro-finetuned-en-to-ro/checkpoint-38000/target.spm
loading file opus-mt-en-ro-finetuned-en-to-ro/checkpoint-38000/vocab.json
loading file opus-mt-en-ro-finetuned-en-to-ro/checkpoint-38000/tokenizer_config.json
loading file None
loading file opus-mt-en-ro-finetuned-en-to-ro/checkpoint-38000/special_tokens_map.json
loading file None


[]
