# Training a translation model

Dataset: KDE4, Model: Marian

Set up the environment

In [5]:
import os
# Send both HTTP and HTTPS requests through the local proxy
os.environ.update(
    {
        "http_proxy": "127.0.0.1:7890",
        "https_proxy": "127.0.0.1:7890",
    }
)

Load the dataset:

In [6]:
from datasets import load_dataset
# Load the "kde4" dataset for the English ("en") / French ("fr") language pair
raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")

Downloading data:   0%|          | 0.00/7.05M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/210173 [00:00<?, ? examples/s]

Take a look at the raw dataset

In [7]:
# Display the dataset summary: available splits, their features and row counts
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

The dataset only contains a `train` split, so the next step is to split off a held-out set ourselves.

In [8]:
# Keep 90% of the examples for training and hold out the rest;
# the fixed seed makes the split repeatable across runs
full_train = raw_datasets["train"]
split_dataset = full_train.train_test_split(train_size=0.9, seed=20)
split_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

In [10]:
# Rename the "test" split produced by train_test_split to "validation".
# The guard keeps this cell idempotent: once the rename has happened, "test"
# no longer exists and an unguarded pop would raise KeyError on a re-run.
if "test" in split_dataset:
    split_dataset["validation"] = split_dataset.pop("test")

In [11]:
# Peek at one translation pair from the training split
split_dataset["train"][1]["translation"]

{'en': 'Default to expanded threads',
 'fr': 'Par défaut, développer les fils de discussion'}

In [12]:
# Peek at one translation pair from the validation split
split_dataset["validation"][1]["translation"]

{'en': 'Customize Formatting', 'fr': 'Personnaliser le formatage'}

Load the tokenizer

In [16]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
# `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`. Passing it here did not change the encodings (the sample
# encoding shown further down is made of plain Python lists), so it is omitted.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]



Take a look at one sample

In [17]:
# Grab one training pair and unpack its English / French sides
sample_pair = split_dataset["train"][1]["translation"]
en_sentence, fr_sentence = sample_pair["en"], sample_pair["fr"]

# Encode the source sentence; the target is passed as `text_target`
inputs = tokenizer(en_sentence, text_target=fr_sentence)
inputs

{'input_ids': [47591, 12, 9842, 19634, 9, 0], 'attention_mask': [1, 1, 1, 1, 1, 1], 'labels': [577, 5891, 2, 3184, 16, 2542, 5, 1710, 0]}

The last thing to do is to define our preprocessing function.

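Inputs and targets are truncated to a maximum length of 128 tokens. No padding is applied at this stage: the data collator pads each batch later on, using -100 for the labels so that padded positions are ignored by the loss.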

In [19]:
max_length = 128

def preprocess_function(examples):
    """Tokenize a batch of English-to-French translation pairs.

    Args:
        examples: A batch whose "translation" entry is a list of dicts, each
            holding an "en" (source) and an "fr" (target) string.

    Returns:
        The tokenizer output for the batch: the encoded English inputs together
        with the encoded French targets (passed as ``text_target``), both
        truncated to at most ``max_length`` tokens.
    """
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )
    # The encodings must be returned: without this the function yields None
    # and the mapped dataset receives no tokenized columns.
    return model_inputs

And apply it to our dataset

In [20]:
# Run the preprocessing over every split in batches, dropping the original
# columns of the training split from the result
raw_columns = split_dataset["train"].column_names
tokenized_datasets = split_dataset.map(
    preprocess_function, batched=True, remove_columns=raw_columns
)

Map:   0%|          | 0/189155 [00:00<?, ? examples/s]

Map:   0%|          | 0/21018 [00:00<?, ? examples/s]

Load the model

In [21]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained sequence-to-sequence model from the same checkpoint
# that was used for the tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Data Collator

In [22]:
from transformers import DataCollatorForSeq2Seq

# Build the seq2seq data collator from the tokenizer and the model
# NOTE(review): presumably this handles per-batch padding of inputs and labels — confirm against the library docs
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Metric: SacreBLEU

In [23]:
# Install the sacrebleu package used by the "sacrebleu" metric loaded afterwards
# NOTE(review): the version is not pinned — consider pinning it for reproducibility
%pip install sacrebleu

Note: you may need to restart the kernel to use updated packages.


In [24]:
import evaluate

# Load the "sacrebleu" metric through the evaluate library
metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

A function to compute the metric

In [25]:
import numpy as np


def compute_metrics(eval_preds):
    """Compute the SacreBLEU score for a set of evaluation predictions.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case only its
            first element is used.

    Returns:
        A dict ``{"bleu": score}`` holding the "score" entry of the metric
        result.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding placeholder rather than a real token id, so it cannot
    # be decoded. It can show up in the predictions as well as in the labels,
    # so replace it with the pad token in both before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: strip whitespace; each reference is wrapped
    # in its own list because the metric is given a list of references per
    # prediction
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}