In [1]:
from datasets import load_dataset

We use the mrpc dataset as an example dataset. It contains paraphrases and non-paraphrases. We will filter it and use only the paraphrases.

In [2]:
# Load the MRPC configuration of the GLUE benchmark; the 'train', 'validation'
# and 'test' splits of the returned object are used in the cells below.
raw_datasets = load_dataset(path='glue', name='mrpc')
raw_datasets

Let's store the splits separately and look at one example.

In [3]:
# Pull the three splits out of the loaded dataset in one unpacking step.
raw_datasets_train, raw_datasets_val, raw_datasets_test = (
    raw_datasets[split_name] for split_name in ('train', 'validation', 'test')
)
# Show the first training record to see what an example looks like.
raw_datasets_train[0]

Clearly label=1 means that the sentences are paraphrases. Let's filter them.

In [4]:
def _is_paraphrase(example):
    """Return True when the example carries label 1 (the pair is a paraphrase)."""
    return example['label'] == 1


# Keep only the paraphrase pairs in every split.
ds_train, ds_val, ds_test = (
    split.filter(_is_paraphrase)
    for split in (raw_datasets_train, raw_datasets_val, raw_datasets_test)
)
# Sizes of the filtered splits, in (train, val, test) order.
tuple(len(split) for split in (ds_train, ds_val, ds_test))

Seems legit. That's a little over half the dataset.

For testing, if we want to use only a small subset of the data we can do that here:

In [None]:
# Optional: uncomment the lines below to shrink every split for a quick test run.
# Leave them commented out to train and evaluate on the full filtered data.
# ds_train = ds_train.select(range(200))
# ds_val = ds_val.select(range(40))
# ds_test = ds_test.select(range(100))

Now, let's prepare the data for training.

In [5]:
from transformers import T5Tokenizer

In [6]:
# Name of the pretrained checkpoint; the model is loaded from the same name further down.
checkpoint = 'google/mt5-small'
# Tokenizer that belongs to this checkpoint.
tokenizer = T5Tokenizer.from_pretrained(checkpoint)

FYI: [here](https://huggingface.co/learn/nlp-course/chapter7/4?fw=pt#processing-the-data) it says to specify `tokenizer.src_lang` and `tokenizer.tgt_lang` for multilingual models/tokenizers, but this tokenizer does not have these properties.

Let's see how the tokenizer works:

In [7]:
# Use the first paraphrase pair of the training split as a worked example.
first_pair = ds_train[0]
s1, s2 = first_pair['sentence1'], first_pair['sentence2']
print(s1, s2, sep='\n')
# s1 is encoded as the model input; s2 is passed via `text_target`.
inputs = tokenizer(s1, text_target=s2)
inputs

Now we create a preprocess function that turns a dataset item into a form that the model can use for training.

First, let's find out what a reasonable `max_length` is.

In [None]:
import matplotlib.pyplot as plt


def pair_char_lengths(dataset):
    """Return, for each example, the character length of its longer sentence.

    Args:
        dataset: Iterable of examples with 'sentence1' and 'sentence2' strings.

    Returns:
        list[int]: one length per example, in dataset order.
    """
    return [max(len(ex['sentence1']), len(ex['sentence2'])) for ex in dataset]


# Measure every split on its own data. Previously the validation and test
# lengths were computed from ds_train as well, so the histogram showed the
# training split three times and ignored the other two.
train_lengths = pair_char_lengths(ds_train)
val_lengths = pair_char_lengths(ds_val)
test_lengths = pair_char_lengths(ds_test)

# One histogram over all splits, 100 bins.
plt.hist(train_lengths + val_lengths + test_lengths, 100)
plt.xlabel('length of the longer sentence (characters)')
plt.ylabel('number of sentence pairs')
plt.show()

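Note that the x axis here shows lengths in characters, not tokens. A token typically spans several characters, so the token counts will be lower than these values; with `max_length = 128` we are on the safe side.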

In [8]:
# Length cap handed to the tokenizer in preprocess_function.
max_length = 128

# Task prefix prepended to every source sentence. It may have to be adjusted
# (dynamically) per language or for multilingual training -- unverified.
prefix = 'paraphrase: '


def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        examples: Batch with parallel 'sentence1' and 'sentence2' lists.

    Returns:
        The tokenizer output for the prefixed 'sentence1' texts, with
        'sentence2' supplied as `text_target`.
    """
    prefixed_sources = [prefix + sentence for sentence in examples['sentence1']]
    # Truncation will rarely trigger, but it enforces the length cap.
    return tokenizer(
        prefixed_sources,
        text_target=examples['sentence2'],
        truncation=True,
        max_length=max_length,
    )

Now we apply the preprocessing function to the datasets.

In [9]:
def _tokenize_split(split):
    """Apply preprocess_function to a split in batches, dropping its original columns."""
    return split.map(
        preprocess_function,
        batched=True,
        remove_columns=split.column_names,
    )


tokenized_ds_train = _tokenize_split(ds_train)
tokenized_ds_val = _tokenize_split(ds_val)
tokenized_ds_test = _tokenize_split(ds_test)

Now the data is ready.

Next, we load the model and create a data collator.

In [10]:
from transformers import MT5ForConditionalGeneration

# Load the pretrained model from the same checkpoint name the tokenizer was loaded from.
model = MT5ForConditionalGeneration.from_pretrained(checkpoint)

In [11]:
from transformers import DataCollatorForSeq2Seq

# Collator handed to the trainer further down; it is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Here, I'll skip the example usage of the data collator; you can check it out [here](https://huggingface.co/learn/nlp-course/chapter7/4?fw=pt#data-collation).

Now, let's continue with metrics. We will use Parascore.

In [12]:
from parascore import ParaScorer

# ParaScore scorer configured for English with bert-base-uncased as the model type.
scorer = ParaScorer(lang='en', model_type='bert-base-uncased')

Let's quickly go over how Parascore is used:

In [21]:
# (candidate, source) pairs: each candidate is scored against its source sentence.
example_pairs = [
    ("A young person is skating.", "There's a child on a skateboard."),
    ("I like sports.", "I like to relax."),
    ("He catches the ball.", "good morning, everyone!"),
    ("That's very interesting!", "I find this interesting."),
]
cands = [pair[0] for pair in example_pairs]
sources = [pair[1] for pair in example_pairs]
score = scorer.free_score(cands, sources)
# NOTE(review): `[-1]` takes the last element of the returned value before
# averaging -- confirm against the ParaScore docs that this is the intended
# aggregate and not just the score of the final pair.
float(score[-1].mean())

Now, here's the `compute_metrics` function (mostly copied from [here](https://huggingface.co/learn/nlp-course/chapter7/4?fw=pt#metrics)):

In [24]:
import numpy as np

In [27]:
def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with ParaScore.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also be a tuple, in which case its first element is used.

    Returns:
        dict: ``{'parascore': float}``.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded. Labels always carry it; predictions can carry it too when
    # batches of different lengths are concatenated, so replace it in both.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode and strip surrounding whitespace.
    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    # NOTE(review): the decoded labels (the target sentences) are passed in the
    # slot that the usage example calls `sources` -- confirm this is intended.
    # NOTE(review): `[-1]` takes the last element of the returned value before
    # averaging -- confirm this is the intended aggregate and not just the
    # score of the final pair.
    parascore = scorer.free_score(decoded_preds, decoded_labels)
    return {'parascore': float(parascore[-1].mean())}
    

In [26]:
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer

In [31]:


args = Seq2SeqTrainingArguments(
    # Where outputs are written and how many checkpoints are kept.
    output_dir='.',
    save_strategy="epoch",
    save_total_limit=3,
    # Evaluate once per epoch, generating text so compute_metrics can decode it.
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
)

In [32]:
# Wire model, arguments, data, collator and metric into one trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_ds_train,
    eval_dataset=tokenized_ds_val,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning; evaluation and checkpointing follow the strategies set in `args`.
trainer.train()