In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## preprocessing data before instantiating trainer classes



AttributeError: 'Version' object has no attribute 'major'

In [2]:
!pip install --upgrade numexpr

Collecting numexpr
  Downloading numexpr-2.7.3-cp37-cp37m-manylinux2010_x86_64.whl (471 kB)
[K     |████████████████████████████████| 471 kB 1.3 MB/s 
Installing collected packages: numexpr
  Attempting uninstall: numexpr
    Found existing installation: numexpr 2.6.9
    Uninstalling numexpr-2.6.9:
      Successfully uninstalled numexpr-2.6.9
Successfully installed numexpr-2.7.3


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")
# training args can be many hyperparameters, currently only default ones are enough

In [None]:
from transformers import AutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# because BERT has not been pretrained on classifying pairs of sentences, 
# so the head of the pretrained model has been discarded and a new head suitable for sequence classification has been added instead
# warnings indicate that some weights were not used (the ones corresponding to the dropped pretraining head) 
# and that some others were randomly initialized (the ones for the new head).

# beacuse head's weights are randomly initialized we have to train it!

In [None]:
AutoModelForSequenceClassification.from_pretrained("./test", num_labels=2)

In [None]:
# using dataset, model, data_collator, tokenizer and trainerargs classes you can now define a Trainer class
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# to start training the model
# reports the training loss every 500

# It won’t, however, tell you how well (or badly) your model is performing
# beacuse we didnt tell Trainer to "evaluate" or "compute metrics"

trainer.train()

In [None]:
# compute_metrics function needs to take EvalPrediction object as input, and return dict mapping of 
# strings (the strings being the names of the metrics returned) to floats (their values)

# to get predictions Trainer.predict method can be used

predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

# output of the predict method is another named tuple with three fields: predictions, label_ids, and metrics
#   metrics: field will just contain the loss on the dataset passed, as well as some time metric
# predictions is a two-dimensional array with shape 408 x 2 (408 being the number of elements in the dataset we used)
# Those are the logits for each element of the dataset we passed to predict 

In [None]:
import numpy as np
preds = np.argmax(predictions.predictions, axis=-1)

# we need to take the index with the maximum value on the second axis to compare meaningful outputs to the labels

In [None]:
# to calculate metrics
from datasets import load_metric

metric = load_metric("glue", "mrpc")
metric.compute(predictions=preds, 
               references=predictions.label_ids)

In [2]:
# wrapping all together as a single function:
def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [3]:
# to see it used in action to report metrics at the end of each epoch:

training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# The Trainer will work out of the box on multiple GPUs 

NameError: name 'TrainingArguments' is not defined