In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint name shared by the tokenizer below.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# The "mrpc" configuration of the "glue" dataset.
raw_datasets = load_dataset(path="glue", name="mrpc")

def tokenize_function(example):
    """Tokenize the "sentence1"/"sentence2" fields of ``example`` as a pair.

    Truncation is enabled; no padding argument is passed to the tokenizer here.
    Returns whatever the module-level ``tokenizer`` returns for the pair.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

# Preprocess the data before instantiating the Trainer-related classes.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

In [2]:
from transformers import TrainingArguments

# TrainingArguments accepts many hyperparameters; for now only the output
# directory is given and everything else keeps its default value.
training_args = TrainingArguments(output_dir="test-trainer")

In [3]:
from transformers import AutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# BERT has not been pretrained on classifying pairs of sentences, so the head of the
# pretrained model is discarded and a new head suitable for sequence classification
# is added instead.
# The warnings indicate that some weights were not used (the ones corresponding to the
# dropped pretraining head) and that some others were randomly initialized (the ones
# for the new head).

# Because the new head's weights are randomly initialized, the model has to be trained!

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
# Load a sequence-classification model from the local "./test" directory.
# NOTE(review): no visible cell writes "./test" -- presumably it was saved by a cell
# that is no longer in the notebook; confirm before relying on Restart & Run All.
AutoModelForSequenceClassification.from_pretrained("./test", num_labels=2)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [4]:
# With the datasets, model, data collator, tokenizer and TrainingArguments ready, a Trainer can now be defined.
from transformers import Trainer

# Hand the model, its training arguments, the train/validation splits,
# the data collator and the tokenizer to a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

In [13]:
# List the installed packages whose `pip freeze` line contains "torch".
!pip freeze |grep torch

torch==1.9.0


In [None]:
# Start fine-tuning the model.
# The training loss is reported every 500 steps.

# It won't, however, tell you how well (or badly) the model is performing,
# because we didn't tell the Trainer to evaluate during training or give it
# a function to compute metrics.

trainer.train()

In [None]:
# A compute_metrics function needs to take an EvalPrediction object as input and return
# a dict mapping strings (the names of the metrics) to floats (their values).

# To get predictions from the model, the Trainer.predict method can be used.

predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

# The output of the predict method is another named tuple with three fields:
# predictions, label_ids, and metrics.
#   metrics: contains the loss on the dataset passed, as well as some time metrics.
# predictions is a two-dimensional array with shape 408 x 2 (408 being the number of
# elements in the dataset we used); those are the logits for each element of the
# dataset passed to predict.

In [None]:
import numpy as np
# Take the index of the maximum value along the last axis so that each row of
# logits becomes a single predicted class that can be compared to the labels.
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
# to calculate metrics
from datasets import load_metric

# Load the "glue"/"mrpc" metric and score the predicted classes against the
# reference labels; the returned value is this cell's displayed output.
metric = load_metric("glue", "mrpc")
metric.compute(references=predictions.label_ids, predictions=preds)

In [None]:
# wrapping all together as a single function:
def compute_metrics(eval_preds):
    """Compute the "glue"/"mrpc" metric for one evaluation pass.

    ``eval_preds`` is unpacked into ``(logits, labels)``. The logits are reduced
    to class indices with an argmax over the last axis and scored against the
    labels. Returns whatever the loaded metric's ``compute`` returns.
    """
    glue_metric = load_metric("glue", "mrpc")
    raw_scores, gold_labels = eval_preds
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return glue_metric.compute(
        predictions=predicted_classes,
        references=gold_labels,
    )

In [None]:
# to see it used in action to report metrics at the end of each epoch:

# Same output directory as before, but now with evaluation at the end of every epoch.
training_args = TrainingArguments(output_dir="test-trainer", evaluation_strategy="epoch")
# Reload the model from the checkpoint so this run gets a newly created model object.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# The Trainer will work out of the box on multiple GPUs 