In [1]:
# Show the NVIDIA GPUs visible to this runtime; the shell reports "command not found" when the tool is not installed.
!nvidia-smi

/bin/bash: nvidia-smi: command not found


In [None]:
# When running in Google Colab, install the dependencies first:
# !pip install transformers datasets evaluate

# Hugging Face notebook adaptation

Adapted from the Hugging Face [text classification notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb).

In [None]:
# Identifiers of the GLUE tasks this notebook can be pointed at.
GLUE_TASKS = [
    "cola",
    "mnli",
    "mnli-mm",
    "mrpc",
    "qnli",
    "qqp",
    "rte",
    "sst2",
    "stsb",
    "wnli",
]

# The task that every following cell operates on; pick one of the entries above.
task = "wnli"

In [None]:
# Only `load_dataset` is needed from `datasets`; the metric comes from the
# separate `evaluate` package (the unused `load_metric` import was dropped
# because it makes this cell fail on `datasets` releases that no longer ship it).
from datasets import load_dataset
import evaluate

# "mnli-mm" is not a GLUE configuration of its own: it shares the "mnli" data
# and metric and only differs in which validation split is used later on.
actual_task = "mnli" if task == "mnli-mm" else task
dataset = load_dataset("glue", actual_task)
metric = evaluate.load("glue", actual_task)

Found cached dataset glue (C:/Users/Valentin/.cache/huggingface/datasets/glue/wnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

## Load the tokenizer.

In [None]:
from transformers import AutoTokenizer

# Checkpoint name used for the tokenizer here and for the model further down.
model_checkpoint = "bert-base-uncased"

# use_fast=True asks for the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

## Preprocess the datasets depending on the chosen task

In [None]:
# Column layouts shared by several GLUE tasks: (first text column, second text
# column). The second entry is None for single-sentence tasks.
_SINGLE_SENTENCE = ("sentence", None)
_SENTENCE_PAIR = ("sentence1", "sentence2")
_PREMISE_HYPOTHESIS = ("premise", "hypothesis")

# Maps each task identifier to the dataset columns that hold its text input(s).
task_to_keys = {
    "cola": _SINGLE_SENTENCE,
    "mnli": _PREMISE_HYPOTHESIS,
    "mnli-mm": _PREMISE_HYPOTHESIS,
    "mrpc": _SENTENCE_PAIR,
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": _SENTENCE_PAIR,
    "sst2": _SINGLE_SENTENCE,
    "stsb": _SENTENCE_PAIR,
    "wnli": _SENTENCE_PAIR,
}

In [None]:
# Names of the text column(s) for the selected task; sentence2_key is None for single-sentence tasks.
sentence1_key, sentence2_key = task_to_keys[task]

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples for the selected task.

    Reads the column named by ``sentence1_key`` and, when ``sentence2_key`` is
    not None, the column named by ``sentence2_key`` as the paired second text.
    Truncation is enabled on the tokenizer call.

    Args:
        examples: Mapping from column name to the batch of values of that column.

    Returns:
        Whatever the module-level ``tokenizer`` returns for these inputs.
    """
    columns = [sentence1_key]
    if sentence2_key is not None:
        columns.append(sentence2_key)
    texts = [examples[column] for column in columns]
    return tokenizer(*texts, truncation=True)

In [None]:
# Tokenize the dataset; batched=True hands preprocess_function batches of examples instead of single rows.
encoded_dataset = dataset.map(preprocess_function, batched=True)

Loading cached processed dataset at C:\Users\Valentin\.cache\huggingface\datasets\glue\wnli\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-7818ec736c792fc6.arrow
Loading cached processed dataset at C:\Users\Valentin\.cache\huggingface\datasets\glue\wnli\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-179bd20f74eb937b.arrow


Map:   0%|          | 0/146 [00:00<?, ? examples/s]

## Load the model

In [None]:
from transformers import AutoModelForSequenceClassification

# Size of the classification head: three classes for the MNLI variants, a
# single output for STS-B, two classes for every other task.
if task.startswith("mnli"):
    num_labels = 3
elif task == "stsb":
    num_labels = 1
else:
    num_labels = 2

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    torch_dtype="auto",
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Training with a bare training loop
Based on chapter 3 of the Hugging Face course: <https://huggingface.co/course/chapter3/4?fw=pt>

## Prepare the data collator

In [None]:
from transformers import DataCollatorWithPadding
# Collator built from the tokenizer; it is passed as collate_fn to the DataLoaders below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Display the collator's configuration.
data_collator

DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

## Prepare dataloader

In [None]:
# Remove the columns the model cannot consume: the raw text column(s) of the
# selected task (taken from sentence1_key / sentence2_key so this works for
# every entry of task_to_keys, not only the sentence1/sentence2 tasks) and the
# example index.
# NOTE: this cell is not idempotent; re-running it fails because the columns
# are already gone. Re-run the tokenization cell first.
text_columns = [key for key in (sentence1_key, sentence2_key) if key is not None]
encoded_dataset = encoded_dataset.remove_columns(text_columns + ["idx"])

# The model's forward pass takes the targets under the name "labels".
encoded_dataset = encoded_dataset.rename_column("label", "labels")

# Have the dataset return PyTorch tensors.
encoded_dataset.set_format("torch")
encoded_dataset["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [None]:
from torch.utils.data import DataLoader

batch_size = 8

# MNLI has no plain "validation" split: it provides a matched and a mismatched
# one. Select the split the same way the Trainer section does so the manual
# loop also works for "mnli" and "mnli-mm".
eval_split = (
    "validation_mismatched" if task == "mnli-mm"
    else "validation_matched" if task == "mnli"
    else "validation"
)

# Training batches are reshuffled on every pass; evaluation order is kept fixed.
train_dataloader = DataLoader(
    encoded_dataset["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    encoded_dataset[eval_split], batch_size=batch_size, collate_fn=data_collator
)

In [None]:
# Sanity check: fetch a single batch and show the shape of each tensor in it.
# Jupyter only displays the last top-level expression of a cell, so the dict
# must not sit inside a loop body (where its value is silently discarded).
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


## Set up the optimizer

In [None]:
# Use PyTorch's AdamW: the implementation that used to live in `transformers`
# is deprecated in favour of `torch.optim.AdamW`.
from torch.optim import AdamW

# weight_decay=0.0 is passed explicitly because PyTorch's default is non-zero,
# whereas the optimizer used here before applied no weight decay by default.
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)



## Set up training parameters

In [None]:
from transformers import get_scheduler

num_epochs = 3

# One optimizer step per batch, so the schedule spans every batch of every epoch.
steps_per_epoch = len(train_dataloader)
num_training_steps = num_epochs * steps_per_epoch

# "linear" schedule with no warm-up steps, laid out over the whole training run.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

240


## Move the model to the GPU if possible

In [None]:
import torch

# Prefer CUDA when PyTorch can see a GPU, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cpu')

In [None]:
from tqdm.auto import tqdm
import evaluate

# One tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    # ---- training phase ----
    # Switch back to training mode each epoch (the evaluation phase below
    # leaves the model in eval mode).
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        # Gradients accumulate in .grad across backward() calls, so they are
        # cleared before computing the gradients of this batch.
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss

        # loss.backward() computes dloss/dx for every parameter x which has requires_grad=True
        # https://discuss.pytorch.org/t/what-does-the-backward-function-do/9944
        loss.backward()

        # optimizer.step() updates the value of x using the gradient x.grad
        # https://discuss.pytorch.org/t/what-does-the-backward-function-do/9944
        optimizer.step()

        # The learning-rate schedule advances once per optimizer step.
        lr_scheduler.step()
        progress_bar.update(1)

    # ---- evaluation phase ----
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        # STS-B is a regression task with a single output: use that value
        # directly (argmax over one column would always yield 0). This mirrors
        # what compute_metrics does in the Trainer section.
        if task == "stsb":
            predictions = logits[:, 0]
        else:
            predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    # Report the metric over the batches added during this epoch's evaluation.
    print(metric.compute())

# Use the trainer

In [None]:
from transformers import TrainingArguments, Trainer

# Metric used to pick the best checkpoint: Pearson correlation for STS-B,
# Matthews correlation for CoLA, accuracy for everything else.
if task == "stsb":
    metric_name = "pearson"
elif task == "cola":
    metric_name = "matthews_correlation"
else:
    metric_name = "accuracy"

# Strip an organisation prefix such as "org/name" from the checkpoint id.
model_name = model_checkpoint.rsplit("/", 1)[-1]
output_dir = f"{model_name}-finetuned-{task}"

# Evaluate and save once per epoch so the best checkpoint can be reloaded at
# the end of training.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
args = TrainingArguments(
    output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Score model outputs with the metric loaded for the selected task.

    Args:
        eval_pred: Pair of ``(predictions, labels)``. ``predictions`` is indexed
            as a 2-D array of raw model outputs.

    Returns:
        The result of ``metric.compute`` for these predictions and labels.
    """
    raw_outputs, labels = eval_pred
    # STS-B is scored on the single output column; every other task is scored
    # on the index of the largest output per row.
    predictions = raw_outputs[:, 0] if task == "stsb" else np.argmax(raw_outputs, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
# MNLI provides a matched and a mismatched validation split instead of a plain
# "validation" split; every other task uses "validation".
validation_key = "validation_mismatched" if task == "mnli-mm" else "validation_matched" if task == "mnli" else "validation"

# Start again from the pretrained checkpoint: by this point `model` has already
# been fine-tuned by the manual training loop, so handing it to the Trainer
# would continue that run and make the Trainer's metrics depend on which cells
# were executed before.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    torch_dtype="auto",
)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the Trainer configured above; the returned TrainOutput is displayed as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.616189,0.67509
2,0.560300,0.661755,0.722022


TrainOutput(global_step=624, training_loss=0.5238346258799235, metrics={'train_runtime': 549.57, 'train_samples_per_second': 9.062, 'train_steps_per_second': 1.135, 'total_flos': 1270065912869880.0, 'train_loss': 0.5238346258799235, 'epoch': 2.0})