# Step by step

In [1]:
!pip install datasets
!pip install transformers



In [2]:
from datasets import load_dataset

# Fetch the MRPC configuration of the GLUE benchmark; the result is a
# DatasetDict holding the train / validation / test splits.
raw_datasets = load_dataset(path="glue", name="mrpc")
raw_datasets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [3]:
# Show the column schema of the training split (column names and feature types,
# including the class names behind the integer `label`).
raw_datasets['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [4]:
from transformers import AutoTokenizer

model_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# The tokenizer's leading arguments are (text, text_pair, text_target).
# Only the first two strings are encoded together as one
# "[CLS] A [SEP] B [SEP]" input; token_type_ids is 0 for the first sentence and
# 1 for the second, which is how BERT tells the two segments apart.
# The third string is NOT a third input segment: it is tokenized on its own and
# returned under 'labels'. Keywords are used so this is visible at the call site.
inputs = tokenizer(
    text="What a great sentence this is! Ain't I right?",
    text_pair="Right on!",
    text_target="Right on indeed!",
)
inputs

{'input_ids': [101, 2054, 1037, 2307, 6251, 2023, 2003, 999, 7110, 1005, 1056, 1045, 2157, 1029, 102, 2157, 2006, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [101, 2157, 2006, 5262, 999, 102]}

In [5]:
# Map the encoded ids back to token strings to see where the special
# [CLS] / [SEP] markers were inserted around the two sentences.
tokenizer.convert_ids_to_tokens(inputs['input_ids'])

['[CLS]',
 'what',
 'a',
 'great',
 'sentence',
 'this',
 'is',
 '!',
 'ain',
 "'",
 't',
 'i',
 'right',
 '?',
 '[SEP]',
 'right',
 'on',
 '!',
 '[SEP]']

In [6]:
def tokenize_function(raw_data):
  """Tokenize the 'sentence1' / 'sentence2' columns of `raw_data` as pairs.

  Both columns are handed to the module-level `tokenizer` in one call so each
  pair is encoded together. Truncation is switched on; padding is deliberately
  not applied here and is left for a later step.
  """
  first_sentences = raw_data['sentence1']
  second_sentences = raw_data['sentence2']
  return tokenizer(first_sentences, second_sentences, truncation = True)
# Apply the tokenization to every split; with batched = True the function
# receives many examples per call instead of one at a time.
tokenized_datasets = raw_datasets.map(function = tokenize_function, batched = True)

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

We tokenize them together to make use of BERT's next-sentence-prediction pretraining on sentence pairs.

As such, we pass the two sentences to the tokenizer one after the other.

Using the `map` method lets us pass the `batched = True` argument, which speeds up the tokenization a lot, so it is the recommended approach.

In [7]:
# Display the tokenized DatasetDict: each split keeps its original columns and
# gains 'input_ids', 'token_type_ids' and 'attention_mask'.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [8]:
# Dynamic padding: the collator pads each batch only up to the longest sequence
# in that batch, rather than padding everything to a fixed maximum length.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)

 **Training**

In [9]:
! pip install -U accelerate
! pip install -U transformers



In [10]:
from transformers import TrainingArguments, AutoModelForSequenceClassification

# All hyperparameters are left at their defaults; only the output directory is set.
training_args = TrainingArguments(output_dir = 'test_trainer')
# Two output labels, matching MRPC's 'not_equivalent' / 'equivalent' classes.
# The classification head is not part of the pretrained checkpoint and is
# newly initialised, so the model has to be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels = 2,
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.2


In [19]:
import evaluate
import numpy as np
def compute_metrics(eval_preds):
  """Score model outputs with the GLUE/MRPC metric.

  `eval_preds` unpacks into (logits, labels). The predicted class for each
  example is the index of its largest logit along the last axis; predictions
  and reference labels are then passed to the metric's `compute`.
  """
  mrpc_metric = evaluate.load('glue', 'mrpc')  # metric bundled for this specific task
  scores, gold_labels = eval_preds
  predicted_classes = np.argmax(scores, axis = -1)
  return mrpc_metric.compute(
      predictions = predicted_classes,
      references = gold_labels,
  )

In [22]:
from transformers import Trainer

# Wire model, arguments, data, collator and metric function into one Trainer,
# then fine-tune. Evaluation uses the validation split.
# NOTE(review): newer transformers releases deprecate `tokenizer=` in favour of
# `processing_class=` -- confirm against the installed version.
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['validation'],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
)
trainer.train()

Step,Training Loss
500,0.1188
1000,0.0564


TrainOutput(global_step=1377, training_loss=0.07274493603231946, metrics={'train_runtime': 122.5719, 'train_samples_per_second': 89.776, 'train_steps_per_second': 11.234, 'total_flos': 405114969714960.0, 'train_loss': 0.07274493603231946, 'epoch': 3.0})

In [23]:
# Run evaluation on the Trainer's eval_dataset (the validation split); the
# returned dict includes the loss plus the values produced by compute_metrics.
trainer.evaluate()

{'eval_loss': 1.0693354606628418,
 'eval_accuracy': 0.8382352941176471,
 'eval_f1': 0.8850174216027874,
 'eval_runtime': 1.9893,
 'eval_samples_per_second': 205.101,
 'eval_steps_per_second': 25.638,
 'epoch': 3.0}

USING AN L4 GPU INSTEAD OF THE CPU CHANGED THE TRAINING TIME FROM 2 HOURS TO 2 MINUTES!

# Full Code

In [None]:
!pip install datasets
!pip install transformers
! pip install -U accelerate
! pip install -U transformers
!pip install evaluate

In [None]:
# End-to-end fine-tuning of BERT on GLUE/MRPC (paraphrase detection).
# All imports live at the top so the cell runs on a fresh kernel.
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed,
)

model_checkpoint = 'bert-base-uncased'

# --- Data -------------------------------------------------------------------
raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(raw_data):
  """Tokenize the 'sentence1' / 'sentence2' columns together as sentence pairs.

  Truncation is enabled; padding is left to the data collator.
  """
  return tokenizer(raw_data['sentence1'], raw_data['sentence2'], truncation = True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched = True)

# Dynamic padding: each batch is padded only to its own longest sequence.
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

# --- Model ------------------------------------------------------------------
training_args = TrainingArguments('test_trainer')

# Seed before building the model: the classification head is newly initialised,
# so without this its starting weights differ from run to run.
set_seed(training_args.seed)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels = 2)

# --- Metrics ----------------------------------------------------------------
# Loaded once here rather than on every compute_metrics call.
mrpc_metric = evaluate.load('glue', 'mrpc')

def compute_metrics(eval_preds):
  """Turn (logits, labels) into the GLUE/MRPC metric values.

  The predicted class is the argmax of the logits over the last axis.
  """
  logits, labels = eval_preds
  pred = np.argmax(logits, axis = -1)
  return mrpc_metric.compute(predictions = pred, references = labels)

# --- Train & evaluate -------------------------------------------------------
# NOTE(review): newer transformers releases deprecate `tokenizer=` in favour of
# `processing_class=` -- confirm against the installed version.
trainer = Trainer(
    model,
    training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset =  tokenized_datasets['validation'],
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
)
trainer.train()
trainer.evaluate()