In [1]:
# Install the notebook's dependencies into the kernel's environment.
# NOTE(review): no versions are pinned, so a fresh run may resolve to newer
# releases than the ones that produced the saved outputs — consider pinning.
%pip install datasets evaluate transformers[sentencepiece]
%pip install accelerate -U
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Checkpoint name used for the tokenizer here and for the model in a later cell.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# GLUE "sst2" configuration; the displayed output further down shows
# train / validation / test splits.
raw_datasets = load_dataset("glue", "sst2")

def tokenize_function(example):
    """Tokenize the ``"sentence"`` field of ``example`` with the notebook's tokenizer.

    Truncation is enabled; no padding is requested at this stage.
    Returns whatever the tokenizer call produces for that input.
    """
    sentences = example["sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded

# Apply tokenize_function to every split. With batched=True the function is
# called on batches of rows rather than one row at a time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Display the result: the output shows the tokenizer's columns added next to
# the original 'sentence', 'label', and 'idx' columns.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [3]:
from transformers import DataCollatorWithPadding, AutoModelForSequenceClassification

# Sequence-classification model built from the same checkpoint with two output
# labels. The load warning below reports the classifier weights as newly
# initialized, i.e. they still need training.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# Tokenization above did not request padding, so batching is delegated to this
# collator, which is constructed from the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from transformers import TrainingArguments

import inspect

# The per-epoch evaluation keyword is spelled `evaluation_strategy` in older
# transformers releases and `eval_strategy` in newer ones. The install cell
# does not pin a version, so pick whichever spelling the installed
# TrainingArguments actually accepts instead of hard-coding one.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# "sst2-trainer" is the output directory; evaluation runs once per epoch.
training_args = TrainingArguments("sst2-trainer", **{_eval_strategy_kwarg: "epoch"})

In [5]:
import evaluate
import numpy as np

# Cache for the GLUE/SST-2 metric so it is loaded once, not on every evaluation.
_sst2_metric = None


def compute_metrics(eval_preds):
    """Score evaluation outputs with the GLUE SST-2 metric.

    Args:
        eval_preds: A pair ``(logits, labels)``. The predicted class for each
            example is the argmax over the last axis of ``logits``.

    Returns:
        The result of the metric's ``compute`` call for these predictions
        and reference labels.
    """
    global _sst2_metric
    if _sst2_metric is None:
        # Loading the metric is invariant across calls; do it lazily on first
        # use so defining this function has no side effects.
        _sst2_metric = evaluate.load("glue", "sst2")

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return _sst2_metric.compute(predictions=predictions, references=labels)

In [6]:
from transformers import Trainer

# Assemble the Trainer: model and run configuration first, then the data
# splits, then the batching / tokenizer / metric hooks.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [7]:
# Start fine-tuning. The captured progress output below reports 25257 total
# steps and logs loss / learning rate every 500 steps.
trainer.train()

  2%|▏         | 500/25257 [00:55<43:56,  9.39it/s] 

{'loss': 0.4356, 'learning_rate': 4.9010175396919665e-05, 'epoch': 0.06}


  4%|▍         | 1000/25257 [01:58<44:04,  9.17it/s]  

{'loss': 0.354, 'learning_rate': 4.8020350793839334e-05, 'epoch': 0.12}


  6%|▌         | 1397/25257 [02:58<48:45,  8.16it/s]   