In [None]:
!pip install datasets
!pip install accelerate
!pip install git+https://github.com/huggingface/transformers
!pip install evaluate

In [None]:
# Preparing datasets
import accelerate
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

def _tokenize_pairs(batch):
    """Tokenize a batch of sentence pairs, truncating over-long inputs."""
    return tokenizer(batch["sentence1"], batch["sentence2"], truncation=True)


# GLUE provides benchmark datasets; this notebook uses its MRPC subset.
raw_dataset = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# batched=True passes many rows to the tokenizer per call.
tokenized_datasets = raw_dataset.map(_tokenize_pairs, batched=True)
# Collator that pads at batch time (dynamic padding) instead of padding up front.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Result: tokenized_datasets plus a data_collator ready for training.

In [None]:
# now preparing for model training
from transformers import TrainingArguments
import torch
# Only the output directory (where the model is saved) is set here.
# Add push_to_hub=True to also push the model to the Hugging Face Hub.
training_args = TrainingArguments(output_dir="/content/drive/MyDrive")

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained `checkpoint` as a sequence classifier with two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

In [None]:
# Configuring Model using trainer
from transformers import Trainer
trainer = Trainer(
    model=model,                                     # model object to fine-tune
    args=training_args,                              # training arguments (output directory)
    data_collator=data_collator,                     # dynamic padding: pads each batch to a common length
    train_dataset=tokenized_datasets["train"],       # training split
    eval_dataset=tokenized_datasets["validation"],   # validation split
    tokenizer=tokenizer,                             # tokenizer that produced the input_ids
)

In [None]:
# Fine-tune the model; keep the returned training output and show it as the
# cell's result.
train_output = trainer.train()
train_output

Step,Training Loss
500,0.5108
1000,0.2722


TrainOutput(global_step=1377, training_loss=0.3115876972805181, metrics={'train_runtime': 203.4878, 'train_samples_per_second': 54.077, 'train_steps_per_second': 6.767, 'total_flos': 405626802939840.0, 'train_loss': 0.3115876972805181, 'epoch': 3.0})

In [None]:
# Evaluate the model: run it over the validation split. The returned object
# exposes the raw outputs as `.predictions` and the true labels as `.label_ids`.
predictions = trainer.predict(test_dataset=tokenized_datasets["validation"])


In [None]:
import  numpy as np
# The predicted label for each example is the index of its largest score
# along the last axis.
preds = np.asarray(predictions.predictions).argmax(axis=-1)

In [None]:
# creating evaluations
import evaluate
# Load the GLUE/MRPC metric and score the predicted labels against the
# reference labels; the resulting dict is displayed as the cell's value.
metrics = evaluate.load(path="glue", config_name="mrpc")
metrics.compute(references=predictions.label_ids, predictions=preds)

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.8553921568627451, 'f1': 0.8970331588132635}

In [None]:
# Putting it all together
def compute_metrics(eval_preds):
  """Compute the GLUE/MRPC metrics for a set of evaluation predictions.

  Args:
    eval_preds: A pair ``(logits, labels)``; the class scores are reduced to
      predicted labels with an argmax over the last axis.

  Returns:
    The dict returned by the metric's ``compute`` call (accuracy and F1 for
    GLUE/MRPC).
  """
  metric = evaluate.load("glue", "mrpc")      # GLUE benchmark metric for the MRPC task
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  # `compute` accepts keyword arguments only; passing `predictions` positionally
  # raises a TypeError as soon as an evaluation runs.
  return metric.compute(predictions=predictions, references=labels)

In [None]:
# Reload the pretrained checkpoint first: the existing `model` object was
# already fine-tuned by the earlier training run, so reusing it here would
# continue that training instead of repeating it from the pretrained weights.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,                                                                # freshly loaded model to fine-tune
    training_args,                                                        # training args (output directory)
    train_dataset = tokenized_datasets['train'],                          # training dataset
    eval_dataset = tokenized_datasets['validation'],                      # validation dataset
    data_collator=data_collator,                                          # data collator for dynamic padding, i.e. padding each batch to a common length
    tokenizer=tokenizer,                                                  # tokenizer for converting sentences to input_ids
    # Custom function returning accuracy and F1. It is only invoked when the
    # Trainer runs an evaluation (e.g. trainer.evaluate()); training_args sets
    # no evaluation schedule, so train() alone does not report these metrics.
    compute_metrics = compute_metrics,
)

In [None]:
# Run training with the trainer that has compute_metrics attached; show the
# returned training output as the cell's result.
train_output = trainer.train()
train_output

Step,Training Loss
500,0.0813
1000,0.0259


Checkpoint destination directory /content/drive/MyDrive/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory /content/drive/MyDrive/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=1377, training_loss=0.04330072700760896, metrics={'train_runtime': 213.4085, 'train_samples_per_second': 51.563, 'train_steps_per_second': 6.452, 'total_flos': 405626802939840.0, 'train_loss': 0.04330072700760896, 'epoch': 3.0})