# Tutorial on Hugging Face full training cycle

In [None]:
!pip install datasets
!pip install transformers

In [None]:
#@ Preparing tokenized datasets with a data collator for dynamic padding
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint name shared by the tokenizer below and the model loaded later on.
checkpoint = "bert-base-uncased"

# MRPC configuration of the GLUE benchmark, loaded through Hugging Face datasets.
raw_datasets = load_dataset("glue", "mrpc")

# Tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` fields of ``example`` as a pair.

    Both fields are handed to the module-level ``tokenizer`` in a single call,
    with truncation enabled so over-long inputs are cut if needed. No padding
    is requested here; the tokenizer's output is returned unchanged.
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)


# Collator that pads the samples of a batch when the batch is assembled
# (dynamic padding), using the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Apply the tokenization to the raw datasets; batched=True passes
# tokenize_function a batch of examples per call instead of one at a time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)


In [None]:
# Bare last expression: renders the repr of the tokenized datasets so the
# columns added by tokenization can be inspected.
tokenized_datasets

In [None]:
# Preprocess the datasets before passing them to the model.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["sentence1", "sentence2", "idx"])  # columns the model does not expect
    .rename_column("label", "labels")                   # the model expects the name "labels"
)
tokenized_datasets.set_format("torch")                  # return torch tensors instead of lists
tokenized_datasets["train"].column_names                # check the remaining training columns

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [None]:
# defining dataloaders
from torch.utils.data import DataLoader
# Both loaders use the same batch size and rely on the data collator to pad
# each batch; only the training loader shuffles its samples.
batch_size = 8

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [None]:
# Pull a single batch from the training loader and show the shape of each tensor in it.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

In [None]:
# loading the model
from transformers import AutoModelForSequenceClassification
# Sequence-classification model built from the same checkpoint as the
# tokenizer, configured with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

In [None]:
# check by passing batch to our model

# Forward pass on the sample batch: because the batch carries "labels",
# the output exposes a loss alongside the logits.
outputs = model(**batch)
loss, logits = outputs.loss, outputs.logits
print(loss, logits.shape)

tensor(0.4100, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [None]:
# transformers' own AdamW was deprecated and has been removed from recent
# releases, so `from transformers import AdamW` fails on a fresh install.
# Use the PyTorch implementation instead.
from torch.optim import AdamW

# NOTE: torch.optim.AdamW applies a default weight decay of 0.01 and eps of
# 1e-8 (the old transformers class defaulted to 0.0 and 1e-6). Pass
# weight_decay / eps explicitly if the previous behaviour is required.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
# Use get_scheduler to decay the learning rate linearly from its maximum (5e-5) down to 0
from transformers import get_scheduler
num_epochs = 3

# The scheduler is stepped once per batch, so the total number of steps is
# the number of batches per epoch times the number of epochs.
steps_per_epoch = len(train_dataloader)
num_training_steps = steps_per_epoch * num_epochs

lr_scheduler = get_scheduler(
    "linear",                               # linear learning-rate decay
    optimizer=optimizer,                    # the AdamW optimizer created above
    num_warmup_steps=0,                     # no warmup phase
    num_training_steps=num_training_steps,  # length of the decay schedule
)

In [None]:
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# training model
from tqdm.auto import tqdm
# One progress-bar tick per optimizer step.
progress_bar = tqdm(range(num_training_steps))

model.train()  # switch the model to training mode
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the batch includes "labels", so the output carries a loss.
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass: compute gradients of the loss w.r.t. the parameters.
        loss.backward()

        # Update step
        optimizer.step()       # apply the gradients to the weights
        lr_scheduler.step()    # advance the learning-rate schedule
        optimizer.zero_grad()  # clear gradients before the next batch
        progress_bar.update(1)

  0%|          | 0/1377 [00:00<?, ?it/s]

In [None]:
!pip install evaluate

In [None]:
# evaluating model
import evaluate

# Metric object for the GLUE / MRPC task.
metric = evaluate.load("glue", "mrpc")

model.eval()  # switch the model to evaluation mode
for batch in eval_dataloader:
    # Move every tensor of the batch onto the same device as the model.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(**batch)

    # The predicted class is the index of the largest logit.
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)

    # Accumulate this batch's predictions and reference labels in the metric.
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.8602941176470589, 'f1': 0.9028960817717206}