# Fine-tuning BERT

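This notebook uses the `BERTFineTuning` class from `src/BERTFineTuning.py` to fine-tune `bert-base-uncased` for text classification on the DBPedia classes dataset.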

In [None]:
import torch
import numpy as np
import time
import random

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from transformers import get_linear_schedule_with_warmup

In [None]:
# Local setup: move the working directory up one level.
# NOTE(review): presumably this is what makes the `src` package used by the following
# cells importable - confirm against the repository layout.
# The change is relative to the current directory, so running this cell a second time
# moves up one more level; run it once per kernel session.
%cd ..

In [None]:
from src.BERTFineTuning import BERTFineTuning

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Fine-tuning helper for the "bert-base-uncased" checkpoint, bound to that device.
fine_tuner = BERTFineTuning("bert-base-uncased", device)

## Loading data

In [None]:
from src.CustomDataLoader import CustomDataLoader

# CSV file that backs each split of the Hugging Face dataset.
data_files = {
    "train": "DBPEDIA_train.csv",
    "test": "DBPEDIA_test.csv",
    "validation": "DBPEDIA_val.csv",
}
loader = CustomDataLoader(
    name="DeveloperOats/DBPedia_Classes",
    data_files=data_files,
)

# Load the dataset, then convert every split it contains into a dataframe.
dataset = loader.load_huggingface_data()
subsets = [*dataset.keys()]
dfs = loader.to_dataframe(data_dict=dataset, subsets=subsets)

## Selecting equal few-shots in validation data

Use 37 examples per `l1` class from the validation set as the fine-tuning data.

In [None]:
from collections import Counter

# How many validation rows carry each top-level ("l1") label.
validation_labels = dfs["validation"]["l1"]
Counter(validation_labels)

In [None]:
# Few-shot training set: an equal number of rows (shots=37) per "l1" label, drawn from
# the validation split with a fixed seed so the selection is repeatable.
# Both frames are copied because the cleaning step later overwrites their "text"
# column; without the copies that write would also modify the frames stored in `dfs`
# (and could raise a SettingWithCopyWarning if the selection is a slice).
df_train = loader.selectEqualFewshots(
    dfs["validation"], label_name="l1", text_name="text", shots=37, seed=21
).copy()
df_test = dfs["test"].copy()
display(df_train)

## Using all data available to fine-tune

In [None]:
# Alternative: uncomment the lines below to fine-tune on the full train/validation/test splits instead of the few-shot subset.
#df_train = dfs["train"]
#df_validation = dfs["validation"]
#df_test = dfs["test"]

# Cleaning text

In [None]:
# Normalise the raw text of every row with the fine-tuner's own cleaning routine.
df_train['text'] = df_train['text'].apply(fine_tuner.clean_text)
#df_validation['text'] = df_validation['text'].apply(fine_tuner.clean_text)
df_test['text'] = df_test['text'].apply(fine_tuner.clean_text)

In [None]:
# Pull the text and label columns out of the dataframes as plain arrays.
tweets_train = df_train["text"].values
labels_train = df_train["l1"].values

#tweets_validation = df_validation["text"].values
#labels_validation = df_validation["l1"].values

tweets_test = df_test["text"].values
labels_test = df_test["l1"].values

## BERT Tokenizer

The tokenization step expects integer labels rather than string labels, so we convert them first.

In [None]:
# Integer class ids are needed instead of string labels.
#
# Both splits are encoded against ONE shared, sorted vocabulary of label names.
# Encoding each split with its own np.unique call only yields matching ids when both
# splits contain exactly the same set of classes; with a shared vocabulary an id always
# refers to the same class in training and in evaluation.
label_names, encoded_labels = np.unique(
    np.concatenate([labels_train, labels_test]), return_inverse=True
)

# The inverse indices follow the concatenation order: train rows first, then test rows.
n_train = len(labels_train)
labels_train = encoded_labels[:n_train]
labels_test = encoded_labels[n_train:]
# To encode a validation split as well, add labels_validation to the concatenation
# above and slice its ids out of encoded_labels in the same way.

In [None]:
# Tokenize each split; the helper hands back the input ids, the attention masks and
# the labels for that split, in that order.
(
    input_ids_train,
    attention_masks_train,
    labels_train,
) = fine_tuner.tokenize(tweets_train, labels_train)
#input_ids_validation, attention_masks_validation, labels_validation = fine_tuner.tokenize(tweets_validation, labels_validation)
(
    input_ids_test,
    attention_masks_test,
    labels_test,
) = fine_tuner.tokenize(tweets_test, labels_test)

## Create a TensorDataset

In [None]:
# Bundle input ids, attention masks and labels so that one index yields one aligned
# (ids, mask, label) example.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
#dataset_validation = TensorDataset(input_ids_validation, attention_masks_validation, labels_validation)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

# Every example of each split is used, so the datasets are passed on unchanged.
# (A random_split(ds, [len(ds), 0]) keeps all rows too, but wraps them in a randomly
# permuted Subset: it consumes global RNG state and makes the test order differ from
# run to run, while shuffling of the training data is already done by its sampler.)
train_dataset = dataset_train
#val_dataset = dataset_validation
test_dataset = dataset_test

In [None]:
# BERT's authors recommend a batch size of 16 or 32 for fine-tuning.
# With this few samples, the smaller value is used.
batch_size = 16

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=batch_size,
)

# Order is irrelevant for validation/testing, so those sets are read front to back.
#validation_dataloader = DataLoader(val_dataset, sampler = SequentialSampler(val_dataset), batch_size = batch_size)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(
    test_dataset,
    sampler=test_sampler,
    batch_size=batch_size,
)

## The actual fine-tuning of the model

In [None]:
# Number of passes over the training data (the fine-tuning cell assigns this same value again).
epochs = 6

In [None]:
# Authenticate with the Hugging Face Hub so that the model can be pushed to it.
import os
from dotenv import load_dotenv
from huggingface_hub import login

# Load variables from a local .env file into the environment, then read the access
# token from the TOKEN variable (None when it is not set).
load_dotenv()
TOKEN = os.environ.get('TOKEN')

login(TOKEN)

In [None]:
# BERT's authors recommend 2 to 4 epochs for fine-tuning; with only a few shots
# available, 6 epochs are used here.
epochs = 6
HuggingFaceRepoName = "TheChickenAgent/TemporaryRepo"

# Linear learning-rate schedule without warmup; its length is the number of
# batches per epoch times the number of epochs.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    fine_tuner.optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# NOTE(review): the test loader is what gets passed next to the training loader here.
# If tune() uses it to pick the best checkpoint, the metrics reported on the same test
# set further down are optimistic - confirm, or pass a separate validation loader.
fine_tuner.tune(
    train_dataloader,
    test_dataloader,
    scheduler,
    epochs,
    HuggingFaceRepoName,
)

# Loading the best model

In [None]:
# Load the saved model from ./bert_model (presumably written during fine-tuning).
# - map_location=device puts the weights on the device the evaluation batches are
#   moved to, so a checkpoint saved on a GPU also loads on a CPU-only machine.
# - weights_only=False is required because the file is used as a complete model object
#   (it is called directly below); PyTorch >= 2.6 refuses to unpickle such objects by
#   default. The keyword needs PyTorch >= 1.13.
#   SECURITY: this executes pickle, so only load a checkpoint you produced yourself.
model = torch.load('bert_model', map_location=device, weights_only=False)

# Put the model in evaluation mode (disables dropout) before predicting; the trailing
# ';' keeps the notebook from printing the whole module.
model.eval();

# Metrics

In [None]:
# Run the loaded model over the test set, collecting predicted and true class ids.
predictions, targets = [], []

with torch.no_grad():  # inference only, so no gradients are tracked
    for batch in test_dataloader:
        # Each batch holds (input ids, attention mask, labels), in that order.
        input_ids, attention_mask, gold_labels = batch[0], batch[1], batch[2]
        targets.extend(gold_labels.to('cpu').numpy())

        output = model(
            input_ids.to(device),
            token_type_ids=None,
            attention_mask=attention_mask.to(device),
        )

        # The index of the largest logit in each row is the predicted class.
        batch_logits = output.logits.detach().cpu().numpy()
        predictions.extend(np.argmax(batch_logits, axis=1).flatten())

In [None]:
from collections import Counter

# Class-id frequencies: predictions first, then the ground truth, one per line.
for class_ids in (predictions, targets):
    print(Counter(class_ids))

In [None]:
# Metrics for the test-set predictions with averaging="macro".
fine_tuner.calculate_metrics(
    y_pred=predictions,
    y_true=targets,
    averaging="macro",
    device='cpu',
)

In [None]:
# Metrics for the test-set predictions with averaging="weighted".
fine_tuner.calculate_metrics(
    y_pred=predictions,
    y_true=targets,
    averaging="weighted",
    device='cpu',
)