# Fine Tuning BERT
adapted from: https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb#scrollTo=4wxY3x-ZZz8h

## Set Up Environment

In [1]:
!pip install -q transformers datasets

## Import Dataset

In [None]:
from datasets import load_dataset
dataset = load_dataset('json', data_files={'train': './data/book/train.json', 'validation': './data/book/valid.json'})

# Subset the first 300 rows of the training data
dataset['train'] = dataset['train'].select(range(250))

# Subset the first 150 rows of the validation data
dataset['validation'] = dataset['validation'].select(range(250))

## Inspect Data

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'instruction', 'output'],
        num_rows: 250
    })
    validation: Dataset({
        features: ['input', 'instruction', 'output'],
        num_rows: 250
    })
})

In [None]:
example = dataset['train'][0]
example

{'input': 'User Preference: "Q Is for Quarry" written by Sue Grafton, "The End of Enemies (Briggs Tanner Novels)" written by Grant Blackwood\nUser Unpreference: "ICEFIRE" written by Judith Reeves-Stevens\nWhether the user will like the target book ""Specter of the Past: Star Wars (Star Wars (Bantam Books (Firm) : Unnumbered).)" written by Timothy Zahn"?',
 'instruction': 'Given the user\'s preference and unpreference, identify whether the user will like the target book by answering "Yes." or "No.".',
 'output': 'Yes.'}

## Preprocess data

In [None]:
labels = [label for label in dataset['train'].features.keys() if label not in ['input', 'instruction']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['output']

In [None]:
from transformers import AutoTokenizer
import numpy as np
import torch
    

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
    # take a batch of texts
    text = [x + y for x, y in zip(examples["instruction"], examples['input'])]
    # encode them
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  
    # convert 'Yes' or 'No' to binary labels
    labels_batch = examples['output']
    # Binary encode the labels ('No' -> 0 and 'Yes' -> 1)
    labels_encoded = torch.tensor([1 if label == 'Yes.' else 0 for label in labels_batch], dtype=torch.float)

    encoding["labels"] = labels_encoded
  
    return encoding


In [None]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [None]:
encoded_dataset.set_format("torch")

## Define the Model

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=1,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [None]:
# Unwrap tensors into a list of integers for both training and validation sets
train_labels = [label.item() for label in encoded_dataset['train']['labels']]
validation_labels = [label.item() for label in encoded_dataset['validation']['labels']]

print(set(train_labels))
print(set(validation_labels))

{0.0, 1.0}
{0.0, 1.0}


In [None]:
batch_size = 8
metric_name = "f1"

In [None]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Define metrics

In [40]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def binary_metrics(predictions, labels):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # Use threshold to turn probabilities into binary predictions
    y_pred = (probs >= 0.5).long()
    # Compute metrics
    f1 = f1_score(y_true=labels, y_pred=y_pred, average='binary')
    roc_auc = roc_auc_score(y_true=labels, y_score=probs)
    accuracy = accuracy_score(y_true=labels, y_pred=y_pred)
    metrics = {'f1': f1, 'roc_auc': roc_auc, 'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # Convert the logits to class probabilities using the sigmoid function
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(preds)).numpy()
    
    # Flatten the predictions and labels since we're dealing with binary classification
    flatten_labels = p.label_ids.flatten()
    flatten_preds = probs.flatten()
    
    # Now use the binary_metrics function to calculate the binary classification metrics
    result = binary_metrics(predictions=flatten_preds, labels=flatten_labels)
    
    # You can log these metrics or return them
    return result

In [41]:
#forward pass

# Get the input ids from the first example of the training dataset
input_ids = encoded_dataset['train']['input_ids'][0].unsqueeze(0)
labels = torch.tensor([encoded_dataset['train']['labels'][0]]).unsqueeze(0)
labels = labels.float()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the appropriate device
model = model.to(device)

# Move the input tensors to the same device as the model
input_ids = input_ids.to(device)
labels = labels.to(device)


# Perform the forward pass
outputs = model(input_ids=input_ids, labels=labels)

print(outputs)

SequenceClassifierOutput(loss=tensor(0.3963, device='cuda:0', grad_fn=<MseLossBackward0>), logits=tensor([[0.3705]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


## Train the model

In [42]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [43]:
trainer.train()



Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.279051,0.607242,0.597176,0.436
2,No log,0.343848,0.607242,0.578697,0.436
3,No log,0.409616,0.607242,0.603553,0.436
4,No log,0.342777,0.607242,0.59633,0.436
5,No log,0.345435,0.607242,0.587026,0.436
6,No log,0.378324,0.607242,0.597436,0.436
7,No log,0.329076,0.607242,0.612337,0.436
8,No log,0.326902,0.607242,0.623007,0.436
9,No log,0.315467,0.607242,0.629449,0.436
10,No log,0.324132,0.607242,0.626456,0.436


TrainOutput(global_step=320, training_loss=0.09186248183250427, metrics={'train_runtime': 72.1308, 'train_samples_per_second': 34.659, 'train_steps_per_second': 4.436, 'total_flos': 164442933120000.0, 'train_loss': 0.09186248183250427, 'epoch': 10.0})

## Evaluate Model

In [44]:
trainer.evaluate()

{'eval_loss': 0.27905139327049255,
 'eval_f1': 0.6072423398328691,
 'eval_roc_auc': 0.5971761337757824,
 'eval_accuracy': 0.436,
 'eval_runtime': 0.4974,
 'eval_samples_per_second': 502.58,
 'eval_steps_per_second': 64.33,
 'epoch': 10.0}

In [22]:
text = "Nothing left to do but smile, smile, smile"

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

In [23]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[0.4482]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [24]:
logits = outputs.logits
logits.shape

torch.Size([1, 1])

In [25]:
for i in range(len(labels)):
    print(labels[i], logits[0][i])

tensor([1.], device='cuda:0') tensor(0.4482, device='cuda:0', grad_fn=<SelectBackward0>)


In [26]:
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.2)] = 1
# turn predicted id's into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

  predictions[np.where(probs >= 0.2)] = 1


IndexError: too many indices for array: array is 0-dimensional, but 1 were indexed

## Finish