## Fine Tuning BERT for Regression Tasks

1. [Fine-tune BERT and for regression problem](https://discuss.huggingface.co/t/fine-tune-bert-and-camembert-for-regression-problem/332)
2. [Modify BertForSequenceClassification](https://github.com/huggingface/transformers/blob/master/src/transformers/modeling_bert.py#L1227)
3. [Understand fine tuning](https://medium.com/@prakashakshay90/fine-tuning-bert-model-using-pytorch-f34148d58a37) phase with myPersonality

In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig,AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup, BertModel
from tqdm import tqdm, trange, tnrange, tqdm_notebook
import pandas as pd
import io
import numpy as np
from sklearn.model_selection import train_test_split
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import accuracy_score, matthews_corrcoef

import random
import os

In [2]:
# identify and specify the GPU as the device, later in training loop we will load data into device
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
#n_gpu = torch.cuda.device_count()
#torch.cuda.get_device_name(0)

SEED = 19

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device == torch.device("cuda"):
    torch.cuda.manual_seed_all(SEED)

In [3]:
#print(device, n_gpu, torch.cuda.get_device_name(0))

### Read myPersonality dataset status and big5 scores

In [4]:
# Reading Data into dataFrame
text = pd.read_csv("~/Venv/Documents/dataset/myPersonalitySmall/statuses_unicode.txt", header=None, names=['sentence'])
big5 = pd.read_csv("../dataset/myPersonalitySmall/big5labels.txt", delimiter=" ", header=None, names=['O', 'C', 'E', 'A', 'N'])
print(text.sample(5))
print(text.sentence.size)

                                               sentence
8333  Guess whose place of work got broken into last...
4013  You're going to want to read my latest article...
5759                                good day today. =]]
1363  is sad thinking about how her kitties are at h...
7930  HAHAHA ~ love this!: """"Please put this on yo...
9913


In [5]:
df = pd.concat([text, big5], axis=1, sort=False)
print(df.sample(5))

                                               sentence     O     C     E  \
666   seriously cannot figure out what the HELL is w...  4.15  3.10  3.35   
8236  is completely disappointed with the *PROPNAME*...  4.25  3.15  3.20   
1920                               has contacts finally  4.25  2.80  2.45   
6837  can't decide if it's good or bad that med stud...  4.13  3.50  4.13   
8835  Congratulations *PROPNAME*!!! Newest Federal A...  4.50  3.50  2.25   

         A     N  
666   2.85  2.75  
8236  3.85  1.85  
1920  3.80  4.00  
6837  4.50  1.43  
8835  3.25  2.75  


Find the max_seq_len in the dataset, replacing the default 512 of BERT

In [6]:
max_len = 0
for i in df['sentence']:
    #print(len(str(i)))
    if (len(str(i))>max_len):
        max_len = len(str(i))
print(max_len)

432


In [7]:
MAX_LEN = 512 #the nearest power of 2

If you run the code in a virtual environment

In [8]:
# pip install ipywidgets
# jupyter nbextension enable --py widgetsnbextension --sys-prefix

In [9]:
## Import BERT tokenizer, that is used to convert our text into tokens that corresponds to BERT library
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

### Formatting the sentences according to BERT 

In [None]:
df['sentence']= df['sentence'].astype('str') #from object to string (sometimes str sometimes string)
sentences = df.sentence.values
#print(sentences)
input_ids = [tokenizer.encode(sent, add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True) for sent in sentences]

In [11]:
#print("Actual sentence before tokenization: ",sentences[2])
#print("Encoded Input from dataset: ",input_ids[2])

attention mask

In [12]:
## Create attention mask
attention_masks = []
## Create a mask of 1 for all input tokens and 0 for all padding tokens
attention_masks = [[float(i>0) for i in seq] for seq in input_ids]
#print(attention_masks[2])

In [13]:
labels = df.O.values

In [14]:
# Split into a training set and a test set using a stratified k fold
train_inputs,validation_inputs,train_labels,validation_labels = train_test_split(input_ids,labels,random_state=SEED,test_size=0.1)
train_masks,validation_masks,_,_ = train_test_split(attention_masks,input_ids,random_state=SEED,test_size=0.1)

In [15]:
# convert all our data into torch tensors, required data type for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [16]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory
train_data = TensorDataset(train_inputs,train_masks,train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,sampler=train_sampler,batch_size=batch_size)

validation_data = TensorDataset(validation_inputs,validation_masks,validation_labels)
validation_sampler = RandomSampler(validation_data)
validation_dataloader = DataLoader(validation_data,sampler=validation_sampler,batch_size=batch_size)

### Train the model changing the task from classification to regression

In [17]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=1, output_hidden_states = True).to(device) # output_hidden_states ci permette di estrarre gli embeddings
# setting num_label=1 configure the model to perform regression and change the loss into Mean-Square Loss

The description of the change for the regression task is described at [transformers repository](https://github.com/huggingface/transformers/blob/master/src/transformers/modeling_bert.py#L1227) on Github

In [18]:
# Parameters:
lr = 2e-5
adam_epsilon = 1e-8

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

num_warmup_steps = 0
num_training_steps = len(train_dataloader)*epochs

#Prepare optimizer and schedule (linear warmup and decay)
# no_decay = ['bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
#     ]

### In Transformers, optimizer and schedules are splitted and instantiated like this:
#optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=adam_epsilon)
optimizer = AdamW(model.parameters(), lr=lr,eps=adam_epsilon,correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler

In [None]:
## Store our loss and accuracy for plotting
train_loss_set = []
learning_rate = []

# Gradients gets accumulated by default
model.zero_grad()

# tnrange is a tqdm wrapper around the normal python range
for _ in tnrange(1,epochs+1,desc='Epoch'):
    print("<" + "="*22 + F" Epoch {_} "+ "="*22 + ">")
    # Calculate total loss for this epoch
    batch_loss = 0

    for step, batch in enumerate(train_dataloader):
        # Set our model to training mode (as opposed to evaluation mode)
        model.train()
    
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
    
        # Backward pass
        loss.backward()
    
        # Clip the norm of the gradients to 1.0
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
    
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        
        # Update learning rate schedule
        scheduler.step()

        # Clear the previous accumulated gradients
        optimizer.zero_grad()
    
        # Update tracking variables
        batch_loss += loss.item()

    # Calculate the average loss over the training data.
    avg_train_loss = batch_loss / len(train_dataloader)

    #store the current learning rate
    for param_group in optimizer.param_groups:
        print("\n\tCurrent Learning rate: ",param_group['lr'])
        learning_rate.append(param_group['lr'])
    
    train_loss_set.append(avg_train_loss)
    print(F'\n\tAverage Training loss: {avg_train_loss}')
    
    # Validation

    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()

    # Tracking variables 
    eval_accuracy,eval_mcc_accuracy,nb_eval_steps = 0, 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
        hidden_states = outputs.hidden_states
        layer_i = 13
        for elem in hidden_states[layer_i]:
            print("CLS batch", elem[0])
            
        logits = outputs.logits
        # Move logits and labels to CPU
        logits = logits[0].to('cpu').numpy()
        label_ids = b_labels.to('cpu').numpy()

        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()
        tmp_eval_accuracy = accuracy_score(pred_flat, labels_flat)
        tmp_eval_mcc_accuracy = matthews_corrcoef(labels_flat, pred_flat)
    
        eval_accuracy += tmp_eval_accuracy
        eval_mcc_accuracy += tmp_eval_mcc_accuracy
        nb_eval_steps += 1

    print(F'\n\tValidation Accuracy: {eval_accuracy/nb_eval_steps}')
    print(F'\n\tValidation MCC Accuracy: {eval_mcc_accuracy/nb_eval_steps}')



  for _ in tnrange(1,epochs+1,desc='Epoch'):


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

