## Fine Tuning BERT for Regression Tasks

1. [Fine-tune BERT and CamemBERT for regression problem](https://discuss.huggingface.co/t/fine-tune-bert-and-camembert-for-regression-problem/332)
2. [Modify BertForSequenceClassification](https://github.com/huggingface/transformers/blob/master/src/transformers/modeling_bert.py#L1227)
3. [Understand fine tuning](https://medium.com/@prakashakshay90/fine-tuning-bert-model-using-pytorch-f34148d58a37) phase with myPersonality

In [None]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it can see (if any).
device_name = tf.test.gpu_device_name()

# Stop right away unless the first GPU is reported under its usual name.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
import torch

# Choose the PyTorch compute device: CPU when CUDA is unavailable, GPU otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Route PyTorch work to the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [None]:
!pip install transformers

In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from and
# save the fine-tuned model to paths under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Read the myPersonality dataset: statuses and Big5 scores

In [None]:
# !ls "/content/drive/MyDrive"

In [None]:
import pandas as pd
# Load the myPersonality statuses and the matching Big Five scores, then put
# them side by side in one DataFrame (pd.concat with axis=1 aligns on row index).
# NOTE(review): the statuses file is parsed with read_csv's default comma
# separator -- confirm that statuses containing commas are quoted in the file.
text = pd.read_csv("drive/MyDrive/Colab Notebooks/dataset/statuses_unicode.txt", header=None, names=['sentence'])
big5 = pd.read_csv("drive/MyDrive/Colab Notebooks/dataset/big5labels.txt", delimiter=" ", header=None, names=['O', 'C', 'E', 'A', 'N'])
df = pd.concat([text, big5], axis=1, sort=False)
# df = df[:32]  # uncomment to experiment on a small subset
print(df.shape)
print(df.sample(5))

# Cast every status to str so the tokenizer only ever receives strings.
df['sentence'] = df['sentence'].astype('str')
sentences = df.sentence.values

# Trait to regress on: one of the score columns 'O', 'C', 'E', 'A', 'N'.
# Both the labels and the output model path are derived from this single
# setting, so they cannot drift apart when another trait is selected.
trait = 'N'
labels = df[trait].values
output_model_name = "drive/MyDrive/Colab Notebooks/models/myPers_bert_fine_tuned_" + trait

# Quick sanity check: one example pair and the overall array shapes.
print(sentences[8], labels[8])
print(sentences.shape, labels.shape)

In [None]:
from transformers import BertTokenizer
# Load the cased multilingual BERT vocabulary; lower-casing is switched off so
# the original casing of each status is preserved.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

In [None]:
# Show the three views of the first status: raw text, word pieces, vocabulary ids.
first_sentence = sentences[0]
first_tokens = tokenizer.tokenize(first_sentence)

print(' Original: ', first_sentence)
print('Tokenized: ', first_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(first_tokens))

In [None]:
# Measure the longest encoded status, counting the `[CLS]` and `[SEP]` tokens.
max_len = 0

for sent in sentences:
    # Encode with the special tokens included so the length matches the model input.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Keep the largest length seen so far.
    if len(input_ids) > max_len:
        max_len = len(input_ids)

print('Max sentence length: ', max_len)

attention mask

In [None]:
# Tokenize all of the sentences and map the tokens to their word IDs.
# Fixed sequence length fed to the model; described by the author as the closest
# power of two exceeding the longest sequence found -- re-check if the data changes.
max_len = 256
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`.
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
        sent,                          # Sentence to encode.
        add_special_tokens=True,       # Add '[CLS]' and '[SEP]'.
        max_length=max_len,            # Target length for padding and truncation.
        padding='max_length',          # Replaces the deprecated `pad_to_max_length=True`.
        truncation=True,               # Request truncation explicitly instead of relying on a fallback.
        return_attention_mask=True,    # Construct attention masks.
        return_tensors='pt',           # Return PyTorch tensors.
    )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# `as_tensor` also accepts an existing tensor, so re-running this cell is safe;
# float32 is the dtype the regression loss is computed in.
labels = torch.as_tensor(labels, dtype=torch.float32)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

In [None]:
from torch.utils.data import TensorDataset, random_split

# Combine the model inputs and the regression targets into one TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 90-10 train-validation split.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Split with a dedicated, seeded generator: the global seeds are only set in the
# later training cell, so without it the split would differ on every run and
# validation scores would not be comparable between runs.
split_generator = torch.Generator().manual_seed(1)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Mini-batch size shared by both loaders. For fine-tuning BERT on a specific
# task, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Training loader: samples are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Validation loader: order does not matter for evaluation, so read sequentially.
validation_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, sampler=validation_sampler, batch_size=batch_size)

### Train the model changing the task from classification to regression

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
# Load BertForSequenceClassification: the pretrained BERT model with a single
# linear layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=1,  # One output: configures the model for regression with a mean-squared-error loss.
    output_hidden_states=False,  # Only the final output is used here; set True to extract embeddings.
    output_attentions=False,
)

# Move the model to the device selected earlier, so the notebook also runs on a
# CPU-only machine and the model sits on the same device as the training batches.
model.to(device)

In [None]:
# Materialize the model's (name, tensor) parameter pairs so they can be sliced.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# Each section is a heading plus the slice of parameters listed under it.
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for heading, group in sections:
    print(heading)
    for name, tensor in group:
        # Left-align the parameter name, right-align its shape.
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))

In [None]:
from transformers import get_linear_schedule_with_warmup

# Training length. The authors recommend between 2 and 4 epochs; this run uses 10.
epochs = 10

# The schedule spans every optimizer step of the run and uses no warmup phase.
num_warmup_steps = 0
num_training_steps = epochs * len(train_dataloader)

# AdamW over all model parameters.
# To reproduce BertAdam-specific behavior, set correct_bias=False.
# A possible refinement (not applied here) is to use two parameter groups so that
# 'bias' and 'LayerNorm.weight' get weight_decay 0.0 and the rest 0.01.
optimizer = AdamW(
    model.parameters(),
    lr=3e-5,
    eps=1e-8,
    correct_bias=True,
)

# Learning-rate schedule with linear warmup followed by linear decay.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    ``elapsed`` is rounded to a whole number of seconds and formatted through
    ``datetime.timedelta``, so durations of a day or more carry a day prefix.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


# Training

In [None]:
import random
import numpy as np
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import mean_squared_error
# Seed every random number generator used during training so runs are repeatable.
seed_val = 1
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One dictionary of summary statistics is appended per epoch.
training_stats = []

# Wall-clock start of the whole run.
total_t0 = time.time()

for epoch_i in range(0, epochs):
    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_train_loss = 0
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Batch layout follows the TensorDataset: ids, attention mask, target.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].float().to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Passing `labels` makes the model return the loss along with the logits.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        total_train_loss += outputs.loss.item()

        # Back-propagate directly on the loss tensor, on whatever device it lives.
        outputs.loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    total_eval_loss = 0
    all_preds, all_labels = [], []

    # Evaluate data for one epoch.
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].float().to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

        # Accumulate the validation loss.
        total_eval_loss += outputs.loss.item()

        # Move predictions and targets to the CPU as flat NumPy arrays.
        pred_flat = outputs.logits.detach().cpu().numpy().flatten()
        labels_flat = b_labels.to('cpu').numpy().flatten()
        all_preds.append(pred_flat)
        all_labels.append(labels_flat)

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Compute the MSE over the whole validation set, so a smaller final batch
    # is not over-weighted the way a mean of per-batch MSEs would be.
    eval_mse = mean_squared_error(np.concatenate(all_labels), np.concatenate(all_preds))
    validation_time = format_time(time.time() - t0)

    print(F'\n\tValidation mse: {eval_mse}')
    print("  Average validation loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
    # Show the predictions and targets of the last validation batch as a spot check.
    print("pred", pred_flat)
    print("original", labels_flat)

    # Record this epoch's statistics.
    training_stats.append({
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        'Valid. MSE': eval_mse,
        'Training Time': training_time,
        'Validation Time': validation_time,
    })

# Save the fine-tuned weights once all epochs have finished.
model.save_pretrained(output_model_name)
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    40  of    279.    Elapsed: 0:00:31.
  Batch    80  of    279.    Elapsed: 0:01:02.
  Batch   120  of    279.    Elapsed: 0:01:34.
  Batch   160  of    279.    Elapsed: 0:02:05.
  Batch   200  of    279.    Elapsed: 0:02:36.
  Batch   240  of    279.    Elapsed: 0:03:07.

  Average training loss: 0.66
  Training epoch took: 0:03:38

Running Validation...

	Validation mse: 0.5825167663635746
pred [2.6716192 2.6035268 2.6129446 2.7873814 2.7606938 2.6426182 2.623395
 2.5436513 2.7837799 2.8264434 2.7294607 2.6869934 2.6101263 2.56546
 2.7194693 2.7366698 2.6078482 2.6718383 2.5820603 2.7374005 2.5228782
 2.7152436 2.5824583 2.6417053 2.718999  2.7201421 2.6469526 2.7702997
 2.6645272 2.6528344 2.6535451 2.6040194]
original [1.5  2.85 3.85 3.05 2.9  3.25 4.75 3.75 2.5  2.9  2.8  1.43 3.75 2.3
 4.25 1.9  1.43 2.7  3.7  2.65 2.25 2.45 1.43 3.5  2.3  2.85 2.8  3.75
 2.25 2.45 3.3  1.75]

Training...
  Batch    40  of    279.    Elapsed: 0:00:31.
  Batch    80  of    27

Load the pretrained and fine-tuned model

In [None]:
# Disabled example: the code below is kept inside a bare string literal, so none
# of it executes. It loads a saved fine-tuned checkpoint with hidden states
# enabled, encodes two sample sentences and prints the sizes of the stacked
# hidden states.
# NOTE(review): the checkpoint path below ends in `_O`, while the data-loading
# cell sets `output_model_name` to a path ending in `_N` -- confirm which
# checkpoint is intended before re-enabling this cell.
'''
loaded_model = BertForSequenceClassification.from_pretrained("drive/MyDrive/Colab Notebooks/models/myPers_bert_fine_tuned_O", output_hidden_states = True)
sent1 = "With their homes in ashes, residents share harrowing tales of survival after massive wildfires kill 15"
sent2 = "News anchor hits back at viewer who sent her snarky note about ‘showing too much cleavage’ during broadcast"
max_len = 256 # the closest power of two exceeding max len found
input_ids = []
attention_masks = []
for sent in [sent1, sent2]:
  encoded_dict = tokenizer.encode_plus(
                          sent,                      # Sentence to encode.
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                          max_length = max_len,           # Pad & truncate all sentences.
                          pad_to_max_length = True,
                          return_attention_mask = True,   # Construct attn. masks.
                          return_tensors = 'pt'     # Return pytorch tensors.
                    )
  # Add the encoded sentence to the list.    
  input_ids.append(encoded_dict['input_ids'])
      
  # And its attention mask (simply differentiates padding from non-padding).
  attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
pt_output = loaded_model(input_ids, token_type_ids=None, 
                        attention_mask=attention_masks)

token_embeddings = torch.stack(pt_output.hidden_states, dim=0)
print(token_embeddings.size())
last_layer = token_embeddings[-1]
print(last_layer.size())
last_layer = last_layer.permute(1,0,2)
print(last_layer[0].size()) # CLS token

# print(pt_output.hidden_states[-1].detach().numpy())
# print(pt_output.hidden_states[-1].detach().numpy().shape)
'''

'\nloaded_model = BertForSequenceClassification.from_pretrained("drive/MyDrive/Colab Notebooks/models/myPers_bert_fine_tuned_O", output_hidden_states = True)\nsent1 = "With their homes in ashes, residents share harrowing tales of survival after massive wildfires kill 15"\nsent2 = "News anchor hits back at viewer who sent her snarky note about ‘showing too much cleavage’ during broadcast"\nmax_len = 256 # the closest power of two exceeding max len found\ninput_ids = []\nattention_masks = []\nfor sent in [sent1, sent2]:\n  encoded_dict = tokenizer.encode_plus(\n                          sent,                      # Sentence to encode.\n                          add_special_tokens = True, # Add \'[CLS]\' and \'[SEP]\'\n                          max_length = max_len,           # Pad & truncate all sentences.\n                          pad_to_max_length = True,\n                          return_attention_mask = True,   # Construct attn. masks.\n                          return_tensors = \'p