# Fine Tuning DistilBert for BHV and Big5 Regression


## Environment setup 

Check if GPU is enabled in this environment

In [None]:
import torch

# Select the compute device once; later cells move tensors to `device`.
if not torch.cuda.is_available():
    # CPU fallback when no CUDA device is visible.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

Install the Hugging Face `transformers` library

In [None]:
!pip install transformers

Mount Google Drive Repository for storage

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; datasets and trained models
# are read from / written to paths below this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Check that the Google Drive folder is mounted correctly and note the path to use in the following cells

In [None]:
# List the Drive folder: a quick visual check that the mount succeeded and
# that the expected sub-folders are reachable from this absolute path.
!ls "/content/drive/MyDrive/Colab Notebooks"

## Dataset

Load a dataset made of sentences and labels

In [None]:
import pandas as pd
# Absolute paths under the Drive mount point ('/content/drive'), so the cell
# does not depend on the notebook's current working directory.
pData = "/content/drive/MyDrive/Colab Notebooks/dataset/"
pModels = "/content/drive/MyDrive/Colab Notebooks/models/"

# Label column to regress on. This is the ONLY place to change when training
# a model for another dimension: both the label vector and the output model
# name are derived from it.
target_label = 'UN'  # <--- choose the one you need

# df_text = pd.read_csv(pData + "Big5/statuses_unicode.txt",
#                       header=None, names=['text'])
df_text = pd.read_csv(pData + "all_tweets_text.csv")
# df_labels = pd.read_csv(pData + "Big5/big5labels.txt",
#                         delimiter=" ", header=None,
#                         names=['O', 'C', 'E', 'A', 'N'])
df_labels = pd.read_csv(
    pData + "BHV/glove_all_bhv_all_tweets.csv",
    header=None,
    names=['SD', 'ST', 'HE', 'AC', 'PO', 'SE', 'CO', 'TR', 'BE', 'UN'])

# Texts and labels are paired purely by row position. If the two files differ
# in length, the column-wise concat would pad with NaN and the astype('str')
# below would turn missing texts into the literal string 'nan'.
assert len(df_text) == len(df_labels), (
    "text/label row count mismatch: {} texts vs {} label rows".format(
        len(df_text), len(df_labels)))

df = pd.concat([df_text, df_labels], axis=1, sort=False)
# df = df[:32]
print("data shape", df.shape)
print(df.sample(2))

# Force every text entry to be a Python string before tokenization.
df['text'] = df['text'].astype('str')
sentences = df.text.values
labels = df[target_label].values
output_model_name = pModels + "distil_" + target_label
print(sentences[0], labels[0])
print(sentences.shape, labels.shape)

The labels need no further processing, so simply convert them to a tensor

In [None]:
# Convert the regression targets to a float32 tensor.
# `as_tensor` also accepts an existing tensor, so re-running this cell is
# harmless (torch.tensor(tensor) emits a copy-construct warning), and float32
# is the dtype the training loop casts every label batch to anyway.
labels = torch.as_tensor(labels, dtype=torch.float32)

## Setup DistilBertForSequenceClassification 

Tokenizer

In [None]:
from transformers import DistilBertTokenizerFast
# Cased multilingual checkpoint: keep the original casing of the input text.
TOKENIZER_CHECKPOINT = 'distilbert-base-multilingual-cased'
tokenizer = DistilBertTokenizerFast.from_pretrained(
    TOKENIZER_CHECKPOINT, do_lower_case=False)

Check tokenizer effectiveness

In [None]:
# Show the first sentence raw, as word pieces, and as vocabulary ids.
sample_sentence = sentences[0]
sample_tokens = tokenizer.tokenize(sample_sentence)
sample_token_ids = tokenizer.convert_tokens_to_ids(sample_tokens)

print(' Original: ', sample_sentence)
print('Tokenized: ', sample_tokens)
print('Token IDs: ', sample_token_ids)

Find the maximum tokenized sequence length, so that sentences are padded no further than necessary

In [None]:
# Tokenize the whole corpus in ONE batched call: the fast tokenizer handles a
# list of strings far quicker than a Python loop of per-sentence encode() calls.
# No padding/truncation here, so each id list has the sentence's true length
# (special tokens included).
corpus_token_ids = tokenizer(
    list(sentences),
    add_special_tokens=True,
    return_attention_mask=False,
)['input_ids']
longest_sentence = max((len(ids) for ids in corpus_token_ids), default=0)

# Never pad beyond what the tokenizer declares as the model's maximum input
# length; longer sentences have to be truncated when they are encoded.
max_len = min(longest_sentence, tokenizer.model_max_length)
if max_len < longest_sentence:
    print('Longest sentence has', longest_sentence,
          'tokens; capping at the model limit.')
print('Max sentence length: ', max_len)

Preprocess the sentences

In [None]:
# Encode every sentence in a single batched tokenizer call instead of a
# per-sentence encode_plus loop followed by torch.cat: same tensors, much less
# Python overhead.
#   * padding='max_length' pads every row to `max_len`
#   * truncation=True makes the cut-off at `max_len` explicit (passing
#     max_length without it relies on an implicit default and warns)
#   * return_tensors='pt' yields (num_sentences, max_len) PyTorch tensors
encoded_batch = tokenizer(
    list(sentences),
    add_special_tokens=True,
    max_length=max_len,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_batch['input_ids']
# 1 for real tokens, 0 for padding positions.
attention_masks = encoded_batch['attention_mask']

Build the TensorDataset and split it into training and validation sets

In [None]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 90-10 train-validation split.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Use a dedicated, seeded generator: the global seeds are only set later in
# the training cell, so without this the split would differ on every run and
# validation scores would not be comparable between runs.
split_generator = torch.Generator().manual_seed(1)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32  # recommended 16 or 32

# Training batches are drawn in random order each epoch.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset, sampler=train_sampler, batch_size=batch_size)

# Validation order is irrelevant for the metric, so read it sequentially.
val_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(
    val_dataset, sampler=val_sampler, batch_size=batch_size)

Select the model and its parameters

In [None]:
from transformers import DistilBertForSequenceClassification
from transformers import DistilBertConfig
from transformers import get_linear_schedule_with_warmup

# Pretrained multilingual DistilBERT with a single-output head.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-multilingual-cased",
    num_labels = 1,  # <--- regression task
    output_hidden_states = False,
    output_attentions = False
)
# Move the model to whichever device the first cell selected. `model.cuda()`
# would crash on the CPU fallback that cell explicitly supports.
model.to(device)

# The AdamW class bundled with transformers is deprecated upstream in favour
# of the PyTorch implementation (and is absent from recent releases), so use
# torch.optim.AdamW. It always applies bias correction (the old
# correct_bias=True); weight_decay is set to 0.0 explicitly because PyTorch's
# default is 0.01 whereas the transformers class defaulted to 0.0.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 5e-5,
    eps = 1e-8,
    weight_decay = 0.0
)

epochs = 2  # authors recommend between 2 and 4

# Linear decay of the learning rate over all optimisation steps, no warm-up.
# One scheduler step is taken per training batch.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader)*epochs
)

Define a helper that formats an elapsed time (in seconds) as h:mm:ss

In [None]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second first, so the result
    never shows fractional seconds. Durations of a day or more are rendered
    the way ``datetime.timedelta`` prints them (e.g. ``1 day, 1:01:01``).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

## Training

In [None]:
import random
import numpy as np
from sklearn.metrics import mean_squared_error

# Seed every RNG in use. This only affects randomness consumed from this
# point on (e.g. the shuffling done by the training sampler).
seed_val = 1
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One dict of summary numbers per epoch, for inspection after training.
training_stats = []
total_t0 = time.time()

for epoch_i in range(0, epochs):
    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Progress line every 40 batches: step, total steps, elapsed time.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print(step, len(train_dataloader), elapsed)

        # Batch layout follows the TensorDataset: ids, attention mask, labels.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].float().to(device)

        model.zero_grad()
        outputs = model(
            b_input_ids,
            attention_mask=b_input_mask,
            labels=b_labels
        )
        loss = outputs.loss
        total_train_loss += loss.item()
        # Back-propagate on the loss where it lives; casting it to a CPU
        # FloatTensor first only adds a device-to-host copy on every step.
        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)
    print("  Average training loss: {0:.8f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch,
    # measure our performance on our validation set.
    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    total_eval_loss = 0
    # Predictions/targets of the whole validation set are collected so the
    # MSE is computed once over all samples. Averaging per-batch MSEs would
    # over-weight a final batch that is smaller than the others.
    epoch_predictions = []
    epoch_targets = []
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].float().to(device)
        with torch.no_grad():
            outputs = model(
                b_input_ids,
                attention_mask=b_input_mask,
                labels=b_labels
            )
        # Accumulate the validation loss.
        total_eval_loss += outputs.loss.item()

        # Move logits and labels to CPU as flat NumPy vectors.
        epoch_predictions.append(outputs.logits.detach().cpu().numpy().flatten())
        epoch_targets.append(b_labels.to('cpu').numpy().flatten())

    # Mean of the per-batch losses reported by the model.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Dataset-level mean squared error (y_true first, then y_pred).
    val_mse = mean_squared_error(
        np.concatenate(epoch_targets), np.concatenate(epoch_predictions))
    validation_time = format_time(time.time() - t0)

    print("  Average validation loss: {0:.8f}".format(avg_val_loss))
    print(F'\n\tValidation mse: {val_mse}')

    training_stats.append({
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        'Valid. MSE': val_mse,
        'Training Time': training_time,
        'Validation Time': validation_time,
    })

# Persist the fine-tuned weights and config once all epochs are done.
model.save_pretrained(output_model_name)
print("Training complete!")
print("Total {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
# loaded_model = DistilBertForSequenceClassification.from_pretrained(
#     pModels+"distil_O", output_hidden_states = True)
# sent1 = "With their homes in ashes, residents share harrowing tales of survival after massive wildfires kill 15"
# sent2 = "News anchor hits back at viewer who sent her snarky note about ‘showing too much cleavage’ during broadcast"
# max_len = 256 # the closest power of two exceeding max len found
# input_ids = []
# attention_masks = []
# for sent in [sent1, sent2]:
#   encoded_dict = tokenizer.encode_plus(
#       sent,
#       add_special_tokens = True,
#       max_length = max_len,
#       # pad_to_max_length = True,
#       padding = 'max_length',
#       return_attention_mask = True,
#       return_tensors = 'pt'
#   )
#   # Add the encoded sentence to the list.    
#   input_ids.append(encoded_dict['input_ids'])
      
#   # And its attention mask (simply differentiates padding from non-padding).
#   attention_masks.append(encoded_dict['attention_mask'])

# input_ids = torch.cat(input_ids, dim=0)
# attention_masks = torch.cat(attention_masks, dim=0)
# pt_output = loaded_model(input_ids, 
#                         attention_mask=attention_masks)

# token_embeddings = torch.stack(pt_output.hidden_states, dim=0)
# # print(token_embeddings.size())
# last_layer = token_embeddings[-1]
# # print(last_layer.size())
# last_layer = last_layer.permute(1,0,2)
# # print(last_layer[0].size()) # CLS token
# #print(pt_output.hidden_states[-1].detach().numpy())
# #print(pt_output.hidden_states[-1].detach().numpy().shape)
