In [1]:
# Importing the libraries needed
import pandas as pd
import numpy as np
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)
from torch import cuda
from lib.dataset_utils import *
from lib.plot_utils import *
from sklearn.metrics import accuracy_score, jaccard_score, f1_score
import matplotlib.pyplot as plt

- Minibatch
- learning rate
- momentum
- regularization
- dropout?
- topologia
- optimizer?

In [2]:
# Defining some key variables that will be used later on in the training
DATASET_NAME = DatasetEnum.GoEmotionsCleaned
MINIBATCH_SIZE = 16
EPOCHS = 1
LEARNING_RATE = 5e-05
FROZEN_LAYERS = 9
loader_params = {'batch_size': MINIBATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
# Creating the loss function and optimizer
loss_function = torch.nn.BCEWithLogitsLoss()
train_df, val_df, test_df = load_dataset(DATASET_NAME)
MAX_LEN = compute_max_tokens([train_df, val_df, test_df], RobertaTokenizer.from_pretrained('roberta-base'))

In [3]:
train_df[:10]

Unnamed: 0,text,admiration,amusement,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,...,sadness,surprise,neutral,annoyance,approval,caring,confusion,curiosity,desire,disappointment
0,My favourite food is anything I didn't have to...,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,"Now if he does off himself, everyone will thin...",0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,WHY THE FUCK IS BAYLESS ISOING,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,To make her feel threatened,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Dirty Southern Wankers,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe...,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
6,Yes I heard abt the f bombs! That has to be wh...,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,We need more boards and to create a bit more s...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,Damn youtube and outrage drama is super lucrat...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,It might be linked to the trust factor of your...,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [4]:
#TODO cross validation su topologia
class RobertaClass(torch.nn.Module):
    def __init__(self, n_classes, frozen_layers=-1):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        if frozen_layers != -1:
            for param in self.l1.embeddings.parameters():
                param.requires_grad = False
            for i in range(frozen_layers):
                for param in self.l1.encoder.layer[i].parameters():
                    param.requires_grad = False
        self.pre_classifier = torch.nn.Linear(768, 768)#TODO add_module?
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]#TODO ???
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output
    
    def get_out_dim(self):
        return self.classifier.out_features

In [5]:
def create_model_params(optimizer=torch.optim.Adam,
                        tokenizer=RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True),
                        tokenizer_max_len=None,
                        loader_params={'batch_size': 8, 'shuffle': True, 'num_workers': 0},
                        loss_function=torch.nn.BCEWithLogitsLoss(),
                        epochs=1,
                        learning_rate=1e-05,
                        val_patience=1,
                        clip_grad_norm=-1):
    return {
        'optimizer': optimizer,
        'tokenizer': tokenizer,
        'tokenizer_max_len': tokenizer_max_len,
        'loader_params': loader_params,
        'loss_function': loss_function,
        'epochs': epochs,
        'learning_rate': learning_rate,
        'val_patience': val_patience,
        'clip_grad_norm': clip_grad_norm
    }

class SimpleModelInterface:
    def __init__(self, 
                 model: torch.nn.Module, 
                 scores={},
                 model_params_dict=create_model_params()):
        self.model = model
        self.params = model_params_dict
        self.optimizer = model_params_dict['optimizer'](params=model.parameters(), lr=model_params_dict['learning_rate'])
        self.scores = scores
        self.train_scores = {name: [] for name in scores.keys()}
        self.train_loss = []
        self.val_scores = {name: [] for name in scores.keys()}
        self.val_loss = []
        self.device = 'cuda' if cuda.is_available() else 'cpu'
        model.to(self.device)

    def _train(self, training_loader, validation_loader=None, save_path=None):
        self.model.train()
        #TODO usare confusion matrix?
        cur_patience = self.params['val_patience']
        best_val_loss = np.inf
        for _ in range(self.params['epochs']):
            tr_loss = 0
            predictions_acc = []
            targets_acc = []
            for _,data in tqdm(enumerate(training_loader, 0)):
                ids = data['ids'].to(self.device, dtype = torch.long)
                mask = data['mask'].to(self.device, dtype = torch.long)
                token_type_ids = data['token_type_ids'].to(self.device, dtype = torch.long)
                targets = data['targets'].to(self.device, dtype = torch.float)

                outputs = self.model(ids, mask, token_type_ids)
                loss = loss_function(outputs, targets)
                tr_loss += loss.item()
                # append predictions and targets
                predictions_acc.extend(torch.sigmoid(outputs).detach().cpu().numpy())
                targets_acc.extend(targets.detach().cpu().numpy())

                # backward pass
                self.optimizer.zero_grad()
                loss.backward()
                if self.params['clip_grad_norm'] > -0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.params['clip_grad_norm'])
                self.optimizer.step()

            # create numpy arrays
            predictions_acc = np.array(predictions_acc)
            targets_acc = np.array(targets_acc)
            # calculate training scores
            epoch_loss = tr_loss/len(training_loader)
            self.train_loss.append(epoch_loss)
            for name, score in self.scores.items():
                self.train_scores[name].append(score(targets_acc, predictions_acc))

            # calculate validation scores
            if validation_loader is not None:
                val_scores = self.evaluate(validation_loader)
                for name, score in val_scores.items():
                    self.val_scores[name].append(score)
                self.val_loss.append(val_scores['loss'])
                if val_scores['loss'] < best_val_loss:
                    best_val_loss = val_scores['loss']
                    cur_patience = self.params['val_patience']
                    # save model
                    if save_path is not None:
                        torch.save(self.model, save_path)
                else:
                    cur_patience -= 1
                if cur_patience == 0:
                    break

        # restore best model
        if save_path is not None and validation_loader is not None:
            self.model = torch.load(save_path)
            self.model.to(self.device)




    def fit(self, training_df, validation_df=None):
        training_loader = create_data_loader_from_dataframe(training_df, self.params['tokenizer'], self.params['tokenizer_max_len'], **self.params['loader_params'])
        validation_loader = None
        if validation_df is not None:
            validation_loader = create_data_loader_from_dataframe(validation_df, self.params['tokenizer'], self.params['tokenizer_max_len'], **self.params['loader_params'])
        self._train(training_loader, validation_loader)

    def _predict(self, data_loader, compute_scores=False):
        self.model.eval()
        pred_loss=0
        # initialize target and prediction matrices
        predictions_acc = []
        targets_acc = []
        with torch.no_grad():
            for _, data in tqdm(enumerate(data_loader, 0)):
                ids = data['ids'].to(self.device, dtype = torch.long)
                mask = data['mask'].to(self.device, dtype = torch.long)
                token_type_ids = data['token_type_ids'].to(self.device, dtype=torch.long)
                targets = data['targets'].to(self.device, dtype = torch.float)
                outputs = self.model(ids, mask, token_type_ids).squeeze()
                if compute_scores:
                    # accumulate loss
                    loss = loss_function(outputs, targets)
                    pred_loss += loss.item()
                # append predictions and targets
                predictions_acc.extend(torch.sigmoid(outputs).detach().cpu().numpy())
                targets_acc.extend(targets.detach().cpu().numpy())
        return np.array(targets_acc), np.array(predictions_acc), pred_loss

    def predict(self, testing_df):
        testing_loader = create_data_loader_from_dataframe(testing_df, self.params['tokenizer'], self.params['tokenizer_max_len'], **self.params['loader_params'])
        target, out, _ = self._predict(testing_loader, compute_scores=False)
        return out, target

    def evaluate(self, testing_df):
        testing_loader = create_data_loader_from_dataframe(testing_df, self.params['tokenizer'], self.params['tokenizer_max_len'], **self.params['loader_params'])
        targets_acc, predictions_acc, pred_loss = self._predict(testing_loader, compute_scores=True)
        # calculate scores
        scores = {name: score(targets_acc, predictions_acc) for name, score in self.scores.items()}
        scores['loss'] = pred_loss/len(testing_loader)
        return scores

    def get_train_scores(self):
        return self.train_scores
    
    def get_train_loss(self):
        return self.train_loss

    def get_val_scores(self):
        return self.val_scores
    
    def get_val_loss(self):
        return self.val_loss
    
    def save_model(self, model_path, vocabulary_path):
        torch.save(self.model, model_path)
        self.tokenizer.save_vocabulary(vocabulary_path)

In [None]:
def tune_sigmoid_threshold(y_true, y_pred, metric_fun=accuracy_score, metric_params={}, is_maximization=True):
    thresholds = np.arange(0, 1, 0.01)
    scores = [metric_fun(y_true, y_pred > t, **metric_params) for t in thresholds]
    best_threshold = thresholds[np.argmax(scores)] if is_maximization else thresholds[np.argmin(scores)]
    return best_threshold, scores

def plot_threshold_tuning(y_true, y_pred, metric_fun=accuracy_score, metric_params={}, plot=False, is_maximization=True, metric_name='Accuracy'):
    best_threshold, scores = tune_sigmoid_threshold(y_true, y_pred, metric_fun, metric_params, is_maximization)
    if plot:
        plt.plot(np.arange(0, 1, 0.01), scores)
        plt.xlabel('Threshold')
        plt.ylabel(metric_name)
        # get average type if provided
        if 'average' in metric_params:
            metric_name += f' ({metric_params["average"]})'
        plt.title(f'{metric_name} over sigmoid threshold')
        plt.show()
        print(f'Best threshold: {best_threshold}')
        print(f'Best {metric_name}: {max(scores) if is_maximization else min(scores)}')

In [6]:
THRESHOLD = 0.5
def accuracy(y_true, y_pred):# tuning implicitly done in score calculation :)
    _, best_res = tune_sigmoid_threshold(y_true, y_pred, accuracy_score)
    return best_res

def jaccard(y_true, y_pred):
    _, best_res = tune_sigmoid_threshold(y_true, y_pred, jaccard_score, {'average': 'macro'})
    return best_res

def jaccard_samples(y_true, y_pred):
    _, best_res = tune_sigmoid_threshold(y_true, y_pred, jaccard_score, {'average': 'samples'})
    return best_res

def f1(y_true, y_pred):
    _, best_res = tune_sigmoid_threshold(y_true, y_pred, f1_score, {'average': 'macro'})
    return best_res

def f1_micro(y_true, y_pred):
    _, best_res = tune_sigmoid_threshold(y_true, y_pred, f1_score, {'average': 'micro'})
    return best_res

In [7]:
#FIXME nclasses
model = SimpleModelInterface(RobertaClass(train_df.shape[1]-1), {'accuracy': accuracy, 'jaccard_macro': jaccard, 'f1_macro': f1, 'jaccard_samples': jaccard_samples, 'f1_micro':f1_micro}, create_model_params(tokenizer_max_len=MAX_LEN))

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
model.fit(train_df[:1000])

125it [00:47,  2.64it/s]


In [None]:
#FIXME jaccard ed f1 non funzionano correttamente
scores = model.evaluate(test_df[:1000])
scores

In [None]:
out, target = model.predict(test_df)

In [None]:
plot_threshold_tuning(target, out, plot=True)
plot_threshold_tuning(target, out, plot=True, metric_params={'average':'micro'}, metric_fun=f1_score, metric_name='F1 Score')
plot_threshold_tuning(target, out, plot=True, metric_params={'average':'macro'}, metric_fun=f1_score, metric_name='F1 Score')