In [45]:
from pathlib import Path

import datasets
import pandas as pd
import torch

from accelerate import Accelerator
from sklearn.metrics import classification_report
from torch.utils.data.dataloader import DataLoader
from transformers import (AutoModelForSequenceClassification,
                          AutoTokenizer)
from transformers.modeling_outputs import SequenceClassifierOutput
from tqdm.notebook import tqdm

In [13]:
cv_path = Path('../../../BRUM/data/cross_validation')
humor_features_cv_path = Path('../../../BRUM/data/features/humor_features')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [35]:
class HumorRecognitionModel(torch.nn.Module):
    def __init__(self, checkpoint, num_labels):
        super().__init__()
        self.num_labels = num_labels
        self.base_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
        self.dropout = torch.nn.Dropout(0.1)
        self.linear = torch.nn.Linear(2 + 27, self.num_labels)

        for param in self.base_model.parameters():
            param.requires_grad = False

    def forward(self, input_ids, attention_mask, humor_features, labels=None, **kwargs):
        seq_output = self.base_model(input_ids=input_ids,
                                     attention_mask=attention_mask)
        concat_repr = torch.cat((seq_output.logits, humor_features), dim=1)
        dropout_output = self.dropout(concat_repr)
        logits = self.linear(dropout_output)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return SequenceClassifierOutput(loss=loss,
                                        logits=logits,
                                        hidden_states=seq_output.hidden_states,
                                        attentions=seq_output.attentions)


In [16]:
def tokenize_data(data, checkpoint, batch_size=16):
    dataset = datasets.Dataset.from_pandas(data[['Text', 'Label']].reset_index())
    tokenized_data = dataset.rename_column('Label', 'label')

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def tokenize_function(data):
        label_batch = [element['label'] for element in data]
        index_batch = [element['index'] for element in data]
        text_batch = [element['Text'] for element in data]
        tokenized = tokenizer(text_batch,
                              truncation=True,
                              padding='longest',
                              return_tensors='pt')
        return tokenized, index_batch, label_batch

    dataloader = DataLoader(tokenized_data,
                            batch_size=batch_size,
                            collate_fn=tokenize_function)
    return dataloader

In [57]:
num_epochs = 5
learning_rate = 5e-4

accelerator = Accelerator(mixed_precision='fp16')

In [58]:
def finetune_knowledge_injection(train, checkpoint, features, output, freeze_downstream=True):
    model = HumorRecognitionModel(checkpoint, 2)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=learning_rate)
    lr_scheduler = torch.optim.lr_scheduler.LinearLR(optimizer)

    label2id = model.base_model.config.label2id
    train['Label'] = train['Label'].map(label2id)
    dataloader = tokenize_data(train, checkpoint, batch_size=16)

    model, optimizer, dataloader, lr_scheduler = accelerator.prepare(model, optimizer, dataloader, lr_scheduler)

    for epoch in range(num_epochs):
        model.train()
        with tqdm(dataloader, unit='batch') as tepoch:
            for batch in tepoch:
                tepoch.set_description(f'Epoch {epoch}')

                inputs = dict(batch[0])
                batch_features = features.iloc[batch[1], :].values
                inputs['humor_features'] = torch.Tensor(batch_features.tolist()).to(device)
                inputs['labels'] = torch.Tensor(batch[2]).long().to(device)

                outputs = model(**inputs)
                loss = outputs.loss
                accelerator.backward(loss)

                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

                tepoch.set_postfix(training_loss=loss.item())

    output.mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), output/'checkpoint.pt')

    accelerator.free_memory()
    del model, optimizer, dataloader, lr_scheduler


In [59]:
for fold in cv_path.iterdir():
    train = pd.read_json(fold/'train.json')[['Text', 'Label']]
    checkpoint = list((Path('../../results/models/bertimbau')/fold.name).iterdir())[0]
    features = pd.read_hdf(humor_features_cv_path/fold.name/'train'/'data.hdf5')
    features = features.drop(columns='Label')
    output = Path('../../results/models/pipeline_bertimbau')/fold.name
    finetune_knowledge_injection(train, checkpoint, features, output)
    break


loading configuration file ..\..\results\models\bertimbau\fold_0\checkpoint-3312\config.json
Model config BertConfig {
  "_name_or_path": "..\\..\\results\\models\\bertimbau\\fold_0\\checkpoint-3312",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "H",
    "1": "N"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "H": 0,
    "N": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "problem_type": "single

  0%|          | 0/552 [00:00<?, ?batch/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/552 [00:00<?, ?batch/s]

  0%|          | 0/552 [00:00<?, ?batch/s]

  0%|          | 0/552 [00:00<?, ?batch/s]

  0%|          | 0/552 [00:00<?, ?batch/s]

In [60]:
def knowledge_injection_prediction(test, features, downstream_checkpoint, checkpoint, output):
    # Load model
    model = HumorRecognitionModel(downstream_checkpoint, 2).to(device)
    loaded_model = torch.load(checkpoint)
    model.load_state_dict(loaded_model)
    model.eval()

    # Convert labels to model ids
    label2id = model.base_model.config.label2id
    id2label = model.base_model.config.id2label
    test['Label'] = test['Label'].map(label2id)
    dataloader = tokenize_data(test, downstream_checkpoint, batch_size=64)

    model, dataloader = accelerator.prepare(model, dataloader)

    results_dict = {'Prediction': list(),
                    'Label': list()}
    for batch in tqdm(dataloader, unit='batch'):
        # Compute model inputs
        inputs = dict(batch[0])
        batch_features = features.iloc[batch[1], :].values
        inputs['humor_features'] = torch.Tensor(batch_features.tolist()).to(device)
        inputs['labels'] = torch.Tensor(batch[2]).to(device).long()

        # Do prediction
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits.softmax(1), 1)

        # Create predictions dataframe
        results_dict['Prediction'].extend(predictions.tolist())
        results_dict['Label'].extend(inputs['labels'].tolist())
    results = pd.DataFrame(results_dict, index=test.index)
    results['Prediction'] = results['Prediction'].map(model.base_model.config.id2label)
    results['Label'] = results['Label'].map(id2label)

    # Save predictions
    output.parent.mkdir(parents=True, exist_ok=True)
    results.to_csv(output, encoding='utf-8')

    accelerator.free_memory()
    del model, dataloader


In [61]:
for fold in cv_path.iterdir():
    test = pd.read_json(fold/'test.json')[['Text', 'Label']]

    features = pd.read_hdf(humor_features_cv_path/fold.name/'train'/'data.hdf5')
    features = features.drop(columns='Label')
    
    downstream_checkpoint_folder = Path('../../results/models/bertimbau')/fold.name
    downstream_checkpoint = list(downstream_checkpoint_folder.iterdir())[0]

    checkpoint_folder = Path('../../results/models/pipeline_bertimbau')
    checkpoint = checkpoint_folder/fold.name/'checkpoint.pt'

    output_folder = Path('../../results/predictions/pipeline_bertimbau')/fold.name
    output = output_folder.with_suffix('.csv')

    knowledge_injection_prediction(test, features, downstream_checkpoint, checkpoint, output)
    break

loading configuration file ..\..\results\models\bertimbau\fold_0\checkpoint-3312\config.json
Model config BertConfig {
  "_name_or_path": "..\\..\\results\\models\\bertimbau\\fold_0\\checkpoint-3312",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "H",
    "1": "N"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "H": 0,
    "N": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "problem_type": "single

  0%|          | 0/16 [00:00<?, ?batch/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [62]:
predictions_path = Path('../../results/predictions')

results = dict()
for method in predictions_path.iterdir():
    folds = dict()
    for fold in method.iterdir():
        fold_df = pd.read_csv(fold)
        evaluation = classification_report(fold_df['Label'],
                                           fold_df['Prediction'],
                                           output_dict=True)
        evaluation_df = pd.DataFrame.from_dict(evaluation)
        folds[fold.stem] = evaluation_df
    results[method.stem] = pd.concat(folds, names=['Fold'])
results_df = pd.concat(results, names=['Method'])
results_df.index = results_df.index.rename('Metric', level=2)
results_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,H,N,accuracy,macro avg,weighted avg
Method,Fold,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
albertina_ptbr_base,fold_0,precision,0.740576,0.703774,0.720693,0.722175,0.722194
albertina_ptbr_base,fold_0,recall,0.680244,0.761224,0.720693,0.720734,0.720693
albertina_ptbr_base,fold_0,f1-score,0.709130,0.731373,0.720693,0.720251,0.720240
albertina_ptbr_base,fold_0,support,491.000000,490.000000,0.720693,981.000000,981.000000
albertina_ptbr_base,fold_1,precision,0.797531,0.708333,0.745158,0.752932,0.752978
...,...,...,...,...,...,...,...
knwoledge_injection_bertimbau_frozen,fold_9,support,490.000000,490.000000,0.708163,980.000000,980.000000
pipeline_bertimbau,fold_0,precision,0.693182,0.656192,0.672783,0.674687,0.674706
pipeline_bertimbau,fold_0,recall,0.621181,0.724490,0.672783,0.672836,0.672783
pipeline_bertimbau,fold_0,f1-score,0.655209,0.688652,0.672783,0.671931,0.671914


In [64]:
results_df.query('Fold == "fold_0"')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,H,N,accuracy,macro avg,weighted avg
Method,Fold,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
albertina_ptbr_base,fold_0,precision,0.740576,0.703774,0.720693,0.722175,0.722194
albertina_ptbr_base,fold_0,recall,0.680244,0.761224,0.720693,0.720734,0.720693
albertina_ptbr_base,fold_0,f1-score,0.70913,0.731373,0.720693,0.720251,0.72024
albertina_ptbr_base,fold_0,support,491.0,490.0,0.720693,981.0,981.0
albertina_ptpt_base,fold_0,precision,0.737527,0.709615,0.722732,0.723571,0.723585
albertina_ptpt_base,fold_0,recall,0.692464,0.753061,0.722732,0.722763,0.722732
albertina_ptpt_base,fold_0,f1-score,0.714286,0.730693,0.722732,0.722489,0.722481
albertina_ptpt_base,fold_0,support,491.0,490.0,0.722732,981.0,981.0
bertimbau,fold_0,precision,0.693878,0.657407,0.673802,0.675642,0.675661
bertimbau,fold_0,recall,0.623218,0.72449,0.673802,0.673854,0.673802
