# Importaciones e inicialización

In [14]:
import pandas as pd
import math
import warnings

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torchmetrics as tm
from torchmetrics.classification import ConfusionMatrix
import pytorch_lightning as pl

from transformers import AutoTokenizer, AutoModel, AdamW, get_cosine_schedule_with_warmup
from sklearn.model_selection import KFold

# Module-wide compute device: CUDA when torch reports it as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Útiles

In [15]:
def load_glossary(path="../glossary/isoiecieee5652.csv"):
    """Read the glossary CSV and return it with whitespace-trimmed terms.

    Args:
        path: Location of a CSV file that contains a ``Term`` column.

    Returns:
        pandas.DataFrame: The parsed file, with leading and trailing
        whitespace removed from every value of the ``Term`` column.
    """
    terms_table = pd.read_csv(path)
    trimmed_terms = terms_table['Term'].str.strip()
    terms_table['Term'] = trimmed_terms
    return terms_table


def count_relevant_terms(comment, glossary):
    """Count how many glossary terms occur in a comment.

    Each row of ``glossary['Term']`` contributes at most one to the total,
    and a term counts when it appears anywhere in ``comment`` as a
    case-sensitive substring.

    Args:
        comment: Text to inspect.
        glossary: Table exposing a ``Term`` column of strings.

    Returns:
        int: Number of glossary rows whose term is contained in ``comment``.
    """
    return sum(1 for entry in glossary['Term'] if entry in comment)


def build_count_feature_vector(size, comment, glossary, full=True):
    """Encode the number of glossary terms found in a comment as a vector.

    Args:
        size: Length of the vector to create.
        comment: Text whose glossary matches are counted.
        glossary: Table with a ``Term`` column, forwarded to
            ``count_relevant_terms``.
        full: When true, every position holds the count; otherwise only
            position 0 holds it and the remaining positions stay at zero.

    Returns:
        torch.Tensor: Tensor created by ``torch.zeros(size)`` and filled as
        described above.
    """
    vector = torch.zeros(size)
    matches = count_relevant_terms(comment, glossary)

    if not full:
        vector[0] = matches
        return vector

    vector[:] = matches
    return vector


def build_position_feature_vector(size, comment, glossary, tokenizer):
    """Mark the token positions of a comment that belong to a glossary term.

    The comment is encoded with truncation to ``size`` tokens so that the
    positions line up with a model input of the same maximum length and can
    never point past the end of the feature vector. Previously the comment
    was encoded without a length limit, so long comments produced match
    positions beyond ``size``.

    Args:
        size: Length of the feature vector; also used as the maximum number
            of comment tokens considered.
        comment: Text to analyse.
        glossary: Table with a ``Term`` column of strings.
        tokenizer: Object with a Hugging Face style ``encode`` method that
            accepts ``add_special_tokens``, ``truncation`` and ``max_length``.

    Returns:
        torch.Tensor: Vector of length ``size`` holding 1 at every token
        position covered by a glossary term and 0 elsewhere.
    """
    feature_vector = torch.zeros(size)
    tokenized_comment = tokenizer.encode(comment, truncation=True, max_length=size)

    for term in glossary["Term"]:
        # Each term is looked up both bare and with a leading space; presumably the
        # tokenizer yields different ids for a word at the start of the text and a
        # word that follows whitespace -- TODO confirm for the tokenizer in use.
        for surface_form in (term, " " + term):
            tokenized_term = tokenizer.encode(surface_form, add_special_tokens=False)
            apply_positional_match(feature_vector, tokenized_comment, tokenized_term)

    return feature_vector


def apply_positional_match(feature_vector, tokenized_comment, tokenized_term):
    """Set to 1 every feature-vector position covered by an occurrence of a term.

    The comment is scanned for contiguous occurrences of ``tokenized_term``.
    Only the part of the comment that fits inside ``feature_vector`` is
    scanned, so occurrences reaching past the end of the vector are ignored
    instead of raising ``IndexError``.

    Args:
        feature_vector: Mutable, indexable 1-D container (e.g. a tensor)
            that is updated in place.
        tokenized_comment: Sequence of token ids of the comment.
        tokenized_term: Sequence of token ids of the term to look for.

    Returns:
        None. ``feature_vector`` is modified in place.
    """
    term_length = len(tokenized_term)
    if term_length == 0:
        # An empty term covers no position, so there is nothing to mark.
        return

    # Positions at or beyond the vector length cannot be represented.
    scan_length = min(len(tokenized_comment), len(feature_vector))

    for start in range(scan_length - term_length + 1):
        if tokenized_comment[start:start + term_length] == tokenized_term:
            for offset in range(term_length):
                feature_vector[start + offset] = 1


def collate_function(batch):
    """Combine dataset items into one batch dictionary of stacked tensors.

    Fixes over the previous version:
      * feature vectors are stacked; ``torch.tensor`` cannot convert a list
        of multi-element tensors and raised an error;
      * labels are stacked as well, so a per-item label of shape ``(1,)``
        yields a ``(batch, 1)`` tensor instead of a flattened ``(batch,)``;
      * the stacked feature vectors are exposed under ``"feature_vector"``,
        the key used by the dataset items and read by the model steps. The
        former ``"feature_vectors"`` key is kept as an alias.

    Args:
        batch: List of dictionaries with the keys ``input_ids``,
            ``attention_mask``, ``label`` and ``feature_vector``.

    Returns:
        dict: ``input_ids``, ``attention_mask``, ``label``,
        ``feature_vector`` and ``feature_vectors``, each stacked along a new
        leading batch dimension.
    """
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    # as_tensor leaves tensors untouched and also accepts plain Python numbers.
    labels = torch.stack([torch.as_tensor(item["label"]) for item in batch])
    feature_vectors = torch.stack([torch.as_tensor(item["feature_vector"]) for item in batch])

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "label": labels,
        "feature_vector": feature_vectors,
        # Backward-compatible alias of "feature_vector".
        "feature_vectors": feature_vectors
    }

# Comment Filter

In [16]:
class CommentDataset (Dataset):
    """Map-style dataset that turns data-frame rows into tokenized model inputs.

    Each row must expose a ``Review`` attribute (the comment text). All
    columns from position 1 onwards are converted into the float label
    tensor.
    """

    def __init__(self, data: pd.DataFrame, tokenizer, max_token_len):
        """Store the data and the tokenization settings.

        Args:
            data: Frame holding a ``Review`` column; the columns from
                position 1 onwards are used as labels.
            tokenizer: Object providing a Hugging Face style ``encode_plus``.
            max_token_len: Length every comment is padded/truncated to.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the comment at positional ``index``.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` (flattened 1-D
            tensors) and ``label`` (float32 tensor with one entry per label
            column).
        """
        # Set on every access; presumably so that DataLoader worker processes
        # silence FutureWarning as well -- TODO confirm.
        warnings.simplefilter(action="ignore", category=FutureWarning)
        row = self.data.iloc[index]
        comment = str(row.Review)
        # Go through NumPy instead of handing the Series to torch.FloatTensor, which
        # depends on positional integer indexing of a label-indexed Series.
        label = torch.tensor(row.iloc[1:].to_numpy(dtype="float32"))
        encoding = self.tokenizer.encode_plus(
            comment,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = encoding["input_ids"].flatten()
        actual_length = len(input_ids)
        if actual_length != self.max_token_len:
            # The previous diagnostic read a nonexistent ``.len`` attribute and crashed.
            print("Bad length, expected ", self.max_token_len, ", got ", actual_length)

        return {'input_ids': input_ids,
                'attention_mask': encoding["attention_mask"].flatten(),
                'label': label}

In [17]:
class CommentDataModule(pl.LightningDataModule):
    """Lightning data module serving a fixed train/test split of comments.

    The test split backs the validation, test and prediction loaders alike.
    """

    def __init__(self, train_data, test_data, tokenizer, batch_size, max_token_len):
        """Keep the raw frames and loader settings; datasets are built in ``setup``."""
        super().__init__()
        self.train_data, self.test_data = train_data, test_data
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        # Filled in by setup().
        self.train_dataset = None
        self.test_dataset = None

    def setup(self, stage=None):
        """Wrap both frames in ``CommentDataset`` objects (``stage`` is not used)."""
        self.train_dataset, self.test_dataset = (
            CommentDataset(frame, self.tokenizer, self.max_token_len)
            for frame in (self.train_data, self.test_data)
        )

    def _make_loader(self, dataset, shuffle):
        """Create a DataLoader with the settings shared by every split."""
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle, num_workers=2,
                          persistent_workers=True)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._make_loader(self.train_dataset, True)

    def val_dataloader(self):
        """Ordered loader over the test dataset, used for validation."""
        return self._make_loader(self.test_dataset, False)

    def test_dataloader(self):
        """Ordered loader over the test dataset."""
        return self._make_loader(self.test_dataset, False)

    def predict_dataloader(self):
        """Ordered loader over the test dataset, used for prediction."""
        return self._make_loader(self.test_dataset, False)

In [18]:
class CrossCommentDataModule(pl.LightningDataModule):
    """Lightning data module that serves one fold of a K-fold split.

    The held-out part of the selected fold backs the validation, test and
    prediction loaders alike.
    """

    def __init__(self, k_fold, n_folds, split_seed, full_dataset, tokenizer, batch_size, max_token_len):
        """Store the split description and loader settings.

        Args:
            k_fold: Index of the fold served by this module.
            n_folds: Total number of folds.
            split_seed: Seed that makes the fold assignment repeatable.
            full_dataset: Frame containing every sample.
            tokenizer: Tokenizer handed to the datasets.
            batch_size: Batch size of every loader.
            max_token_len: Token length handed to the datasets.
        """
        super().__init__()
        self.full_dataset = full_dataset
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len

        self.k_fold = k_fold
        self.n_folds = n_folds
        self.split_seed = split_seed

        # Filled in by setup().
        self.train_dataset = None
        self.test_dataset = None

    def setup(self, stage=None):
        """Compute the shuffled K-fold split and build the datasets of fold ``k_fold``."""
        splitter = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.split_seed)
        fold_splits = list(splitter.split(self.full_dataset))
        train_rows, held_out_rows = (rows.tolist() for rows in fold_splits[self.k_fold])

        self.train_dataset = CommentDataset(self.full_dataset.iloc[train_rows], self.tokenizer,
                                            self.max_token_len)
        self.test_dataset = CommentDataset(self.full_dataset.iloc[held_out_rows], self.tokenizer,
                                           self.max_token_len)

    def _make_loader(self, dataset, shuffle):
        """Create a DataLoader with the settings shared by every split."""
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle, num_workers=2,
                          persistent_workers=True)

    def train_dataloader(self):
        """Shuffled loader over the training part of the fold."""
        return self._make_loader(self.train_dataset, True)

    def val_dataloader(self):
        """Ordered loader over the held-out part of the fold, used for validation."""
        return self._make_loader(self.test_dataset, False)

    def test_dataloader(self):
        """Ordered loader over the held-out part of the fold."""
        return self._make_loader(self.test_dataset, False)

    def predict_dataloader(self):
        """Ordered loader over the held-out part of the fold, used for prediction."""
        return self._make_loader(self.test_dataset, False)

In [19]:
class CommentFilter(pl.LightningModule):
    """Binary comment-relevance classifier on top of a pretrained encoder.

    The encoder's last hidden state is averaged over the sequence dimension,
    passed through a single linear unit and a sigmoid, and trained with
    binary cross-entropy on the resulting probabilities.

    ``config`` keys read by this class: ``model``, ``lr``, ``weight_decay``
    and ``warmup``. ``train_size`` and ``batch_size`` are only consulted as
    a fallback when the trainer cannot estimate the number of optimizer
    steps.
    """

    def __init__(self, config: dict):
        """Load the pretrained encoder and create the head, loss and metrics."""
        super().__init__()
        self.config = config
        self.pretrained_model = AutoModel.from_pretrained(config['model'], return_dict=True).to(device)
        self.linear_classifier = torch.nn.Linear(self.pretrained_model.config.hidden_size, 1).to(device)
        self.sigmoid = torch.nn.Sigmoid().to(device)
        torch.nn.init.xavier_uniform_(self.linear_classifier.weight)
        self.criterion = nn.BCELoss()
        self.num_classes = 2

        # Outputs and targets collected during validation/testing; see get_metrics().
        self.predictions = []
        self.references = []

        # metrics
        self.accuracy = tm.Accuracy(task="binary")
        self.precision = tm.Precision(task="binary")
        self.recall = tm.Recall(task="binary")
        self.f1_score = tm.F1Score(task="binary")

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and the classification head.

        Returns:
            tuple: ``(loss, probabilities)``. ``loss`` is the integer 0 when
            ``labels`` is None; ``probabilities`` are the sigmoid outputs.
        """
        encoded = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)

        # Plain mean over the sequence dimension; the attention mask is not applied
        # here, so padded positions take part in the average.
        pooled = encoded.last_hidden_state.mean(dim=1)
        probabilities = self.sigmoid(self.linear_classifier(pooled))

        loss = 0
        if labels is not None:
            loss = self.criterion(probabilities, labels)
        return loss, probabilities

    def _forward_batch(self, batch):
        """Move a batch to the module's device and return ``(loss, probabilities, labels)``."""
        input_ids = batch["input_ids"].to(self.device)
        attention_mask = batch["attention_mask"].to(self.device)
        labels = batch["label"].to(self.device)
        loss, probabilities = self(input_ids, attention_mask, labels)
        return loss, probabilities, labels

    def training_step(self, batch, batch_index):
        """Compute and log the training loss for one batch."""
        loss, outputs, labels = self._forward_batch(batch)

        # Logged as 'train_loss'; it used to be logged under the validation key 'val_loss'.
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return {"loss": loss, "predictions": outputs, "labels": labels}

    def on_validation_epoch_start(self):
        """Drop outputs of earlier validation runs so that they do not pile up."""
        self.predictions.clear()
        self.references.clear()

    def validation_step(self, batch, batch_index):
        """Compute the loss and the binary metrics for one validation batch."""
        loss, outputs, labels = self._forward_batch(batch)

        metrics = {
            "val_loss": loss,
            "accuracy": self.accuracy(outputs, labels),
            "precision": self.precision(outputs, labels),
            "recall": self.recall(outputs, labels),
            "F1-score": self.f1_score(outputs, labels)
        }

        self.predictions.append(outputs.detach())
        self.references.append(labels)

        self.log_dict(metrics, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return {"loss": loss, "predictions": outputs, "labels": labels}

    def test_step(self, batch, batch_index):
        """Compute the loss and accuracy for one test batch and collect the outputs."""
        loss, outputs, labels = self._forward_batch(batch)

        metrics = {
            # Logged as 'test_loss'; it used to be mislabelled as 'train_loss'.
            "test_loss": loss,
            "accuracy": self.accuracy(outputs, labels),
        }

        self.predictions.append(outputs.detach())
        self.references.append(labels)

        self.log_dict(metrics, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def predict_step(self, batch, batch_index):
        """Return the predicted probabilities for one batch.

        Only ``input_ids`` and ``attention_mask`` are read. Unpacking the
        whole batch into ``forward`` failed because batches carry a
        ``label`` key that ``forward`` does not accept.
        """
        _, outputs = self(batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device))
        return outputs

    def on_test_start(self):
        """Forget outputs collected before the test run."""
        self.predictions.clear()
        self.references.clear()

    def _confusion_matrix(self):
        """Build the 2x2 confusion matrix of the collected outputs (threshold 0.5).

        Raises:
            RuntimeError: If no outputs have been collected yet.
        """
        if not self.predictions:
            raise RuntimeError("No predictions collected; run validation or testing first.")

        predictions = (torch.concat(self.predictions) > 0.5).int()
        labels = torch.concat(self.references).int()
        confusion_mat = ConfusionMatrix(task="binary", num_classes=2).to(predictions.device)
        return confusion_mat(predictions, labels)

    def on_test_end(self):
        """Print the confusion matrix of the finished test run."""
        print("Cunfusion Matrix: \n", self._confusion_matrix())

    def get_metrics(self):
        """Derive accuracy, precision, recall and F1 from the collected outputs.

        Returns:
            dict: ``accuracy``, ``precision``, ``recall`` and ``F1-score`` as
            floats for the positive class. A ratio whose denominator is zero
            is reported as 0.0.
        """
        conf_result = self._confusion_matrix()

        # torchmetrics layout: rows are true classes, columns are predicted classes,
        # i.e. [[TN, FP], [FN, TP]]. The previous code read TP and TN swapped.
        tn = float(conf_result[0, 0].item())
        fp = float(conf_result[0, 1].item())
        fn = float(conf_result[1, 0].item())
        tp = float(conf_result[1, 1].item())

        print(conf_result, tp, fp, fn, tn)

        def ratio(numerator, denominator):
            return numerator / denominator if denominator else 0.0

        precision = ratio(tp, tp + fp)
        recall = ratio(tp, tp + fn)

        return {
            "accuracy": ratio(tp + tn, tp + fp + fn + tn),
            "precision": precision,
            "recall": recall,
            "F1-score": ratio(2 * precision * recall, precision + recall)
        }

    def configure_optimizers(self):
        """Create AdamW and a cosine schedule with linear warmup, stepped per batch.

        The schedule is expressed in optimizer steps, so it is registered
        with ``interval="step"``; with Lightning's default per-epoch stepping
        the warmup would only advance once per epoch. The total number of
        steps is taken from the trainer's own estimate.
        """
        optimizer = AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['weight_decay'])

        estimated_steps = self.trainer.estimated_stepping_batches
        if math.isfinite(estimated_steps):
            total_steps = int(estimated_steps)
        else:
            # No finite estimate (open-ended training): fall back to the configured sizes.
            total_steps = math.ceil(self.config['train_size'] / self.config['batch_size'])

        warmup_steps = math.floor(total_steps * self.config['warmup'])
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

# Comment Filter (Feature Vector)

In [20]:
class CommentDatasetFV (CommentDataset):
    """CommentDataset variant whose items also carry a glossary feature vector.

    ``feature_vector_type`` selects how the vector is built:
    ``"RELEVANT_COUNT"`` (number of glossary terms found in the comment) or
    ``"RELEVANT_POSITION"`` (token positions covered by glossary terms).
    """

    def __init__(self, data: pd.DataFrame, tokenizer, max_token_len, feature_vector_type="RELEVANT_COUNT"):
        """Store the data, load the glossary from its default path and keep the vector type."""
        super().__init__(data, tokenizer, max_token_len)
        self.glossary = load_glossary()
        self.feature_vector_type = feature_vector_type

    def __getitem__(self, index):
        """Return the parent item extended with a ``feature_vector`` entry.

        Tokenization and label extraction are delegated to
        ``CommentDataset.__getitem__`` instead of being duplicated here. The
        per-item debug print of the comment text was removed.

        Raises:
            ValueError: If ``feature_vector_type`` is not a supported value.
        """
        warnings.simplefilter(action="ignore", category=FutureWarning)
        comment = str(self.data.iloc[index].Review)

        if self.feature_vector_type == "RELEVANT_COUNT":
            feature_vector = build_count_feature_vector(self.max_token_len, comment, self.glossary, full=True)

        elif self.feature_vector_type == "RELEVANT_POSITION":
            feature_vector = build_position_feature_vector(self.max_token_len, comment, self.glossary, self.tokenizer)

        else:
            raise ValueError(f"ERROR: Invalid feature vector type: {self.feature_vector_type}")

        item = super().__getitem__(index)
        item['feature_vector'] = feature_vector
        return item

In [21]:
class CommentDataModuleFV(CommentDataModule):
    """CommentDataModule variant that serves feature-vector datasets.

    Every loader uses ``collate_function`` to assemble its batches.
    """

    def __init__(self, train_data, test_data, tokenizer, batch_size, max_token_len,
                 feature_vector_type="RELEVANT_COUNT"):
        """Store the settings.

        Args:
            train_data: Frame with the training samples.
            test_data: Frame with the evaluation samples.
            tokenizer: Tokenizer handed to the datasets.
            batch_size: Batch size of every loader.
            max_token_len: Token length handed to the datasets.
            feature_vector_type: Feature vector variant handed to
                ``CommentDatasetFV``. The default matches the dataset's own
                default, so existing callers behave as before.
        """
        super().__init__(train_data, test_data, tokenizer, batch_size, max_token_len)
        self.feature_vector_type = feature_vector_type

    def setup(self, stage=None):
        """Build both feature-vector datasets (``stage`` is not used)."""
        self.train_dataset = CommentDatasetFV(self.train_data, self.tokenizer, self.max_token_len,
                                              self.feature_vector_type)
        self.test_dataset = CommentDatasetFV(self.test_data, self.tokenizer, self.max_token_len,
                                             self.feature_vector_type)

    def _make_fv_loader(self, dataset, shuffle):
        """Create a DataLoader with the settings shared by every split."""
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle, num_workers=2,
                          persistent_workers=True, collate_fn=collate_function)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._make_fv_loader(self.train_dataset, True)

    def val_dataloader(self):
        """Ordered loader over the test dataset, used for validation."""
        return self._make_fv_loader(self.test_dataset, False)

    def test_dataloader(self):
        """Ordered loader over the test dataset."""
        return self._make_fv_loader(self.test_dataset, False)

    def predict_dataloader(self):
        """Ordered loader over the test dataset, used for prediction."""
        return self._make_fv_loader(self.test_dataset, False)

In [22]:
class CrossCommentDataModuleFV(CrossCommentDataModule):
    """K-fold data module whose datasets also produce glossary feature vectors."""

    def __init__(self, k_fold, n_folds, split_seed, full_dataset, tokenizer, batch_size, max_token_len, feature_vector_type):
        """Forward the fold settings to the parent and remember the feature vector type."""
        super().__init__(k_fold, n_folds, split_seed, full_dataset, tokenizer, batch_size, max_token_len)

        self.feature_vector_type = feature_vector_type

    def setup(self, stage=None):
        """Compute the shuffled K-fold split and build the datasets of fold ``k_fold``."""
        splitter = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.split_seed)
        fold_splits = list(splitter.split(self.full_dataset))
        train_rows, held_out_rows = (rows.tolist() for rows in fold_splits[self.k_fold])

        def dataset_for(rows):
            # Same construction for both parts of the fold.
            return CommentDatasetFV(self.full_dataset.iloc[rows], self.tokenizer,
                                    self.max_token_len, self.feature_vector_type)

        self.train_dataset = dataset_for(train_rows)
        self.test_dataset = dataset_for(held_out_rows)

In [23]:
class CommentFilterFV(CommentFilter):
    """CommentFilter variant that appends a glossary feature vector to the pooled encoding.

    The classification head receives the pooled encoder output concatenated
    with a feature vector of length ``config['max_token_len']``. With
    ``config['use_mlp']`` set, two hidden ReLU layers precede the final
    linear unit.
    """

    def __init__(self, config: dict):
        """Build the parent model and replace the head with one sized for the extra features."""
        super().__init__(config)

        self.vector_size = self.pretrained_model.config.hidden_size + config['max_token_len']

        self.linear_classifier = torch.nn.Linear(self.vector_size, 1).to(device)
        torch.nn.init.xavier_uniform_(self.linear_classifier.weight)

        self.use_mlp = config['use_mlp']

        # The last layer of the MLP is the very same module as ``linear_classifier``,
        # so both heads share that layer's weights.
        self.multilayer_perceptron = nn.Sequential(
            nn.Linear(self.vector_size, self.vector_size),
            nn.ReLU(),
            nn.Linear(self.vector_size, self.vector_size),
            nn.ReLU(),
            self.linear_classifier
        ).to(device)

    def forward(self, input_ids, attention_mask, labels=None, feature_vectors=None):
        """Run the encoder, append the feature vectors and classify.

        Returns:
            tuple: ``(loss, probabilities)``. ``loss`` is the integer 0 when
            ``labels`` is None; ``probabilities`` are the sigmoid outputs.

        Raises:
            ValueError: If ``feature_vectors`` is None. The keyword default
                only exists to keep the parameter order of the parent.
        """
        if feature_vectors is None:
            raise ValueError("CommentFilterFV.forward requires feature_vectors")

        encoded = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
        # Plain mean over the sequence dimension; the attention mask is not applied here.
        pooled = encoded.last_hidden_state.mean(dim=1)

        output_with_domain_information = torch.cat((pooled, feature_vectors), dim=1)

        head = self.multilayer_perceptron if self.use_mlp else self.linear_classifier
        probabilities = self.sigmoid(head(output_with_domain_information))

        loss = 0
        if labels is not None:
            loss = self.criterion(probabilities, labels)
        return loss, probabilities

    @staticmethod
    def _feature_vectors(batch):
        """Fetch the feature vectors from a batch, accepting either key spelling."""
        if "feature_vector" in batch:
            return batch["feature_vector"]
        return batch["feature_vectors"]

    def _forward_batch(self, batch):
        """Move a batch to the module's device and return ``(loss, probabilities, labels)``."""
        input_ids = batch["input_ids"].to(self.device)
        attention_mask = batch["attention_mask"].to(self.device)
        labels = batch["label"].to(self.device)
        feature_vectors = self._feature_vectors(batch).to(self.device)
        loss, probabilities = self(input_ids, attention_mask, labels, feature_vectors)
        return loss, probabilities, labels

    def training_step(self, batch, batch_index):
        """Compute and log the training loss for one batch."""
        loss, outputs, labels = self._forward_batch(batch)

        # Logged as 'train_loss'; it used to be logged under the validation key 'val_loss'.
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_index):
        """Compute the loss and the binary metrics for one validation batch."""
        loss, outputs, labels = self._forward_batch(batch)

        metrics = {
            "val_loss": loss,
            "accuracy": self.accuracy(outputs, labels),
            "precision": self.precision(outputs, labels),
            "recall": self.recall(outputs, labels),
            "F1-score": self.f1_score(outputs, labels)
        }

        self.predictions.append(outputs.detach())
        self.references.append(labels)

        self.log_dict(metrics, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return {"loss": loss, "predictions": outputs, "labels": labels}

    def test_step(self, batch, batch_index):
        """Compute the loss and accuracy for one test batch and collect the outputs."""
        loss, outputs, labels = self._forward_batch(batch)

        metrics = {
            # Logged as 'test_loss'; it used to be mislabelled as 'train_loss'.
            "test_loss": loss,
            "accuracy": self.accuracy(outputs, labels),
        }

        self.predictions.append(outputs.detach())
        self.references.append(labels)

        self.log_dict(metrics, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def predict_step(self, batch, batch_index):
        """Return the predicted probabilities for one batch.

        Reads only the inputs and the feature vectors; the inherited version
        unpacked the whole batch into ``forward``, whose parameter names do
        not match the batch keys.
        """
        _, outputs = self(batch["input_ids"].to(self.device),
                          batch["attention_mask"].to(self.device),
                          feature_vectors=self._feature_vectors(batch).to(self.device))
        return outputs

In [24]:
def metrics_average(metrics_list):
    """Average the per-fold metric dictionaries.

    Args:
        metrics_list: Non-empty sequence of dictionaries that each contain
            the keys ``accuracy``, ``precision``, ``recall`` and
            ``F1-score``.

    Returns:
        dict: The arithmetic mean of every metric over all dictionaries.

    Raises:
        ValueError: If ``metrics_list`` is empty (this used to surface as a
            ``ZeroDivisionError``).
        KeyError: If a dictionary lacks one of the four metrics.
    """
    if not metrics_list:
        raise ValueError("metrics_list must contain at least one metrics dictionary")

    metric_names = ("accuracy", "precision", "recall", "F1-score")
    fold_count = len(metrics_list)

    return {
        name: sum(fold_metrics[name] for fold_metrics in metrics_list) / fold_count
        for name in metric_names
    }

def cross_validation_relevance(model_name, data_name, n_folds, config, mode="base"):
    """Train and evaluate a relevance model with K-fold cross-validation.

    For every fold a fresh model is trained, tested and saved under
    ``../models/fine-tuned/``; the metrics of each fold and their average
    are printed.

    Args:
        model_name: Directory name of the pretrained model below ``../models/``.
        data_name: File stem of the CSV data set below ``../data/``.
        n_folds: Number of folds.
        config: Hyper-parameter dictionary. It is no longer modified: each
            fold works on its own copy with ``train_size`` filled in.
        mode: ``"base"`` for CommentFilter, ``"FV"`` for CommentFilterFV.

    Returns:
        dict: ``{"folds": [...], "average": {...}}`` with the metrics of
        every fold and their mean (previously nothing was returned).

    Raises:
        ValueError: If ``mode`` is neither ``"base"`` nor ``"FV"``. The check
            now runs before the tokenizer and the data are loaded.
    """
    if mode not in ("base", "FV"):
        raise ValueError(f"ERROR: Invalid relevance mode: {mode}")

    model_path = "../models/" + model_name
    data_path = "../data/" + data_name + ".csv"

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    split_seed = 123
    results = []

    dataframe = pd.read_csv(data_path)

    for k in range(n_folds):

        # Work on a copy so that the caller's dictionary is left untouched.
        fold_config = dict(config)

        if mode == "base":
            data_module = CrossCommentDataModule(k, n_folds, split_seed, dataframe, tokenizer,
                                                 batch_size=fold_config['batch_size'],
                                                 max_token_len=fold_config['max_token_len'])
            model_class = CommentFilter
        else:
            data_module = CrossCommentDataModuleFV(k, n_folds, split_seed, dataframe, tokenizer,
                                                   batch_size=fold_config['batch_size'],
                                                   max_token_len=fold_config['max_token_len'],
                                                   feature_vector_type=fold_config['FV_type'])
            model_class = CommentFilterFV

        data_module.setup()

        # Same value as before: the length of the training DataLoader, i.e. its number of batches.
        fold_config["train_size"] = len(data_module.train_dataloader())

        relevance_model = model_class(fold_config)

        # training
        trainer = pl.Trainer(max_epochs=fold_config['n_epochs'], log_every_n_steps=5, enable_progress_bar=True)
        trainer.fit(relevance_model, data_module)

        # evaluation
        trainer.test(relevance_model, data_module)
        metrics = relevance_model.get_metrics()

        results.append(metrics)
        print(f" {k+1}-Fold metrics: {metrics}")

        # NOTE(review): the "(L+RC)" tag is fixed and does not reflect `mode` or the
        # configured feature vector type -- confirm whether the file name should vary.
        save_path = f"../models/fine-tuned/relevance_model {model_name}(L+RC)-{data_name}-K{k + 1}.pth"
        torch.save(relevance_model, save_path)

    average = metrics_average(results)

    print("Results :", results)
    print("Avg results: ", average)

    return {"folds": results, "average": average}

# Evaluación

In [None]:
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action="ignore", category=FutureWarning)

# File stem of the data set and directory name of the pretrained model;
# cross_validation_relevance resolves them to "../data/<dataset>.csv" and "../models/<model>".
dataset = "templerun2_labeled"
model = "BERTweet - base"

# fine_tuned = "../models/fine-tuned/comment_relevance_detector (facebook).pth"
n_folds = 5

# Hyper-parameters handed to the data modules and models.
config = {
    'model': "../models/" + model,
    'batch_size': 16,
    'lr': 2e-5,
    'warmup': 0.2,  # fraction of the scheduler steps used for warmup
    'train_size': None,  # placeholder; the per-fold value is computed inside cross_validation_relevance
    'weight_decay': 0.001,
    'max_token_len': 130,
    'n_epochs': 2,
    'FV_type': 'RELEVANT_COUNT',    # RELEVANT_COUNT or RELEVANT_POSITION
    'use_mlp': False,
    'mlp_dimension': 100  # NOTE(review): no code visible in this section reads this key -- confirm it is needed
}

# Run the cross-validation with the feature-vector ("FV") model variant.
cross_validation_relevance(model, dataset, n_folds, config, "FV")

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name                  | Type            | Params
----------------------------------------------------------
0 | pretrained_model      | RobertaModel    | 134 M 
1 | linear_classifier     | Linear          | 899   
2 | sigmoid               | Sigmoid         | 0     
3 | criterion             | BCELoss         | 0     
4 | accuracy              | BinaryAccuracy  | 0     
5 | precision             | BinaryPrecision | 0     
6 | recall                | BinaryRecall    | 0     
7 | f1_score              | BinaryF1Score   | 0     
8 | multilayer_perceptron | Sequential      | 1.6 M 
----------------------------------------------------------
136 M     Trainable params
0         Non-trainable params
136 M     Total params
546.062   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]