<a href="https://colab.research.google.com/github/RaduSima/SSL_Project2024/blob/master/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ! pip install datasets
# ! pip install -U accelerate
# ! pip install -U transformers

import itertools

import pandas as pd
import torch
import accelerate
from transformers import BigBirdForSequenceClassification, BigBirdTokenizer, Trainer, TrainingArguments
from datasets import Dataset


import gc
import numpy
import pickle as pkl
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [None]:
class OrdinalRegressionHead(torch.nn.Module):
    """
    Ordinal regression head for classification problems.

    A feed-forward stack (optional hidden ``Linear`` + ``ReLU`` pairs followed
    by a bias-free ``Linear`` with one output) maps every input vector to a
    single score. The score is added to a learnable vector of
    ``num_classes - 1`` biases, one per ordinal threshold, which yields one
    logit per threshold. A sigmoid turns the logits into values in (0, 1).

    Decoding rule: class = sum(b_i > 0.5) + 1

    class 1: 0, 0, 0, 0
    class 2: 1, 0, 0, 0
    class 3: 1, 1, 0, 0
    class 4: 1, 1, 1, 0
    class 5: 1, 1, 1, 1
    and so on ...

    Note: the biases start at zero and nothing in this module forces them to
    stay ordered; any ordering has to emerge from training.
    """

    def __init__(self, in_features, num_classes, intermediate_layers=None):
        """
        Build the feed-forward stack and the threshold biases.

        Parameters
        ----------
        in_features : int
            Size of each input vector (the feature size produced by the
            upstream model).
        num_classes : int
            Number of ordinal classes; ``num_classes - 1`` biases are created.
        intermediate_layers : iterable of int, optional
            Output sizes of the hidden layers, in order. ``None`` (the default)
            means no hidden layers.
        """
        super().__init__()

        hidden_sizes = [] if intermediate_layers is None else intermediate_layers
        # widths[k] is the input size of layer k and widths[k + 1] its output size.
        widths = [in_features, *hidden_sizes]

        modules = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            modules.append(torch.nn.Linear(fan_in, fan_out))
            modules.append(torch.nn.ReLU())
        # The final projection has no bias of its own: the per-threshold
        # biases in ``self.b`` play that role.
        modules.append(torch.nn.Linear(widths[-1], 1, bias=False))

        self.fc = torch.nn.Sequential(*modules)

        self.b = torch.nn.Parameter(torch.zeros(num_classes - 1))
        self.activation = torch.nn.Sigmoid()

    def forward(self, x):
        """
        Compute the threshold logits and their sigmoid.

        Parameters
        ----------
        x : tensor
            The input tensor; its last dimension must be ``in_features``.

        Returns
        -------
        tuple(tensor, tensor)
            The logits and the sigmoid of the logits.
            The logits are what a ``BCEWithLogitsLoss`` expects during training.
        """
        score = self.fc(x)
        # Broadcasting the single score against the bias vector gives one
        # logit per threshold.
        logits = score + self.b
        return logits, self.activation(logits)

class OrdinalRegressionClassifier(torch.nn.Module):
    """
    Trainable classifier that operates on precomputed embeddings.

    Wraps an ``OrdinalRegressionHead`` together with a ``BCEWithLogitsLoss``
    so that ``forward`` returns ``(loss, output)``.
    """

    def __init__(self, embeddings_size, num_classes, intermediate_layers=None) -> None:
        """
        Parameters
        ----------
        embeddings_size : int
            Size of each embedding vector fed to the head.
        num_classes : int
            Number of ordinal classes.
        intermediate_layers : iterable of int, optional
            Hidden layer sizes forwarded to ``OrdinalRegressionHead``.
        """
        super().__init__()
        self.head = OrdinalRegressionHead(embeddings_size, num_classes, intermediate_layers)
        self.loss = torch.nn.BCEWithLogitsLoss()

    def forward(self, embeddings, labels):
        """
        Parameters
        ----------
        embeddings : tensor
            Input embeddings; the last dimension must be ``embeddings_size``.
        labels : tensor
            Ordinal targets with one column per threshold
            (``num_classes - 1`` columns). Any numeric dtype is accepted.

        Returns
        -------
        tuple(tensor, tensor)
            The loss and the sigmoid output of the head.
        """
        logits, output = self.head(embeddings)
        # BCEWithLogitsLoss needs floating-point targets of the same dtype as
        # the logits; labels built from NumPy arrive as float64 (or as
        # integers), so align them explicitly. This is a no-op when the dtype
        # already matches.
        labels = labels.to(dtype=logits.dtype)
        return self.loss(logits, labels), output


In [None]:
class OurBigBirdModel(torch.nn.Module):
    """
    End-to-end model: a BigBird encoder followed by an OrdinalRegressionHead.

    The encoder's ``last_hidden_state`` at sequence position 0 is used as the
    representation of the whole input and is fed to the ordinal head.
    """

    def __init__(self, bert, num_classes=5, intermediate_layers=None):
        """
        Parameters
        ----------
        bert : torch.nn.Module
            Encoder whose output exposes ``last_hidden_state``.
        num_classes : int, optional
            Number of ordinal classes. The default is 5.
        intermediate_layers : iterable of int, optional
            Hidden layer sizes forwarded to ``OrdinalRegressionHead``.
        """
        super().__init__()

        self.bert = bert
        # Take the head's input width from the encoder configuration when it
        # is available; fall back to 768 (the value previously hard-coded
        # here) for encoders that expose no ``config.hidden_size``.
        hidden_size = getattr(getattr(bert, "config", None), "hidden_size", 768)
        self.classifier = OrdinalRegressionHead(
            hidden_size, num_classes, intermediate_layers=intermediate_layers)

        self.loss = torch.nn.BCEWithLogitsLoss()

    def forward(self, input_ids, attention_mask, labels):
        """
        The forward method for OurBigBirdModel class.

        Parameters
        ----------
        input_ids : tensor
            The input tensor, used for bert.
        attention_mask : tensor
            The attention mask tensor, used for bert.
        labels : tensor
            Ordinal targets with one column per threshold
            (``num_classes - 1`` columns). Any numeric dtype is accepted.

        Returns
        -------
        tuple(tensor, tensor)
            The loss and the output of the model.
            The loss is useful in the training phase, as we are using Trainer from HuggingFace.
        """
        # Keep only the hidden state at position 0 of every sequence.
        x = self.bert(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        logits, output = self.classifier(x)
        # BCEWithLogitsLoss needs floating-point targets of the same dtype as
        # the logits; this is a no-op when the dtype already matches.
        labels = labels.to(dtype=logits.dtype)
        return self.loss(logits, labels), output

class EmbeddingBigBirdModel(torch.nn.Module):
    """
    Thin wrapper that turns an encoder into an embedding extractor.

    It holds no parameters of its own; it only selects the hidden state at
    sequence position 0 from the wrapped encoder's output.
    """

    def __init__(self, bert):
        """
        Parameters
        ----------
        bert : callable
            Encoder called as ``bert(input_ids, attention_mask=...)`` whose
            result exposes ``last_hidden_state``.
        """
        super().__init__()
        self.bert = bert

    def forward(self, input_ids, attention_mask):
        """
        The forward method for EmbeddingBigBirdModel class.

        Parameters
        ----------
        input_ids : tensor
            The input tensor, used for bert.
        attention_mask : tensor
            The attention mask tensor, used for bert.

        Returns
        -------
        tensor
            ``last_hidden_state[:, 0, :]`` of the encoder output, i.e. one
            vector per input sequence.
        """
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        hidden_states = encoded.last_hidden_state
        return hidden_states[:, 0, :]

In [None]:
def remove_percentage(df, percent):
    """
    Useful method for development purposes. It removes a fraction of the rows from the dataframe.

    The rows are shuffled with a fixed seed (42) and the first
    ``int(len(df) * percent)`` shuffled rows are dropped, so repeated calls on
    the same dataframe keep the same rows. The global NumPy random state is
    left untouched.

    Parameters
    ----------
    df : dataframe
        The dataframe to remove rows from.
    percent : float
        The fraction of rows to remove, between 0 and 1 (e.g. 0.5 removes half).

    Returns
    -------
    dataframe
        The shuffled dataframe with the rows removed. When ``percent`` is 0
        the input dataframe itself is returned, unshuffled.

    Raises
    ------
    ValueError
        If ``percent`` is not between 0 and 1.
    """
    if not 0 <= percent <= 1:
        raise ValueError(f"percent must be between 0 and 1, got {percent!r}")
    if percent == 0:
        return df
    num_rows_to_remove = int(len(df) * percent)
    # A local random_state gives the same shuffle as seeding the global NumPy
    # generator with 42 would, without the side effect of resetting the
    # random stream for every other piece of code in the process.
    shuffled = df.sample(frac=1, random_state=42)
    return shuffled.iloc[num_rows_to_remove:]


def convert_label_to_one_hot_encodings(labels: list[float], num_classes, max_label=3500):
    """
    Convert raw labels to the cumulative encoding used for ordinal regression.

    Despite the name, the result is not a classic one-hot vector: each label
    becomes ``num_classes - 1`` indicators, where indicator ``j``
    (``j = 1 .. num_classes - 1``) is 1 when ``j / num_classes <= label / max_label``
    and 0 otherwise.

    If the one-hot encoding of the label 3 for multi-class classification is [0, 0, 1, 0, 0],
      then the encoding of label 3 for ordinal regression is [1, 1, 0, 0].

    To convert from multi-class to ordinal regression, we can do the following trick:
        - compute the one-hot encoding of the labels for multi-class classification
        - replace all the 0s with 1s until the first 1 is found (for multi-class classification a 1 is found at the index of the class)
        - remove the first column of the one-hot encoding (the first 1)

    Parameters
    ----------
    labels : list[float]
        The list of labels to convert.
    num_classes : int
        The number of classes in the classification problem.
    max_label : int, optional
        The maximum label in the dataset. This is used to normalize the labels. The default is 3500.

    Returns
    -------
    tensor
        A float64 tensor of shape ``(len(labels), num_classes - 1)`` holding
        the 0/1 indicators.
    """
    # reminder: in ordinal regression, class = sum(output > 0.5) + 1

    # normalize the labels
    normalized = numpy.asarray(labels, dtype=float) / max_label
    # One threshold per boundary between consecutive classes.
    thresholds = numpy.arange(1, num_classes) / num_classes
    # Compare every label with every threshold in a single broadcast
    # operation instead of looping over the rows in Python.
    class_labels = (thresholds[None, :] <= normalized[:, None]).astype(numpy.float64)
    return torch.tensor(class_labels)


def compute_metrics(pred):
    """
    Adapter between the Hugging Face trainer and ``_compute_metrics``.

    Parameters
    ----------
    pred : object
        Prediction wrapper exposing ``predictions`` (the model outputs) and
        ``label_ids`` (the targets).

    Returns
    -------
    dict
        Whatever ``_compute_metrics`` returns for those predictions and targets.
    """
    return _compute_metrics(pred.predictions, pred.label_ids)

def _compute_metrics(preds, labels):
    """
    Score ordinal predictions against ordinal targets.

    Both inputs are decoded to class indices by counting, along the last
    axis, how many entries exceed 0.5 and adding 1.

    Parameters
    ----------
    preds : array
        Model outputs, one row per sample.
    labels : array
        Targets in the same layout as ``preds``.

    Returns
    -------
    dict
        ``accuracy``, macro-averaged ``precision``, ``recall`` and ``f1``
        (undefined cases count as 0), and ``neighborhood_accuracy``: the share
        of samples whose predicted class is at most one class away from the
        target.
    """
    cutoff = 0.5

    def decode(indicators):
        # class = number of indicators above the cutoff, plus one
        return numpy.sum(indicators > cutoff, axis=-1) + 1

    y_true = decode(labels)
    y_pred = decode(preds)

    macro = {"average": "macro", "zero_division": 0}
    # Predictions that miss the target by at most one class.
    within_one = numpy.abs(y_true - y_pred) <= 1

    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, **macro),
        "recall": recall_score(y_true, y_pred, **macro),
        "f1": f1_score(y_true, y_pred, **macro),
        "neighborhood_accuracy": numpy.sum(within_one).item() / float(len(labels))
    }


def get_embedding(model, encoding, batch_size=1):
    """
    Get the embedding from the model.

    The model is moved to the GPU when one is available (CPU otherwise), put
    in evaluation mode, and run without gradient tracking over the encoded
    inputs, ``batch_size`` rows at a time. Progress is printed after every
    batch.

    Parameters
    ----------
    model : torch.nn.Module
        The model to get the embeddings from. It is called as
        ``model(input_ids, attention_mask=attention_mask)`` and must return a
        tensor whose first dimension is the batch dimension.
    encoding : dict
        The encoding of the text; ``encoding["input_ids"]`` and
        ``encoding["attention_mask"]`` are read.
    batch_size : int, optional
        Number of rows sent through the model per forward pass. The default
        is 1, i.e. one row at a time.

    Returns
    -------
    tensor
        The model outputs concatenated along the first dimension, on the
        device the model ran on. For an empty encoding an empty tensor is
        returned.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Keep the complete encodings on the CPU and move only the current batch
    # to the device, so accelerator memory is not spent on inputs that are
    # not being processed yet.
    input_ids = torch.tensor(encoding["input_ids"])
    attention_mask = torch.tensor(encoding["attention_mask"])
    total = len(input_ids)

    model = model.to(device)
    model.eval()
    embeddings = []

    with torch.no_grad():
        for start in range(0, total, batch_size):
            stop = min(start + batch_size, total)
            batch_ids = input_ids[start:stop].to(device)
            batch_mask = attention_mask[start:stop].to(device)
            embeddings.append(model(batch_ids, attention_mask=batch_mask))
            # Release whatever the forward pass left behind before the next batch.
            gc.collect()
            torch.cuda.empty_cache()
            print(f"Status {stop}/{total}", end="\r")
    print("Embeddings done.")
    if not embeddings:
        # torch.cat rejects an empty list; the embedding width is unknown
        # here, so return a plain empty tensor.
        return torch.empty(0, device=device)
    return torch.cat(embeddings)

def save_embedding(embeddings, filename):
    """
    Pickle the embeddings into a file, overwriting any existing content.

    Parameters
    ----------
    embeddings : object
        The embeddings to store (any picklable object).
    filename : str
        Path of the file to write.
    """
    with open(filename, mode='wb') as sink:
        pkl.dump(embeddings, sink)

def load_embedding(filename):
    """
    Read back embeddings that were pickled into a file.

    Parameters
    ----------
    filename : str
        Path of the pickle file to read.

    Returns
    -------
    object
        The unpickled content of the file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only open files that this
    project wrote itself.
    """
    with open(filename, mode='rb') as source:
        return pkl.load(source)

def prepare_dataset(embeddings, labels, num_classes=5, max_label=3500):
    """
    Bundle embeddings and their ordinal targets into a ``Dataset``.

    Parameters
    ----------
    embeddings : tensor
        The embeddings, stored under the ``"embeddings"`` column.
    labels : list[float]
        Raw labels; they are encoded with
        ``convert_label_to_one_hot_encodings`` and stored under ``"labels"``.
    num_classes : int, optional
        Number of ordinal classes used for the encoding. The default is 5.
    max_label : int, optional
        Value used to normalize the raw labels. The default is 3500.

    Returns
    -------
    Dataset
        A dataset with the columns ``"embeddings"`` and ``"labels"``.
    """
    columns = {
        "embeddings": embeddings,
        "labels": convert_label_to_one_hot_encodings(labels, num_classes, max_label=max_label),
    }
    return Dataset.from_dict(columns)


In [None]:
# Data loading cell: read the three CSV splits, optionally subsample them, and
# obtain one embedding per description, either from cached pickle files or by
# running the BigBird encoder.

# Fraction of every split that is dropped (see remove_percentage); 0 keeps everything.
percentage_to_remove = 0.5

# True: read previously saved embeddings from disk. False: compute and save them.
load_embeddings = False

train_data = pd.read_csv('./data/AMT10/AMT10_train.csv')
val_data = pd.read_csv('./data/AMT10/AMT10_validation.csv')
test_data = pd.read_csv('./data/AMT10/AMT10_test.csv')

train_data = remove_percentage(train_data, percentage_to_remove)
val_data = remove_percentage(val_data, percentage_to_remove)
test_data = remove_percentage(test_data, percentage_to_remove)

# 'description' holds the input text and 'rating' the target value.
train_texts, train_labels = train_data['description'].tolist(), train_data['rating'].tolist()
val_texts, val_labels = val_data['description'].tolist(), val_data['rating'].tolist()
test_texts, test_labels = test_data['description'].tolist(), test_data['rating'].tolist()

if load_embeddings:
    # NOTE(review): the cached files hold whatever rows were embedded when they
    # were written; if percentage_to_remove has changed since then they may no
    # longer line up with the labels computed above -- confirm before reuse.
    train_embeddings = load_embedding('./data/AMT10/train_embeddings.pkl')
    train_embeddings = torch.tensor(train_embeddings)
    val_embeddings = load_embedding('./data/AMT10/val_embeddings.pkl')
    val_embeddings = torch.tensor(val_embeddings)
    test_embeddings = load_embedding('./data/AMT10/test_embeddings.pkl')
    test_embeddings = torch.tensor(test_embeddings)
else:
    model_name = "google/bigbird-roberta-base"
    tokenizer = BigBirdTokenizer.from_pretrained(model_name)
    model = BigBirdForSequenceClassification.from_pretrained(model_name)

    # Tokenize each split with truncation and padding enabled.
    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True)
    test_encodings = tokenizer(test_texts, truncation=True, padding=True)

    # Only the encoder part (model.bert) is wrapped; the wrapper returns the
    # hidden state at position 0 of every sequence.
    embedding_model = EmbeddingBigBirdModel(model.bert)
    embedding_model.eval()

    train_embeddings = get_embedding(embedding_model, train_encodings)
    val_embeddings = get_embedding(embedding_model, val_encodings)
    test_embeddings = get_embedding(embedding_model, test_encodings)

    # The device is only reported here; get_embedding selects it on its own.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    # The embeddings are written as NumPy arrays, which is why the loading
    # branch above converts them back with torch.tensor.
    save_embedding(train_embeddings.cpu().numpy(), './data/AMT10/train_embeddings.pkl')
    save_embedding(val_embeddings.cpu().numpy(), './data/AMT10/val_embeddings.pkl')
    save_embedding(test_embeddings.cpu().numpy(), './data/AMT10/test_embeddings.pkl')

def get_model_class_from_name(name):
    """
    Resolve a model class from its name.

    Parameters
    ----------
    name : str
        Either ``"OurBigBirdModel"`` or ``"OrdinalRegressionClassifier"``.

    Returns
    -------
    type or None
        The matching class, or ``None`` when the name is not recognised.
    """
    if name == "OrdinalRegressionClassifier":
        return OrdinalRegressionClassifier
    if name == "OurBigBirdModel":
        return OurBigBirdModel
    return None

In [None]:
# Grid search cell: train one classifier per hyperparameter combination, keep
# the one with the best validation accuracy, report its test metrics and save
# its weights.
import os

param_grid = {
    'learning_rate': [
        1e-5,
        3e-5,
        5e-5
    ],
    'per_device_train_batch_size': [
        8,
        16
    ],
    'num_train_epochs': [
        10,
        15
    ],

    # Model
    'model_class': [
        'OrdinalRegressionClassifier',
    ],
    'num_classes': [
        5,
        10,
        35
    ],
    'intermediate_layers': [
        [256, 128, 64, 32],
        [512, 128, 32],
        [256, 64, 16],
    ]
}

param_combinations = list(itertools.product(*param_grid.values()))

best_eval_metric = float('-inf')
best_params = None
best_trainer = None

# The encoded labels depend only on num_classes, so the (train, val, test)
# datasets are built once per num_classes value and reused across runs.
datasets_by_num_classes = {}

for params in param_combinations:
    params = dict(zip(param_grid.keys(), params))
    learning_rate = params['learning_rate']
    train_batch_size = params['per_device_train_batch_size']
    num_epochs = params['num_train_epochs']
    model_class = get_model_class_from_name(params['model_class'])
    if model_class is None:
        # Fail with a clear message instead of "'NoneType' object is not callable".
        raise ValueError(f"Unknown model class: {params['model_class']!r}")
    num_classes = params['num_classes']
    intermediate_layers = params['intermediate_layers']

    to_train_model = model_class(embeddings_size=768, num_classes=num_classes, intermediate_layers=intermediate_layers)

    if num_classes not in datasets_by_num_classes:
        datasets_by_num_classes[num_classes] = (
            prepare_dataset(train_embeddings, train_labels, num_classes=num_classes),
            prepare_dataset(val_embeddings, val_labels, num_classes=num_classes),
            prepare_dataset(test_embeddings, test_labels, num_classes=num_classes),
        )
    train_dataset, val_dataset, test_dataset = datasets_by_num_classes[num_classes]

    training_args = TrainingArguments(
        learning_rate=learning_rate,
        output_dir="./results",
        num_train_epochs=num_epochs,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=train_batch_size,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
    )

    trainer = Trainer(
        model=to_train_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # Select hyperparameters on the validation split (the trainer's
    # eval_dataset). Selecting on the test split would leak test information
    # into the choice and make the final test score optimistic.
    eval_results = trainer.evaluate()

    eval_metric = eval_results['eval_accuracy']

    if eval_metric > best_eval_metric:
        best_eval_metric = eval_metric
        best_params = params
        best_trainer = trainer

print(f"Best Evaluation Metric: {best_eval_metric}")
print(f"Best Hyperparameters: {best_params}")

# Evaluate the model on the testing set
# The test labels must be encoded with the best run's num_classes; the
# dataset left over from the last loop iteration may use a different width.
test_dataset = datasets_by_num_classes[best_params['num_classes']][2]
eval_results = best_trainer.evaluate(test_dataset)
print(eval_results)

# Make sure the target folder exists so the save cannot fail after the search.
os.makedirs("./models", exist_ok=True)
torch.save(best_trainer.model.state_dict(), "./models/ordinal_regression_model.pth")
