## Setup

In [None]:
# Dependencies to run the notebook 
# Uncomment below to install - may require additional installations depending on your python version

# python version == 3.10.14
# %pip install tensorflow==2.14.0 keras-self-attention transformers datasets scikit-learn accelerate -U --quiet

In [None]:
from datasets import load_dataset, load_from_disk
from transformers import RobertaTokenizerFast, RobertaModel
import torch

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, GlobalAveragePooling1D, Flatten, SpatialDropout1D, Bidirectional, \
                                    Input, Attention, Lambda, AdditiveAttention
from tensorflow.keras.optimizers.legacy import Adam

import keras, random
from keras_self_attention import SeqSelfAttention

import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score

In [None]:
# HuggingFace identifier of the pretrained model used to produce the embeddings
EMB_MODEL_NAME = 'roberta-base'

# Directory of the fine-tuned learners (best model after hyperparameter tuning)
ENSEMBLE_PATH = './ensemble/models_2'

# Directory in which all *.csv files are stored, and the split files read from it
DATASET_PATH = './data'
TRAIN_FILE_NAME = 'train.csv'
VAL_FILE_NAME = 'dev.csv'

# File receiving the hyperparameter tuning logs
LOG_PATH = 'ensemble_logs.txt'

## Data preparation

In [None]:
# Read the training and validation csv files found under DATASET_PATH as one DatasetDict
data_files = dict(train=TRAIN_FILE_NAME, validation=VAL_FILE_NAME)
dataset = load_dataset(
    "csv",
    data_dir=DATASET_PATH,
    data_files=data_files,
)
dataset

In [None]:
# Tokenizer and embedding model are both loaded from the same checkpoint (EMB_MODEL_NAME)
tokenizer = RobertaTokenizerFast.from_pretrained(EMB_MODEL_NAME)
embedding_model = RobertaModel.from_pretrained(EMB_MODEL_NAME)

In [None]:
def text_preprocessing(samples):
    ''' Make sure the hypothesis of a sample is never None.

        A None hypothesis is replaced by an empty string, any other value is
        kept unchanged. The sample is modified in place and returned.
    '''
    hypothesis = samples['hypothesis']
    samples['hypothesis'] = hypothesis if hypothesis is not None else ""
    return samples

# Clean up the hypothesis of every sample (None -> ""), one sample at a time
dataset = dataset.map(
    text_preprocessing,
    batched=False,
)

In [None]:
def text_tokenization(samples):
    ''' Encode the premise and hypothesis in sentence pair classification format.

        The pair is truncated to 150 tokens and padded to that max length, and
        the tokenizer is asked for PyTorch tensors.
        By using 150 max-length, 99.4% of training samples are left untruncated.
    '''
    encoding_options = dict(
        truncation=True,
        max_length=150,
        padding='max_length',
        return_tensors='pt',
    )
    return tokenizer(samples['premise'], samples['hypothesis'], **encoding_options)

# Tokenize every (premise, hypothesis) pair; the raw text columns are dropped from the result
tokenized_dataset = dataset.map(
    text_tokenization,
    batched=False,
    remove_columns=['premise', 'hypothesis'],
)

# Serve input_ids / attention_mask as PyTorch tensors while still outputting the other columns
tokenized_dataset.set_format(
    'pt',
    columns=['input_ids', 'attention_mask'],
    output_all_columns=True,
)
tokenized_dataset

In [None]:
def get_embeddings(samples):
    ''' Get the embeddings of one tokenized sample from the embedding model.

        Computing the embeddings ahead of time allows to not include the
        embedding model within each model's architecture in the ensemble.

        samples: mapping with 'input_ids' and 'attention_mask' entries that are
                 passed as-is to `embedding_model`.
                 NOTE(review): assumed to carry a leading batch axis of size 1
                 (one sample tokenized with return_tensors='pt') -- confirm.
        Returns the same mapping with an extra 'embeddings' entry holding the
        first element of the model's last hidden state, moved to the CPU.
    '''
    # Inference only: no_grad avoids recording an autograd graph for every
    # sample, which the original built and then discarded with .detach().
    with torch.no_grad():
        outputs = embedding_model(samples['input_ids'], attention_mask=samples['attention_mask'])

    embeddings = outputs.last_hidden_state.detach().cpu()
    # Keep the first (presumably only) element along the leading axis
    samples['embeddings'] = embeddings[0]
    return samples

# Run every tokenized sample through the RoBERTa-Base model; the token columns
# are dropped once the embeddings have been computed
dataset = tokenized_dataset.map(
    get_embeddings,
    batched=False,
    remove_columns=['input_ids', 'attention_mask'],
)

## Ensemble preparation

In [None]:
# Model parameters
sample_size, embedding_dim = 150, 768   # input shape of each learner: (sample_size, embedding_dim)
n_lstm = 128                            # units of the LSTM wrapped by the Bidirectional layer
drop_lstm = 0.2                         # dropout rate applied after the self-attention layer
optimizer = 'adam'
batch_size = 12

# Hyperparameters to tune
ensemble_size = 25                      # number of learners initialised in the ensemble
max_epochs = [1, 2, 3, 5, 8]            # cumulative epoch counts at which the ensemble is saved and evaluated

In [None]:
# Training tensors: embeddings paired with int32 labels, batched and cached in
# memory to avoid loading them at each epoch.
# NOTE(review): [:-1000] leaves the last 1000 training samples out of training;
# the reason is not stated in this notebook -- confirm it is intended.
training_data = tf.data.Dataset.from_tensor_slices((
    dataset['train']['embeddings'][:-1000],
    np.array(dataset['train']['label'][:-1000]).astype(np.int32),
))
training_data = training_data.batch(batch_size=batch_size).cache()

# Validation inputs go through the same batching/caching; labels stay a plain int32 array
valid_data_x = tf.data.Dataset.from_tensor_slices(
    dataset['validation']['embeddings']
)
valid_data_x = valid_data_x.batch(batch_size=batch_size).cache()
valid_data_y = np.array(dataset['validation']['label']).astype(np.int32)

2024-04-23 23:54:59.385729: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14791 MB memory:  -> device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:1d:00.0, compute capability: 7.0


In [None]:
models = []
bootstrap_datasets_idx = []

# Size of the training split, looked up once instead of once per drawn index
n_train = len(dataset['train'])

# Initialise each learner of the ensemble
for _ in range(ensemble_size):

    # Learner initialisation:
    # BiLSTM -> self-attention -> dropout -> average pooling over the sequence -> sigmoid unit
    model = Sequential()
    model.add(Input(shape=(sample_size, embedding_dim), dtype='float32'))
    model.add(Bidirectional(LSTM(units=n_lstm, return_sequences=True)))
    model.add(SeqSelfAttention(attention_activation='sigmoid'))
    model.add(Dropout(drop_lstm))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(1, activation='sigmoid'))
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    models.append(model)

    # Bootstraping: Given the training dataset of size n, we randomly select n samples, with replacement.
    # NOTE(review): these indices span the whole training split and are not consumed by the
    # training cell of this notebook (its bagging line is commented out) -- confirm before
    # re-enabling bagging, since `training_data` leaves out the last 1000 samples.
    dataset_idx = [random.randint(0, n_train - 1) for _ in range(n_train)]
    bootstrap_datasets_idx.append(dataset_idx)


In [None]:
def predict_ensemble(models, test_data, nb_samples, batch_size=None):
    ''' Function to predict the ensemble model's predictions on a test dataset.
        Each model makes its prediction and the ensemble model computes the
        normalized geometric mean over all predictions as its final output.

        models:     learners exposing `predict(data, batch_size=...)` and returning
                    one probability (of the positive class) per sample.
        test_data:  inputs forwarded unchanged to every `predict` call.
        nb_samples: number of samples in `test_data`.
        batch_size: forwarded to `predict`; leave it to None when `test_data`
                    is already batched (e.g. a batched tf.data.Dataset).

        Returns two lists (positive class, negative class) of length len(models):
        entry k holds the normalized probabilities of the ensemble made of the
        first k+1 models, as a 1D array of `nb_samples` values.

        The geometric means are accumulated in log space. Multiplying raw
        probabilities underflows (or hits exact zeros from saturated sigmoids)
        and made the normalization evaluate 0/0 = NaN when both classes
        collapsed to zero.
    '''
    # Smallest positive normal float64: keeps log() finite for outputs that are
    # exactly 0, while staying small enough that such a vote remains decisive.
    floor = np.finfo(np.float64).tiny

    log_sum_C1 = np.zeros(nb_samples)
    log_sum_C2 = np.zeros(nb_samples)
    all_norm_C1, all_norm_C2 = [], []

    # Collect predictions
    for i, model in enumerate(models):
        # Ensure we have a 1D array per model
        preds = np.asarray(model.predict(test_data, batch_size=batch_size)).reshape(-1)

        # Running sums of log-probabilities replace the product over models [0..i]
        log_sum_C1 += np.log(np.maximum(preds.astype(np.float64), floor))
        log_sum_C2 += np.log(np.maximum((1 - preds).astype(np.float64), floor))

        # Log of the geometric mean = mean of the logs over the i+1 models seen so far
        log_geom_C1 = log_sum_C1 / (i + 1)
        log_geom_C2 = log_sum_C2 / (i + 1)

        # Subtract the per-sample maximum before exponentiating: the larger term
        # becomes exactly 1, so the normalization constant Z is never zero.
        shift = np.maximum(log_geom_C1, log_geom_C2)
        geom_mean_C1 = np.exp(log_geom_C1 - shift)
        geom_mean_C2 = np.exp(log_geom_C2 - shift)
        Z = geom_mean_C1 + geom_mean_C2

        # Normalize probabilities
        all_norm_C1.append(geom_mean_C1 / Z)
        all_norm_C2.append(geom_mean_C2 / Z)

    return all_norm_C1, all_norm_C2

## Evaluation preparation

In [None]:
def compute_metrics(predictions, labels):
    ''' Compute the evaluation metrics of some predictions against the expected labels.

        Metrics: F1 (micro/macro/weighted), Precision (macro), Recall (macro),
        ROC AUC (macro), and Accuracy, returned as a dict keyed by metric name.
        We use macro averages as the classes are imbalanced.
        Every metric, ROC AUC included, is computed from `predictions` exactly
        as passed in.
    '''
    metrics = {}
    metrics["F1_micro"] = f1_score(labels, predictions, average='micro')
    metrics["F1_macro"] = f1_score(labels, predictions, average='macro')
    metrics["F1_weighted"] = f1_score(labels, predictions, average='weighted')
    metrics["Precision_macro"] = precision_score(labels, predictions, average='macro')
    metrics["Recall_macro"] = recall_score(labels, predictions, average='macro')
    metrics["ROC_macro"] = roc_auc_score(labels, predictions, average='macro')
    metrics["Accuracy"] = accuracy_score(labels, predictions)
    return metrics

In [None]:
def evaluate_ensemble(ensemble, inputs, labels):
    ''' Evaluate the ensemble model on a validation dataset.

        Returns one metrics dict (see compute_metrics) per entry returned by
        predict_ensemble, in the same order. A sample is assigned class 1
        when its normalized probability is at least 0.5, class 0 otherwise.
    '''
    class1_probabilities, _ = predict_ensemble(ensemble, inputs, nb_samples=len(labels))
    return [
        compute_metrics(np.array([int(p >= 0.5) for p in probabilities]), labels)
        for probabilities in class1_probabilities
    ]

## Hyperparameter tuning & Training

In [None]:
# max_epochs lists cumulative epoch counts: each target must exceed the previous
# one, otherwise the saved checkpoints and logs would be labelled with a wrong count.
if any(target <= previous for previous, target in zip([0] + list(max_epochs), max_epochs)):
    raise ValueError(f'max_epochs must be strictly increasing and positive, got {max_epochs}')

# Number of epochs every learner has been trained for so far
trained_epochs = 0

# Epochs: 1 -> X
for target_epochs in max_epochs:

    # Train all models in ensemble for the additional epochs needed to reach target_epochs.
    # The increment is taken against the epochs already trained, instead of indexing
    # max_epochs[epoch-1], which wrapped around to the last entry on the first round.
    extra_epochs = target_epochs - trained_epochs
    for model_idx, model in enumerate(models):
        # Bagging is disabled: every learner is fitted on the same `training_data`.
        model.fit(
            training_data,
            epochs=extra_epochs
        )
        model.save(f'{ENSEMBLE_PATH}/m{model_idx}_e{target_epochs}.h5')
        print(f'{model_idx}_{target_epochs} Done!')
    trained_epochs = target_epochs

    # Evaluate ensemble with 1 to N models in ensemble
    ensemble_scores = {
        'F1_macro':[], 'F1_weighted':[], 'Precision_macro':[],
        'Recall_macro':[], 'ROC_macro':[], 'Accuracy':[],
    }
    ensemble_score = evaluate_ensemble(
        ensemble = models,
        inputs = valid_data_x,
        labels = valid_data_y,
    )
    for scores in ensemble_score:
        for metric, score in scores.items():
            # setdefault: compute_metrics also reports 'F1_micro', which is not
            # pre-declared above and used to raise a KeyError here.
            ensemble_scores.setdefault(metric, []).append(score)

    # Save score logs (one line per metric, one value per ensemble size)
    with open(LOG_PATH, 'a+') as log_file:
        log_file.write(f'Epoch: {target_epochs}\n')
        for metric, scores in ensemble_scores.items():
            log_file.write(f'{metric} --> {" ".join([str(i) for i in scores])}\n')
        log_file.write('\n\n')


## Best model evaluation

In [None]:
# Best model is 11 learners over 8 epochs; this size was selected manually based on validation accuracy
n_best_learners = 11
final_ensemble = models[:n_best_learners]

In [None]:
# Evaluate final ensemble on validation dataset; the last entry of the returned
# list is the one kept as the final score
ensemble_score = evaluate_ensemble(final_ensemble, valid_data_x, valid_data_y)[-1]

In [None]:
# Print one "<metric>: <score>." line per metric of the final ensemble
for metric in ensemble_score:
    print(f'{metric}: {ensemble_score[metric]}.')