## Setup

In [1]:
# Dependencies to run the notebook 
# Uncomment below to install - may require additional installations depending on your python version

# python version == 3.10.14
# %pip install tensorflow==2.14.0 keras-self-attention transformers datasets scikit-learn accelerate -U --quiet

In [None]:
import keras
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from transformers import RobertaTokenizerFast, RobertaModel
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score
from keras_self_attention import SeqSelfAttention

In [3]:
# Pretrained model's name from HuggingFace used for embeddings
EMB_MODEL_NAME = 'roberta-base'

# Directory holding the saved models of the ensemble.
# Models are loaded from '<ENSEMBLE_PATH>/<index>_F.h5' for index in 0..ENSEMBLE_SIZE-1.
ENSEMBLE_PATH = './ensemble/models'
ENSEMBLE_SIZE = 11
# Batch size used to batch the model inputs
BATCH_SIZE = 12

# Directory containing the test file, and the name of that csv file
DATASET_PATH = './data'
TEST_FILE_NAME = 'test.csv'

# Directory in which the predictions csv is written
PREDICTION_PATH = './ensemble'

In [10]:
# Do evaluation on top of predictions?
# When True, the test file must also provide a 'label' column (it is read to build the labels).
EVALUATE = False

## Data preparation

In [5]:
# Read the test csv found under DATASET_PATH into a DatasetDict with a single "test" split
data_files = dict(test=TEST_FILE_NAME)
raw_dataset = load_dataset(
    "csv",
    data_files=data_files,
    data_dir=DATASET_PATH,
)
raw_dataset

Generating test split: 3302 examples [00:00, 182927.08 examples/s]


DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis'],
        num_rows: 3302
    })
})

In [None]:
# Fetch the pretrained tokenizer and the matching embedding model (both identified by EMB_MODEL_NAME)
tokenizer = RobertaTokenizerFast.from_pretrained(EMB_MODEL_NAME)
embedding_model = RobertaModel.from_pretrained(EMB_MODEL_NAME)

In [7]:
# Data processing functions
def text_preprocessing(samples):
    ''' Make sure the hypothesis is a string: a missing (None) hypothesis
        becomes an empty string. The sample is updated in place and returned.
    '''
    hypothesis = samples['hypothesis']
    samples['hypothesis'] = hypothesis if hypothesis is not None else ""
    return samples

def text_tokenization(samples):
    ''' Encode the premise and the hypothesis together as one sentence-pair input.
        Inputs longer than 150 tokens are truncated, shorter ones are padded up to
        150 tokens, and the result is returned as PyTorch tensors.
        Original note: with a max length of 150, 99.4% of training samples are left untruncated.
    '''
    premise, hypothesis = samples['premise'], samples['hypothesis']
    tokenizer_options = dict(
        truncation=True,
        max_length=150,
        padding='max_length',
        return_tensors='pt',
    )
    return tokenizer(premise, hypothesis, **tokenizer_options)

def get_embeddings(samples):
    ''' Get the embeddings from the embedding model. This allows to not include
        the embedding model within each model's architecture in the ensemble.

        Args:
            samples: a sample providing 'input_ids' and 'attention_mask' tensors.

        Returns:
            The same sample with an added 'embeddings' entry holding the model's
            last hidden state (detached, on CPU).
    '''
    # Local import so the function does not depend on a notebook-level torch import.
    import torch

    # Inference only: disable autograd so no computation graph is built for each sample.
    with torch.no_grad():
        outputs = embedding_model(samples['input_ids'], attention_mask=samples['attention_mask'])

    embeddings = outputs.last_hidden_state.detach().cpu()
    # Index 0 along the first axis is kept.
    # NOTE(review): presumably the size-1 batch axis from tokenizing one sample at a time - confirm.
    samples['embeddings'] = embeddings[0]
    return samples

In [8]:
# Step 1 - clean the texts (missing hypotheses become empty strings)
raw_dataset = raw_dataset.map(text_preprocessing, batched=False)

# Step 2 - tokenize each (premise, hypothesis) pair and drop the raw text columns
tokenized_dataset = raw_dataset.map(
    text_tokenization,
    batched=False,
    remove_columns=['premise', 'hypothesis'],
)
tokenized_dataset.set_format(
    'pt',
    columns=['input_ids', 'attention_mask'],
    output_all_columns=True,
)

# Step 3 - run the tokenized samples through the RoBERTa-Base model and keep only the embeddings
dataset = tokenized_dataset.map(
    get_embeddings,
    batched=False,
    remove_columns=['input_ids', 'attention_mask'],
)

Map: 100%|██████████| 3302/3302 [00:00<00:00, 67822.33 examples/s]
Map: 100%|██████████| 3302/3302 [00:00<00:00, 7176.79 examples/s]
Map: 100%|██████████| 3302/3302 [02:41<00:00, 20.39 examples/s]


In [11]:
# Put the embeddings into a tf.data.Dataset, batch them (BATCH_SIZE) and cache the batches
inputs_unbatched = tf.data.Dataset.from_tensor_slices(dataset['test']['embeddings'])
inputs = inputs_unbatched.batch(batch_size=BATCH_SIZE).cache()

# Collect the labels into an int32 numpy array if doing evaluation
if EVALUATE:
    labels = np.array(dataset['test']['label']).astype(np.int32)

## Predict with ensemble of models

In [None]:
# Load every trained model of the ensemble (files '<index>_F.h5') from the ENSEMBLE_PATH folder
ensemble_models = []
for model_idx in range(ENSEMBLE_SIZE):
    model_path = f'{ENSEMBLE_PATH}/{model_idx}_F.h5'
    member = tf.keras.models.load_model(model_path, custom_objects=SeqSelfAttention.get_custom_objects())
    ensemble_models.append(member)

In [13]:
def predict_ensemble(models, test_data, nb_samples, batch_size=None):
    ''' Predict binary labels for a test dataset with an ensemble of models.

        Each model outputs the probability of class 1 for every sample. The ensemble
        score of a class is the geometric mean of the models' probabilities for that
        class; class 1 is predicted when its normalized score is >= 0.5, which is
        the same as its geometric mean being >= the geometric mean of class 0.

        The geometric means are computed in log-space. Multiplying the raw
        probabilities can underflow to 0 for both classes (e.g. one model outputs
        exactly 0.0 and another exactly 1.0); the normalization constant is then 0,
        the score is NaN and the sample silently falls back to class 0.

        Args:
            models: non-empty sequence of models exposing `predict(data, batch_size=...)`
                and returning one probability per sample.
            test_data: inputs handed unchanged to every model's `predict`.
            nb_samples: number of samples contained in `test_data`.
            batch_size: forwarded to `predict`. Leave it to None when `test_data`
                is already batched (e.g. a batched tf.data.Dataset).

        Returns:
            1D integer numpy array of length `nb_samples` containing 0s and 1s.

        Raises:
            ValueError: if `models` is empty, or if a model does not return exactly
                `nb_samples` values.
    '''
    m = len(models)
    if m == 0:
        raise ValueError('predict_ensemble needs at least one model.')

    # Smallest positive normal float64: floors exact zeros so their log stays finite
    # without changing any probability a model can realistically output.
    tiny = np.finfo(np.float64).tiny

    log_preds_C1 = np.zeros((m, nb_samples))
    log_preds_C2 = np.zeros((m, nb_samples))

    # Collect log-probabilities of both classes for each model
    for i, model in enumerate(models):
        raw = model.predict(test_data, batch_size=batch_size)
        # Flatten to 1D (also valid for a single sample, unlike squeeze which yields a 0-d array)
        preds = np.asarray(raw, dtype=np.float64).reshape(-1)
        if preds.shape[0] != nb_samples:
            raise ValueError(
                f'Model {i} returned {preds.shape[0]} predictions, expected {nb_samples}.'
            )
        log_preds_C1[i] = np.log(np.clip(preds, tiny, 1.0))
        log_preds_C2[i] = np.log(np.clip(1.0 - preds, tiny, 1.0))

    # Log of the geometric mean = arithmetic mean of the logs along the 'model' axis
    log_geom_mean_C1 = log_preds_C1.mean(axis=0)
    log_geom_mean_C2 = log_preds_C2.mean(axis=0)

    # normalized_C1 = g1 / (g1 + g2) >= 0.5  <=>  g1 >= g2  <=>  log(g1) >= log(g2),
    # so the normalization constant never has to be computed (and can never be 0).
    predictions = (log_geom_mean_C1 >= log_geom_mean_C2).astype(int)

    return predictions

In [14]:
# Run the whole ensemble on the batched inputs to obtain one label per test sample
predictions = predict_ensemble(
    models=ensemble_models,
    test_data=inputs,
    nb_samples=len(inputs_unbatched),
)



In [19]:
# Peeking at the first five predictions
for i in range(5):
    sample = raw_dataset['test'][i]
    print('Premise:', sample['premise'])
    print('Hypothesis:', sample['hypothesis'])
    print('Prediction:', predictions[i], '\n')

Premise: Boy wearing red hat, blue jacket pushing plow in snow.
Hypothesis: The boy is surrounded by snow
Prediction: 1 

Premise: A blond woman in a black shirt is standing behind a counter.
Hypothesis: The woman is standing.
Prediction: 1 

Premise: Three people in uniform are outdoors and are observing a scene which is out of the picture.
Hypothesis: Uniformed people are outside
Prediction: 1 

Premise: A person, in a striped blue shirt and pants, is running along.
Hypothesis: The person is running
Prediction: 1 

Premise: A man, woman, and child get their picture taken in front of the mountains.
Hypothesis: A family on vacation is posing.
Prediction: 1 



In [16]:
# Save the predictions as a single-column csv file (header: prediction)
output_file = f'{PREDICTION_PATH}/ensemble-predictions.csv'
with open(output_file, 'w') as f:
    f.write('prediction\n')
    f.writelines(f'{pred}\n' for pred in predictions)

## Evaluation (optional)

In [None]:
def compute_metrics(predictions, labels):
    ''' Score model predictions against the expected labels.

        Returns a dict with, in this order: F1 (micro, macro and weighted averages),
        macro Precision, macro Recall, macro ROC AUC and Accuracy.
        Original note: macro averages are used as the classes are imbalanced.

        NOTE(review): ROC AUC is computed from `predictions` exactly as passed; if
        these are hard 0/1 labels rather than scores, confirm this is the intended metric.
    '''
    metrics = {
        f"F1_{avg}": f1_score(labels, predictions, average=avg)
        for avg in ('micro', 'macro', 'weighted')
    }
    metrics["Precision_macro"] = precision_score(labels, predictions, average='macro')
    metrics["Recall_macro"] = recall_score(labels, predictions, average='macro')
    metrics["ROC_macro"] = roc_auc_score(labels, predictions, average='macro')
    metrics["Accuracy"] = accuracy_score(labels, predictions)
    return metrics

In [None]:
# Evaluate model predictions against references, if EVALUATE is True
if EVALUATE:
    evaluation_results = compute_metrics(predictions, labels)
    print('Evaluation metrics:\n', evaluation_results)

Evaluation metrics:
 {'F1_micro': 0.9619952494061758, 'F1_macro': 0.961986187825817, 'F1_weighted': 0.9620057050758204, 'Precision_macro': 0.9620373504852178, 'Recall_macro': 0.962546245304866, 'ROC_macro': 0.9625462453048659, 'Accuracy': 0.9619952494061758}
