## Setup

In [1]:
# Dependencies to run the notebook 
# Uncomment below to install - may require additional installations depending on your python version

# python version == 3.10.14
# %pip install torch transformers peft datasets scikit-learn wandb accelerate -U --quiet

In [2]:
import os

import torch
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score
from datasets import load_dataset
from peft import PeftModel
from transformers import RobertaTokenizerFast, DataCollatorWithPadding, RobertaForSequenceClassification, Trainer, TrainingArguments

In [3]:
# Pretrained model's name from HuggingFace
# (used to load both the tokenizer and the base classification model)
PT_MODEL_NAME = 'roberta-large'

# Fine-tuned model path (best model)
# (passed to PeftModel.from_pretrained to load the trained LoRA layers)
FT_MODEL_PATH = './LoRA/Final-model'

# Path to dataset in which the test file (xxx.csv) is located
# The file is read as csv; its `premise` and `hypothesis` columns are tokenized
DATASET_PATH = './data'
TEST_FILE_NAME = 'test.csv'

# Path to save the predictions
# (directory in which `lora-predictions.csv` is written)
PREDICTION_PATH = './LoRA'

In [4]:
# Do evaluation on top of predictions?
# When True, the evaluation cell compares the predictions against the `label`
# column of the test split, so the test file must provide that column.
EVALUATE = False

## Data preparation

In [5]:
# Load testing dataset from csv file as a DatasetDict
# The file is registered under a single split named "test"; later cells access it as dataset['test']
data_files = {"test": TEST_FILE_NAME}
dataset = load_dataset("csv", data_dir=DATASET_PATH, data_files=data_files)
dataset

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis'],
        num_rows: 3302
    })
})

In [6]:
# Load model's tokenizer
# It is loaded from the pretrained checkpoint name and is used both by
# text_tokenization and by the padding data collator given to the Trainer.
tokenizer = RobertaTokenizerFast.from_pretrained(PT_MODEL_NAME)

In [7]:
# Load data processing functions
def text_preprocessing(samples):
    ''' Replace missing (None) text fields by empty strings.

        Applied one example at a time (the notebook maps it with `batched=False`),
        so `samples` is a single row with `premise` and `hypothesis` entries.
        Both text columns are normalised: a None value in either one would
        otherwise be handed to the tokenizer, which expects strings.

        Args:
            samples: mapping holding the `premise` and `hypothesis` of one example.

        Returns:
            The same mapping, updated in place, with None texts replaced by "".

        Raises:
            KeyError: if `premise` or `hypothesis` is absent from `samples`.
    '''
    for column in ('premise', 'hypothesis'):
        # Only None is replaced; every other value is left untouched
        if samples[column] is None:
            samples[column] = ""
    return samples

def text_tokenization(samples):
    ''' Encode premises together with their hypotheses as sentence-pair inputs.

        Truncation is enabled. No padding argument is passed here; batches are
        padded later by the DataCollatorWithPadding given to the Trainer.
    '''
    premises = samples['premise']
    hypotheses = samples['hypothesis']
    encoded_pairs = tokenizer(premises, hypotheses, truncation=True)
    return encoded_pairs

In [8]:
# Step 1 - clean the raw texts, one example at a time
dataset = dataset.map(text_preprocessing, batched=False)

# Step 2 - tokenize premise/hypothesis pairs in batches for sentence pair classification;
# the raw text columns are dropped from the tokenized copy
tokenized_dataset = dataset.map(
    text_tokenization,
    batched=True,
    remove_columns=['premise', 'hypothesis'],
)
tokenized_dataset

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3302
    })
})

## Predict with LoRA model

In [None]:
# Choose the device once: GPU when CUDA is available, CPU otherwise
target_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Base pretrained classifier with a 2-label head
pretrained_model = RobertaForSequenceClassification.from_pretrained(
    PT_MODEL_NAME,
    num_labels=2,
    device_map=target_device,
)

# Add the trained LoRA layers stored at FT_MODEL_PATH on top of the base model
model = PeftModel.from_pretrained(
    pretrained_model,
    FT_MODEL_PATH,
    device_map=target_device,
)

In [None]:
# Prepare the trained model to predict on test data.
# Arguments are passed explicitly instead of relying on Trainer's defaults:
# report_to='none' keeps experiment-tracking integrations (e.g. wandb, which the
# install cell lists) from being activated, since this cell only runs prediction.
inference_args = TrainingArguments(
    output_dir=PREDICTION_PATH,
    report_to='none',
)
trainer = Trainer(
    model=model,
    args=inference_args,
    data_collator=DataCollatorWithPadding(tokenizer),
)

# Get model predictions on testing data:
# take the logits and keep, per example, the index of the highest one
logits = trainer.predict(tokenized_dataset['test']).predictions
predictions = np.argmax(logits, axis=-1)

In [11]:
# Write predictions to a csv file with one column (prediction)
prediction_file = f'{PREDICTION_PATH}/lora-predictions.csv'

# Create the target directory first, so the write cannot fail (after the costly
# prediction step) only because the directory does not exist yet
os.makedirs(os.path.dirname(prediction_file) or '.', exist_ok=True)

with open(prediction_file, 'w') as f:
    f.write('prediction\n')
    f.writelines(f'{pred}\n' for pred in predictions)

In [19]:
# Peeking at the first five predictions (fewer if the test set is smaller)
for i in range(min(5, len(predictions))):
    sample = dataset['test'][i]  # fetch the row once instead of once per field
    print('Premise:', sample['premise'])
    print('Hypothesis:', sample['hypothesis'])
    print('Prediction:', predictions[i], '\n')

Premise: Boy wearing red hat, blue jacket pushing plow in snow.
Hypothesis: The boy is surrounded by snow
Prediction: 1 

Premise: A blond woman in a black shirt is standing behind a counter.
Hypothesis: The woman is standing.
Prediction: 1 

Premise: Three people in uniform are outdoors and are observing a scene which is out of the picture.
Hypothesis: Uniformed people are outside
Prediction: 1 

Premise: A person, in a striped blue shirt and pants, is running along.
Hypothesis: The person is running
Prediction: 1 

Premise: A man, woman, and child get their picture taken in front of the mountains.
Hypothesis: A family on vacation is posing.
Prediction: 1 



## Evaluation (optional)

In [13]:
def compute_metrics(predictions, labels):
    ''' Score model predictions against the expected labels.

        Returns a dict with, in this order: F1 (micro, macro, weighted),
        macro precision, macro recall, macro ROC AUC and accuracy.
        Macro averages are reported because the classes are imbalanced.

        NOTE(review): roc_auc_score receives `predictions` unchanged; the
        evaluation cell passes argmax class indices rather than scores -
        confirm that this is the intended ROC input.
    '''
    # The three F1 variants differ only in the averaging mode
    scores = {
        f"F1_{averaging}": f1_score(labels, predictions, average=averaging)
        for averaging in ('micro', 'macro', 'weighted')
    }
    scores["Precision_macro"] = precision_score(labels, predictions, average='macro')
    scores["Recall_macro"] = recall_score(labels, predictions, average='macro')
    scores["ROC_macro"] = roc_auc_score(labels, predictions, average='macro')
    scores["Accuracy"] = accuracy_score(labels, predictions)
    return scores

In [14]:
# Evaluate model predictions against references, if EVALUATE is True
# The references are read from the `label` column of the tokenized test split
if EVALUATE:
    evaluation_results = compute_metrics(predictions, tokenized_dataset['test']['label'])
    print('Evaluation metrics:\n', evaluation_results)