In [None]:
# set WORKDIR to the top of experiment repository
# NOTE: `%cd ..` moves the kernel's working directory one level up from wherever
# it currently is, so run this cell exactly once per kernel session. Re-running
# it climbs another level and breaks the relative paths used in later cells
# (e.g. 'data/...' and 'saved_models/...').
%cd ..
# Uncomment to print the resulting working directory:
# %pwd

In [None]:
import mlflow
from sklearn.metrics import matthews_corrcoef, f1_score
from src.utils import timestamp
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from transformers.pipelines.text_classification import ClassificationFunction
import torch

from src.settings import (
    MLFLOW_EXPERIMENT_NAME,
    )


# Device index handed to the transformers pipeline:
# 0 when CUDA is available, -1 otherwise.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Registry of models that can be evaluated: short name -> model id / local path.
all_models = {
    # pretrained checkpoints
    "all-mpnet-base-v2": "sentence-transformers/all-mpnet-base-v2",
    "stsb-roberta-large": "cross-encoder/stsb-roberta-large",
    "stsb-roberta-base": "cross-encoder/stsb-roberta-base",
    "Legal-BERT": "nlpaueb/legal-bert-base-uncased",
    "EURLEX-BERT": "nlpaueb/bert-base-uncased-eurlex",
    "SciBERT": "allenai/scibert_scivocab_uncased",
    # fine-tuned checkpoints stored under saved_models/
    "stsb-roberta-base_FT": "saved_models/stsb-roberta-base_FT",
    "stsb-roberta-large_FT": "saved_models/stsb-roberta-large_FT",
    "Legal-BERT_FT": "saved_models/legal-bert-base-uncased_FT",
}

# The model evaluated by the cells below.
selected_model = all_models["stsb-roberta-large_FT"]


In [None]:
# Read the cleaned test split from disk.
dataset_path = 'data/test_clean.parquet'
df_test = pd.read_parquet(dataset_path)

# X: (text, text_b) tuples, one per row of the test frame.
sentence_pairs = [
    (first, second)
    for first, second in zip(df_test['text'].tolist(), df_test['text_b'].tolist())
]
# The same pairs as a list of dicts, the input form passed to the
# transformers pipeline further down.
sentence_pairs_lods = [
    {"text": first, "text_pair": second}
    for first, second in sentence_pairs
]

# y_true: labels taken from the 'label' column.
labels_true = df_test['label'].tolist()


In [None]:
# Number of outputs of the classification head. Fixed to 1 (a single score per
# sentence pair) because the cross-encoder checkpoints support only one label.
# A per-model choice would be:
#   1 if selected_model.split('/')[0] in ['cross-encoder'] else 2
num_labels = 1

# load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(selected_model, num_labels=num_labels)
# NOTE(review): the tokenizer is always taken from the base 'stsb-roberta-large'
# checkpoint, not from `selected_model` (presumably the fine-tuned directories
# were saved without tokenizer files -- confirm). If `selected_model` is switched
# to a BERT-based entry, a different tokenizer is most likely required -- verify
# before trusting the metrics.
# tokenizer = AutoTokenizer.from_pretrained(selected_model)
tokenizer = AutoTokenizer.from_pretrained(all_models['stsb-roberta-large'])

# init mlflow experiment (use existing one)
experiment = mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
# get_experiment_by_name returns None for an unknown name; fail here with a
# clear message instead of an AttributeError on `.experiment_id` later on.
if experiment is None:
    raise RuntimeError(
        f"MLflow experiment {MLFLOW_EXPERIMENT_NAME!r} does not exist; "
        "create it first or check MLFLOW_EXPERIMENT_NAME in src.settings."
    )

# run experiment
# The `with` block closes the MLflow run on exit (also when an exception is
# raised), so no explicit mlflow.end_run() call is needed afterwards.
with mlflow.start_run(experiment_id=experiment.experiment_id, log_system_metrics=True) as run:
    # set run name: "Test_<last path component of the model id>_<timestamp>"
    model_name = selected_model.split('/')[-1]
    mlflow.set_tag(key='mlflow.runName', value=f"Test_{model_name}_{timestamp()}")

    # Only ask CUDA for a device name when a GPU is present; querying the
    # current CUDA device on a CPU-only machine raises instead of returning.
    if torch.cuda.is_available():
        device_name = torch.cuda.get_device_name(torch.cuda.current_device())
    else:
        device_name = 'cpu'

    # log parameters
    mlflow.log_params({
        'PyTorch Device': device_name,
        'Model': selected_model,
        'Dataset': dataset_path,
        'AutoModel Parameters': model,
        'Tokenizer': tokenizer,
    })

    # run pipeline for model predictions
    pipe = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        padding=True,
        truncation=True,
        # max_length = 512,
        device=device,
        function_to_apply=ClassificationFunction.SIGMOID,
        top_k=1,  # return only top 1 predicted label with score
    )

    predictions = pipe(sentence_pairs_lods)

    # Binarise the top-1 score of every pair: scores above the threshold are
    # predicted as class 1, everything else as class 0.
    threshold = 0.50
    mlflow.log_metric("Threshold", threshold)
    labels_pred = [0 if x[0]['score'] <= threshold else 1 for x in predictions]

    # NOTE: for a checkpoint loaded with two labels, the predicted class would
    # instead be read from the top-1 label name
    # (0 if x[0]['label'] == 'LABEL_0' else 1) rather than from a score threshold.

    f1_score_value = f1_score(y_true=labels_true, y_pred=labels_pred, pos_label=1, average='binary')
    mlflow.log_metric("F1 Score", f1_score_value)

    matthews_corrcoef_value = matthews_corrcoef(y_true=labels_true, y_pred=labels_pred)
    mlflow.log_metric("Matthews Correlation Coefficient", matthews_corrcoef_value)

    print(f"F1 Score: {f1_score_value}\nMatthews Correlation Coefficient: {matthews_corrcoef_value}")
