#### Set Up

In [1]:
from utils import load_data_json

# Load the per-sample cosine-similarity outputs for each dataset. All three
# files live in the same directory and differ only by file stem.
_COSINE_SIMILARITY_DIR = '../data/output/interrogate_llm_zeroshot/cosine_similarity'

books, halu_eval_long_answer, halu_eval_long_answer_knowledge = (
    load_data_json(f'{_COSINE_SIMILARITY_DIR}/{stem}.json')
    for stem in ('books', 'halu_eval_long_answer', 'halu_eval_long_answer_knowledge')
)

In [2]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np

# Fraction of each dataset held out for reporting metrics; the remainder is
# used to pick the decision threshold.
TEST_SIZE = 0.3
# Fixed seed so the train/test split is reproducible across runs.
SEED = 42

def get_evaluation_metrics(data, score_key='cosine_similarity', label_key='is_hallucinated',
                           thresholds=None, test_size=None, seed=None):
    """Evaluate a score-threshold classifier on one dataset.

    The data is split (stratified on the label) into a train and a test part.
    The threshold that maximises F1 on the train part is selected, and the
    resulting predictions ``score >= threshold`` are scored on the test part.

    Args:
        data: Iterable of mappings, each providing ``score_key`` and
            ``label_key`` entries.
        score_key: Key of the numeric score in each sample.
        label_key: Key of the ground-truth label in each sample.
        thresholds: Candidate thresholds to search. Defaults to 101 evenly
            spaced values in [0, 1].
        test_size: Fraction held out for testing. Defaults to the module-level
            ``TEST_SIZE``.
        seed: Random state for the split. Defaults to the module-level ``SEED``.

    Returns:
        dict mapping metric names to values. All metrics are computed on the
        test split except the AUC, which uses the entire dataset (it does not
        depend on the chosen threshold).

    Raises:
        ValueError: Propagated from scikit-learn when the data cannot be split
            or scored (e.g. empty input or a class too small to stratify).
    """
    scores = np.array([sample[score_key] for sample in data])
    y_true = np.array([sample[label_key] for sample in data])

    # Resolve defaults at call time so edits to the notebook constants take
    # effect without redefining this function.
    if test_size is None:
        test_size = TEST_SIZE
    if seed is None:
        seed = SEED
    if thresholds is None:
        # NOTE(review): cosine similarity can be negative; this grid only
        # covers [0, 1]. Pass a wider grid if scores may fall below 0.
        thresholds = np.linspace(0, 1, 101)
    else:
        thresholds = np.asarray(thresholds, dtype=float)

    scores_train, scores_test, y_train, y_test = train_test_split(
        scores, y_true, test_size=test_size, stratify=y_true, random_state=seed)

    # NOTE(review): predictions and AUC both treat a HIGHER score as the
    # positive (hallucinated) class - confirm this matches how the score was
    # produced; if low similarity signals hallucination the direction is inverted.
    #
    # zero_division=0 yields the same 0.0 value as the default but without an
    # UndefinedMetricWarning for thresholds that predict no positives.
    f1_scores = [
        f1_score(y_train, (scores_train >= t).astype(int), zero_division=0)
        for t in thresholds
    ]

    # np.argmax returns the first maximum, i.e. the lowest best threshold.
    best_f1_threshold = thresholds[np.argmax(f1_scores)]
    y_pred_test = (scores_test >= best_f1_threshold).astype(int)
    return {
        # Plain float keeps the result JSON-serialisable regardless of dtype.
        'F1 best threshold': float(best_f1_threshold),
        'F1 Score': f1_score(y_test, y_pred_test, zero_division=0),
        'Accuracy': accuracy_score(y_test, y_pred_test),
        'Precision': precision_score(y_test, y_pred_test, zero_division=0),
        'Recall': recall_score(y_test, y_pred_test, zero_division=0),
        'AUC (on the entire dataset)': roc_auc_score(y_true, scores),
        'Balanced Accuracy': balanced_accuracy_score(y_test, y_pred_test)
    }


In [3]:
import json

# Map each output name to its loaded samples; insertion order determines the
# key order in the written JSON file.
datasets = {
    'books': books,
    'halu_eval_long_answer': halu_eval_long_answer,
    'halu_eval_long_answer_knowledge': halu_eval_long_answer_knowledge,
}

# Compute the metric dictionary for every dataset.
res = {
    dataset_name: get_evaluation_metrics(samples)
    for dataset_name, samples in datasets.items()
}

# Persist all results in a single JSON file.
with open('../data/output/interrogate_llm_zeroshot/evaluations.json', 'w', encoding='utf-8') as f:
    json.dump(res, f, indent=4)