#### Set Up

In [None]:
import json
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.metrics import precision_recall_curve, auc

In [None]:
# Display name of each scoring method -> path of the JSON file with its scores.
scores_dict = {
    "GPT 4o Mini": "data/scores_gpt4o_mini.json",
    "Qwen3 4B Instruct": "data/scores_qwen3_4b_instruct.json",
    "BERTScore": "data/scores_bertscore.json",
}

def load_scores(scores_dict):
    """Load every score file listed in ``scores_dict``.

    Args:
        scores_dict: Mapping from a method's display name to the path of the
            JSON file that holds that method's scores.

    Returns:
        A dict with the same keys as ``scores_dict``; each value is the parsed
        content of the corresponding JSON file.

    Raises:
        OSError: If a score file cannot be opened.
        json.JSONDecodeError: If a score file does not contain valid JSON.
    """
    scores = {}

    for model, path in scores_dict.items():
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default text encoding when reading it.
        with open(path, encoding="utf-8") as f:
            scores[model] = json.load(f)
    return scores

# Parsed score files for every method, keyed by the method's display name.
scores = load_scores(scores_dict)

In [None]:
# Load the annotated evaluation dataset. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of being left to the platform default.
with open("data/dataset.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

print(f"The length of the dataset: {len(dataset)}")
print("The keys of each sample:", list(dataset[0].keys()))

In [None]:
# Passage identifiers as strings. The score files are read with ``json.load``,
# which always produces string object keys, so the label dicts below are keyed
# by these strings to make the per-passage lookups against the scores match.
indices = [str(sample['wiki_bio_test_idx']) for sample in dataset]
# Numeric severity assigned to each sentence-level annotation.
labels_mapping = {
    'accurate': 0.0,
    'minor_inaccurate': 0.5,
    'major_inaccurate': 1.0,
}
# Binary sentence labels per passage (1 marks the positive class), keyed by
# passage id:
#   detect_true       -> the sentence is accurate
#   detect_false      -> the sentence is minor or major inaccurate
#   detect_hard_false -> the sentence is major inaccurate; passages whose mean
#                        severity is >= 0.99 (essentially every sentence major
#                        inaccurate) are left out of this dict
human_labels_detect_true = {}
human_labels_detect_false = {}
human_labels_detect_hard_false = {}
# Mean severity of each passage, in dataset order.
human_labels_passage_mean = []

for idx, sample in zip(indices, dataset):
    raw_label = np.array([labels_mapping[x] for x in sample['annotation']])

    # Severities are 0.0 / 0.5 / 1.0, so the 0.499 and 0.99 cut-offs separate
    # the three annotation classes without exact float comparisons.
    human_labels_detect_true[idx] = (raw_label < 0.499).astype(np.int32).tolist()
    human_labels_detect_false[idx] = (raw_label > 0.499).astype(np.int32).tolist()
    if np.mean(raw_label) < 0.99:
        human_labels_detect_hard_false[idx] = (raw_label > 0.99).astype(np.int32).tolist()
    human_labels_passage_mean.append(np.mean(raw_label))

print("Length of true human labels:", len(human_labels_detect_true))
print("Length of false human labels:", len(human_labels_detect_false))
print("Length of hard false human labels", len(human_labels_detect_hard_false))
print("Length of passage mean human labels:", len(human_labels_passage_mean))

#### Experiments

In [None]:
def get_random_baseline(human_label):
    """Return the mean of all label values pooled across passages.

    Args:
        human_label: Mapping whose values are iterables of numeric labels.

    Returns:
        The mean over every label of every passage. For 0/1 labels this is
        the share of positives.
    """
    pooled = [
        flag
        for passage_flags in human_label.values()
        for flag in passage_flags
    ]
    return np.mean(pooled)

# Mean label value of each labelling scheme, later reported as the
# "Random Baseline" row of the result table.
random_baseline_true = get_random_baseline(human_labels_detect_true)
random_baseline_false = get_random_baseline(human_labels_detect_false)
random_baseline_false_hard = get_random_baseline(human_labels_detect_hard_false)

for _title, _value in (
    ("Random baseline true:", random_baseline_true),
    ("Random baseline false:", random_baseline_false),
    ("Random baseline hard false:", random_baseline_false_hard),
):
    print(_title, np.round(_value, 3))

In [None]:
def unroll_labels(labels, indices):
    """Concatenate the per-passage sequences of ``labels`` into one flat list.

    Args:
        labels: Mapping from a passage id to an iterable of values.
        indices: Passage ids to visit; their order fixes the output order.

    Returns:
        A single list holding the values of every requested passage.

    Raises:
        KeyError: If an id in ``indices`` is not a key of ``labels``.
    """
    return [value for idx in indices for value in labels[idx]]

def get_pr_with_human_labels(preds, human_labels, pos_label=1, oneminus_pred=False):
    """Compute a precision-recall curve of ``preds`` against ``human_labels``.

    Only the passages present in ``human_labels`` are evaluated; their labels
    and predictions are flattened in the same passage order before being
    handed to ``precision_recall_curve``.

    Args:
        preds: Mapping from passage id to that passage's per-sentence scores.
        human_labels: Mapping from passage id to per-sentence labels.
        pos_label: Label value treated as the positive class.
        oneminus_pred: When true, every score ``s`` is replaced by ``1.0 - s``
            before the curve is computed.

    Returns:
        The ``(precision, recall, thresholds)`` triple produced by
        ``precision_recall_curve``.
    """
    passage_ids = list(human_labels.keys())
    y_true = unroll_labels(human_labels, passage_ids)
    y_score = unroll_labels(preds, passage_ids)

    if oneminus_pred:
        y_score = [1.0 - score for score in y_score]
    # Only the total counts are compared here, not the per-passage lengths.
    assert len(y_score) == len(y_true)

    precision, recall, thresholds = precision_recall_curve(
        y_true, y_score, pos_label=pos_label
    )
    return precision, recall, thresholds

def get_auc(preds, human_labels, oneminus_pred=False):
    """Return the area under the precision-recall curve as a percentage.

    Args:
        preds: Mapping from passage id to that passage's per-sentence scores.
        human_labels: Mapping from passage id to per-sentence binary labels;
            label ``1`` is the positive class.
        oneminus_pred: Forwarded to ``get_pr_with_human_labels``; when true
            the scores are inverted (``1.0 - s``) before the curve is built.

    Returns:
        The PR-AUC multiplied by 100 and rounded to 3 decimals.
    """
    prec, rec, _ = get_pr_with_human_labels(
        preds, human_labels, pos_label=1, oneminus_pred=oneminus_pred
    )
    # ``auc(x, y)`` takes the x coordinates first and requires them to be
    # monotonic. Recall is the x axis of a PR curve and is monotonic, whereas
    # precision generally is not, so recall must be passed first.
    return np.round(auc(rec, prec) * 100, 3)

In [None]:
def get_passage_metrics(preds, human_labels_passage_mean):
    """Correlate per-passage mean scores with per-passage mean human labels.

    Args:
        preds: Mapping from passage id to that passage's per-sentence scores.
        human_labels_passage_mean: Sequence with one mean human label per
            passage.

    Returns:
        A dict with the keys ``"Pearson"`` and ``"Spearman"`` holding the two
        correlation coefficients.
    """
    # NOTE(review): the two sequences are paired purely by position, so this
    # assumes ``preds`` iterates its passages in the same order in which
    # ``human_labels_passage_mean`` was built - confirm against the score files.
    mean_scores = [np.mean(preds[passage_id]) for passage_id in preds]

    pearson_coef = stats.pearsonr(mean_scores, human_labels_passage_mean)[0]
    spearman_coef = stats.spearmanr(mean_scores, human_labels_passage_mean)[0]
    return {"Pearson": pearson_coef, "Spearman": spearman_coef}

def get_result(name, preds, human_labels_passage_mean):
    """Build one result-table row for a scoring method.

    Args:
        name: Display name of the method, used as the first cell of the row.
        preds: Mapping from passage id to that passage's per-sentence scores.
        human_labels_passage_mean: Sequence with one mean human label per
            passage, used for the passage-level correlations.

    Returns:
        A list ``[name, nonfact_auc, hard_nonfact_auc, factual_auc, pearson,
        spearman]``. The AUC values are percentages rounded by ``get_auc``;
        the correlations are rounded to 3 decimals.
    """
    passage_metrics = get_passage_metrics(preds, human_labels_passage_mean)

    return [
        name,
        get_auc(preds, human_labels_detect_false),
        get_auc(preds, human_labels_detect_hard_false),
        # The raw scores are used as-is to detect inaccurate sentences above,
        # i.e. a higher score is treated as "more likely inaccurate". Detecting
        # accurate sentences therefore needs the ranking reversed, otherwise
        # this column measures how well the scores rank the wrong class.
        get_auc(preds, human_labels_detect_true, oneminus_pred=True),
        np.round(passage_metrics['Pearson'], 3),
        np.round(passage_metrics['Spearman'], 3),
    ]

In [None]:
if __name__ == "__main__":
    # Result table: one row for the random baseline, then one row per method.
    result = pd.DataFrame(columns=["Method", "NonFact", "Hard NonFact", "Factual", "Pearson", "Spearman"])

    # The baseline of each AUC column is the mean label value of that
    # column's labelling scheme, expressed as a percentage. The "Factual"
    # column must use the baseline of the accurate-sentence labels, not the
    # one of the inaccurate-sentence labels.
    random_baseline_row = [
        "Random Baseline",
        np.round(random_baseline_false * 100, 3),
        np.round(random_baseline_false_hard * 100, 3),
        np.round(random_baseline_true * 100, 3),
        "",  # no correlation baseline is reported
        "",
    ]
    result.loc[len(result)] = random_baseline_row

    for model_name, preds in scores.items():
        result.loc[len(result)] = get_result(model_name, preds, human_labels_passage_mean)
    print(result)