#### Load Data

In [None]:
import json

# Maps each scoring method to the JSON file that holds its scores.
# Two keys are groups (name -> path) rather than a single path, and
# load_scores handles them specially:
#   - "scores_ngram": one file per n-gram order; kept together as a nested group
#   - "scores_prompt": one file per entry; each entry is loaded under its own name
# Every other key maps directly to one file path.
scores_dict = {
    "scores_ngram": {
        "1-gram": "data/scores_ngram/scores_1gram.json",
        "2-gram": "data/scores_ngram/scores_2gram.json",
        "3-gram": "data/scores_ngram/scores_3gram.json",
        "4-gram": "data/scores_ngram/scores_4gram.json",
        "5-gram": "data/scores_ngram/scores_5gram.json",
    },
    "BERTScore": "data/scores_bertscore.json",
    "NLI": "data/scores_nli.json",
    "scores_prompt": {
        "Llama": "data/scores_prompt/scores_llama.json",
        "Solar Pro": "data/scores_prompt/scores_solar_pro.json"
    },
}

def load_scores(scores_dict):
    """Load every score file referenced by ``scores_dict`` into one dictionary.

    Args:
        scores_dict: Mapping from method name to a JSON file path. The keys
            ``"scores_ngram"`` and ``"scores_prompt"`` map to a nested
            ``{name: path}`` dictionary instead of a single path.

    Returns:
        A dictionary holding the parsed JSON of each plain key and of each
        ``"scores_prompt"`` sub-key (flattened to the top level), plus a
        ``"scores_ngram"`` entry of the form ``{ngram_name: parsed JSON}``.
        The ``"scores_ngram"`` entry is always present and is inserted last.
    """
    def read_json(path):
        # JSON is UTF-8 by specification; do not rely on the platform's
        # locale-dependent default encoding.
        with open(path, encoding="utf-8") as file:
            return json.load(file)

    ngram_scores = {}
    raw_scores = {}

    for key, target in scores_dict.items():
        if key == "scores_ngram":
            for ngram_key, path in target.items():
                ngram_scores[ngram_key] = read_json(path)
        elif key == "scores_prompt":
            for prompt_key, path in target.items():
                raw_scores[prompt_key] = read_json(path)
        else:
            raw_scores[key] = read_json(target)
    # Inserted after the loop so the n-gram group follows every other method
    # when the result is iterated.
    raw_scores['scores_ngram'] = ngram_scores
    return raw_scores
    
# Read all score files from disk once; the next cell normalises raw_scores.
raw_scores = load_scores(scores_dict) 

In [None]:
import math

def map_ngram_scores(value):
    """Squash n-gram score(s) into the range [0, 1] with the logistic sigmoid.

    Args:
        value: A single number, or a list of numbers.

    Returns:
        ``sigmoid(value)`` as a float, or a list of floats when ``value`` is a
        list. ``+inf`` maps to 1.0 and ``-inf`` maps to 0.0.
    """
    def sigmoid(x):
        # Only ever call exp() on a non-positive argument: it can then at worst
        # underflow to 0.0. The naive 1 / (1 + exp(-x)) raises OverflowError
        # for finite x below roughly -709.
        if x >= 0:
            return 1 / (1 + math.exp(-x))
        exp_x = math.exp(x)
        return exp_x / (1 + exp_x)

    if isinstance(value, list):
        return [sigmoid(x) for x in value]
    return sigmoid(value)

def process_ngram_scores(ngram_name, scores):
    """Split one n-gram model's raw scores into four sigmoid-mapped tables.

    Args:
        ngram_name: Display prefix for the returned table names, e.g. "1-gram".
        scores: Mapping from passage index (int-convertible key) to an entry
            with ``'sent_level'`` and ``'doc_level'`` sub-dictionaries.

    Returns:
        A dictionary with the keys ``ngram_name + " Sent Avg"``,
        ``" Sent Max"``, ``" Doc Avg"`` and ``" Doc Avg Max"`` (in that order),
        each mapping ``int(passage index)`` to the value produced by
        ``map_ngram_scores`` for the corresponding metric.
    """
    # (table-name suffix, granularity key, metric key); the order here fixes
    # the key order of the returned dictionary.
    layout = (
        (" Sent Avg", 'sent_level', 'avg_neg_logprob'),
        (" Sent Max", 'sent_level', 'max_neg_logprob'),
        (" Doc Avg", 'doc_level', 'avg_neg_logprob'),
        (" Doc Avg Max", 'doc_level', 'avg_max_neg_logprob'),
    )
    tables = {ngram_name + suffix: {} for suffix, _, _ in layout}

    for raw_idx in scores:
        entry = scores[raw_idx]
        by_level = {
            'sent_level': entry['sent_level'],
            'doc_level': entry['doc_level'],
        }
        passage_idx = int(raw_idx)
        for suffix, level, metric in layout:
            tables[ngram_name + suffix][passage_idx] = map_ngram_scores(by_level[level][metric])
    return tables
    
def process_scores(scores):
    """Return a new dictionary with the keys of ``scores`` converted to ``int``.

    Values are passed through unchanged (the same objects, not copies).
    """
    return {int(raw_idx): scores[raw_idx] for raw_idx in scores}
        
# Normalise every raw score table so it is keyed by integer passage index.
# The n-gram group is expanded into one set of sigmoid-mapped tables per
# n-gram order; all other methods only get their keys converted.
processed_scores = {}

for key, method_scores in raw_scores.items():
    if key != "scores_ngram":
        processed_scores[key] = process_scores(method_scores)
        continue

    processed_scores[key] = {
        ngram_key: process_ngram_scores(ngram_key, ngram_scores)
        for ngram_key, ngram_scores in method_scores.items()
    }

#### Load WikiBio Dataset

In [3]:
import json

# Read the annotated dataset. The encoding is given explicitly (JSON is UTF-8)
# so parsing does not depend on the platform's locale default.
with open("data/dataset_v3.json", "r", encoding="utf-8") as f:
    content = f.read()

dataset = json.loads(content)
print("The length of the dataset: {}".format(len(dataset)))

The length of the dataset: 238


In [4]:
import numpy as np

# Numeric severity assigned to each sentence-level annotation.
label_mapping = {
    'accurate': 0.0,
    'minor_inaccurate': 0.5,
    'major_inaccurate': 1.0,
}

# Passage indices in dataset order; used later to align scores with labels.
indices = [x['wiki_bio_test_idx'] for x in dataset]
human_label_detect_False   = {}
human_label_detect_True    = {}
human_label_detect_False_h = {}
human_label_raw = {}

for passage in dataset:
    idx = passage["wiki_bio_test_idx"]
    raw_label = np.array([label_mapping[x] for x in passage['annotation']])

    human_label_raw[idx] = raw_label
    # 1 where the sentence is minor or major inaccurate (label 0.5 or 1.0).
    human_label_detect_False[idx] = (raw_label > 0.499).astype(np.int32).tolist()
    # 1 where the sentence is accurate (label 0.0).
    human_label_detect_True[idx] = (raw_label < 0.499).astype(np.int32).tolist()
    # 1 only where the sentence is major inaccurate. Passages whose mean label
    # is >= 0.99 (in practice: every sentence major inaccurate) are left out.
    if np.mean(raw_label) < 0.99:
        human_label_detect_False_h[idx] = (raw_label > 0.99).astype(np.int32).tolist()

print("Length of False:", len(human_label_detect_False))
print("Length of True:", len(human_label_detect_True))
print("Length of False_h:", len(human_label_detect_False_h))

# Mean label per passage, in the same order as `indices`. Built with a
# comprehension so no loop variable (previously named `id`) is left behind to
# shadow the builtin in the notebook's global namespace.
human_label_passage_avg = [np.mean(human_label_raw[passage_idx]) for passage_idx in indices]

Length of False: 238
Length of True: 238
Length of False_h: 206


#### Experiments

In [5]:
# Flatten the per-passage label lists into one list per label set.
arr_false = [label for labels in human_label_detect_False.values() for label in labels]
arr_false_h = [label for labels in human_label_detect_False_h.values() for label in labels]
arr_true = [label for labels in human_label_detect_True.values() for label in labels]

# The fraction of positive labels in each set serves as the random baseline.
random_baseline_false = np.mean(arr_false)
random_baseline_false_h = np.mean(arr_false_h)
random_baseline_true = np.mean(arr_true)

for caption, baseline in (
    ("Random baseline false:", random_baseline_false),
    ("Random baseline false h:", random_baseline_false_h),
    ("Random baseline true:", random_baseline_true),
):
    print(caption, np.round(baseline, 2))

Random baseline false: 0.73
Random baseline false h: 0.3
Random baseline true: 0.27


In [6]:
from sklearn.metrics import precision_recall_curve, auc

def unroll_pred(scores, indices):
    """Concatenate ``scores[idx]`` for every ``idx`` in ``indices`` into one flat list.

    The order follows ``indices``; within each entry the original order is kept.
    """
    return [value for idx in indices for value in scores[idx]]

def get_PR_with_human_labels(preds, human_labels, pos_label=1, oneminus_pred=False):
    """Compute a precision-recall curve of predictions against human labels.

    Only the passages present in ``human_labels`` are evaluated; predictions
    and labels are flattened in that key order.

    Args:
        preds: Mapping from passage index to a sequence of scores.
        human_labels: Mapping from passage index to a sequence of labels.
        pos_label: Label treated as the positive class.
        oneminus_pred: If true, every score ``s`` is replaced by ``1.0 - s``
            before the curve is computed.

    Returns:
        The ``(precision, recall, thresholds)`` triple from
        ``precision_recall_curve``.
    """
    passage_ids = list(human_labels.keys())

    flat_preds = unroll_pred(preds, passage_ids)
    if oneminus_pred:
        flat_preds = [1.0 - score for score in flat_preds]

    flat_labels = unroll_pred(human_labels, passage_ids)
    # Every passage must contribute as many scores as labels.
    assert len(flat_preds) == len(flat_labels)

    precision, recall, thresholds = precision_recall_curve(
        flat_labels, flat_preds, pos_label=pos_label
    )
    return precision, recall, thresholds

def get_AUC(p, r):
    """Return the area under the curve with ``r`` as x and ``p`` as y, times 100."""
    area = auc(r, p)
    return area * 100

In [7]:
import pandas as pd
import numpy as np
from scipy import stats

def get_accuracy_score(scores, labels, oneminus_pred=False):
    """Return the area under the precision-recall curve, in percent, rounded to 2 decimals.

    Despite the name, this is not a classification accuracy: it is the value
    of ``get_AUC`` for the curve from ``get_PR_with_human_labels``, with label
    1 as the positive class.
    """
    precision, recall, _ = get_PR_with_human_labels(
        scores, labels, pos_label=1, oneminus_pred=oneminus_pred
    )
    auc_pr = get_AUC(precision, recall)
    return np.round(auc_pr, 2)

def get_passage_scores(scores):
    """Correlate per-passage mean scores with the per-passage mean human label.

    Reads the module-level ``indices`` and ``human_label_passage_avg``.

    Args:
        scores: Mapping from passage index to that passage's score(s).

    Returns:
        ``(pearson, spearman)``: the two correlation coefficients, each
        multiplied by 100 and rounded to 2 decimals.
    """
    passage_means = [np.mean(scores[passage_idx]) for passage_idx in indices]

    pearson_result = stats.pearsonr(passage_means, human_label_passage_avg)
    spearman_result = stats.spearmanr(passage_means, human_label_passage_avg)

    # Element 0 of each result is the correlation coefficient.
    pearson_pct = np.round(pearson_result[0] * 100, 2)
    spearman_pct = np.round(spearman_result[0] * 100, 2)
    return pearson_pct, spearman_pct

def get_baseline_row(name, scores):
    """Build one results-table row for a scoring method.

    Args:
        name: Method name shown in the "Method" column.
        scores: Mapping from passage index to that passage's scores.

    Returns:
        A list ``[Method, NonFact, NonFact-H, Factual, Pearson, Spearman]``.
        The "Factual" value is computed on ``1 - score``.
    """
    pearson, spearman = get_passage_scores(scores)

    nonfact = get_accuracy_score(scores, human_label_detect_False)
    nonfact_h = get_accuracy_score(scores, human_label_detect_False_h)
    # The "True" label set marks accurate sentences as positive, so the scores
    # are flipped before evaluation.
    factual = get_accuracy_score(scores, human_label_detect_True, oneminus_pred=True)

    row = {
        "Method": name,
        "NonFact": nonfact,
        "NonFact-H": nonfact_h,
        "Factual": factual,
        "Pearson": pearson,
        "Spearman": spearman
    }
    return list(row.values())

# Column order must match the list returned by get_baseline_row.
result_columns = ["Method", "NonFact", "NonFact-H", "Factual", "Pearson", "Spearman"]

# NOTE(review): the "NULL" strings make the Pearson/Spearman columns
# object-dtype; consider np.nan if nothing downstream relies on the string.
random_baseline = {
        "Method": "Random Baseline",
        "NonFact": np.round(random_baseline_false * 100, 2),
        "NonFact-H": np.round(random_baseline_false_h * 100, 2),
        "Factual": np.round(random_baseline_true * 100, 2),
        "Pearson": "NULL",
        "Spearman": "NULL",
    }

# Collect all rows first and build the frame once, instead of growing it
# row by row with df.loc[len(df)].
result_rows = [list(random_baseline.values())]

for key in processed_scores:
    if key == "scores_ngram":
        for ngram_key in processed_scores[key]:
            ngram_tables = processed_scores[key][ngram_key]
            # Report only the first two tables of each n-gram order.
            for ngram_category_key in list(ngram_tables)[:2]:
                result_rows.append(
                    get_baseline_row(ngram_category_key, ngram_tables[ngram_category_key])
                )
    else:
        result_rows.append(get_baseline_row(key, processed_scores[key]))

df = pd.DataFrame(result_rows, columns=result_columns)

In [8]:
# Display the results table: one row per method, metric values scaled by 100.
df

Unnamed: 0,Method,NonFact,NonFact-H,Factual,Pearson,Spearman
0,Random Baseline,72.96,29.72,27.04,,
1,BERTScore,81.24,45.51,43.41,57.33,54.54
2,NLI,92.5,45.17,66.08,74.14,73.78
3,Llama 3.2 1B,61.87,23.4,19.61,-40.54,-38.62
4,Solar Pro,93.53,54.94,69.35,77.69,76.42
5,1-gram Sent Avg,81.52,40.33,41.78,39.2,39.82
6,1-gram Sent Max,85.64,41.05,58.44,63.15,66.09
7,2-gram Sent Avg,82.83,44.11,52.67,59.14,62.0
8,2-gram Sent Max,85.04,38.97,58.11,56.53,66.21
9,3-gram Sent Avg,83.27,43.79,53.85,59.64,65.07
