In [1]:
import pickle
from sklearn.metrics import ndcg_score
import numpy as np
from scipy.stats import pearsonr, kendalltau, spearmanr

In [2]:
import json

# Maps each query id to its query text; the first occurrence of a qid wins.
id2query = {}

# Read the JSONL file one record per line. The encoding is pinned so the result
# does not depend on the platform's default locale encoding.
with open('processed_data/merged_trec.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines, which json.loads would reject.
        if not line.strip():
            continue
        data = json.loads(line)
        qid, qtext = data['qid'], data['query_text']
        # Keep the text from the first record seen for this qid.
        id2query.setdefault(qid, qtext)

print(len(id2query))

217


In [3]:
def WIG(qtokens, score_list, k):
    """Compute two WIG-style predictors from a ranked score list.

    Args:
        qtokens: Sequence of query tokens; only its length is used.
        score_list: Sequence of scores; the first ``k`` entries are treated as
            the top of the ranking, so callers should pass it sorted.
        k: Number of leading scores to average.

    Returns:
        Tuple ``(wig_norm, wig_no_norm)``: the mean of the first ``k`` scores
        with and without the mean of the whole list subtracted, each divided
        by ``sqrt(len(qtokens))``.
    """
    # Both variants share the same query-length scaling and the same top-k mean.
    length_scale = np.sqrt(len(qtokens))
    top_k_mean = np.mean(score_list[:k])

    # The mean over the full list stands in for the corpus-level score.
    corpus_score = np.mean(score_list)

    wig_norm = (top_k_mean - corpus_score) / length_scale
    wig_no_norm = top_k_mean / length_scale
    return wig_norm, wig_no_norm


def NQC(score_list, k):
    """Compute two NQC-style predictors from a ranked score list.

    Args:
        score_list: Sequence of scores; the first ``k`` entries are treated as
            the top of the ranking, so callers should pass it sorted.
        k: Number of leading scores whose spread is measured.

    Returns:
        Tuple ``(nqc_norm, nqc_no_norm)``: the standard deviation (numpy
        default, ddof=0) of the first ``k`` scores, divided by the absolute
        mean of the whole list, and the same standard deviation unnormalized.
        A mean of exactly zero is not guarded against.
    """
    top_k_std = np.std(score_list[:k])

    # Normalize by the magnitude of the mean score: dividing by a signed mean
    # would flip the sign of this dispersion measure whenever the scores are
    # negative on average (e.g. logits or log-probabilities).
    corpus_score = np.abs(np.mean(score_list))

    nqc_norm = top_k_std / corpus_score
    nqc_no_norm = top_k_std

    return nqc_norm, nqc_no_norm

In [19]:
# literal_explanation

# NOTE(review): pickle.load can execute arbitrary code; this assumes the run
# artifact is a trusted, locally produced file.
with open("wandb/run-20240613_135457-un82bbx0/files/test_results_ep4.pkl", 'rb') as fin:
    run = pickle.load(fin)

preds = run['predictions']
labels = run['labels']
qids = run['qids']

# Bucket the first component of every prediction/label under its query id.
dct = {}
for p, l, q in zip(preds, labels, qids):
    per_query = dct.setdefault(q, {"preds": [], "labels": []})
    per_query['preds'].append(p[0])
    per_query['labels'].append(l[0])

# Query ids present in this run.
test_qids = set(dct)

# One entry per query, in dct iteration order.
ndcg10s = []
wig_norms, wig_no_norms = [], []
nqc_norms, nqc_no_norms = [], []

for q, d in dct.items():
    qtokens = id2query[q].split()
    score_list = sorted(d['preds'], reverse=True)

    wig_norm, wig_no_norm = WIG(qtokens, score_list, 10)
    nqc_norm, nqc_no_norm = NQC(score_list, 10)

    # nDCG@10 of the (unsorted) predictions against the labels; ndcg_score
    # expects 2-D inputs, hence the added leading axis.
    ndcg10 = ndcg_score(
        np.asarray(d['labels'])[np.newaxis],
        np.asarray(d['preds'])[np.newaxis],
        k=10,
    )

    ndcg10s.append(ndcg10)
    wig_norms.append(wig_norm)
    wig_no_norms.append(wig_no_norm)
    nqc_norms.append(nqc_norm)
    nqc_no_norms.append(nqc_no_norm)

print(np.mean(ndcg10s))

# One output line per predictor, in the order listed in the tuple.
for metric in (wig_norms, wig_no_norms, nqc_norms, nqc_no_norms):
    pearson_corr, _ = pearsonr(ndcg10s, metric)
    kendall_corr, _ = kendalltau(ndcg10s, metric)
    spearman_corr, _ = spearmanr(ndcg10s, metric)

    print(f"Pearson's: {pearson_corr:.3f}; Kendall's: {kendall_corr:.3f}; Spearman's: {spearman_corr:.3f}")

0.5244204480654114
Pearson's: 0.063; Kendall's: 0.104; Spearman's: 0.139
Pearson's: 0.182; Kendall's: 0.112; Spearman's: 0.168
Pearson's: 0.075; Kendall's: 0.048; Spearman's: 0.077
Pearson's: 0.164; Kendall's: 0.114; Spearman's: 0.161


In [22]:
# FC

# NOTE(review): pickle.load can execute arbitrary code; this assumes the run
# artifact is a trusted, locally produced file.
# Alternative run file left commented out in the original cell:
# with open("wandb/run-20240613_135557-za78loj0/files/test_results_ep8.pkl", 'rb') as fin:
with open("wandb/run-20240613_135557-6z0br05e/files/test_results_ep5.pkl", 'rb') as fin:
    run = pickle.load(fin)

preds = run['predictions']
labels = run['labels']
qids = run['qids']

# Bucket the first component of every prediction/label under its query id.
dct = {}
for p, l, q in zip(preds, labels, qids):
    per_query = dct.setdefault(q, {"preds": [], "labels": []})
    per_query['preds'].append(p[0])
    per_query['labels'].append(l[0])

# One entry per query, in dct iteration order.
ndcg10s = []
wig_norms, wig_no_norms = [], []
nqc_norms, nqc_no_norms = [], []

for q, d in dct.items():
    qtokens = id2query[q].split()
    score_list = sorted(d['preds'], reverse=True)

    wig_norm, wig_no_norm = WIG(qtokens, score_list, 10)
    nqc_norm, nqc_no_norm = NQC(score_list, 10)

    # nDCG@10 of the (unsorted) predictions against the labels; ndcg_score
    # expects 2-D inputs, hence the added leading axis.
    ndcg10 = ndcg_score(
        np.asarray(d['labels'])[np.newaxis],
        np.asarray(d['preds'])[np.newaxis],
        k=10,
    )

    ndcg10s.append(ndcg10)
    wig_norms.append(wig_norm)
    wig_no_norms.append(wig_no_norm)
    nqc_norms.append(nqc_norm)
    nqc_no_norms.append(nqc_no_norm)

print(np.mean(ndcg10s))

# One output line per predictor, in the order listed in the tuple.
for metric in (wig_norms, wig_no_norms, nqc_norms, nqc_no_norms):
    pearson_corr, _ = pearsonr(ndcg10s, metric)
    kendall_corr, _ = kendalltau(ndcg10s, metric)
    spearman_corr, _ = spearmanr(ndcg10s, metric)

    print(f"Pearson's: {pearson_corr:.3f}; Kendall's: {kendall_corr:.3f}; Spearman's: {spearman_corr:.3f}")

0.24488353324209958
Pearson's: -0.001; Kendall's: -0.012; Spearman's: -0.020
Pearson's: 0.123; Kendall's: 0.044; Spearman's: 0.052
Pearson's: -0.385; Kendall's: -0.242; Spearman's: -0.355
Pearson's: -0.334; Kendall's: -0.210; Spearman's: -0.321


In [21]:
# conditional_explanation

# NOTE(review): pickle.load can execute arbitrary code; this assumes the run
# artifact is a trusted, locally produced file.
with open("wandb/run-20240613_135825-vjzfhrm4/files/test_results_ep4.pkl", 'rb') as fin:
    run = pickle.load(fin)

preds = run['predictions']
labels = run['labels']
qids = run['qids']

# Bucket the first component of every prediction/label under its query id.
dct = {}
for p, l, q in zip(preds, labels, qids):
    per_query = dct.setdefault(q, {"preds": [], "labels": []})
    per_query['preds'].append(p[0])
    per_query['labels'].append(l[0])

# One entry per query, in dct iteration order.
ndcg10s = []
wig_norms, wig_no_norms = [], []
nqc_norms, nqc_no_norms = [], []

for q, d in dct.items():
    qtokens = id2query[q].split()
    score_list = sorted(d['preds'], reverse=True)

    wig_norm, wig_no_norm = WIG(qtokens, score_list, 10)
    nqc_norm, nqc_no_norm = NQC(score_list, 10)

    # nDCG@10 of the (unsorted) predictions against the labels; ndcg_score
    # expects 2-D inputs, hence the added leading axis.
    ndcg10 = ndcg_score(
        np.asarray(d['labels'])[np.newaxis],
        np.asarray(d['preds'])[np.newaxis],
        k=10,
    )

    ndcg10s.append(ndcg10)
    wig_norms.append(wig_norm)
    wig_no_norms.append(wig_no_norm)
    nqc_norms.append(nqc_norm)
    nqc_no_norms.append(nqc_no_norm)

print(np.mean(ndcg10s))

# One output line per predictor, in the order listed in the tuple.
for metric in (wig_norms, wig_no_norms, nqc_norms, nqc_no_norms):
    pearson_corr, _ = pearsonr(ndcg10s, metric)
    kendall_corr, _ = kendalltau(ndcg10s, metric)
    spearman_corr, _ = spearmanr(ndcg10s, metric)

    print(f"Pearson's: {pearson_corr:.3f}; Kendall's: {kendall_corr:.3f}; Spearman's: {spearman_corr:.3f}")

0.533821570498212
Pearson's: 0.268; Kendall's: 0.165; Spearman's: 0.230
Pearson's: 0.284; Kendall's: 0.143; Spearman's: 0.235
Pearson's: 0.183; Kendall's: 0.146; Spearman's: 0.209
Pearson's: 0.214; Kendall's: 0.152; Spearman's: 0.227


In [20]:
# NC (no calibration)

# Collect the stored 'bert_score' and 'label' of every record whose qid is in
# test_qids. test_qids, json and id2query must already exist in the kernel
# (they are defined by earlier cells).
dct = {}

# The encoding is pinned so the result does not depend on the platform's
# default locale encoding.
with open('processed_data/merged_trec.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines, which json.loads would reject.
        if not line.strip():
            continue
        data = json.loads(line)
        qid, label, score = data['qid'], data['label'], data['bert_score']
        if qid not in test_qids:
            continue
        per_query = dct.setdefault(qid, {"preds": [], "labels": []})
        per_query['preds'].append(score)
        per_query['labels'].append(label)

# One entry per query, in dct iteration order.
ndcg10s = []
wig_norms, wig_no_norms = [], []
nqc_norms, nqc_no_norms = [], []

for q, d in dct.items():
    qtokens = id2query[q].split()
    score_list = sorted(d['preds'], reverse=True)

    wig_norm, wig_no_norm = WIG(qtokens, score_list, 10)
    nqc_norm, nqc_no_norm = NQC(score_list, 10)

    # nDCG@10 of the (unsorted) scores against the labels; ndcg_score expects
    # 2-D inputs, hence the added leading axis.
    ndcg10s.append(
        ndcg_score(
            np.expand_dims(d['labels'], axis=0),
            np.expand_dims(d['preds'], axis=0),
            k=10
        )
    )

    wig_norms.append(wig_norm)
    wig_no_norms.append(wig_no_norm)
    nqc_norms.append(nqc_norm)
    nqc_no_norms.append(nqc_no_norm)

print(np.mean(ndcg10s))

# One output line per predictor, in the order listed here.
for metric in [wig_norms, wig_no_norms, nqc_norms, nqc_no_norms]:
    pearson_corr, _ = pearsonr(ndcg10s, metric)
    kendall_corr, _ = kendalltau(ndcg10s, metric)
    spearman_corr, _ = spearmanr(ndcg10s, metric)

    print(f"Pearson's: {pearson_corr:.3f}; Kendall's: {kendall_corr:.3f}; Spearman's: {spearman_corr:.3f}")

0.4935985754589764
Pearson's: 0.176; Kendall's: 0.091; Spearman's: 0.144
Pearson's: 0.213; Kendall's: 0.144; Spearman's: 0.213
Pearson's: 0.003; Kendall's: -0.083; Spearman's: -0.113
Pearson's: -0.171; Kendall's: -0.136; Spearman's: -0.197


In [23]:
# PC

# NOTE(review): pickle.load can execute arbitrary code; this assumes the run
# artifact is a trusted, locally produced file.
with open("wandb/run-20240613_165246-8rfhmmpa/files/test_results_ep10.pkl", 'rb') as fin:
    run = pickle.load(fin)

preds = run['predictions']
labels = run['labels']
qids = run['qids']

# Bucket the first component of every prediction/label under its query id.
dct = {}
for p, l, q in zip(preds, labels, qids):
    per_query = dct.setdefault(q, {"preds": [], "labels": []})
    per_query['preds'].append(p[0])
    per_query['labels'].append(l[0])

# One entry per query, in dct iteration order.
ndcg10s = []
wig_norms, wig_no_norms = [], []
nqc_norms, nqc_no_norms = [], []

for q, d in dct.items():
    qtokens = id2query[q].split()
    score_list = sorted(d['preds'], reverse=True)

    wig_norm, wig_no_norm = WIG(qtokens, score_list, 10)
    nqc_norm, nqc_no_norm = NQC(score_list, 10)

    # nDCG@10 of the (unsorted) predictions against the labels; ndcg_score
    # expects 2-D inputs, hence the added leading axis.
    ndcg10 = ndcg_score(
        np.asarray(d['labels'])[np.newaxis],
        np.asarray(d['preds'])[np.newaxis],
        k=10,
    )

    ndcg10s.append(ndcg10)
    wig_norms.append(wig_norm)
    wig_no_norms.append(wig_no_norm)
    nqc_norms.append(nqc_norm)
    nqc_no_norms.append(nqc_no_norm)

print(np.mean(ndcg10s))

# One output line per predictor, in the order listed in the tuple.
for metric in (wig_norms, wig_no_norms, nqc_norms, nqc_no_norms):
    pearson_corr, _ = pearsonr(ndcg10s, metric)
    kendall_corr, _ = kendalltau(ndcg10s, metric)
    spearman_corr, _ = spearmanr(ndcg10s, metric)

    print(f"Pearson's: {pearson_corr:.3f}; Kendall's: {kendall_corr:.3f}; Spearman's: {spearman_corr:.3f}")

0.4935985754589764
Pearson's: 0.198; Kendall's: 0.113; Spearman's: 0.175
Pearson's: 0.226; Kendall's: 0.158; Spearman's: 0.228
Pearson's: -0.173; Kendall's: -0.123; Spearman's: -0.166
Pearson's: -0.206; Kendall's: -0.139; Spearman's: -0.197
