In [1]:
import warnings
warnings.filterwarnings('ignore')
import pyterrier as pt
if not pt.started():
    pt.init()

import os
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', False)

import ir_measures
from ir_measures import * # imports all supported measures, e.g., AP, nDCG, RR, P

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()


In [2]:
# dataset_name = 'msmarco-passage'
# dataset = pt.get_dataset(f'irds:{dataset_name}')

dataset = pt.get_dataset(f'irds:msmarco-passage')
eval_dataset = pt.get_dataset(f'irds:msmarco-passage/dev')
topics = eval_dataset.get_topics()
# qrels = eval_dataset.get_qrels()

import ir_datasets
eval = ir_datasets.load("msmarco-passage/dev")
# topics = pd.DataFrame(eval.queries_iter())
qrels = pd.DataFrame(eval.qrels_iter())


In [3]:
qrels.shape

(59273, 4)

In [4]:
import tqdm
for qid in tqdm.tqdm(topics[:5]['qid']):
    print(type(qid))

100%|██████████| 5/5 [00:00<00:00, 85250.08it/s]

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>





In [5]:
import json

def save(dict, file):
    with open(file,'w') as f:
        json.dump(dict, f)

def load(file):
    with open(file,'r') as f:
        ms_docids = json.load(f)
    return ms_docids

def Gini(v):
    v = np.array(v)
    bins = np.linspace(0., 100., 11)
    total = float(np.sum(v))
    yvals = [0]
    for b in bins[1:]:
        bin_vals = v[v <= np.percentile(v, b)]
        bin_fraction = (np.sum(bin_vals) / total) * 100.0
        yvals.append(bin_fraction)
    # perfect equality area
    pe_area = np.trapz(bins, x=bins)
    # lorenz area
    lorenz_area = np.trapz(yvals, x=bins)
    gini_val = (pe_area - lorenz_area) / float(pe_area)
    return gini_val

In [6]:
import statistics
def calc_stats(modelname,df, threshold, topics):
    scoredF = f'./results/{modelname}_docids_100_threshold_{threshold}.json'
    if not os.path.exists(scoredF):
        init_msmarco_dict = './results/ms_docids.json'
        docids_score = load(init_msmarco_dict)
        for qid in tqdm.tqdm(topics['qid']):
            D = df[df['qid'] == np.int64(qid)]
            for dno in D['docno']:
                rank = D[D['docno'] == dno]['rank'].values[0]
                score = 100 / np.log(rank + 2)  # plus 2 because the ranks start from zero
                docids_score[str(dno)] += score
        save(docids_score, scoredF)

    docids_score = load(scoredF)
    scores_df = pd.DataFrame.from_dict(docids_score, orient="index", columns=["score"])
    scores_df = scores_df[scores_df["score"] > 0]
    scores = scores_df['score'].to_list()

    mean = statistics.mean(scores)
    std_dev = statistics.stdev(scores)
    gini_value = Gini(scores)
    return mean, std_dev, gini_value

In [7]:
# def transform_df(df):
#     df_run = df.rename(columns={'qid':'query_id','docid':'doc_id'})
#     df_run[['query_id','doc_id']] = df_run[['query_id','doc_id']].astype(str)

#     return df_run


In [8]:
import glob
res = pd.DataFrame()
for file in glob.glob('/nfs/resources/cxj/retrievability-bias/colbert/df_colbert_30*.csv'):
    df = pd.concat([res,pd.read_csv(file,index_col=0).reset_index()],ignore_index=True)

In [9]:
df.head()

Unnamed: 0,qid,docid,docno,score,rank
0,100007,96171,137813,18.810621,9
1,100007,169024,242346,18.449924,37
2,100007,169030,242353,18.538689,28
3,100007,205373,294515,17.751354,93
4,100007,249859,358161,18.137604,58


In [10]:
calc_stats('colbert',df,30,topics)

(40.663329624539074, 30.663810236402185, 0.3011090185532994)

In [11]:
df2 = pd.DataFrame()
df2['query_id'] = df['qid'].astype(str)
df2['doc_id'] = df['docno'].astype(str)
df2['score'] = df['score']

In [12]:
ir_measures.calc_aggregate([RR, nDCG@10], qrels, df2)

{RR: 0.04989511915500925, nDCG@10: 0.05673301366116784}

In [None]:
res = pd.DataFrame()
for file in glob.glob('/nfs/resources/cxj/retrievability-bias/colbert/df_bm25_colbert_30_*.csv'):
    df = pd.concat([res,pd.read_csv(file,index_col=0).reset_index()],ignore_index=True)

In [None]:
calc_stats('bm25_colbert',df,30,topics)

In [None]:
df2 = pd.DataFrame()
df2['query_id'] = df['qid'].astype(str)
df2['doc_id'] = df['docno'].astype(str)
df2['score'] = df['score']

In [None]:
ir_measures.calc_aggregate([RR, nDCG@10], qrels, df2)

In [None]:
df = pd.read_csv('/nfs/resources/cxj/retrievability-bias/bm25/df_bm25_monot5_30_.csv', index_col=0).reset_index()

In [None]:
calc_stats('bm25_monot5',df,30,topics)

In [None]:
df2 = pd.DataFrame()
df2['query_id'] = df['qid'].astype(str)
df2['doc_id'] = df['docno'].astype(str)
df2['score'] = df['score']

In [None]:
ir_measures.calc_aggregate([RR, nDCG@10], qrels, df2)

In [None]:
df = pd.read_csv('/nfs/resources/cxj/retrievability-bias/splade/df_splade_90.csv', index_col=0).reset_index()

In [None]:
calc_stats('splade',df,90,topics)

In [None]:
df2 = pd.DataFrame()
df2['query_id'] = df['qid'].astype(str)
df2['doc_id'] = df['docno'].astype(str)
df2['score'] = df['score']

In [None]:
ir_measures.calc_aggregate([RR, nDCG@10], qrels, df2)

In [70]:
df = pd.read_csv('/nfs/resources/cxj/retrievability-bias/bm25/df_bm25_30.csv', index_col=0).reset_index()


In [None]:
df2 = pd.DataFrame()
df2['query_id'] = df['qid'].astype(str)
df2['doc_id'] = df['docno'].astype(str)
df2['score'] = df['score']

In [71]:
df.head()

Unnamed: 0,qid,docid,docno,score,rank
0,1048578,5258326,7471198,61.687367,0
1,1048578,5068140,7187241,51.659057,1
2,1048578,3801998,5365326,51.630145,2
3,1048578,4805741,6802210,47.519237,3
4,1048578,5068138,7187239,44.768203,4


In [74]:
calc_stats('bm25',df,30,topics)

(82.29475302247792, 246.9971036759308, 0.47144859868381284)

In [73]:
ir_measures.calc_aggregate([RR, nDCG@10], qrels, df2)

{nDCG@10: 0.2246598189895539, RR: 0.19501269514963218}

In [75]:
df = pd.read_csv('/nfs/resources/cxj/retrievability-bias/splade/df_splade_30.csv', index_col=0).reset_index()
df2 = pd.DataFrame()
df2['query_id'] = df['qid'].astype(str)
df2['doc_id'] = df['docno'].astype(str)
df2['score'] = df['score']

In [76]:
df.head()

Unnamed: 0,qid,docid,docno,score,rank
0,1048578,5068136,7187234,1346.051053,0
1,1048578,1451614,2078221,1324.122223,1
2,1048578,3801998,5365326,1322.758848,2
3,1048578,3232092,4567130,1308.443114,3
4,1048578,3802000,5365328,1249.605868,4


In [77]:
calc_stats('splade',df,30,topics)

100%|██████████| 101093/101093 [40:08<00:00, 41.97it/s]


(72.07897727079515, 67.02232864644255, 0.39601066004471325)

In [78]:
ir_measures.calc_aggregate([RR, nDCG@10], qrels, df2)

{nDCG@10: 0.41843710445589793, RR: 0.36964461948385996}

In [12]:
df = pd.read_csv('/nfs/resources/cxj/retrievability-bias/splade/df_splade_60.csv', index_col=0).reset_index()


In [13]:
df.head()

Unnamed: 0,qid,docid,docno,score,rank
0,1048578,2915219,7187234,1346.051053,0
1,1048578,837032,2078221,1324.376221,1
2,1048578,2184213,5365326,1322.035154,2
3,1048578,2184215,5365328,1249.605868,3
4,1048578,2915221,7187239,1170.215931,4


In [14]:
calc_stats('splade',df,60,topics)

100%|██████████| 101093/101093 [37:50<00:00, 44.52it/s]


(102.5659422654148, 97.23124851721596, 0.4097076534494958)

In [15]:
df2 = pd.DataFrame()
df2['query_id'] = df['qid'].astype(str)
df2['doc_id'] = df['docno'].astype(str)
df2['score'] = df['score']
ir_measures.calc_aggregate([RR, nDCG@10], qrels, df2)

{nDCG@10: 0.3473603967529894, RR: 0.31029837051269815}

In [None]:
df = pd.read_csv('/nfs/datasets/cxj/retrievability-bias/results_contriever_100.csv').reset_index()

In [45]:
df.shape

(10109300, 8)

In [None]:
groupby = df.groupby('query_id')

In [None]:
groupby.groups

In [None]:
calc_stats('contriever_my',df,100,topics)

In [None]:
df = transform_df(df)
ir_measures.calc_aggregate([RR, nDCG@10], qrels, df)

In [None]:

def calc_rtr_score(df, docid_dict, savename):
    for query_id in pt.tqdm(topics['query_id']):
        D = df[df['qid'] == np.int64(query_id)]
        for dno in D['docno']:
            rank =  D[D['docno'] == dno]['rank'].values[0]
            score = 100/np.log(rank + 2) # plus 2 because the ranks start from zero
            docid_dict[str(dno)] += score
    
    save(docid_dict,savename)   


In [None]:
def G(v):
    v = np.array(v)
    bins = np.linspace(0., 100., 11)
    total = float(np.sum(v))
    yvals = [0]
    for b in bins[1:]:
        bin_vals = v[v <= np.percentile(v, b)]
        bin_fraction = (np.sum(bin_vals) / total) * 100.0
        yvals.append(bin_fraction)
    # perfect equality area
    pe_area = np.trapz(bins, x=bins)
    # lorenz area
    lorenz_area = np.trapz(yvals, x=bins)
    gini_val = (pe_area - lorenz_area) / float(pe_area)
    # return bins, yvals, gini_val
    print('gini:', gini_val)

In [None]:
import statistics

def calc_stats(values):
    # values = list(data.values())

    mean = statistics.mean(values)
    std_dev = statistics.stdev(values) 
    print(f"Mean: {mean}")
    print(f"Standard Deviation: {std_dev}")


In [None]:
def calc_metrics(modelname):
    scoredF = f'./results/{modelname}_docids_100.json'
    if not os.path.exists(scoredF):
        csv = f'/nfs/datasets/cxj/retrievability-bias/results_{modelname}_100.csv'
        df = pd.read_csv(csv)
        init_msmarco_dict = './results/ms_docids.json'
        docids_score = load(init_msmarco_dict)
        for query_id in pt.tqdm(topics['query_id']):
            D = df[df['qid'] == np.int64(query_id)]
            for dno in D['docno']:
                rank =  D[D['docno'] == dno]['rank'].values[0]
                score = 100/np.log(rank + 2) # plus 2 because the ranks start from zero
                docids_score[str(dno)] += score
        save(docids_score, scoredF)
        
    docids_score = load(scoredF)
    scores_df = pd.DataFrame.from_dict(docids_score, orient="index",columns=["score"])
    scores_df = scores_df[scores_df["score"]>0]
    scores = scores_df['score'].to_list()
    calc_stats(scores)
    gini_value = G(scores)


In [None]:
calc_metrics('bm25')

In [None]:
calc_metrics('splade')

In [None]:
calc_metrics('colbert')

In [None]:
contreiver = pd.read_csv('/nfs/datasets/cxj/retrievability-bias/results_contriever_raw.csv',index_col=1)
contreiver.head(2)

In [None]:
contreiver = contreiver.groupby('qid')
# contreiver = contreiver.progress_apply(lambda x: x.sort_values('rank', ascending=True))
contreiver = contreiver.progress_apply(lambda x: x.sort_values('score', ascending=False))
contreiver = contreiver.groupby('qid').head(100)

In [None]:
contreiver.head(200)

In [None]:
contreiver.to_csv('/nfs/datasets/cxj/retrievability-bias/results_contriever_100.csv')

In [None]:
calc_metrics('contriever')

In [None]:
contreiver = transform_df(contreiver)
ir_measures.calc_aggregate([RR, nDCG@10], qrels, contreiver)

In [None]:
calc_metrics('bm25_colbert')

In [None]:
bm25 = pd.read_csv('/nfs/datasets/cxj/retrievability-bias/results_bm25_100.csv')
bm25.head(2)

In [None]:
del bm25['score']
bm25 = transform_df(bm25)
metrics = ir_measures.calc_aggregate([RR, nDCG@10], qrels, bm25)

In [None]:
type(metrics)

In [None]:
monot5 = pd.read_csv('/nfs/datasets/cxj/retrievability-bias/results_bm25_monot5_100.csv')

In [None]:
splade = pd.read_csv('/nfs/datasets/cxj/retrievability-bias/results_splade_100.csv')
del splade['score']
# splade['docid'] = splade['docno']
splade = transform_df(splade)
ir_measures.calc_aggregate([RR, nDCG@10], qrels, splade)

In [None]:
colbert = pd.read_csv('/nfs/datasets/cxj/retrievability-bias/results_colbert_100.csv')
# del colbert['score']
# colbert['docid'] = colbert['docno']
colbert = transform_df(colbert)
ir_measures.calc_aggregate([RR, nDCG@10], qrels, colbert)

In [None]:
bm25_colbert = pd.read_csv('/nfs/datasets/cxj/retrievability-bias/results_bm25_colbert_100.csv')
del bm25_colbert['score']
bm25_colbert['docid'] = bm25_colbert['docno']
bm25_colbert = transform_df(bm25_colbert)
ir_measures.calc_aggregate([RR, nDCG@10], qrels, bm25_colbert)

In [None]:
csv = pd.read_csv('./results/r_result.csv').reset_index()