In [1]:
import warnings
warnings.filterwarnings('ignore')
import pyterrier as pt
if not pt.started():
    pt.init()

import os
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', False)

import ir_measures
from ir_measures import * # imports all supported measures, e.g., AP, nDCG, RR, P

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()


In [2]:
# dataset_name = 'msmarco-passage'
# dataset = pt.get_dataset(f'irds:{dataset_name}')

# eval_dataset = pt.get_dataset(f'irds:{dataset_name}/dev')
# topics = eval_dataset.get_topics()
# qrels = eval_dataset.get_qrels()

import ir_datasets
eval = ir_datasets.load("msmarco-passage/dev")
topics = pd.DataFrame(eval.queries_iter())
qrels = pd.DataFrame(eval.qrels_iter())


In [3]:
qrels.shape

(59273, 4)

In [None]:
import json

def save(dict, file):
    with open(file,'w') as f:
        json.dump(dict, f)


In [None]:
def load(file):
    with open(file,'r') as f:
        ms_docids = json.load(f)
    return ms_docids

In [None]:

def calc_rtr_score(df, docid_dict, savename):
    for query_id in pt.tqdm(topics['query_id']):
        D = df[df['qid'] == np.int64(query_id)]
        for dno in D['docno']:
            rank =  D[D['docno'] == dno]['rank'].values[0]
            score = 100/np.log(rank + 2) # plus 2 because the ranks start from zero
            docid_dict[str(dno)] += score
    
    save(docid_dict,savename)   


In [4]:
def G(v):
    v = np.array(v)
    bins = np.linspace(0., 100., 11)
    total = float(np.sum(v))
    yvals = [0]
    for b in bins[1:]:
        bin_vals = v[v <= np.percentile(v, b)]
        bin_fraction = (np.sum(bin_vals) / total) * 100.0
        yvals.append(bin_fraction)
    # perfect equality area
    pe_area = np.trapz(bins, x=bins)
    # lorenz area
    lorenz_area = np.trapz(yvals, x=bins)
    gini_val = (pe_area - lorenz_area) / float(pe_area)
    # return bins, yvals, gini_val
    print('gini:', gini_val)

In [None]:
import statistics

def calc_stats(values):
    # values = list(data.values())

    mean = statistics.mean(values)
    std_dev = statistics.stdev(values) 
    print(f"Mean: {mean}")
    print(f"Standard Deviation: {std_dev}")


In [None]:
def calc_metrics(modelname):
    scoredF = f'./results/{modelname}_docids_100.json'
    if not os.path.exists(scoredF):
        csv = f'/nfs/datasets/cxj/retrievability-bias/results_{modelname}_100.csv'
        df = pd.read_csv(csv)
        init_msmarco_dict = './results/ms_docids.json'
        docids_score = load(init_msmarco_dict)
        for query_id in pt.tqdm(topics['query_id']):
            D = df[df['qid'] == np.int64(query_id)]
            for dno in D['docno']:
                rank =  D[D['docno'] == dno]['rank'].values[0]
                score = 100/np.log(rank + 2) # plus 2 because the ranks start from zero
                docids_score[str(dno)] += score
        save(docids_score, scoredF)
        
    docids_score = load(scoredF)
    scores_df = pd.DataFrame.from_dict(docids_score, orient="index",columns=["score"])
    scores_df = scores_df[scores_df["score"]>0]
    scores = scores_df['score'].to_list()
    calc_stats(scores)
    gini_value = G(scores)


In [36]:
def transform_df(df):
    df_run = df.rename(columns={'qid':'query_id','docid':'doc_id'})
    df_run[['query_id','doc_id']] = df_run[['query_id','doc_id']].astype(str)

    return df_run

In [None]:
calc_metrics('bm25')

In [None]:
calc_metrics('splade')

In [None]:
calc_metrics('colbert')

In [None]:
calc_metrics('contriever')

In [None]:
calc_metrics('bm25_colbert')

In [5]:
bm25 = pd.read_csv('/nfs/datasets/cxj/retrievability-bias/results_bm25_100.csv')
bm25.head(2)

Unnamed: 0,qid,docid,docno,rank,score,query,text
0,1048578,7187236,7187236,0,67.528593,cost of endless pools swim spa,The Endless Pool 15' endless pools swim spa Endless Pool Nightmare Buy A Big Spa Instead Great training pool
1,1048578,7471198,7471198,1,62.121928,cost of endless pools swim spa,"Cal Spas is the leading maker of home resort products. Cal Spas makes a great jetted swim spa with thoughtful attention paid to all design features. Read our Cal Spas Swim Spa Review. Dimesion One offers swim spas through their Aquatic Fitness Systems brand. Read our Dimension One Swim Spa Review. Endless Pools is one of the industry leaders in swim spas and counter-current pools. Known for their superior swim current, Endless Pools sell a range of swim spas and modular pools."


In [None]:
del bm25['score']
bm25 = transform_df(bm25)
ir_measures.calc_aggregate([RR, nDCG@10], qrels, bm25)

In [7]:
monot5 = pd.read_csv('/nfs/datasets/cxj/retrievability-bias/results_monot5_100.csv')
monot5.head(2)

Unnamed: 0.1,Unnamed: 0,qid,docid,docno,query,text,score,rank
0,0,1048578,7187236,7187236,cost of endless pools swim spa,The Endless Pool 15' endless pools swim spa Endless Pool Nightmare Buy A Big Spa Instead Great training pool,-1.152498,17
1,1,1048578,7471198,7471198,cost of endless pools swim spa,"Cal Spas is the leading maker of home resort products. Cal Spas makes a great jetted swim spa with thoughtful attention paid to all design features. Read our Cal Spas Swim Spa Review. Dimesion One offers swim spas through their Aquatic Fitness Systems brand. Read our Dimension One Swim Spa Review. Endless Pools is one of the industry leaders in swim spas and counter-current pools. Known for their superior swim current, Endless Pools sell a range of swim spas and modular pools.",-3.906874,22


In [None]:
splade = pd.read_csv('/nfs/datasets/cxj/retrievability-bias/results_splade_100.csv')
splade.head(2)

In [None]:
del splade['score']
# splade['docid'] = splade['docno']
splade = transform_df(splade)
ir_measures.calc_aggregate([RR, nDCG@10], qrels, splade)

In [27]:
colbert = pd.read_csv('/nfs/datasets/cxj/retrievability-bias/results_colbert_100.csv',index_col=1).reset_index()
colbert.head(2)

Unnamed: 0,qid,docid,rank,docno,score
0,13021,6728409,0,6728409,100
1,13021,7076490,1,7076490,99


In [17]:
del colbert['qid.1']

In [19]:
colbert['docno'] = colbert['docid']

In [20]:
colbert['score'] = 100-colbert['rank']

In [25]:
colbert.head(2)

Unnamed: 0,docid,qid,rank,docno,score
0,6728409,13021,0,6728409,100
1,7076490,13021,1,7076490,99


In [26]:
colbert.to_csv('/nfs/datasets/cxj/retrievability-bias/results_colbert_100.csv',index=False)

In [None]:

# del colbert['score']
# colbert['docid'] = colbert['docno']
colbert = transform_df(colbert)
ir_measures.calc_aggregate([RR, nDCG@10], qrels, colbert)

In [40]:
contreiver = pd.read_csv('/nfs/datasets/cxj/retrievability-bias/results_contriever_raw.csv',index_col=1).reset_index()
contreiver.head(2)

Unnamed: 0,query,qid,docno,docid,score,rank
0,cost of endless pools swim spa,1048578,7453627,7453627,2.209815,0
1,cost of endless pools swim spa,1048578,3819219,3819219,2.108093,1


In [None]:
contreiver = contreiver.groupby('qid')
# contreiver = contreiver.progress_apply(lambda x: x.sort_values('rank', ascending=True))
contreiver = contreiver.progress_apply(lambda x: x.sort_values('score', ascending=False))
contreiver = contreiver.groupby('qid').head(100)

100%|██████████| 101093/101093 [00:37<00:00, 5246.56it/s]

In [None]:
contreiver.head(200)

In [None]:
contreiver.to_csv('/nfs/datasets/cxj/retrievability-bias/results_contriever_100.csv')

In [37]:
contriever = pd.read_csv('/nfs/datasets/cxj/retrievability-bias/results_contriever_100.csv',index_col=1).reset_index()

In [38]:
contriever.head(2)

Unnamed: 0.1,qid,Unnamed: 0,query,docno,docid,score,rank
0,1048578,0,cost of endless pools swim spa,7453627,7453627,2.209815,0
1,1048578,1,cost of endless pools swim spa,3819219,3819219,2.108093,1


In [39]:
contriever = transform_df(contriever)
ir_measures.calc_aggregate([RR, nDCG@10], qrels, contriever)

{nDCG@10: 0.014830054037404066, RR: 0.013490700854761381}

In [14]:
bm25_colbert = pd.read_csv('/nfs/datasets/cxj/retrievability-bias/results_bm25_colbert_100.csv')
bm25_colbert.head(2)

Unnamed: 0.1,Unnamed: 0,qid,query,docno,score,rank
0,0,1000,2015 blue jays best players,7391782,24.29726,0
1,1,1000,2015 blue jays best players,3376671,22.79686,1


In [None]:

del bm25_colbert['score']
bm25_colbert['docid'] = bm25_colbert['docno']
bm25_colbert = transform_df(bm25_colbert)
ir_measures.calc_aggregate([RR, nDCG@10], qrels, bm25_colbert)