In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 100)

import pyterrier as pt
if not pt.java.started():
    pt.java.init()

import ir_datasets
import ir_measures
from ir_measures import * # imports all supported measures, e.g., AP, nDCG, RR, P
import statistics

import os
os.environ["PIP_ROOT_USER_ACTION"] = "ignore"
import glob
from itertools import islice

In [None]:
# Load the MS MARCO passage collection plus the three evaluation topic sets
# used in this notebook. For every split, topics come from PyTerrier
# (`get_topics()`) and qrels are materialised from the ir_datasets iterator
# into a DataFrame.

# --- full passage corpus handle ---
dataset = pt.get_dataset('irds:msmarco-passage')

# --- dev split ---
eval_dev = pt.get_dataset('irds:msmarco-passage/dev')
dev_eval = ir_datasets.load("msmarco-passage/dev")
dev_topics = eval_dev.get_topics()
dev_qrels = pd.DataFrame(dev_eval.qrels_iter())

# --- TREC Deep Learning 2019 ---
dl19 = pt.get_dataset('irds:msmarco-passage/trec-dl-2019')
dl19_eval = ir_datasets.load("msmarco-passage/trec-dl-2019")
dl19_topics = dl19.get_topics()
dl19_qrels = pd.DataFrame(dl19_eval.qrels_iter())

# --- TREC Deep Learning 2020 ---
dl20 = pt.get_dataset('irds:msmarco-passage/trec-dl-2020')
dl20_eval = ir_datasets.load("msmarco-passage/trec-dl-2020")
dl20_topics = dl20.get_topics()
dl20_qrels = pd.DataFrame(dl20_eval.qrels_iter())

# --- DL 2019 + DL 2020 pooled into one topic set / one qrels table ---
dl1920_topics = pd.concat([dl19_topics, dl20_topics], ignore_index=True)
dl1920_qrels = pd.concat([dl19_qrels, dl20_qrels], ignore_index=True)


In [None]:
def Gini(v):
    """Approximate the Gini coefficient of ``v`` from a decile Lorenz curve.

    The Lorenz curve is sampled at the 0th, 10th, ..., 100th percentiles of
    ``v``: for each percentile the curve value is the share (in percent) of
    the grand total contributed by all values less than or equal to that
    percentile. The coefficient is the normalised gap between the area under
    the line of perfect equality and the area under this sampled curve, both
    integrated with the trapezoidal rule.

    Note that *every* value tied with a percentile cutoff is counted in that
    bin, so heavily tied inputs can lift the curve above the diagonal and
    yield a negative result (a constant input gives -0.9).

    Parameters
    ----------
    v : array-like of numbers
        The values whose inequality is measured.

    Returns
    -------
    float
        The approximate Gini coefficient, or NaN when ``v`` is empty or sums
        to zero (the shares are undefined in that case).
    """
    v = np.asarray(v)
    total = float(np.sum(v))
    # Shares of a zero total are undefined; answer NaN explicitly instead of
    # relying on division-by-zero warnings (or errors on empty input).
    if v.size == 0 or total == 0.0:
        return float('nan')

    # NumPy 2.0 renamed `trapz` to `trapezoid`; support both generations.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz

    bins = np.linspace(0., 100., 11)
    # One percentile call for all ten cutoffs instead of one call per bin.
    cutoffs = np.percentile(v, bins[1:])

    yvals = [0.0]
    yvals.extend((np.sum(v[v <= cutoff]) / total) * 100.0 for cutoff in cutoffs)

    # perfect equality area
    pe_area = trapezoid(bins, x=bins)
    # lorenz area
    lorenz_area = trapezoid(yvals, x=bins)
    return (pe_area - lorenz_area) / float(pe_area)

def calc_stats_v2(modelname, df, scoredF, topics, qrels):
    """Compute retrievability and effectiveness statistics for one topic group.

    Rows of ``df`` whose ``qid`` appears in ``topics['qid']`` are kept, their
    ``r_score`` values are summed per ``docno``, and that per-document table
    is written to ``scoredF`` (always overwritten). Mean, sample standard
    deviation and the ``Gini`` value of the per-document sums are then
    computed, together with nDCG@10 and RR from ``ir_measures``.

    NOTE(review): the effectiveness metrics are computed from the *full*
    ``df`` rather than from the topic-filtered rows, so they do not vary with
    ``topics``. Left unchanged because altering it would change reported
    numbers -- confirm whether this is intended.

    Parameters
    ----------
    modelname : str
        Not used by the body; kept for call-site compatibility.
    df : pandas.DataFrame
        Run with at least the columns ``qid``, ``docno``, ``score`` and
        ``r_score``.
    scoredF : str
        Destination CSV path for the per-document summed ``r_score`` table.
    topics : pandas.DataFrame
        Frame with a ``qid`` column selecting the queries of this group.
    qrels : pandas.DataFrame
        Relevance judgements passed straight to ``ir_measures``.

    Returns
    -------
    tuple
        ``(mean, std_dev, gini_value, nDCG@10, RR)``.

    Raises
    ------
    statistics.StatisticsError
        If fewer than two documents remain after filtering (``stdev`` needs
        at least two data points).
    """
    qids_to_keep = topics['qid'].to_list()
    # Single vectorised membership test. Building one boolean Series per qid
    # and OR-reducing them costs rows x queries work and fails with
    # ``KeyError: False`` when the topic list is empty.
    df_filtered = df[df["qid"].isin(qids_to_keep)]
    grouped = df_filtered.groupby("docno")[['r_score']].sum().reset_index()
    grouped.to_csv(scoredF, index=False)

    scores = grouped['r_score'].to_list()

    print('start statistics')
    mean = statistics.mean(scores)
    std_dev = statistics.stdev(scores)
    gini_value = Gini(scores)

    # ir_measures expects the run columns query_id / doc_id / score.
    run = pd.DataFrame({
        'query_id': df['qid'].astype(str),
        'doc_id': df['docno'].astype(str),
        'score': df['score'],
    })
    m = ir_measures.calc_aggregate([nDCG@10, RR], qrels, run)
    ndcg10, rr = m[nDCG@10], m[RR]
    print(mean, std_dev, gini_value, ndcg10, rr)
    return mean, std_dev, gini_value, ndcg10, rr

In [None]:
# Tag every dev topic with the same cluster id (0) so the whole dev set forms
# a single group; iterating `dev_all` then yields one (cluster_id, queries_df)
# pair, the same shape the per-cluster loops below consume.
# Note: this adds the 'cluster' column to `dev_topics` in place.
dev_topics['cluster'] = 0
dev_all = dev_topics.groupby('cluster')

In [None]:
# Same trick for the pooled DL 2019+2020 topics: one constant cluster id (0),
# so `dl1920_all` is a group-by with a single group covering every topic.
# Note: this adds the 'cluster' column to `dl1920_topics` in place.
dl1920_topics['cluster'] = 0
dl1920_all = dl1920_topics.groupby('cluster')

In [None]:
# Pre-sampled dev query groups; per the author's note the file holds
# 2000 clusters of 50 queries each, identified by a 'cluster' column.
# NOTE(review): the file name is spelled 'smapled' -- presumably this matches
# the file on disk; verify before "fixing" the path.
dev_topics_sampled = pd.read_csv('./results/smapled_dev_queries_50.csv', index_col=0).reset_index()

In [None]:
# One group per sampled cluster id; iterating yields (cluster_id, queries_df).
dev_2000_grps = dev_topics_sampled.groupby('cluster')
# Sanity check (disabled): number of clusters.
# print(len(dev_2000_grps.groups.keys()))

In [None]:
# dl1920_grouped.size().sum()

In [None]:
# Load one precomputed run for a quick manual check. Judging by the file name
# this is the tctcolbert run with r_score for the DL19+20 topics at
# threshold 0 -- TODO confirm.
df = pd.read_csv('/nfs/resources/cxj/retrievability-bias/tctcolbert/df_tctcolbert_rscore_dl1920_0.csv', index_col=0).reset_index()
# qid is cast to str, presumably so it compares equal to the topic frames'
# qid values when rows are filtered by topic -- verify the topics' dtype.
df['qid'] = df['qid'].astype(str)

In [None]:
# Peek at the first rows to check the loaded columns.
df.head()

In [None]:
# Smoke test of calc_stats_v2: run it on the first five topics of each group
# in `dl1920_all`, writing the per-document table to a scratch file that is
# removed first if a previous run left it behind.
for cluster_id, queries_df in dl1920_all:
    print(queries_df.shape)

    scoredF = '/root/tctcolbert_dev_all_T0_G0.csv'
    if os.path.exists(scoredF):
        os.remove(scoredF)

    mean, std_dev, gini_value, ndcg10, rr = calc_stats_v2(
        'tctcolbert',
        df,
        scoredF,
        queries_df.iloc[:5],
        dl1920_qrels,
    )

In [None]:
# Main experiment driver.
# For every topic set x model x threshold:
#   1. load (or derive and cache) the run with a per-row retrievability score,
#   2. compute statistics for every topic group via calc_stats_v2,
#   3. save the per-group table, then aggregate it into one row per threshold.
# Finally one summary CSV per model / topic set is written and copied to the
# shared results directory.

root_dir = '/root/retrievability-bias'  # NOTE(review): not referenced in this cell
nfs_dir = '/nfs/datasets/cxj/retrievability-bias-from_resources_ok'

# all_topics = ['dl1920_all', 'dev_all', 'dev_2000_grps']
all_topics = ['dev_2000_grps']
# Registers pandas' .progress_apply; kept so it stays available to other cells.
pt.tqdm.pandas()

# Create the summary directory up front so a long run cannot fail at the very
# last write because the directory is missing.
os.makedirs(f'{nfs_dir}/allresults', exist_ok=True)

for eval_topics in all_topics:
    # Topic groups and qrels depend only on the topic-set name, so they are
    # resolved once here rather than once per threshold.
    if eval_topics == 'dev_all':
        grouped = dev_all
        qrels = dev_qrels
    elif eval_topics == 'dl1920_all':
        grouped = dl1920_all
        qrels = dl1920_qrels
    else:
        grouped = dev_2000_grps
        qrels = dev_qrels

    # for modelname in ['bm25', 'bm25_monot5', 'splade', 'tctcolbert','bm25_tctcolbert']:
    for modelname in ['tctcolbert','bm25_tctcolbert']:
        # Per-group outputs (written by calc_stats_v2 and below) live here.
        groups_dir = f'{nfs_dir}/{modelname}/groups'
        os.makedirs(groups_dir, exist_ok=True)

        this_model_res = []
        for threshold in [0, 30, 60, 90]:
            # --- Calc retrievability score for each doc ---
            print(f'start {modelname} ----> {eval_topics} ----> threshold {threshold}')
            rscore_csv = f'/nfs/resources/cxj/retrievability-bias/{modelname}/df_{modelname}_rscore_{eval_topics}_{threshold}.csv'
            if os.path.exists(rscore_csv):
                df = pd.read_csv(rscore_csv, index_col=0).reset_index()
            else:
                # The raw run file is named after the topic-set prefix only
                # (e.g. 'dev' for 'dev_2000_grps').
                origin_topics = eval_topics.split('_')[0]
                csv = f'/nfs/resources/cxj/retrievability-bias/{modelname}/df_{modelname}_{origin_topics}_{threshold}.csv'
                df = pd.read_csv(csv, index_col=0).reset_index()
                # Rank-discounted retrievability contribution, vectorised over
                # the whole column instead of a per-row Python lambda.
                df['r_score'] = 100 / np.log(df['rank'] + 2)
                print(f'saving {rscore_csv}')
                df.to_csv(rscore_csv, index=False)
                print(f'done')

            # Cast once per run: it is loop-invariant, and repeating it for
            # every cluster re-copies the whole column each time.
            df['qid'] = df['qid'].astype(str)

            # --- Calc stats for each group ---
            res = []
            for cluster_id, queries_df in grouped:
                print(f'Calc stats {modelname} ----> {eval_topics} ----> threshold {threshold} --> cluster_id = {cluster_id}')
                scoredF = f'{groups_dir}/{modelname}_{eval_topics}_T{threshold}_G{cluster_id}.csv'
                mean, std_dev, gini_value, nDCG10, rr = calc_stats_v2(modelname, df, scoredF, queries_df, qrels)
                group_res = [modelname, threshold, cluster_id, mean, std_dev, gini_value, nDCG10, rr]
                res.append(group_res)

            # --- put into a dataframe for all cluster_ids ---
            print(f'merge for each threshold for {modelname} ----> {eval_topics} ----> threshold {threshold}')
            df_threshold = pd.DataFrame(res, columns=['modelname', 'threshold', 'cluster_id', 'mean', 'std', 'gini', 'nDCG@10', 'RR'])
            res_csv = f'{groups_dir}/result_{eval_topics}_T{threshold}_allgroups.csv'
            print(f'saving {res_csv}')
            df_threshold.to_csv(res_csv, index=False)
            print('done')

            # --- Calc results for this threshold ---
            print(f'Calc ginis for each threshold for {modelname} ----> {eval_topics} ----> threshold {threshold}')
            ginis = df_threshold['gini']
            min_gini, mean_gini, max_gini = ginis.min(), ginis.mean(), ginis.max()
            nDCG10 = df_threshold['nDCG@10'].mean()
            rr = df_threshold['RR'].mean()
            this_model_res.append([modelname, threshold, min_gini, mean_gini, max_gini, nDCG10, rr])

        # --- Merge all thresholds for this model. ---
        print(f'Merge into one file for {modelname} ----> {eval_topics}')
        res_df = pd.DataFrame(this_model_res, columns=['modelname', 'threshold', 'min_gini', 'mean_gini', 'max_gini','nDCG@10', 'RR'])
        res_csv = f'{nfs_dir}/allresults/result_{modelname}_{eval_topics}_stats.csv'
        print(f'saving {res_csv}')
        res_df.to_csv(res_csv, index=False)
        # Check the exit status so a failed copy is not reported as success.
        copy_status = os.system(f'cp -r {res_csv} /nfs/primary/retrievability-bias/results/')
        if copy_status == 0:
            print(f'copied {res_csv}')
        else:
            print(f'WARNING: copying {res_csv} failed (exit status {copy_status})')

In [None]:
# """
# Calc stats of each group for each threshold
# """

# nfs_dir = f'/nfs/resources/cxj/retrievability-bias'
# pt.tqdm.pandas()

# # for modelname in ['bm25', 'bm25_monot5', 'splade', 'colbert', 'bm25_colbert']:
# for modelname in ['tctcolbert', 'bm25_tctcolbert']:
#     for threshold in [0, 30, 60, 90]:
    
#         csv2 = f'/nfs/resources/cxj/retrievability-bias/{modelname}/df_{modelname}_rscore_{threshold}.csv'
#         print(f'reading {csv2}')
#         df = pd.read_csv(csv2, index_col=0).reset_index()
        
#         res = []
#         for cluster_id, queries_df in grouped:
#             print(f'start {modelname} ----> threshold {threshold} --> cluster_id = {cluster_id}')
#             scoredF = f'{nfs_dir}/{modelname}/groups/{modelname}_T{threshold}_G{cluster_id}.csv'
#             mean, std, gini = calc_stats_v2(modelname, df, scoredF, queries_df)
#             group_res = [modelname, threshold, cluster_id, mean, std, gini]
#             print(group_res)
#             res.append(group_res)
    
#         print(f'start creating df per threshold')
#         df_threshold = pd.DataFrame(res, columns=['modelname', 'threshold', 'cluster_id', 'mean', 'std', 'gini'])
#         res_csv = f'{nfs_dir}/{modelname}/groups/result_T{threshold}_allgroups.csv'
#         print(f'saving {res_csv}')
#         df_threshold.to_csv(res_csv, index=False)
#         print('done')

In [None]:
# """
# mean, max, min gini for each threshold
# """

# nfs_dir = f'/nfs/resources/cxj/retrievability-bias'
# root_dir = f'/nfs/primary/retrievability-bias'
# # for modelname in ['bm25', 'bm25_monot5', 'splade', 'colbert', 'bm25_colbert']:
# for modelname in ['tctcolbert', 'bm25_tctcolbert']:
# res = []
#     for threshold in [0, 30, 60, 90]:
#         res_csv = f'{nfs_dir}/{modelname}/groups/result_T{threshold}_allgroups.csv'
#         df_threshold = pd.read_csv(res_csv, index_col=0).reset_index()
#         ginis = df_threshold['gini']
#         min_gini, mean_gini, max_gini = ginis.min(), ginis.mean(), ginis.max()
#         res.append([modelname, threshold, min_gini, mean_gini, max_gini])
        
#     res_df = pd.DataFrame(res, columns=['modelname', 'threshold', 'min_gini', 'mean_gini', 'max_gini'])
#     res_csv = f'{nfs_dir}/result_{modelname}_stats.csv'
#     print(f'saving {res_csv}')
#     res_df.to_csv(res_csv, index=False)
#     os.system(f'cp -r {res_csv} {root_dir}/results/')