In [1]:
import pandas as pd
import ir_datasets as irds
from ir_measures import read_trec_qrels
from ir_measures import * 
from ir_measures import evaluator
import pyterrier as pt
from scipy.stats import ttest_ind
import os
from os import path as path
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Data Ingestion

In [2]:
DIR = '../..' # if this breaks replace with a full path to the repo

DATASET = "msmarco-passage/trec-dl-2019/judged"
qrel_directory = f'{DIR}/judgments/main/qrels/'
annotation_directory = f'{DIR}/judgments/main/doccano/'

In [3]:
dataset = irds.load(DATASET)
original_qrels = pd.DataFrame(dataset.qrels_iter())

In [4]:
all_qrels = []
for file in os.listdir(qrel_directory):
    print(file)
    if file.endswith('.txt'):
        qrels = pd.DataFrame(read_trec_qrels(qrel_directory + file))
        annotator = file.replace('.txt', '').replace('-qrels', '')
        qrels['annotator'] = annotator
        all_qrels.append(qrels)

all_qrels = pd.concat(all_qrels)

andrew-parry-qrels.txt
ferdinand-schlatt-qrels.txt
froebe-qrels.txt
guglielmo-faggioli-qrels.txt
harry-scells-qrels.txt
saber-zerhoudi-qrels.txt
sean-macavaney-qrels.txt
eugene-yang-qrels.txt


In [5]:
all_comments = []
for file in os.listdir(annotation_directory):
    if file.endswith('.jsonl'):
        comments = pd.read_json(annotation_directory + file, lines=True)
        annotator = file.replace('.jsonl', '')
        comments['annotator'] = annotator
        # filter in case that comments 'Comments' column is an empty list
        comments = comments[comments['Comments'].apply(lambda x: len(x) > 0)]
        all_comments.append(comments)

all_comments = pd.concat(all_comments)[['text', 'annotator', 'label', 'Comments']]
all_comments['label'] = all_comments['label'].apply(lambda x: x[0])

In [6]:
RUN_DIR = f'{DIR}/runs/trec-dl-2019'
runs = {}
for run in os.listdir(RUN_DIR):
    frame = pt.io.read_results(path.join(RUN_DIR, run)).rename(columns={'qid': 'query_id', 'docno': 'doc_id'})
    frame['pool'] = 'dl-19-official' in run
    runs[run] = frame

# Qrel Pooling

In [7]:
query_groups = all_qrels.groupby('annotator')['query_id'].apply(set).reset_index()

# Step 2: Rename the column to 'queries'
query_groups.rename(columns={'query_id': 'queries'}, inplace=True)
query_groups = query_groups.set_index("annotator").queries.to_dict()

In [8]:
all_qrels

Unnamed: 0,query_id,doc_id,relevance,iteration,annotator
0,855410,8651770,2,0,andrew-parry
1,855410,8651771,2,0,andrew-parry
2,855410,8651772,1,0,andrew-parry
3,855410,8651775,3,0,andrew-parry
4,146187,1230566,1,0,andrew-parry
...,...,...,...,...,...
1110,1114646,3915244,0,0,eugene-yang
1111,168216,4713638,0,0,eugene-yang
1112,168216,1696466,0,0,eugene-yang
1113,168216,4245224,0,0,eugene-yang


In [9]:
all_qrels['queries'] = all_qrels['annotator'].map(lambda x : query_groups[x])
all_qrels['query_tuple'] = all_qrels['queries'].apply(lambda x : tuple(list(x)))
# Create a mapping of each unique tuple to a unique number
unique_queries = {tuple_queries: idx for idx, tuple_queries in enumerate(all_qrels['query_tuple'].unique())}
print(unique_queries)
# Assign the unique number to a new column
all_qrels['query_group_number'] = all_qrels['query_tuple'].map(lambda x : unique_queries[x])

{('87452', '573724', '1110199', '1133167', '156493', '451602', '130510', '1114646', '146187', '855410', '168216', '489204', '490595'): 0, ('359349', '1121402', '87181', '148538', '264014', '1124210', '1129237', '527433', '168216'): 1, ('183378', '833860', '104861', '915593', '1114819', '962179', '47923', '168216', '19335'): 2, ('207786', '1113437', '1037798', '405717', '1115776', '443396', '182539', '1063750', '1117099', '131843', '1106007', '1112341', '1103812', '168216', '1121709'): 3}


In [10]:
all_qrels.columns

Index(['query_id', 'doc_id', 'relevance', 'iteration', 'annotator', 'queries',
       'query_tuple', 'query_group_number'],
      dtype='object')

In [11]:
all_qrels

Unnamed: 0,query_id,doc_id,relevance,iteration,annotator,queries,query_tuple,query_group_number
0,855410,8651770,2,0,andrew-parry,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0
1,855410,8651771,2,0,andrew-parry,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0
2,855410,8651772,1,0,andrew-parry,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0
3,855410,8651775,3,0,andrew-parry,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0
4,146187,1230566,1,0,andrew-parry,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0
...,...,...,...,...,...,...,...,...
1110,1114646,3915244,0,0,eugene-yang,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0
1111,168216,4713638,0,0,eugene-yang,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0
1112,168216,1696466,0,0,eugene-yang,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0
1113,168216,4245224,0,0,eugene-yang,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0


In [44]:
annotator_idx = all_qrels.drop_duplicates(["annotator", "query_tuple"])

In [46]:
annotator_idx = annotator_idx.set_index('annotator').query_tuple.to_dict()

In [12]:
import pandas as pd
import itertools

# Assuming your dataframe is named 'df'

# Group by 'query_group_number' and collect unique annotators
grouped_annotators = all_qrels.groupby('query_group_number')['annotator'].unique().to_dict()

# Create a list of all possible combinations of annotators for each group
combinations = [
    [annotators[0], annotators[1]]
    for annotators in grouped_annotators.values()
]

# Create a list of all possible combinations across groups
all_combinations = [
    comb for comb in itertools.product(*combinations)
]

# Iterate through each combination and concatenate the corresponding data
combinations = {}
for comb in all_combinations:
    combinations[tuple(comb)] = pd.concat([all_qrels[all_qrels['annotator'] == annotator] for annotator in comb])[['query_id', 'doc_id', 'relevance']]
    # Process the concatenated_df as needed (e.g., save it, analyze it)

In [31]:
all_qrels

Unnamed: 0,query_id,doc_id,relevance,iteration,annotator,queries,query_tuple,query_group_number
0,855410,8651770,2,0,andrew-parry,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0
1,855410,8651771,2,0,andrew-parry,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0
2,855410,8651772,1,0,andrew-parry,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0
3,855410,8651775,3,0,andrew-parry,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0
4,146187,1230566,1,0,andrew-parry,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0
...,...,...,...,...,...,...,...,...
1110,1114646,3915244,0,0,eugene-yang,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0
1111,168216,4713638,0,0,eugene-yang,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0
1112,168216,1696466,0,0,eugene-yang,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0
1113,168216,4245224,0,0,eugene-yang,"{87452, 573724, 1110199, 1133167, 156493, 4516...","(87452, 573724, 1110199, 1133167, 156493, 4516...",0


# Evaluation

In [13]:
metrics = [AP(rel=2), NDCG(cutoff=10), R(rel=2)@100, P(rel=2, cutoff=10), RR(rel=2), RR(rel=2, cutoff=10)]

In [14]:
original_evaluator = evaluator(metrics, qrels=original_qrels)

## Per Annotator Performance

In [None]:
out = {
    'annotators' : [],
    'run' : [],
    'metric' : [],
    'value' : [],
}
progress = tqdm(total=len(combinations) * len(runs))
for annotators, qrels in combinations.items():
    eval = evaluator(metrics, qrels=qrels)
    for name, run in runs.items():
        progress.update(1)
        measure = eval.calc_aggregate(run)
        measure = {str(k) : v for k, v in measure.items()}
        for k, v in measure.items():
            out['annotators'].append(annotators)
            out['run'].append(name)
            out['metric'].append(k)
            out['value'].append(v)


100%|██████████| 976/976 [26:52<00:00,  1.65s/it]

  1%|          | 7/976 [00:00<00:15, 62.75it/s][A
  1%|▏         | 14/976 [00:00<00:57, 16.70it/s][A
  2%|▏         | 18/976 [00:02<03:20,  4.78it/s][A

In [16]:
out = pd.DataFrame(out)

In [17]:
out.columns

Index(['annotators', 'run', 'metric', 'value'], dtype='object')

## Original Order

In [18]:
from collections import defaultdict
output = {k : defaultdict(list) for k in all_qrels.query_tuple.unique()}
for name, run in runs.items():
    for q_set in output.keys():
        measure = original_evaluator.calc_aggregate(run[run.query_id.isin(list(q_set))])
        measure = {str(k): v for k, v in measure.items()}
        for k, v in measure.items():
            output[q_set][k].append((name, v))

In [19]:
output = {q_set : {k : [x[0] for x in sorted(v, key=lambda x : x[1], reverse=True)] for k, v in metric_set.items()} for q_set, metric_set in output.items()}

## Comparison

In [24]:
grouped_data = out.groupby("annotators")
metric_variances = {}
for annotators, group in grouped_data:
    metric_variances[annotators] = {}
    for metric, subgroup in group.groupby("metric"):
        metric_variances[annotators][metric] = subgroup["value"].var()

In [25]:
system_grouped_data = out.groupby("run")
system_metric_variances = {}
for run, group in system_grouped_data:
    system_metric_variances[run] = {}
    for metric, subgroup in group.groupby("metric"):
        system_metric_variances[run][metric] = subgroup["value"].var()

In [39]:
annotator_to_queries

{('eugene-yang', 'eugene-yang'): ('207786',
  '1113437',
  '1037798',
  '405717',
  '1115776',
  '443396',
  '182539',
  '1063750',
  '1117099',
  '131843',
  '1106007',
  '1112341',
  '1103812',
  '168216',
  '1121709')}

In [37]:
out

Unnamed: 0,annotators,run,metric,value
0,"(andrew-parry, ferdinand-schlatt, guglielmo-fa...",colbert_monoelectra-base_msmarco-passage-trec-...,RR(rel=2)@10,0.913566
1,"(andrew-parry, ferdinand-schlatt, guglielmo-fa...",colbert_monoelectra-base_msmarco-passage-trec-...,nDCG@10,0.767889
2,"(andrew-parry, ferdinand-schlatt, guglielmo-fa...",colbert_monoelectra-base_msmarco-passage-trec-...,P(rel=2)@10,0.683721
3,"(andrew-parry, ferdinand-schlatt, guglielmo-fa...",colbert_monoelectra-base_msmarco-passage-trec-...,RR(rel=2),0.912791
4,"(andrew-parry, ferdinand-schlatt, guglielmo-fa...",colbert_monoelectra-base_msmarco-passage-trec-...,AP(rel=2),0.480322
...,...,...,...,...
5851,"(eugene-yang, froebe, harry-scells, sean-macav...",tirex_set-encoder-large_msmarco-passage-trec-d...,nDCG@10,0.727026
5852,"(eugene-yang, froebe, harry-scells, sean-macav...",tirex_set-encoder-large_msmarco-passage-trec-d...,P(rel=2)@10,0.651163
5853,"(eugene-yang, froebe, harry-scells, sean-macav...",tirex_set-encoder-large_msmarco-passage-trec-d...,RR(rel=2),0.842636
5854,"(eugene-yang, froebe, harry-scells, sean-macav...",tirex_set-encoder-large_msmarco-passage-trec-d...,AP(rel=2),0.377025


In [58]:
list(out[out['metric']=='P(rel=2)@10'].groupby('run').value)

[('colbert_monoelectra-base_msmarco-passage-trec-dl-2019-judged.run',
  2       0.683721
  368     0.683721
  734     0.683721
  1100    0.683721
  1466    0.683721
  1832    0.683721
  2198    0.683721
  2564    0.683721
  2930    0.683721
  3296    0.683721
  3662    0.683721
  4028    0.683721
  4394    0.683721
  4760    0.683721
  5126    0.683721
  5492    0.683721
  Name: value, dtype: float64),
 ('colbert_monoelectra-large_msmarco-passage-trec-dl-2019-judged.run',
  8       0.688372
  374     0.688372
  740     0.688372
  1106    0.688372
  1472    0.688372
  1838    0.688372
  2204    0.688372
  2570    0.688372
  2936    0.688372
  3302    0.688372
  3668    0.688372
  4034    0.688372
  4400    0.688372
  4766    0.688372
  5132    0.688372
  5498    0.688372
  Name: value, dtype: float64),
 ('colbert_rankgpt4-turbo_msmarco-passage-trec-dl-2019-judged.run',
  14      0.686047
  380     0.686047
  746     0.686047
  1112    0.686047
  1478    0.686047
  1844    0.686047
  221

In [48]:
rbo_values = {}
system_order_changes = []
out['queries'] = out['annotators'].map(lambda x : annotator_idx[x[0]])
for annotator, group in out.groupby("annotators"):
    query_group = group.queries.iloc[0]
    rbo_values[annotators] = {}
    for metric, subgroup in group.groupby("metric"):
        system_order_1 = subgroup.sort_values(by='value', ascending=False).run.to_list()
        system_order_2 = output[query_group][metric]  # Use the original system order
        val = rbo.RankingSimilarity(system_order_1, system_order_2).rbo()
        rbo_values[annotators][metric] = val

In [49]:
rbo_values

{('andrew-parry',
  'ferdinand-schlatt',
  'guglielmo-faggioli',
  'saber-zerhoudi'): {'AP(rel=2)': 0.838976691538069,
  'P(rel=2)@10': 0.8611696600159712,
  'R(rel=2)@100': 0.942012271557254,
  'RR(rel=2)': 0.655011462667698,
  'RR(rel=2)@10': 0.6553870431562835,
  'nDCG@10': 0.8522789987193002}}