In [1]:
import pandas as pd
import ir_datasets as irds
from ir_measures import read_trec_qrels
from ir_measures import * 
from ir_measures import evaluator
import pyterrier as pt
if not pt.started(): pt.init()
from scipy.stats import ttest_ind
import os
from os import path as path

PyTerrier 0.10.0 has loaded Terrier 5.10 (built by craigm on 2024-08-22 17:33) and terrier-helper 0.0.8



In [2]:
!pip install seaborn 
!pip install matplotlib

[0m

# Data Ingestion

In [3]:
DIR = '../..' # if this breaks replace with a full path to the repo

DATASET = "msmarco-passage/trec-dl-2019/judged"
qrel_directory = f'{DIR}/judgments/main/qrels/'
annotation_directory = f'{DIR}/judgments/main/doccano/'

In [4]:
dataset = irds.load(DATASET)
original_qrels = pd.DataFrame(dataset.qrels_iter())

In [5]:
all_qrels = []
for file in os.listdir(qrel_directory):
    print(file)
    if file.endswith('.txt'):
        qrels = pd.DataFrame(read_trec_qrels(qrel_directory + file))
        annotator = file.replace('.txt', '').replace('-qrels', '')
        qrels['annotator'] = annotator
        all_qrels.append(qrels)

all_qrels = pd.concat(all_qrels)

andrew-parry-qrels.txt
ferdinand-schlatt-qrels.txt
froebe-qrels.txt
guglielmo-faggioli-qrels.txt
harry-scells-qrels.txt
saber-zerhoudi-qrels.txt
sean-macavaney-qrels.txt
eugene-yang-qrels.txt


In [6]:
def find_missing_zero_judgments(original_df, new_df):
    # Ensure the dataframes have the same column names

    # Create a unique identifier for each query-document pair
    original_df['pair_id'] = original_df['query_id'] + '_' + original_df['doc_id']
    new_df['pair_id'] = new_df['query_id'] + '_' + new_df['doc_id']

    # Find original judgments not in new judgments
    missing_judgments = original_df[~original_df['pair_id'].isin(new_df['pair_id'])]

    # Filter for relevance 0
    missing_zero_judgments = missing_judgments[missing_judgments['relevance'] == 0]

    # Drop the 'pair_id' column as it's no longer needed
    missing_zero_judgments = missing_zero_judgments.drop('pair_id', axis=1)

    return missing_zero_judgments
missing = find_missing_zero_judgments(original_qrels, all_qrels)
missing['annotator'] = 'original'

In [7]:
all_comments = []
for file in os.listdir(annotation_directory):
    if file.endswith('.jsonl'):
        comments = pd.read_json(annotation_directory + file, lines=True)
        annotator = file.replace('.jsonl', '')
        comments['annotator'] = annotator
        # filter in case that comments 'Comments' column is an empty list
        comments = comments[comments['Comments'].apply(lambda x: len(x) > 0)]
        all_comments.append(comments)

all_comments = pd.concat(all_comments)[['text', 'annotator', 'label', 'Comments']]
all_comments['label'] = all_comments['label'].apply(lambda x: x[0])

In [8]:
RUN_DIR = f'{DIR}/runs/trec-dl-2019'
BM25_TUNED = 'dl-19-official-input.bm25tuned_p.gz'
BM25_BASE = 'dl-19-official-input.bm25base_p.gz'
SET_ENCODER_COLBERT = 'colbert_monoelectra-base_msmarco-passage-trec-dl-2019-judged.run'
COLBERT = 'maik-froebe-colbert-run.txt'
SPLADE = 'maik-froebe-splade-run.txt'
RANK_ZEPHYR = 'maik-froebe-rank-zephyr-run.txt'
COLBERT_ZEPHYR = 'colbert_rankzephyr_msmarco-passage-trec-dl-2019-judged.run'
GPT4 = 'colbert_rankgpt4o-full_msmarco-passage-trec-dl-2019-judged.run'

In [9]:
BM25_TUNED_RUN = pt.io.read_results(path.join(RUN_DIR, BM25_TUNED)).rename(columns={'qid': 'query_id', 'docno': 'doc_id'})
BM25_BASE_RUN = pt.io.read_results(path.join(RUN_DIR, BM25_BASE)).rename(columns={'qid': 'query_id', 'docno': 'doc_id'})
SET_ENCODER_COLBERT_RUN = pt.io.read_results(path.join(RUN_DIR, SET_ENCODER_COLBERT)).rename(columns={'qid': 'query_id', 'docno': 'doc_id'})
COLBERT_RUN = pt.io.read_results(path.join(RUN_DIR, COLBERT)).rename(columns={'qid': 'query_id', 'docno': 'doc_id'})
SPLADE_RUN = pt.io.read_results(path.join(RUN_DIR, SPLADE)).rename(columns={'qid': 'query_id', 'docno': 'doc_id'})
RANK_ZEPHYR_RUN = pt.io.read_results(path.join(RUN_DIR, RANK_ZEPHYR)).rename(columns={'qid': 'query_id', 'docno': 'doc_id'})
GPT4_RUN = pt.io.read_results(path.join(RUN_DIR, GPT4)).rename(columns={'qid': 'query_id', 'docno': 'doc_id'})
COLBERT_ZEPHYR_RUN = pt.io.read_results(path.join(RUN_DIR, COLBERT_ZEPHYR)).rename(columns={'qid': 'query_id', 'docno': 'doc_id'})
runs = {
    'bm25_tuned': BM25_TUNED_RUN,
    'bm25_base': BM25_BASE_RUN,
    'set_encoder_colbert': SET_ENCODER_COLBERT_RUN,
    'colbert': COLBERT_RUN,
    'splade': SPLADE_RUN,
    'rank_zephyr': RANK_ZEPHYR_RUN,
    'colbert_zephyr' : COLBERT_ZEPHYR_RUN,
    'gpt4' : GPT4_RUN
}

# Effectiveness

In [10]:
metrics = [AP(rel=2), NDCG(cutoff=10), R(rel=2)@100, P(rel=2, cutoff=10), RR(rel=2), RR(rel=2, cutoff=10)]

In [11]:
original_evaluator = evaluator(metrics, qrels=original_qrels)

In [12]:
for name, run in runs.items():
    if name not in ['bm25_base', 'splade', 'rank_zephyr', 'gpt4', 'colbert_zephyr']:
        continue
    print(f"---{name}---")
    og = original_evaluator.calc_aggregate(run)
    og = {str(k): round(v, 2) for k, v in og.items()}
    print(f"& {og['nDCG@10']} & {og['P(rel=2)@10']} & {og['RR(rel=2)@10']} & {og['R(rel=2)@100']}")
    print(f"-----oracle {name}----")
    filtered_run = pd.merge(run, original_qrels[['query_id', 'doc_id']], 
                            on=['query_id', 'doc_id'], 
                            how='inner').sort_values(['query_id', 'score'], ascending=[True, False])
    f = original_evaluator.calc_aggregate(filtered_run)
    f = {str(k): round(v, 2) for k, v in f.items()}
    print(f"& {f['nDCG@10']} & {f['P(rel=2)@10']} & {f['RR(rel=2)@10']} & {f['R(rel=2)@100']}")

---bm25_base---
& 0.51 & 0.41 & 0.7 & 0.49
-----oracle bm25_base----
& 0.51 & 0.41 & 0.7 & 0.65
---splade---
& 0.73 & 0.62 & 0.91 & 0.6
-----oracle splade----
& 0.73 & 0.63 & 0.91 & 0.69
---rank_zephyr---
& 0.72 & 0.64 & 0.82 & 0.49
-----oracle rank_zephyr----
& 0.72 & 0.65 & 0.82 & 0.65
---colbert_zephyr---
& 0.75 & 0.67 & 0.84 & 0.67
-----oracle colbert_zephyr----
& 0.77 & 0.7 & 0.85 & 0.67
---gpt4---
& 0.78 & 0.71 & 0.87 & 0.67
-----oracle gpt4----
& 0.8 & 0.73 & 0.89 & 0.67


In [13]:
import numpy as np

def convert_qrels(qrels, shuffle_grades=False, seed=None):
    # Set the random seed if provided, or generate a random seed dynamically
    rng = np.random.default_rng(seed)
    qrels = pd.concat([qrels, missing])

    # Step 1: Sort by query_id and relevance to ensure correct ranking order
    qrels = qrels.sort_values(by=['query_id', 'relevance'], ascending=[True, False])

    # Step 2: Shuffle within each relevance group for each query, if shuffle_grades=True
    if shuffle_grades:
        shuffled_qrels = []
        for query_id, group in qrels.groupby('query_id'):
            # For each query, process relevance levels
            group_shuffled = []
            for relevance_level, relevance_group in group.groupby('relevance'):
                # Shuffle rows within each relevance level, using the seed for reproducibility
                shuffled_group = relevance_group.sample(frac=1, random_state=rng.integers(1e9))
                group_shuffled.append(shuffled_group)

            # Combine shuffled relevance groups for the query, preserving relevance order
            shuffled_qrels.append(pd.concat(group_shuffled))

        # Concatenate all queries back into a single DataFrame
        qrels = pd.concat(shuffled_qrels).reset_index(drop=True)

    # Step 3: Assign a score based on the **sorted relevance**, independent of shuffling
    qrels['score'] = qrels.groupby('query_id').cumcount(ascending=True) + 1

    return qrels


In [14]:
annotator_out = []
for annotator, qrels in all_qrels.groupby('annotator'):
    annotator_evaluator = evaluator(metrics, qrels=pd.concat([qrels, missing]))
    relevant_queries = qrels['query_id'].unique()
    query_set = list(relevant_queries)
    rez = convert_qrels(qrels)
    measures = original_evaluator.calc_aggregate(rez)
    measures = {str(k): v for k, v in measures.items()}
    measures['annotator'] = annotator
    measures['run'] = 'annotator'
    measures['type'] = 'official'
    measures['queries'] = query_set
    annotator_out.append(measures)

    for run_name, run in runs.items():
        run = run[run['query_id'].isin(relevant_queries)]
        run_measures = original_evaluator.calc_aggregate(run)
        run_measures = {str(k): v for k, v in run_measures.items()}
        run_measures['annotator'] = annotator
        run_measures['run'] = run_name
        run_measures['type'] = 'official'
        run_measures['queries'] = query_set
        annotator_out.append(run_measures)

        run_measures = annotator_evaluator.calc_aggregate(run)
        run_measures = {str(k): v for k, v in run_measures.items()}
        run_measures['annotator'] = annotator
        run_measures['run'] = run_name
        run_measures['type'] = 'annotator'
        run_measures['queries'] = query_set
        annotator_out.append(run_measures)


annotator_out = pd.DataFrame(annotator_out)

In [15]:
annotator_out

Unnamed: 0,RR(rel=2),R(rel=2)@100,P(rel=2)@10,nDCG@10,RR(rel=2)@10,AP(rel=2),annotator,run,type,queries
0,0.002761,0.029561,0.000000,0.000000,0.000000,0.033615,andrew-parry,annotator,official,"[855410, 146187, 130510, 1110199, 490595, 5737..."
1,0.268217,0.165347,0.139535,0.173879,0.268217,0.118552,andrew-parry,bm25_tuned,official,"[855410, 146187, 130510, 1110199, 490595, 5737..."
2,0.144251,0.161907,0.067442,0.104050,0.143282,0.073287,andrew-parry,bm25_tuned,annotator,"[855410, 146187, 130510, 1110199, 490595, 5737..."
3,0.266279,0.162255,0.139535,0.175449,0.266279,0.117299,andrew-parry,bm25_base,official,"[855410, 146187, 130510, 1110199, 490595, 5737..."
4,0.138548,0.166204,0.081395,0.113249,0.136434,0.076963,andrew-parry,bm25_base,annotator,"[855410, 146187, 130510, 1110199, 490595, 5737..."
...,...,...,...,...,...,...,...,...,...,...
131,0.246512,0.184897,0.125581,0.200379,0.246512,0.115806,sean-macavaney,rank_zephyr,annotator,"[405717, 182539, 131843, 1113437, 443396, 1117..."
132,0.246512,0.217744,0.186047,0.232708,0.246512,0.128481,sean-macavaney,colbert_zephyr,official,"[405717, 182539, 131843, 1113437, 443396, 1117..."
133,0.247619,0.252800,0.148837,0.214382,0.246512,0.157529,sean-macavaney,colbert_zephyr,annotator,"[405717, 182539, 131843, 1113437, 443396, 1117..."
134,0.289535,0.217744,0.227907,0.279811,0.289535,0.148994,sean-macavaney,gpt4,official,"[405717, 182539, 131843, 1113437, 443396, 1117..."


In [16]:
annotator_out['query_tuple'] = annotator_out['queries'].apply(tuple)
unique_queries = {tuple_queries: idx for idx, tuple_queries in enumerate(annotator_out['query_tuple'].unique())}
annotator_out['query_idx'] = annotator_out['query_tuple'].apply(lambda x: unique_queries[x])

In [17]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_metric_by_type(df, metric):
    unique_groups = df['annotator'].unique()
    
    # Create a plot for each unique group of queries
    for group in unique_groups:
        subset = df[df['annotator'] == group]
        
        plt.figure(figsize=(10, 6))
        sns.barplot(x='run', y=metric, hue='type', data=subset)
        
        plt.title(f'Comparison of {metric} for query group {group}')
        plt.ylabel(metric)
        # y range 0 to 1
        plt.ylim(0, 1)
        plt.xlabel('Run')
        plt.legend(title='Type')
        plt.tight_layout()
        plt.show()

# Example: Plot 'RR(rel=2)'
#plot_metric_by_type(annotator_out, 'RR(rel=2)')

## Oracles

In [18]:
max_qrels = all_qrels.groupby(['query_id', 'doc_id']).agg({'relevance': 'max'}).reset_index()
min_qrels = all_qrels.groupby(['query_id', 'doc_id']).agg({'relevance': 'min'}).reset_index()
mean_qrels = all_qrels.groupby(['query_id', 'doc_id']).agg({'relevance': 'mean'}).reset_index()

In [19]:
max_run = convert_qrels(max_qrels)
min_run = convert_qrels(min_qrels)
mean_run = convert_qrels(mean_qrels)

In [20]:
print(original_evaluator.calc_aggregate(max_run))
print(original_evaluator.calc_aggregate(min_run))
print(original_evaluator.calc_aggregate(mean_run))

{RR(rel=2): 0.010402305325311135, R(rel=2)@100: 0.08406269625721531, P(rel=2)@10: 0.0, nDCG@10: 0.0, RR(rel=2)@10: 0.0, AP(rel=2): 0.13396197292199022}
{RR(rel=2): 0.010934269396891393, R(rel=2)@100: 0.10124930334114808, P(rel=2)@10: 0.0, nDCG@10: 0.0, RR(rel=2)@10: 0.0, AP(rel=2): 0.13561343866050465}
{RR(rel=2): 0.010378224940319394, R(rel=2)@100: 0.08350593476384009, P(rel=2)@10: 0.0, nDCG@10: 0.0, RR(rel=2)@10: 0.0, AP(rel=2): 0.13326383445160664}


In [21]:
NUM_SAMPLES = 100
random_iters = []
for i in range(100):
    max_run = convert_qrels(max_qrels, shuffle_grades=True, seed=i)
    max_measures = original_evaluator.calc_aggregate(max_run)
    max_measures = {str(k): v for k, v in max_measures.items()}
    max_measures['run'] = 'max'
    max_measures['iter'] = i
    min_run = convert_qrels(min_qrels, shuffle_grades=True, seed=i)
    min_measures = original_evaluator.calc_aggregate(min_run)
    min_measures = {str(k): v for k, v in min_measures.items()}
    min_measures['run'] = 'min'
    min_measures['iter'] = i
    mean_run = convert_qrels(min_qrels, shuffle_grades=True, seed=i)
    mean_measures = original_evaluator.calc_aggregate(min_run)
    mean_measures = {str(k): v for k, v in min_measures.items()}
    mean_measures['run'] = 'mean'
    mean_measures['iter'] = i
    random_iters.append(max_measures)
    random_iters.append(min_measures)
    random_iters.append(mean_measures)

random_iters = pd.DataFrame(random_iters)

In [22]:
random_iters.groupby('run').mean()

Unnamed: 0_level_0,RR(rel=2),R(rel=2)@100,P(rel=2)@10,nDCG@10,RR(rel=2)@10,AP(rel=2),iter
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
max,0.840562,0.855133,0.687651,0.777399,0.839433,0.690371,49.5
mean,0.830569,0.749785,0.679558,0.758397,0.829236,0.59228,49.5
min,0.830569,0.749785,0.679558,0.758397,0.829236,0.59228,49.5


In [23]:
random_iters

Unnamed: 0,RR(rel=2),R(rel=2)@100,P(rel=2)@10,nDCG@10,RR(rel=2)@10,AP(rel=2),run,iter
0,0.773044,0.857477,0.695349,0.761808,0.770930,0.681143,max,0
1,0.878837,0.746466,0.683721,0.766435,0.877907,0.594408,min,0
2,0.878837,0.746466,0.683721,0.766435,0.877907,0.594408,mean,0
3,0.836739,0.863189,0.700000,0.779929,0.835078,0.692640,max,1
4,0.817580,0.738129,0.683721,0.756717,0.816473,0.592764,min,1
...,...,...,...,...,...,...,...,...
295,0.836877,0.746844,0.695349,0.761283,0.836656,0.593085,min,98
296,0.836877,0.746844,0.695349,0.761283,0.836656,0.593085,mean,98
297,0.837625,0.854644,0.683721,0.769404,0.837625,0.688160,max,99
298,0.835649,0.740888,0.679070,0.759923,0.834755,0.598577,min,99


In [24]:
random_iters.groupby('run').var()

Unnamed: 0_level_0,RR(rel=2),R(rel=2)@100,P(rel=2)@10,nDCG@10,RR(rel=2)@10,AP(rel=2),iter
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
max,0.000902,1.9e-05,0.000109,6.7e-05,0.000901,3.2e-05,841.666667
mean,0.000315,0.000135,0.000102,4.4e-05,0.000319,3.4e-05,841.666667
min,0.000315,0.000135,0.000102,4.4e-05,0.000319,3.4e-05,841.666667
