In [1]:
import pandas as pd
import ir_datasets as irds
from ir_measures import read_trec_qrels
from ir_measures import * 
from ir_measures import evaluator
import pyterrier as pt
import numpy as np
from scipy.stats import ttest_ind
import os
from os import path as path
# Start PyTerrier only when it is not already running, so re-executing this
# cell in a live kernel is harmless.
if not pt.started():
    pt.init()

PyTerrier 0.10.0 has loaded Terrier 5.10 (built by craigm on 2024-08-22 17:33) and terrier-helper 0.0.8



In [2]:
# All relative paths used later in the notebook are resolved against this directory.
# The location can be overridden with the ANNOTATION_ROOT environment variable
# so the notebook is not tied to one machine's mount point; when the variable
# is unset the original path is used.
os.chdir(os.environ.get('ANNOTATION_ROOT', '/nfs/primary/annotate/Annotation'))

In [3]:
# Shell out to list the current working directory's contents.
!ls

README.md  analysis  average_docs_per_annotator.png  judgments	pooling  runs


In [4]:
# The official qrels do not depend on the run or annotator being evaluated, so
# they are loaded once per dataset and reused on every later call.
_ORIGINAL_QRELS_CACHE = {}


def _load_original_qrels(dataset):
    """Return the qrels of ``dataset`` as a DataFrame, loading them at most once."""
    if dataset not in _ORIGINAL_QRELS_CACHE:
        _ORIGINAL_QRELS_CACHE[dataset] = pd.DataFrame(irds.load(dataset).qrels_iter())
    return _ORIGINAL_QRELS_CACHE[dataset]


def _per_query_scores(measure_evaluator, run, metric_names):
    """Collect ``{metric name: {query id: value}}`` from ``measure_evaluator.iter_calc(run)``."""
    scores = {name: {} for name in metric_names}
    for item in measure_evaluator.iter_calc(run):
        scores[str(item.measure)][str(item.query_id)] = item.value
    return scores


def get_vals(run, qrels, dataset="msmarco-passage/trec-dl-2019/judged", paired=False):
    """Compare a run's effectiveness under the official qrels and under ``qrels``.

    Parameters
    ----------
    run
        The run to evaluate, in a form accepted by the ir_measures evaluator.
        The notebook passes DataFrames with ``query_id`` / ``doc_id`` columns.
    qrels
        DataFrame of alternative relevance judgments. Must have a ``query_id``
        column; the official qrels are restricted to the queries it contains.
    dataset
        ir_datasets identifier whose qrels serve as the "original" judgments.
        Defaults to the dataset that was previously hard-coded.
    paired
        If False (default, the previous behaviour) the per-query scores are
        compared with ``scipy.stats.ttest_ind``. If True, ``ttest_rel`` is used
        instead, treating the two scores of each query as a matched pair.
        NOTE(review): both score lists come from the same queries, so the
        paired test looks like the appropriate one -- confirm before switching
        the default.

    Returns
    -------
    pandas.DataFrame
        One row per metric with the columns ``metric``, ``original``, ``new``
        (aggregate scores), ``p_value`` and ``original_variance`` /
        ``new_variance`` (``np.var`` of the per-query scores).
    """
    original_qrels = _load_original_qrels(dataset)
    # Boolean indexing returns a new frame, so the cached qrels stay untouched.
    original_qrels = original_qrels[original_qrels.query_id.isin(qrels.query_id.unique())]

    metrics = [AP(rel=2), NDCG(cutoff=10), R(rel=2)@100, P(rel=2, cutoff=10), RR(rel=2), RR(rel=2, cutoff=10)]
    metric_names = [str(metric) for metric in metrics]

    original_evaluate = evaluator(metrics, original_qrels)
    evaluate = evaluator(metrics, qrels)

    # Aggregate performance, keyed by the metric's string form.
    original_measures = {str(k): v for k, v in original_evaluate.calc_aggregate(run).items()}
    measures = {str(k): v for k, v in evaluate.calc_aggregate(run).items()}

    # Per-query performance.
    original_result = _per_query_scores(original_evaluate, run, metric_names)
    result = _per_query_scores(evaluate, run, metric_names)

    if paired:
        from scipy.stats import ttest_rel as significance_test
    else:
        significance_test = ttest_ind

    rows = []
    for name in metric_names:
        original = original_result[name]
        new = result[name]
        # Only queries scored under both sets of judgments can be compared.
        # Indexing `new` with every id from `original` raised KeyError whenever
        # a query was missing from the new results. Keeping `original`'s order
        # leaves the two value lists aligned query by query.
        shared_qids = [qid for qid in original if qid in new]
        original_values = [original[qid] for qid in shared_qids]
        new_values = [new[qid] for qid in shared_qids]

        rows.append(
            {
                'metric': name,
                'original': original_measures[name],
                'new': measures[name],
                'p_value': significance_test(original_values, new_values).pvalue,
                'original_variance': np.var(original_values),
                'new_variance': np.var(new_values),
            }
        )

    return pd.DataFrame(rows)


In [5]:
# Directory, relative to the working directory, that holds the qrel files.
qrel_directory = "judgments/main/qrels/"

# Dataset identifier string for the judged TREC DL 2019 passage collection.
DATASET = 'msmarco-passage/trec-dl-2019/judged'


In [6]:
# Load every '.txt' qrel file in `qrel_directory` into a DataFrame, keyed by
# annotator name.
# os.listdir returns entries in arbitrary order; sorting makes the annotator
# order (and the row order of anything built from `all_qrels`) reproducible.
all_qrels = {}
for file in sorted(os.listdir(qrel_directory)):
    if not file.endswith('.txt'):
        continue
    qrels = pd.DataFrame(read_trec_qrels(path.join(qrel_directory, file)))
    # Strip only the trailing extension, then drop the '-qrels' marker to get
    # the annotator name.
    annotator = path.splitext(file)[0].replace('-qrels', '')
    all_qrels[annotator] = qrels

In [7]:
# Directory containing the run files named below.
RUN_DIR = "runs/trec-dl-2019"

# BM25 run files.
BM25_BASE = "dl-19-official-input.bm25base_p.gz"
BM25_TUNED = "dl-19-official-input.bm25tuned_p.gz"

# Remaining run files.
COLBERT = "maik-froebe-colbert-run.txt"
SPLADE = "maik-froebe-splade-run.txt"
RANK_ZEPHYR = "maik-froebe-rank-zephyr-run.txt"
SET_ENCODER_COLBERT = "colbert_monoelectra-base_msmarco-passage-trec-dl-2019-judged.run"

In [8]:
def _read_run(file_name):
    """Load one run file from RUN_DIR with its columns renamed to query_id / doc_id."""
    results = pt.io.read_results(path.join(RUN_DIR, file_name))
    return results.rename(columns={'qid': 'query_id', 'docno': 'doc_id'})


BM25_TUNED_RUN = _read_run(BM25_TUNED)
BM25_BASE_RUN = _read_run(BM25_BASE)
SET_ENCODER_COLBERT_RUN = _read_run(SET_ENCODER_COLBERT)
COLBERT_RUN = _read_run(COLBERT)
SPLADE_RUN = _read_run(SPLADE)
RANK_ZEPHYR_RUN = _read_run(RANK_ZEPHYR)

# Short run name -> loaded run, in the order the runs are evaluated.
runs = dict(
    bm25_tuned=BM25_TUNED_RUN,
    bm25_base=BM25_BASE_RUN,
    set_encoder_colbert=SET_ENCODER_COLBERT_RUN,
    colbert=COLBERT_RUN,
    splade=SPLADE_RUN,
    rank_zephyr=RANK_ZEPHYR_RUN,
)

In [9]:
# Reset the results container to an empty list before the evaluation loop runs.
final_structure = []

In [10]:
# Evaluate every run against every annotator's qrels.
# The per-pair frames go into a cell-local list, so this cell can be re-run on
# its own. Appending to `final_structure` and then rebinding that name to the
# concatenated DataFrame left a re-run calling `.append` on a DataFrame.
evaluation_frames = []
for annotator, qrels in all_qrels.items():
    for run_name, run in runs.items():
        evaluation_frames.append(
            get_vals(run, qrels).assign(annotator=annotator, run_name=run_name)
        )

# ignore_index gives each row a unique label; without it every
# (annotator, run) block repeats the labels of its own per-metric frame.
final_structure = pd.concat(evaluation_frames, ignore_index=True)
        

  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


In [11]:
# Display the combined comparison table (one row per metric, annotator and run).
final_structure

Unnamed: 0,metric,original,new,p_value,original_variance,new_variance,annotator,run_name
0,AP(rel=2),0.392135,0.242412,0.175489,0.093514,0.044515,andrew-parry,bm25_tuned
1,nDCG@10,0.575140,0.344164,0.038261,0.068443,0.064711,andrew-parry,bm25_tuned
2,R(rel=2)@100,0.546917,0.535538,0.927941,0.093204,0.092840,andrew-parry,bm25_tuned
3,P(rel=2)@10,0.461538,0.223077,0.020217,0.083905,0.026391,andrew-parry,bm25_tuned
4,RR(rel=2),0.887179,0.477137,0.008798,0.070690,0.177377,andrew-parry,bm25_tuned
...,...,...,...,...,...,...,...,...
1,nDCG@10,0.626019,0.574421,0.626651,0.074097,0.079969,sean-macavaney,rank_zephyr
2,R(rel=2)@100,0.463101,0.530038,0.607040,0.108480,0.123362,sean-macavaney,rank_zephyr
3,P(rel=2)@10,0.493333,0.360000,0.256752,0.092622,0.093067,sean-macavaney,rank_zephyr
4,RR(rel=2),0.724444,0.706667,0.900380,0.125995,0.151289,sean-macavaney,rank_zephyr
