In [15]:
import pandas as pd
import ir_datasets as irds
from ir_measures import read_trec_qrels
from ir_measures import * 
from ir_measures import evaluator
import pyterrier as pt
from scipy.stats import ttest_ind
import os
from os import path as path

In [16]:
def get_vals(run : str, qrel_file : str):
    original_qrels = pd.DataFrame(irds.load("msmarco-passage/trec-dl-2019/judged").qrels_iter())
    qrels = read_trec_qrels(qrel_file)

    # Compute the performance per annotator
    metrics = [AP(rel=2), NDCG(cutoff=10), R(rel=2)@100, P(rel=2, cutoff=10), RR(rel=2), RR(rel=2, cutoff=10)]

    original_evaluate = evaluator(metrics, original_qrels)
    evaluate = evaluator(metrics, qrels)

    # calculate aggregate performance

    original_measures = original_evaluate.calc_aggregate(run)
    measures = evaluate.calc_aggregate(run)

    # calculate per query

    original_result = {
       str(name) : {'query_id' : {}} for name in metrics
    }

    result = {
         str(name) : {'query_id' : {}} for name in metrics
    }


    for metric in original_evaluate.iter_calc(run):
        original_result[str(metric)][str(metric.query_id)] = metric.value

    for metric in evaluate.iter_calc(run):
        result[str(metric)][str(metric.query_id)] = metric.value

    # calculate t-test

    t_test = {}

    for metric in metrics:
        t_test[str(metric)] = ttest_ind(original_result[str(metric)]['query_id'].values(), result[str(metric)]['query_id'].values()).pvalue
    
    return original_measures, measures, t_test


In [17]:
DATASET = "msmarco-passage/trec-dl-2019/judged"
qrel_directory = '/home/andrew/Documents/Code/Annotation/judgments/pilot-round-01/qrels/'
annotation_directory = '/home/andrew/Documents/Code/Annotation/judgments/pilot-round-01/doccano/'

In [18]:
all_qrels = {}
for file in os.listdir(qrel_directory):
    if file.endswith('.txt'):
        qrels = pd.DataFrame(read_trec_qrels(qrel_directory + file))
        annotator = file.replace('.txt', '').replace('-qrels', '')
        all_qrels[annotator] = qrels

In [19]:
RUN_DIR = '/home/andrew/Documents/Code/Annotation/runs/trec-dl-2019'
BM25_TUNED = 'dl-19-official-input.bm25tuned_p.gz'
BM25_BASE = 'dl-19-official-input.bm25base_p.gz'
SET_ENCODER_COLBERT = 'colbert_monoelectra-base_msmarco-passage-trec-dl-2019-judged.run'
COLBERT = 'maik-froebe-colbert-run.txt'
SPLADE = 'maik-froebe-splade-run.txt'
RANK_ZEPHYR = 'maik-froebe-rank-zephyr-run.txt'

In [20]:
BM25_TUNED_RUN = pt.io.read_results(path.join(RUN_DIR, BM25_TUNED)).rename(columns={'qid': 'query_id', 'docno': 'doc_id'})
BM25_BASE_RUN = pt.io.read_results(path.join(RUN_DIR, BM25_BASE)).rename(columns={'qid': 'query_id', 'docno': 'doc_id'})
SET_ENCODER_COLBERT_RUN = pt.io.read_results(path.join(RUN_DIR, SET_ENCODER_COLBERT)).rename(columns={'qid': 'query_id', 'docno': 'doc_id'})
COLBERT_RUN = pt.io.read_results(path.join(RUN_DIR, COLBERT)).rename(columns={'qid': 'query_id', 'docno': 'doc_id'})
SPLADE_RUN = pt.io.read_results(path.join(RUN_DIR, SPLADE)).rename(columns={'qid': 'query_id', 'docno': 'doc_id'})
RANK_ZEPHYR_RUN = pt.io.read_results(path.join(RUN_DIR, RANK_ZEPHYR)).rename(columns={'qid': 'query_id', 'docno': 'doc_id'})

runs = {
    'bm25_tuned': BM25_TUNED_RUN,
    'bm25_base': BM25_BASE_RUN,
    'set_encoder_colbert': SET_ENCODER_COLBERT_RUN,
    'colbert': COLBERT_RUN,
    'splade': SPLADE_RUN,
    'rank_zephyr': RANK_ZEPHYR_RUN
}

In [21]:
final_structure = {
    'run': [],
    'original_measures': [],
    'measures': [],
    't_test': [],
    'annotator': []
}

In [22]:
for annotator, qrels in all_qrels.items():
    for run_name, run in runs.items():
        original_measures, measures, t_test = get_vals(run, qrels)
        final_structure['run'].append(run_name)
        final_structure['original_measures'].append(original_measures)
        final_structure['measures'].append(measures)
        final_structure['t_test'].append(t_test)
        final_structure['annotator'].append(annotator)

TypeError: argument of type 'method' is not iterable