In [32]:
import os
import pyterrier as pt
# Java locations are exported through the environment before pt.init() is called.
# NOTE(review): these are absolute paths on one machine — presumably they must be
# adjusted (or provided outside the notebook) when running elsewhere.
os.environ["JAVA_HOME"] = "/home/andrew/Java"
os.environ["JVM_PATH"] = '/home/andrew/Java/jre/lib/server/libjvm.so'
# pt.init() is skipped when pt.started() already reports a running instance.
if not pt.started():
    pt.init()
import ir_datasets as irds
import pandas as pd
# NOTE(review): the wildcard import is what brings measure names such as nDCG
# into scope for the cells below; explicit imports would make that visible.
from ir_measures import *
from ir_measures import evaluator

In [33]:
# Identifier handed to irds.load() below.
DATASET = "msmarco-passage/trec-dl-2019/judged"
# Directory whose entries are each read as one run file.
# NOTE(review): absolute local path — adjust when running on another machine.
RUN_DIR = '/home/andrew/Documents/Code/Annotation/runs/trec-dl-2019'

In [34]:
# Load the dataset handle, then materialise its qrels iterator as a DataFrame.
# Later cells read the `relevance` column of this frame and pass it to the evaluator.
dataset = irds.load(DATASET)
original_qrels = pd.DataFrame(dataset.qrels_iter())

In [35]:
# Read every entry of RUN_DIR as a run, rename the qid/docno columns to
# query_id/doc_id, and tag each row with the file it was read from.
run_frames = [
    pt.io.read_results(os.path.join(RUN_DIR, run_file))
    .rename(columns={'qid': 'query_id', 'docno': 'doc_id'})
    .assign(runname=run_file)
    for run_file in os.listdir(RUN_DIR)
]

# Stack the per-file frames into one long table of all runs.
all_runs = pd.concat(run_frames)

In [36]:
# Measures handed to the evaluator: nDCG@10 only.
metrics = [nDCG@10]

In [37]:
# One evaluator built from the measure list and the original qrels; the cell
# below calls its iter_calc() once per run.
evaluate = evaluator(metrics, original_qrels)

In [49]:
# Score each run whose name starts with 'dl-19' and flatten the evaluator's
# per-query results into one record per (run, query, measure).
all_metrics = pd.DataFrame([
    {
        'runname': run_name,
        'query_id': scored.query_id,
        'metric': scored.measure,
        'value': scored.value,
    }
    for run_name, run_results in all_runs.groupby('runname')
    if run_name.startswith('dl-19')
    for scored in evaluate.iter_calc(run_results)
])

# Read Pool

In [60]:
# NOTE(review): despite the name, POOL_DIR points at a single .jsonl file, not a
# directory; it is also an absolute local path.
POOL_DIR = '/home/andrew/Documents/Code/Annotation/pooling/pool-pilot-study.jsonl'
# lines=True: the file is parsed as one JSON record per line.
pool = pd.read_json(POOL_DIR, lines=True)

In [61]:
# Display the loaded pool as this cell's output.
pool

Unnamed: 0,text,query,doc_id,query_id,label
0,Chamber Hypertrophy and Enlargment. In hypertr...,causes of left ventricular hypertrophy,2986227,87181,[]
1,The last common cause of right ventricular hyp...,causes of left ventricular hypertrophy,5197133,87181,[]
2,"Over time, this can cause the heart muscle to ...",causes of left ventricular hypertrophy,5469038,87181,[]
3,"Search for new homes, open houses, recently so...",causes of left ventricular hypertrophy,2396481,87181,[]
4,Ventricular hypertrophy. Ventricular hypertrop...,causes of left ventricular hypertrophy,47212,87181,[]
...,...,...,...,...,...
95,Most providers would have you believe that a l...,what does it mean if your tsh is low,1226400,640502,[]
96,1 A low TSH level with a high FT4 level and a ...,what does it mean if your tsh is low,4359297,640502,[]
97,The four TSH assays showed good agreement at l...,what does it mean if your tsh is low,6728990,640502,[]
98,Low TSH and high FT4 levels are usually seen i...,what does it mean if your tsh is low,6730883,640502,[]


## QREL CHECKS

In [55]:
# Number of judgements per relevance grade; the resulting Series is indexed by
# the grade value, which is how the cells below look counts up (values[0], ...).
values = original_qrels.relevance.value_counts()

In [59]:
len(original_qrels) # total number of judgements (one per row of original_qrels)

9260

In [57]:
# Percentage of judgements with grade 0 (non-relevant).
# The denominator is the total over every grade present in `values`, so the
# figure stays correct if the qrels use fewer or more than four grades.
values[0] / values.sum() * 100

55.70194384449244

In [58]:
# Percentage of judgements with grade 0 or 1 (non-relevant and partially relevant).
# The denominator is the total over every grade present in `values`, so the
# figure stays correct if the qrels use fewer or more than four grades.
(values[0] + values[1]) / values.sum() * 100

72.9913606911447

# Per QID pseudo-Oracle

In [50]:
# Find the best performance for each (query_id, metric) pair, keeping the whole
# row so the run name and the value are both retained.
# idxmax yields the row label of each group's maximum; selecting those labels
# with .loc avoids groupby.apply with a row-returning lambda and preserves the
# original column dtypes. This relies on all_metrics having unique row labels.
best_idx = all_metrics.groupby(['query_id', 'metric'])['value'].idxmax()
best_performances = all_metrics.loc[best_idx].reset_index(drop=True)

In [51]:
# Using the best performance, collect for each query the ranking produced by
# that query's best run.
all_rankings = []
for row in best_performances.itertuples():
    # Combine the two conditions element-wise with `&` (each side parenthesised).
    # Python's `and` would try to collapse a whole Series into one bool and
    # raise "The truth value of a Series is ambiguous".
    is_best_run = (all_runs['query_id'] == row.query_id) & (all_runs['runname'] == row.runname)
    all_rankings.append(all_runs.loc[is_best_run])

all_rankings = pd.concat(all_rankings)

Unnamed: 0,runname,query_id,metric,value
0,dl-19-official-input.ICT-CKNRM_B50.gz,1037798,nDCG@10,0.403132
1,dl-19-official-input.TUA1-1.gz,104861,nDCG@10,1.0
2,dl-19-official-input.p_exp_rm3_bert.gz,1063750,nDCG@10,0.810045
3,dl-19-official-input.TUW19-p1-f.gz,1103812,nDCG@10,0.911504
4,dl-19-official-input.ICT-CKNRM_B.gz,1106007,nDCG@10,0.867676
5,dl-19-official-input.idst_bert_p2.gz,1110199,nDCG@10,0.889701
6,dl-19-official-input.idst_bert_pr2.gz,1112341,nDCG@10,0.866518
7,dl-19-official-input.TUW19-p2-f.gz,1113437,nDCG@10,0.581753
8,dl-19-official-input.TUW19-p3-f.gz,1114646,nDCG@10,0.842532
9,dl-19-official-input.TUW19-p1-f.gz,1114819,nDCG@10,0.924021


In [52]:
# Mean of the best per-query values selected above.
best_performances.value.mean()

0.8309784091185828

# Custom nDCG

In [None]:
# from https://github.com/kmbnw/rank_metrics/blob/master/python/ndcg.py

# Copyright 2016 Krysta M Bouzek
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#    http://www.apache.org/licenses/LICENSE-2.0

#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.

import numpy as np

"""
Implementation of normalized discounted cumulative gain.

Handy for testing ranking algorithms.

https://en.wikipedia.org/wiki/Discounted_cumulative_gain
"""

def cum_gain(relevance):
    """
    Sum the graded relevances with no positional discount.

    @param relevance: Graded relevances of the results.
    @type relevance: C{seq} or C{numpy.array}
    @return: The total gain, or 0.0 when relevance is None or empty.
    """
    has_values = relevance is not None and len(relevance) >= 1
    if not has_values:
        return 0.0

    grades = np.asarray(relevance)
    return np.sum(grades)


def dcg(relevance, alternate=True):
    """
    Discounted cumulative gain of a ranked list of graded relevances.

    @param relevance: Graded and ordered relevances of the results.
    @type relevance: C{seq} or C{numpy.array}
    @param alternate: True to use the exponential-gain formulation,
    (2^rel - 1) / log2(rank + 1); False to use the plain formulation,
    rel_1 + sum over ranks >= 2 of rel / log2(rank).
    @type alternate: C{bool}
    @return: The DCG, or 0.0 when relevance is None or empty.
    """
    if relevance is None or len(relevance) < 1:
        return 0.0

    gains = np.asarray(relevance)
    n_ranks = len(gains)

    if not alternate:
        # Plain form: the first rank is undiscounted, ranks 2..n are
        # divided by log2 of their 1-based rank.
        discounts = np.log2(np.arange(2, n_ranks + 1))
        discounted_tail = gains[1:] / discounts
        return gains[0] + discounted_tail.sum()

    # Exponential-gain form: every 1-based rank i is discounted by log2(i + 1).
    discounts = np.log2(np.arange(1, n_ranks + 1) + 1)
    exponential_gains = np.power(2, gains) - 1
    return (exponential_gains / discounts).sum()


def idcg(relevance, alternate=True):
    """
    Ideal DCG: the DCG of the given relevances after arranging them from
    highest to lowest, i.e. the maximum DCG this set of grades can reach.

    @param relevance: Graded relevances of the results (any order).
    @type relevance: C{seq} or C{numpy.array}
    @param alternate: Forwarded to dcg() to select the scoring formulation.
    @type alternate: C{bool}
    @return: The ideal DCG, or 0.0 when relevance is None or empty.
    """
    if relevance is None or len(relevance) < 1:
        return 0.0

    # np.sort hands back a sorted copy, so the caller's data is left untouched;
    # reversing the ascending result gives best-first order.
    best_first = np.sort(np.asarray(relevance))[::-1]
    return dcg(best_first, alternate)

def pidcg(upper_bound, alternate=True):
    """
    Pseudo-ideal DCG: the DCG of upper_bound taken in the order given.
    Unlike idcg(), the values are not re-sorted.

    @param upper_bound: Graded relevances in upper-bound ranking order.
    @type upper_bound: C{seq} or C{numpy.array}
    @param alternate: Forwarded to dcg() to select the scoring formulation.
    @type alternate: C{bool}
    @return: The DCG of upper_bound, or 0.0 when it is None or empty.
    """
    if upper_bound is None or len(upper_bound) < 1:
        return 0.0

    # Score a private copy so the caller's sequence is never shared with dcg().
    ordered = np.array(upper_bound, copy=True)
    return dcg(ordered, alternate)


def pndcg(relevance, upper_bound, nranks, alternate=True):
    """
    Calculate pseudo normalized discounted cumulative gain: the DCG of the
    first nranks relevances divided by the ideal DCG of upper_bound.

    @param relevance: Graded and ordered relevances of the results.
    @type relevance: C{seq} or C{numpy.array}
    @param upper_bound: Graded relevances used for normalisation. They are
    sorted best-first by idcg() and are NOT truncated to nranks here.
    @type upper_bound: C{seq} or C{numpy.array}
    @param nranks: Number of ranks to use when calculating NDCG.
    Will be used to rightpad with zeros if len(relevance) is less
    than nranks
    @type nranks: C{int}
    @param alternate: True to use the alternate scoring (intended to
    place more emphasis on relevant results).
    @type alternate: C{bool}
    @return: DCG / ideal DCG, or 0.0 when relevance is None or empty or the
    ideal DCG is zero.
    @raise ValueError: If nranks < 1 (checked only for non-empty relevance).
    """
    if relevance is None or len(relevance) < 1:
        return 0.0

    if nranks < 1:
        # ValueError is still caught by callers handling the generic Exception.
        raise ValueError('nranks < 1')

    rel = np.asarray(relevance)

    # Right-pad with zeros up to nranks (a no-op when already long enough),
    # then keep exactly the first nranks entries.
    pad = max(0, nranks - len(rel))
    rel = np.pad(rel, (0, pad), 'constant')[:nranks]

    # NOTE(review): normalisation uses idcg() (re-sorted) rather than pidcg()
    # (order-preserving) — confirm this is the intended denominator.
    ideal_dcg = idcg(upper_bound, alternate)
    if ideal_dcg == 0:
        return 0.0

    return dcg(rel, alternate) / ideal_dcg

In [None]:
class ndcg_process():
    """
    Per-query pseudo-nDCG evaluator that normalises each run against an
    upper-bound ranking instead of against the full set of judgements.

    @param qrels: Judgements with query_id, doc_id and relevance columns.
    @type qrels: C{pandas.DataFrame}
    @param upper_bounds: Upper-bound rankings with query_id, doc_id and
    score columns.
    @type upper_bounds: C{pandas.DataFrame}
    @param cutoff: Number of top-ranked results scored per query.
    @type cutoff: C{int}
    @param scorer: Optional callable (result_relevance, upper_bound_relevance,
    cutoff) -> value. When None, the module-level pndcg is used.
    @type scorer: C{callable} or C{None}
    """

    def __init__(self, qrels, upper_bounds, cutoff=10, scorer=None) -> None:
        self.qrels = qrels
        self.upper_bounds = upper_bounds
        self.cutoff = cutoff
        # Kept as given; the pndcg default is looked up at call time in calc().
        self.scorer = scorer

    def calc(self, qrels, results, upper_bound):
        """
        Score one query.

        @param qrels: Judgements for this query (doc_id, relevance columns).
        @param results: The run's rows for this query (doc_id, score columns).
        @param upper_bound: Upper-bound rows for this query (doc_id, score).
        @return: Whatever the scorer returns for the top-cutoff results.
        """
        relevance_of = qrels.set_index('doc_id').relevance.to_dict()

        # Top `cutoff` results by descending score. DataFrame.sort no longer
        # exists in pandas; sort_values is its replacement.
        top = results.sort_values('score', ascending=False).head(self.cutoff)

        # Restrict the upper bound to the documents that were actually retrieved.
        retrieved = upper_bound['doc_id'].isin(top['doc_id'])
        bound = upper_bound.loc[retrieved].sort_values('score', ascending=False)

        # Retrieved documents absent from the upper bound go to the end with a
        # score of 0; assign() builds a new frame rather than writing to a slice.
        missing = top.loc[~top['doc_id'].isin(bound['doc_id'])].assign(score=0)
        bound = pd.concat([bound, missing])

        # Unjudged documents count as relevance 0.
        result_relevance = np.array([relevance_of.get(doc_id, 0) for doc_id in top['doc_id']])
        upper_bound_relevance = np.array([relevance_of.get(doc_id, 0) for doc_id in bound['doc_id']])

        scorer = self.scorer if self.scorer is not None else pndcg
        return scorer(result_relevance, upper_bound_relevance, self.cutoff)

    def iter_calc(self, run):
        """
        Yield {'query_id': ..., 'value': ...} for each query in the run.

        @param run: Run rows with query_id, doc_id and score columns.
        """
        for qid, results in run.groupby('query_id'):
            qrels = self.qrels.loc[self.qrels['query_id'] == qid]
            upper_bound = self.upper_bounds.loc[self.upper_bounds['query_id'] == qid]
            yield {'query_id': qid, 'value': self.calc(qrels, results, upper_bound)}

    def calc_aggregate(self, run):
        """
        Mean of the per-query values over the run.

        @param run: Run rows with query_id, doc_id and score columns.
        @return: {'value': mean}; the mean is 0.0 for a run with no queries,
        matching the 0.0-on-empty convention of the metric functions.
        """
        values = [entry['value'] for entry in self.iter_calc(run)]
        if not values:
            return {'value': 0.0}
        return {'value': sum(values) / len(values)}