In [1]:
import ir_measures
from ir_measures import MAP, P, nDCG, Qrel, ScoredDoc
import json 
import pandas as pd

### Loading Ground Truth

In [2]:
# Load the validation ground truth. The loop that builds the qrels expects a
# list of query records, each with an 'id' and a 'relevant_docs' list.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the file is
# decoded the same way on every platform instead of using the locale default.
with open('./datasets/validation_data/ground_truth_val.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# Turn the scored ground truth into binary relevance judgments: a
# (query, document) pair is kept with label 1 only when its score is strictly
# greater than 0.25; every other pair is left out of the qrels.
# IDs are stringified because the runs are built with string IDs as well.
qrels = [
    Qrel(str(entry['id']), str(judged[0]), 1)
    for entry in data
    for judged in entry['relevant_docs']
    if judged[1] > 0.25
]

### Evaluating benchmarks with ColBERT-only retrieval

In [4]:
# Score the ColBERT-only run against the validation qrels.
res_df = pd.read_csv('./outputs/output_val_colbert.csv')

# Build the run column-wise rather than with DataFrame.iterrows(): iterrows()
# coerces every row to a single dtype, so integer ID columns that sit next to
# the float score column are routed through float64 before int() converts them
# back (lossy for IDs above 2**53, and slow). int() is kept so that IDs parsed
# as floats still normalise to plain integer strings.
runs = [
    ScoredDoc(str(int(query_id)), str(int(body_id)), score)
    for query_id, body_id, score in zip(
        res_df['query_ID'], res_df['retrieved_body_ID'], res_df['Similarity Score']
    )
]

# Aggregate MAP, P@10 and nDCG over all queries in the run.
print(ir_measures.calc_aggregate([MAP, P@10, nDCG], qrels, runs))

{nDCG: 0.4878523879259779, P@10: 0.4, AP: 0.2606678330508347}


### Evaluating benchmarks with ColBERT + MiniLM-L-6-v2 reranker

In [5]:
# Score the ColBERT + reranker run against the validation qrels.
res_df = pd.read_csv('./outputs/output_val_colbert+reranker.csv')

# Build the run column-wise rather than with DataFrame.iterrows(): iterrows()
# coerces every row to a single dtype, so integer ID columns that sit next to
# the float score column are routed through float64 before int() converts them
# back (lossy for IDs above 2**53, and slow). int() is kept so that IDs parsed
# as floats still normalise to plain integer strings.
runs = [
    ScoredDoc(str(int(query_id)), str(int(body_id)), score)
    for query_id, body_id, score in zip(
        res_df['query_ID'], res_df['retrieved_body_ID'], res_df['Similarity Score']
    )
]

# Aggregate MAP, P@10 and nDCG over all queries in the run.
print(ir_measures.calc_aggregate([MAP, P@10, nDCG], qrels, runs))

{nDCG: 0.6636762746319775, P@10: 0.6476190476190476, AP: 0.4759561002148748}


### Evaluating benchmarks with all-mpnet-base-v2

In [4]:
# Score the all-mpnet-base-v2 run against the validation qrels.
res_df = pd.read_csv('./outputs/output_val_all-mpnet-base-v2.csv')

# Build the run column-wise rather than with DataFrame.iterrows(): iterrows()
# coerces every row to a single dtype, so integer ID columns that sit next to
# the float score column are routed through float64 before int() converts them
# back (lossy for IDs above 2**53, and slow). int() is kept so that IDs parsed
# as floats still normalise to plain integer strings.
runs = [
    ScoredDoc(str(int(query_id)), str(int(body_id)), score)
    for query_id, body_id, score in zip(
        res_df['query_ID'], res_df['retrieved_body_ID'], res_df['Similarity Score']
    )
]

# Aggregate MAP, P@10 and nDCG over all queries in the run.
print(ir_measures.calc_aggregate([MAP, P@10, nDCG], qrels, runs))

{AP: 0.16583446445743538, P@10: 0.06666666666666668, nDCG: 0.4604490382512478}


### Evaluating benchmarks with all-mpnet-base-v2 + MiniLM-L-6-v2

In [6]:
# Score the reranked all-mpnet-base-v2 run against the validation qrels.
res_df = pd.read_csv('./outputs/output_val_all-mpnet-base-v2_reranked.csv')

# Build the run column-wise rather than with DataFrame.iterrows(): iterrows()
# coerces every row to a single dtype, so integer ID columns that sit next to
# the float score column are routed through float64 before int() converts them
# back (lossy for IDs above 2**53, and slow). int() is kept so that IDs parsed
# as floats still normalise to plain integer strings.
runs = [
    ScoredDoc(str(int(query_id)), str(int(body_id)), score)
    for query_id, body_id, score in zip(
        res_df['query_ID'], res_df['retrieved_body_ID'], res_df['Similarity Score']
    )
]

# Aggregate MAP, P@10 and nDCG over all queries in the run.
print(ir_measures.calc_aggregate([MAP, P@10, nDCG], qrels, runs))

{AP: 0.5680998648720024, P@10: 0.6380952380952379, nDCG: 0.7697985309363473}


### Evaluating benchmarks with tb17/MathBERT + MiniLM-L-6-v2

In [7]:
# Score the reranked MathBERT run against the validation qrels.
res_df = pd.read_csv('./outputs/output_val_math_bert_reranked.csv')

# Build the run column-wise rather than with DataFrame.iterrows(): iterrows()
# coerces every row to a single dtype, so integer ID columns that sit next to
# the float score column are routed through float64 before int() converts them
# back (lossy for IDs above 2**53, and slow). int() is kept so that IDs parsed
# as floats still normalise to plain integer strings.
runs = [
    ScoredDoc(str(int(query_id)), str(int(body_id)), score)
    for query_id, body_id, score in zip(
        res_df['query_ID'], res_df['retrieved_body_ID'], res_df['Similarity Score']
    )
]

# Aggregate MAP, P@10 and nDCG over all queries in the run.
print(ir_measures.calc_aggregate([MAP, P@10, nDCG], qrels, runs))

{AP: 0.19650443390474479, P@10: 0.3666666666666667, nDCG: 0.37352356872469084}
