In [1]:
import json

import pandas as pd
from datasets import load_dataset
from evaluate import load
from dotenv import load_dotenv

# Dataset identifiers used throughout the notebook:
#   canonical_dataset_name -> directory name under ../data/ holding the qrels
#   dataset_name           -> name of the dataset loaded with `load_dataset`
canonical_dataset_name = "scifact"
dataset_name = "scifact-bge-m3-sparse-vectors"

# Pick up settings from a .env file, if one is present (python-dotenv).
load_dotenv()

In [2]:
# Load the "corpus" split of the sparse-vector dataset and print its summary
# (feature names and row count).
corpus_repo = f"nirantk/{dataset_name}"
ds = load_dataset(corpus_repo, split="corpus")
print(ds)

Dataset({
    features: ['_id', 'title', 'text', 'bge_m3_sparse_vector'],
    num_rows: 5183
})


In [3]:
# Load the "trec_eval" metric via `evaluate.load`; later cells score runs with
# `trec_eval.compute(predictions=[run], references=[qrel])`.
trec_eval = load("trec_eval")

## Example Qrels and Runs

In [4]:
# Toy example of the column-oriented layout passed to trec_eval: every key maps
# to a list with one entry per row.
# One judged document (doc_1, relevance 2) for query 0.
qrel = dict(query=[0], q0=["q0"], docid=["doc_1"], rel=[2])

# Two retrieved documents for query 0, with their rank and score.
run = dict(
    query=[0, 0],
    q0=["q0"] * 2,
    docid=["doc_2", "doc_1"],
    rank=[0, 1],
    score=[1.5, 1.2],
    system=["test"] * 2,
)

results = trec_eval.compute(references=[qrel], predictions=[run])

  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()


## Load reference Qrels from test.tsv

In [5]:
# Read the tab-separated test qrels for the canonical dataset.
qrels_path = f"../data/{canonical_dataset_name}/qrels/test.tsv"
df = pd.read_csv(qrels_path, sep="\t")
# NOTE: not the last expression of the cell, so Jupyter does not display it.
df.head()

# Convert the frame into the column-oriented qrel dict passed to trec_eval:
# integer query ids, a constant "q0" column, string doc ids, and the relevance
# taken from the "score" column.
query_ids = list(map(int, df["query-id"].tolist()))
doc_ids = list(map(str, df["corpus-id"].tolist()))
qrel = dict(
    query=query_ids,
    q0=["q0" for _ in range(len(df))],
    docid=doc_ids,
    rel=df["score"].tolist(),
)

In [6]:
def validate_data(predictions, references):
    """Check that a run dict and a qrel dict have the expected columns and types.

    Both arguments are column-oriented dicts: each key maps to a list holding
    one value per row.

    Args:
        predictions: Run dict. Must contain ``query`` (int), ``q0`` (str),
            ``docid`` (str), ``rank`` (int), ``score`` (float) and
            ``system`` (str).
        references: Qrel dict. Must contain ``query`` (int), ``q0`` (str),
            ``docid`` (str) and ``rel`` (int).

    Returns:
        A ``(pred_validation, ref_validation)`` tuple of strings. Each entry is
        ``"Valid"`` or a message describing the first problem found: a missing
        key, a value of the wrong type, or columns of differing lengths.
    """
    # Expected column name -> element type for runs and for qrels.
    expected_pred_keys = {
        'query': int, 'q0': str, 'docid': str, 'rank': int, 'score': float, 'system': str
    }
    expected_ref_keys = {
        'query': int, 'q0': str, 'docid': str, 'rel': int
    }

    def check_record(record, expected_keys):
        """Return "Valid" or a description of the first problem in ``record``."""
        for key, expected_type in expected_keys.items():
            if key not in record:
                return f"Missing key: {key}"
            # Report the type of the element that actually fails the check.
            # Looking only at the first element can produce a misleading
            # "expected float, got float" when a later element is the bad one.
            for item in record[key]:
                if not isinstance(item, expected_type):
                    return f"Incorrect type for key {key}, expected {expected_type}, got {type(item)}"

        # Every column (including any extra, unchecked ones) must have the
        # same number of rows.
        distinct_lengths = {len(column) for column in record.values()}
        if len(distinct_lengths) > 1:
            return "Inconsistent lengths among fields"

        return "Valid"

    # Validate predictions and references independently.
    pred_validation = check_record(predictions, expected_pred_keys)
    ref_validation = check_record(references, expected_ref_keys)

    return pred_validation, ref_validation

In [7]:
# Read the saved lexical run. The encoding is given explicitly so that decoding
# the JSON file does not depend on the platform's default locale encoding.
with open("bge-m3-lexical.run.json", encoding="utf-8") as f:
    run = json.load(f)

# validate_data(run, qrel)
# Score the run against the reference qrels with trec_eval.
results = trec_eval.compute(predictions=[run], references=[qrel])

  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()


In [8]:
results

{'runid': 'splade',
 'num_ret': 3000,
 'num_rel': 339,
 'num_rel_ret': 258,
 'num_q': 300,
 'map': 0.5812462962962962,
 'gm_map': 0.05696682318679945,
 'bpref': 0.0,
 'Rprec': 0.49155555555555547,
 'recip_rank': 0.5983492063492063,
 'P@5': 0.15533333333333332,
 'P@10': 0.086,
 'P@15': 0.05733333333333332,
 'P@20': 0.043,
 'P@30': 0.02866666666666666,
 'P@100': 0.0086,
 'P@200': 0.0043,
 'P@500': 0.00172,
 'P@1000': 0.00086,
 'NDCG@5': 0.6093764327586145,
 'NDCG@10': 0.6315781437701369,
 'NDCG@15': 0.6315781437701369,
 'NDCG@20': 0.6315781437701369,
 'NDCG@30': 0.6315781437701369,
 'NDCG@100': 0.6315781437701369,
 'NDCG@200': 0.6315781437701369,
 'NDCG@500': 0.6315781437701369,
 'NDCG@1000': 0.6315781437701369}

In [12]:
# Read the saved sentence-piece pair rescore run. The encoding is given
# explicitly so that decoding the JSON file does not depend on the platform's
# default locale encoding.
with open("bge-m3-sentence-piece-pair-rescore.run.json", encoding="utf-8") as f:
    run = json.load(f)

# validate_data(run, qrel)
# Score the run against the reference qrels with trec_eval and display the metrics.
results = trec_eval.compute(predictions=[run], references=[qrel])
results

  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()


{'runid': 'bge-m3',
 'num_ret': 2990,
 'num_rel': 339,
 'num_rel_ret': 159,
 'num_q': 299,
 'map': 0.3345407310081223,
 'gm_map': nan,
 'bpref': 0.0,
 'Rprec': 0.27480490523968787,
 'recip_rank': 0.34441922811488035,
 'P@5': 0.0903010033444816,
 'P@10': 0.05317725752508361,
 'P@15': 0.03545150501672241,
 'P@20': 0.026588628762541804,
 'P@30': 0.017725752508361205,
 'P@100': 0.005317725752508362,
 'P@200': 0.002658862876254181,
 'P@500': 0.0010635451505016725,
 'P@1000': 0.0005317725752508362,
 'NDCG@5': 0.35261771991066787,
 'NDCG@10': 0.3720435190652301,
 'NDCG@15': 0.3720435190652301,
 'NDCG@20': 0.3720435190652301,
 'NDCG@30': 0.3720435190652301,
 'NDCG@100': 0.3720435190652301,
 'NDCG@200': 0.3720435190652301,
 'NDCG@500': 0.3720435190652301,
 'NDCG@1000': 0.3720435190652301}

In [18]:
# Read the saved retokenize rescore run. The encoding is given explicitly so
# that decoding the JSON file does not depend on the platform's default locale
# encoding.
with open("bge-m3-retokenize-rescore.run.json", encoding="utf-8") as f:
    run = json.load(f)

# validate_data(run, qrel)
# Score the run against the reference qrels with trec_eval and display the metrics.
results = trec_eval.compute(predictions=[run], references=[qrel])
results

  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()


{'runid': 'splade',
 'num_ret': 3000,
 'num_rel': 339,
 'num_rel_ret': 194,
 'num_q': 300,
 'map': 0.4431805555555555,
 'gm_map': 0.007712705115718475,
 'bpref': 0.0,
 'Rprec': 0.38361111111111107,
 'recip_rank': 0.4547367724867725,
 'P@5': 0.11199999999999997,
 'P@10': 0.06466666666666666,
 'P@15': 0.043111111111111114,
 'P@20': 0.03233333333333333,
 'P@30': 0.021555555555555557,
 'P@100': 0.006466666666666667,
 'P@200': 0.0032333333333333333,
 'P@500': 0.0012933333333333334,
 'P@1000': 0.0006466666666666667,
 'NDCG@5': 0.45788106744585555,
 'NDCG@10': 0.48170774143122647,
 'NDCG@15': 0.48170774143122647,
 'NDCG@20': 0.48170774143122647,
 'NDCG@30': 0.48170774143122647,
 'NDCG@100': 0.48170774143122647,
 'NDCG@200': 0.48170774143122647,
 'NDCG@500': 0.48170774143122647,
 'NDCG@1000': 0.48170774143122647}