## Attempt using a simple reranking pipeline

In [1]:
import os
from pathlib import Path

import ir_datasets
import pyterrier as pt
from scipy.stats import ttest_rel
from sentence_transformers import CrossEncoder
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tirex_tracker import tracking, ExportFormat
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
output = Path("./output")


def run_tira_experiment(pipeline, pipeline_name, dataset_id, topics, description):
    """Run ``pipeline`` on ``topics`` once and store the result under ``output``.

    The run is written to ``output/runs/<dataset_id>/<pipeline_name>/run.txt.gz``
    and an ``ir-metadata.yml`` file is exported into the same directory by the
    tracking context. When the run file already exists, nothing is executed.

    Args:
        pipeline: Object exposing ``transform(topics)`` (the retrieval pipeline).
        pipeline_name: Name of the run directory; also passed as system name.
        dataset_id: Name of the dataset directory below ``output/runs``.
        topics: Queries handed to ``pipeline.transform``.
        description: Free-text system description passed to the tracker.

    Returns:
        None.
    """
    run_dir = output / "runs" / dataset_id / pipeline_name
    run_file = run_dir / "run.txt.gz"

    if not run_file.exists():
        run_dir.mkdir(parents=True, exist_ok=True)

        print(f"--- Starte: {pipeline_name} ---")

        tracking_options = dict(
            export_file_path=run_dir / "ir-metadata.yml",
            export_format=ExportFormat.IR_METADATA,
            system_description=description,
            system_name=pipeline_name,
        )
        # Only the retrieval itself runs inside the tracking context;
        # writing the run file happens afterwards.
        with tracking(**tracking_options):
            results = pipeline.transform(topics)

        pt.io.write_results(results, str(run_file))
        print(f"Gespeichert in: {run_file}")
    else:
        print(f"Run existiert bereits: {run_dir}")

Load index and corpus

In [3]:
# Initialise PyTerrier first; the pt.* calls below need it to be loaded.
ensure_pyterrier_is_loaded()
# Dataset identifier; reused below for the ir_datasets lookups and the run output path.
ds_id = "radboud-validation-20251114-training"
dataset = pt.datasets.get_dataset(f"irds:ir-lab-wise-2025/{ds_id}")
# Queries are taken from the "title" variant of the topics.
topics = dataset.get_topics("title")
# Relevance judgments used by the evaluation cells.
qrels = dataset.get_qrels()

Java started and loaded: pyterrier.java.colab, pyterrier.java, pyterrier.java.24, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


In [4]:
index_path = "./output/indexes/radboud-validation"
abs_index_path = os.path.abspath(index_path)

# The presence of data.properties is used as the marker of an already-built index.
if os.path.exists(f"{abs_index_path}/data.properties"):
    print("Loading existing index...")
    index = pt.IndexFactory.of(abs_index_path)
else:
    print("Building index...")
    irds_ds = ir_datasets.load(f"ir-lab-wise-2025/{ds_id}")

    def doc_iter():
        """Yield one ``{'docno', 'text'}`` record per document of the corpus."""
        for doc in tqdm(irds_ds.docs_iter(), desc="Indexing"):
            yield {'docno': doc.doc_id, 'text': doc.default_text()}

    # Document ids are stored as index metadata with a length setting of 100.
    indexer = pt.IterDictIndexer(abs_index_path, meta={'docno': 100})
    index_ref = indexer.index(doc_iter())
    index = pt.IndexFactory.of(index_ref)

print(f"Index loaded: {index}")

Loading existing index...
Index loaded: <org.terrier.structures.Index at 0x77c57b5757b0 jclass=org/terrier/structures/Index jself=<LocalRef obj=0x5928f3e7f790 at 0x77c57b65bd70>>


Load models and pipelines

In [5]:
irds_ds = ir_datasets.load(f"ir-lab-wise-2025/{ds_id}")

# Hold every document text in memory, keyed by document id, so the
# re-ranking stage can attach texts to retrieved documents.
doc_text_map = dict(
    (doc.doc_id, doc.default_text())
    for doc in tqdm(irds_ds.docs_iter(), desc="Loading Docs")
)


def _lookup_text(row):
    """Return the text stored for ``row['docno']``, or ``""`` if the id is unknown."""
    docno = row['docno']
    return doc_text_map.get(docno, "")


text_getter = pt.apply.text(_lookup_text)

Loading Docs: 63621it [00:22, 2812.88it/s]


In [6]:
# Cross-encoder used for re-ranking; inference is pinned to the CPU.
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device="cpu")
# pt.BatchRetrieve is deprecated in current PyTerrier releases;
# pt.terrier.Retriever is its replacement and takes the same arguments here.
bm25 = pt.terrier.Retriever(index, wmodel="BM25")


def _cross_encoder_score(row):
    """Score a single query-document pair with the cross-encoder.

    Args:
        row: Mapping/Series providing the ``'query'`` and ``'text'`` fields.

    Returns:
        The model's prediction for the pair as a Python ``float``.
    """
    # predict() takes a list of pairs; a one-element list is passed and its
    # single prediction is unwrapped.
    return float(model.predict([(row['query'], row['text'])])[0])

  bm25 = pt.BatchRetrieve(index, wmodel="BM25")


In [7]:
# Row-wise transformer built from _lookup_text; supplies the 'text' field that
# _cross_encoder_score reads.
get_text = pt.apply.text(_lookup_text)
# Row-wise transformer that computes a document score with _cross_encoder_score.
reranker = pt.apply.doc_score(_cross_encoder_score)

gamma = 0.1 

def normalize_scores(df):
    df = df.copy()
    epsilon = 1e-9
    
    # normalize BM25
    df['bm25_norm'] = df.groupby('qid')['bm25_score'].transform(
        lambda x: (x - x.min()) / (x.max() - x.min() + epsilon)
    )
    
    # normalize Cross-Encoder
    df['score_norm'] = df.groupby('qid')['score'].transform(
        lambda x: (x - x.min()) / (x.max() - x.min() + epsilon)
    )
    return df

# Function for interpolation
def _interpolate_scores(row):
    return gamma * row['bm25_score'] + (1 - gamma) * row['score']

# Re-ranking pipeline; stages are chained left to right with >>.
pipeline_simple_rerank = (
    bm25 % 50  # BM25 candidates, cut off at 50 per query
    >> pt.apply.rename({'score': 'bm25_score'})  # keep the BM25 score under its own column name
    >> get_text  # attach the document text looked up via _lookup_text
    >> reranker  # score each query-document pair with the cross-encoder
    >> pt.apply.generic(normalize_scores)  # add the per-query bm25_norm / score_norm columns
    >> pt.apply.doc_score(_interpolate_scores)  # final score: gamma-weighted combination from _interpolate_scores
)

Run experiments

In [8]:
print("Starte Simple Re-Ranking Experiment...")

# Evaluate the plain BM25 retriever against the re-ranking pipeline on the
# same topics and relevance judgments.
experiment = pt.Experiment(
    retr_systems=[bm25, pipeline_simple_rerank],
    topics=topics,
    qrels=qrels,
    names=["BM25 Baseline", "BM25 + Cross-Encoder (Top 50)"],
    eval_metrics=["map", "ndcg_cut_10", "P_10"],
    verbose=True,
)
print(experiment)

Starte Simple Re-Ranking Experiment...


pt.Experiment: 100%|██████████| 2/2 [02:07<00:00, 63.83s/system]

                            name       map      P_10  ndcg_cut_10
0                  BM25 Baseline  0.381395  0.432143     0.451635
1  BM25 + Cross-Encoder (Top 50)  0.402169  0.492857     0.537967





In [9]:
# Persist the re-ranking run; run_tira_experiment returns early without
# executing the pipeline when the run file already exists.
run_tira_experiment(
    pipeline=pipeline_simple_rerank,
    pipeline_name="simple_rerank",
    dataset_id=ds_id,
    topics=topics,
    description="Standard BM25 retrieval with ms-marco reranker."
)

Run existiert bereits: output/runs/radboud-validation-20251114-training/simple_rerank


## Hypothesis tests

In [10]:
# Per-query nDCG@10 for both systems (one row per system, query and measure);
# this long-format frame is the input of the paired significance tests below.
experiment = pt.Experiment(
    retr_systems=[
        bm25,
        pipeline_simple_rerank,
    ],
    topics=topics,
    qrels=qrels,
    eval_metrics=["ndcg_cut_10"],
    names=["BM25", "BM25_marco_rerank"],
    verbose=True,
    perquery=True
)
# Fixed seed so the preview shows the same rows on every Restart & Run All.
experiment.sample(n=10, random_state=0)

pt.Experiment:   0%|          | 0/2 [00:00<?, ?system/s]

pt.Experiment: 100%|██████████| 2/2 [02:04<00:00, 62.45s/system]


Unnamed: 0,name,qid,measure,value
27,BM25,74,ndcg_cut_10,0.488341
20,BM25,52,ndcg_cut_10,0.0
44,BM25_marco_rerank,43,ndcg_cut_10,0.739405
55,BM25_marco_rerank,74,ndcg_cut_10,0.615917
53,BM25_marco_rerank,68,ndcg_cut_10,0.390379
31,BM25_marco_rerank,8,ndcg_cut_10,0.443482
37,BM25_marco_rerank,23,ndcg_cut_10,0.694407
29,BM25_marco_rerank,4,ndcg_cut_10,0.791132
24,BM25,64,ndcg_cut_10,0.75097
12,BM25,32,ndcg_cut_10,0.676489


In [11]:
# Split the per-query results by system; the system name is dropped because
# it becomes part of the column suffixes after the merge.
experiment_bm25 = (
    experiment
    .loc[experiment["name"] == "BM25"]
    .drop(columns=["name"])
)
experiment_rerank = (
    experiment
    .loc[experiment["name"] == "BM25_marco_rerank"]
    .drop(columns=["name"])
)

# One row per (query, measure) holding the value of both systems side by side.
experiment_paired = experiment_bm25.merge(
    experiment_rerank,
    on=["qid", "measure"],
    suffixes=("_bm25", "_rerank"),
)
experiment_paired.head(10)

Unnamed: 0,qid,measure,value_bm25,value_rerank
0,13,ndcg_cut_10,0.193254,0.396344
1,15,ndcg_cut_10,0.762609,0.36395
2,16,ndcg_cut_10,0.486344,0.506847
3,18,ndcg_cut_10,0.117003,0.50974
4,20,ndcg_cut_10,0.201879,0.199282
5,23,ndcg_cut_10,0.580776,0.694407
6,24,ndcg_cut_10,0.285758,0.377161
7,3,ndcg_cut_10,0.547365,0.455413
8,31,ndcg_cut_10,0.521239,0.634591
9,32,ndcg_cut_10,0.676489,0.845295


### Hypothesis 1

There is a statistically significant difference in mean nDCG@10 on the radboud-validation-20251114-training dataset between the standard BM25 pipeline and the pipeline extended with the ms-marco-MiniLM-L-6-v2 reranker (re-ranking the top-50 BM25 documents with linear score interpolation, BM25 weight $\gamma=0.1$). Significance level: $\alpha=0.05$ (two-sided paired t-test).

Method: two-sided paired t-test

In [12]:
# Two-sided paired t-test on the per-query nDCG@10 values
# (H0: the mean per-query difference between the two systems is zero).
# ttest_rel is imported in the notebook's import cell.
ttest_rel(
    experiment_paired["value_bm25"],
    experiment_paired["value_rerank"],
    alternative='two-sided',
).pvalue

np.float64(0.028514097446418207)

$p$ is lower than the specified significance level $\alpha = 0.05$. Thus the null hypothesis (no difference in mean nDCG@10 between the two pipelines) can be rejected.

### Hypothesis 2

The pipeline extended with the ms-marco-MiniLM-L-6-v2 reranker (re-ranking the top-50 BM25 documents with linear score interpolation, BM25 weight $\gamma=0.1$) achieves a statistically significant improvement in mean nDCG@10 on the radboud-validation-20251114-training dataset compared to the standard BM25 pipeline. Significance level: $\alpha=0.05$ (one-sided paired t-test).

Method: one-sided paired t-test

In [13]:
# One-sided paired t-test on the per-query nDCG@10 values.
# With BM25 as the first sample, alternative='less' tests whether the mean of
# (BM25 - rerank) is below zero, i.e. whether the reranker scores higher.
# ttest_rel is imported in the notebook's import cell.
ttest_rel(
    experiment_paired["value_bm25"],
    experiment_paired["value_rerank"],
    alternative='less',
).pvalue

np.float64(0.014257048723209103)

$p$ is lower than the specified significance level $\alpha = 0.05$. Thus the null hypothesis (the reranking pipeline does not improve mean nDCG@10 over BM25) can be rejected.