In [1]:
import os
import pyterrier as pt
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from transformers import pipeline
from pathlib import Path
import ir_datasets
import torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Start PyTerrier through the TIRA helper before touching any pt.* API.
ensure_pyterrier_is_loaded()

# Dataset identifier; reused below for the index, the runs and the reranker.
ds_id = "radboud-validation-20251114-training"
dataset = pt.datasets.get_dataset("irds:ir-lab-wise-2025/" + ds_id)

# Title-variant topics and the relevance judgements used for evaluation.
topics, qrels = dataset.get_topics("title"), dataset.get_qrels()

Java started and loaded: pyterrier.java.colab, pyterrier.java, pyterrier.java.24, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
Download: 4.30MiB [00:01, 4.29MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-wise-2025/radboud-validation-20251114-training/


In [3]:
index_path = "./output/indexes/radboud-validation"
abs_index_path = os.path.abspath(index_path)

# The presence of data.properties is used as the marker that the index
# has already been built, so the expensive indexing step can be skipped.
index_already_built = os.path.exists(f"{abs_index_path}/data.properties")

if index_already_built:
    print("Loading existing index...")
    index = pt.IndexFactory.of(abs_index_path)
else:
    print("Building index...")
    irds_ds = ir_datasets.load(f"ir-lab-wise-2025/{ds_id}")

    def doc_iter():
        """Yield one ``{'docno', 'text'}`` dict per document of the dataset."""
        documents = tqdm(irds_ds.docs_iter(), desc="Indexing")
        for document in documents:
            yield dict(docno=document.doc_id, text=document.default_text())

    # The docno metadata field is stored with a length of 100.
    indexer = pt.IterDictIndexer(abs_index_path, meta={'docno': 100})
    index_ref = indexer.index(doc_iter())
    index = pt.IndexFactory.of(index_ref)

print(f"Index loaded: {index}")

Building index...


Download: 383MiB [01:12, 5.53MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-wise-2025/radboud-validation-20251114-training/


Indexing: 16763it [02:38, 237.73it/s]



Indexing: 63621it [06:47, 156.05it/s]


17:46:19.116 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer -- Indexed 2 empty documents
Index loaded: <org.terrier.structures.Index at 0x72cfc0fa0220 jclass=org/terrier/structures/Index jself=<LocalRef obj=0x61c8670a8b40 at 0x72cfc1a8ec90>>


In [4]:
# Model name without the "google/" organisation prefix; the prefix is added
# where the Hugging Face pipeline is created.
BACKBONE_MODEL = "flan-t5-base"

# Few-shot prompt for query rewriting. The literal "{{ query_text }}" marker is
# a plain placeholder substituted with str.replace (it is not a str.format or
# template-engine field). Examples are separated by "###".
PROMPT = """Instruction: Rewrite the user's search query to be optimal for a keyword-based search engine (BM25). Remove unnecessary words, fix typos, and add relevant technical context or synonyms to improve retrieval.

###
Example 1:
Input: why is my internet so slow today
Output: slow internet connection speed troubleshooting causes
###
Example 2:
Input: python list reverse
Output: python list reverse method syntax documentation
###
Example 3:
Input: symptoms of flu vs covid
Output: flu covid-19 symptoms difference comparison chart
###
Example 4:
Input: best italian food nyc near central park
Output: best italian restaurants new york city central park manhattan reviews
###
Example 5:
Input: {{ query_text }}
Output:"""

In [5]:
def optimize_query():
    """Build a PyTerrier query-rewriting transformer backed by the LLM.

    A Hugging Face ``text2text-generation`` pipeline for
    ``google/{BACKBONE_MODEL}`` is created once per call (``device=-1``) and
    captured by the returned transformer, so every call to this function
    loads its own copy of the model.

    Returns:
        The transformer produced by ``pt.apply.query``; for each topic row it
        fills ``PROMPT`` with the row's ``query`` and uses the generated text
        as the new query.
    """
    generator = pipeline("text2text-generation", model=f"google/{BACKBONE_MODEL}", device=-1)

    def _rewrite_row(row):
        original_query = row['query']
        input_text = PROMPT.replace("{{ query_text }}", original_query)
        results = generator(input_text, max_new_tokens=32, num_beams=5, early_stopping=True)
        text = results[0]['generated_text']
        # Keep only the text before a possible "###" separator, in case the
        # model continues with another few-shot example.
        rewritten = text.split("###")[0].strip()
        # An empty generation (or one that starts with "###") would hand an
        # empty query to the retriever; fall back to the original query.
        return rewritten or original_query

    return pt.apply.query(_rewrite_row)

In [6]:
print("Previewing query optimization:")

# Rewrite only the first 50 topics to get a quick look at the LLM output.
llm_pipeline = optimize_query()
preview = llm_pipeline.transform(
    topics.head(50)
)
display(preview)

Previewing query optimization:


Device set to use cpu


Unnamed: 0,qid,query_0,query
0,3,split ergo keyboard,split ergo keyboard
1,4,metoo Hollywood,metoo Hollywood
2,7,gastritis symptoms,symptoms of gastritis
3,8,What is privacy by design(PbD)?,What is privacy by design(PbD)?
4,13,Impact of Exercise on Depression,Impact of Exercise on Depression
5,15,Autonomous car ethics in unavoidable accidents,Autonomous car ethics in unavoidable accidents
6,16,mahler sixth symphony concert 2025,mahler sixth symphony concert 2025
7,18,Climate change effects on agriculture,Climate change effects on agriculture
8,20,History of urban rooftop gardening in European...,History of urban rooftop gardening in European...
9,23,excel sum cells,Excel sum cells


In [7]:
# BM25 first-stage retriever over the Terrier index.
# pt.BatchRetrieve is deprecated (it emits a warning when instantiated);
# pt.terrier.Retriever is its replacement and takes the same arguments.
bm25 = pt.terrier.Retriever(index, wmodel="BM25")

# LLM query rewriting followed by BM25 retrieval on the rewritten query.
retrieval_llm_pipeline = optimize_query() >> bm25

  bm25 = pt.BatchRetrieve(index, wmodel="BM25")
Device set to use cpu


In [8]:
# Compare plain BM25 against BM25 on LLM-rewritten queries.
pt.Experiment(
    [bm25, retrieval_llm_pipeline],
    topics,
    qrels,
    eval_metrics=['ndcg_cut_10', 'P_10', 'recall_100', 'map'],
    names=['BM25', 'LLM_optimized_queries'],
)

Unnamed: 0,name,map,P_10,recall_100,ndcg_cut_10
0,BM25,0.381395,0.432143,0.681198,0.451635
1,LLM_optimized_queries,0.381321,0.428571,0.681198,0.454508


In [11]:
from tirex_tracker import tracking, ExportFormat

output = Path("./output")


def run_tira_experiment(pipeline, pipeline_name, dataset_id, topics, description):
    """Run a pipeline over the topics and store the run plus tracking metadata.

    Results go to ``output/runs/<dataset_id>/<pipeline_name>/run.txt.gz``; the
    tracker exports ``retrieval-metadata.yml`` into the same directory. If the
    run file already exists, nothing is executed.

    Args:
        pipeline: Object whose ``transform(topics)`` produces the run.
        pipeline_name: Directory name of the run and system name in the metadata.
        dataset_id: Dataset identifier used as the parent directory name.
        topics: Topics passed to ``pipeline.transform``.
        description: System description stored in the metadata.

    Returns:
        None.
    """
    run_dir = output / "runs" / dataset_id / pipeline_name
    run_file = run_dir / "run.txt.gz"

    # Never overwrite an existing run.
    if run_file.exists():
        print(f"Run existiert bereits: {run_dir}")
        return

    run_dir.mkdir(parents=True, exist_ok=True)
    print(f"--- Starte: {pipeline_name} ---")

    tracking_options = dict(
        export_file_path=run_dir / "retrieval-metadata.yml",
        export_format=ExportFormat.IR_METADATA,
        system_description=description,
        system_name=pipeline_name,
    )
    # Only the retrieval itself runs inside the tracking context; the run
    # file is written after the context has been left.
    with tracking(**tracking_options):
        run = pipeline.transform(topics)

    pt.io.write_results(run, str(run_file))
    print(f"Gespeichert in: {run_file}")

# Plain BM25 baseline run.
run_tira_experiment(
    pipeline=bm25,
    pipeline_name="pyterrier-bm25-baseline",
    dataset_id=ds_id,
    topics=topics,
    description="Standard BM25 retrieval without query expansion."
)

# LLM query rewriting + BM25 run. The description is derived from
# BACKBONE_MODEL so the exported metadata names the model that is actually
# loaded (the previous hard-coded text said "FLAN-T5-small").
run_tira_experiment(
    pipeline=retrieval_llm_pipeline,
    pipeline_name="pyterrier-flan-t5-bm25",
    dataset_id=ds_id,
    topics=topics,
    description=f"Query rewriting using google/{BACKBONE_MODEL} followed by BM25."
)

--- Starte: pyterrier-bm25-baseline ---


Detected a hypervisor/virtualization technology. Some metrics might not be available due to configuration or availability of virtual hardware features.

=====  Processor information  =====
Linux arch_perfmon flag  : yes
Hybrid processor         : no
IBRS and IBPB supported  : yes
STIBP supported          : yes
Spec arch caps supported : yes
Max CPUID level          : 27
CPU model number         : 140
PCM Error: can't open MSR handle for core 0 (No such file or directory)
Try no-MSR mode by setting env variable PCM_NO_MSR=1
Can not access CPUs Model Specific Registers (MSRs).
execute 'modprobe msr' as root user, then execute pcm as root user.


Gespeichert in: output/runs/radboud-validation-20251114-training/pyterrier-bm25-baseline/run.txt.gz
--- Starte: pyterrier-flan-t5-bm25 ---
Gespeichert in: output/runs/radboud-validation-20251114-training/pyterrier-flan-t5-bm25/run.txt.gz


## Reranker

In [12]:
irds_ds = ir_datasets.load(f"ir-lab-wise-2025/{ds_id}")

# In-memory docno -> text lookup, built in one pass over the corpus.
doc_text_map = dict(
    (doc.doc_id, doc.default_text())
    for doc in tqdm(irds_ds.docs_iter(), desc="Loading Docs")
)


def _lookup_text(row):
    """Return the text for the row's ``docno``, or "" if the id is unknown."""
    docno = row['docno']
    if docno in doc_text_map:
        return doc_text_map[docno]
    return ""


# Transformer that attaches the looked-up document text to each result row.
text_getter = pt.apply.text(_lookup_text)

Loading Docs: 63621it [00:27, 2273.79it/s]


In [13]:
from sentence_transformers import CrossEncoder

model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device="cpu")


def _cross_encoder_score(row):
    """Score a single (query, document text) pair with the cross-encoder."""
    pair = (row['query'], row['text'])
    scores = model.predict([pair])
    return float(scores[0])


# BM25 top-100 -> attach document text -> re-score with the cross-encoder.
pipeline_rerank = (
    (bm25 % 100)
    >> text_getter
    >> pt.apply.doc_score(_cross_encoder_score)
)

In [14]:
# Query rewriting -> BM25 top-100 -> attach document text -> cross-encoder re-scoring.
combined_pipeline = (
    optimize_query()
    >> (bm25 % 100)
    >> text_getter
    >> pt.apply.doc_score(_cross_encoder_score)
)

Device set to use cpu


In [15]:
# Evaluate all four systems on the same topics and qrels.
pt.Experiment(
    [bm25, retrieval_llm_pipeline, pipeline_rerank, combined_pipeline],
    topics,
    qrels,
    eval_metrics=['ndcg_cut_10', 'P_10', 'recall_100', 'map'],
    names=['BM25', 'LLM_optimized_queries', "Rerank only", "Rewrite_and_rerank"],
)

Unnamed: 0,name,map,P_10,recall_100,ndcg_cut_10
0,BM25,0.381395,0.432143,0.681198,0.451635
1,LLM_optimized_queries,0.381321,0.428571,0.681198,0.454508
2,Rerank only,0.383241,0.460714,0.681198,0.49417
3,Rewrite_and_rerank,0.389933,0.464286,0.681198,0.501276


In [16]:
# Store the run of the combined rewrite + rerank pipeline.
run_tira_experiment(
    combined_pipeline,
    "combined_rewrite_reranker",
    dataset_id=ds_id,
    topics=topics,
    description="query rewrite with BM25 retrieval (top 100) followed by reranking.",
)

--- Starte: combined_rewrite_reranker ---
Gespeichert in: output/runs/radboud-validation-20251114-training/combined_rewrite_reranker/run.txt.gz
