In [None]:
import os
from glob import glob
import pyterrier as pt
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from transformers import pipeline
from pathlib import Path
import ir_datasets
import torch
from tqdm import tqdm

In [4]:
ensure_pyterrier_is_loaded()
ds_id = "radboud-validation-20251114-training"
dataset = pt.datasets.get_dataset(f"irds:ir-lab-wise-2025/{ds_id}")
topics = dataset.get_topics("title")
qrels = dataset.get_qrels()

Java started and loaded: pyterrier.java.colab, pyterrier.java, pyterrier.java.24, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


In [5]:
# Define where to store the index
index_path = "./output/indexes/radboud-validation"
abs_index_path = os.path.abspath(index_path)

if not os.path.exists(abs_index_path + "/data.properties"):
    print("Building index...")
    # Load the dataset directly to access documents
    irds_ds = ir_datasets.load(f"ir-lab-wise-2025/{ds_id}")
    
    def doc_iter():
        for doc in tqdm(irds_ds.docs_iter(), desc="Indexing"):
            yield {'docno': doc.doc_id, 'text': doc.default_text()}
            
    # Index the documents
    indexer = pt.IterDictIndexer(abs_index_path, meta={'docno': 100})
    index_ref = indexer.index(doc_iter())
    index = pt.IndexFactory.of(index_ref)
else:
    print("Loading existing index...")
    index = pt.IndexFactory.of(abs_index_path)

print(f"Index loaded: {index}")

Loading existing index...
Index loaded: <org.terrier.structures.Index at 0x71539cc4dda0 jclass=org/terrier/structures/Index jself=<LocalRef obj=0x5d2b582bcb50 at 0x7153545a3c70>>


In [28]:
BACKBONE_MODEL = "flan-t5-small"

PROMPT = """Instruction: Rewrite the user's search query to be optimal for a keyword-based search engine (BM25). Remove unnecessary words, fix typos, and add relevant technical context or synonyms to improve retrieval.

###
Example 1:
Input: why is my internet so slow today
Output: slow internet connection speed troubleshooting causes
###
Example 2:
Input: python list reverse
Output: python list reverse method syntax documentation
###
Example 3:
Input: symptoms of flu vs covid
Output: flu covid-19 symptoms difference comparison chart
###
Example 4:
Input: best italian food nyc near central park
Output: best italian restaurants new york city central park manhattan reviews
###
Example 5:
Input: {{ query_text }}
Output:"""

In [24]:
def optimize_query():
    """
    Returns a PyTerrier transformer that rewrites the query using FLAN-T5.
    """
    # Initialize the generation pipeline (device=-1 for CPU, 0 for GPU if available)
    generator = pipeline("text2text-generation", model=f"google/{BACKBONE_MODEL}", device=-1)

    def _rewrite_row(row):
        input_text = PROMPT.replace("{{ query_text }}", row['query'])
        # Generate with specific parameters
        results = generator(input_text, max_new_tokens=32, do_sample=False)
        return results[0]['generated_text']

    return pt.apply.query(_rewrite_row)

In [25]:
# Preview the optimization on the first few topics
print("Previewing query optimization:")
llm_pipeline = optimize_query()
preview = llm_pipeline.transform(topics.head(50))
display(preview)

Previewing query optimization:


Device set to use cpu


Unnamed: 0,qid,query_0,query
0,3,split ergo keyboard,split ergo keyboard
1,4,metoo Hollywood,metoo california
2,7,gastritis symptoms,stomach pain signs ###
3,8,What is privacy by design(PbD)?,What is privacy by design (PbD)?
4,13,Impact of Exercise on Depression,Impact of Exercise on Depression
5,15,Autonomous car ethics in unavoidable accidents,autonomous car ethics in unavoidable accidents
6,16,mahler sixth symphony concert 2025,mahler sixth symphony concert 2025
7,18,Climate change effects on agriculture,climate change effects on agriculture
8,20,History of urban rooftop gardening in European...,history of urban rooftop gardening in european...
9,23,excel sum cells,excel sum formula function add ###


In [29]:
# Define the retrieval models
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

# Create the pipeline: Rewrite -> Retrieve
retrieval_llm_pipeline = optimize_query() >> bm25

  bm25 = pt.BatchRetrieve(index, wmodel="BM25")
Device set to use cpu


In [30]:
pt.Experiment(
    [bm25, retrieval_llm_pipeline],
    names=['BM25', 'LLM_optimized_queries'],
    topics=topics,
    qrels=qrels,
    eval_metrics=['ndcg_cut_10', 'P_10', 'recall_100', 'map']
)

Unnamed: 0,name,map,P_10,recall_100,ndcg_cut_10
0,BM25,0.381395,0.432143,0.681198,0.451635
1,LLM_optimized_queries,0.335349,0.364286,0.632991,0.393138


In [18]:
from tirex_tracker import tracking, ExportFormat

# Output Basis-Pfad
output = Path("./output")

def run_tira_experiment(pipeline, pipeline_name, dataset_id, topics, description):
    """
    F端hrt eine Pipeline aus und speichert sie exakt mit dem TIRA tracking context manager.
    """
    # 1. Pfade definieren (Exakt wie in deinem Snippet)
    target_dir = output / "runs" / dataset_id / pipeline_name
    target_file = target_dir / "run.txt.gz"

    if target_file.exists():
        print(f"Run existiert bereits: {target_dir}")
        return

    # Ordner erstellen
    target_dir.mkdir(parents=True, exist_ok=True)

    print(f"--- Starte: {pipeline_name} ---")

    # 2. Der entscheidende Teil: Nutzung von tracking() f端r die metadata.yml
    # Dies garantiert, dass die Daten exakt so gespeichert werden, wie TIRA es erwartet.
    with tracking(
        export_file_path=target_dir / "retrieval-metadata.yml", 
        export_format=ExportFormat.IR_METADATA, 
        system_description=description, 
        system_name=pipeline_name
    ):
        # Hier f端hren wir die vorher definierte Pipeline aus
        run = pipeline.transform(topics)

    # 3. Ergebnis speichern
    pt.io.write_results(run, str(target_file))
    print(f"Gespeichert in: {target_file}")

# --- Ausf端hrung ---

# 1. Speichere die BM25 Baseline
run_tira_experiment(
    pipeline=bm25,
    pipeline_name="pyterrier-bm25-baseline",
    dataset_id=ds_id,
    topics=topics,
    description="Standard BM25 retrieval without query expansion."
)

# 2. Speichere die LLM Pipeline
run_tira_experiment(
    pipeline=retrieval_llm_pipeline,
    pipeline_name="pyterrier-flan-t5-bm25",
    dataset_id=ds_id,
    topics=topics,
    description="Query rewriting using FLAN-T5-small followed by BM25."
)

--- Starte: pyterrier-bm25-baseline ---


Detected a hypervisor/virtualization technology. Some metrics might not be available due to configuration or availability of virtual hardware features.

=====  Processor information  =====
Linux arch_perfmon flag  : yes
Hybrid processor         : no
IBRS and IBPB supported  : yes
STIBP supported          : yes
Spec arch caps supported : yes
Max CPUID level          : 27
CPU model number         : 140
PCM Error: can't open MSR handle for core 0 (No such file or directory)
Try no-MSR mode by setting env variable PCM_NO_MSR=1
Can not access CPUs Model Specific Registers (MSRs).
execute 'modprobe msr' as root user, then execute pcm as root user.


Gespeichert in: output/runs/radboud-validation-20251114-training/pyterrier-bm25-baseline/run.txt.gz
--- Starte: pyterrier-flan-t5-bm25 ---
Gespeichert in: output/runs/radboud-validation-20251114-training/pyterrier-flan-t5-bm25/run.txt.gz
