In [None]:
!wget https://msmarco.z22.web.core.windows.net/msmarcoranking/collection.tar.gz
!wget https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz
!wget https://msmarco.z22.web.core.windows.net/msmarcoranking/qrels.dev.tsv

In [None]:
!tar -xzvf /content/collection.tar.gz
!tar -xzvf /content/queries.tar.gz

In [None]:
!pip install python-terrier

In [None]:
import os
from multiprocessing import Pool

import pyterrier as pt
from pyterrier.measures import RR

In [None]:
# Input files and the on-disk location of the index.
collection_path = "/content/collection.tsv"
queries_path = "/content/queries.dev.tsv"
qrels_path = "/content/qrels.dev.tsv"
index_path = "/content/msmarco_passage_index"

# Initialise PyTerrier's Java side once; skip if it has already been started.
java_running = pt.java.started()
if not java_running:
    pt.java.init()

def process_line(line):
    """Convert one ``<docno>\\t<text>`` collection line into an indexable record.

    Surrounding whitespace (including the trailing newline) is stripped from
    the line, which is then split on the first tab only, so tabs inside the
    passage text are preserved.

    Args:
        line: A raw line read from the collection file.

    Returns:
        A dict with keys ``'docno'`` and ``'text'``.

    Raises:
        ValueError: If the stripped line contains no tab separator. The
            message includes the offending line so it can be located when the
            error surfaces from a worker process.
    """
    doc_id, separator, text = line.strip().partition('\t')
    if not separator:
        raise ValueError(
            f"Malformed collection line, expected '<docno>\\t<text>': {line!r}"
        )
    return {'docno': doc_id, 'text': text}

def load_collection_parallel(file_path):
    """Parse every line of a collection file using a pool of worker processes.

    Args:
        file_path: Path to a UTF-8 text file; each line is passed to
            ``process_line``.

    Returns:
        A list with one ``process_line`` result per line, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as collection_file, Pool() as workers:
        records = workers.map(process_line, collection_file)
    return records

# Build the index on the first run; afterwards reuse the one already on disk.
# The presence of data.properties is used as the "index exists" marker.
if not os.path.exists(os.path.join(index_path, "data.properties")):
    indexer = pt.IterDictIndexer(index_path)
    index_ref = indexer.index(load_collection_parallel(collection_path), fields=["text"])
else:
    index_ref = pt.IndexFactory.of(index_path)

# Dev queries and their relevance judgements.
topics = pt.io.read_topics(queries_path, format="singleline")
qrels = pt.io.read_qrels(qrels_path)

bm25 = pt.terrier.Retriever(index_ref, wmodel="BM25")

# String metric names are interpreted as trec_eval measure names, and
# "mrr@10" is not one of them. Pass the measure object instead:
# RR @ 10 is reciprocal rank cut off at rank 10 (i.e. MRR@10 once averaged).
eval_results = pt.Experiment(
    [bm25],
    topics,
    qrels,
    eval_metrics=[RR @ 10],
    names=["BM25"],
)

print(eval_results)

In [None]:
# Retrieve for every topic, then keep at most 1000 passages *per query*.
# A plain DataFrame.head(1000) would keep only the first 1000 rows of the
# whole run (essentially a single query), so the cutoff is applied per qid
# after ordering each query's results by rank.
# NOTE(review): the exported file covers every query and can be large.
all_results = bm25.transform(topics)
results = (
    all_results
    .sort_values(["qid", "rank"])
    .groupby("qid", sort=False)
    .head(1000)
)
results.to_json("bm_25_rank.json")