In [None]:
!pip install setuptools==58.2.0 packaging==23.2
!pip install python-terrier==0.12.1
!pip install pandas
!pip install torch==2.6.0+cu118 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install --upgrade git+https://github.com/terrierteam/pyterrier_colbert.git
!pip install --upgrade git+https://github.com/cmacdonald/pyterrier_bert.git
!pip install faiss-cpu --no-cache-dir

The cell below should ideally print `True` (followed by the number of visible CUDA devices) before the experiments are run; with a GPU available, they will run faster.

In [None]:
import torch
# Report CUDA availability and the number of devices torch can see, one value per line.
print(
    torch.cuda.is_available(),
    torch.cuda.device_count(),
    sep="\n",
)

In [2]:
# For the ColBERT cells to work, replace the following file with the one from replacement-files/:
# .venv/Lib/site-packages/pyterrier_colbert/indexing.py
# Note: restarting VSCode is also necessary for the .venv to get updated properly

# More details: the replacement file has the compute_throughput() calls commented out
# (the method was only used in print statements, so no core functionality is affected;
# it was raising ZeroDivisionError because of the subtracted timestamps in its denominator)

In [3]:
import pyterrier as pt
import pandas as pd
import os
from pyterrier.measures import MAP, nDCG, MRR

In [4]:
# Vaswani collection, resolved through the "irds:" dataset identifier.
vaswani = pt.get_dataset("irds:vaswani")

In [None]:
# Location of the Terrier index used by BM25 and RM3.
vaswani_index_src = os.path.abspath("vaswani-index")

# Build the index only on the first run; an existing directory is reused as-is.
if not os.path.exists(vaswani_index_src):
    print("Creating a new Vaswani index for BM25 and RM3...")
    pt.index.IterDictIndexer(
        vaswani_index_src,
        blocks=True,
        meta={"docno": 20, "text": 4096},
    ).index(
        vaswani.get_corpus_iter(),
        fields=["docno", "text"],
    )

# Open the (new or pre-existing) index for retrieval.
vaswani_index = pt.IndexFactory.of(vaswani_index_src)

In [6]:
# Topics and relevance judgements of the Vaswani collection, consumed by the experiments below.
queries, qrels = vaswani.get_topics(), vaswani.get_qrels()

In [5]:
# BM25 retriever over the Vaswani index; docno and text metadata are requested with each result.
bm25 = pt.terrier.Retriever(
    vaswani_index,
    wmodel="BM25",
    metadata=["docno", "text"],
)
# RM3 query rewriter built on the same index.
rm3 = pt.rewrite.RM3(vaswani_index)
# Retrieve with BM25, rewrite the query with RM3, then retrieve again with BM25.
rm3_pipeline = (bm25 >> rm3) >> bm25

In [None]:
# Compare plain BM25 against the RM3 pipeline on Vaswani.
pt.Experiment(
    [bm25, rm3_pipeline],
    queries,
    qrels,
    eval_metrics=[MAP, nDCG @ 10, MRR @ 10],
    names=["BM25", "BM25 >> RM3 >> BM25"],
)

In [None]:
from pyterrier_colbert.indexing import ColBERTIndexer
from pyterrier_colbert.ranking import ColBERTFactory
import wget
import zipfile

In [None]:
# --- ColBERT checkpoint ---
# Download and unpack the pretrained checkpoint once; each step is skipped when
# its output already exists on disk.
checkpoint_url = "http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip"
extract_dir = "colbert_checkpoint"
checkpoint_path = "colbert_checkpoint.zip"

if not os.path.exists(checkpoint_path):
    print("Downloading checkpoint...")
    wget.download(checkpoint_url, checkpoint_path)
if not os.path.exists(extract_dir):
    with zipfile.ZipFile(checkpoint_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

# Derive the checkpoint location from extract_dir so the two cannot drift apart.
colbert_checkpoint_path = os.path.abspath(os.path.join(extract_dir, "colbert.dnn"))

# --- ColBERT index over Vaswani ---
index_root = os.path.abspath("vaswani-index")
index_name = os.path.abspath("vaswani-colbert-index")

# Use the GPU only when torch can actually see one (torch is imported by the
# CUDA check cell near the top of the notebook).
# NOTE(review): the flag used to be a hard-coded True that had to be commented
# out by hand on CPU-only machines; deriving it makes the cell self-adjusting.
use_gpu = torch.cuda.is_available()

if not os.path.exists(index_name):
    print("Index not found. Creating a new Vaswani index for ColBERT...")
    colbert_index = ColBERTIndexer(
        checkpoint=colbert_checkpoint_path,
        index_root=index_root,
        index_name=index_name,
        chunksize=64, # Maybe even 128, the allowed maximum --> it regulates the size of PyTorch temp files that are created by the indexer
        gpu=use_gpu,
    )
    colbert_index.index(vaswani.get_corpus_iter())
    print("Index successfully created!")

In [None]:
# ColBERT factory bound to the checkpoint and the Vaswani ColBERT index configured above.
pytcolbert = ColBERTFactory(colbert_checkpoint_path, index_root, index_name)

In [10]:
# Pipeline: BM25 results (which carry the "text" metadata) are piped into the ColBERT text scorer.
sparse_colbert = bm25 >> pytcolbert.text_scorer()

In [None]:
# Compare plain BM25 against BM25 followed by the ColBERT text scorer on Vaswani.
pt.Experiment(
    [bm25, sparse_colbert],
    queries,
    qrels,
    eval_metrics=[MAP, nDCG @ 10, MRR @ 10],
    names=["BM25", "BM25 >> ColBERT"],
)

In [None]:
# FiQA test split (BEIR), resolved through the "irds:" dataset identifier.
fiqa = pt.get_dataset("irds:beir/fiqa/test")

In [None]:
# Location of the Terrier index used by BM25 and RM3.
fiqa_index_src = os.path.abspath("fiqa-index")

# Build the index only on the first run; an existing directory is reused as-is.
if not os.path.exists(fiqa_index_src):
    print("Creating a new FIQA index for BM25 and RM3...")
    pt.index.IterDictIndexer(
        fiqa_index_src,
        blocks=True,
        meta={"docno": 20, "text": 131072},
    ).index(
        fiqa.get_corpus_iter(),
        fields=["docno", "text"],
    )

# Open the (new or pre-existing) index for retrieval.
fiqa_index = pt.IndexFactory.of(fiqa_index_src)

In [None]:
# Topics and relevance judgements of FiQA; these replace the previous dataset's values.
queries, qrels = fiqa.get_topics(), fiqa.get_qrels()

In [None]:
# BM25 retriever over the FiQA index; docno and text metadata are requested with each result.
bm25 = pt.terrier.Retriever(
    fiqa_index,
    wmodel="BM25",
    metadata=["docno", "text"],
)
# RM3 query rewriter built on the same index.
rm3 = pt.rewrite.RM3(fiqa_index)
# Retrieve with BM25, rewrite the query with RM3, then retrieve again with BM25.
rm3_pipeline = (bm25 >> rm3) >> bm25

In [None]:
# Compare plain BM25 against the RM3 pipeline on FiQA.
pt.Experiment(
    [bm25, rm3_pipeline],
    queries,
    qrels,
    eval_metrics=[MAP, nDCG @ 10, MRR @ 10],
    names=["BM25", "BM25 >> RM3 >> BM25"],
)

In [None]:
# --- ColBERT checkpoint ---
# Download and unpack the pretrained checkpoint once; each step is skipped when
# its output already exists on disk.
checkpoint_url = "http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip"
extract_dir = "colbert_checkpoint"
checkpoint_path = "colbert_checkpoint.zip"

if not os.path.exists(checkpoint_path):
    print("Downloading checkpoint...")
    wget.download(checkpoint_url, checkpoint_path)
if not os.path.exists(extract_dir):
    with zipfile.ZipFile(checkpoint_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

# Derive the checkpoint location from extract_dir so the two cannot drift apart.
colbert_checkpoint_path = os.path.abspath(os.path.join(extract_dir, "colbert.dnn"))

# --- ColBERT index over FiQA ---
index_root = os.path.abspath("fiqa-index")
index_name = os.path.abspath("fiqa-colbert-index")

# Use the GPU only when torch can actually see one (torch is imported by the
# CUDA check cell near the top of the notebook).
# NOTE(review): the flag used to be a hard-coded True that had to be commented
# out by hand on CPU-only machines; deriving it makes the cell self-adjusting.
use_gpu = torch.cuda.is_available()

if not os.path.exists(index_name):
    print("Index not found. Creating a new FIQA index for ColBERT...")
    colbert_index = ColBERTIndexer(
        checkpoint=colbert_checkpoint_path,
        index_root=index_root,
        index_name=index_name,
        chunksize=64, # Maybe even 128, the allowed maximum --> it regulates the size of PyTorch temp files that are created by the indexer
        gpu=use_gpu,
    )
    colbert_index.index(fiqa.get_corpus_iter())
    print("Index successfully created!")

In [None]:
# ColBERT factory bound to the checkpoint and the FiQA ColBERT index configured above.
pytcolbert = ColBERTFactory(colbert_checkpoint_path, index_root, index_name)

In [None]:
# Pipeline: BM25 results (which carry the "text" metadata) are piped into the ColBERT text scorer.
sparse_colbert = bm25 >> pytcolbert.text_scorer()

In [None]:
# Compare plain BM25 against BM25 followed by the ColBERT text scorer on FiQA.
pt.Experiment(
    [bm25, sparse_colbert],
    queries,
    qrels,
    eval_metrics=[MAP, nDCG @ 10, MRR @ 10],
    names=["BM25", "BM25 >> ColBERT"],
)

In [None]:
# ANTIQUE test split, resolved through the "irds:" dataset identifier.
antique = pt.get_dataset("irds:antique/test")

In [None]:
# Location of the Terrier index used by BM25 and RM3.
antique_index_src = os.path.abspath("antique-index")

# Build the index only on the first run; an existing directory is reused as-is.
if not os.path.exists(antique_index_src):
    print("Creating a new Antique index for BM25 and RM3...")
    pt.index.IterDictIndexer(
        antique_index_src,
        blocks=True,
        meta={"docno": 20, "text": 131072},
    ).index(
        antique.get_corpus_iter(),
        fields=["docno", "text"],
    )

# Open the (new or pre-existing) index for retrieval.
antique_index = pt.IndexFactory.of(antique_index_src)

In [None]:
# Topics and relevance judgements of ANTIQUE; these replace the previous dataset's values.
queries, qrels = antique.get_topics(), antique.get_qrels()

In [None]:
# BM25 retriever over the ANTIQUE index; docno and text metadata are requested with each result.
bm25 = pt.terrier.Retriever(
    antique_index,
    wmodel="BM25",
    metadata=["docno", "text"],
)
# RM3 query rewriter built on the same index.
rm3 = pt.rewrite.RM3(antique_index)
# Retrieve with BM25, rewrite the query with RM3, then retrieve again with BM25.
rm3_pipeline = (bm25 >> rm3) >> bm25

In [None]:
# Compare plain BM25 against the RM3 pipeline on ANTIQUE.
pt.Experiment(
    [bm25, rm3_pipeline],
    queries,
    qrels,
    eval_metrics=[MAP, nDCG @ 10, MRR @ 10],
    names=["BM25", "BM25 >> RM3 >> BM25"],
)

In [None]:
# --- ColBERT checkpoint ---
# Download and unpack the pretrained checkpoint once; each step is skipped when
# its output already exists on disk.
checkpoint_url = "http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip"
extract_dir = "colbert_checkpoint"
checkpoint_path = "colbert_checkpoint.zip"

if not os.path.exists(checkpoint_path):
    print("Downloading checkpoint...")
    wget.download(checkpoint_url, checkpoint_path)
if not os.path.exists(extract_dir):
    with zipfile.ZipFile(checkpoint_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

# Derive the checkpoint location from extract_dir so the two cannot drift apart.
colbert_checkpoint_path = os.path.abspath(os.path.join(extract_dir, "colbert.dnn"))

# --- ColBERT index over ANTIQUE ---
index_root = os.path.abspath("antique-index")
index_name = os.path.abspath("antique-colbert-index")

# Use the GPU only when torch can actually see one (torch is imported by the
# CUDA check cell near the top of the notebook).
# NOTE(review): the flag used to be a hard-coded True that had to be commented
# out by hand on CPU-only machines; deriving it makes the cell self-adjusting.
use_gpu = torch.cuda.is_available()

if not os.path.exists(index_name):
    print("Index not found. Creating a new Antique index for ColBERT...")
    colbert_index = ColBERTIndexer(
        checkpoint=colbert_checkpoint_path,
        index_root=index_root,
        index_name=index_name,
        chunksize=64, # Maybe even 128, the allowed maximum --> it regulates the size of PyTorch temp files that are created by the indexer
        gpu=use_gpu,
    )
    colbert_index.index(antique.get_corpus_iter())
    print("Index successfully created!")

In [None]:
# ColBERT factory bound to the checkpoint and the ANTIQUE ColBERT index configured above.
pytcolbert = ColBERTFactory(colbert_checkpoint_path, index_root, index_name)

In [None]:
# Pipeline: BM25 results (which carry the "text" metadata) are piped into the ColBERT text scorer.
sparse_colbert = bm25 >> pytcolbert.text_scorer()

In [None]:
# Compare plain BM25 against BM25 followed by the ColBERT text scorer on ANTIQUE.
pt.Experiment(
    [bm25, sparse_colbert],
    queries,
    qrels,
    eval_metrics=[MAP, nDCG @ 10, MRR @ 10],
    names=["BM25", "BM25 >> ColBERT"],
)

In [None]:
dataset_name = "irds:msmarco-passage/trec-dl-2019/judged"
msmarco = pt.get_dataset(dataset_name)
# Peek at the first passage only. Taking a single item from the iterator avoids
# materialising the entire corpus (8.8M passages, see the note below) in a list
# just to look at element 0.
print(next(iter(msmarco.get_corpus_iter())))

# NOTE: The following datasets contain the 8.8M passages from the MSMARCO Passage dataset
# They only differ in the number of queries and qrels that are included
# Hence, the msmarco-index and colbert-index can be reused

# Small dataset used to verify the latest changes:
# "irds:msmarco-passage/train/split200-valid" (200 queries, 131 qrels)

# Datasets to-be-used for actual experimentation:
# "irds:msmarco-passage/trec-dl-2019/judged" (43 queries, 9.3K qrels)
# "irds:msmarco-passage/trec-dl-2020/judged" (54 queries, 11K qrels)

In [None]:
# Location of the Terrier index used by BM25 and RM3.
marco_index_src = os.path.abspath("msmarco-index")

# Build the index only on the first run; an existing directory is reused as-is.
if not os.path.exists(marco_index_src):
    print("Index not found. Creating a new MSMARCO index for BM25 and RM3...")
    pt.index.IterDictIndexer(
        marco_index_src,
        blocks=True,
        meta={"docno": 20, "text": 131072},
    ).index(
        msmarco.get_corpus_iter(),
        fields=["docno", "text"],
    )
    print("Index successfully created!")

# Open the (new or pre-existing) index for retrieval.
msmarco_index = pt.IndexFactory.of(marco_index_src)

In [None]:
# For TREC DL 2019 the queries are read from a local two-column TSV (qid, query);
# every other dataset uses the topics shipped with the dataset object.
if dataset_name == "irds:msmarco-passage/trec-dl-2019/judged":
    queries = pd.read_csv(
        os.path.abspath("trec2019judged-queries.tsv"),
        sep='\t',
        names=["qid", "query"],
        # Keep qids as strings: without this pandas infers int64 for numeric ids,
        # whereas PyTerrier topics/qrels conventionally use string qids.
        dtype={"qid": str, "query": str},
    )
else:
    queries = msmarco.get_topics()

print(f"Checking Queries:\n{queries}")

In [None]:
# Relevance judgements for the selected MSMARCO query set.
qrels = msmarco.get_qrels()

# Sanity check: show the first few judgement rows under a heading.
print("Checking Result Relevance:", qrels.head(), sep="\n")

In [5]:
# BM25 retriever over the MSMARCO index; docno and text metadata are requested with each result.
bm25 = pt.terrier.Retriever(
    msmarco_index,
    wmodel="BM25",
    metadata=["docno", "text"],
)
# RM3 query rewriter built on the same index.
rm3 = pt.rewrite.RM3(msmarco_index)

In [None]:
# Inspect the raw BM25 output for the loaded queries (displayed as the cell result).
bm25(queries)

In [None]:
# Inspect what RM3 produces when it is given the BM25 results for the loaded queries.
rm3(bm25(queries))

In [None]:
# Compare plain BM25 against the BM25 >> RM3 >> BM25 pipeline on MSMARCO.
pt.Experiment(
    [bm25, (bm25 >> rm3) >> bm25],
    queries,
    qrels,
    eval_metrics=[MAP, nDCG @ 10, MRR @ 10],
    names=["BM25", "BM25 >> RM3 >> BM25"],
)