In [1]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())

True
1


In [2]:
import pyterrier as pt
import pandas as pd
from itertools import filterfalse
import os
import wget
import zipfile

pt.init()

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()


In [3]:
from pyterrier_colbert.indexing import ColBERTIndexer
from pyterrier_colbert.ranking import ColBERTFactory

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from ance.pyterrier_ance import ANCEIndexer, ANCETextScorer

In [5]:
vaswani = pt.get_dataset("irds:vaswani")

In [6]:
vaswani_index_src = os.path.abspath("vaswani-index")
if not os.path.exists(vaswani_index_src):
    print("Creating a new Vaswani index for BM25 and RM3...")
    pt.index.IterDictIndexer(vaswani_index_src, blocks=True, meta={"docno": 20, "text": 4096}).index(vaswani.get_corpus_iter(), fields=["docno", "text"])

vaswani_index = pt.IndexFactory.of(vaswani_index_src)



In [7]:
queries = vaswani.get_topics()
qrels = vaswani.get_qrels()

In [8]:
bm25 = pt.terrier.Retriever(vaswani_index, wmodel="BM25", metadata=["docno", "text"])
rm3 = pt.rewrite.RM3(vaswani_index)

In [9]:
checkpoint_url = "http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip"
extract_dir = "colbert_checkpoint"
checkpoint_path = "colbert_checkpoint.zip"

if not os.path.exists(checkpoint_path):
    print("Downloading checkpoint...")
    wget.download(checkpoint_url, checkpoint_path)
if not os.path.exists(extract_dir):
    with zipfile.ZipFile(checkpoint_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

colbert_checkpoint_path = os.path.abspath("colbert_checkpoint/colbert.dnn")
index_root = os.path.abspath("vaswani-index")
index_name = os.path.abspath("vaswani-colbert-index")

if not os.path.exists(index_name):
    print("Index not found. Creating a new Vaswani index for ColBERT...")
    colbert_index = ColBERTIndexer(
        checkpoint=colbert_checkpoint_path,
        index_root=index_root,
        index_name=index_name,
        chunksize=64, # Maybe even 128, the allowed maximum --> it regulates the size of PyTorch temp files that are created by the indexer
        gpu=True # if the torch.cuda returned False, comment this
    )
    colbert_index.index(vaswani.get_corpus_iter())
    print("Index successfully created!")

In [10]:
colbert_reranker = ColBERTFactory(colbert_checkpoint_path, index_root, index_name)

Some weights of ColBERT were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['linear.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Mar 26, 21:45:38] #> Loading model checkpoint.
[Mar 26, 21:45:38] #> Loading checkpoint c:\Users\Konstantin-Asen\Desktop\IR-research-project\colbert_checkpoint\colbert.dnn
[Mar 26, 21:45:39] #> checkpoint['epoch'] = 0
[Mar 26, 21:45:39] #> checkpoint['batch'] = 44500


  self.scaler = torch.cuda.amp.GradScaler()


In [11]:
ance_extract_dir = "ance_checkpoint"
ance_checkpoint_path = "ance_checkpoint.zip"

if not os.path.exists(ance_extract_dir):
    with zipfile.ZipFile(ance_checkpoint_path, 'r') as zip_ref:
        zip_ref.extractall(ance_extract_dir)

In [12]:
ance_checkpoint_path = os.path.abspath("ance_checkpoint")
ance_index_name = os.path.abspath("vaswani-ance-index")

if not os.path.exists(ance_index_name):
    print("Index not found. Creating a new Vaswani index for ANCE...")
    ance_index = ANCEIndexer(ance_checkpoint_path, ance_index_name, num_docs=11429)
    ance_index.index(vaswani.get_corpus_iter())
    print("Index successfully created!")

In [13]:
ance_reranker = ANCETextScorer(ance_checkpoint_path)

Using mean: False


Some weights of the model checkpoint at c:\Users\Konstantin-Asen\Desktop\IR-research-project\ance_checkpoint were not used when initializing RobertaDot_NLL_LN: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaDot_NLL_LN from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaDot_NLL_LN from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
pipe_dict = {
    "BM25": bm25,
    "BM25_RM3": bm25 >> rm3 >> bm25,
    "BM25_COLBERT": bm25 >> colbert_reranker.text_scorer(),
    "BM25_ANCE": bm25 >> ance_reranker,
    "BM25_RM3_COLBERT": bm25 >> rm3 >> bm25 >> colbert_reranker.text_scorer(),
    "BM25_RM3_ANCE": bm25 >> rm3 >> bm25 >> ance_reranker,
    "BM25_COLBERT_RM3": bm25 >> colbert_reranker.text_scorer() >> rm3 >> bm25,
    "BM25_COLBERT_ANCE": bm25 >> colbert_reranker.text_scorer() >> pt.text.get_text(vaswani_index) >> ance_reranker,
    "BM25_ANCE_RM3": bm25 >> ance_reranker >> rm3 >> bm25,
    "BM25_ANCE_COLBERT": bm25 >> ance_reranker >> colbert_reranker.text_scorer(),
    "BM25_RM3_COLBERT_ANCE": bm25 >> rm3 >> bm25 >> colbert_reranker.text_scorer() >> pt.text.get_text(vaswani_index) >> ance_reranker,
    "BM25_RM3_ANCE_COLBERT": bm25 >> rm3 >> bm25 >> ance_reranker >> colbert_reranker.text_scorer(),
    "BM25_COLBERT_RM3_ANCE": bm25 >> colbert_reranker.text_scorer() >> rm3 >> bm25 >> ance_reranker,
    "BM25_COLBERT_ANCE_RM3": bm25 >> colbert_reranker.text_scorer() >> pt.text.get_text(vaswani_index) >> ance_reranker >> rm3 >> bm25,
    "BM25_ANCE_RM3_COLBERT": bm25 >> ance_reranker >> rm3 >> bm25 >> colbert_reranker.text_scorer(),
    "BM25_ANCE_COLBERT_RM3": bm25 >> ance_reranker >> colbert_reranker.text_scorer() >> rm3 >> bm25
}

In [15]:
# Ran for 166 minutes

if not os.path.exists("vaswani-twofold/results.csv"):
    twofold_results = pt.Experiment(
        [
            bm25,
            bm25 >> rm3 >> bm25,
            bm25 >> colbert_reranker.text_scorer(),
            bm25 >> ance_reranker
        ],
        queries,
        qrels,
        ["map", "ndcg_cut_10", "recip_rank", "mrt"],
        ["BM25", "BM25_RM3", "BM25_COLBERT", "BM25_ANCE"],
        save_dir="vaswani-twofold",
        save_mode="reuse",
        baseline=0,
        correction="bonferroni"
    )
    twofold_results.to_csv("vaswani-twofold/results.csv", sep=',', na_rep="NaN", header=True, index=False)

In [16]:
# Ran for 335 minutes

if not os.path.exists("vaswani-threefold/results.csv"):
    bm25_rm3 = pt.Transformer.from_df(pt.io.read_results("vaswani-twofold/BM25_RM3.res.gz"), uniform=False)
    bm25_colbert = pt.Transformer.from_df(pt.io.read_results("vaswani-twofold/BM25_COLBERT.res.gz"), uniform=False)
    bm25_ance = pt.Transformer.from_df(pt.io.read_results("vaswani-twofold/BM25_ANCE.res.gz"), uniform=False)

    threefold_results = pt.Experiment(
        [
            bm25,
            bm25_rm3 >> pt.text.get_text(vaswani_index) >> colbert_reranker.text_scorer(),
            bm25_rm3 >> pt.text.get_text(vaswani_index) >> ance_reranker,
            bm25_colbert >> rm3 >> bm25,
            bm25_colbert >> pt.text.get_text(vaswani_index) >> ance_reranker,
            bm25_ance >> rm3 >> bm25,
            bm25_ance >> pt.text.get_text(vaswani_index) >> colbert_reranker.text_scorer()
        ],
        queries,
        qrels,
        ["map", "ndcg_cut_10", "recip_rank", "mrt"],
        ["BM25", "BM25_RM3_COLBERT", "BM25_RM3_ANCE", "BM25_COLBERT_RM3", "BM25_COLBERT_ANCE", "BM25_ANCE_RM3", "BM25_ANCE_COLBERT"],
        save_dir="vaswani-threefold",
        save_mode="reuse",
        baseline=0,
        correction="bonferroni"
    )
    threefold_results.to_csv("vaswani-threefold/results.csv", sep=',', na_rep="NaN", header=True, index=False)

  return torch.cuda.amp.autocast() if self.activated else nullcontext()
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Inferencing: 0it [00:00, ?it/s]

Not running in distributed mode


Inferencing: 1it [00:01,  1.04s/it]
Inferencing: 0it [00:00, ?it/s]

Not running in distributed mode


Inferencing: 727it [23:51,  1.97s/it]
Inferencing: 0it [00:00, ?it/s]

Not running in distributed mode


Inferencing: 1it [00:00,  1.09it/s]
Inferencing: 0it [00:00, ?it/s]

Not running in distributed mode


Inferencing: 719it [23:34,  1.97s/it]
  return torch.cuda.amp.autocast() if self.activated else nullcontext()


In [17]:
# Ran for 337 minutes

if not os.path.exists("vaswani-fourfold/results.csv"):
    bm25_rm3_colbert = pt.Transformer.from_df(pt.io.read_results("vaswani-threefold/BM25_RM3_COLBERT.res.gz"), uniform=False)
    bm25_rm3_ance = pt.Transformer.from_df(pt.io.read_results("vaswani-threefold/BM25_RM3_ANCE.res.gz"), uniform=False)
    bm25_colbert_rm3 = pt.Transformer.from_df(pt.io.read_results("vaswani-threefold/BM25_COLBERT_RM3.res.gz"), uniform=False)
    bm25_colbert_ance = pt.Transformer.from_df(pt.io.read_results("vaswani-threefold/BM25_COLBERT_ANCE.res.gz"), uniform=False)
    bm25_ance_rm3 = pt.Transformer.from_df(pt.io.read_results("vaswani-threefold/BM25_ANCE_RM3.res.gz"), uniform=False)
    bm25_ance_colbert = pt.Transformer.from_df(pt.io.read_results("vaswani-threefold/BM25_ANCE_COLBERT.res.gz"), uniform=False)

    fourfold_results = pt.Experiment(
        [
            bm25,
            bm25_rm3_colbert >> pt.text.get_text(vaswani_index) >> ance_reranker,
            bm25_rm3_ance >> pt.text.get_text(vaswani_index) >> colbert_reranker.text_scorer(),
            bm25_colbert_rm3 >> pt.text.get_text(vaswani_index) >> ance_reranker,
            bm25_colbert_ance >> rm3 >> bm25,
            bm25_ance_rm3 >> pt.text.get_text(vaswani_index) >> colbert_reranker.text_scorer(),
            bm25_ance_colbert >> rm3 >> bm25
        ],
        queries,
        qrels,
        ["map", "ndcg_cut_10", "recip_rank", "mrt"],
        ["BM25", "BM25_RM3_COLBERT_ANCE", "BM25_RM3_ANCE_COLBERT", "BM25_COLBERT_RM3_ANCE", "BM25_COLBERT_ANCE_RM3", "BM25_ANCE_RM3_COLBERT", "BM25_ANCE_COLBERT_RM3"],
        save_dir="vaswani-fourfold",
        save_mode="reuse",
        baseline=0,
        correction="bonferroni"
    )
    fourfold_results.to_csv("vaswani-fourfold/results.csv", sep=',', na_rep="NaN", header=True, index=False)

Inferencing: 0it [00:00, ?it/s]

Not running in distributed mode


Inferencing: 1it [00:01,  1.35s/it]
Inferencing: 0it [00:00, ?it/s]

Not running in distributed mode


Inferencing: 727it [23:47,  1.96s/it]
  return torch.cuda.amp.autocast() if self.activated else nullcontext()
Inferencing: 0it [00:00, ?it/s]

Not running in distributed mode


Inferencing: 1it [00:01,  1.59s/it]
Inferencing: 0it [00:00, ?it/s]

Not running in distributed mode


Inferencing: 727it [23:46,  1.96s/it]
  return torch.cuda.amp.autocast() if self.activated else nullcontext()
