In [1]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())

True
1


In [2]:
import pyterrier as pt
import pandas as pd
from itertools import filterfalse
import os
import wget
import zipfile

pt.init()

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()


In [3]:
from pyterrier_colbert.indexing import ColBERTIndexer
from pyterrier_colbert.ranking import ColBERTFactory

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from ance.pyterrier_ance import ANCEIndexer, ANCETextScorer

In [5]:
nfcorpus = pt.get_dataset("irds:beir/nfcorpus/test")

In [6]:
nfcorpus_index_src = os.path.abspath("nfcorpus-index")
if not os.path.exists(nfcorpus_index_src):
    print("Creating a new NFCorpus index for BM25 and RM3...")
    pt.index.IterDictIndexer(nfcorpus_index_src, blocks=True, meta={"docno": 20, "text": 4096}).index(nfcorpus.get_corpus_iter(), fields=["docno", "text"])

nfcorpus_index = pt.IndexFactory.of(nfcorpus_index_src)

Creating a new NFCorpus index for BM25 and RM3...


beir/nfcorpus/test documents: 100%|██████████| 3633/3633 [00:02<00:00, 1448.72it/s]


In [7]:
queries = nfcorpus.get_topics("text")
qrels = nfcorpus.get_qrels()

[INFO] [starting] opening zip file
[INFO] [finished] opening zip file [2ms]
[INFO] [starting] opening zip file
[INFO] [finished] opening zip file [1ms]


In [8]:
bm25 = pt.terrier.Retriever(nfcorpus_index, wmodel="BM25", metadata=["docno", "text"])
rm3 = pt.rewrite.RM3(nfcorpus_index)

In [9]:
checkpoint_url = "http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip"
extract_dir = "colbert_checkpoint"
checkpoint_path = "colbert_checkpoint.zip"

if not os.path.exists(checkpoint_path):
    print("Downloading checkpoint...")
    wget.download(checkpoint_url, checkpoint_path)
if not os.path.exists(extract_dir):
    with zipfile.ZipFile(checkpoint_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

colbert_checkpoint_path = os.path.abspath("colbert_checkpoint/colbert.dnn")
index_root = os.path.abspath("nfcorpus-index")
index_name = os.path.abspath("nfcorpus-colbert-index")

if not os.path.exists(index_name):
    print("Index not found. Creating a new NFCorpus index for ColBERT...")
    colbert_index = ColBERTIndexer(
        checkpoint=colbert_checkpoint_path,
        index_root=index_root,
        index_name=index_name,
        chunksize=64, # Maybe even 128, the allowed maximum --> it regulates the size of PyTorch temp files that are created by the indexer
        gpu=True # if the torch.cuda returned False, comment this
    )
    colbert_index.index(nfcorpus.get_corpus_iter())
    print("Index successfully created!")

Index not found. Creating a new NFCorpus index for ColBERT...


beir/nfcorpus/test documents:   0%|          | 0/3633 [00:00<?, ?it/s]

[Mar 29, 15:42:50] [0] 		 #> Local args.bsize = 128
[Mar 29, 15:42:50] [0] 		 #> args.index_root = c:\Users\Konstantin-Asen\Desktop\IR-research-project\nfcorpus-index
[Mar 29, 15:42:50] [0] 		 #> self.possible_subset_sizes = [1491308]


Some weights of ColBERT were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['linear.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Mar 29, 15:42:52] #> Loading model checkpoint.
[Mar 29, 15:42:52] #> Loading checkpoint c:\Users\Konstantin-Asen\Desktop\IR-research-project\colbert_checkpoint\colbert.dnn
[Mar 29, 15:42:53] #> checkpoint['epoch'] = 0
[Mar 29, 15:42:53] #> checkpoint['batch'] = 44500




[Mar 29, 15:42:53] #> Note: Output directory c:\Users\Konstantin-Asen\Desktop\IR-research-project\nfcorpus-index already exists




[Mar 29, 15:42:53] #> Creating directory c:\Users\Konstantin-Asen\Desktop\IR-research-project\nfcorpus-colbert-index 




beir/nfcorpus/test documents: 100%|██████████| 3633/3633 [00:03<00:00, 1132.10it/s]


[Mar 29, 15:44:13] [0] 		 [NOTE] Done with local share.
[Mar 29, 15:44:13] [0] 		 #> Joining saver thread.
#> num_embeddings = 636776
[Mar 29, 15:44:13] #> Starting..
[Mar 29, 15:44:13] #> Processing slice #1 of 1 (range 0..1).
[Mar 29, 15:44:13] #> Will write to c:\Users\Konstantin-Asen\Desktop\IR-research-project\nfcorpus-colbert-index\ivfpq.100.faiss.
[Mar 29, 15:44:13] #> Loading c:\Users\Konstantin-Asen\Desktop\IR-research-project\nfcorpus-colbert-index\0.sample ...
#> Sample has shape (31838, 128)
[Mar 29, 15:44:13] #> Training with the vectors...
[Mar 29, 15:44:13] #> Training now (using 0 GPUs)...
0.39649462699890137
[Mar 29, 15:44:14] Done training!

[Mar 29, 15:44:14] #> Indexing the vectors...
[Mar 29, 15:44:14] #> Loading ('c:\\Users\\Konstantin-Asen\\Desktop\\IR-research-project\\nfcorpus-colbert-index\\0.pt', None, None) (from queue)...
[Mar 29, 15:44:14] #> Processing a sub_collection with shape (636776, 128)
[Mar 29, 15:44:14] Add data with shape (636776, 128) (offset =

In [10]:
colbert_reranker = ColBERTFactory(colbert_checkpoint_path, index_root, index_name)

Some weights of ColBERT were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['linear.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Mar 29, 15:44:25] #> Loading model checkpoint.
[Mar 29, 15:44:25] #> Loading checkpoint c:\Users\Konstantin-Asen\Desktop\IR-research-project\colbert_checkpoint\colbert.dnn
[Mar 29, 15:44:26] #> checkpoint['epoch'] = 0
[Mar 29, 15:44:26] #> checkpoint['batch'] = 44500


  self.scaler = torch.cuda.amp.GradScaler()


In [11]:
ance_extract_dir = "ance_checkpoint"
ance_checkpoint_path = "ance_checkpoint.zip"

if not os.path.exists(ance_extract_dir):
    with zipfile.ZipFile(ance_checkpoint_path, 'r') as zip_ref:
        zip_ref.extractall(ance_extract_dir)

In [12]:
ance_checkpoint_path = os.path.abspath("ance_checkpoint")
ance_index_name = os.path.abspath("nfcorpus-ance-index")

if not os.path.exists(ance_index_name):
    print("Index not found. Creating a new NFCorpus index for ANCE...")
    ance_index = ANCEIndexer(ance_checkpoint_path, ance_index_name, num_docs=3633)
    ance_index.index(nfcorpus.get_corpus_iter())
    print("Index successfully created!")

Index not found. Creating a new NFCorpus index for ANCE...


beir/nfcorpus/test documents:   0%|          | 0/3633 [00:00<?, ?it/s]

Loading checkpoint c:\Users\Konstantin-Asen\Desktop\IR-research-project\ance_checkpoint
Using mean: False


Some weights of the model checkpoint at c:\Users\Konstantin-Asen\Desktop\IR-research-project\ance_checkpoint were not used when initializing RobertaDot_NLL_LN: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaDot_NLL_LN from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaDot_NLL_LN from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy mo

Segment 0


beir/nfcorpus/test documents:   1%|          | 29/3633 [00:01<02:57, 20.29it/s] 


Not running in distributed mode


beir/nfcorpus/test documents: 100%|██████████| 3633/3633 [01:03<00:00, 56.93it/s]
Indexing: 100%|██████████| 3633/3633 [01:01<00:00, 58.63d/s]
Inferencing: 29it [01:02,  2.16s/it]

Index successfully created!





In [13]:
ance_reranker = ANCETextScorer(ance_checkpoint_path)

Using mean: False


Some weights of the model checkpoint at c:\Users\Konstantin-Asen\Desktop\IR-research-project\ance_checkpoint were not used when initializing RobertaDot_NLL_LN: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaDot_NLL_LN from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaDot_NLL_LN from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
pipe_dict = {
    "BM25": bm25,
    "BM25_RM3": bm25 >> rm3 >> bm25,
    "BM25_COLBERT": bm25 >> colbert_reranker.text_scorer(),
    "BM25_ANCE": bm25 >> ance_reranker,
    "BM25_RM3_COLBERT": bm25 >> rm3 >> bm25 >> colbert_reranker.text_scorer(),
    "BM25_RM3_ANCE": bm25 >> rm3 >> bm25 >> ance_reranker,
    "BM25_COLBERT_RM3": bm25 >> colbert_reranker.text_scorer() >> rm3 >> bm25,
    "BM25_COLBERT_ANCE": bm25 >> colbert_reranker.text_scorer() >> pt.text.get_text(nfcorpus_index) >> ance_reranker,
    "BM25_ANCE_RM3": bm25 >> ance_reranker >> rm3 >> bm25,
    "BM25_ANCE_COLBERT": bm25 >> ance_reranker >> colbert_reranker.text_scorer(),
    "BM25_RM3_COLBERT_ANCE": bm25 >> rm3 >> bm25 >> colbert_reranker.text_scorer() >> pt.text.get_text(nfcorpus_index) >> ance_reranker,
    "BM25_RM3_ANCE_COLBERT": bm25 >> rm3 >> bm25 >> ance_reranker >> colbert_reranker.text_scorer(),
    "BM25_COLBERT_RM3_ANCE": bm25 >> colbert_reranker.text_scorer() >> rm3 >> bm25 >> ance_reranker,
    "BM25_COLBERT_ANCE_RM3": bm25 >> colbert_reranker.text_scorer() >> pt.text.get_text(nfcorpus_index) >> ance_reranker >> rm3 >> bm25,
    "BM25_ANCE_RM3_COLBERT": bm25 >> ance_reranker >> rm3 >> bm25 >> colbert_reranker.text_scorer(),
    "BM25_ANCE_COLBERT_RM3": bm25 >> ance_reranker >> colbert_reranker.text_scorer() >> rm3 >> bm25
}

In [15]:
# Ran for 220 minutes

if not os.path.exists("nfcorpus-twofold/results.csv"):
    twofold_results = pt.Experiment(
        [
            bm25,
            bm25 >> rm3 >> bm25,
            bm25 >> colbert_reranker.text_scorer(),
            bm25 >> ance_reranker
        ],
        queries,
        qrels,
        ["map", "ndcg_cut_10", "recip_rank", "mrt"],
        ["BM25", "BM25_RM3", "BM25_COLBERT", "BM25_ANCE"],
        save_dir="nfcorpus-twofold",
        save_mode="reuse",
        baseline=0,
        correction="bonferroni"
    )
    twofold_results.to_csv("nfcorpus-twofold/results.csv", sep=',', na_rep="NaN", header=True, index=False)

  return torch.cuda.amp.autocast() if self.activated else nullcontext()
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Inferencing: 0it [00:00, ?it/s]

Not running in distributed mode


Inferencing: 3it [00:02,  1.17it/s]
Inferencing: 0it [00:00, ?it/s]

Not running in distributed mode


Inferencing: 932it [30:46,  1.98s/it]


In [None]:
# Ran for ... minutes

if not os.path.exists("nfcorpus-threefold/results.csv"):
    bm25_rm3 = pt.Transformer.from_df(pt.io.read_results("nfcorpus-twofold/BM25_RM3.res.gz"), uniform=False)
    bm25_colbert = pt.Transformer.from_df(pt.io.read_results("nfcorpus-twofold/BM25_COLBERT.res.gz"), uniform=False)
    bm25_ance = pt.Transformer.from_df(pt.io.read_results("nfcorpus-twofold/BM25_ANCE.res.gz"), uniform=False)

    threefold_results = pt.Experiment(
        [
            bm25,
            bm25_rm3 >> pt.text.get_text(nfcorpus_index) >> colbert_reranker.text_scorer(),
            bm25_rm3 >> pt.text.get_text(nfcorpus_index) >> ance_reranker,
            bm25_colbert >> rm3 >> bm25,
            bm25_colbert >> pt.text.get_text(nfcorpus_index) >> ance_reranker,
            bm25_ance >> rm3 >> bm25,
            bm25_ance >> pt.text.get_text(nfcorpus_index) >> colbert_reranker.text_scorer()
        ],
        queries,
        qrels,
        ["map", "ndcg_cut_10", "recip_rank", "mrt"],
        ["BM25", "BM25_RM3_COLBERT", "BM25_RM3_ANCE", "BM25_COLBERT_RM3", "BM25_COLBERT_ANCE", "BM25_ANCE_RM3", "BM25_ANCE_COLBERT"],
        save_dir="nfcorpus-threefold",
        save_mode="reuse",
        baseline=0,
        correction="bonferroni"
    )
    threefold_results.to_csv("nfcorpus-threefold/results.csv", sep=',', na_rep="NaN", header=True, index=False)

In [None]:
# Ran for ... minutes

if not os.path.exists("nfcorpus-fourfold/results.csv"):
    bm25_rm3_colbert = pt.Transformer.from_df(pt.io.read_results("nfcorpus-threefold/BM25_RM3_COLBERT.res.gz"), uniform=False)
    bm25_rm3_ance = pt.Transformer.from_df(pt.io.read_results("nfcorpus-threefold/BM25_RM3_ANCE.res.gz"), uniform=False)
    bm25_colbert_rm3 = pt.Transformer.from_df(pt.io.read_results("nfcorpus-threefold/BM25_COLBERT_RM3.res.gz"), uniform=False)
    bm25_colbert_ance = pt.Transformer.from_df(pt.io.read_results("nfcorpus-threefold/BM25_COLBERT_ANCE.res.gz"), uniform=False)
    bm25_ance_rm3 = pt.Transformer.from_df(pt.io.read_results("nfcorpus-threefold/BM25_ANCE_RM3.res.gz"), uniform=False)
    bm25_ance_colbert = pt.Transformer.from_df(pt.io.read_results("nfcorpus-threefold/BM25_ANCE_COLBERT.res.gz"), uniform=False)

    fourfold_results = pt.Experiment(
        [
            bm25,
            bm25_rm3_colbert >> pt.text.get_text(nfcorpus_index) >> ance_reranker,
            bm25_rm3_ance >> pt.text.get_text(nfcorpus_index) >> colbert_reranker.text_scorer(),
            bm25_colbert_rm3 >> pt.text.get_text(nfcorpus_index) >> ance_reranker,
            bm25_colbert_ance >> rm3 >> bm25,
            bm25_ance_rm3 >> pt.text.get_text(nfcorpus_index) >> colbert_reranker.text_scorer(),
            bm25_ance_colbert >> rm3 >> bm25
        ],
        queries,
        qrels,
        ["map", "ndcg_cut_10", "recip_rank", "mrt"],
        ["BM25", "BM25_RM3_COLBERT_ANCE", "BM25_RM3_ANCE_COLBERT", "BM25_COLBERT_RM3_ANCE", "BM25_COLBERT_ANCE_RM3", "BM25_ANCE_RM3_COLBERT", "BM25_ANCE_COLBERT_RM3"],
        save_dir="nfcorpus-fourfold",
        save_mode="reuse",
        baseline=0,
        correction="bonferroni"
    )
    fourfold_results.to_csv("nfcorpus-fourfold/results.csv", sep=',', na_rep="NaN", header=True, index=False)