## Setting Up ColBERT

In [None]:
# Environment provisioning: replace the runtime's JDK with OpenJDK 21, expose it through
# JAVA_HOME/PATH, then install Pyserini and Faiss.
# NOTE(review): presumably the JDK is needed by Pyserini — confirm the required Java version
# against the Pyserini installation guide.

# Remove the OpenJDK 11 package if it is installed.
!sudo apt-get remove openjdk-11-jdk -y

# Add Java 21 source
!sudo add-apt-repository ppa:openjdk-r/ppa -y
!sudo apt-get update
!sudo apt install openjdk-21-jdk -y

import os
# Point JAVA_HOME at the JDK installed above and put its bin/ directory first on PATH so that
# `java` resolves to it for this kernel and for the `!` commands it spawns.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# Sanity check: print the version of the `java` binary now found on PATH.
!java -version

# Install Pyserini with its "optional" extras together with the CPU build of Faiss.
!pip install 'pyserini[optional]' faiss-cpu

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Package 'openjdk-11-jdk' is not installed, so not removed
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
PPA publishes dbgsym, you may need to include 'main/debug' component
Repository: 'deb https://ppa.launchpadcontent.net/openjdk-r/ppa/ubuntu/ jammy main'
More info: https://launchpad.net/~openjdk-r/+archive/ubuntu/ppa
Adding repository.
Adding deb entry to /etc/apt/sources.list.d/openjdk-r-ubuntu-ppa-jammy.list
Adding disabled deb-src entry to /etc/apt/sources.list.d/openjdk-r-ubuntu-ppa-jammy.list
Adding key to /etc/apt/trusted.gpg.d/openjdk-r-ubuntu-ppa.gpg with fingerprint F7C313DB11F1ED148BB5117C08B3810CB7017B89
Get:1 https://cli.github.com/packages stable InRelease [3,917 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 ht

In [None]:
!pip uninstall -y transformers
!rm -rf /usr/local/lib/python3.11/dist-packages/transformers
!rm -rf /usr/local/lib/python3.11/dist-packages/transformers-*

!pip install transformers==4.41.1

Found existing installation: transformers 4.55.2
Uninstalling transformers-4.55.2:
  Successfully uninstalled transformers-4.55.2
Collecting transformers==4.41.1
  Downloading transformers-4.41.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.1)
  Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.41.1-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstal

## Creating a JSON Lines file with the entire dataset

Make sure to merge the 3 CSV files together before creating the .jsonl file.

In [None]:
import pandas as pd
import json

# Load the corpus; each row supplies an Id plus Title, Body and Tags text columns.
df1 = pd.read_csv("/content/documents.csv")

# docid (as str) -> document text; looked up again when reranking retrieved hits.
doc_dict = {}

# Columns concatenated (space-separated, in this order) into each document's text.
text_columns = ("Title", "Body", "Tags")

# Write one JSON object per line with the keys "id" and "contents".
with open("docs.jsonl", "w", encoding="utf-8") as f:
    for _, row in df1.iterrows():
        # Empty CSV cells are read as NaN; formatting them directly would inject the
        # literal word "nan" into the indexed/reranked text, so missing fields are skipped.
        parts = [str(row[col]) for col in text_columns if pd.notna(row[col])]
        content = " ".join(parts)
        doc_id = str(row['Id'])
        f.write(json.dumps({"id": doc_id, "contents": content}) + '\n')
        doc_dict[doc_id] = content

## Creating & Storing the Embedding

Encoding takes about 5 minutes for the entire merged corpus.

In [None]:
# Encode the documents in docs.jsonl with the colbert-ir/colbertv2.0 checkpoint and write the
# resulting embeddings to encoded_docs/ in Faiss format (--to-faiss).
# NOTE(review): the input section spells its flag `--field` while the encoder section uses
# `--fields`; presumably this is accepted as an argument-name abbreviation — confirm against
# the Pyserini encode documentation.
!python -m pyserini.encode \
  input --corpus docs.jsonl --field text --delimiter '\n' \
  output --embeddings encoded_docs --to-faiss \
  encoder --encoder colbert-ir/colbertv2.0 --fields text

config.json: 100% 743/743 [00:00<00:00, 4.69MB/s]
model.safetensors: 100% 438M/438M [00:05<00:00, 85.0MB/s]
tokenizer_config.json: 100% 405/405 [00:00<00:00, 3.19MB/s]
vocab.txt: 232kB [00:00, 13.5MB/s]
tokenizer.json: 466kB [00:00, 38.5MB/s]
special_tokens_map.json: 100% 112/112 [00:00<00:00, 927kB/s]
415it [00:00, 126665.42it/s]
100% 7/7 [00:12<00:00,  1.82s/it]


#### Make sure to download the `encoded_docs` folder from the Colab disk

## Setting Up the Reranker (Cross-Encoder) Model

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')       # make sure it is cuda

# Hugging Face Hub id of the cross-encoder used for reranking.
checkpoint = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Load the sequence-classification model and its matching tokenizer from the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [None]:
# Display the selected torch device; it is expected to be CUDA when a GPU runtime is attached.
device

device(type='cuda')

In [None]:
# Move the cross-encoder to the selected device; the tokenized inputs are sent to the same
# device before each forward pass.
model = model.to(device)

## Retrieval System

In [None]:
# FaissSearcher is published under pyserini.search.faiss; pyserini.dsearch is the legacy
# location. Prefer the current path and fall back so older installs keep working.
try:
    from pyserini.search.faiss import FaissSearcher
except ImportError:
    from pyserini.dsearch import FaissSearcher

# Directory produced by the encoding step (Faiss index + docids).
index = "encoded_docs"
# Query encoder checkpoint; must be the same model that encoded the documents.
colbert_model = 'colbert-ir/colbertv2.0'

# Dense searcher over the local index, encoding queries with `colbert_model`.
searcher = FaissSearcher(index, colbert_model)

### Testing the retrieval system

In [None]:
# Smoke test: dense retrieval for one query, then cross-encoder reranking of the top hits.
query = "What is the probabilty of a next random number given previous numbers?"
hits = searcher.search(query, k = 100)

# Only the ten best dense hits are passed to the cross-encoder.
candidates = hits[:10]

scored_pairs = []
with torch.no_grad():
    for candidate in candidates:
        passage = doc_dict[candidate.docid]
        encoded = tokenizer.encode_plus(query, passage, return_tensors="pt", truncation=True, max_length=512)
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        # First value of the first (only) row of logits is used as the relevance score.
        relevance = model(**encoded).logits[0][0].item()
        scored_pairs.append((candidate.docid, relevance))

# Highest score first; keep the five best (docid, score) pairs.
reranked = sorted(scored_pairs, key=lambda pair: pair[1], reverse=True)[:5]

In [None]:
# Show the ten highest-ranked dense retrieval results (docid and retrieval score) for `query`.
hits[:10]

[DenseSearchResult(docid='159', score=np.float32(99.680504)),
 DenseSearchResult(docid='73', score=np.float32(97.526375)),
 DenseSearchResult(docid='350', score=np.float32(97.36194)),
 DenseSearchResult(docid='304', score=np.float32(96.84804)),
 DenseSearchResult(docid='368', score=np.float32(91.45996)),
 DenseSearchResult(docid='274', score=np.float32(90.74)),
 DenseSearchResult(docid='303', score=np.float32(90.002815)),
 DenseSearchResult(docid='341', score=np.float32(89.5518)),
 DenseSearchResult(docid='340', score=np.float32(89.32342)),
 DenseSearchResult(docid='19', score=np.float32(88.80042))]

In [None]:
# Show the (docid, cross-encoder score) pairs kept after reranking, best score first.
reranked

[('1', 4.798033714294434),
 ('162', -4.097599983215332),
 ('4307', -5.535955905914307),
 ('751', -7.902719020843506),
 ('5699', -8.08521556854248)]

### Extracting test results

In [None]:
import pandas as pd

# Load the validation set; the loop further down reads its `query_id` and `query` columns.
df = pd.read_csv('/content/val_data.csv')

# Number of rows loaded.
len(df)

21

In [None]:
# Preview the first three validation rows.
df.head(3)

Unnamed: 0,query_id,query
0,1,How to solve a quadratic equation of the form ...
1,2,How to simplify algebraic expressions with exp...
2,3,What are logarithms and what are their propert...


In [None]:
# Build the run rows: query_ID, retrieved_body_ID, Run No., Similarity Score
RETRIEVE_K = 100    # dense candidates fetched from the Faiss index per query
RERANK_TOP_N = 50   # reranked documents kept per query
RUN_NO = 1          # value written to the "Run No." column


def rerank_hits(query, hits, top_n):
    """Score every hit against `query` with the cross-encoder and keep the best `top_n`.

    Args:
        query: Query text.
        hits: Search results; each must expose a `docid` that is a key of `doc_dict`.
        top_n: Maximum number of results to return.

    Returns:
        List of `(docid, score)` tuples sorted by descending score, where `score` is the
        first value of the model's logits for the (query, document) pair.

    Raises:
        KeyError: If a hit's docid has no text in `doc_dict` (index and `doc_dict` were
            built from different corpora).
    """
    scored = []
    with torch.no_grad():
        for hit in hits:
            if hit.docid not in doc_dict:
                raise KeyError(
                    f"docid {hit.docid!r} is in the index but not in doc_dict; "
                    "rebuild docs.jsonl/doc_dict and the index from the same corpus"
                )
            # Calling the tokenizer directly replaces the deprecated `encode_plus`.
            inputs = tokenizer(query, doc_dict[hit.docid], return_tensors="pt", truncation=True, max_length=512)
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            # One pair per forward pass: take the first value of the first logits row.
            scored.append((hit.docid, model(**inputs).logits[0][0].item()))

    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:top_n]


output = []

for _, row in df.iterrows():
    query = row['query']

    # Stage 1: dense retrieval. Stage 2: cross-encoder reranking of every candidate.
    hits = searcher.search(query, k=RETRIEVE_K)

    for doc_id, score in rerank_hits(query, hits, RERANK_TOP_N):
        output.append({
            "query_ID": row['query_id'],
            "retrieved_body_ID": doc_id,
            "Run No.": RUN_NO,
            "Similarity Score": score
            })

In [None]:
# Turn the collected row dicts into a DataFrame (one row per query/document pair) and
# preview the first three rows.
result = pd.DataFrame(output)
result.head(3)

Unnamed: 0,query_ID,retrieved_body_ID,Run No.,Similarity Score
0,1,97,1,8.495399
1,1,3,1,8.469643
2,1,231,1,8.239656


In [None]:
# Save the run to output_val.csv without writing the DataFrame index as a column.
result.to_csv('output_val.csv', index=False)