## Setting Up ColBERT

In [1]:
!sudo apt-get remove openjdk-11-jdk -y

# Add Java 21 source
!sudo add-apt-repository ppa:openjdk-r/ppa -y
!sudo apt-get update
!sudo apt install openjdk-21-jdk -y

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

!java -version

!pip install 'pyserini[optional]' faiss-cpu

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Package 'openjdk-11-jdk' is not installed, so not removed
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
PPA publishes dbgsym, you may need to include 'main/debug' component
Repository: 'deb https://ppa.launchpadcontent.net/openjdk-r/ppa/ubuntu/ jammy main'
More info: https://launchpad.net/~openjdk-r/+archive/ubuntu/ppa
Adding repository.
Adding deb entry to /etc/apt/sources.list.d/openjdk-r-ubuntu-ppa-jammy.list
Adding disabled deb-src entry to /etc/apt/sources.list.d/openjdk-r-ubuntu-ppa-jammy.list
Adding key to /etc/apt/trusted.gpg.d/openjdk-r-ubuntu-ppa.gpg with fingerprint F7C313DB11F1ED148BB5117C08B3810CB7017B89
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 https://r2u.stat.illin

In [2]:
!pip uninstall -y transformers
!rm -rf /usr/local/lib/python3.11/dist-packages/transformers
!rm -rf /usr/local/lib/python3.11/dist-packages/transformers-*

!pip install transformers==4.41.1

Found existing installation: transformers 4.52.4
Uninstalling transformers-4.52.4:
  Successfully uninstalled transformers-4.52.4
Collecting transformers==4.41.1
  Downloading transformers-4.41.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.1)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.41.1-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m74.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstal

## Creating a JSON Lines files with the entire dataset

Make sure to merge the 3 csv files together before creeating the .jsonl

In [42]:
import pandas as pd
import json

df = pd.read_csv("/content/Train_set1.csv")
doc_dict = {}

with open("docs.jsonl", "w", encoding="utf-8") as f:
    for _, row in df.iterrows():
        content = f"{row['Title']} {row['Body']} {row['Tags']}"
        f.write(json.dumps({"id": str(row['Id']), "contents": content}) + '\n')
        doc_dict[str(row['Id'])] = content

## Creating & Storing the Embedding

Will take about 5 min for the entire merged corpus

In [43]:
!python -m pyserini.encode \
  input --corpus docs.jsonl --field text --delimiter '\n' \
  output --embeddings encoded_docs --to-faiss \
  encoder --encoder colbert-ir/colbertv2.0 --fields text

10000it [00:00, 134700.93it/s]
100% 157/157 [02:31<00:00,  1.04it/s]


#### Make sure to download the encoded_docs folder from colab disk

## Setting Up Reranker(Cross Encoder) Model

In [44]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')       # make sure it is cuda

checkpoint = "cross-encoder/ms-marco-MiniLM-L-6-v2"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [45]:
device

device(type='cuda')

In [50]:
model = model.to(device)

## Retrieval System

In [49]:
from pyserini.dsearch import FaissSearcher

index = "encoded_docs"
colbert_model = 'colbert-ir/colbertv2.0'

searcher = FaissSearcher(index, colbert_model)

### Testing the retrieval system

In [65]:
query = "What is the probabilty of a next random number given previous numbers?"
hits = searcher.search(query, k = 100)

reranked = []
for hit in hits[:10]:
  doc = doc_dict[hit.docid]
  inputs = tokenizer.encode_plus(query, doc, return_tensors="pt", truncation=True, max_length=512)
  inputs = {k: v.to(device) for k, v in inputs.items()}

  with torch.no_grad():
      outputs = model(**inputs)
      logits = outputs.logits
      score = logits[0][0].item()

  reranked.append((hit.docid, score))


reranked.sort(key=lambda x: x[1], reverse=True)
reranked = reranked[:5]

In [66]:
hits[:10]

[DenseSearchResult(docid='1', score=np.float32(103.66195)),
 DenseSearchResult(docid='3295', score=np.float32(101.52556)),
 DenseSearchResult(docid='162', score=np.float32(100.04333)),
 DenseSearchResult(docid='5699', score=np.float32(99.85113)),
 DenseSearchResult(docid='2880', score=np.float32(98.82381)),
 DenseSearchResult(docid='7762', score=np.float32(97.71274)),
 DenseSearchResult(docid='4307', score=np.float32(97.10241)),
 DenseSearchResult(docid='751', score=np.float32(96.67285)),
 DenseSearchResult(docid='8362', score=np.float32(95.40138)),
 DenseSearchResult(docid='2401', score=np.float32(94.90433))]

In [67]:
reranked

[('1', 4.798033714294434),
 ('162', -4.097599983215332),
 ('4307', -5.535955905914307),
 ('751', -7.902719020843506),
 ('5699', -8.08521556854248)]

### Extracting test results

In [69]:
import pandas as pd

df = pd.read_csv('/content/Test_Data.csv')

len(df)

50

In [70]:
df.head(3)

Unnamed: 0,query_id,query,context
0,q_1,a^2 + b^2 = c^2,Pythagorean theorem in geometry
1,q_2,x^2 + y^2 = r^2,Circle equation in spherical coordinates
2,q_3,(x - x_0)^2 + (y - y_0)^2 = r^2,"Circle with center (x_0, y_0) in optimization"


In [74]:
# query_ID, retrieved_body_ID, Run No., Similarity Score
output = []

for i in range(len(df)):

    row = df.iloc[i]
    query = row['query'] + row['context']

    hits = searcher.search(query, k = 100)

    reranked = []

    for hit in hits[:100]:
      doc = doc_dict[hit.docid]
      inputs = tokenizer.encode_plus(query, doc, return_tensors="pt", truncation=True, max_length=512)
      inputs = {k: v.to(device) for k, v in inputs.items()}

      with torch.no_grad():
          outputs = model(**inputs)
          logits = outputs.logits
          score = logits[0][0].item()

      reranked.append((hit.docid, score))


    reranked.sort(key=lambda x: x[1], reverse=True)
    reranked = reranked[:50]

    for doc_id, score in reranked:
        output.append({
            "query_ID": row['query_id'],
            "retrieved_body_ID": doc_id,
            "Run No.": 1,
            "Similarity Score": score
            })

In [75]:
result = pd.DataFrame(output)
result.head(3)

Unnamed: 0,query_ID,retrieved_body_ID,Run No.,Similarity Score
0,q_1,4111,1,3.76895
1,q_1,6992,1,2.580899
2,q_1,6513,1,2.579618


In [77]:
result.to_csv('output.csv', index=False)