In [1]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [2]:
!pip install datasets



In [4]:
import pandas as pd

# Load the document corpus (columns: Id, Title, Body, Tags).
# NOTE(review): `/content` looks like a Colab working directory -- the CSV has to be
# uploaded there before this cell is run.
df = pd.read_csv('/content/documents.csv')

In [5]:
from datasets import Dataset

# Wrap the DataFrame in a `datasets.Dataset` so the later cells can use `.map`
# and the FAISS index helpers on it.
documents = Dataset.from_pandas(df)

# Display the schema and the row count.
documents

Dataset({
    features: ['Id', 'Title', 'Body', 'Tags'],
    num_rows: 415
})

In [6]:
# Searching over title, body and tags together: each document is embedded from one combined string
def concatenate_text(examples):
    """
    Build the single ``text`` field from a document's Title, Body and Tags.

    Args:
        examples: Mapping with ``Title``, ``Body`` and ``Tags`` keys.

    Returns:
        dict: ``{"text": ...}`` holding the title, then a space-newline-space
        separator, the body, a newline, and the tags.

    A field whose value is ``None`` is treated as an empty string, so a single
    missing value does not raise ``TypeError`` during concatenation. The
    separators are still emitted, which keeps the output for complete rows
    unchanged.
    """

    def _text(value):
        # Only None is replaced; every other value is passed through untouched.
        return "" if value is None else value

    return {
        "text": _text(examples["Title"])
        + " \n "
        + _text(examples["Body"])
        + "\n"
        + _text(examples["Tags"])
    }


# Add the combined `text` column to every document. The result is bound to a new
# name, so `documents` stays available exactly as it was loaded.
dataset = documents.map(concatenate_text)

Map:   0%|          | 0/415 [00:00<?, ? examples/s]

In [7]:
# Sanity check: the schema should now list the `text` column next to the original ones.
dataset

Dataset({
    features: ['Id', 'Title', 'Body', 'Tags', 'text'],
    num_rows: 415
})

In [35]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

# we will use MathBERT (tbs17/MathBERT) for embedding generation
checkpoint = "tbs17/MathBERT"

# NOTE: this tokenizer ships without a predefined maximum length (transformers warns
# about it on first use), so `truncation=True` only takes effect when `max_length`
# is passed explicitly in the tokenizer call.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/441M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [36]:
def cls_pooling(model_output):
    """Return, for every item in the batch, the final hidden state at sequence position 0 (the [CLS] token)."""
    final_hidden = model_output.last_hidden_state
    return final_hidden[:, 0]

In [37]:
def get_embeddings(text_list, max_length=512):
    """
    Embed one text or a list of texts with the notebook-level ``tokenizer`` and ``model``.

    Args:
        text_list: A single string or a list of strings.
        max_length: Maximum number of tokens kept per text. It is passed
            explicitly because the tokenizer defines no maximum length of its
            own, in which case ``truncation=True`` alone performs no truncation.
            NOTE(review): 512 mirrors the limit used for the re-ranker tokenizer
            in this notebook and the usual BERT position limit -- confirm when
            switching to another checkpoint.

    Returns:
        Tensor with one row per input text, as selected by ``cls_pooling``
        (final hidden state of the first token).
    """
    # Local import so this cell does not depend on where `torch` is imported in the notebook.
    import torch

    encoded_input = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    # Inference only: skip building the autograd graph. Callers that still call
    # `.detach()` on the result keep working.
    with torch.no_grad():
        model_output = model(**encoded_input)

    return cls_pooling(model_output)

In [38]:
# Smoke test: embed the text of the first document (same string as dataset["text"][0]).
emb = get_embeddings(dataset[0]["text"])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [39]:
# Width of one embedding vector (second axis of the batch-by-features result).
emb.shape[1]

768

In [40]:
# Generate the dataset with an `embeddings` column.
# FAISS expects embeddings as numpy arrays, so each tensor is detached, moved to
# the CPU and converted; `[0]` drops the leading batch axis of the single-text result.

embedding_dataset = dataset.map(
    lambda example: {
        "embeddings": get_embeddings(example["text"]).detach().cpu().numpy()[0]
    }
)

embedding_dataset

Map:   0%|          | 0/415 [00:00<?, ? examples/s]

Dataset({
    features: ['Id', 'Title', 'Body', 'Tags', 'text', 'embeddings'],
    num_rows: 415
})

In [41]:
# Build a FAISS index over the `embeddings` column so nearest-neighbour queries can be run against it.
# No metric is specified here, so the library default applies.
# NOTE(review): presumably L2 distance -- confirm in the `datasets` documentation.
embedding_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['Id', 'Title', 'Body', 'Tags', 'text', 'embeddings'],
    num_rows: 415
})

In [15]:
# Validation queries; the retrieval loop reads the `query_id` and `query` columns.
# NOTE(review): `/content` looks like a Colab working directory -- the CSV has to be
# uploaded there before this cell is run.
val_df = pd.read_csv('/content/val_data.csv')

In [16]:
# Preview the first validation queries.
val_df.head()

Unnamed: 0,query_id,query
0,1,How to solve a quadratic equation of the form ...
1,2,How to simplify algebraic expressions with exp...
2,3,What are logarithms and what are their propert...
3,4,How do you factor polynomials?
4,5,How to solve a system of linear equations?


In [42]:
# Cross-encoder used to re-score (query, document) pairs after the FAISS retrieval step.
# Note: this rebinds `checkpoint`, which previously named the embedding model.
checkpoint = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Model and tokenizer are loaded from the same checkpoint so their vocabularies match.
rr_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
rr_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [25]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [43]:
# Output rows: query_ID, retrieved_body_ID, Run No., Similarity Score
#
# Two-stage retrieval: a FAISS nearest-neighbour search over the document embeddings
# proposes candidates, then the cross-encoder re-scores every (query, document) pair
# and the best-scoring candidates are kept.
RETRIEVE_K = 100   # candidates fetched from the FAISS index per query
RERANK_TOP_N = 50  # re-ranked candidates kept per query
RUN_NO = 1         # value written to the "Run No." column

# The tokenized inputs are moved to `device` below, so the model has to live there
# too; otherwise the forward pass fails with a device mismatch whenever CUDA is available.
rr_model.to(device)
rr_model.eval()


def _rerank_score(query, doc):
    """Return the cross-encoder's raw logit for one (query, document) pair."""
    # Calling the tokenizer directly is the current API for encoding a text pair.
    inputs = rr_tokenizer(query, doc, return_tensors="pt", truncation=True, max_length=512)
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        logits = rr_model(**inputs).logits

    return logits[0][0].item()


output = []

for query_id, query in zip(val_df['query_id'], val_df['query']):

    # Inference only: no autograd graph is needed for the query embedding.
    with torch.no_grad():
        question_embedding = get_embeddings([query]).cpu().detach().numpy()

    # The FAISS scores are not used: the final ordering comes from the cross-encoder.
    _, samples = embedding_dataset.get_nearest_examples(
        "embeddings", question_embedding, k=RETRIEVE_K
    )

    reranked = [
        (doc_id, _rerank_score(query, doc))
        for doc_id, doc in zip(samples['Id'], samples['text'])
    ]
    # Higher logit first; the sort is stable, so ties keep their retrieval order.
    reranked.sort(key=lambda pair: pair[1], reverse=True)

    for doc_id, score in reranked[:RERANK_TOP_N]:
        output.append({
            "query_ID": query_id,
            "retrieved_body_ID": doc_id,
            "Run No.": RUN_NO,
            "Similarity Score": score,
        })

In [44]:
# One row per (query, retained document). "Similarity Score" holds the re-ranker
# score that was appended to `output`, not the FAISS score.
result = pd.DataFrame(output)
result.head()

Unnamed: 0,query_ID,retrieved_body_ID,Run No.,Similarity Score
0,1,97,1,8.495401
1,1,231,1,8.239658
2,1,217,1,7.268788
3,1,76,1,3.137625
4,1,1,1,3.029583


In [46]:
# Persist the run without the DataFrame index column. The path is relative, so the
# file is written to the kernel's current working directory.
result.to_csv('output_val_math_bert_reranked.csv', index=False)