# Semantic search with FAISS (PyTorch)

# Libs

In [24]:
# ! pip install datasets evaluate transformers[sentencepiece]

# reference:
# [1] https://stackoverflow.com/questions/58957169/faiss-error-could-not-find-a-version-that-satisfies-the-requirement-faiss-from/58957380
# [1] Self-summary: 
#   1.1 Python version too high (for example: 3.13 has problem with installing faiss)
#   1.2 must state the cuda version explicitly while installing faiss -> (after install torch) check 'nvidia-...' version in  `conda list > requirement.txt` 
# ! pip install faiss-gpu-cu12 # (NO NEED TO INSTALL THIS, WE DO NOT USE GPU)
# ! pip install faiss-cpu 
# ! pip install torch

In [1]:
# reference: https://stackoverflow.com/questions/20554074/sklearn-omp-error-15-initializing-libiomp5md-dll-but-found-mk2iomp5md-dll-a
# for solving crashing problem: 22:48:44.157 [error] Disposing session as kernel process died ExitCode: 3, 
# Reason: OMP: Error #15: Initializing libomp140.x86_64.dll, but found libiomp5md.dll already initialized
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [2]:
import pandas as pd

# Data processing

In [3]:
df= pd.read_csv("./df_combine.csv")
print(f"row counts in df: {len(df)}")

row counts in df: 1848


In [4]:
# df["claims"][0], df["title"][0]

In [5]:
# convert panda data frame to dataset
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['title', 'description', 'claims', 'ipc'],
    num_rows: 1848
})

In [7]:
# remove column description
columns = dataset.column_names
columns_to_keep = ["title", "claims", "ipc"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
dataset = dataset.remove_columns(columns_to_remove)
dataset


Dataset({
    features: ['title', 'claims', 'ipc'],
    num_rows: 1848
})

> Now that we **have one comment per row**, let’s **create a new claims_length column** that **contains the number of words per claims**:  

In [8]:
description_dataset = dataset.map(
    lambda x: {"claims_length": len(x["claims"].split())}
)

Map: 100%|██████████| 1848/1848 [00:00<00:00, 13325.83 examples/s]


> We can **use this new column to filter out short comments**, which typically **include things like “cc @lewtun” or “Thanks!” that are not relevant** for our search engine. There’s **no precise number to select for the filter**, **but around 15 words** seems like a good start:  

In [9]:
description_dataset = description_dataset.filter(lambda x: x["claims_length"] > 15)
description_dataset

Filter: 100%|██████████| 1848/1848 [00:00<00:00, 216570.94 examples/s]


Dataset({
    features: ['title', 'claims', 'ipc', 'claims_length'],
    num_rows: 1847
})

In [10]:
# description_dataset[0]['claims_length']

In [11]:
max = 0
for i in range(len(description_dataset)):
    length_description_dataset = description_dataset[i]['claims_length']
    if i== 0:
        min = length_description_dataset
        max = length_description_dataset
    else:
        if length_description_dataset <= min:
            min = length_description_dataset
        
        if length_description_dataset >= max:
            max = length_description_dataset
print(f"min length in dataset: {min} words\nmax length in dataset: {max}")

min length in dataset: 27 words
max length in dataset: 9265


Split the long description into small chunks

In [12]:
# def spilt_into_smaller_descriptions(examples):
#     res = []
#     index = 0
#     num_words_per_chunk = 4500
#     total_chunks = examples["claims"].split()
#     total_len = examples["claims_length"]
#     while index < total_len:
#         chunk = ' '.join(total_chunks[index: index+num_words_per_chunk]) 
#                         # the elem with index = index + num_words_per_chunk 
#                         # is excluded
#         res.append(chunk)
#         index = index + num_words_per_chunk
#     last_chunk = ' '.join(total_chunks[index - num_words_per_chunk: total_len])
#     res.append(last_chunk)
#     return {
#         "claims": res
#     }

In [13]:
# 1. For demonstration purpose:
#    extract only a small fraction of the database.
# 2. For demonstrating the ability to add new data to later the embedding database for similar search:
#    two samples are used. First sample = old data, New sample = new data
all_samples = []
all_samples.append(description_dataset.select(range(20))) # extract row from 0th to 19th
all_samples.append(description_dataset.select(range(20, 40)))

In [14]:
# all_samples[1][0]['title'], all_samples[0][0]['title'] 

In [15]:
# all_samples_split = []
# for idx, elem in enumerate(all_samples):
#     all_samples_split.append(elem.map(spilt_into_smaller_descriptions))

In [16]:
# all_samples_split[1][0]

convert to dataframe to use `explode`

In [17]:
# df_all_samples_split = []
# for idx, elem in enumerate(all_samples_split):
#     elem.set_format("pandas")
#     df_elem = elem[:] # convert to dataframe
#     df_all_samples_split.append(df_elem.explode("claims", ignore_index=True)) # explode


In [18]:
# df_all_samples_split[0].head(4)

Convert the dataframe back to dataset 

In [19]:
# for idx, elem in enumerate(df_all_samples_split):
#     all_samples[idx] =  Dataset.from_pandas(elem)
# print(all_samples)

In [20]:
# all_samples[0][0]['title'],all_samples[1][0]['title'] 

# Creating text embeddings

In [21]:
# import torch
# x = torch.rand(5, 3)
# print(x)

In [22]:
from transformers import AutoTokenizer, AutoModel
# Because Arnab did not save the tokenizer of the fine-tune model,
# so we should use the tokenizer downloaded from Hugging face of the model Arnab used in his training.
# NOTE: actually, we could combine the model with tokenizer of other model such as:
# token_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1".
# However, not so sure there will be mismatching between the tokenizer id of other model with the used model?
# and how it affects the output?
# But using Arnab fine-tuned model gives much better relevancy test, 
# than using random model like "sentence-transformers/multi-qa-mpnet-base-dot-v1"
token_ckpt = "sadickam/sdg-classification-bert"
model_ckpt = "../current_batch" 
             # this is possible because Arnab saved the model using Huggingface trainer.save_model()
tokenizer = AutoTokenizer.from_pretrained(token_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [23]:
import torch

device = torch.device("cpu")
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

 > As we mentioned earlier, we’d **like to represent each entry in our GitHub issues corpus as a single vector**, so we **need to “pool” or average our token embeddings** in some way. One popular approach is to **perform CLS pooling on our model’s outputs**, where we **simply collect the last hidden state for the special [CLS] token**. The following function does the trick for us:

In [24]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

> Next, we’ll create a helper function that will tokenize a list of documents, place the tensors on the GPU, feed them to the model, and finally apply CLS pooling to the outputs:

In [None]:
def get_embeddings(text_list, imp_model, imp_tokenizer):
    # encoded_input = tokenizer(
    #     text_list, padding=True, truncation=True, return_tensors="pt"
    # )
    encoded_input = imp_tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = imp_model(**encoded_input)
    return cls_pooling(model_output)

In [26]:
embedding = get_embeddings(all_samples[0]["claims"][0])
embedding.shape

torch.Size([1, 768])

In [27]:
embedding_all_samples = []
for idx, elem in enumerate(all_samples):
    embedding_all_samples.append( elem.map(
                                    lambda x: {"embeddings": get_embeddings(x["claims"]).detach().cpu().numpy()[0]}
                                    )
                                )   

Map: 100%|██████████| 20/20 [00:04<00:00,  4.04 examples/s]
Map: 100%|██████████| 20/20 [00:05<00:00,  3.84 examples/s]


In [28]:
# for demonstration, assume we only have the first sample being tracked by FAISS:
embedding_all_samples[0].add_faiss_index(column="embeddings")

100%|██████████| 1/1 [00:00<?, ?it/s]


Dataset({
    features: ['title', 'claims', 'ipc', 'claims_length', 'embeddings'],
    num_rows: 20
})

In [29]:
# from the embedded samples, get some idea for testing questions:
embedding_all_samples[0]['claims'][0], embedding_all_samples[0]['title'][0]

("A method of assaying nucleic acids in a sample, comprising the steps of: a) adding multiple sets of probes into the sample to form a mixture, each set of probes comprising: i. a first probe having a first portion at least partially complementary to a first region of a target nucleic acid in the sample and a second portion forming a first primer binding site;ii. a second probe having a first portion at least partially complementary to a second region of the target nucleic acid in the sample and a second portion forming a second primer binding site, wherein the 5' end of the first probe is adjacent to the 3' end of the second probe when both probes are hybridized to the target nucleic acid;b) denaturing nucleic acids in the mixture;c) hybridizing the set of probes to the complementary regions of the target nucleic acid;d) performing a ligation reaction with a ligase enzyme on the set of hybridized probes to connect the adjacent 5' end of the first probe and the 3' end of the second pro

In [30]:
# question = "How can I load a dataset offline?"
question = "How to test nucleic acids in a sample"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [31]:
scores, samples = embedding_all_samples[0].get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [32]:
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [33]:
for _, row in samples_df.iterrows():
    print(f"TITLE: {row.title}")
    print(f"SCORE: {row.scores}")
    print(f"CLAIMS: {row.claims}")
    print("=" * 50)
    print()

TITLE: POLYMER-BIOACTIVE AGENT CONJUGATES
SCORE: 338.6058349609375
CLAIMS: A bioerodible polymer - bioactive agent conjugate comprising as part of its polymer backbone a moiety of general formula (I):  wherein: A and B, which may be the same or different, represent the remainder of the polymer backbone and are (i) attached to the -O-R(ZD)-O- moiety as shown in formula (I) via a bioerodible moiety, and (ii) comprise a poly(urethane-ester) having monomeric units coupled via bioerodible urethane or ester moieties;R represents a linear or branched optionally substituted hydrocarbon;Z is a linking group; andD is a releasable bioactive agent.The bioerodible polymer - bioactive agent conjugate according to claim 1, wherein A and B each comprise a copolymer of polyurethane and polyester.The bioerodible polymer - bioactive agent conjugate according to claim 1 or claim 2, which comprises less than 25 mol% of polymerised residues that are derived from a C2 diol, relative to the total number of mo

# Save and reload FAISS database

**references:**

[1] https://huggingface.co/docs/datasets/v1.2.0/faiss_and_ea.html

[2] https://discuss.huggingface.co/t/save-and-load-datasets/9260

## Save

IMPORTANT:

[1] must save the dataset which contains the corresponding computed embeddings

In [34]:
for idx, elem in enumerate(embedding_all_samples):
    saved_path = '../backend-API/data_embeddings/epo_dataset' + str(idx)
    print(saved_path)
    try:
        elem.save_to_disk(saved_path)
    except ValueError:
        elem.drop_index('embeddings')
        elem.save_to_disk(saved_path)
# embedding_all_samples[0].save_to_disk('./data_embeddings/epo_dataset0')

../backend-API/data_embeddings/epo_dataset0


Saving the dataset (1/1 shards): 100%|██████████| 20/20 [00:00<?, ? examples/s]


../backend-API/data_embeddings/epo_dataset1


Saving the dataset (1/1 shards): 100%|██████████| 20/20 [00:00<?, ? examples/s]


# Searching in multiple databse using FAISS index

[1] https://huggingface.co/learn/cookbook/en/semantic_cache_chroma_vector_database

[2] https://www.pinecone.io/learn/series/faiss/faiss-tutorial/

In [55]:
# check how many data files 
# ref: https://pythonclcoding.medium.com/count-files-and-folders-using-python-93e0f8dc9337#:~:text=dirs%20%2B%3D%20len(dirnames)%20%3A,directory%20to%20the%20files%20counter.
PATH = 'C:/Users/20245580/Documents/Others/EPO2025/EPO-CodeFest-2025/backend-API/data_embeddings'
files, dir_names = 0, []
for root, dirnames, filenames in os.walk(PATH):
    print(f"looking into: {root}")
    if dirnames != []:
        for idx, elem in enumerate(dirnames):
            dir_names.append(elem)
    print(dirnames)
    files += len(filenames)

print(f"files: {files}")
print(f"directories: {dir_names}")


looking into: C:/Users/20245580/Documents/Others/EPO2025/EPO-CodeFest-2025/backend-API/data_embeddings
['epo_dataset0', 'epo_dataset1']
looking into: C:/Users/20245580/Documents/Others/EPO2025/EPO-CodeFest-2025/backend-API/data_embeddings\epo_dataset0
[]
looking into: C:/Users/20245580/Documents/Others/EPO2025/EPO-CodeFest-2025/backend-API/data_embeddings\epo_dataset1
[]
files: 6
directories: ['epo_dataset0', 'epo_dataset1']


In [57]:
from datasets import load_from_disk
load_all_dataset = []
for idx, names in enumerate(dir_names):
    dataset_name = PATH + '/'+ names
    loaded_dataset = load_from_disk(dataset_name)
    load_all_dataset.append(loaded_dataset)
print( load_all_dataset[0],
     len(load_all_dataset)
    )

Dataset({
    features: ['title', 'claims', 'ipc', 'claims_length', 'embeddings'],
    num_rows: 20
}) 2


In [37]:
print(
    load_all_dataset[1],
    load_all_dataset[1]['title'][0],
    load_all_dataset[0]['title'][0]
)

Dataset({
    features: ['title', 'claims', 'ipc', 'claims_length', 'embeddings'],
    num_rows: 20
}) INHALATION PARTICLES: method of preparation METHOD FOR MULTIPLEX NUCLEIC ACID ANALYSIS


In [87]:
from datasets import concatenate_datasets
all_dataset = concatenate_datasets(load_all_dataset)

In [88]:
all_dataset

Dataset({
    features: ['title', 'claims', 'ipc', 'claims_length', 'embeddings'],
    num_rows: 40
})

## FAISS 

In [89]:
import numpy as np
import faiss

In [103]:
# get the stored embeddings 
test_emds = np.array(all_dataset['embeddings'])
print(test_emds.shape)

# add faiss index
index_faiss = faiss.IndexFlatL2(test_emds.shape[1]) 
print(index_faiss.is_trained)
index_faiss.add(test_emds)                  # add vectors to the index
print(index_faiss.ntotal)

(40, 768)
True
40


In [104]:
question = "How to test nuclear acids in a sample?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [105]:
k = 5
[distance_res], [index_res] = index_faiss.search(question_embedding, k)
print(index_res)
print(distance_res)

[ 2  8 28 16 18]
[177.28308 183.54887 212.45772 240.05504 245.35034]


In [106]:
for i, d in zip(index_res, distance_res):
    # print(i, d)
    print(f"TITLE: {all_dataset['title'][i]}")
    print(f"DISTANCE: {d}")
    print(f"CLAIMS: {all_dataset['claims'][i]}")
    print("=" * 50)
    print()

TITLE: EMULSIFYING DISPERSANTS, METHOD FOR EMUSIFICATION AND DISPERSION WITH THE SAME, EMULSIONS, AND EMULSION FUELS
DISTANCE: 177.2830810546875
CLAIMS: An emulsification dispersant, wherein the main component is vesicles that are formed from amphiphilic substances capable of forming vesicles spontaneously and that adhere onto the surface of an oil based material, and wherein the average particle size of said vesicles is 8 nm to 500 nm when the emulsion is being formed, and 200 nm to 800 nm when the dispersant is being conditioned within a concentration range of 5 to 20 wt.% in the dispersion, wherein said amphiphilic substance is selected from a group comprising a. derivatives with an average number of 5 to 15 added ethylene oxide molecules (E), selected from among polyoxyethylene hydrogenated castor oil derivatives represented by the following general formula (Formula 1): b. halides of dialkylammonium derivatives, trialkylammonium derivatives, tetraalkylammonium derivatives, dialkeny

## HUGGING FACE FAISS

In [107]:
all_dataset.add_faiss_index(column="embeddings")

100%|██████████| 1/1 [00:00<00:00, 179.94it/s]


Dataset({
    features: ['title', 'claims', 'ipc', 'claims_length', 'embeddings'],
    num_rows: 40
})

In [108]:
# question = "Is solar pannel friendly to the environment"
# question_embedding = get_embeddings([question]).cpu().detach().numpy()
# question_embedding.shape

In [109]:

scores, samples = all_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=k
)
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

for _, row in samples_df.iterrows():
    print(f"TITLE: {row.title}")
    print(f"SCORE: {row.scores}")
    print(f"DESCRIPTION: {row.claims}")
    print("=" * 50)
    print()


TITLE: POLYMER-BIOACTIVE AGENT CONJUGATES
SCORE: 245.350341796875
DESCRIPTION: A bioerodible polymer - bioactive agent conjugate comprising as part of its polymer backbone a moiety of general formula (I):  wherein: A and B, which may be the same or different, represent the remainder of the polymer backbone and are (i) attached to the -O-R(ZD)-O- moiety as shown in formula (I) via a bioerodible moiety, and (ii) comprise a poly(urethane-ester) having monomeric units coupled via bioerodible urethane or ester moieties;R represents a linear or branched optionally substituted hydrocarbon;Z is a linking group; andD is a releasable bioactive agent.The bioerodible polymer - bioactive agent conjugate according to claim 1, wherein A and B each comprise a copolymer of polyurethane and polyester.The bioerodible polymer - bioactive agent conjugate according to claim 1 or claim 2, which comprises less than 25 mol% of polymerised residues that are derived from a C2 diol, relative to the total number o