# Semantic search with FAISS (PyTorch)

# Libs

In [6]:
# ! pip install datasets evaluate transformers[sentencepiece]

# reference:
# [1] https://stackoverflow.com/questions/58957169/faiss-error-could-not-find-a-version-that-satisfies-the-requirement-faiss-from/58957380
# [1] Self-summary: 
#   1.1 Python version too high (for example: 3.13 has problem with installing faiss)
#   1.2 must state the cuda version explicitly while installing faiss -> (after install torch) check 'nvidia-...' version in  `conda list > requirement.txt` 
# ! pip install faiss-gpu-cu12 #
# ! pip install faiss-cpu 

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp312-cp312-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [1]:
import pandas as pd

# Data processing

In [7]:
df= pd.read_csv("/home/lephuonglantran/EPO2024/df_combine.csv")
print(f"row counts in df: {len(df)}")

row counts in df: 1848


In [3]:
# df["claims"][0], df["title"][0]

In [8]:
# convert panda data frame to dataset
from datasets import Dataset

In [5]:
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['title', 'description', 'claims', 'ipc'],
    num_rows: 1848
})

In [9]:
# remove column description
columns = dataset.column_names
columns_to_keep = ["title", "claims", "ipc"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
dataset = dataset.remove_columns(columns_to_remove)
dataset


Dataset({
    features: ['title', 'claims', 'ipc'],
    num_rows: 1848
})

> Now that we **have one comment per row**, let’s **create a new claims_length column** that **contains the number of words per claims**:  

In [12]:
description_dataset = dataset.map(
    lambda x: {"claims_length": len(x["claims"].split())}
)

Map: 100%|██████████| 1848/1848 [00:00<00:00, 8293.42 examples/s]


> We can **use this new column to filter out short comments**, which typically **include things like “cc @lewtun” or “Thanks!” that are not relevant** for our search engine. There’s **no precise number to select for the filter**, **but around 15 words** seems like a good start:  

In [13]:
description_dataset = description_dataset.filter(lambda x: x["claims_length"] > 15)
description_dataset

Filter: 100%|██████████| 1848/1848 [00:00<00:00, 84820.58 examples/s]


Dataset({
    features: ['title', 'claims', 'ipc', 'claims_length'],
    num_rows: 1847
})

In [24]:
# description_dataset[0]['claims_length']

In [26]:
max = 0
for i in range(len(description_dataset)):
    length_description_dataset = description_dataset[i]['claims_length']
    if i== 0:
        min = length_description_dataset
        max = length_description_dataset
    else:
        if length_description_dataset <= min:
            min = length_description_dataset
        
        if length_description_dataset >= max:
            max = length_description_dataset
print(f"min length in dataset: {min} words\nmax length in dataset: {max}")

min length in dataset: 27 words
max length in dataset: 9265


Split the long description into small chunks

In [27]:
# def spilt_into_smaller_descriptions(examples):
#     res = []
#     index = 0
#     num_words_per_chunk = 4500
#     total_chunks = examples["claims"].split()
#     total_len = examples["claims_length"]
#     while index < total_len:
#         chunk = ' '.join(total_chunks[index: index+num_words_per_chunk]) 
#                         # the elem with index = index + num_words_per_chunk 
#                         # is excluded
#         res.append(chunk)
#         index = index + num_words_per_chunk
#     last_chunk = ' '.join(total_chunks[index - num_words_per_chunk: total_len])
#     res.append(last_chunk)
#     return {
#         "claims": res
#     }

In [65]:
# 1. For demonstration purpose:
#    extract only a small fraction of the database.
# 2. For demonstrating the ability to add new data to later the embedding database for similar search:
#    two samples are used. First sample = old data, New sample = new data
all_samples = []
all_samples.append(description_dataset.select(range(20))) # extract row from 0th to 19th
all_samples.append(description_dataset.select(range(20, 40)))

In [66]:
# all_samples[1][0]['title'], all_samples[0][0]['title'] 

In [67]:
# all_samples_split = []
# for idx, elem in enumerate(all_samples):
#     all_samples_split.append(elem.map(spilt_into_smaller_descriptions))

In [68]:
# all_samples_split[1][0]

convert to dataframe to use `explode`

In [69]:
# df_all_samples_split = []
# for idx, elem in enumerate(all_samples_split):
#     elem.set_format("pandas")
#     df_elem = elem[:] # convert to dataframe
#     df_all_samples_split.append(df_elem.explode("claims", ignore_index=True)) # explode


In [70]:
# df_all_samples_split[0].head(4)

Convert the dataframe back to dataset 

In [71]:
# for idx, elem in enumerate(df_all_samples_split):
#     all_samples[idx] =  Dataset.from_pandas(elem)
# print(all_samples)

In [72]:
# all_samples[0][0]['title'],all_samples[1][0]['title'] 

# Creating text embeddings

In [73]:
from transformers import AutoTokenizer, AutoModel
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [74]:
import torch

device = torch.device("cpu")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

 > As we mentioned earlier, we’d **like to represent each entry in our GitHub issues corpus as a single vector**, so we **need to “pool” or average our token embeddings** in some way. One popular approach is to **perform CLS pooling on our model’s outputs**, where we **simply collect the last hidden state for the special [CLS] token**. The following function does the trick for us:

In [75]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

> Next, we’ll create a helper function that will tokenize a list of documents, place the tensors on the GPU, feed them to the model, and finally apply CLS pooling to the outputs:

In [76]:
def get_embeddings(text_list):
    # encoded_input = tokenizer(
    #     text_list, padding=True, truncation=True, return_tensors="pt"
    # )
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [77]:
embedding = get_embeddings(all_samples[0]["claims"][0])
embedding.shape

torch.Size([1, 768])

In [138]:
embeddings_dataset = sm_description_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["description"]).detach().cpu().numpy()[0]}
)

Map: 100%|██████████| 826/826 [00:21<00:00, 39.11 examples/s]


In [87]:
embeddings_dataset.add_faiss_index(column="embeddings")

100%|██████████| 1/1 [00:00<00:00, 325.11it/s]


Dataset({
    features: ['title', 'description', 'ipc', 'description_length', 'embeddings'],
    num_rows: 843
})

In [88]:
# question = "How can I load a dataset offline?"
question = "How to test nucleic acids in a sample"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [89]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

# Loaded version
# scores, samples = load_dataset.get_nearest_examples(
#     "embeddings", question_embedding, k=5
# )

In [90]:
# samples

In [91]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [92]:
# samples_df

In [93]:
for _, row in samples_df.iterrows():
    print(f"TITLE: {row.title}")
    print(f"SCORE: {row.scores}")
    print(f"DESCRIPTION: {row.description}")
    print("=" * 50)
    print()

TITLE: POLYMER-BIOACTIVE AGENT CONJUGATES
SCORE: 34.82658767700195
DESCRIPTION: (s, 1H, H-2); 8.07-7.97 (m, 1H, H-5); 7.41-7.30 (m, 1H, H-8); 3.57-3.46 (m, CH of cyclopropane ring); 3.41-3.24 (m, 4H, 2xCH2 of piperazine); 2.74-2.58 (m, 2xCH2 of piperazine); 2.44-2.33 (m, 2H, CH2N); 1.65-1.45 (m, 2H, CH2); 1.45-1.28 (m, 2H, CH2 of cyclopropane ring); 1.28-1.12 (m, 2H, CH2 of cyclopropane ring); 0.94 (t, 3H, CH3) MS (CH2Cl2) 374 [M+ 1] 747 [2M+1]b) (2,2-dimethyl-1,3-dioxolan-4-yl)methyl 1-cyclopropyl-6-fluoro-4-oxo-7-(4-propylpiperazin-1-yl)-1,4-dihydroquinoline-3-carboxylate1-cyclopropyl-6-fluoro-4-oxo-7-(4-propylpiperazin-1-yl)-1,4-dihydroquinoline-3-carboxylic acid (8.86 g, 23.70 mmol) was dissolved in anhydrous dichloromethane (370 ml) under argon. 2,2-dimethyl-1,3-dioxolane-4-methanol (4.71 g, 35.60 mmol), triethylamine (9.59 g, 94.80 mmol) and HBTU (9.90 g, 26.10 mmol) were added and the reaction mixture was stirred at room temperature for three days (exclusion of light). The react

# Save and reload FAISS database

**references:**

[1] https://huggingface.co/docs/datasets/v1.2.0/faiss_and_ea.html

[2] https://discuss.huggingface.co/t/save-and-load-datasets/9260

## Save

IMPORTANT:

[1] must save the dataset which contains the corresponding computed embeddings

In [139]:
embeddings_dataset.save_to_disk('./data_embeddings/epo_dataset2')

Saving the dataset (1/1 shards): 100%|██████████| 826/826 [00:00<00:00, 54299.88 examples/s]


In [96]:
# ds_with_embeddings.save_faiss_index('embeddings', 'my_index.faiss')
embeddings_dataset.save_faiss_index('embeddings', './data_embeddings/epo_index1.faiss')

## Load

In [7]:
# ds = load_dataset('crime_and_punish', split='train[:100]')
# ds.load_faiss_index('embeddings', 'my_index.faiss')
from datasets import load_from_disk
load_dataset = load_from_disk('./data_embeddings/epo_dataset')

In [8]:
load_dataset.load_faiss_index('embeddings', './data_embeddings/epo_index.faiss')

# Searching in multiple databse using FAISS index

[1] https://huggingface.co/learn/cookbook/en/semantic_cache_chroma_vector_database

[2] https://www.pinecone.io/learn/series/faiss/faiss-tutorial/

In [140]:
from datasets import load_from_disk
load_all_dataset = []
for i in range(2):
    dataset_name = './data_embeddings/epo_dataset' + str(i+1)
    loaded_dataset = load_from_disk(dataset_name)
    # dataset_faiss_name = './data_embeddings/epo_index' + str(i+1) + '.faiss'
    # loaded_dataset.load_faiss_index('embeddings', dataset_faiss_name)
    load_all_dataset.append(loaded_dataset)
print(len(load_all_dataset))

2


In [141]:
print(
    load_all_dataset[1],
    load_all_dataset[0]['title'][0]
)

Dataset({
    features: ['title', 'description', 'ipc', 'description_length', 'embeddings'],
    num_rows: 826
}) METHOD FOR MULTIPLEX NUCLEIC ACID ANALYSIS


In [142]:
from datasets import concatenate_datasets
all_dataset = concatenate_datasets(load_all_dataset)

In [143]:
all_dataset

Dataset({
    features: ['title', 'description', 'ipc', 'description_length', 'embeddings'],
    num_rows: 1669
})

In [144]:
all_dataset.add_faiss_index(column="embeddings")

100%|██████████| 2/2 [00:00<00:00, 249.36it/s]


Dataset({
    features: ['title', 'description', 'ipc', 'description_length', 'embeddings'],
    num_rows: 1669
})

In [145]:
question = "How to test nucleic acids in a sample"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [148]:

scores, samples = all_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=10
)
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

for _, row in samples_df.iterrows():
    print(f"TITLE: {row.title}")
    print(f"SCORE: {row.scores}")
    print(f"DESCRIPTION: {row.description}")
    print("=" * 50)
    print()


TITLE: METHOD FOR MULTIPLEX NUCLEIC ACID ANALYSIS
SCORE: 35.79100799560547
DESCRIPTION: DNA <213> Artificial<220> <223> synthetic oligonucleotide primers or probes<400> 886 tgctaactag atcgcgggtt gattgacaga acacatagcc tgggcaaat 49<210> 887 <211> 45 <212> DNA <213> Artificial<220> <223> synthetic oligonucleotide primers or probes<400> 887 agacaaatag cagcggtggt gccaaaatca cccgctctag ggaag 45<210> 888 <211> 42 <212> DNA <213> Artificial<220> <223> synthetic oligonucleotide primers or probes<400> 888 tattcgctca taacgggttc gccaagaggg aaggcaggca ga 42<210> 889 <211> 51 <212> DNA <213> Artificial<220> <223> synthetic oligonucleotide primers or probes<400> 889 atgtcctact ccaccagttc ccagcattac agtccgttag cccgatggta a 51<210> 890 <211> 42 <212> DNA <213> Artificial<220> <223> synthetic oligonucleotide primers or probes<400> 890 tattcgctca taacgggttc gaggaacaca ctgactccgc cc 42<210> 891 <211> 60 <212> DNA <213> Artificial<220> <223> synthetic oligonucleotide primers or probes<400> 891 gttattcctg a