# Semantic search with FAISS (PyTorch)

# Libs

In [1]:
# ! pip install datasets evaluate transformers[sentencepiece]

# reference:
# [1] https://stackoverflow.com/questions/58957169/faiss-error-could-not-find-a-version-that-satisfies-the-requirement-faiss-from/58957380
# [1] Self-summary: 
#   1.1 Python version too high (for example: 3.13 has problem with installing faiss)
#   1.2 must state the cuda version explicitly while installing faiss -> (after install torch) check 'nvidia-...' version in  `conda list > requirement.txt` 
# ! pip install faiss-gpu-cu12 # (NO NEED TO INSTALL THIS, WE DO NOT USE GPU)
# ! pip install faiss-cpu 
# ! pip install torch

In [2]:
# reference: https://stackoverflow.com/questions/20554074/sklearn-omp-error-15-initializing-libiomp5md-dll-but-found-mk2iomp5md-dll-a
# for solving crashing problem: 22:48:44.157 [error] Disposing session as kernel process died ExitCode: 3, 
# Reason: OMP: Error #15: Initializing libomp140.x86_64.dll, but found libiomp5md.dll already initialized
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [3]:
import pandas as pd

# Data processing

In [4]:
df= pd.read_csv("C:/Users/20245580/Documents/Others/EPO2025/epo_data/Full_dataset.csv")
print(f"row counts in df: {len(df)}")

row counts in df: 24901


In [7]:
df.head()

Unnamed: 0,publication_number,publication_kind,publication_date,ipc,cpc,title_en,claims,abstract_text,description_text,prior_art,...,pct_publication_number,designated_states_contracting,designated_states_extension,designated_states_validation,applicant,inventor,sdg_number,analysis_explanation,ipc_tech_field,ipc_technologies
0,6,B1,19810204,C02F3/04,[],Process for nitrification of waste water,process for nitrifying oxidizable nitrogen com...,,The invention relates to a process for nitrifi...,[],...,,"['BE', 'DE', 'FR', 'GB', 'NL', 'SE']",[],[],[],"[{'name': 'Bakker, Gerhard', 'country': 'NL'},...",6,"– Goal 6: The patent explicitly describes a ""p...",,
1,41,B1,19810429,A61M25/00,[],An intravascular catheter,an intravascular catheter comprising a hub 2 h...,,This invention relates to an intravascular cat...,[],...,,"['BE', 'DE', 'FR', 'GB', 'SE']",[],[],[],"[{'name': 'Sagae, Kyuta', 'country': 'JP'}, {'...",3,"– Goal 3: The patent describes ""an intravascul...",Human Necessities,Other Human Necessities
2,85,B1,19801029,"C01B7/13, C01B17/90, C01B3/06, C01B13/02",[],Process for the production and separation of h...,a process for the production and separation of...,,This invention relates to a process for the pr...,[],...,,"['BE', 'DE', 'FR', 'GB', 'LU', 'NL']",[],[],[],"[{'name': 'De Beni, Giancarlo', 'country': 'IT...",7,"– Goal 7: The patent describes ""the use of the...",Chemistry & Metallurgy,Chemical Processes
3,196,B1,19820421,B65D5/02,[],A corrugated fibreboard box,a corrugated fibreboard shipping box for packi...,,This invention relates to a corrugated fibrebo...,[],...,,"['DE', 'FR', 'GB']",[],[],[],"[{'name': 'Ken Kohayakawa', 'country': 'JP'}, ...",12,"– Goal 12: The patent describes a ""corrugated ...",Performing Operations & Transport,Packaging & Containers
4,197,B1,19811111,"H01M10/36, H01M4/58, C01G1/12",[],Rechargeable non-aqueous cell with a chalcogen...,"a nonaqueous secondary cell, comprising a nega...",,Background of the Invention1. Field of the Inv...,[],...,,"['BE', 'DE', 'FR', 'GB', 'NL', 'SE']",[],[],[],"[{'name': 'DiSalvo, Francis Joseph', 'country'...",7,"– Goal 7: The patent describes a ""nonaqueous s...",Chemistry & Metallurgy; Electricity,Electrical Power; Other Chemistry & Metallurgy


In [14]:
df["claims"][2490]

'a bone anchor 116, comprisingan interconnection portion 172 to slidingly engage a spacer 112a fixation portion 184 spaced apart from the interconnection portion 172 to fix the anchor 116 to a bone anda leg 196, 198 connecting the fixation portion 184 to the interconnection portion 172the bone anchor 116 being characterized in that said fixation portion 184 is substantially circular in a view from a trailing end 170 of the anchor 116.a system for spinal fusion, comprisingan intervertebral spacer 112 sized and shaped to at least partially fill an intervertebral disc space 6 between first and second adjacent vertebrae andthe bone anchor 116 of claim 1,wherein the anchor 116 is sized and shaped to secure the spacer 112 to the first vertebra so that the spacer 112 and the first vertebra are substantially relatively immobilized against antagonistic spinal motions.the bone anchor 116 of claim 1, wherein the interconnection portion 172 includes a locking feature 178 proximate a trailing end 1

In [29]:
df["publication_number"].is_unique

True

In [5]:
# convert panda data frame to dataset
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['publication_number', 'publication_kind', 'publication_date', 'ipc', 'cpc', 'title_en', 'claims', 'abstract_text', 'description_text', 'prior_art', 'reference', 'parent', 'pct_publication_number', 'designated_states_contracting', 'designated_states_extension', 'designated_states_validation', 'applicant', 'inventor', 'sdg_number', 'analysis_explanation', 'ipc_tech_field', 'ipc_technologies'],
    num_rows: 24901
})

In [None]:
# remove column => table `main`
columns = dataset.column_names
columns_to_keep = ['publication_number', 'title_en', 'claims', 'inventor']
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
dataset_main = dataset.remove_columns(columns_to_remove)
dataset_main


Dataset({
    features: ['publication_number', 'title_en', 'claims', 'inventor'],
    num_rows: 24901
})

In [33]:
# remove column => table `sdg_pub`
columns = dataset.column_names
columns_to_keep = ['publication_number', 'sdg_number']
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
dataset_sdg_pub = dataset.remove_columns(columns_to_remove)
dataset_sdg_pub

Dataset({
    features: ['publication_number', 'sdg_number'],
    num_rows: 24901
})

In [34]:
# remove column => table `ipc_pub`
columns = dataset.column_names
columns_to_keep = ['publication_number', 'ipc']
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
dataset_ipc_pub = dataset.remove_columns(columns_to_remove)
dataset_ipc_pub

Dataset({
    features: ['publication_number', 'ipc'],
    num_rows: 24901
})

> Now that we **have one comment per row**, let’s **create a new claims_length column** that **contains the number of words per claims**:  

In [25]:
description_dataset = dataset.map(
    lambda x: {"claims_length": len(x["claims"].split())}
)

Map: 100%|██████████| 24901/24901 [00:02<00:00, 9166.42 examples/s] 


> We can **use this new column to filter out short comments**, which typically **include things like “cc @lewtun” or “Thanks!” that are not relevant** for our search engine. There’s **no precise number to select for the filter**, **but around 15 words** seems like a good start:  

In [26]:
description_dataset = description_dataset.filter(lambda x: x["claims_length"] > 10)
description_dataset

Filter:   0%|          | 0/24901 [00:00<?, ? examples/s]

Filter: 100%|██████████| 24901/24901 [00:00<00:00, 117718.54 examples/s]


Dataset({
    features: ['publication_number', 'ipc', 'title_en', 'claims', 'claims_length'],
    num_rows: 24900
})

In [10]:
# description_dataset[0]['claims_length']

In [27]:
max = 0
for i in range(len(description_dataset)):
    length_description_dataset = description_dataset[i]['claims_length']
    if i== 0:
        min = length_description_dataset
        max = length_description_dataset
    else:
        if length_description_dataset <= min:
            min = length_description_dataset
        
        if length_description_dataset >= max:
            max = length_description_dataset
print(f"min length in dataset: {min} words\nmax length in dataset: {max}")

min length in dataset: 16 words
max length in dataset: 8538


Split the long description into small chunks

In [16]:
def spilt_into_smaller_chunks(examples):
    res = []
    index = 0
    num_words_per_chunk = 100
    total_chunks = examples["claims"].split()
    total_len = examples["claims_length"]
    while index < total_len:
        chunk = ' '.join(total_chunks[index: index+num_words_per_chunk]) 
                        # the elem with index = index + num_words_per_chunk 
                        # is excluded
        res.append(chunk)
        index = index + num_words_per_chunk
    last_chunk = ' '.join(total_chunks[index - num_words_per_chunk: total_len])
    res.append(last_chunk)
    return {
        "claims": res
    }

In [10]:
print(len(dataset))

1848


In [13]:
# 1. For demonstration purpose:
#    extract only a small fraction of the database.
# 2. For demonstrating the ability to add new data to later the embedding database for similar search:
#    two samples are used. First sample = old data, New sample = new data
all_samples = []
all_samples.append(description_dataset.select(range(0, 400))) # extract row from 0th to 19th
all_samples.append(description_dataset.select(range(400, 800)))
all_samples.append(description_dataset.select(range(800, 1200)))
all_samples.append(description_dataset.select(range(1200, len(dataset)-1)))

In [14]:
# all_samples[1][0]['title'], all_samples[0][0]['title'] 

In [17]:
all_samples_split = []
for idx, elem in enumerate(all_samples):
    all_samples_split.append(elem.map(spilt_into_smaller_chunks))

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map: 100%|██████████| 400/400 [00:00<00:00, 5085.38 examples/s]
Map: 100%|██████████| 400/400 [00:00<00:00, 7728.16 examples/s]
Map: 100%|██████████| 400/400 [00:00<00:00, 7855.94 examples/s]
Map: 100%|██████████| 647/647 [00:00<00:00, 7796.37 examples/s]


In [16]:
# all_samples_split[1][0]

convert to dataframe to use `explode`

In [18]:
df_all_samples_split = []
for idx, elem in enumerate(all_samples_split):
    elem.set_format("pandas")
    df_elem = elem[:] # convert to dataframe
    df_all_samples_split.append(df_elem.explode("claims", ignore_index=True)) # explode


In [19]:
df_all_samples_split[0].head(4)

Unnamed: 0,title,claims,ipc,claims_length
0,METHOD FOR MULTIPLEX NUCLEIC ACID ANALYSIS,A method of assaying nucleic acids in a sample...,C,1495
1,METHOD FOR MULTIPLEX NUCLEIC ACID ANALYSIS,5' end of the first probe is adjacent to the 3...,C,1495
2,METHOD FOR MULTIPLEX NUCLEIC ACID ANALYSIS,"obtain an amplification product, each set of p...",C,1495
3,METHOD FOR MULTIPLEX NUCLEIC ACID ANALYSIS,is labeled with a detectable moiety; and at le...,C,1495


Convert the dataframe back to dataset 

In [20]:
for idx, elem in enumerate(df_all_samples_split):
    all_samples[idx] =  Dataset.from_pandas(elem)
print(all_samples)

[Dataset({
    features: ['title', 'claims', 'ipc', 'claims_length'],
    num_rows: 3728
}), Dataset({
    features: ['title', 'claims', 'ipc', 'claims_length'],
    num_rows: 4207
}), Dataset({
    features: ['title', 'claims', 'ipc', 'claims_length'],
    num_rows: 3726
}), Dataset({
    features: ['title', 'claims', 'ipc', 'claims_length'],
    num_rows: 6081
})]


In [22]:
# all_samples[0][0]['title'],all_samples[1][0]['title'] 

# Creating text embeddings

In [21]:
# import torch
# x = torch.rand(5, 3)
# print(x)

In [22]:
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

# **reference**
# [1] what is the difference between AutoModel and AutoModelForCausalLM: https://www.reddit.com/r/huggingface/comments/1bv1kfk/what_is_the_difference_between/
#  => Summary
#       AutoModel: load the base model -> output is the model output embeddings
#       AutoModelForCausalLM: -> output is the model output embeddings + a head layer to convert the model output embeddings to a meaningful reply.

  from .autonotebook import tqdm as notebook_tqdm


## Normal tokenizer

In [None]:
# Because Arnab did not save the tokenizer of the fine-tune model,
# so we should use the tokenizer downloaded from Hugging face of the model Arnab used in his training.
# NOTE: actually, we could combine the model with tokenizer of other model such as:
# token_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1".
# However, not so sure there will be mismatching between the tokenizer id of other model with the used model?
# and how it affects the output?
# But using Arnab fine-tuned model gives much better relevancy test, 
# than using random model like "sentence-transformers/multi-qa-mpnet-base-dot-v1"
token_ckpt = "sadickam/sdg-classification-bert"
model_ckpt = "../current_batch"
             # this is possible because Arnab saved the model using Huggingface trainer.save_model()
tokenizer = AutoTokenizer.from_pretrained(token_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

## Mistral ai model

**REFERENCE**

[1] tokenizer for gguf model: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/, https://docs.vllm.ai/en/v0.6.2/quantization/gguf.html

[2] get output embbedings from AutoModelForLLM: https://stackoverflow.com/questions/76051807/automodelforcausallm-for-extracting-text-embeddings

[3] Load model, tokenzier from pre-downloaded files which are stored in .cache: https://huggingface.co/transformers/v3.0.2/model_doc/auto.html

[4] how to use mistral ai for text-generation: https://huggingface.co/mistralai/Mistral-7B-v0.3

[5] https://blog.steelph0enix.dev/posts/llama-cpp-guide/

In [49]:
## FAIL
# from llama_cpp.llama_tokenizer import LlamaHFTokenizer
# tokenizer=LlamaHFTokenizer.from_pretrained("MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF")
# from llama_cpp import Llama

# llm = Llama.from_pretrained(
# 	repo_id="MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF",
# 	filename="Mistral-7B-Instruct-v0.3.Q4_K_M.gguf",
#     n_ctx = 5000,
#     tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3"),
#     embedding=True
# )
# output_mistral_ai_tokenized = llm.create_embedding("what is this?")
# print(output_mistral_ai_tokenized)

### Embeddings from the normal mistral ai model

In [23]:
model_cache_path = "C:/Users/20245580/Documents/Others/EPO2025/EPO-CodeFest-2025/models/mistralai/Mistral-7B-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_cache_path)
model = AutoModel.from_pretrained(model_cache_path)

Loading checkpoint shards: 100%|██████████| 3/3 [01:04<00:00, 21.44s/it]


In [29]:
import torch

device = torch.device("cpu")
model.to(device)

MistralModel(
  (embed_tokens): Embedding(32768, 4096)
  (layers): ModuleList(
    (0-31): 32 x MistralDecoderLayer(
      (self_attn): MistralAttention(
        (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
      )
      (mlp): MistralMLP(
        (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
        (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
        (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
    )
  )
  (norm): MistralRMSNorm((4096,), eps=1e-05)
  (rotary_emb): MistralRotaryEmbedding()
)

In [30]:
encoded_input = tokenizer(
        "who am I?", return_tensors="pt"
    )
encoded_input = {k: v.to("cpu") for k, v in encoded_input.items()}
with torch.no_grad():
    model_output = model(**encoded_input)

In [33]:
print(model_output.last_hidden_state[:, 0][0][0:10], model_output.last_hidden_state[:, 0].shape )

tensor([-1.4323,  1.0071, -1.8032,  3.2069,  0.7589, -4.1136,  3.6168,  0.4659,
        -0.5501, -2.4445]) torch.Size([1, 4096])


### Embeddings from the gguf mistral ai model

In [None]:
# see reference Mistral ai [1]
from llama_cpp import Llama
import llama_cpp
# Load the GGUF model
model_pre_downloaded_path = "C:/Users/20245580/.cache/huggingface/hub/models--MaziyarPanahi--Mistral-7B-Instruct-v0.3-GGUF/snapshots/ce89f595755a4bf2e2e05d155cc43cb847c78978/Mistral-7B-Instruct-v0.3.Q4_K_M.gguf"
llm = Llama(model_path=model_pre_downloaded_path, embedding=True, pooling_type = llama_cpp.LLAMA_POOLING_TYPE_CLS)

llama_model_loader: loaded meta data with 29 key-value pairs and 291 tensors from C:/Users/20245580/.cache/huggingface/hub/models--MaziyarPanahi--Mistral-7B-Instruct-v0.3-GGUF/snapshots/ce89f595755a4bf2e2e05d155cc43cb847c78978/Mistral-7B-Instruct-v0.3.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = models--mistralai--Mistral-7B-Instruc...
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 32768
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              

In [2]:
embeddings = llm.create_embedding("who am I?")


llama_perf_context_print:        load time =    1342.36 ms
llama_perf_context_print: prompt eval time =    1329.60 ms /     5 tokens (  265.92 ms per token,     3.76 tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    1342.52 ms /     6 tokens


In [34]:
print(# type(embeddings),
      # embeddings.keys(),
      type(embeddings['data']),
      # embeddings['data'][0],
      type(embeddings['data'][0]),
      embeddings['data'][0].keys(),
      type(embeddings['data'][0]['embedding']),
      len(embeddings['data'][0]['embedding']),
      embeddings['data'][0]['embedding'][0:10]
      )

<class 'list'> <class 'dict'> dict_keys(['object', 'embedding', 'index']) <class 'list'> 4096 [-1.540399193763733, 1.707515001296997, -2.0029244422912598, 3.711146116256714, 1.1398707628250122, -4.969476699829102, 3.873652696609497, 1.419399380683899, 0.028787625953555107, -3.411803960800171]


In [None]:
# Generator of tokens from a prompt
# NOTICE: must use, the following
llm = Llama(model_path=model_pre_downloaded_path, n_ctx= 2048*4)

llama_model_loader: loaded meta data with 29 key-value pairs and 291 tensors from C:/Users/20245580/.cache/huggingface/hub/models--MaziyarPanahi--Mistral-7B-Instruct-v0.3-GGUF/snapshots/ce89f595755a4bf2e2e05d155cc43cb847c78978/Mistral-7B-Instruct-v0.3.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = models--mistralai--Mistral-7B-Instruc...
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 32768
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              

In [None]:
tokens = llm.tokenize(b"Who am I?")
for token in llm.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.0):
    print(llm.detokenize([token]))

llama_model_loader: loaded meta data with 29 key-value pairs and 291 tensors from C:/Users/20245580/.cache/huggingface/hub/models--MaziyarPanahi--Mistral-7B-Instruct-v0.3-GGUF/snapshots/ce89f595755a4bf2e2e05d155cc43cb847c78978/Mistral-7B-Instruct-v0.3.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = models--mistralai--Mistral-7B-Instruc...
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 32768
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              

llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,32768]   = ["<unk>", "<s>", "</s>", "[INST]", "[...
llama_model_loader: - kv  16:                      tokenizer.ggml.scores arr[f32,32768]   = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv  17:                  tokenizer.ggml.token_type arr[i32,32768]   = [2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 1
llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 2
llama_model_loader: - kv  20:            tokenizer.ggml.unknown_token_id u32              = 0
llama_model_loader: - kv  21:               tokenizer.ggml.add_bos_token bool             = true
llama_model_loader: - kv  22:               tokenizer.ggml.add_eos_token bool             = false
llama_model_loader: - kv  23:                    tokenizer.chat_template str              = {{ bos_token }}{% for message in

b' I'
b"'"
b'm'
b' an'
b' artist'
b' and'
b' a'
b' musician'
b'.'
b'\n'
b'\n'
b'I'
b"'"
b've'
b' been'
b' a'
b' working'
b' artist'
b' for'
b' over'
b' '
b'2'
b'0'
b' years'
b'.'
b' I'
b"'"
b've'
b' done'
b' graphic'
b' design'
b','
b' illustr'
b'ation'
b','
b' web'
b' design'
b','
b' and'
b' more'
b' for'
b' many'
b' clients'
b'.'
b' I'
b"'"
b'm'
b' also'
b' a'
b' musician'
b' with'
b' two'
b' albums'
b' out'
b' and'
b' a'
b' third'
b' in'
b' the'
b' works'
b'.'
b' I'
b"'"
b'm'
b' always'
b' looking'
b' for'
b' new'
b' projects'
b' and'
b' opportunities'
b' to'
b' collabor'
b'ate'
b'.'
b'\n'
b'\n'
b'I'
b"'"
b've'
b' been'
b' working'
b' in'
b' the'
b' creative'
b' industry'
b' for'
b' a'
b' long'
b' time'
b' and'
b' I'
b"'"
b've'
b' had'
b' the'
b' opportunity'
b' to'
b' work'
b' with'
b' a'
b' wide'
b' range'
b' of'
b' clients'
b'.'
b' Some'
b' of'
b' my'
b' projects'
b' have'
b' included'
b' designing'
b' log'
b'os'
b' for'
b' businesses'
b','
b' creating'
b' illustr'
b'ations'
b' f

llama_decode: failed to decode, ret = 1


b' Sound'
b' of'


RuntimeError: llama_decode returned 1

In [None]:
# Generator of tokens from a prompt
def generate_text(
    prompt="Who is the CEO of Apple?",
    max_tokens=512,
    temperature=1,
    top_p=0.95,
    echo=False,
    stop=["#"],
):
    output = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        echo=echo,
        stop=stop,
    )
    output_text = output["choices"][0]["text"].strip()
    return output_text


def generate_prompt_from_template(input):
    chat_prompt_template = f"""<|im_start|>system
                            You are a helpful chatbot.<|im_end|>
                            <|im_start|>user
                            {input}<|im_end|>"""
    return chat_prompt_template

In [None]:
# Generator of tokens from a prompt
context1 = "CLAIMS: A bioerodible polymer - bioactive agent conjugate comprising as part of its polymer backbone a moiety of general formula (I):  wherein: A and B, which may be the same or different, represent the remainder of the polymer backbone and are (i) attached to the -O-R(ZD)-O- moiety as shown in formula (I) via a bioerodible moiety, and (ii) comprise a poly(urethane-ester) having monomeric units coupled via bioerodible urethane or ester moieties;R represents a linear or branched optionally substituted hydrocarbon;Z is a linking group; andD is a releasable bioactive agent.The bioerodible polymer - bioactive agent conjugate according to claim 1, wherein A and B each comprise a copolymer of polyurethane and polyester.The bioerodible polymer - bioactive agent conjugate according to claim 1 or claim 2, which comprises less than 25 mol% of polymerised residues that are derived from a C2 diol, relative to the total number of moles of polymerized diol residues.The bioerodible polymer - bioactive agent conjugate according to any one of claims 1 to 3, wherein D is coupled through Z to R by an ester, amide, anhydride, imide, carbonate, peroxide, peroxyester, thiol, phosphate ester, thioester, sulphate ester, carbamate, azo or boronate ester moiety.The bioerodible polymer - bioactive agent conjugate according to claim 4, wherein Z is selected from -O-; -C(O)-; and optionally substituted: -OC(O)-C1-18alkylene-C(O)-; -C(O)O-C1-18alkylene-C(O)-; -NRaC(O)-C1-18alkylene-C(O)-; -C(O)O-C1-18alkylene-O-; -O-C1-18alkylene-O-; -O-C1-18alkylene-NRa-; -OC(O)-C1-18alkylene-NRa-; -C(O)-C1-18alkylene-NRa-; -OC(O)-C1-18alkylene-O-; -C(O)-C1-18alkylene-O-; and-C(O)NRa-C1-18alkylene-NRa- where Ra is selected from hydrogen, C1-18alkyl, C1-18alkenyl, C1-18alkynyl, C6-18aryl, C3-18carbocyclyl, C3-18heteroaryl, C3-18heterocyclyl, and C7-18arylalkyl.The bioerodible polymer - bioactive agent conjugate according to any one of claims 1 to 5, wherein R is a linear or branched optionally substituted hydrocarbon having from 1 to 12 carbon atoms.The bioerodible polymer - bioactive agent conjugate according to any one of claims 1 to 6, wherein the bioactive agent (D) is selected from 5-alpha-reductase inhibitors, amebicides, aminosalicylates, anaesthetics (general and local), analgesics, angiotensin inhibitors, anorexiants, antacid agents, anti-angiogenic agents, antianginal agents, antiarrythmic agents, antiarthritic agents, antibiotics, antibacterial agents, antibodies, anticoagulants, anticonvulsants, antidepressants, antiepileptic agents, antifungals, anthelmintics, antihistamines, antihypertensives, antihyperlipidemic agents, antiinfectives, antiinflammatories, antiemetics, antimalarial, antimetabolites, antimigraine, antimitotics, antiparasitic agents, antiparkinson agents, antipsychotics, antiprotozoals, antitussives, antiulcer agents, antivirals, anxiolytics, bronchodilators, decongestants and expectorants, cancer therapy and related pharmaceuticals, cardiovascular pharmaceuticals, central nervous system pharmaceuticals, benzopidazepines, beta-adrenergic blocking agents, bisphosphonates, calcium channel blockers, carbonic anhydrase inhibitors, chemokine receptor antagonist, coumarins and indadiones, cox-2 inhibitors, contraceptives, cytotoxics, diuretics, diabetes therapies, growth hormones, fertility pharmaceuticals, hematinics, glucose modifying agents, growth promoters, H2 antagonists, heparin and heparin antagonists, hormone replacement therapies, hemostatics, immunosuppressants, immunostimulants, inotropic agents, interferons, hormones and analogs, impotence agents, kinase inhibitors, laxatives, leukotriene modifiers, macrolides, mast cell stabilizers, muscle relaxants/stimulants, mydiratics, neuromuscular blocking agents, obesity therapeutics, ophthalmic pharmaceuticals, osteoporosis drugs, pain therapeutics, peptides and polypeptides, peripheral vasodilators, platelet inhibitors/stimulating agents, prolactin inhibitors, protease inhibitors, protein therapeutics, proton pump inhibtors, radiopharmaceuticals, respiratory pharmaceuticals, sedatives, spermicides, steroids, smoking cessation agents, statins, stimulants and tranquilizers, sulphonamides, thyroid drugs, urinary acidifiers/alkalinisers, and vasodilators.The bioerodible polymer - bioactive agent conjugate according to any one of claims 1 to 7, wherein the bioactive agent (D) is selected from fluoroquinolone antibiotics.The bioerodible polymer - bioactive agent conjugate according to claim 8, wherein the bioactive agent (D) is selected from alatrofloxacin, balofloxacin, ciprofloxacin, clinafloxacin, danofoxacin, delafloxacin, dextrofloxacin, difloxacin, enoxacin, enrofloxacin, garenoxacin, gatifloxacin, gemifloxacin, grepafloxacin, levofloxacin, lomefloxacin, marbofloxacin, moxifloxacin, nadifloxacin, norfloxacin, ofloxacin, orbifloxacin, pefloxacin, sitafloxacin, sparfloxacin, temafloxacin, tosufloxacin, tosulfloxacin and trovafloxacin.The bioerodible polymer - bioactive agent conjugate according to any one of claims 1 to 9, obtainable by polymerising a monomer - bioactive moiety conjugate of formula (II):  where: R represents a linear or branched optionally substituted hydrocarbon;Z is a spacer moiety; andD is a releasable bioactive moiety; with a polyisocyanate and a polyester polyol.The bioerodible polymer - bioactive agent conjugate according to claim 10, wherein the polyisocyanate is selected from the group consisting of m-phenylene diisocyanate, p-phenylene diisocyanate, 2,4-toluene diisocyanate, 2,6-toluene diisocyanate, 1,6-hexamethylene diisocyanate, 1,4-hexamethylene diisocyanate, 1,3-cyclohexane diisocyanate, 1,4-cyclohexane diisocyanate, hexahydro-toluene diisocyanate and its isomers, isophorone diisocyanate, dicyclo-hexylmethane diisocyanates, 1,5-napthylene diisocyanate, 4,4'-diphenylmethane diisocyanate, 2,4' diphenylmethane diisocyanate, 4,4'-biphenylene diisocyanate, 3,3'-dimethoxy-4,4'-biphenylene diisocyanate, 3,3'-dimethyl-diphenylpropane-4,4'-diisocyanate, 2,4,6-toluene triisocyanate, 4,4'-dimethyl-diphenylmethane-2,2',5,5'-tetraisocyanate, polymethylene polyphenhyl polyisocyanates and alkyl esters of lysine diisocyanate (preferably ethyl ester of lysine diisocyanate) and combinations thereof.The bioerodible polymer - bioactive agent conjugate according to any one of claims 10 or claim 11, wherein the polyester polyol is selected from the group consisting of polycaprolactone diol (PCLD), poly(DL lactide) (DLLA) and poly(lactic acid-co-glycolic acid) (PLGA) and combinations thereof.A process for preparing a bioerodible polymer - bioactive agent conjugate according to any one of claims 1 to 9, said process comprising the step of polymerising a monomer - bioactive agent conjugate of formula (II):  wherein: R, Z and D are as defined in any one of claims 1 to 9;with a polyisocyanate and a polyester polyol.A monomer - bioactive agent conjugate that is suitable for use in preparing a bioerodible polymer - bioactive agent conjugate, the monomer - bioactive agent conjugate having a structure of general formula (II):  wherein: R represents a linear or branched optionally substituted hydrocarbon;Z is a linking group; andD is a releasable bioactive agent selected from fluoroquinolone antibiotics.A monomer - bioactive agent conjugate according to claim 14, wherein the bioactive agent (D) is selected from alatrofloxacin, balofloxacin, ciprofloxacin, clinafloxacin, danofoxacin, delafloxacin, dextrofloxacin, difloxacin, enoxacin, enrofloxacin, garenoxacin, gatifloxacin, gemifloxacin, grepafloxacin, levofloxacin, lomefloxacin, marbofloxacin, moxifloxacin, nadifloxacin, norfloxacin, ofloxacin, orbifloxacin, pefloxacin, sitafloxacin, sparfloxacin, temafloxacin, tosufloxacin, tosulfloxacin and trovafloxacin."
context2 = "CLAIMS: A method of assaying nucleic acids in a sample, comprising the steps of: a) adding multiple sets of probes into the sample to form a mixture, each set of probes comprising: i. a first probe having a first portion at least partially complementary to a first region of a target nucleic acid in the sample and a second portion forming a first primer binding site;ii. a second probe having a first portion at least partially complementary to a second region of the target nucleic acid in the sample and a second portion forming a second primer binding site, wherein the 5' end of the first probe is adjacent to the 3' end of the second probe when both probes are hybridized to the target nucleic acid;b) denaturing nucleic acids in the mixture;c) hybridizing the set of probes to the complementary regions of the target nucleic acid;d) performing a ligation reaction with a ligase enzyme on the set of hybridized probes to connect the adjacent 5' end of the first probe and the 3' end of the second probe to form a third probe, wherein steps b-d are repeated 1-100 times;e) amplifying the third probe with multiple sets of primers to obtain an amplification product, each set of primers comprising: i. a first primer at least partially complementary to the first primer binding site in one or more first probes of the multiple sets of probes;ii. a second primer at least partially complementary to the second primer binding site in one or more second probes of the multiple sets of probes;f) assaying the presence, absence or quantity of the target nucleic acid in the sample by determining the presence, absence or quantity of the third probe in the amplification product; and wherein at least one primer of each set of primers is labeled with a detectable moiety; and at least one primer of the multiple sets of primers includes a stuffer sequence; and at least one probe of the multiple sets of probes includes a stuffer sequence, wherein the measurement is carried out using capillary electrophoresis.The method of claim 1, wherein the stuffer sequence in the at least one primer of the multiple sets of primers has about 10 to about 500 nucleotides, preferably about 10 to about 60 nucleotides; and/or the stuffer sequence in the at least one probe of the multiple sets of probes has about 1 to about 200 nucleotides, preferably about 1 to about 55 nucleotides.The method of claim 1 or 2, wherein at least one primer of each set of primers includes an oligonucleotide comprising a sequence GTTTCTT or a functional equivalent variant of the oligonucleotide comprising a sequence GTTTCTT.The method of any one of preceding claims, wherein the determination of the presence, absence or quantity of the third probe in the amplification product is carried out by measuring the presence, absence or quantity of the third probe in the amplification product on the basis of detectable moieties, fragment sizes, or both.The method of any one of preceding claims, wherein the moiety is a fluorescent dye; preferably the moiety is a fluorescent dye selected from the group consisting of FAM (5-or 6-carboxyfluorescein), VIC, NED, PET, Fluorescein, FITC, IRD-700/800, CY3, CY5, CY3.5, CY5.5, HEX, TET, TAMRA, JOE, ROX, BODIPY TMR, Oregon Green, Rhodamine Green, Rhodamine Red, Texas Red, and Yakima Yellow.The method of any one of preceding claims, wherein the target nucleic acid is the dystrophin gene having a deletion of one or more exons and the sets of probes for assaying the dystrophin gene comprise one or more probe pairs selected from SEQ ID NOs: 158-541; or the target nucleic acid corresponds to a part of human chromosome 21 and the sets of probes for assaying the part of human chromosome 21 comprise probe pairs selected from SEQ ID NOs: 559-942.The method of any one of preceding claims, wherein the denaturing step is carried at about 90°C to about 99°C for about 5 seconds to about 30 minutes, and the hybridization and the ligation steps are carried out simultaneously at about 4°C to about 70°C for about 1 minute to about 48 hours, preferably the denaturing step is carried at about 95°C for about 30 seconds, and the hybridization and the ligation steps are carried out simultaneously at about 58°C for about 4 hours, and the steps of denaturing, hybridization and ligation are repeated 4 times.The method of any one of preceding claims, wherein two or more sets of probes are used to hybridize to two or more target sites in the target nucleic acid, with each set of probes hybridizing to a different target site.The method of any one of preceding claims, wherein the target nucleic acid has a quantitative variation of about 0.1% to about 30% between two samples.The method of any one of preceding claims, wherein one set of primers is used to amplify a group of the third probes, said group of the third probes comprises multiple third probes which are formed from multiple sets of probes hybridizing to multiple target sites and from multiple sets of reference probes hybridizing to multiple reference target sites.The method of any one of preceding claims, wherein the multiple third probes in the group are formed from about 1 to about 100 sets of probes hybridizing to target sites and about 1 to about 100 sets of reference probes.The method of any one of preceding claims, wherein about 50 to about 500 sets of probes are used to hybridize to about 50 to about 500 target sites on the target nucleic acid.The method of any one of preceding claims, wherein the target nucleic acid corresponds to at least a part of human chromosome 21, human chromosome 18, human chromosome 13, human chromosome region 22q11.2, or the pseudoautosomal regions of human chromosomes X or Y in a maternal blood or urine sample.The method of any one of preceding claims, wherein the copy number of each target site is determined by the following four steps: a) a ratio of the quantity of the third probe targeting the target site to the quantity of the third probe in the same group targeting one reference sites are calculated; b) a copy number value is calculated by this ratio value in a test sample divided by the corresponding ratio value in a control sample or the median value of the corresponding ratio values in all control samples or all test samples, and then times 2; c) more copy number values are calculated by repeating a-b by using the quantity of another third probe in the same group targeting another reference sites in step a; d) the copy number of each target site is calculated by taking the average or median of all the copy number values with or without abandoning egregious value.The method of any one of preceding claims, further comprising a step of determining the copy number of the target nucleic acid in a sample by taking the average or median of the copy numbers of all target sites on the target nucleic acid or by taking the average or median of the copy numbers of all target sites on the target nucleic acid after abandoning egregious values.A kit for assaying nucleic acids in a sample according to claims 1-15, comprising: a) multiple sets of probes corresponding to a target nucleic acid, each set of probes comprising: i. a first probe having a first portion at least partially complementary to a first region of a target nucleic acid in the sample and a second portion forming a first primer binding site;ii. a second probe having a first portion at least partially complementary to a second region of the target nucleic acid in the sample and a second portion forming a second primer binding site, wherein the 5' end of the first probe is adjacent to the 3' end of the second probe when both probes are hybridized to the target nucleic acid and the first and the second probes may be ligated to form a third probe;b) multiple sets of primers for amplifying the third probe, wherein each set of multiple sets of primers comprising: i. a first primer at least partially complementary to the first primer binding site in one or more first probes of the multiple sets of probes;ii. a second primer at least partially complementary to the second primer binding site in one or more second probes of the multiple sets of probes;c) reagents including a ligase, a buffer for a ligation reaction, a DNA polymerase, a buffer for a polymerase chain reaction, or a combination thereof; andd) optionally a brochure containing instructions of using the kit; wherein at least one primer of each set of primers is labeled with a detectable moiety; and at least one primer of the multiple sets of primers includes a stuffer sequence; and at least one probe of the multiple sets of probes includes a stuffer sequence, wherein the measurement is carried out using capillary electrophoresis."

context = "Lynx have a short tail, characteristic tufts of black hair on the tips of their ears, large, padded paws for walking on snow and long whiskers on the face. Under their neck, they have a ruff, which has black bars resembling a bow tie, although this is often not visible. Body colour varies from medium brown to goldish to beige-white, and is occasionally marked with dark brown spots, especially on the limbs. All species of lynx have white fur on their chests, bellies and on the insides of their legs, fur which is an extension of the chest and belly fur. The lynx 's colouring, fur length and paw size vary according to the climate in their range. In the Southwestern United States, they are short-haired, dark in colour and their paws are smaller and less padded. In colder northern climates lynx have thicker and lighter fur as well as larger and more padded paws that are well-adapted to snow." 
retrieved_chunk = context1 + context2
question = "Does the given context relate to business?"
rag_prompt = f"""
Context information is below.
---------------------
{retrieved_chunk}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {question}
Answer:
"""

prompt = generate_prompt_from_template(
    rag_prompt
)

generate_text(
    prompt,
    max_tokens=2048, # length of the model response.
)

Llama.generate: 4965 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =  227701.52 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    5940.47 ms /    32 runs   (  185.64 ms per token,     5.39 tokens per second)
llama_perf_context_print:       total time =    5952.48 ms /    33 tokens


'No, the given context does not primarily relate to business. Instead, it focuses on scientific research, specifically biomaterials and genetic assays.'

 > As we mentioned earlier, we’d **like to represent each entry in our GitHub issues corpus as a single vector**, so we **need to “pool” or average our token embeddings** in some way. One popular approach is to **perform CLS pooling on our model’s outputs**, where we **simply collect the last hidden state for the special [CLS] token**. The following function does the trick for us:

## Start creating embeddings

In [51]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

> Next, we’ll create a helper function that will tokenize a list of documents, place the tensors on the GPU, feed them to the model, and finally apply CLS pooling to the outputs:

In [54]:
def get_embeddings(text_list, imp_model, imp_tokenizer):
    # encoded_input = imp_tokenizer(
    #     text_list, padding=True, truncation=True, return_tensors="pt"
    # )
    encoded_input = imp_tokenizer(
        text_list, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = imp_model(**encoded_input)
    return cls_pooling(model_output)

In [55]:
embedding = get_embeddings(all_samples[0]["claims"][0], model, tokenizer)
embedding.shape

torch.Size([1, 4096])

In [30]:
embedding_all_samples = []
for idx, elem in enumerate(all_samples):
    embedding_all_samples.append( elem.map(
                                    lambda x: {"embeddings": get_embeddings(x["claims"], model, tokenizer).detach().cpu().numpy()[0]}
                                    )
                                )   

Map:  44%|████▍     | 1632/3728 [03:10<04:04,  8.56 examples/s]


KeyboardInterrupt: 

In [28]:
# for demonstration, assume we only have the first sample being tracked by FAISS:
embedding_all_samples[0].add_faiss_index(column="embeddings")

100%|██████████| 1/1 [00:00<?, ?it/s]


Dataset({
    features: ['title', 'claims', 'ipc', 'claims_length', 'embeddings'],
    num_rows: 20
})

In [29]:
# from the embedded samples, get some idea for testing questions:
embedding_all_samples[0]['claims'][0], embedding_all_samples[0]['title'][0]

("A method of assaying nucleic acids in a sample, comprising the steps of: a) adding multiple sets of probes into the sample to form a mixture, each set of probes comprising: i. a first probe having a first portion at least partially complementary to a first region of a target nucleic acid in the sample and a second portion forming a first primer binding site;ii. a second probe having a first portion at least partially complementary to a second region of the target nucleic acid in the sample and a second portion forming a second primer binding site, wherein the 5' end of the first probe is adjacent to the 3' end of the second probe when both probes are hybridized to the target nucleic acid;b) denaturing nucleic acids in the mixture;c) hybridizing the set of probes to the complementary regions of the target nucleic acid;d) performing a ligation reaction with a ligase enzyme on the set of hybridized probes to connect the adjacent 5' end of the first probe and the 3' end of the second pro

In [30]:
# question = "How can I load a dataset offline?"
question = "How to test nucleic acids in a sample"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [31]:
scores, samples = embedding_all_samples[0].get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [32]:
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [33]:
for _, row in samples_df.iterrows():
    print(f"TITLE: {row.title}")
    print(f"SCORE: {row.scores}")
    print(f"CLAIMS: {row.claims}")
    print("=" * 50)
    print()

TITLE: POLYMER-BIOACTIVE AGENT CONJUGATES
SCORE: 338.6058349609375
CLAIMS: A bioerodible polymer - bioactive agent conjugate comprising as part of its polymer backbone a moiety of general formula (I):  wherein: A and B, which may be the same or different, represent the remainder of the polymer backbone and are (i) attached to the -O-R(ZD)-O- moiety as shown in formula (I) via a bioerodible moiety, and (ii) comprise a poly(urethane-ester) having monomeric units coupled via bioerodible urethane or ester moieties;R represents a linear or branched optionally substituted hydrocarbon;Z is a linking group; andD is a releasable bioactive agent.The bioerodible polymer - bioactive agent conjugate according to claim 1, wherein A and B each comprise a copolymer of polyurethane and polyester.The bioerodible polymer - bioactive agent conjugate according to claim 1 or claim 2, which comprises less than 25 mol% of polymerised residues that are derived from a C2 diol, relative to the total number of mo

# Save and reload FAISS database

**references:**

[1] https://huggingface.co/docs/datasets/v1.2.0/faiss_and_ea.html

[2] https://discuss.huggingface.co/t/save-and-load-datasets/9260

## Save

IMPORTANT:

[1] must save the dataset which contains the corresponding computed embeddings

In [34]:
for idx, elem in enumerate(embedding_all_samples):
    saved_path = '../backend-API/data_embeddings/epo_dataset' + str(idx)
    print(saved_path)
    try:
        elem.save_to_disk(saved_path)
    except ValueError:
        elem.drop_index('embeddings')
        elem.save_to_disk(saved_path)
# embedding_all_samples[0].save_to_disk('./data_embeddings/epo_dataset0')

../backend-API/data_embeddings/epo_dataset0


Saving the dataset (1/1 shards): 100%|██████████| 20/20 [00:00<?, ? examples/s]


../backend-API/data_embeddings/epo_dataset1


Saving the dataset (1/1 shards): 100%|██████████| 20/20 [00:00<?, ? examples/s]


# Searching in multiple databse using FAISS index

[1] https://huggingface.co/learn/cookbook/en/semantic_cache_chroma_vector_database

[2] https://www.pinecone.io/learn/series/faiss/faiss-tutorial/

In [3]:
# check how many data files 
# ref: https://pythonclcoding.medium.com/count-files-and-folders-using-python-93e0f8dc9337#:~:text=dirs%20%2B%3D%20len(dirnames)%20%3A,directory%20to%20the%20files%20counter.
PATH = 'G:/PhD/EPO2025/Shared/data_embeddings'
files, dir_names = 0, []
for root, dirnames, filenames in os.walk(PATH):
    print(f"looking into: {root}")
    if dirnames != []:
        for idx, elem in enumerate(dirnames):
            dir_names.append(elem)
    print(dirnames)
    files += len(filenames)

print(f"files: {files}")
print(f"directories: {dir_names}")


looking into: G:/PhD/EPO2025/Shared/data_embeddings
['epo_dataset0']
looking into: G:/PhD/EPO2025/Shared/data_embeddings\epo_dataset0
[]
files: 3
directories: ['epo_dataset0']


In [4]:
from datasets import load_from_disk
load_all_dataset = []
for idx, names in enumerate(dir_names):
    dataset_name = PATH + '/'+ names
    loaded_dataset = load_from_disk(dataset_name)
    load_all_dataset.append(loaded_dataset)
print( load_all_dataset[0],
     len(load_all_dataset)
    )

Dataset({
    features: ['title', 'claims', 'ipc', 'claims_length', 'embeddings'],
    num_rows: 375
}) 1


In [6]:
print(
    load_all_dataset[0],
    load_all_dataset[0]['title'][0],
    load_all_dataset[0]['title'][0]
)

Dataset({
    features: ['title', 'claims', 'ipc', 'claims_length', 'embeddings'],
    num_rows: 375
}) METHOD FOR MULTIPLEX NUCLEIC ACID ANALYSIS METHOD FOR MULTIPLEX NUCLEIC ACID ANALYSIS


In [7]:
from datasets import concatenate_datasets
all_dataset = concatenate_datasets(load_all_dataset)

In [9]:
all_dataset

Dataset({
    features: ['title', 'claims', 'ipc', 'claims_length', 'embeddings'],
    num_rows: 375
})

## FAISS 

In [10]:
import numpy as np
import faiss

In [103]:
# get the stored embeddings 
test_emds = np.array(all_dataset['embeddings'])
print(test_emds.shape)

# add faiss index
index_faiss = faiss.IndexFlatL2(test_emds.shape[1]) 
print(index_faiss.is_trained)
index_faiss.add(test_emds)                  # add vectors to the index
print(index_faiss.ntotal)

(40, 768)
True
40


In [115]:
question = "How to test nuclear acids in a sample?"
question_embedding = get_embeddings([question], model, tokenizer).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [105]:
k = 5
[distance_res], [index_res] = index_faiss.search(question_embedding, k)
print(index_res)
print(distance_res)

[ 2  8 28 16 18]
[177.28308 183.54887 212.45772 240.05504 245.35034]


In [106]:
for i, d in zip(index_res, distance_res):
    # print(i, d)
    print(f"TITLE: {all_dataset['title'][i]}")
    print(f"DISTANCE: {d}")
    print(f"CLAIMS: {all_dataset['claims'][i]}")
    print("=" * 50)
    print()

TITLE: EMULSIFYING DISPERSANTS, METHOD FOR EMUSIFICATION AND DISPERSION WITH THE SAME, EMULSIONS, AND EMULSION FUELS
DISTANCE: 177.2830810546875
CLAIMS: An emulsification dispersant, wherein the main component is vesicles that are formed from amphiphilic substances capable of forming vesicles spontaneously and that adhere onto the surface of an oil based material, and wherein the average particle size of said vesicles is 8 nm to 500 nm when the emulsion is being formed, and 200 nm to 800 nm when the dispersant is being conditioned within a concentration range of 5 to 20 wt.% in the dispersion, wherein said amphiphilic substance is selected from a group comprising a. derivatives with an average number of 5 to 15 added ethylene oxide molecules (E), selected from among polyoxyethylene hydrogenated castor oil derivatives represented by the following general formula (Formula 1): b. halides of dialkylammonium derivatives, trialkylammonium derivatives, tetraalkylammonium derivatives, dialkeny

## HUGGING FACE FAISS

In [107]:
all_dataset.add_faiss_index(column="embeddings")

100%|██████████| 1/1 [00:00<00:00, 179.94it/s]


Dataset({
    features: ['title', 'claims', 'ipc', 'claims_length', 'embeddings'],
    num_rows: 40
})

In [108]:
# question = "Is solar pannel friendly to the environment"
# question_embedding = get_embeddings([question]).cpu().detach().numpy()
# question_embedding.shape

In [109]:

scores, samples = all_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=k
)
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

for _, row in samples_df.iterrows():
    print(f"TITLE: {row.title}")
    print(f"SCORE: {row.scores}")
    print(f"DESCRIPTION: {row.claims}")
    print("=" * 50)
    print()


TITLE: POLYMER-BIOACTIVE AGENT CONJUGATES
SCORE: 245.350341796875
DESCRIPTION: A bioerodible polymer - bioactive agent conjugate comprising as part of its polymer backbone a moiety of general formula (I):  wherein: A and B, which may be the same or different, represent the remainder of the polymer backbone and are (i) attached to the -O-R(ZD)-O- moiety as shown in formula (I) via a bioerodible moiety, and (ii) comprise a poly(urethane-ester) having monomeric units coupled via bioerodible urethane or ester moieties;R represents a linear or branched optionally substituted hydrocarbon;Z is a linking group; andD is a releasable bioactive agent.The bioerodible polymer - bioactive agent conjugate according to claim 1, wherein A and B each comprise a copolymer of polyurethane and polyester.The bioerodible polymer - bioactive agent conjugate according to claim 1 or claim 2, which comprises less than 25 mol% of polymerised residues that are derived from a C2 diol, relative to the total number o

# Save Huggingface dataset to json file

In [20]:
import json

In [21]:
json_db_path = 'G:/PhD/EPO2025/SQLite_Tutorial/experiments/data/experiment_3/epo_sample_embeddings_dataset.json'

In [22]:
test = all_dataset.to_list()

In [23]:
print(
    len(test),
    type(test[0])
    )

375 <class 'dict'>


In [None]:
# Save to json file
with open(json_db_path, 'w') as file:
    json.dump(test, file)

In [None]:
# Load from json file
with open(json_db_path, 'r') as file:
    data = json.load(file)

In [34]:
print(
    len(data),
    type(data[0]),
    data[0]['ipc']
    )

375 <class 'dict'> C
