# Create the embeddings for the query examples

## Prerequisites
- [Ollama](https://ollama.com/search?c=embedding) with the embedding models: `mxbai-embed-large`, `nomic-embed-text`, `all-minilm`
- A pkl file of the classes (check the notebook `Construct_Schema_IDSM_1.ipynb` to generate it)

## Import the required modules

In [2]:
from tqdm import tqdm
from langchain_ollama import OllamaEmbeddings
from langchain_community.docstore import InMemoryDocstore
import faiss
from langchain_community.vectorstores import FAISS
import logging
from pathlib import Path
import os
import pickle

# Quiet the "httpx" logger: let only CRITICAL records through and stop them
# from propagating to ancestor loggers.
httpx_logger = logging.getLogger("httpx")
httpx_logger.setLevel(logging.CRITICAL)
httpx_logger.propagate = False

## Prepare the embedding variables

We use the Ollama embeddings with one of the following models: `mxbai-embed-large`, `nomic-embed-text`, `all-minilm`.

In [3]:
# Embedding function backed by the "nomic-embed-text" model.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

We use the FAISS vector store.

In [None]:
# Embed a probe sentence once; the length of its vector is the dimension
# the flat L2 index is created with.
probe_vector = embeddings.embed_query("hello world")
flat_l2_index = faiss.IndexFlatL2(len(probe_vector))

# Empty FAISS store: no documents and no id mapping yet.
vectorstore = FAISS(
    embedding_function=embeddings,
    index=flat_l2_index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

We initialise the query directory and the saving/loading path. 

Note that all the embeddings are available at [this MyBox URL](https://mybox.inria.fr/d/24d9423c67d64f8284fa/); you can download them to avoid waiting for the embedding process to finish. The password is: `Kc8(-8aE`

## Load the classes from the pickle

In [None]:
# Directory holding the pickled classes: <cwd>/../../data/saved_pkls/idsm
classes_directory = Path.cwd().parent.parent / "data" / "saved_pkls" / "idsm"

# NOTE(security): unpickling can execute arbitrary code, so only load a
# classes.pkl that you generated yourself or obtained from a trusted source.
# The path is joined with pathlib rather than by string formatting.
with open(classes_directory / "classes.pkl", "rb") as handle:
    classes = pickle.load(handle)

# Report how many classes were loaded (done after the file is closed).
print(len(classes))

In [5]:
# Folder of the FAISS index: <cwd>/../../data/faiss_embeddings/idsm/<index name>
saving_path = Path.cwd().parent.parent.joinpath(
    "data", "faiss_embeddings", "idsm", "v3_4_full_nomic_faiss_index"
)

Prepare the documents to be ingested

In [None]:
# One text document per class: the class' string representation.
documents = list(map(str, classes))
len(documents)

## Ingest the documents

In [None]:
# Number of documents handed to the vector store per call. Passing the
# documents in batches instead of one by one means far fewer
# `from_texts` / `add_texts` calls for the same resulting index order.
BATCH_SIZE = 64

db = None
with tqdm(total=len(documents), desc="Ingesting documents") as pbar:
    for start in range(0, len(documents), BATCH_SIZE):
        batch = documents[start:start + BATCH_SIZE]
        if db is None:
            # The first batch creates the store ...
            db = FAISS.from_texts(batch, embedding=embeddings)
        else:
            # ... every later batch is appended to it.
            db.add_texts(batch)
        # Advance the bar by the number of documents just ingested.
        pbar.update(len(batch))

## Save the embeddings locally

In [None]:
# Write the FAISS store into the `saving_path` folder.
db.save_local(folder_path=saving_path)

## Load the embeddings

In [6]:
# Reload the store from `saving_path`. Deserialization is explicitly allowed
# here, so only load an index that comes from a trusted source.
db = FAISS.load_local(
    folder_path=saving_path,
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)
# Size of the loaded index.
db.index.ntotal

232922

## Example of query selection

In [7]:
# Candidate query phrasings, from a full sentence down to bare keywords.
queries = [
    "What protein targets does donepezil (CHEBI_53289) inhibit with an IC50 less than 10 µM?",
    "What protein targets does (CHEBI_124758) inhibit with an PF5 less than 10 µM?",
    "protein targets donepezil (CHEBI_53289) inhibit with IC50",
    "protein targets donepezil (CHEBI_53289) IC50",
    "protein donepezil (CHEBI_53289) IC50",
    "donepezil (CHEBI_53289) IC50",
    "donepezil 53289 IC50",
]

# Only the first phrasing is searched in this example.
query = queries[0]

# Fetch the two most similar documents for that query.
retrieved_documents = db.similarity_search(query, k=2)

# Print the content of each hit, followed by a separator line.
for hit in retrieved_documents:
    print(f"{hit.page_content}\n\n-----------------------------------------\n")

('http://purl.obolibrary.org/obo/CHEBI_174058', 'Donepezil metabolite M4', None)

-----------------------------------------

('http://purl.obolibrary.org/obo/OBI_0001992', 'assay measuring the half maximal inhibitory concentration [IC50] of a MHC:ligand complex', None)

-----------------------------------------



## Evaluate the different query forms

This is useful to see whether a preprocessing step would be beneficial

In [8]:
# Query phrasings to compare against each other.
queries = [
    "What protein targets does donepezil (CHEBI_53289) inhibit with an IC50 less than 10 µM?",
    "protein, targets, donepezil, CHEBI_53289, inhibit, IC50",
    "protein targets donepezil (CHEBI_53289) inhibit with IC50",
    "protein targets donepezil (CHEBI_53289) IC50",
    "protein donepezil (CHEBI_53289) IC50",
    "donepezil (CHEBI_53289) IC50",
    "donepezil 53289 IC50",
]

# IRIs that are looked for in the retrieved documents.
to_find_iris = [
    "http://purl.obolibrary.org/obo/CHEBI_53289",
    "http://www.bioassayontology.org/bao#BAO_0000190",
    "http://www.bioassayontology.org/bao#BAO_0000040",
    "http://purl.obolibrary.org/obo/CHEBI_105741",
    "http://purl.obolibrary.org/obo/CHEBI_109001",
    "http://purl.obolibrary.org/obo/CHEBI_109002",
    "http://purl.obolibrary.org/obo/CHEBI_109462",
    "http://purl.obolibrary.org/obo/CHEBI_114247",
    "http://purl.obolibrary.org/obo/CHEBI_95316",
]

# Best phrasing seen so far; stays at -1 / 0 when no query scores above 0.
best_q = -1
best_score = 0

for q in queries:
    # Fetch the 20 most similar documents for this phrasing.
    retrieved_documents = db.similarity_search(q, k=20)

    # One point for every (document, IRI) pair where the IRI occurs in the
    # document's text.
    score = sum(
        1
        for doc in retrieved_documents
        for iri in to_find_iris
        if iri in doc.page_content
    )

    # Strict comparison: on a tie the earlier phrasing is kept.
    if score > best_score:
        best_score, best_q = score, q

    print(f"{q} => {score}")

print()
print(f"{best_q} => {best_score}")

What protein targets does donepezil (CHEBI_53289) inhibit with an IC50 less than 10 µM? => 0
protein, targets, donepezil, CHEBI_53289, inhibit, IC50 => 1
protein targets donepezil (CHEBI_53289) inhibit with IC50 => 1
protein targets donepezil (CHEBI_53289) IC50 => 1
protein donepezil (CHEBI_53289) IC50 => 1
donepezil (CHEBI_53289) IC50 => 1
donepezil 53289 IC50 => 2

donepezil 53289 IC50 => 2


## Get the context of the selected classes

In [2]:
import os
import sys

# Put the directory two levels above the working directory on sys.path
# (presumably so the `app` imports of this notebook resolve).
module_path = os.path.abspath(os.path.join('..', '..'))  # Adjust the dots as needed
if module_path not in sys.path:
    sys.path.append(module_path)

# Create ./logs when it is missing. `exist_ok=True` replaces the
# `exists(...) == False` check and avoids the check-then-create race.
os.makedirs("logs", exist_ok=True)

import ast
from app.core.utils.construct_util import get_empty_graph_with_prefixes, format_class_graph_file, get_context_if_not_found

In [None]:
# Start from the empty graph returned by the helper and merge into it the
# context graph of every retrieved class.
g = get_empty_graph_with_prefixes()

for doc in retrieved_documents:
    # The document text is the literal form of the class; its first item
    # is used as the class identifier.
    class_entry = ast.literal_eval(doc.page_content)
    class_iri = class_entry[0]
    graph_file = format_class_graph_file(class_iri)
    print("Current Classe: " + class_iri)
    context_graph = get_context_if_not_found(class_entry, graph_file)
    g = g + context_graph

# Write the merged graph as Turtle to <cwd>/../../tmp/notebook_merging.ttl
saving_graph = Path.cwd().parent.parent / "tmp" / "notebook_merging.ttl"
g.serialize(destination=str(saving_graph), format="turtle")


### Embeddings with Chroma Vector Store

The idea is the same, but we need to use the Chroma classes; to save the embeddings it is **important** to specify the `persist_directory` when creating the store.

In [9]:
from langchain_chroma import Chroma

In [10]:
# Embedding function for the Chroma store, backed by "nomic-embed-text".
embeddings_cn = OllamaEmbeddings(model="nomic-embed-text")

In [11]:
# Folder of the Chroma index: <cwd>/../../data/chroma_embeddings/idsm/<index name>
saving_path_chroma = Path.cwd().parent.parent.joinpath(
    "data", "chroma_embeddings", "idsm", "v3_4_full_nomic_chroma_index"
)

# Open the store on that folder, then count the documents it returns.
dbcn = Chroma(
    embedding_function=embeddings_cn,
    persist_directory=str(saving_path_chroma),
)
stored = dbcn.get()
len(stored["documents"])

232922

In [None]:

# Number of documents handed to the vector store per call. Passing the
# documents in batches instead of one by one means far fewer
# `from_texts` / `add_texts` calls for the same stored documents.
BATCH_SIZE = 64

dbcn = None
with tqdm(total=len(documents), desc="Ingesting documents") as pbar:
    for start in range(0, len(documents), BATCH_SIZE):
        batch = documents[start:start + BATCH_SIZE]
        if dbcn is None:
            # The first batch creates the store; `persist_directory` must be
            # given here so the embeddings are saved.
            dbcn = Chroma.from_texts(
                texts=batch,
                embedding=embeddings_cn,
                persist_directory=str(saving_path_chroma),
            )
        else:
            # Every later batch is appended to the existing store.
            dbcn.add_texts(batch)
        # Advance the bar by the number of documents just ingested.
        pbar.update(len(batch))

And for the search we use:

In [None]:
# Three most similar documents for the keyword "ic50".
dbcn.similarity_search(query="ic50", k=3)

In [None]:
# Query phrasings to compare against each other.
queries = [
    "What protein targets does donepezil (CHEBI_53289) inhibit with an IC50 less than 10 µM?",
    "protein, targets, donepezil, CHEBI_53289, inhibit, IC50",
    "protein targets donepezil (CHEBI_53289) inhibit with IC50",
    "protein targets donepezil (CHEBI_53289) IC50",
    "protein donepezil (CHEBI_53289) IC50",
    "donepezil (CHEBI_53289) IC50",
    "donepezil 53289 IC50",
]

# IRIs that are looked for in the retrieved documents.
to_find_iris = [
    "http://purl.obolibrary.org/obo/CHEBI_53289",
    "http://www.bioassayontology.org/bao#BAO_0000190",
    "http://www.bioassayontology.org/bao#BAO_0000040",
    "http://purl.obolibrary.org/obo/CHEBI_105741",
    "http://purl.obolibrary.org/obo/CHEBI_109001",
    "http://purl.obolibrary.org/obo/CHEBI_109002",
    "http://purl.obolibrary.org/obo/CHEBI_109462",
    "http://purl.obolibrary.org/obo/CHEBI_114247",
    "http://purl.obolibrary.org/obo/CHEBI_95316",
]

# Best phrasing seen so far; stays at -1 / 0 when no query scores above 0.
best_q = -1
best_score = 0

for q in queries:
    # Fetch the 15 most similar documents for this phrasing.
    retrieved_documents = dbcn.similarity_search(q, k=15)

    # One point for every (document, IRI) pair where the IRI occurs in the
    # document's text.
    score = sum(
        1
        for doc in retrieved_documents
        for iri in to_find_iris
        if iri in doc.page_content
    )

    # Strict comparison: on a tie the earlier phrasing is kept.
    if score > best_score:
        best_score, best_q = score, q

    print(f"{q} => {score}")

print()
print(f"{best_q} => {best_score}")

NameError: name 'dbcn' is not defined

### Embeddings with Qdrant Vector Store

The idea is the same, but we need to use the Qdrant classes; to save the embeddings it is **important** to specify the `path` when creating the `QdrantClient`. You also need to set the right vector size when creating the collection, e.g. `size=384` for `all-minilm`.

In [None]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

In [None]:
# Embedding function for the Qdrant store, backed by "mxbai-embed-large".
embeddings_qdrant = OllamaEmbeddings(
    model="mxbai-embed-large",
)

# Folder of the Qdrant index: <cwd>/../../data/qdrant_embeddings/idsm/<index name>
saving_path_qdrant = Path(os.getcwd()).parent.parent / 'data' / 'qdrant_embeddings' / 'idsm' / "v3_4_full_mxbai_qdrant_index"

# Client working on that folder. The path is handed over as a string, the
# same way the Chroma store receives its `persist_directory`.
client = QdrantClient(path=str(saving_path_qdrant))

In [None]:
# Take the vector size from the embedding model itself instead of
# hard-coding it, so it stays correct if `embeddings_qdrant` is switched to
# a model with another dimension.
vector_size = len(embeddings_qdrant.embed_query("hello world"))

# Only create the collection when it is not there yet, so that re-running
# this cell against an already populated folder does not fail.
if not client.collection_exists("pubchem_classes"):
    client.create_collection(
        collection_name="pubchem_classes",
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )


In [None]:
# Vector store bound to the "pubchem_classes" collection of `client`.
qudrant_vs = QdrantVectorStore(
    embedding=embeddings_qdrant,
    collection_name="pubchem_classes",
    client=client,
)

In [None]:
# Number of documents handed to the vector store per call.
BATCH_SIZE = 64

# `qudrant_vs` is already bound to its collection, so every batch is simply
# appended. The former fallback branch called `from_texts` on the very object
# that had just tested falsy, so it could not have created a store.
with tqdm(total=len(documents), desc="Ingesting documents") as pbar:
    for start in range(0, len(documents), BATCH_SIZE):
        batch = documents[start:start + BATCH_SIZE]
        qudrant_vs.add_texts(batch)
        # Advance the bar by the number of documents just ingested.
        pbar.update(len(batch))

And for the search we use:

In [None]:
# Fifteen most similar documents for the keyword "ic50".
qudrant_vs.similarity_search(query="ic50", k=15)

In [None]:
# Query phrasings to compare against each other.
queries = [
    "What protein targets does donepezil (CHEBI_53289) inhibit with an IC50 less than 10 µM?",
    "protein, targets, donepezil, CHEBI_53289, inhibit, IC50",
    "protein targets donepezil (CHEBI_53289) inhibit with IC50",
    "protein targets donepezil (CHEBI_53289) IC50",
    "protein donepezil (CHEBI_53289) IC50",
    "donepezil (CHEBI_53289) IC50",
    "donepezil 53289 IC50",
]

# IRIs that are looked for in the retrieved documents.
to_find_iris = [
    "http://purl.obolibrary.org/obo/CHEBI_53289",
    "http://www.bioassayontology.org/bao#BAO_0000190",
    "http://www.bioassayontology.org/bao#BAO_0000040",
    "http://purl.obolibrary.org/obo/CHEBI_105741",
    "http://purl.obolibrary.org/obo/CHEBI_109001",
    "http://purl.obolibrary.org/obo/CHEBI_109002",
    "http://purl.obolibrary.org/obo/CHEBI_109462",
    "http://purl.obolibrary.org/obo/CHEBI_114247",
    "http://purl.obolibrary.org/obo/CHEBI_95316",
]

# Best phrasing seen so far; stays at -1 / 0 when no query scores above 0.
best_q = -1
best_score = 0

for q in queries:
    # Fetch the 20 most similar documents for this phrasing.
    retrieved_documents = qudrant_vs.similarity_search(q, k=20)

    # One point for every (document, IRI) pair where the IRI occurs in the
    # document's text.
    score = sum(
        1
        for doc in retrieved_documents
        for iri in to_find_iris
        if iri in doc.page_content
    )

    # Strict comparison: on a tie the earlier phrasing is kept.
    if score > best_score:
        best_score, best_q = score, q

    print(f"{q} => {score}")

print()
print(f"{best_q} => {best_score}")