In [1]:
import os 


In [3]:
%pwd


'c:\\Users\\PrinciaFernandes\\Mresult\\Phenomix\\notebooks'

In [4]:
os.chdir("../")

In [5]:
%pwd

'c:\\Users\\PrinciaFernandes\\Mresult\\Phenomix'

In [6]:
from tqdm import tqdm
from src.database.load_vector_data import DataLoader

In [7]:
dataloader = DataLoader()
records = dataloader.get_records()
document = dataloader.data_chuncking(records)
vector_db = dataloader.get_vector_db()

# Process documents in batches
BATCH_SIZE = 500
for i in tqdm(range(0, len(document), BATCH_SIZE), desc="Embedding Batches"):
    batch = document[i:i + BATCH_SIZE]  # Get batch slice
    vector_db.add_documents(batch)

Extracting Data
Data Extraction Completed
Chunking Completed


Embedding Batches: 100%|██████████| 5/5 [01:53<00:00, 22.63s/it]


In [24]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_chroma import Chroma
from src.prompts.Prompts import Filter_template ,Generator_template
from dotenv import load_dotenv
from src.llm_model.gemini_model import embedding_model,chat_model
from src.config import VECTORDB_DIR
import os


In [None]:

load_dotenv()

class ChatBot:
    def __init__(self):

        self.embeddings = embedding_model()
        self.vector_db = Chroma(embedding_function=self.embeddings, persist_directory=VECTORDB_DIR)
        self.llm = chat_model()
        self.parser = JsonOutputParser()
        self.filtering_chain = Filter_template | self.llm | self.parser
        self.generator_chain = Generator_template | self.llm


    def get_result(self, query):

        lower_query = query.lower()
        filtering_result = self.filtering_chain.invoke({"query" : lower_query})
        print(filtering_result)
        
        retriever = self.vector_db.as_retriever(search_type="mmr", search_kwargs = {"k": 10, "filter":filtering_result, 'fetch_k':1000})
        response = retriever.invoke(query)

        tupled_doc = [(doc.metadata,doc.page_content) for doc in response]

        result = self.generator_chain.invoke({"query": query, "content" : tupled_doc})

        return result.content
    


In [26]:
# user_input = input("Enter your query: ").lower()
# output = get_result(query=user_input)
# print("\nFinal Result:\n", output)
chatbot = ChatBot()
result = chatbot.get_result("Give a brief on Peanut Allergy?")
print(result)

{'phenotype_name': 'Peanut Allergy'}
I am unable to provide a response as the content is empty.


In [27]:
filtering_result = chatbot.filtering_chain.invoke({"query" : "Give a brief on Peanut Allergy?"})
print(filtering_result)

{'phenotype_name': 'Peanut Allergy'}


In [37]:
query = "What are PID of Acne?"

lower_query = query.lower()
filtering_result = chatbot.filtering_chain.invoke({"query" : lower_query})
print(filtering_result)

{'phenotype_name': 'acne'}


In [38]:
retriever = chatbot.vector_db.as_retriever(search_type="mmr", search_kwargs = {"k": 10, "filter":filtering_result, 'fetch_k':1000})
response = retriever.invoke(query)


In [39]:
response

[Document(id='4a68184c-b2e7-4043-a1f9-4324438f747b', metadata={'phenotype_name': 'acne', 'website_name': 'cprd'}, page_content='{"pname": "acne", "phenotype_props": {"id": "HXCXX0024", "name": "Acne", "cprd_PID": "CP000003", "hdruk_PID": "HP001121"}, "website_props": {"name": "cprd", "pid": {"0": "CP000003"}}, "detail_props": {"Disease_num": 3, "Disease": "Acne", "PID": "CP000003"}, "concept_props": {"Disease_num": "[3]", "Description": "Iodine acne", "Disease": "[\\"Acne\\"]", "Med_code_id": "[309116010]", "PIDs": "[\\"CP000003\\"]", "System_num": "[15]", "Read_code": "M261400", "Snomed_ct_description_id": "[309116010]", "CID": "CC008401", "Category": "[\\"Diagnosis of Acne\\"]", "Snomed_ct_concept_id": "[201217006]", "Mapping": "[\\"cleansedreadcode\\"]", "System": "[\\"Skin conditions\\"]", "Med_code": "[67453.0]"}}'),
 Document(id='9fc2c18a-3aae-4f27-9ab1-46423a47f8f2', metadata={'website_name': 'cprd', 'phenotype_name': 'acne'}, page_content='{"pname": "acne", "phenotype_props": {

In [40]:
tupled_doc = [(doc.metadata,doc.page_content) for doc in response]

result = chatbot.generator_chain.invoke({"query": query, "content" : tupled_doc})

print(result.content)

The PIDs of Acne are CP000003 and HP001121.
