In [1]:
import os 


In [34]:
%pwd


'c:\\Users\\PrinciaFernandes\\Mresult\\Phenomix\\notebooks'

In [35]:
os.chdir("../")

In [36]:
%pwd

'c:\\Users\\PrinciaFernandes\\Mresult\\Phenomix'

In [6]:
from tqdm import tqdm
from src.database.load_vector_data import DataLoader

In [7]:
dataloader = DataLoader()
records = dataloader.get_records()
document = dataloader.data_chuncking(records)
vector_db = dataloader.get_vector_db()

# Process documents in batches
BATCH_SIZE = 500
for i in tqdm(range(0, len(document), BATCH_SIZE), desc="Embedding Batches"):
    batch = document[i:i + BATCH_SIZE]  # Get batch slice
    vector_db.add_documents(batch)

Extracting Data
Data Extraction Completed
Chunking Completed


Embedding Batches: 100%|██████████| 5/5 [01:53<00:00, 22.63s/it]


In [37]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_chroma import Chroma
from src.prompts.Prompts import Filter_template ,Generator_template
from dotenv import load_dotenv
from src.llm_model.gemini_model import embedding_model,chat_model
from src.config import VECTORDB_DIR
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_precision,
    context_recall,
)


In [44]:

load_dotenv()

class ChatBot:
    def __init__(self):

        self.embeddings = embedding_model()
        self.vector_db = Chroma(embedding_function=self.embeddings, persist_directory=VECTORDB_DIR)
        self.llm = chat_model()
        self.parser = JsonOutputParser()
        self.filtering_chain = Filter_template | self.llm | self.parser
        self.generator_chain = Generator_template | self.llm


    
    def get_result(self, query):

        self.query = query
        lower_query = query.lower()
        filtering_result = self.filtering_chain.invoke({"query" : lower_query})
        
        if filtering_result:
            metadata_filter = filtering_result
        else:
            metadata_filter = None
        retriever = self.vector_db.as_retriever(search_type="mmr", search_kwargs = {"k": 10, "filter":metadata_filter, 'fetch_k':1000})
        response = retriever.invoke(query)


        tupled_doc = [(doc.metadata,doc.page_content) for doc in response]

        result = self.generator_chain.invoke({"query": query, "content" : tupled_doc})

        self.ragas(query, result.content, response)

        return result.content
    

    def ragas(self,query,generated_response,retrieved_documents):

        self.data = {
            "query":[query],
            "generated_response":[generated_response],
            "retrieved_documents":[retrieved_documents]
        }

        self.dataset = Dataset.from_dict(self.data)

        self.metrics = [
            faithfulness,
            answer_relevancy,
            answer_correctness,
            context_precision,
            context_recall,
        ]

        accuracy_result = evaluate(self.dataset, self.metrics)
        
        for metrics,score in accuracy_result.items():
            print(f"{metrics} : {score:.2f}")




In [45]:

chatbot = ChatBot()


In [46]:
query = 'Give a brief on BMI'

In [47]:
chatbot.get_result(query)

ArrowInvalid: Could not convert Document(id='85d14a61-89d8-4e2c-8a8d-e1f6e005804a', metadata={'phenotype_name': 'bmi', 'website_name': 'hdruk'}, page_content='{"detail_props": {"Implementation": "Unknown value", "Phenoflow_id": "Unknown value", "Name": "BMI", "Tags": "Unknown value", "Status": 2, "Is_deleted": "Unknown value"}, "concept_props": {"Description": "Body mass index centile", "Concept_history_id": "[4752]", "Logical_type": "[1]", "Coding_system_id": "[5]", "Coding_system_name": "[\\"Read codes v2\\"]", "PIDs": "[\\"HP000636\\"]", "Component_history_id": "[2698]", "Concept_history_date": "[\\"2021-10-06T16:19:42.629679Z\\"]", "Code": "[\\"22K9\\"]", "Codelist_history_id": "[2644]", "CID": "HC032527", "Phenotype_version_id": "[896]", "Phenotype_name": "[\\"BMI\\"]", "Codelist_id": "[2011]", "Coding_system_description": "[\\"Read codes v2\\"]", "Component_id": "[2011]", "ID": 96406, "Phenotype_id": "[\\"PH448\\"]", "Concept_id": "[1807]"}}') with type Document: did not recognize Python value type when inferring an Arrow data type

In [48]:

# chatbot.query = query
lower_query = query.lower()
filtering_result = chatbot.filtering_chain.invoke({"query" : lower_query})

if filtering_result:
    metadata_filter = filtering_result
else:
    metadata_filter = None
retriever = chatbot.vector_db.as_retriever(search_type="mmr", search_kwargs = {"k": 10, "filter":metadata_filter, 'fetch_k':1000})
response = retriever.invoke(query)


tupled_doc = [(doc.metadata,doc.page_content) for doc in response]

result = chatbot.generator_chain.invoke({"query": query, "content" : tupled_doc})

In [49]:
tupled_doc

[({'website_name': 'hdruk', 'phenotype_name': 'bmi'},
  '{"detail_props": {"Implementation": "Unknown value", "Phenoflow_id": "Unknown value", "Name": "BMI", "Tags": "Unknown value", "Status": 2, "Is_deleted": "Unknown value"}, "concept_props": {"Description": "Body mass index centile", "Concept_history_id": "[4752]", "Logical_type": "[1]", "Coding_system_id": "[5]", "Coding_system_name": "[\\"Read codes v2\\"]", "PIDs": "[\\"HP000636\\"]", "Component_history_id": "[2698]", "Concept_history_date": "[\\"2021-10-06T16:19:42.629679Z\\"]", "Code": "[\\"22K9\\"]", "Codelist_history_id": "[2644]", "CID": "HC032527", "Phenotype_version_id": "[896]", "Phenotype_name": "[\\"BMI\\"]", "Codelist_id": "[2011]", "Coding_system_description": "[\\"Read codes v2\\"]", "Component_id": "[2011]", "ID": 96406, "Phenotype_id": "[\\"PH448\\"]", "Concept_id": "[1807]"}}'),
 ({'phenotype_name': 'bmi', 'website_name': 'hdruk'},
  '{"detail_props": {"Implementation": "Unknown value", "Phenoflow_id": "Unknown va

In [52]:
response[0].model_dump()

{'id': '85d14a61-89d8-4e2c-8a8d-e1f6e005804a',
 'metadata': {'website_name': 'hdruk', 'phenotype_name': 'bmi'},
 'page_content': '{"detail_props": {"Implementation": "Unknown value", "Phenoflow_id": "Unknown value", "Name": "BMI", "Tags": "Unknown value", "Status": 2, "Is_deleted": "Unknown value"}, "concept_props": {"Description": "Body mass index centile", "Concept_history_id": "[4752]", "Logical_type": "[1]", "Coding_system_id": "[5]", "Coding_system_name": "[\\"Read codes v2\\"]", "PIDs": "[\\"HP000636\\"]", "Component_history_id": "[2698]", "Concept_history_date": "[\\"2021-10-06T16:19:42.629679Z\\"]", "Code": "[\\"22K9\\"]", "Codelist_history_id": "[2644]", "CID": "HC032527", "Phenotype_version_id": "[896]", "Phenotype_name": "[\\"BMI\\"]", "Codelist_id": "[2011]", "Coding_system_description": "[\\"Read codes v2\\"]", "Component_id": "[2011]", "ID": 96406, "Phenotype_id": "[\\"PH448\\"]", "Concept_id": "[1807]"}}',
 'type': 'Document'}