In [1]:
import os 


In [2]:
%pwd


'c:\\Users\\PrinciaFernandes\\Mresult\\Phenomix\\notebooks'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\PrinciaFernandes\\Mresult\\Phenomix'

In [None]:
from tqdm import tqdm

from src.database.load_vector_data import DataLoader

In [7]:
dataloader = DataLoader()
records = dataloader.get_records()
document = dataloader.data_chuncking(records)
vector_db = dataloader.get_vector_db()

# Process documents in batches
BATCH_SIZE = 500
for i in tqdm(range(0, len(document), BATCH_SIZE), desc="Embedding Batches"):
    batch = document[i:i + BATCH_SIZE]  # Get batch slice
    vector_db.add_documents(batch)

Extracting Data
Data Extraction Completed
Chunking Completed


Embedding Batches: 100%|██████████| 5/5 [01:53<00:00, 22.63s/it]


In [238]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_chroma import Chroma
from src.prompts.Prompts import Filter_template ,Generator_template
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings
from src.config import VECTORDB_DIR
from datasets import Dataset
from ragas import evaluate,EvaluationDataset
from ragas.metrics import Faithfulness,LLMContextRecall,FactualCorrectness,LLMContextPrecisionWithReference,NoiseSensitivity
from ragas.llms import LangchainLLMWrapper
from google import generativeai as genai


In [21]:
api_key

'AIzaSyDkX0O1Sqd9L1FKvctL-qBFoCNU8hnoNGI'

In [239]:

load_dotenv()
# api_key = os.getenv("API_KEY")
api_key = "AIzaSyDkX0O1Sqd9L1FKvctL-qBFoCNU8hnoNGI"
genai.configure(api_key=api_key)

class ChatBot:
    def __init__(self):

        self.dataset = []
        self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
        self.vector_db = Chroma(embedding_function=self.embeddings, persist_directory=r'data\Chatbot_vector_db')
        self.llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", api_key=api_key)
        self.parser = JsonOutputParser()
        self.filtering_chain = Filter_template | self.llm | self.parser
        self.generator_chain = Generator_template | self.llm
        self.evaluator_llm = LangchainLLMWrapper(self.llm)
    
    def get_result(self, query):

        self.query = query
        lower_query = query.lower()
        filtering_result = self.filtering_chain.invoke({"query" : query})
        print(filtering_result)
        if filtering_result:
            metadata_filter = filtering_result
        else:
            metadata_filter = None
        retriever = self.vector_db.as_retriever(search_type="mmr", search_kwargs = {"k": 10, "filter":metadata_filter, 'fetch_k':1000})
        response = retriever.invoke(lower_query)

        retrieved_contexts = [f"content:{doc.page_content}, metadata: {doc.metadata} " for doc in response]
        
        reference = ', '.join([doc.page_content for doc in response])

        tupled_doc = [(doc.metadata,doc.page_content) for doc in response]

        result = self.generator_chain.invoke({"query": query, "content" : tupled_doc})

        return query, result.content, retrieved_contexts,reference
    






In [240]:

chatbot = ChatBot()


In [225]:
chatbot.filtering_result


AttributeError: 'ChatBot' object has no attribute 'filtering_result'

In [143]:
import pandas as pd
df = pd.DataFrame()

In [146]:
query = 'What is data sources and coding system of BMI?'
query, response, retrieved_context,reference = chatbot.get_result(query)

In [223]:

dataset = []
data = {
"user_input":query,
"response":response,
"retrieved_contexts":retrieved_context,
"reference":reference
}
dataset.append(data)

In [215]:
query = 'Give the definition of Acne.'
query, response, retrieved_contexts,reference = chatbot.get_result(query)

In [241]:
query = 'Define Anxiety Algorithm.'
query, response, retrieved_contexts,reference = chatbot.get_result(query)

{'phenotype_name': 'anxiety'}


In [201]:
query = 'What is Peanut Allergy and who is Virat Kohli?'
query, response, retrieved_contexts,reference = chatbot.get_result(query)

In [224]:
dataset

[{'user_input': 'Define Anxiety Algorithm.',
  'response': 'I am sorry, but I cannot define "Anxiety Algorithm" based on the content I have.',
  'retrieved_contexts': ['content:{"detail_props": {"Implementation": "Unknown value", "Phenoflow_id": "Unknown value", "Name": "BMI", "Tags": "Unknown value", "Status": 2, "Is_deleted": "Unknown value"}, "concept_props": {"Description": "Body mass index centile", "Concept_history_id": "[4752]", "Logical_type": "[1]", "Coding_system_id": "[5]", "Coding_system_name": "[\\"Read codes v2\\"]", "PIDs": "[\\"HP000636\\"]", "Component_history_id": "[2698]", "Concept_history_date": "[\\"2021-10-06T16:19:42.629679Z\\"]", "Code": "[\\"22K9\\"]", "Codelist_history_id": "[2644]", "CID": "HC032527", "Phenotype_version_id": "[896]", "Phenotype_name": "[\\"BMI\\"]", "Codelist_id": "[2011]", "Coding_system_description": "[\\"Read codes v2\\"]", "Component_id": "[2011]", "ID": 96406, "Phenotype_id": "[\\"PH448\\"]", "Concept_id": "[1807]"}}, metadata: {\'phenot

In [218]:
evaluation_dataset = EvaluationDataset.from_list(dataset)

In [149]:
metrics = [
            Faithfulness(),
            LLMContextRecall(),
            LLMContextPrecisionWithReference()
        ]


In [150]:
evaluator_llm = LangchainLLMWrapper(chatbot.llm)

In [219]:

result = evaluate(dataset=evaluation_dataset,metrics=metrics,llm=evaluator_llm)
        

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

In [220]:
result

{'faithfulness': 0.0000, 'context_recall': 1.0000, 'llm_context_precision_with_reference': 1.0000}

In [221]:
res = result.to_pandas()

In [214]:
df = pd.concat([df,res],ignore_index=True)
df = df.drop(columns = ['retrieved_contexts',"reference"])
df

Unnamed: 0,user_input,response,faithfulness,context_recall,llm_context_precision_with_reference
0,What is data sources and coding system of BMI?,"According to the provided content, the data so...",1.0,1.0,1.0
1,Give the definition of Acne.,Acne is defined as when a patient meets the cr...,0.0,1.0,1.0
2,Give the files present in HIV.,"The files present in HIV are: ""Description of ...",0.0,1.0,1.0
3,Who is Virat Kohli?,"I am sorry, but the provided content does not ...",1.0,0.0,0.0
4,What is Peanut Allergy and who is Virat Kohli?,Peanut allergy is an immune response that occu...,0.2,1.0,0.0
5,Give a brief on BMI.,BMI stands for Body Mass Index. It can be low ...,0.5,1.0,1.0
