In [1]:
import os 


In [2]:
%pwd


'c:\\Users\\PrinciaFernandes\\Mresult\\Phenomix\\notebooks'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\PrinciaFernandes\\Mresult\\Phenomix'

In [None]:
from tqdm import tqdm

from src.database.load_vector_data import DataLoader

In [7]:
dataloader = DataLoader()
records = dataloader.get_records()
document = dataloader.data_chuncking(records)
vector_db = dataloader.get_vector_db()

# Process documents in batches
BATCH_SIZE = 500
for i in tqdm(range(0, len(document), BATCH_SIZE), desc="Embedding Batches"):
    batch = document[i:i + BATCH_SIZE]  # Get batch slice
    vector_db.add_documents(batch)

Extracting Data
Data Extraction Completed
Chunking Completed


Embedding Batches: 100%|██████████| 5/5 [01:53<00:00, 22.63s/it]


In [243]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_chroma import Chroma
from src.prompts.Prompts import Filter_template ,Generator_template
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings
from src.config import VECTORDB_DIR
from datasets import Dataset
from ragas import evaluate,EvaluationDataset
from ragas.metrics import Faithfulness,LLMContextRecall,FactualCorrectness,LLMContextPrecisionWithReference,NoiseSensitivity
from ragas.llms import LangchainLLMWrapper
from google import generativeai as genai


In [21]:
api_key

'AIzaSyDkX0O1Sqd9L1FKvctL-qBFoCNU8hnoNGI'

In [284]:

load_dotenv()
# api_key = os.getenv("API_KEY")
api_key = "AIzaSyDkX0O1Sqd9L1FKvctL-qBFoCNU8hnoNGI"
genai.configure(api_key=api_key)

class ChatBot:
    def __init__(self):

        self.dataset = []
        self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
        self.vector_db = Chroma(embedding_function=self.embeddings, persist_directory=r'data\Chatbot_vector_db')
        self.llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", api_key=api_key)
        self.parser = JsonOutputParser()
        self.filtering_chain = Filter_template | self.llm | self.parser
        self.generator_chain = Generator_template | self.llm
        self.evaluator_llm = LangchainLLMWrapper(self.llm)
    
    def get_result(self, query):

        self.query = query
        lower_query = query.lower()
        filtering_result = self.filtering_chain.invoke({"query" : query})
        print(filtering_result)
        if filtering_result:
            metadata_filter = filtering_result
        else:
            metadata_filter = None
        retriever = self.vector_db.as_retriever(search_type="mmr", search_kwargs = {"k": 4, "filter":metadata_filter, 'fetch_k':1000})
        response = retriever.invoke(lower_query)

        retrieved_contexts = [f"content:{doc.page_content}, metadata: {doc.metadata} " for doc in response]
        
        reference = ', '.join([doc.page_content for doc in response])

        tupled_doc = [(doc.metadata,doc.page_content) for doc in response]

        result = self.generator_chain.invoke({"query": query, "content" : tupled_doc})

        return query, result.content, retrieved_contexts,reference
    



In [285]:

chatbot = ChatBot()


In [272]:
import pandas as pd
df = pd.DataFrame()

In [294]:
query = 'What is Peanut Allergy?'
query, response, retrieved_context,reference = chatbot.get_result(query)

{'phenotype_name': 'peanut allergy'}


In [337]:

dataset = []
data = {
"user_input":query,
"response":response,
"retrieved_contexts":retrieved_context,
"reference":reference
}
dataset.append(data)

In [286]:
query = 'Give a brief on AIDS?'
query, response, retrieved_context,reference = chatbot.get_result(query)

{'phenotype_name': 'aids'}


In [320]:
query = 'Give the defination of Blood pressure?'
query, response, retrieved_context,reference = chatbot.get_result(query)

{'phenotype_name': 'blood pressure'}


In [327]:
query = 'Give data sources and coding system of Acne?'
query, response, retrieved_context,reference = chatbot.get_result(query)

{'phenotype_name': 'acne'}


In [313]:
query = 'Who is Virat Kohli'
query, response, retrieved_context,reference = chatbot.get_result(query)

{}


In [336]:
query = 'Give me the coding system of Peanut Allergy and who is Virat Kohli?'
query, response, retrieved_context,reference = chatbot.get_result(query)

{'phenotype_name': 'peanut allergy'}


In [338]:
dataset

[{'user_input': 'Give me the coding system of Peanut Allergy and who is Virat Kohli?',
  'response': 'The coding system for Peanut Allergy is:\nCPT Codes, Laboratories, PP000062, PC000062, XPXXX1259.\n\nI am sorry, but the provided content does not have information on who Virat Kohli is.',
  'retrieved_contexts': ['content:{"detail_props": {"PID": "PP000062", "Date_created": "2016-12-05T00:00:00.000Z", "Phenotype_attributes": "CPT Codes, Laboratories", "Authors": "Hemler JA, Phillips EJ, Mallal SA, Kendall PL", "Institutions": "Vanderbilt University, Vanderbilt University Medical Center", "Type_of_phenotype": "Disease or Syndrome", "Name": "Peanut Allergy", "Status": "Final", "Genders": "Female, Male", "Files": "{\\"Peanut_Allergy_algorithm.pdf\\": \\"https://phekb.org/sites/phenotype/files/Peanut_Allergy_algorithm_0.pdf\\"}", "Networks": "eMERGE", "Ages": "Pediatric, Adult", "Phenotype_id": 609, "Races": "Black (African American, West African, Ethiopian, etc.), Caucasian (European), C

In [339]:
evaluation_dataset = EvaluationDataset.from_list(dataset)

In [340]:
metrics = [
            Faithfulness(),
            LLMContextRecall(),
            LLMContextPrecisionWithReference(),
            NoiseSensitivity()
        ]


In [341]:
evaluator_llm = LangchainLLMWrapper(chatbot.llm)

result = evaluate(dataset=evaluation_dataset,metrics=metrics,llm=evaluator_llm)
        
result

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

{'faithfulness': 0.5000, 'context_recall': 1.0000, 'llm_context_precision_with_reference': 0.0000, 'noise_sensitivity(mode=relevant)': 0.3333}

In [342]:
res = result.to_pandas()

In [343]:
df = pd.concat([df,res],ignore_index=True)
df = df.drop(columns = ['retrieved_contexts',"reference"])
df

Unnamed: 0,user_input,response,faithfulness,context_recall,llm_context_precision_with_reference,noise_sensitivity(mode=relevant)
0,Give a brief on AIDS?,AIDS is an HIV disease that can result in othe...,1.0,1.0,1.0,0.0
1,What is Peanut Allergy?,Peanut allergy is a common type of food allerg...,1.0,1.0,1.0,0.0
2,Give the defination of Blood pressure?,Blood pressure is defined as the average of mu...,1.0,1.0,1.0,0.0
3,Who is Virat Kohli,"I am sorry, but the provided content does not ...",1.0,1.0,0.0,0.0
4,Give the defination of Blood pressure?,Blood pressure is defined as the average of mu...,1.0,1.0,1.0,0.0
5,Give me the coding system of Peanut Allergy an...,The coding system for Peanut Allergy is:\nCPT ...,0.5,1.0,0.0,0.333333


In [345]:
df.to_excel(r'data\Chatbot_evaluation.xlsx', index=False)