In [1]:
import os 


In [2]:
%pwd


'c:\\Users\\PrinciaFernandes\\Mresult\\Phenomix\\notebooks'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\PrinciaFernandes\\Mresult\\Phenomix'

In [None]:
from tqdm import tqdm

from src.database.load_vector_data import DataLoader

In [7]:
dataloader = DataLoader()
records = dataloader.get_records()
document = dataloader.data_chuncking(records)
vector_db = dataloader.get_vector_db()

# Process documents in batches
BATCH_SIZE = 500
for i in tqdm(range(0, len(document), BATCH_SIZE), desc="Embedding Batches"):
    batch = document[i:i + BATCH_SIZE]  # Get batch slice
    vector_db.add_documents(batch)

Extracting Data
Data Extraction Completed
Chunking Completed


Embedding Batches: 100%|██████████| 5/5 [01:53<00:00, 22.63s/it]


In [243]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_chroma import Chroma
from src.prompts.Prompts import Filter_template ,Generator_template
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings
from src.config import VECTORDB_DIR
from datasets import Dataset
from ragas import evaluate,EvaluationDataset
from ragas.metrics import Faithfulness,LLMContextRecall,FactualCorrectness,LLMContextPrecisionWithReference,NoiseSensitivity
from ragas.llms import LangchainLLMWrapper
from google import generativeai as genai


In [21]:
api_key

'AIzaSyDkX0O1Sqd9L1FKvctL-qBFoCNU8hnoNGI'

In [284]:

load_dotenv()
# api_key = os.getenv("API_KEY")
api_key = "AIzaSyDkX0O1Sqd9L1FKvctL-qBFoCNU8hnoNGI"
genai.configure(api_key=api_key)

class ChatBot:
    def __init__(self):

        self.dataset = []
        self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
        self.vector_db = Chroma(embedding_function=self.embeddings, persist_directory=r'data\Chatbot_vector_db')
        self.llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", api_key=api_key)
        self.parser = JsonOutputParser()
        self.filtering_chain = Filter_template | self.llm | self.parser
        self.generator_chain = Generator_template | self.llm
        self.evaluator_llm = LangchainLLMWrapper(self.llm)
    
    def get_result(self, query):

        self.query = query
        lower_query = query.lower()
        filtering_result = self.filtering_chain.invoke({"query" : query})
        print(filtering_result)
        if filtering_result:
            metadata_filter = filtering_result
        else:
            metadata_filter = None
        retriever = self.vector_db.as_retriever(search_type="mmr", search_kwargs = {"k": 4, "filter":metadata_filter, 'fetch_k':1000})
        response = retriever.invoke(lower_query)

        retrieved_contexts = [f"content:{doc.page_content}, metadata: {doc.metadata} " for doc in response]
        
        reference = ', '.join([doc.page_content for doc in response])

        tupled_doc = [(doc.metadata,doc.page_content) for doc in response]

        result = self.generator_chain.invoke({"query": query, "content" : tupled_doc})

        return query, result.content, retrieved_contexts,reference
    



In [285]:

chatbot = ChatBot()


In [272]:
import pandas as pd
df = pd.DataFrame()

In [294]:
query = 'What is Peanut Allergy?'
query, response, retrieved_context,reference = chatbot.get_result(query)

{'phenotype_name': 'peanut allergy'}


In [328]:

dataset = []
data = {
"user_input":query,
"response":response,
"retrieved_contexts":retrieved_context,
"reference":reference
}
dataset.append(data)

In [286]:
query = 'Give a brief on AIDS?'
query, response, retrieved_context,reference = chatbot.get_result(query)

{'phenotype_name': 'aids'}


In [320]:
query = 'Give the defination of Blood pressure?'
query, response, retrieved_context,reference = chatbot.get_result(query)

{'phenotype_name': 'blood pressure'}


In [327]:
query = 'Give data sources and coding system of Acne?'
query, response, retrieved_context,reference = chatbot.get_result(query)

{'phenotype_name': 'acne'}


In [313]:
query = 'Who is Virat Kohli'
query, response, retrieved_context,reference = chatbot.get_result(query)

{}


In [329]:
dataset

[{'user_input': 'Give data sources and coding system of Acne?',
  'response': 'The data sources for Acne are CPRD GOLD and HES Admitted Patient Care data for CPRD GOLD. The coding system used is ICD10 codes, specifically L70.8.',
  'retrieved_contexts': ['content:{"detail_props": {"Status": 2, "Is_deleted": "Unknown value"}, "concept_props": {"Description": "Other acne", "Concept_history_id": "[3015]", "Logical_type": "[1]", "Coding_system_id": "[4]", "Coding_system_name": "[\\"ICD10 codes\\"]", "PIDs": "[\\"HP001121\\"]", "Component_history_id": "[1829]", "Concept_history_date": "[\\"2021-10-06T16:02:40.341969Z\\"]", "Code": "[\\"L70.8\\"]", "Codelist_history_id": "[1775]", "CID": "HC153363", "Phenotype_version_id": "[182]", "Phenotype_name": "[\\"Acne\\"]", "Codelist_id": "[1142]", "Coding_system_description": "[\\"ICD10 Codes\\"]", "Component_id": "[1142]", "ID": 56327, "Phenotype_id": "[\\"PH91\\"]", "Concept_id": "[938]"}}, metadata: {\'phenotype_name\': \'acne\', \'website_name\'

In [330]:
evaluation_dataset = EvaluationDataset.from_list(dataset)

In [331]:
metrics = [
            Faithfulness(),
            LLMContextRecall(),
            LLMContextPrecisionWithReference(),
            NoiseSensitivity()
        ]


In [332]:
evaluator_llm = LangchainLLMWrapper(chatbot.llm)

result = evaluate(dataset=evaluation_dataset,metrics=metrics,llm=evaluator_llm)
        
result

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 13
}
].
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 9
}
].
Retrying langchain_google_g

{'faithfulness': 1.0000, 'context_recall': 1.0000, 'llm_context_precision_with_reference': nan, 'noise_sensitivity(mode=relevant)': nan}

In [325]:
res = result.to_pandas()

In [326]:
df = pd.concat([df,res],ignore_index=True)
df = df.drop(columns = ['retrieved_contexts',"reference"])
df

Unnamed: 0,user_input,response,faithfulness,context_recall,llm_context_precision_with_reference,noise_sensitivity(mode=relevant)
0,Give a brief on AIDS?,AIDS is an HIV disease that can result in othe...,1.0,1.0,1.0,0.0
1,What is Peanut Allergy?,Peanut allergy is a common type of food allerg...,1.0,1.0,1.0,0.0
2,Give the defination of Blood pressure?,Blood pressure is defined as the average of mu...,1.0,1.0,1.0,0.0
3,Who is Virat Kohli,"I am sorry, but the provided content does not ...",1.0,1.0,0.0,0.0
4,Give the defination of Blood pressure?,Blood pressure is defined as the average of mu...,1.0,1.0,1.0,0.0
