In [None]:
# Install required packages
%pip install python-dotenv langchain-groq --quiet


In [None]:
# Create .env file if it doesn't exist
import os

env_content = """# Groq API Configuration
GROQ_API_KEY=your_groq_api_key_here

# Add other environment variables as needed
# OPENAI_API_KEY=your_openai_key_here
# ANTHROPIC_API_KEY=your_anthropic_key_here"""

if not os.path.exists('.env'):
    with open('.env', 'w') as f:
        f.write(env_content)
    print("✅ Created .env file template")
    print("📝 Please edit .env file and add your actual GROQ_API_KEY")
else:
    print("✅ .env file already exists")


**1. Load Documents**

In [14]:
from langchain.document_loaders import PyPDFLoader, WebBaseLoader

# # --- 1️⃣ Load Multiple PDFs ---
# pdf_files = [
#     "WHO_BP_guidelines.pdf",
#     "Diabetes_guidelines.pdf",
#     "Cholesterol_guidelines.pdf",
#     "BMI_guidelines.pdf",
#     "Smoking_guidelines.pdf"
# ]

# pdf_docs = []
# for file in pdf_files:
#     loader = PyPDFLoader(file)
#     pdf_docs.extend(loader.load())

# --- 2️⃣ Load Multiple Websites ---
web_pages = [
    "https://www.who.int/news-room/fact-sheets/detail/hypertension",
    "https://www.who.int/news-room/fact-sheets/detail/diabetes",
    "https://www.who.int/news-room/fact-sheets/detail/obesity",
    "https://www.cdc.gov/cholesterol/facts.html",
    "https://www.cdc.gov/tobacco/data_statistics/fact_sheets/index.htm"
]

web_docs = []
for url in web_pages:
    loader = WebBaseLoader(url)
    web_docs.extend(loader.load())

# --- 3️⃣ Combine all docs and split into chunks ---
all_docs = web_docs


**2. Split Text into Chunks**

In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
web_chunks = text_splitter.split_documents(all_docs)


**3. Create Vector DB**

In [16]:
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Use 'embedding' instead of 'embedding_function'
vectordb = Chroma.from_documents(
    web_chunks,
    embedding=embeddings,        # Correct parameter name
    persist_directory="rag_db"
)

vectordb.persist()
print("✅ Vector DB created successfully")


✅ Vector DB created successfully


**4. Query RAG with Input Vitals + Prediction**

In [17]:
retriever = vectordb.as_retriever(search_kwargs={"k": 5})
query = "BP=150/95, Chol=220, BMI=32, Prediction: High risk"
relevant_docs = retriever.get_relevant_documents(query)

for doc in relevant_docs:
    print(doc.page_content)


in the vessels when the heart rests between beats.Hypertension is diagnosed if, when it is measured on two different days, the systolic blood pressure readings on both days is ≥140 mmHg and/or the diastolic blood pressure readings on both days is ≥90 mmHg.Risk factorsModifiable risk factors include unhealthy diets (excessive salt consumption, a diet high in saturated fat and trans fats, low intake of fruits and vegetables), physical inactivity, consumption of tobacco and alcohol, and being
in the vessels when the heart rests between beats.Hypertension is diagnosed if, when it is measured on two different days, the systolic blood pressure readings on both days is ≥140 mmHg and/or the diastolic blood pressure readings on both days is ≥90 mmHg.Risk factorsModifiable risk factors include unhealthy diets (excessive salt consumption, a diet high in saturated fat and trans fats, low intake of fruits and vegetables), physical inactivity, consumption of tobacco and alcohol, and being
profession

In [20]:
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Check if API key is loaded
api_key = os.getenv("GROQ_API_KEY")
if not api_key:
    print("❌ GROQ_API_KEY not found in environment variables!")
    print("Please create a .env file with: GROQ_API_KEY=your_actual_key_here")
    print("Or set it directly: groq = ChatGroq(groq_api_key='your_key_here', ...)")
else:
    print("✅ GROQ_API_KEY loaded successfully")

groq = ChatGroq(groq_api_key=api_key, model="llama-3.1-8b-instant", temperature=0.3)

# RetrievalQA chain: LLM + RAG
qa_chain = RetrievalQA.from_chain_type(
    llm=groq,
    chain_type="stuff",  # combines all retrieved chunks
    retriever=retriever, # your vectordb retriever
    return_source_documents=True
)

# Query includes vitals + prediction
query = """
Patient vitals: BP=150/95, Cholesterol=220, BMI=32, Smoker=yes, Diabetic=no.
Predicted risk: High.
Please provide:
1. Explanation of the risk
2. Possible diagnosis
3. Suggested actions or next steps
"""
result = qa_chain(query)

print("=== Diagnosis Explanation ===")
print(result)


✅ GROQ_API_KEY loaded successfully
=== Diagnosis Explanation ===
{'query': '\nPatient vitals: BP=150/95, Cholesterol=220, BMI=32, Smoker=yes, Diabetic=no.\nPredicted risk: High.\nPlease provide:\n1. Explanation of the risk\n2. Possible diagnosis\n3. Suggested actions or next steps\n', 'result': "**1. Explanation of the risk:**\nThe patient's predicted risk is high due to several factors:\n\n- Elevated blood pressure (BP=150/95): The systolic blood pressure is above 140 mmHg, and the diastolic blood pressure is above 90 mmHg, indicating hypertension.\n- High cholesterol (Cholesterol=220): This is above the normal range, indicating hypercholesterolemia.\n- High BMI (BMI=32): This indicates obesity, which is a significant risk factor for various health conditions, including hypertension and cardiovascular disease.\n- Smoking status (Smoker=yes): Smoking is a major risk factor for cardiovascular disease and other health conditions.\n\n**2. Possible diagnosis:**\nBased on the patient's symp

In [19]:
print(os.getenv("GROQ_API_KEY"))

None
