In [15]:
import joblib

# Load the classifier and vectorizer
rf_model = joblib.load('random_forest_classifier.pkl')



In [16]:
import os
import json

folder_path = "../datasets/rag_json_dataset"
rag_entries = []

for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        full_path = os.path.join(folder_path, filename)
        print(f"\n🔍 Loading: {filename}")

        with open(full_path, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
                print(f"✅ Loaded: {type(data)} entries")

                # If it's a single dict, warn user
                if isinstance(data, dict):
                    print(f"⚠️ Warning: {filename} contains a single dictionary. Expected a list.")
                    continue

            except json.JSONDecodeError as e:
                print(f"❌ JSON decode error in {filename}: {e}")
                continue

            for entry in data:
                condition = entry.get("condition", os.path.splitext(filename)[0])
                section = entry.get("section", "")
                source = entry.get("source", "")

                for i in range(1, 6):
                    text_key = f"text{i}"
                    if text_key in entry:
                        rag_entries.append({
                            "condition": condition,
                            "section": section,
                            "text": entry[text_key],
                            "source": source
                        })

print(f"\n✅ Total RAG entries collected: {len(rag_entries)}")

# Preview a few
for i, e in enumerate(rag_entries[:5]):
    print(f"{i+1}. [{e['condition']}] ({e['section']}) -> {e['text'][:80]}...")



🔍 Loading: ADHD.json
✅ Loaded: <class 'list'> entries

🔍 Loading: Anxiety.json
✅ Loaded: <class 'list'> entries

🔍 Loading: ASD.json
✅ Loaded: <class 'list'> entries

🔍 Loading: Bipolar.json
✅ Loaded: <class 'list'> entries

🔍 Loading: Depression.json
✅ Loaded: <class 'list'> entries

🔍 Loading: EatingDisorders.json
✅ Loaded: <class 'list'> entries

🔍 Loading: OCD.json
✅ Loaded: <class 'list'> entries

🔍 Loading: PTSD.json
✅ Loaded: <class 'list'> entries

🔍 Loading: Schizophrenia.json
✅ Loaded: <class 'list'> entries

✅ Total RAG entries collected: 260
1. [ADHD] (Overview) -> Attention‑deficit/hyperactivity disorder (ADHD) is a chronic neurodevelopmental ...
2. [ADHD] (Overview) -> It typically begins in childhood and often persists into adulthood, affecting bo...
3. [ADHD] (Overview) -> Individuals with ADHD may experience difficulties with time management, staying ...
4. [ADHD] (Overview) -> The condition is not a result of poor parenting or lack of discipline, but is in...
5. [ADH

In [17]:
print(json_data.keys())


dict_keys(['ADHD', 'Anxiety', 'ASD', 'Bipolar', 'Depression', 'EatingDisorders', 'OCD', 'PTSD', 'Schizophrenia'])


In [18]:
for condition, entries in json_data.items():
    print(f"{condition}: {len(entries)} entries")


ADHD: 5 entries
Anxiety: 5 entries
ASD: 6 entries
Bipolar: 5 entries
Depression: 7 entries
EatingDisorders: 6 entries
OCD: 6 entries
PTSD: 6 entries
Schizophrenia: 6 entries


In [19]:
for i in range(1, 6):
    text_key = f"text{i}"
    if text_key in entry:
        rag_entries.append({
            "condition": condition,
            "text": entry[text_key],
            "source": source,
            "section": section
        })


In [20]:
pip install -U sentence-transformers


Note: you may need to restart the kernel to use updated packages.


In [21]:
from sentence_transformers import SentenceTransformer

# Use multilingual E5-base model
model = SentenceTransformer("intfloat/multilingual-e5-base")


In [22]:
texts_to_embed = [f"passage: {entry['text']}" for entry in rag_entries]


In [23]:
import numpy as np

embeddings = model.encode(texts_to_embed, convert_to_numpy=True, show_progress_bar=True)


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


In [24]:
print(embeddings.shape)


(265, 768)


In [25]:
print(embeddings[0])


[-2.41563972e-02  4.71256934e-02 -6.41165394e-03  1.32578798e-02
  6.37654588e-02 -5.23205958e-02 -2.64642797e-02 -2.81749833e-02
  3.26313004e-02  1.55196874e-03  5.13178529e-03  9.53843817e-03
  8.18833858e-02  6.44195378e-02 -2.97775045e-02 -5.52880839e-02
  4.36479086e-03 -6.69282814e-03  2.35192031e-02  4.50767780e-04
  1.87563673e-02 -5.11019975e-02  5.44421449e-02  4.31766501e-03
  4.41579521e-02 -3.08767464e-02  2.40049027e-02  3.00229806e-02
  1.17149120e-02 -3.31099285e-03  1.36672156e-02 -2.81083230e-02
  3.68309394e-02  2.90402882e-02  4.15527523e-02  2.44918205e-02
 -1.82772137e-03 -4.33964580e-02  1.25958677e-02  1.09881777e-02
 -1.40696566e-03  3.39005105e-02  3.96252722e-02 -4.21955027e-02
 -1.51136918e-02 -9.16636456e-03  8.77277646e-03  2.17701737e-02
 -4.68138941e-02 -4.53322567e-02  5.14240041e-02  4.38960716e-02
  2.27574520e-02  4.33881059e-02 -8.99863243e-02 -5.79611436e-02
  4.54921983e-02  2.46980041e-02 -1.82775185e-02  5.58530428e-02
 -2.15763506e-02  6.18751

In [26]:
for i in range(3):  # check first 3 entries
    print(f"Condition: {rag_entries[i]['condition']}")
    print(f"Text: {rag_entries[i]['text']}")
    print(f"Embedding vector (first 5 dims): {embeddings[i][:5]}")
    print("-" * 80)


Condition: ADHD
Text: Attention‑deficit/hyperactivity disorder (ADHD) is a chronic neurodevelopmental disorder characterized by patterns of inattention and/or hyperactivity-impulsivity.
Embedding vector (first 5 dims): [-0.0241564   0.04712569 -0.00641165  0.01325788  0.06376546]
--------------------------------------------------------------------------------
Condition: ADHD
Text: It typically begins in childhood and often persists into adulthood, affecting both academic and occupational functioning.
Embedding vector (first 5 dims): [ 0.00983265  0.03000807 -0.02537476  0.02042854  0.00448109]
--------------------------------------------------------------------------------
Condition: ADHD
Text: Individuals with ADHD may experience difficulties with time management, staying organized, and maintaining focus on tasks.
Embedding vector (first 5 dims): [-0.0018345   0.03636994 -0.02482457  0.0201223   0.06638292]
------------------------------------------------------------------------------

In [27]:
seen_conditions = set()

for entry, emb in zip(rag_entries, embeddings):
    condition = entry['condition']
    if condition not in seen_conditions:
        print(f"Condition: {condition}")
        print(f"Text: {entry['text']}")
        print(f"Embedding vector (first 5 dims): {emb[:5]}")
        print("-" * 80)
        seen_conditions.add(condition)
    
    if len(seen_conditions) == len(json_data):
        break


Condition: ADHD
Text: Attention‑deficit/hyperactivity disorder (ADHD) is a chronic neurodevelopmental disorder characterized by patterns of inattention and/or hyperactivity-impulsivity.
Embedding vector (first 5 dims): [-0.0241564   0.04712569 -0.00641165  0.01325788  0.06376546]
--------------------------------------------------------------------------------
Condition: Anxiety
Text: Anxiety disorders involve more than temporary worry or fear and can interfere with daily life such as job performance, schoolwork, and relationships.
Embedding vector (first 5 dims): [-0.00968351  0.04795368 -0.01863425  0.03643321  0.05808113]
--------------------------------------------------------------------------------
Condition: Autism Spectrum Disorder
Text: Autism spectrum disorder (ASD) is a neurological and developmental condition that impacts how people interact with others, communicate, learn, and behave.
Embedding vector (first 5 dims): [-0.01859888  0.04123342  0.00433646  0.03260419  0.02298

In [28]:
!pip install langchain




In [29]:
pip install langchain faiss-cpu sentence-transformers openai


Note: you may need to restart the kernel to use updated packages.


In [30]:
!pip install -U langchain-community




In [31]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI  # or BedrockChat

In [32]:
from langchain_community.embeddings import HuggingFaceEmbeddings

e5_embedding_model = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-base",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)


  e5_embedding_model = HuggingFaceEmbeddings(


In [33]:
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Convert your entries into LangChain Document objects
docs = [
    Document(
        page_content=entry["text"],
        metadata={
            "condition": entry["condition"],
            "source": entry["source"],
            "section": entry["section"]
        }
    )
    for entry in rag_entries
]

# Create FAISS vectorstore
db = FAISS.from_documents(docs, embedding=e5_embedding_model)

# Save the vectorstore locally
db.save_local("rag_vectorstore")  # this will create index.faiss and index.pkl




  return forward_call(*args, **kwargs)


In [34]:
db = FAISS.load_local("rag_vectorstore", embeddings=e5_embedding_model, allow_dangerous_deserialization=True)


In [35]:
from langchain_community.vectorstores import FAISS

db = FAISS.load_local(
    "rag_vectorstore",
    embeddings=e5_embedding_model,
    allow_dangerous_deserialization=True  # ✅ Add this flag
)


In [36]:
retriever = db.as_retriever(search_kwargs={"k": 3})  # Retrieve top 3 passages


In [37]:
pip install langchain-google-genai


Note: you may need to restart the kernel to use updated packages.


In [38]:
pip install -U langchain langchain-google-genai google-generativeai


Collecting langchain-google-genai
  Obtaining dependency information for langchain-google-genai from https://files.pythonhosted.org/packages/84/d8/e1162835d5d6eefaae341c2d1cf750ab53222a421252346905187e53b8a2/langchain_google_genai-2.1.9-py3-none-any.whl.metadata
  Using cached langchain_google_genai-2.1.9-py3-none-any.whl.metadata (7.2 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.18 (from langchain-google-genai)
  Obtaining dependency information for google-ai-generativelanguage<0.7.0,>=0.6.18 from https://files.pythonhosted.org/packages/e5/77/ca2889903a2d93b3072a49056d48b3f55410219743e338a1d7f94dc6455e/google_ai_generativelanguage-0.6.18-py3-none-any.whl.metadata
  Using cached google_ai_generativelanguage-0.6.18-py3-none-any.whl.metadata (9.8 kB)
INFO: pip is looking at multiple versions of google-generativeai to determine which version is compatible with other requirements. This could take a while.
Collecting google-generativeai
  Obtaining dependency information for goog

In [39]:
!pip install -U google-generativeai




In [40]:
# Mapping from classifier prediction to dataset labels
condition_mapping = {
    "Obsessive-Compulsive Disorder (OCD)": "OCD",
    "Post-Traumatic Stress Disorder (PTSD)": "PTSD",
    "Attention Deficit Hyperactivity Disorder (ADHD)": "ADHD",
    "Autism Spectrum Disorder (ASD)": "ASD",
    "Bipolar Disorder": "Bipolar",
    "Depression": "Depression",
    "Schizophrenia": "Schizophrenia",
    "Anxiety": "Anxiety",
    "Eating Disorders": "EatingDisorders"
}


In [76]:
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Step 1: Define your query
query = "I struggle in social situations and prefer routines — any change makes me feel overwhelmed"

# Step 2: Embed the query (use same format as training)
query_embedding = model.encode(f"passage: {query}", convert_to_numpy=True)

# Step 3: Predict condition using classifier
raw_predicted_condition = rf_model.predict([query_embedding])[0]
predicted_condition = condition_mapping.get(raw_predicted_condition, raw_predicted_condition)
print("🔍 Predicted condition:", predicted_condition)

# Step 4: Use full document set for retrieval (not filtered)
full_vectorstore = FAISS.from_documents(docs, embedding=e5_embedding_model)
retriever = full_vectorstore.as_retriever(search_kwargs={"k": 10})  # retrieve more docs for reranking
retrieved_docs = retriever.invoke(query)

# Step 5: Hybrid reranking — prioritize predicted condition
reranked_docs = sorted(
    retrieved_docs,
    key=lambda doc: 0 if doc.metadata["condition"].lower() == predicted_condition.lower() else 1
)

# Step 6: Take top N (e.g., 3-5) after reranking
final_docs = reranked_docs[:5]

# ✅ Print final documents
print(f"📄 Final documents passed to LLM: {len(final_docs)}")
for i, doc in enumerate(final_docs, 1):
    print(f"\n--- Document {i} ---\nCondition: {doc.metadata['condition']}\nContent: {doc.page_content[:200]}...")


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


🔍 Predicted condition: Bipolar
📄 Final documents passed to LLM: 5

--- Document 1 ---
Condition: Depression
Content: Practice relaxation techniques like meditation or deep breathing....

--- Document 2 ---
Condition: PTSD
Content: Factors like prior trauma, lack of support, and personal coping style can increase vulnerability....

--- Document 3 ---
Condition: Eating Disorders
Content: Seek professional help if eating habits are interfering with daily life, health, or emotional well-being....

--- Document 4 ---
Condition: Obsessive‑Compulsive Disorder
Content: Practice self-care routines like sleep hygiene, regular exercise, and balanced nutrition to support emotional resilience....

--- Document 5 ---
Condition: Schizophrenia
Content: Create and follow a consistent routine—sleep, meals, medication—to stabilize mood and functioning....


  return forward_call(*args, **kwargs)


In [77]:
from langchain_google_genai import ChatGoogleGenerativeAI

# ✅ Use your Gemini API key
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro",
    google_api_key="AIzaSyArKn3cNq90kWTa94TTIBbnKVfvBPPqIrs",  # Replace with your real API key
    temperature=0
)

# ✅ Create context from final reranked docs
context_text = "\n\n".join([doc.page_content for doc in final_docs])

# ✅ Custom prompt as you described
prompt = f"""You are a mental health assistant. Based on the following context, identify the most likely condition that matches the user's concern. Do not guess. Only use the provided context.

Query:
{query}

Context:
{context_text}

Respond with:
Condition:
<What mental health condition could this relate to?>

Overview:
- <Why might the user be feeling or behaving this way?>

Symptoms of Condition:
- <List common symptoms of the condition>

What You Can Do:
- <Suggest helpful and responsible next steps>

Source:
-<Provide Link of official resources i can read more about this condition , clickabale would be better >

Note:
This is not a diagnosis. Encourage the user to consult a qualified mental health professional.

"""

# ✅ Send prompt to Gemini
response = llm.invoke(prompt)



In [78]:
result = filtered_rag_chain.run(query)
print(result)


  return forward_call(*args, **kwargs)


Condition:
Autism Spectrum Disorder (ASD)

Overview:
- Your experiences with social challenges and a strong preference for routines could be related to how your brain processes information. For individuals on the autism spectrum, social interactions can be complex and draining because they require interpreting unwritten social rules and nonverbal cues that may not be intuitive. Routines provide a sense of predictability and safety, which helps manage the anxiety and sensory overload that can come from unexpected changes. Feeling overwhelmed when a routine is disrupted is a very common experience.

Symptoms of Condition:
- Challenges with social communication and interaction across various settings.
- Difficulty initiating or maintaining back-and-forth conversations.
- Trouble understanding nonverbal cues like body language or tone of voice.
- Restricted and repetitive patterns of behavior, interests, or activities.
- Strong adherence to routines and significant distress at small change