In [None]:
print("🚀 Step 1: Installing required libraries...")

🚀 Step 1: Installing required libraries...


In [None]:
!pip install -q pypdf langchain sentence-transformers chromadb tqdm --quiet
!pip install -U langchain-community --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Mount Google Drive at /content/drive (Colab-only API). The PDF source folder
# and the ChromaDB output folder configured below both live under this mount.
print("\nMounting Google Drive...")
from google.colab import drive
drive.mount('/content/drive')
print("✅ Google Drive mounted successfully!")


Mounting Google Drive...
Mounted at /content/drive
✅ Google Drive mounted successfully!


In [None]:
import os
import shutil
import time

import chromadb
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# LangChain has moved these classes out of the top-level `langchain` package.
# Prefer the current locations and fall back to the legacy import paths so the
# notebook keeps working with both older and newer (unpinned) installs.
try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:
    from langchain.text_splitter import RecursiveCharacterTextSplitter

try:
    from langchain_community.document_loaders import PyPDFLoader
except ImportError:
    from langchain.document_loaders import PyPDFLoader

In [None]:
# Google Drive folder that holds the source PDFs.
# NOTE(review): the configuration cell that follows repeats this exact
# assignment, so this cell is redundant.
DRIVE_PDF_SOURCE_DIR = "/content/drive/MyDrive/RAG"


In [None]:
# Google Drive folder that is scanned for source PDF files.
DRIVE_PDF_SOURCE_DIR = "/content/drive/MyDrive/RAG"

# Google Drive folder where the persistent ChromaDB store is written.
# build_knowledge_base_on_drive() deletes this folder if it already exists
# before creating the new store.
DRIVE_CHROMA_DB_PATH = "/content/drive/MyDrive/NEW_RAG"

# Name of the Chroma collection that holds the embedded chunks.
COLLECTION_NAME = "therapeutic_knowledge_base"
# SentenceTransformer model name, used both when building and when querying.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
# Chunking parameters passed to RecursiveCharacterTextSplitter; lengths are
# measured with len(), i.e. in characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150

In [None]:
# Echo the configured locations so a wrong path is visible before the
# long-running build is started.
print(
    "Configuration set:",
    f" - PDF Source: {DRIVE_PDF_SOURCE_DIR}",
    f" - ChromaDB Storage on Drive: {DRIVE_CHROMA_DB_PATH}",
    sep="\n",
)

Configuration set:
 - PDF Source: /content/drive/MyDrive/RAG
 - ChromaDB Storage on Drive: /content/drive/MyDrive/NEW_RAG


In [None]:
def load_all_pdfs_from_drive(directory_path):
    """Load every PDF found directly inside ``directory_path``.

    Files are matched on a case-insensitive ``.pdf`` extension and processed in
    sorted order, so repeated runs see the files in the same sequence. A file
    that cannot be parsed is skipped and reported together with the underlying
    error; it does not abort the run.

    Args:
        directory_path: Folder to scan (not recursive).

    Returns:
        A flat list of everything ``PyPDFLoader(...).load()`` returned for the
        readable files (empty if every file failed), or ``None`` when the
        directory does not exist or contains no PDF files.
    """
    print(f"\n🔍 Step 3.1: Searching for PDF files in: {directory_path}")
    # isdir (not exists) so that a path pointing at a regular file is reported
    # cleanly instead of crashing inside os.listdir.
    if not os.path.isdir(directory_path):
        print(f"❌ ERROR: The directory '{directory_path}' was not found.")
        return None

    pdf_files = sorted(
        name for name in os.listdir(directory_path)
        if name.lower().endswith('.pdf')
    )
    if not pdf_files:
        print(f"⚠️ No PDF files found in '{directory_path}'.")
        return None

    print(f"✅ Found {len(pdf_files)} PDF files. Processing...")
    documents = []
    failed_files = []
    for filename in tqdm(pdf_files, desc="Processing PDFs"):
        try:
            loader = PyPDFLoader(os.path.join(directory_path, filename))
            documents.extend(loader.load())
        except Exception as e:
            # Best-effort ingestion: keep going, but say what was skipped and why.
            failed_files.append(filename)
            print(f"\n- Warning: skipping {filename}: {type(e).__name__}: {e}")

    loaded_count = len(pdf_files) - len(failed_files)
    print(f"✅ Loaded {len(documents)} pages from {loaded_count} files.")
    if failed_files:
        print(f"⚠️ {len(failed_files)} file(s) could not be read: {', '.join(failed_files)}")
    return documents

In [None]:
def split_documents_into_chunks(documents):
    """Split the loaded documents into overlapping chunks.

    Chunk length and overlap are taken from the module-level ``CHUNK_SIZE`` and
    ``CHUNK_OVERLAP`` settings, with length measured by ``len``.

    Args:
        documents: Documents to hand to
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The chunks produced by the splitter.
    """
    print("\n🔪 Step 3.2: Chunking documents...")
    splitter_options = {
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
        "length_function": len,
    }
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"✅ Successfully created {len(pieces)} text chunks.")
    return pieces

In [None]:
def build_knowledge_base_on_drive():
    """Build the ChromaDB vector store on Google Drive from the source PDFs.

    Steps: load the PDFs from ``DRIVE_PDF_SOURCE_DIR``, split them into chunks,
    embed each chunk with the ``EMBEDDING_MODEL_NAME`` SentenceTransformer on
    CPU, and store text, embedding and id in a new ``COLLECTION_NAME``
    collection under ``DRIVE_CHROMA_DB_PATH``.

    Any existing folder at ``DRIVE_CHROMA_DB_PATH`` is deleted first. Progress
    and a final summary are printed; the function returns ``None``. If no
    documents could be loaded it stops before touching the existing store.
    """
    start_time = time.time()
    documents = load_all_pdfs_from_drive(DRIVE_PDF_SOURCE_DIR)
    if not documents:
        print("\nHalting execution: No documents loaded.")
        return
    chunks = split_documents_into_chunks(documents)
    chunk_contents = [chunk.page_content for chunk in chunks]
    ids = [f"chunk_{i}" for i in range(len(chunks))]

    print(f"\n🧠 Step 3.3: Loading embedding model: {EMBEDDING_MODEL_NAME}...")
    embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device='cpu')

    print("\n💾 Step 3.4: Creating/populating ChromaDB store on Google Drive...")

    # Destructive: the previous store is wiped so the collection can be
    # created from scratch with ids starting again at chunk_0.
    if os.path.exists(DRIVE_CHROMA_DB_PATH):
        print(f"Found old database. Removing '{DRIVE_CHROMA_DB_PATH}' for a fresh start.")
        shutil.rmtree(DRIVE_CHROMA_DB_PATH)

    client = chromadb.PersistentClient(path=DRIVE_CHROMA_DB_PATH)
    collection = client.create_collection(name=COLLECTION_NAME)

    # Embed and insert in batches so only one batch of vectors is held at a time.
    batch_size = 128
    for start in tqdm(range(0, len(chunks), batch_size), desc="Embedding & Storing"):
        stop = start + batch_size
        batch_contents = chunk_contents[start:stop]
        batch_embeddings = embedding_model.encode(batch_contents).tolist()
        collection.add(
            embeddings=batch_embeddings,
            documents=batch_contents,
            ids=ids[start:stop],
        )

    total_time = time.time() - start_time

    # Count the PDFs that actually contributed pages rather than every entry in
    # the source folder (which may contain non-PDF files or unreadable PDFs).
    # Assumes each page document records its originating file under
    # metadata["source"], as LangChain's PyPDFLoader does.
    pdf_count = len({doc.metadata.get("source") for doc in documents})

    print("\n\n🎉 --- Knowledge Base Creation Complete! --- 🎉")
    print(f"Total PDFs processed: {pdf_count}")
    print(f"Total text chunks created: {len(chunks)}")
    print(f"✅ Vector store successfully created in your Google Drive at: '{DRIVE_CHROMA_DB_PATH}'")
    print(f"Total time taken: {total_time:.2f} seconds")

In [None]:
build_knowledge_base_on_drive()


🔍 Step 3.1: Searching for PDF files in: /content/drive/MyDrive/RAG
✅ Found 31 PDF files. Processing...


Processing PDFs: 100%|██████████| 31/31 [11:20<00:00, 21.94s/it]


✅ Loaded 11921 pages from 31 files.

🔪 Step 3.2: Chunking documents...
✅ Successfully created 41310 text chunks.

🧠 Step 3.3: Loading embedding model: all-MiniLM-L6-v2...

💾 Step 3.4: Creating/populating ChromaDB store on Google Drive...


Embedding & Storing: 100%|██████████| 323/323 [1:40:46<00:00, 18.72s/it]




🎉 --- Knowledge Base Creation Complete! --- 🎉
Total PDFs processed: 31
Total text chunks created: 41310
✅ Vector store successfully created in your Google Drive at: '/content/drive/MyDrive/NEW_RAG'
Total time taken: 6729.93 seconds


In [None]:
print("\n📚 Loading the knowledge base from Drive and the embedding model...")

# Fail fast with an actionable message if the vector store has not been built
# yet. The message names the function in this notebook that creates it.
if not os.path.exists(DRIVE_CHROMA_DB_PATH):
    raise FileNotFoundError(
        f"Database not found at '{DRIVE_CHROMA_DB_PATH}'. "
        "Please ensure build_knowledge_base_on_drive() ran successfully."
    )



📚 Loading the knowledge base from Drive and the embedding model...


In [None]:
# Reopen the persistent Chroma store at DRIVE_CHROMA_DB_PATH and fetch the
# collection by name. `collection` is read as a module-level global by
# search_knowledge_base().
client = chromadb.PersistentClient(path=DRIVE_CHROMA_DB_PATH)
collection = client.get_collection(name=COLLECTION_NAME)


In [None]:
# Load the query-time embedding model on CPU, using the same model name that
# build_knowledge_base_on_drive() uses to embed the stored chunks.
# `embedding_model` is read as a module-level global by search_knowledge_base().
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device='cpu')
print("✅ Knowledge base and model loaded successfully!")


✅ Knowledge base and model loaded successfully!


In [None]:
def search_knowledge_base(query_text, n_results=5):
    """Return the stored text chunks that best match ``query_text``.

    The query is embedded with the module-level ``embedding_model`` and looked
    up in the module-level Chroma ``collection``.

    Args:
        query_text: Natural-language query.
        n_results: Number of chunks to request.

    Returns:
        The list of document texts returned for this query.
    """
    query_vector = embedding_model.encode(query_text).tolist()
    response = collection.query(query_embeddings=[query_vector], n_results=n_results)
    matches_per_query = response['documents']
    # Exactly one query vector was submitted, so its matches are at index 0.
    return matches_per_query[0]

In [None]:
# Stand-in for the emotion classifier's output: (label, score) pairs, listed
# here in descending score order.
# NOTE(review): the later generation cell reassigns this same name with
# different values, so its contents depend on which cell ran last.
simulated_classifier_output = [
    ('sadness', 0.85),
    ('anxiety', 0.62),
    ('loneliness', 0.45)
]
print(f"\nReceived classifier output: {simulated_classifier_output}")


Received classifier output: [('sadness', 0.85), ('anxiety', 0.62), ('loneliness', 0.45)]


In [None]:
def create_query_from_emotions(emotion_data, score_threshold=0.5):
    """Translate classifier output into a natural-language retrieval query.

    The highest-scoring emotion becomes the subject of the query; every other
    emotion whose score is strictly greater than ``score_threshold`` is
    appended as a secondary feeling. Emotions are ranked by score, so the
    result does not depend on the order of the input (ties keep input order,
    and input already sorted by descending score is handled exactly as before).

    Args:
        emotion_data: Sequence of ``(label, score, ...)`` entries. May be empty
            or ``None``.
        score_threshold: Minimum score (exclusive) for a non-primary emotion to
            be mentioned.

    Returns:
        The query string, or a generic well-being query when ``emotion_data``
        is empty.
    """
    if not emotion_data:
        return "General advice for mental well-being."

    # sorted() is stable, including with reverse=True, so equal scores keep
    # their original relative order.
    ranked = sorted(emotion_data, key=lambda entry: entry[1], reverse=True)
    primary_emotion = ranked[0][0]
    secondary_emotions = [
        entry[0] for entry in ranked[1:]
        if entry[1] > score_threshold
    ]

    query = f"What are some therapeutic techniques, coping mechanisms, and advice for someone experiencing strong feelings of {primary_emotion}?"
    if secondary_emotions:
        secondary_str = " and ".join(secondary_emotions)
        query += f" They may also be feeling {secondary_str}."
    return query

In [None]:
def retrieve_context(classifier_output, n_results=5):
    """Turn classifier output into a search query and fetch matching chunks.

    Args:
        classifier_output: Emotion/score pairs handed to
            ``create_query_from_emotions``.
        n_results: Number of chunks to request from ``search_knowledge_base``.

    Returns:
        Whatever ``search_knowledge_base`` returns for the generated query.
    """
    query = create_query_from_emotions(classifier_output)
    print("\n🧠 Translated into search query: '{}'".format(query))

    print("\n🔍 Searching knowledge base...")
    return search_knowledge_base(query, n_results)


In [None]:
# --- EXECUTION ---
# Run the full retrieval path on the simulated classifier output.
retrieved_context = retrieve_context(simulated_classifier_output)

# Show every retrieved chunk under a numbered header, followed by a divider.
print("\n✅ Retrieval complete! Here is the context we found:\n")
for i, doc in enumerate(retrieved_context):
    print(f"--- Context Chunk {i+1} ---\n", doc, "\n" + "="*50 + "\n", sep="\n")


🧠 Translated into search query: 'What are some therapeutic techniques, coping mechanisms, and advice for someone experiencing strong feelings of sadness? They may also be feeling anxiety.'

🔍 Searching knowledge base...

✅ Retrieval complete! Here is the context we found:

--- Context Chunk 1 ---

III: Conditions & Issues: Loss & Bereavement 
89 
 
Counseling Interventions 
 
Individuals will vary in the way and degree to which they 
grieve.  People with prolonged distress or distress affecting 
their function should be evaluated for major depression and 
other psychological conditions.  
 
The following general points may be useful in helping people 
through the grieving process: 
 
1. Ensure that normal culturally appropriate mourning proc-
esses have been able to take place. 
2. Provide reassurance that the grieving process is normal de-
spite the painful feelings it causes.  Do not force talking.  Peo-
ple choose their own times and situations to share feelings – 
but make it clea

In [None]:
# ==============================================================================
# PHASE 3: THE GENERATION ENGINE (Using Gemini; model set by GEMINI_MODEL_NAME)
# ==============================================================================
import google.generativeai as genai
from google.colab import userdata
import warnings

warnings.filterwarnings("ignore", category=UserWarning, module="sentence_transformers")
print("🚀 Initializing the Generation Engine...")

# Single source of truth for the model, so the log message cannot drift from
# the model that is actually instantiated.
GEMINI_MODEL_NAME = 'gemini-2.5-pro'

# ------------------------------------------------------------------------------
# 1. CONFIGURE THE GEMINI LLM
#    On any failure `model` is set to None and the generation steps are skipped.
# ------------------------------------------------------------------------------
try:
    GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
    if not GEMINI_API_KEY:
        # Do not report an empty secret as a successful configuration.
        raise ValueError("GEMINI_API_KEY is empty.")
    genai.configure(api_key=GEMINI_API_KEY)
    model = genai.GenerativeModel(GEMINI_MODEL_NAME)
    print(f"✅ Gemini model '{GEMINI_MODEL_NAME}' configured successfully!")
except Exception as e:
    print(f"❌ Error configuring Gemini: {e}")
    print("Please make sure you have set up your GEMINI_API_KEY in Colab Secrets.")
    model = None

# ------------------------------------------------------------------------------
# 2. GATHER ALL OUR INPUT DATA
#    We'll use the simulated/retrieved data from our previous steps.
# ------------------------------------------------------------------------------
original_text = "I messed up the presentation I've been working on for weeks. My boss was not happy. I just feel like a total failure right now."

# NOTE: this reassigns the name used by the earlier retrieval demo cell.
simulated_classifier_output = [
    ('failure', 0.90),
    ('disappointment', 0.75),
    ('anxiety', 0.55)
]

# (For a real run, you'd call retrieve_context() here. We'll use a sample for this example.)
retrieved_context_str = """
- From "Mindset" (Carol S. Dweck): "Even in the growth mindset, failure can be a painful experience. But it doesn't define you. It's a problem to be faced, dealt with, and learned from."
- From "Cognitive Behavioural Therapy Workbook": "Challenging Automatic Thoughts: When you have a thought like 'I'm a total failure,' that is an automatic negative thought (ANT). The first step is to recognize it. The second is to question it."
- From "Acceptance and Commitment Therapy": "The goal is not to eliminate feelings of failure, but to make room for them. This is called acceptance. You can acknowledge the feeling without letting it dictate your actions."
"""

# ------------------------------------------------------------------------------
# 3. STEP A: GENERATE CLARIFYING QUESTIONS
# ------------------------------------------------------------------------------

# This is a carefully crafted "meta-prompt" that tells the LLM how to behave.
prompt_for_questions = f"""
You are an empathetic therapeutic assistant. Your goal is to help the user reflect on their situation.
You have been given the following information about a user:

1.  **Original User Text:** "{original_text}"
2.  **Detected Emotions:** {simulated_classifier_output}
3.  **Relevant Therapeutic Concepts:** "{retrieved_context_str}"

Your first task is to ask the user 5 clarifying questions to better understand their situation. Do NOT offer any advice or solutions yet.
The questions must be either Yes/No or on a gradient scale (e.g., "On a scale of 1 to 5...").

The goal of these questions is to help you understand the user's immediate state and their thought patterns.

Return ONLY a numbered list of the 5 questions and nothing else.
"""

print("\n🤖 Generating clarifying questions from the LLM...")
if model:
    try:
        response_questions = model.generate_content(prompt_for_questions)
        clarifying_questions = response_questions.text
        print("\n--- LLM Generated Questions ---")
        print(clarifying_questions)

        # ------------------------------------------------------------------------------
        # 4. SIMULATE USER ANSWERS
        #    In a real app, you would pause here and get input from the user.
        #    NOTE: these answers are fixed while the questions are generated on
        #    each run, so an answer's type (Yes/No vs. 1-5) may not match the
        #    question it is paired with.
        # ------------------------------------------------------------------------------
        simulated_user_answers = """
        1. Yes
        2. 5
        3. Yes
        4. No
        5. 2
        """
        print("\n--- User's Answers (Simulated) ---")
        print(simulated_user_answers)

        # ------------------------------------------------------------------------------
        # 5. STEP B: GENERATE THE FINAL RESPONSE
        # ------------------------------------------------------------------------------
        prompt_for_final_response = f"""
        You are an empathetic therapeutic assistant. You have all the initial information about the user, and they have now answered your clarifying questions.

        **Initial Information:**
        - User Text: "{original_text}"
        - Detected Emotions: {simulated_classifier_output}
        - Relevant Concepts: "{retrieved_context_str}"

        **Your Questions and the User's Answers:**
        - Your Questions: "{clarifying_questions}"
        - User's Answers: "{simulated_user_answers}"

        Your final task is to synthesize ALL of this information into a single, supportive, and actionable paragraph.
        In your response, you must:
        1. Acknowledge the user's feelings and their answers.
        2. Integrate the most relevant insight from the retrieved therapeutic concepts.
        3. Offer one gentle, actionable suggestion or a reflective prompt based on these concepts.
        4. Maintain a supportive and non-judgmental tone.

        Your entire response must be a single paragraph of 300 to 500 words.
        """

        print("\n🤖 Generating final response from the LLM...")
        response_final = model.generate_content(prompt_for_final_response)
        final_paragraph = response_final.text
        print("\n\n--- FINAL THERAPEUTIC RESPONSE ---")
        print(final_paragraph)

    except Exception as e:
        # Covers failures in either LLM call (questions or final response).
        print(f"❌ An error occurred during LLM generation: {e}")
else:
    # Make the skipped generation visible instead of ending silently.
    print("⚠️ Skipping generation: the Gemini model is not configured.")

🚀 Initializing the Generation Engine...
✅ Gemini 1.5 Pro model configured successfully!

🤖 Generating clarifying questions from the LLM...

--- LLM Generated Questions ---
1. On a scale of 1 to 5 (where 5 is 'completely'), how much do you believe the thought "I'm a total failure"?
2. On a scale of 1 to 5 (where 5 is 'overwhelming'), how intense is this feeling for you at this very moment?
3. Does it feel like this one event defines your entire worth right now?
4. Is this feeling about the presentation casting a shadow over other, unrelated areas of your life as well?
5. Before you received your boss's feedback, was there any part of the work you did that you felt proud of?

--- User's Answers (Simulated) ---

        1. Yes
        2. 5
        3. Yes
        4. No
        5. 2
        

🤖 Generating final response from the LLM...


--- FINAL THERAPEUTIC RESPONSE ---
Thank you for sharing that with me. It makes complete sense that this feeling of failure is so overwhelming for you righ