In [6]:
# Mental Health Counselor Assistant - Simple RAG System
# Uses TF-IDF + Cosine Similarity (no complex dependencies)

# Cell 1: Import libraries and load data
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os

# Read the raw conversation dataset and report what was loaded.
print("Loading mental health conversation data...")
df = pd.read_csv('../data/raw/mental_health_conversations.csv')
print(
    f"✅ Loaded {df.shape[0]} conversation pairs",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)

# Pull both text columns out as plain Python lists. Missing cells are
# replaced by '' (rows are not dropped), so the two lists stay aligned
# index-for-index with each other and with the DataFrame rows.
responses, contexts = (
    df[column].fillna('').tolist() for column in ('Response', 'Context')
)

# Show the first pair, truncated, as a quick sanity check of the load.
preview_len = 100
print(f"✅ Prepared {len(responses)} counselor responses")
print(f"Sample context: {contexts[0][:preview_len]}...")
print(f"Sample response: {responses[0][:preview_len]}...")

# Cell 2: Fit the TF-IDF model on the counselor responses
print("\n🔄 Creating TF-IDF vectors...")
print("This analyzes word patterns in counselor responses...")

# Vectorizer configuration, gathered in one place so it is easy to tune.
tfidf_settings = dict(
    lowercase=True,          # fold case before tokenizing
    stop_words='english',    # drop the built-in English stop-word list
    ngram_range=(1, 2),      # single words and two-word phrases
    min_df=2,                # a term must occur in at least 2 responses
    max_df=0.8,              # ...and in no more than 80% of them
    max_features=5000,       # cap the vocabulary at 5000 terms
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary and encode every response in a single pass.
# Row i of the resulting matrix corresponds to responses[i].
response_vectors = vectorizer.fit_transform(responses)
n_responses, n_terms = response_vectors.shape
print(f"✅ Created TF-IDF matrix with shape: {response_vectors.shape}")
print(f"   - {n_responses} responses")
print(f"   - {n_terms} unique terms/phrases")

# Cell 3: Persist the RAG system components
print("\n💾 Saving RAG system components...")

# Make sure the output directory is there before writing into it.
os.makedirs('../data', exist_ok=True)

# Every artifact is pickled to its own file, written in the order listed:
# the fitted vectorizer, the TF-IDF matrix of the responses, and the raw
# response / context texts that the matrix rows refer back to.
artifacts = (
    ('../data/vectorizer.pkl', vectorizer),
    ('../data/response_vectors.pkl', response_vectors),
    ('../data/responses.pkl', responses),
    ('../data/contexts.pkl', contexts),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

# Summarize what was written.
print(
    "✅ All RAG components saved successfully!",
    "Files saved:",
    "  - vectorizer.pkl (converts queries to vectors)",
    "  - response_vectors.pkl (all counselor responses as vectors)",
    "  - responses.pkl (original counselor responses)",
    "  - contexts.pkl (original patient contexts)",
    sep="\n",
)

# Cell 4: Create and test search function
def search_similar_cases(query, k=3):
    """
    Search for counselor responses most similar to the query.

    The query is transformed with the module-level ``vectorizer`` and
    compared, by cosine similarity, against every row of the module-level
    ``response_vectors``. Each match is printed together with the (truncated)
    entries of ``contexts`` and ``responses`` at the same index.

    Args:
        query (str): Description of patient situation
        k (int): Maximum number of similar cases to return. If it exceeds
            the number of indexed responses, all of them are returned;
            if it is zero or negative, nothing is returned.

    Returns:
        tuple: (indices, similarities) of top matches, best match first.
            Both arrays have the same length, at most ``k``.
    """
    print(f"\n🔍 Searching for: '{query}'")
    print("="*60)

    # Convert query to same vector space as responses
    query_vector = vectorizer.transform([query])

    # Calculate similarity between query and all responses (one score each)
    similarities = cosine_similarity(query_vector, response_vectors).flatten()

    # Rank everything best-first, then keep the first k entries.
    # Slicing the tail with [-k:] is NOT safe here: for k == 0 it becomes
    # [0:] and selects every response, and a negative k selects nearly all
    # of them. Taking a clamped prefix of the descending ranking yields the
    # same result and ordering for valid k while returning nothing for k <= 0.
    ranked_indices = similarities.argsort()[::-1]
    top_indices = ranked_indices[:max(k, 0)]

    # Display results
    for rank, idx in enumerate(top_indices, start=1):
        similarity_score = similarities[idx]
        print(f"\n📋 Result {rank} (Similarity: {similarity_score:.3f})")
        print(f"Patient Context: {contexts[idx][:120]}...")
        print(f"Counselor Response: {responses[idx][:180]}...")
        print("-" * 50)

    return top_indices, similarities[top_indices]

# Cell 5: Exercise the search with a handful of representative queries
print("\n🧪 Testing RAG system with different scenarios...")

# Free-text situation descriptions used as smoke-test queries.
test_scenarios = [
    "My patient feels anxious and can't sleep at night",
    "Patient is struggling with relationship problems and fights",
    "Someone dealing with depression and feelings of worthlessness",
    "Client feels overwhelmed with work stress and burnout",
    "Patient having panic attacks and fear of social situations",
]

# Run each query for its two best matches and report the mean score.
for scenario in test_scenarios:
    indices, scores = search_similar_cases(scenario, k=2)
    mean_score = scores.mean()
    print(f"Average similarity score: {mean_score:.3f}\n")

# Closing message with the follow-up command.
print(
    "🎉 RAG system is working successfully!",
    "\nNext step: Run the Streamlit app with:",
    "streamlit run streamlit_app.py",
    sep="\n",
)

Loading mental health conversation data...
✅ Loaded 3512 conversation pairs
Columns: ['Context', 'Response']
✅ Prepared 3512 counselor responses
Sample context: I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think...
Sample response: If everyone thinks you're worthless, then maybe you need to find new people to hang out with.Serious...

🔄 Creating TF-IDF vectors...
This analyzes word patterns in counselor responses...
✅ Created TF-IDF matrix with shape: (3512, 5000)
   - 3512 responses
   - 5000 unique terms/phrases

💾 Saving RAG system components...
✅ All RAG components saved successfully!
Files saved:
  - vectorizer.pkl (converts queries to vectors)
  - response_vectors.pkl (all counselor responses as vectors)
  - responses.pkl (original counselor responses)
  - contexts.pkl (original patient contexts)

🧪 Testing RAG system with different scenarios...

🔍 Searching for: 'My patient feels anxious and can't sleep at night'

📋 Result 1 (Sim