In [7]:
!pip install pandas scikit-learn spacy faiss-cpu joblib numpy

# Download spaCy model
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
import spacy
import faiss
import joblib
import json
import os
from pathlib import Path

# Output folders: trained artifacts go to models/, exported JSON goes to data/.
# exist_ok=True keeps this cell safe to re-run.
for _folder in ("models", "data"):
    os.makedirs(_folder, exist_ok=True)

print("✓ Libraries imported successfully")

✓ Libraries imported successfully


In [10]:


def generate_training_data():
    """
    Generate comprehensive mock data for model training.

    Security Note: This function generates synthetic data only.
    In production, ensure all real patient data is:
    - Anonymized (remove PII)
    - Encrypted at rest
    - Access-controlled
    - HIPAA/GDPR compliant

    Returns:
        dict with four entries:
        - 'intent_data': DataFrame with 'text' and 'intent' columns
          (5 intents x 12 utterances = 60 rows, grouped by intent)
        - 'knowledge_graph': list of dicts (id, topic, content, keywords, category)
        - 'wellness_tips': list of dicts (id, title, content, tags, health_goals)
        - 'medical_terms': dict mapping a medical term to a plain-language phrase
    """

    # Intent Classification Training Data
    # Utterances are grouped under their label so that the 'text' and 'intent'
    # columns are always derived together and cannot drift out of alignment.
    examples_by_intent = {
        # Symptom checker intents
        'symptom_checker': [
            "I have a severe headache", "my stomach hurts", "feeling dizzy and nauseous",
            "sharp pain in my chest", "I can't stop coughing", "fever and chills",
            "sore throat for 3 days", "back pain when I walk", "swollen ankles",
            "difficulty breathing", "persistent fatigue", "joint pain in knees",
        ],
        # Medication explainer intents
        'medication_explainer': [
            "what is metformin used for", "side effects of aspirin", "how to take ibuprofen",
            "is lisinopril safe", "interactions with amoxicillin", "dosage for paracetamol",
            "what does omeprazole do", "can I take advil with food", "benefits of vitamin D",
            "how long to take antibiotics", "what is insulin for", "warfarin precautions",
        ],
        # General wellness intents
        'general_wellness': [
            "how to improve sleep", "tips for healthy eating", "best exercises for back",
            "stress management techniques", "meditation for beginners", "water intake daily",
            "how to lose weight safely", "immune system boosters", "yoga for flexibility",
            "heart health tips", "digestive health advice", "mental clarity foods",
        ],
        # Mental health intents
        'mental_health': [
            "feeling anxious lately", "symptoms of depression", "panic attack help",
            "stress coping strategies", "burnout signs", "meditation for anxiety",
            "therapy options available", "improving self esteem", "dealing with grief",
            "social anxiety tips", "sleep and mental health", "mindfulness practices",
        ],
        # Health summary requests
        'health_summary': [
            "show my health summary", "what's my health status", "recent health data",
            "my medication list", "symptom history", "health trends this month",
            "vital signs overview", "appointment history", "lab results summary",
            "wellness score", "health goals progress", "fitness statistics",
        ],
    }

    # Flatten into two parallel columns, one label per utterance, keeping the
    # group order above (dicts preserve insertion order).
    texts, labels = [], []
    for intent, utterances in examples_by_intent.items():
        texts.extend(utterances)
        labels.extend([intent] * len(utterances))
    intent_data = {'text': texts, 'intent': labels}

    # Medical Knowledge Graph
    knowledge_graph = [
        {
            "id": "kg_001",
            "topic": "Headache",
            "content": "Headaches can be caused by tension, dehydration, eye strain, or underlying conditions. Rest, hydration, and over-the-counter pain relievers may help. Seek medical attention for severe or persistent headaches.",
            "keywords": ["headache", "pain", "head", "migraine", "tension"],
            "category": "symptom"
        },
        {
            "id": "kg_002",
            "topic": "Metformin",
            "content": "Metformin is a medication used to treat type 2 diabetes. It helps control blood sugar levels by improving insulin sensitivity. Common side effects include nausea and digestive issues.",
            "keywords": ["metformin", "diabetes", "blood sugar", "medication"],
            "category": "medication"
        },
        {
            "id": "kg_003",
            "topic": "Sleep Hygiene",
            "content": "Good sleep hygiene includes maintaining a consistent sleep schedule, creating a comfortable sleep environment, avoiding screens before bed, and limiting caffeine intake in the afternoon.",
            "keywords": ["sleep", "insomnia", "rest", "bedtime", "hygiene"],
            "category": "wellness"
        },
        {
            "id": "kg_004",
            "topic": "Anxiety Management",
            "content": "Managing anxiety involves deep breathing exercises, regular physical activity, mindfulness practices, and sometimes professional therapy. It's important to identify triggers and develop coping strategies.",
            "keywords": ["anxiety", "stress", "mental health", "worry", "panic"],
            "category": "mental_health"
        },
        {
            "id": "kg_005",
            "topic": "Fever",
            "content": "A fever is a temporary increase in body temperature, often due to infection. Rest, fluids, and fever-reducing medications can help. Seek medical care if fever exceeds 103°F or persists for more than 3 days.",
            "keywords": ["fever", "temperature", "infection", "hot", "chills"],
            "category": "symptom"
        }
    ]

    # Wellness Tips Database
    wellness_tips = [
        {
            "id": "wt_001",
            "title": "Improve Sleep Quality",
            "content": "Try establishing a bedtime routine: dim lights 1 hour before sleep, avoid screens, keep your room cool (65-68°F), and consider relaxation exercises.",
            "tags": ["sleep", "rest", "insomnia", "wellness"],
            "health_goals": ["improve_sleep", "reduce_stress"]
        },
        {
            "id": "wt_002",
            "title": "Heart-Healthy Eating",
            "content": "Incorporate more omega-3 fatty acids (salmon, walnuts), reduce sodium intake, eat plenty of fruits and vegetables, and choose whole grains over refined carbs.",
            "tags": ["nutrition", "heart health", "diet", "wellness"],
            "health_goals": ["heart_health", "weight_management"]
        },
        {
            "id": "wt_003",
            "title": "Stress Reduction Technique",
            "content": "Practice the 4-7-8 breathing technique: Inhale for 4 counts, hold for 7 counts, exhale for 8 counts. Repeat 3-4 times to activate your relaxation response.",
            "tags": ["stress", "breathing", "mindfulness", "mental health"],
            "health_goals": ["reduce_stress", "improve_mental_health"]
        },
        {
            "id": "wt_004",
            "title": "Daily Hydration",
            "content": "Aim for 8-10 glasses of water daily. Start your morning with a glass of water, keep a water bottle nearby, and set hourly reminders if needed.",
            "tags": ["hydration", "water", "wellness", "health"],
            "health_goals": ["general_wellness", "energy_boost"]
        }
    ]

    # Medical Terms Simplification Dictionary
    medical_terms = {
        "Hypertension": "High blood pressure",
        "Myocardial infarction": "Heart attack",
        "Cerebrovascular accident": "Stroke",
        "Diabetes mellitus": "Diabetes (high blood sugar condition)",
        "Hyperlipidemia": "High cholesterol",
        "Gastroesophageal reflux disease": "Acid reflux or heartburn",
        "Osteoarthritis": "Joint inflammation due to wear and tear",
        "Pneumonia": "Lung infection",
        "Bronchitis": "Inflammation of breathing tubes",
        "Anemia": "Low red blood cell count",
        "Tachycardia": "Fast heart rate",
        "Bradycardia": "Slow heart rate",
        "Edema": "Swelling from fluid buildup",
        "Dyspnea": "Difficulty breathing or shortness of breath",
        "Nausea": "Feeling of wanting to vomit",
        "Pruritus": "Itching",
        "Vertigo": "Feeling of spinning or dizziness",
        "Insomnia": "Difficulty falling or staying asleep",
        "Hyponatremia": "Low sodium levels in blood",
        "Hyperglycemia": "High blood sugar"
    }

    return {
        'intent_data': pd.DataFrame(intent_data),
        'knowledge_graph': knowledge_graph,
        'wellness_tips': wellness_tips,
        'medical_terms': medical_terms
    }

# Build every mock dataset once; the later cells read from `training_data`.
print("Generating mock training data...")
training_data = generate_training_data()

# Report the size of the two datasets that feed the models below.
print(
    f"✓ Generated {len(training_data['intent_data'])} training samples",
    f"✓ Created knowledge graph with {len(training_data['knowledge_graph'])} entries",
    sep="\n",
)





Generating mock training data...
✓ Generated 60 training samples
✓ Created knowledge graph with 5 entries


In [11]:
def train_intent_model(df):
    """
    Fit a TF-IDF + SGD (logistic loss) intent classifier and print its accuracy.

    The rows are split 80/20, stratified on the label, with a fixed seed. The
    TF-IDF vocabulary (unigrams and bigrams, capped at 1000 features) is learned
    from the training portion only, and the classifier is fitted on that same
    portion.

    Note: on the 60-row mock dataset the held-out split has only 12 rows. The
    recorded run of this notebook reported 1.000 training accuracy against
    0.167 testing accuracy, so treat the saved model as a placeholder.

    Args:
        df: DataFrame with a 'text' column (utterances) and an 'intent'
            column (labels).

    Returns:
        Tuple (model, vectorizer): the fitted SGDClassifier and the fitted
        TfidfVectorizer needed to transform new text for it.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("TRAINING INTENT CLASSIFICATION MODEL")
    print(banner)

    # Stratified hold-out so every intent appears in both partitions
    texts_train, texts_test, labels_train, labels_test = train_test_split(
        df['text'],
        df['intent'],
        test_size=0.2,
        random_state=42,
        stratify=df['intent'],
    )

    # Learn the vocabulary on the training texts, then reuse it for the test texts
    vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
    features_train = vectorizer.fit_transform(texts_train)
    features_test = vectorizer.transform(texts_test)

    # loss='log_loss' makes the SGD classifier a logistic-regression model
    model = SGDClassifier(loss='log_loss', max_iter=1000, random_state=42)
    model.fit(features_train, labels_train)

    # Accuracy on the data the model saw vs. the data it did not
    train_accuracy = model.score(features_train, labels_train)
    test_accuracy = model.score(features_test, labels_test)

    print(f"Training Accuracy: {train_accuracy:.3f}")
    print(f"Testing Accuracy: {test_accuracy:.3f}")
    print(f"Classes: {model.classes_}")

    return model, vectorizer

# Fit the classifier and its vectorizer on the mock intent data
intent_model, vectorizer = train_intent_model(training_data['intent_data'])

# Persist both artifacts; the vectorizer is needed to transform text for the model
for _artifact, _destination in (
    (intent_model, "models/intent_model.joblib"),
    (vectorizer, "models/vectorizer.joblib"),
):
    joblib.dump(_artifact, _destination)
print("\n✓ Models saved to 'models/' directory")



TRAINING INTENT CLASSIFICATION MODEL
Training Accuracy: 1.000
Testing Accuracy: 0.167
Classes: ['general_wellness' 'health_summary' 'medication_explainer'
 'mental_health' 'symptom_checker']

✓ Models saved to 'models/' directory


In [12]:
def create_faiss_index(knowledge_graph):
    """
    Create FAISS index for efficient similarity search in knowledge graph.

    Each entry is embedded from its topic and content text using the
    'en_core_web_sm' spaCy pipeline, and the vectors are added to an exact
    L2 index in the same order as the entries. Position i in the index
    therefore corresponds to knowledge_graph[i].

    Security Note: Ensure knowledge graph contains only verified,
    anonymized medical information. Never include patient-specific data.

    Args:
        knowledge_graph: Non-empty sequence of dicts, each with at least
            'topic' and 'content' string entries.

    Returns:
        faiss.IndexFlatL2 holding one float32 vector per entry.

    Raises:
        ValueError: If knowledge_graph is empty (there would be no vectors
            to infer the embedding dimension from).
    """
    # Fail early with a clear message; otherwise an empty input only surfaces
    # later as an IndexError when reading the embedding dimension.
    if not knowledge_graph:
        raise ValueError("knowledge_graph is empty; cannot build a FAISS index")

    print("\n" + "="*60)
    print("CREATING FAISS INDEX FOR KNOWLEDGE GRAPH")
    print("="*60)

    # Load spaCy model for embeddings
    # NOTE(review): per the spaCy docs the small ("sm") pipelines ship without
    # static word vectors, so doc.vector presumably falls back to averaged
    # context tensors. Confirm retrieval quality is acceptable, and make sure
    # query-time code embeds with this same pipeline.
    nlp = spacy.load("en_core_web_sm")

    # Generate embeddings for each knowledge graph entry
    embeddings = []
    for entry in knowledge_graph:
        # Combine topic and content for better matching
        text = f"{entry['topic']} {entry['content']}"
        doc = nlp(text)
        embeddings.append(doc.vector)

    # Stack into a 2-D float32 matrix (one row per entry) for FAISS
    embeddings = np.asarray(embeddings, dtype='float32')

    # Create FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    print(f"✓ Created FAISS index with {index.ntotal} entries")
    print(f"✓ Embedding dimension: {dimension}")

    return index

# Embed the knowledge graph and build the similarity index
faiss_index = create_faiss_index(training_data['knowledge_graph'])

# Save FAISS index and knowledge graph
joblib.dump(faiss_index, "models/faiss_index.joblib")

# Export the remaining datasets as pretty-printed JSON, one file per key
for _json_path, _data_key in (
    ("data/knowledge_graph.json", 'knowledge_graph'),
    ("data/wellness_tips.json", 'wellness_tips'),
    ("data/medical_terms.json", 'medical_terms'),
):
    with open(_json_path, "w") as f:
        json.dump(training_data[_data_key], f, indent=2)

print("\n✓ All data files saved to 'data/' directory")

# Closing banner and hand-off instructions
print("\n" + "=" * 60)
print("MODEL TRAINING COMPLETE!")
print("=" * 60)
print("\nNext steps:")
for _step in (
    "1. Download the 'models' folder from Colab",
    "2. Download the 'data' folder from Colab",
    "3. Upload both folders to your GitHub repository",
    "4. Deploy to Streamlit Cloud",
):
    print(_step)


CREATING FAISS INDEX FOR KNOWLEDGE GRAPH
✓ Created FAISS index with 5 entries
✓ Embedding dimension: 96

✓ All data files saved to 'data/' directory

MODEL TRAINING COMPLETE!

Next steps:
1. Download the 'models' folder from Colab
2. Download the 'data' folder from Colab
3. Upload both folders to your GitHub repository
4. Deploy to Streamlit Cloud
