In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
    train_test_split, RandomizedSearchCV, StratifiedKFold
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score, f1_score, confusion_matrix,
    roc_curve, roc_auc_score, recall_score, precision_score,
    matthews_corrcoef
)
import joblib
import warnings
import os
import json
from datetime import datetime

# Suppress every warning category for the whole session, including any
# emitted by the libraries imported above.
warnings.filterwarnings(action='ignore')

# Plot defaults shared by all figures produced below
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

class ModelEvaluator:
    def __init__(self, data_path, target_column='Depression', output_dir='evaluation_results'):
        self.data_path = data_path
        self.target_column = target_column
        self.output_dir = output_dir
        self.data = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.tuned_models = {}
        self.best_params = {}
        self.evaluation_results = {}
        self.best_model = None
        self.best_model_name = None

        # Create output directory structure
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(f'{self.output_dir}/visualizations', exist_ok=True)
        os.makedirs(f'{self.output_dir}/models', exist_ok=True)
        os.makedirs(f'{self.output_dir}/reports', exist_ok=True)

    def load_data(self):
        """Load and prepare data with safer Label Fixing"""
        print("\n" + "="*70)
        print("LOADING DATA")
        print("="*70)

        try:
            if not os.path.exists(self.data_path):
                print(f"ERROR: File not found at {self.data_path}")
                return False

            self.data = pd.read_csv(self.data_path)
            print(f"Data loaded: {self.data.shape}")

            # Check if target column exists
            if self.target_column not in self.data.columns:
                print(f"ERROR: Target column '{self.target_column}' not found!")
                return False

            # ---------------------------------------------------------
            # TARGET FIXING (Safer Logic)
            # ---------------------------------------------------------
            print("Checking target labels...")
            unique_vals = sorted(self.data[self.target_column].unique())
            print(f"  Raw unique values: {unique_vals}")

            # Only map -1 to 0. Leave 0 and 1 alone.
            if -1 in unique_vals:
                print("  Found -1. Mapping to 0 (Healthy).")
                self.data[self.target_column] = self.data[self.target_column].replace({-1: 0})

            # Verify we have at least 2 classes
            final_counts = self.data[self.target_column].value_counts()
            print(f"  Final Distribution:\n{final_counts}")

            if len(final_counts) < 2:
                print("CRITICAL ERROR: Dataset has only 1 class! Models cannot train.")
                print("   Please check your raw data or preprocessing logic.")
                return False
            # ---------------------------------------------------------

            # Prepare features and target
            X = self.data.drop(columns=[self.target_column])
            y = self.data[self.target_column]

            # Split data
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y
            )

            print(f"Train shape: {self.X_train.shape}")
            print(f"Test shape: {self.X_test.shape}")
            print(f"Train class distribution: {self.y_train.value_counts().to_dict()}")
            print(f"Test class distribution: {self.y_test.value_counts().to_dict()}")
            return True

        except Exception as e:
            print(f"CRITICAL ERROR loading data: {e}")
            return False

    def get_param_grids(self):
        """Return the hyperparameter search space of every candidate model.

        Returns:
            dict: Maps a model display name to a dict of
            ``parameter name -> list of candidate values``.
        """
        logistic_grid = dict(
            C=[0.01, 0.1, 1, 10],
            solver=['liblinear'],
            class_weight=['balanced', None],
        )
        forest_grid = dict(
            n_estimators=[50, 100, 200],
            max_depth=[10, 20, None],
            min_samples_leaf=[1, 2, 4],
            class_weight=['balanced', None],
        )
        boosting_grid = dict(
            n_estimators=[50, 100],
            learning_rate=[0.05, 0.1, 0.2],
            max_depth=[3, 5],
        )
        knn_grid = dict(
            n_neighbors=[3, 5, 7, 9],
            weights=['uniform', 'distance'],
        )
        tree_grid = dict(
            max_depth=[5, 10, None],
            min_samples_leaf=[2, 5],
            class_weight=['balanced', None],
        )
        bayes_grid = dict(var_smoothing=[1e-9, 1e-8])

        return {
            'Logistic Regression': logistic_grid,
            'Random Forest': forest_grid,
            'Gradient Boosting': boosting_grid,
            'K-Nearest Neighbors': knn_grid,
            'Decision Tree': tree_grid,
            'Naive Bayes': bayes_grid,
        }

    def hyperparameter_tuning(self, n_iter=10):
        """Tune every candidate classifier with a randomized, F1-scored search.

        Each model is searched over its grid from ``get_param_grids`` using
        3-fold stratified cross-validation on the training split. The best
        estimator and its parameters are stored in ``self.tuned_models`` and
        ``self.best_params``. A model whose search raises is reported and
        skipped.

        Args:
            n_iter: Number of parameter settings sampled per model.

        Returns:
            bool: True when ``self.tuned_models`` holds at least one model
            afterwards, otherwise False.
        """
        rule = "=" * 70
        print("\n" + rule)
        print("HYPERPARAMETER TUNING")
        print(rule)

        candidates = {
            'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
            'Random Forest': RandomForestClassifier(random_state=42),
            'Gradient Boosting': GradientBoostingClassifier(random_state=42),
            'K-Nearest Neighbors': KNeighborsClassifier(),
            'Decision Tree': DecisionTreeClassifier(random_state=42),
            'Naive Bayes': GaussianNB()
        }

        grids = self.get_param_grids()
        folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

        for label, estimator in candidates.items():
            print(f"\nTraining {label}...")

            try:
                # error_score='raise' surfaces fit failures here so they are
                # reported by the handler below instead of being scored as NaN.
                tuner = RandomizedSearchCV(
                    estimator,
                    param_distributions=grids[label],
                    n_iter=n_iter,
                    cv=folds,
                    scoring='f1',
                    n_jobs=-1,
                    random_state=42,
                    error_score='raise'
                )
                tuner.fit(self.X_train, self.y_train)

                self.tuned_models[label] = tuner.best_estimator_
                self.best_params[label] = tuner.best_params_
                print(f"  Success! Best F1 Score: {tuner.best_score_:.4f}")
                print(f"  Best Parameters: {tuner.best_params_}")

            except Exception as exc:
                print(f"  FAILED: {exc!s}")
                continue

        if not self.tuned_models:
            print("\nCRITICAL: No models were trained successfully.")
            return False

        print(f"\nSuccessfully trained {len(self.tuned_models)} models")
        return True

    def evaluate_models(self):
        """Evaluate all successfully tuned models with MEDICAL METRICS.

        Every estimator in ``self.tuned_models`` predicts on ``self.X_test``
        and ``self.X_train``. The metrics, confusion-matrix counts and raw
        predictions are stored per model name in ``self.evaluation_results``
        and a short summary is printed.

        Labels are treated as binary with 1 = Depressed and 0 = Healthy. When
        probabilities or the ROC AUC cannot be computed for a model, the
        reason is printed, ``y_prob`` is stored as None and ``roc_auc`` falls
        back to 0.5. Any other failure is printed and that model is skipped,
        so one broken model never stops the remaining evaluations.
        """
        print("\n" + "="*70)
        print("EVALUATING MODELS (Medical Metrics)")
        print("="*70)

        for model_name, model in self.tuned_models.items():
            try:
                y_pred = model.predict(self.X_test)
                y_pred_train = model.predict(self.X_train)

                # Probabilities for ROC. Best effort: models without usable
                # probabilities keep the chance-level placeholder, but the
                # reason is reported instead of being swallowed silently.
                try:
                    y_prob = model.predict_proba(self.X_test)[:, 1]
                    roc_auc = roc_auc_score(self.y_test, y_prob)
                except Exception as exc:
                    print(f"  ROC AUC unavailable for {model_name} ({exc}); using 0.5")
                    y_prob = None
                    roc_auc = 0.5

                # Computed once and reused for the stored values, the
                # overfit gap and the printed summary.
                test_accuracy = accuracy_score(self.y_test, y_pred)
                train_accuracy = accuracy_score(self.y_train, y_pred_train)

                # MEDICAL METRICS (Critical for Healthcare)
                f1 = f1_score(self.y_test, y_pred)

                # Class-specific metrics (pos_label=1 for Depressed)
                recall_depressed = recall_score(self.y_test, y_pred, pos_label=1)
                precision_depressed = precision_score(self.y_test, y_pred, pos_label=1, zero_division=0)

                # Class-specific metrics (pos_label=0 for Healthy)
                recall_healthy = recall_score(self.y_test, y_pred, pos_label=0)
                precision_healthy = precision_score(self.y_test, y_pred, pos_label=0, zero_division=0)

                # Matthews Correlation Coefficient (Best for imbalanced data)
                mcc = matthews_corrcoef(self.y_test, y_pred)

                # Explicit labels guarantee a 2x2 matrix, so the four-way
                # unpacking below cannot fail on a degenerate prediction set.
                cm = confusion_matrix(self.y_test, y_pred, labels=[0, 1])
                tn, fp, fn, tp = cm.ravel()

                # Specificity (True Negative Rate)
                specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

                self.evaluation_results[model_name] = {
                    'model': model,
                    'test_accuracy': test_accuracy,
                    'train_accuracy': train_accuracy,
                    'f1_score': f1,
                    'mcc': mcc,
                    'recall_depressed': recall_depressed,
                    'precision_depressed': precision_depressed,
                    'recall_healthy': recall_healthy,
                    'precision_healthy': precision_healthy,
                    'specificity': specificity,
                    'roc_auc': roc_auc,
                    'overfit_gap': train_accuracy - test_accuracy,
                    'confusion_matrix': cm,
                    'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn,
                    'y_pred': y_pred,
                    'y_prob': y_prob
                }

                print(f"\n{model_name}:")
                print(f"  Accuracy: {test_accuracy:.3f}")
                print(f"  F1 Score: {f1:.3f}")
                print(f"  MCC: {mcc:.3f}")
                print(f"  Recall (Depressed): {recall_depressed:.3f} [CRITICAL]")
                print(f"  Precision (Depressed): {precision_depressed:.3f}")
                print(f"  Specificity: {specificity:.3f}")

            except Exception as e:
                print(f"Error evaluating {model_name}: {e}")

    def compare_models(self):
        """Compare models with comprehensive medical metrics"""
        print("\n" + "="*70)
        print("FINAL RESULTS - MODEL COMPARISON")
        print("="*70)

        if not self.evaluation_results:
            print("No evaluation results found. Pipeline failed.")
            return False

        comparison_data = []
        for name, res in self.evaluation_results.items():
            comparison_data.append({
                'Model': name,
                'Accuracy': res['test_accuracy'],
                'F1': res['f1_score'],
                'MCC': res['mcc'],
                'Recall_Depressed': res['recall_depressed'],
                'Precision_Depressed': res['precision_depressed'],
                'Specificity': res['specificity'],
                'ROC_AUC': res['roc_auc'],
                'Overfit_Gap': res['overfit_gap']
            })

        comparison_df = pd.DataFrame(comparison_data)

        if comparison_df.empty:
            print("Comparison table is empty.")
            return False

        # Sort by F1 Score (primary), then by Recall_Depressed (secondary)
        comparison_df = comparison_df.sort_values(['F1', 'Recall_Depressed'], ascending=False)

        print("\nCOMPLETE MODEL COMPARISON:")
        print(comparison_df.to_string(index=False))

        # Save to CSV
        comparison_df.to_csv(f'{self.output_dir}/model_comparison.csv', index=False)

        # Set best model
        self.best_model_name = comparison_df.iloc[0]['Model']
        self.best_model = self.evaluation_results[self.best_model_name]['model']

        print(f"\nWINNER: {self.best_model_name}")
        print(f"   F1 Score: {comparison_df.iloc[0]['F1']:.3f}")
        print(f"   Recall (Depressed): {comparison_df.iloc[0]['Recall_Depressed']:.3f}")
        print(f"   MCC: {comparison_df.iloc[0]['MCC']:.3f}")

        # Medical interpretation
        best_recall = comparison_df.iloc[0]['Recall_Depressed']
        if best_recall < 0.7:
            print(f"\nWARNING: Best model recall is {best_recall:.1%}")
            print("   This model may miss many depressed students.")
            print("   Consider: (1) Collecting more data, (2) Feature engineering, (3) Threshold tuning")
        else:
            print(f"\nGood recall ({best_recall:.1%}) - Model catches most depressed cases")

        return True

    def plot_visualizations(self):
        """Generate comprehensive visualizations"""
        if not self.evaluation_results:
            return

        # 1. Confusion Matrices with Medical Context
        n_models = len(self.evaluation_results)
        rows = (n_models + 2) // 3
        fig, axes = plt.subplots(rows, 3, figsize=(15, 5*rows))
        axes = axes.flatten() if n_models > 1 else [axes]

        for idx, (name, res) in enumerate(self.evaluation_results.items()):
            cm = res['confusion_matrix']
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[idx],
                        xticklabels=['Healthy', 'Depressed'],
                        yticklabels=['Healthy', 'Depressed'])

            # Add medical context to title
            recall_dep = res['recall_depressed']
            axes[idx].set_title(
                f"{name}\n"
                f"Recall (Depressed): {recall_dep:.2f} | "
                f"MCC: {res['mcc']:.2f}\n"
                f"FN={res['fn']} (Missed Cases)"
            )
            axes[idx].set_ylabel('Actual')
            axes[idx].set_xlabel('Predicted')

        for idx in range(n_models, len(axes)):
            axes[idx].axis('off')

        plt.tight_layout()
        plt.savefig(f'{self.output_dir}/visualizations/confusion_matrices.png', dpi=150)
        plt.close()
        print("Saved confusion matrices")

        # 2. ROC Curves
        plt.figure(figsize=(10, 8))
        for name, res in self.evaluation_results.items():
            if res['y_prob'] is not None:
                fpr, tpr, _ = roc_curve(self.y_test, res['y_prob'])
                plt.plot(fpr, tpr, label=f'{name} (AUC={res["roc_auc"]:.2f})', linewidth=2)

        plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate (Recall)')
        plt.title('ROC Curves - Depression Detection')
        plt.legend(loc='lower right')
        plt.grid(alpha=0.3)
        plt.savefig(f'{self.output_dir}/visualizations/roc_curves.png', dpi=150)
        plt.close()
        print("Saved ROC curves")

        # 3. Medical Metrics Comparison
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))

        models = list(self.evaluation_results.keys())

        # Recall comparison
        recalls = [self.evaluation_results[m]['recall_depressed'] for m in models]
        axes[0, 0].barh(models, recalls, color='skyblue')
        axes[0, 0].set_xlabel('Recall (Depressed)')
        axes[0, 0].set_title('Recall: How many depressed cases caught?')
        axes[0, 0].axvline(x=0.7, color='red', linestyle='--', label='Target 70%')
        axes[0, 0].legend()

        # Precision comparison
        precisions = [self.evaluation_results[m]['precision_depressed'] for m in models]
        axes[0, 1].barh(models, precisions, color='lightcoral')
        axes[0, 1].set_xlabel('Precision (Depressed)')
        axes[0, 1].set_title('Precision: How accurate are positive predictions?')

        # MCC comparison
        mccs = [self.evaluation_results[m]['mcc'] for m in models]
        axes[1, 0].barh(models, mccs, color='lightgreen')
        axes[1, 0].set_xlabel('Matthews Correlation Coefficient')
        axes[1, 0].set_title('MCC: Overall quality metric')

        # F1 comparison
        f1s = [self.evaluation_results[m]['f1_score'] for m in models]
        axes[1, 1].barh(models, f1s, color='plum')
        axes[1, 1].set_xlabel('F1 Score')
        axes[1, 1].set_title('F1: Balance of Precision and Recall')

        plt.tight_layout()
        plt.savefig(f'{self.output_dir}/visualizations/medical_metrics.png', dpi=150)
        plt.close()
        print("Saved medical metrics comparison")

    def save_best_model(self):
        """Save best model and generate detailed report"""
        if self.best_model:
            # Save model
            path = f'{self.output_dir}/models/best_model.pkl'
            joblib.dump(self.best_model, path)
            print(f"Saved best model to {path}")

            # Generate detailed report with UTF-8 encoding
            report_path = f'{self.output_dir}/reports/best_model_details.txt'
            with open(report_path, 'w', encoding='utf-8') as f:
                f.write("="*70 + "\n")
                f.write("BEST MODEL EVALUATION REPORT\n")
                f.write("="*70 + "\n\n")
                f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

                f.write(f"Best Model: {self.best_model_name}\n")
                f.write(f"Parameters: {self.best_params[self.best_model_name]}\n\n")

                res = self.evaluation_results[self.best_model_name]

                f.write("PERFORMANCE METRICS:\n")
                f.write("-" * 50 + "\n")
                f.write(f"Accuracy: {res['test_accuracy']:.4f}\n")
                f.write(f"F1 Score: {res['f1_score']:.4f}\n")
                f.write(f"Matthews Correlation Coefficient: {res['mcc']:.4f}\n")
                f.write(f"ROC AUC: {res['roc_auc']:.4f}\n\n")

                f.write("MEDICAL METRICS (Critical for Healthcare):\n")
                f.write("-" * 50 + "\n")
                f.write(f"Recall (Depressed): {res['recall_depressed']:.4f}\n")
                f.write(f"  >> Catches {res['recall_depressed']*100:.1f}% of depressed students\n")
                f.write(f"Precision (Depressed): {res['precision_depressed']:.4f}\n")
                f.write(f"  >> {res['precision_depressed']*100:.1f}% of positive predictions are correct\n")
                f.write(f"Specificity: {res['specificity']:.4f}\n")
                f.write(f"  >> Correctly identifies {res['specificity']*100:.1f}% of healthy students\n\n")

                f.write("CONFUSION MATRIX:\n")
                f.write("-" * 50 + "\n")
                f.write(f"True Negatives (Correctly identified healthy): {res['tn']}\n")
                f.write(f"False Positives (Healthy flagged as depressed): {res['fp']}\n")
                f.write(f"False Negatives (Depressed missed): {res['fn']} [WARNING]\n")
                f.write(f"True Positives (Correctly identified depressed): {res['tp']}\n\n")

                f.write("CLINICAL INTERPRETATION:\n")
                f.write("-" * 50 + "\n")
                if res['recall_depressed'] < 0.7:
                    f.write("WARNING: Low recall for depressed class.\n")
                    f.write("Model may miss many students needing help.\n")
                    f.write("Recommendations:\n")
                    f.write("  1. Collect more data, especially depressed cases\n")
                    f.write("  2. Adjust decision threshold to favor recall\n")
                    f.write("  3. Use ensemble methods or cost-sensitive learning\n")
                else:
                    f.write("GOOD: Acceptable recall - model catches most depressed cases.\n")

                if res['precision_depressed'] < 0.5:
                    f.write("\nWARNING: Low precision - many false alarms.\n")
                    f.write("Consider threshold adjustment or additional features.\n")

                f.write(f"\nOverfitting Check: {res['overfit_gap']:.4f}\n")
                if res['overfit_gap'] > 0.1:
                    f.write("WARNING: Model may be overfitting (gap > 10%).\n")
                else:
                    f.write("GOOD: Good generalization.\n")

            print(f"Saved detailed report to {report_path}")

    def run_complete_evaluation(self, n_iter=15):
        """Run the complete evaluation pipeline"""
        print("\n" + "="*70)
        print("MEDICAL DEPRESSION MODEL EVALUATION PIPELINE")
        print("="*70)

        if not self.load_data():
            return False

        if not self.hyperparameter_tuning(n_iter=n_iter):
            return False

        self.evaluate_models()

        if self.compare_models():
            self.plot_visualizations()
            self.save_best_model()
            print("\n" + "="*70)
            print("PIPELINE COMPLETED SUCCESSFULLY!")
            print("="*70)
            print(f"\nResults saved to: {self.output_dir}/")
            print(f"   - Model comparison: model_comparison.csv")
            print(f"   - Visualizations: visualizations/")
            print(f"   - Best model: models/best_model.pkl")
            print(f"   - Detailed report: reports/best_model_details.txt")
            return True

        return False

if __name__ == "__main__":
    # Location of the preprocessed CSV; adjust it to match your own layout
    DATA_FILE = "data/processed/preprocessed_data.csv"

    # 15 sampled settings per model trades search depth against run time
    evaluator = ModelEvaluator(data_path=DATA_FILE)
    evaluator.run_complete_evaluation(n_iter=15)


MEDICAL DEPRESSION MODEL EVALUATION PIPELINE

LOADING DATA
Data loaded: (27901, 63)
Checking target labels...
  Raw unique values: [np.int64(0), np.int64(1)]
  Final Distribution:
Depression
1    16336
0    11565
Name: count, dtype: int64
Train shape: (22320, 62)
Test shape: (5581, 62)
Train class distribution: {1: 13068, 0: 9252}
Test class distribution: {1: 3268, 0: 2313}

HYPERPARAMETER TUNING

Training Logistic Regression...
  Success! Best F1 Score: 0.8715
  Best Parameters: {'solver': 'liblinear', 'class_weight': None, 'C': 0.1}

Training Random Forest...
  Success! Best F1 Score: 0.8681
  Best Parameters: {'n_estimators': 200, 'min_samples_leaf': 4, 'max_depth': 20, 'class_weight': None}

Training Gradient Boosting...
  Success! Best F1 Score: 0.8707
  Best Parameters: {'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.2}

Training K-Nearest Neighbors...
  Success! Best F1 Score: 0.8516
  Best Parameters: {'weights': 'distance', 'n_neighbors': 9}

Training Decision Tree..