In [1]:
!pip install pycaret

Collecting pycaret
  Downloading pycaret-3.3.2-py3-none-any.whl.metadata (17 kB)
Collecting pandas<2.2.0 (from pycaret)
  Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy<=1.11.4,>=1.6.1 (from pycaret)
  Downloading scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib<1.4,>=1.2.0 (from pycaret)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pyod>=1.1.3 (from pycaret)
  Downloading pyod-2.0.4.tar.gz (169 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.7/169.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting category-encoders>=2.4.0 (from pycaret)
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Collectin

In [2]:
import pandas as pd
import numpy as np
import json
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score
from pycaret.classification import setup, compare_models, predict_model

def load_data(file_path):
    """Load and prepare the dataset from CSV"""
    print("Loading data from:", file_path)
    df = pd.read_csv(file_path)

    # Print class distribution
    counts = df['specific.disorder'].value_counts()
    print("\nClass Distribution:")
    print(counts)

    depress_count = (df['specific.disorder'] == 'Depressive disorder').sum()
    normal_count = (df['specific.disorder'] == 'Healthy control').sum()

    print(f"Depressive disorder: {depress_count}")
    print(f"Healthy control: {normal_count}")

    return df

def prepare_features(df, top_features):
    """Extract selected features and prepare target variable"""
    print(f"\nUsing {len(top_features)} features for analysis:")
    for feature in top_features:
        print(f"- {feature}")

    # Extract features
    df_features = df[top_features]

    # Map target variable
    target_mapping = {
        'Depressive disorder': 1,
        'Healthy control': 0
    }
    target = df['specific.disorder'].map(target_mapping).to_list()

    print(f"\nTotal dataset size: {len(df_features)} samples")

    return df_features, target

def normalize_data(df_features):
    """Normalize the feature data using MinMaxScaler"""
    print("\nNormalizing data...")
    scaler = MinMaxScaler()
    normalized_data = scaler.fit_transform(df_features)
    print("Data normalized. Example normalized row:")
    print(normalized_data[0])
    return normalized_data

def split_dataset(data, target, test_size=0.2, random_state=42):
    """Split data into training and test sets"""
    print(f"\nSplitting data: {1-test_size:.0%} training, {test_size:.0%} testing")
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=test_size, random_state=random_state, stratify=target
    )

    print(f"Training set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")

    return X_train, X_test, y_train, y_test

def save_dataset(X_train, X_test, y_train, y_test, file_path='dataset.npz'):
    """Save the prepared dataset to a file"""
    print(f"\nSaving dataset to {file_path}")
    np.savez(file_path, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
    print("Dataset saved successfully")

def load_saved_dataset(file_path='dataset.npz'):
    """Load a previously saved dataset"""
    print(f"\nLoading dataset from {file_path}")
    data = np.load(file_path)
    X_train = data['X_train']
    X_test = data['X_test']
    y_train = data['y_train']
    y_test = data['y_test']

    print(f"Training set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")

    return X_train, X_test, y_train, y_test

def train_model(X_train, y_train):
    """Train a model using PyCaret"""
    print("\nPreparing training data for PyCaret")
    # Combine features and target for PyCaret
    train_df = pd.DataFrame(X_train)
    train_df['target'] = y_train

    print("Setting up PyCaret classification experiment")
    clf_setup = setup(data=train_df, target='target', session_id=42, verbose=True)

    print("\nComparing models to find the best one")
    best_model = compare_models(verbose=True)

    return best_model

def evaluate_model(model, X_test, y_test):
    """Evaluate model performance on test data"""
    print("\nEvaluating model on test data")
    # Prepare test dataframe for PyCaret
    test_df = pd.DataFrame(X_test)
    test_df['target'] = y_test

    # Make predictions
    preds = predict_model(model, data=test_df)

    # Calculate evaluation metrics
    y_true = preds['target']
    y_pred = preds['prediction_label']
    y_scores = preds['prediction_score']

    # Calculate various metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    try:
        auc = roc_auc_score(y_true, y_scores)
    except:
        auc = None

    # Print metrics
    print("\n===== MODEL EVALUATION METRICS =====")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    if auc:
        print(f"ROC AUC: {auc:.4f}")

    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_true, y_pred)
    print(cm)

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

    return preds, {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1, "auc": auc}

def generate_random_supplementary_info(prediction_score):
    """Generate random supplementary health information for a given prediction"""
    # Determine predicted label based on score
    predicted_label = "Depressive disorder" if prediction_score > 0.5 else "Healthy control"

    supplementary_info = {
        "mood": {
            "sadness": random.choice([True, False]),
            "irritability": random.choice([True, False]),
            "mood_swings": random.choice([True, False])
        },
        "sleep": {
            "insomnia": random.choice([True, False]),
            "hypersomnia": random.choice([True, False])
        },
        "energy_levels": random.choice(["low", "normal", "high"]),
        "cognitive_symptoms": {
            "concentration_difficulty": random.choice([True, False]),
            "memory_issues": random.choice([True, False])
        },
        "behavioral_changes": {
            "social_withdrawal": random.choice([True, False]),
            "reduced_activity": random.choice([True, False])
        },
        "physical_symptoms": {
            "appetite_change": random.choice([True, False]),
            "body_pain": random.choice([True, False])
        },
        "risk_factors": {
            "family_history_depression": random.choice([True, False]),
            "recent_trauma": random.choice([True, False]),
            "substance_abuse": random.choice([True, False])
        }
    }

    # Final JSON structure
    output_json = {
        "prediction_score": round(prediction_score, 4),
        "predicted_label": predicted_label,
        "supplementary_info": supplementary_info
    }

    return output_json

def generate_json_results(predictions, output_file='depression_predictions.json'):
    """Generate JSON results file with supplementary information"""
    print(f"\nGenerating detailed JSON results to {output_file}")

    # Extract prediction scores
    pred_scores = list(predictions['prediction_score'])

    # Generate supplementary info for each prediction
    json_arrays = []
    for pred in pred_scores:
        generated_json = generate_random_supplementary_info(pred)
        json_arrays.append(generated_json)

    dump_data = {
        'data': json_arrays
    }

    # Save to file
    with open(output_file, 'w') as f:
        json.dump(dump_data, f, indent=4)

    print(f"Results saved to {output_file}")
    return json_arrays

def main():
    # Define important features based on previous analysis
    top_features = [
        'COH.delta.F7.T3',
        'COH.theta.F7.T3',
        'COH.theta.F7.C3',
        'COH.theta.F7.T5',
        'COH.theta.F7.O1',
        'COH.alpha.F7.T3',
        'COH.alpha.F7.T5',
        'COH.alpha.F7.O1',
        'COH.beta.F7.T5',
        'COH.beta.F7.O1'
    ]

    print("=== EEG-Based Depression Classification System ===")

    # Step 1: Load and prepare data
    df = load_data('machinelearning_data_EEG.csv')
    df_features, target = prepare_features(df, top_features)

    # Step 2: Normalize data
    normalized_data = normalize_data(df_features)

    # Step 3: Split data
    X_train, X_test, y_train, y_test = split_dataset(normalized_data, target)

    # Step 4: Save dataset for future use
    save_dataset(X_train, X_test, y_train, y_test)

    # Step 5: Train model using PyCaret
    best_model = train_model(X_train, y_train)

    # Step 6: Evaluate model
    predictions, metrics = evaluate_model(best_model, X_test, y_test)

    # Step 7: Generate supplementary results
    json_results = generate_json_results(predictions)

    print("\n=== Processing complete ===")
    print(f"Final model accuracy: {metrics['accuracy']:.4f}")
    print(f"Final model F1 score: {metrics['f1']:.4f}")

if __name__ == "__main__":
    # Check for required packages
    try:
        import pycaret
    except ImportError:
        print("PyCaret not found. Installing...")
        import subprocess
        subprocess.check_call(["pip", "install", "pycaret"])
        print("PyCaret installed successfully")

    main()

=== EEG-Based Depression Classification System ===
Loading data from: machinelearning_data_EEG.csv

Class Distribution:
specific.disorder
Depressive disorder    199
Healthy control         95
Name: count, dtype: int64
Depressive disorder: 199
Healthy control: 95

Using 10 features for analysis:
- COH.delta.F7.T3
- COH.theta.F7.T3
- COH.theta.F7.C3
- COH.theta.F7.T5
- COH.theta.F7.O1
- COH.alpha.F7.T3
- COH.alpha.F7.T5
- COH.alpha.F7.O1
- COH.beta.F7.T5
- COH.beta.F7.O1

Total dataset size: 294 samples

Normalizing data...
Data normalized. Example normalized row:
[0.15055588 0.25590199 0.50751191 0.14750493 0.01750742 0.79973624
 0.09362453 0.36387313 0.12367543 0.01682935]

Splitting data: 80% training, 20% testing
Training set: 235 samples
Test set: 59 samples

Saving dataset to dataset.npz
Dataset saved successfully

Preparing training data for PyCaret
Setting up PyCaret classification experiment


Unnamed: 0,Description,Value
0,Session id,42
1,Target,target
2,Target type,Binary
3,Original data shape,"(235, 11)"
4,Transformed data shape,"(235, 11)"
5,Transformed train set shape,"(164, 11)"
6,Transformed test set shape,"(71, 11)"
7,Numeric features,10
8,Preprocess,True
9,Imputation type,simple



Comparing models to find the best one


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.6952,0.6438,0.8917,0.7255,0.797,0.2001,0.2318,0.188
lda,Linear Discriminant Analysis,0.6901,0.6191,0.8818,0.721,0.7917,0.188,0.2076,0.04
lightgbm,Light Gradient Boosting Machine,0.6882,0.648,0.8273,0.7524,0.7773,0.2296,0.2462,0.083
lr,Logistic Regression,0.6835,0.652,0.9727,0.6888,0.806,0.0557,0.0711,0.79
dummy,Dummy Classifier,0.6772,0.5,1.0,0.6772,0.8074,0.0,0.0,0.023
et,Extra Trees Classifier,0.6702,0.5655,0.8636,0.7119,0.7751,0.1443,0.1704,0.232
ridge,Ridge Classifier,0.6658,0.6308,0.9273,0.6874,0.7886,0.0552,0.0705,0.025
svm,SVM - Linear Kernel,0.6654,0.6164,0.9818,0.6725,0.798,-0.0214,-0.027,0.026
knn,K Neighbors Classifier,0.6596,0.543,0.8818,0.6952,0.776,0.0858,0.1104,0.039
gbc,Gradient Boosting Classifier,0.6544,0.6076,0.7924,0.73,0.7559,0.1582,0.1523,0.238


Processing:   0%|          | 0/65 [00:00<?, ?it/s]


Evaluating model on test data


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.6102,0.4961,0.825,0.6735,0.7416,-0.0195,-0.0213



===== MODEL EVALUATION METRICS =====
Accuracy: 0.6102
Precision: 0.6735
Recall: 0.8250
F1 Score: 0.7416
ROC AUC: 0.5467

Confusion Matrix:
[[ 3 16]
 [ 7 33]]

Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.16      0.21        19
           1       0.67      0.82      0.74        40

    accuracy                           0.61        59
   macro avg       0.49      0.49      0.47        59
weighted avg       0.55      0.61      0.57        59


Generating detailed JSON results to depression_predictions.json
Results saved to depression_predictions.json

=== Processing complete ===
Final model accuracy: 0.6102
Final model F1 score: 0.7416
