# Enhanced MLP Architecture + Hyperparameter Exploration Notebook

**What this notebook does**
- Loads a CSV file that is already *categorically encoded* (user must provide path).
- Tries a configurable range of MLP architectures and hyperparameters (hidden layers, solvers, batch sizes, learning rates, epochs, and — for PyTorch models — dropout).
- Runs experiments using both **scikit-learn MLP** (fast, good for many grid search experiments) and an optional **PyTorch MLP** (to test dropout explicitly and obtain training loss/accuracy curves).
- **Enhanced logging**: Comprehensive logging with TensorBoard, MLflow, progress tracking, error handling, and resource monitoring.
- Saves several graphs (PNG files) showing how different parameters affect training/test performance and training loss curves.
- Saves results to a CSV for later inspection.

**Notes**
- The notebook assumes the CSV includes features and a target column (default target column is `risk_level`). If your target has string labels, the notebook will attempt to map `low/medium/high` to `0/1/2`. Adjust the mapping if needed.
- The notebook uses `StandardScaler` to scale inputs (recommended for neural nets).
- The PyTorch implementation supports dropout and logs all metrics to TensorBoard and optionally MLflow.
- Enhanced with comprehensive logging, error handling, and resource monitoring.

You can run this notebook end-to-end; change the parameter grids near the top to expand or narrow the search.


In [None]:
# Enhanced imports with logging and monitoring
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product
from datetime import datetime
import logging
import traceback
import psutil
import yaml
from tqdm import tqdm
import json

# ML imports
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, 
                           precision_score, recall_score, f1_score, precision_recall_fscore_support)

# Set up enhanced logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('experiment.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Configuration
OUTDIR = 'experiment_outputs'
os.makedirs(OUTDIR, exist_ok=True)
os.makedirs(os.path.join(OUTDIR, 'models'), exist_ok=True)
os.makedirs(os.path.join(OUTDIR, 'tb_logs'), exist_ok=True)
os.makedirs(os.path.join(OUTDIR, 'sklearn_models'), exist_ok=True)
os.makedirs(os.path.join(OUTDIR, 'pytorch_models'), exist_ok=True)
os.makedirs(os.path.join(OUTDIR, 'metrics'), exist_ok=True)
os.makedirs(os.path.join(OUTDIR, 'confusion_matrices'), exist_ok=True)

# Experiment configuration
use_pytorch = True  # Set to False to skip PyTorch experiments
use_mlflow = True  # Set to True to enable MLflow logging
use_cross_validation = True  # Set to True to enable 5-fold cross-validation
n_folds = 5  # Number of cross-validation folds
data_path = '/mnt/data/sample.csv'  # Change this to your CSV path

logger.info(f"Starting ML experiments at {datetime.now()}")
logger.info(f"Output directory: {OUTDIR}")
logger.info(f"PyTorch enabled: {use_pytorch}")
logger.info(f"MLflow enabled: {use_mlflow}")
logger.info(f"Cross-validation enabled: {use_cross_validation} ({n_folds} folds)")

2025-08-10 20:06:54,175 - INFO - Starting ML experiments at 2025-08-10 20:06:54.175088
2025-08-10 20:06:54,177 - INFO - Output directory: experiment_outputs
2025-08-10 20:06:54,178 - INFO - PyTorch enabled: True
2025-08-10 20:06:54,179 - INFO - MLflow enabled: True


In [None]:
# Helper function for comprehensive metrics calculation
def calculate_comprehensive_metrics(y_true, y_pred, model_name, experiment_id, model_type='sklearn'):
    """Calculate and save comprehensive metrics for a model"""
    
    # Calculate all metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
    recall_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    
    precision_weighted = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall_weighted = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    
    # Per-class metrics
    precision_per_class, recall_per_class, f1_per_class, support = precision_recall_fscore_support(
        y_true, y_pred, zero_division=0)
    
    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    
    # Create comprehensive metrics dictionary
    metrics = {
        'model_name': model_name,
        'experiment_id': experiment_id,
        'model_type': model_type,
        'accuracy': accuracy,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro,
        'precision_weighted': precision_weighted,
        'recall_weighted': recall_weighted,
        'f1_weighted': f1_weighted,
        'confusion_matrix': cm.tolist(),
        'per_class_metrics': {
            'precision': precision_per_class.tolist(),
            'recall': recall_per_class.tolist(),
            'f1_score': f1_per_class.tolist(),
            'support': support.tolist()
        }
    }
    
    # Save metrics to JSON
    metrics_file = os.path.join(OUTDIR, 'metrics', f'{model_type}_model_{experiment_id}_metrics.json')
    with open(metrics_file, 'w') as f:
        json.dump(metrics, f, indent=2)
    
    # Create and save confusion matrix plot
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=[f'Class {i}' for i in range(cm.shape[1])],
                yticklabels=[f'Class {i}' for i in range(cm.shape[0])])
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    
    cm_file = os.path.join(OUTDIR, 'confusion_matrices', f'{model_type}_model_{experiment_id}_confusion_matrix.png')
    plt.savefig(cm_file, dpi=300, bbox_inches='tight')
    plt.close()
    
    # Generate classification report
    class_report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    report_file = os.path.join(OUTDIR, 'metrics', f'{model_type}_model_{experiment_id}_classification_report.json')
    with open(report_file, 'w') as f:
        json.dump(class_report, f, indent=2)
    
    logger.info(f"Comprehensive metrics saved for {model_name} (ID: {experiment_id})")
    
    return metrics

# Load and prepare data
try:
    logger.info(f"Loading data from {data_path}")
    df = pd.read_csv("training_data.csv")
    logger.info(f"Data loaded successfully. Shape: {df.shape}")
    logger.info(f"Columns: {list(df.columns)}")
    
    # Display basic info about the dataset
    print("Dataset Info:")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"Missing values: {df.isnull().sum().sum()}")
    
except Exception as e:
    logger.error(f"Failed to load data: {str(e)}")
    logger.error(traceback.format_exc())
    raise

# Prepare features and target
target_col = 'risk_level'  # Change this if your target column has a different name

try:
    if target_col not in df.columns:
        logger.warning(f"Target column '{target_col}' not found. Available columns: {list(df.columns)}")
        target_col = input("Please enter the correct target column name: ")
    
    X = df.drop(columns=[target_col])
    y = df[target_col]
    
    # Handle categorical target if needed
    if y.dtype == 'object':
        logger.info(f"Converting categorical target. Unique values: {y.unique()}")
        if set(y.unique()).issubset({'low', 'medium', 'high'}):
            y = y.map({'low': 0, 'medium': 1, 'high': 2})
            logger.info("Mapped risk levels: low->0, medium->1, high->2")
        else:
            # Use label encoding for other categorical targets
            from sklearn.preprocessing import LabelEncoder
            le = LabelEncoder()
            y = le.fit_transform(y)
            logger.info(f"Label encoded target. Classes: {le.classes_}")
    
    logger.info(f"Target distribution: {np.bincount(y)}")
    
except Exception as e:
    logger.error(f"Error preparing target variable: {str(e)}")
    logger.error(traceback.format_exc())
    raise

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
logger.info(f"Train-test split completed. Train: {X_train.shape}, Test: {X_test.shape}")

# Scale features
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)
logger.info("Features scaled using StandardScaler")

2025-08-10 20:06:54,188 - INFO - Loading data from /mnt/data/sample.csv
2025-08-10 20:06:54,279 - INFO - Data loaded successfully. Shape: (25000, 69)
2025-08-10 20:06:54,280 - INFO - Columns: ['has_dental_data', 'has_dietary_data', 'sa_citizen', 'special_needs', 'caregiver_treatment', 'appliance', 'plaque', 'dry_mouth', 'enamel_defects', 'fluoride_water', 'fluoride_toothpaste', 'topical_fluoride', 'regular_checkups', 'sealed_pits', 'restorative_procedures', 'enamel_change', 'dentin_discoloration', 'white_spot_lesions', 'cavitated_lesions', 'multiple_restorations', 'missing_teeth', 'total_dmft_score', 'sweet_sugary_foods', 'sweet_sugary_foods_bedtime', 'takeaways_processed_foods', 'fresh_fruit', 'fresh_fruit_bedtime', 'cold_drinks_juices', 'cold_drinks_juices_bedtime', 'processed_fruit', 'processed_fruit_bedtime', 'spreads', 'spreads_bedtime', 'added_sugars', 'added_sugars_bedtime', 'salty_snacks', 'dairy_products', 'vegetables', 'water', 'sweet_sugary_foods_daily', 'sweet_sugary_foods_

Dataset Info:
Shape: (25000, 69)
Columns: ['has_dental_data', 'has_dietary_data', 'sa_citizen', 'special_needs', 'caregiver_treatment', 'appliance', 'plaque', 'dry_mouth', 'enamel_defects', 'fluoride_water', 'fluoride_toothpaste', 'topical_fluoride', 'regular_checkups', 'sealed_pits', 'restorative_procedures', 'enamel_change', 'dentin_discoloration', 'white_spot_lesions', 'cavitated_lesions', 'multiple_restorations', 'missing_teeth', 'total_dmft_score', 'sweet_sugary_foods', 'sweet_sugary_foods_bedtime', 'takeaways_processed_foods', 'fresh_fruit', 'fresh_fruit_bedtime', 'cold_drinks_juices', 'cold_drinks_juices_bedtime', 'processed_fruit', 'processed_fruit_bedtime', 'spreads', 'spreads_bedtime', 'added_sugars', 'added_sugars_bedtime', 'salty_snacks', 'dairy_products', 'vegetables', 'water', 'sweet_sugary_foods_daily', 'sweet_sugary_foods_weekly', 'sweet_sugary_foods_timing', 'takeaways_processed_foods_daily', 'takeaways_processed_foods_weekly', 'fresh_fruit_daily', 'fresh_fruit_weekly'

In [3]:
# Check GPU availability
try:
    import torch
    gpu_available = torch.cuda.is_available()
    print(f"CUDA available: {gpu_available}")
    if gpu_available:
        print(f"GPU device: {torch.cuda.get_device_name(0)}")
        print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    else:
        print("No GPU available - will use CPU for PyTorch experiments")
except ImportError:
    print("PyTorch not installed - only scikit-learn experiments will run (CPU only)")

print(f"\nNote: Scikit-learn experiments always use CPU only")
print(f"PyTorch experiments will use {'GPU' if gpu_available else 'CPU'}")

CUDA available: True
GPU device: NVIDIA GeForce RTX 4050 Laptop GPU
GPU memory: 6.0 GB

Note: Scikit-learn experiments always use CPU only
PyTorch experiments will use GPU


## PyTorch GPU Installation

Based on your NVIDIA GeForce RTX 4050 with CUDA 12.9, install PyTorch with GPU support using one of these commands:

### Option 1: Latest PyTorch with CUDA 12.1 (Recommended)
```bash
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
```

### Option 2: Latest PyTorch with CUDA 11.8 (Alternative)
```bash
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
```

### Option 3: CPU-only version (if GPU setup fails)
```bash
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
```

**Note:** The CUDA 12.1 version should work with your CUDA 12.9 driver (backwards compatible).

Run the cell below to check if GPU is detected after installation.

In [4]:
# Parameter grids
sklearn_param_grid = {
    'hidden_layer_sizes': [(48,), (64,), (96,) (64, 32), (96, 48)],
    'solver': ['adam', 'lbfgs'],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'max_iter': [200, 500, 1000]
}

pytorch_param_grid = {
    'hidden_sizes': [[48],[64], [96], [64, 32], [96, 48]],
    'learning_rate': [0.001, 0.01, 0.1],
    'dropout': [0.0, 0.2, 0.5],
    'batch_size': [32, 64, 128],
    'epochs': [50, 100, 150]
}

# Save experiment configuration
experiment_config = {
    'sklearn_param_grid': sklearn_param_grid,
    'pytorch_param_grid': pytorch_param_grid,
    'data_path': data_path,
    'target_column': target_col,
    'timestamp': datetime.now().isoformat(),
    'train_size': X_train.shape[0],
    'test_size': X_test.shape[0],
    'n_features': X_train.shape[1],
    'n_classes': len(np.unique(y)),
    'use_pytorch': use_pytorch,
    'use_mlflow': use_mlflow
}

with open(os.path.join(OUTDIR, 'experiment_config.yaml'), 'w') as f:
    yaml.dump(experiment_config, f, default_flow_style=False)

logger.info(f"Experiment configuration saved to {os.path.join(OUTDIR, 'experiment_config.yaml')}")
print(f"Total sklearn combinations: {len(list(product(*sklearn_param_grid.values())))}")
if use_pytorch:
    print(f"Total PyTorch combinations: {len(list(product(*pytorch_param_grid.values())))}")

2025-08-10 20:07:10,313 - INFO - Experiment configuration saved to experiment_outputs\experiment_config.yaml


Total sklearn combinations: 60
Total PyTorch combinations: 405


In [None]:
# Scikit-learn MLP experiments with Cross-Validation and Comprehensive Metrics
sklearn_results = []
combos = list(product(*sklearn_param_grid.values()))
keys = list(sklearn_param_grid.keys())

logger.info(f"Starting {len(combos)} scikit-learn experiments")
if use_cross_validation:
    logger.info(f"Using {n_folds}-fold cross-validation (this will increase training time by ~{n_folds}x)")

# Combine train and test data for cross-validation
X_full = np.vstack([X_train_s, X_test_s])
y_full = np.concatenate([y_train, y_test])

# Set up cross-validation
if use_cross_validation:
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    cv_splits = list(cv.split(X_full, y_full))

best_sklearn_model = None
best_sklearn_score = 0.0

for i, combo in enumerate(tqdm(combos, desc="Scikit-learn experiments")):
    try:
        params = dict(zip(keys, combo))
        logger.debug(f"Testing sklearn params: {params}")
        
        # Log system resources
        memory_usage = psutil.virtual_memory().percent
        cpu_usage = psutil.cpu_percent(interval=1)
        
        start_time = datetime.now()
        
        if use_cross_validation:
            # Cross-validation approach
            clf = MLPClassifier(**params, random_state=42)
            cv_scores = cross_val_score(clf, X_full, y_full, cv=cv, scoring='accuracy')
            
            # Train on full dataset to get other metrics
            clf.fit(X_full, y_full)
            
            # Calculate metrics on original train/test split for comparison
            train_pred = clf.predict(X_train_s)
            test_pred = clf.predict(X_test_s)
            train_acc = accuracy_score(y_train, train_pred)
            test_acc = accuracy_score(y_test, test_pred)
            
            cv_mean = cv_scores.mean()
            cv_std = cv_scores.std()
            
        else:
            # Original single train/test split approach
            clf = MLPClassifier(**params, random_state=42)
            clf.fit(X_train_s, y_train)
            
            train_pred = clf.predict(X_train_s)
            test_pred = clf.predict(X_test_s)
            train_acc = accuracy_score(y_train, train_pred)
            test_acc = accuracy_score(y_test, test_pred)
            
            cv_mean = None
            cv_std = None
        
        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()
        
        # Calculate comprehensive metrics for test set
        model_name = f"sklearn_mlp_{i+1}"
        test_metrics = calculate_comprehensive_metrics(
            y_test, test_pred, model_name, i+1, 'sklearn'
        )
        
        # Calculate comprehensive metrics for training set
        train_metrics = calculate_comprehensive_metrics(
            y_train, train_pred, f"{model_name}_train", f"{i+1}_train", 'sklearn'
        )
        
        # Save model
        import joblib
        model_path = os.path.join(OUTDIR, 'sklearn_models', f'sklearn_model_{i+1}.joblib')
        joblib.dump(clf, model_path)
        
        # Track best model
        score_for_comparison = cv_mean if use_cross_validation else test_acc
        if score_for_comparison > best_sklearn_score:
            best_sklearn_score = score_for_comparison
            best_sklearn_model = {
                'model': clf,
                'experiment_id': i+1,
                'params': params,
                'test_metrics': test_metrics,
                'train_metrics': train_metrics,
                'model_path': model_path
            }
        
        result = {
            'experiment_id': i + 1,
            'params': params,
            'train_acc': train_acc,
            'test_acc': test_acc,
            'train_precision_macro': train_metrics['precision_macro'],
            'train_recall_macro': train_metrics['recall_macro'],
            'train_f1_macro': train_metrics['f1_macro'],
            'test_precision_macro': test_metrics['precision_macro'],
            'test_recall_macro': test_metrics['recall_macro'],
            'test_f1_macro': test_metrics['f1_macro'],
            'test_precision_weighted': test_metrics['precision_weighted'],
            'test_recall_weighted': test_metrics['recall_weighted'],
            'test_f1_weighted': test_metrics['f1_weighted'],
            'cv_mean_acc': cv_mean,
            'cv_std_acc': cv_std,
            'n_iter': clf.n_iter_,
            'loss_curve': clf.loss_curve_ if hasattr(clf, 'loss_curve_') else None,
            'duration_seconds': duration,
            'memory_usage_percent': memory_usage,
            'cpu_usage_percent': cpu_usage,
            'timestamp': start_time.isoformat(),
            'model_path': model_path
        }
        
        sklearn_results.append(result)
        
        if use_cross_validation:
            logger.info(f"Sklearn experiment {i+1}/{len(combos)} completed. CV mean: {cv_mean:.4f}±{cv_std:.4f}, Test F1: {test_metrics['f1_macro']:.4f}")
        else:
            logger.info(f"Sklearn experiment {i+1}/{len(combos)} completed. Train acc: {train_acc:.4f}, Test F1: {test_metrics['f1_macro']:.4f}")
        
    except Exception as e:
        logger.error(f"Sklearn experiment {i+1} failed with params {params}: {str(e)}")
        logger.error(traceback.format_exc())
        continue

logger.info(f"Completed {len(sklearn_results)} successful scikit-learn experiments")

# Save best sklearn model with special designation
if best_sklearn_model:
    best_model_dir = os.path.join(OUTDIR, 'best_models')
    os.makedirs(best_model_dir, exist_ok=True)
    
    # Save best model
    best_model_path = os.path.join(best_model_dir, 'best_sklearn_model.joblib')
    joblib.dump(best_sklearn_model['model'], best_model_path)
    
    # Save scaler for the best model
    best_scaler_path = os.path.join(best_model_dir, 'best_sklearn_scaler.joblib')
    joblib.dump(scaler, best_scaler_path)
    
    # Save feature names for the best model
    feature_names = X.columns.tolist() if hasattr(X, 'columns') else [f'feature_{i}' for i in range(X.shape[1])]
    best_features_path = os.path.join(best_model_dir, 'best_sklearn_feature_names.joblib')
    joblib.dump(feature_names, best_features_path)
    
    # Save best model metrics with special naming
    best_test_metrics = calculate_comprehensive_metrics(
        y_test, best_sklearn_model['model'].predict(X_test_s), 
        'BEST_sklearn_model', 'BEST', 'sklearn_best'
    )
    
    # Save best model info
    best_info = {
        'experiment_id': best_sklearn_model['experiment_id'],
        'params': best_sklearn_model['params'],
        'metrics': best_test_metrics,
        'model_path': best_model_path,
        'scaler_path': best_scaler_path,
        'feature_names_path': best_features_path,
        'timestamp': datetime.now().isoformat()
    }
    
    with open(os.path.join(best_model_dir, 'best_sklearn_model_info.json'), 'w') as f:
        json.dump(best_info, f, indent=2)
    
    logger.info(f"Best sklearn model saved (Experiment {best_sklearn_model['experiment_id']}) with score: {best_sklearn_score:.4f}")

# Save sklearn results
if sklearn_results:
    sklearn_df = pd.DataFrame([
        {
            'experiment_id': r['experiment_id'],
            'hidden_layer_sizes': str(r['params']['hidden_layer_sizes']),
            'solver': r['params']['solver'],
            'learning_rate_init': r['params']['learning_rate_init'],
            'max_iter': r['params']['max_iter'],
            'train_acc': r['train_acc'],
            'test_acc': r['test_acc'],
            'train_f1_macro': r['train_f1_macro'],
            'test_f1_macro': r['test_f1_macro'],
            'test_precision_macro': r['test_precision_macro'],
            'test_recall_macro': r['test_recall_macro'],
            'test_f1_weighted': r['test_f1_weighted'],
            'cv_mean_acc': r['cv_mean_acc'],
            'cv_std_acc': r['cv_std_acc'],
            'n_iter': r['n_iter'],
            'duration_seconds': r['duration_seconds'],
            'memory_usage_percent': r['memory_usage_percent'],
            'cpu_usage_percent': r['cpu_usage_percent'],
            'timestamp': r['timestamp'],
            'model_path': r['model_path']
        } for r in sklearn_results
    ])
    sklearn_df.to_csv(os.path.join(OUTDIR, 'sklearn_results_summary.csv'), index=False)
    logger.info("Scikit-learn results saved to sklearn_results_summary.csv")
    
    # Display best results
    if use_cross_validation:
        best_sklearn = sklearn_df.loc[sklearn_df['cv_mean_acc'].idxmax()]
        print(f"\nBest scikit-learn result (by CV score):")
        print(f"CV accuracy: {best_sklearn['cv_mean_acc']:.4f}±{best_sklearn['cv_std_acc']:.4f}")
        print(f"Test accuracy: {best_sklearn['test_acc']:.4f}")
        print(f"Test F1-macro: {best_sklearn['test_f1_macro']:.4f}")
        print(f"Test Precision-macro: {best_sklearn['test_precision_macro']:.4f}")
        print(f"Test Recall-macro: {best_sklearn['test_recall_macro']:.4f}")
        print(f"Parameters: hidden_sizes={best_sklearn['hidden_layer_sizes']}, solver={best_sklearn['solver']}, lr={best_sklearn['learning_rate_init']}")
    else:
        best_sklearn = sklearn_df.loc[sklearn_df['test_acc'].idxmax()]
        print(f"\nBest scikit-learn result:")
        print(f"Test accuracy: {best_sklearn['test_acc']:.4f}")
        print(f"Test F1-macro: {best_sklearn['test_f1_macro']:.4f}")
        print(f"Test Precision-macro: {best_sklearn['test_precision_macro']:.4f}")
        print(f"Test Recall-macro: {best_sklearn['test_recall_macro']:.4f}")
        print(f"Parameters: hidden_sizes={best_sklearn['hidden_layer_sizes']}, solver={best_sklearn['solver']}, lr={best_sklearn['learning_rate_init']}")

2025-08-10 19:27:54,553 - INFO - Starting 60 scikit-learn experiments
2025-08-10 19:28:01,792 - INFO - Sklearn experiment 1/60 completed. Train acc: 1.0000, Test acc: 0.9942                             | 0/60 [00:00<?, ?it/s]
2025-08-10 19:28:08,585 - INFO - Sklearn experiment 2/60 completed. Train acc: 1.0000, Test acc: 0.9942                     | 1/60 [00:07<07:06,  7.23s/it]
2025-08-10 19:28:12,364 - INFO - Sklearn experiment 3/60 completed. Train acc: 1.0000, Test acc: 0.9942                     | 2/60 [00:14<06:44,  6.97s/it]
2025-08-10 19:28:16,155 - INFO - Sklearn experiment 4/60 completed. Train acc: 1.0000, Test acc: 0.9942                     | 3/60 [00:17<05:14,  5.52s/it]
2025-08-10 19:28:18,413 - INFO - Sklearn experiment 5/60 completed. Train acc: 0.9798, Test acc: 0.9764                     | 4/60 [00:21<04:30,  4.83s/it]
2025-08-10 19:28:20,661 - INFO - Sklearn experiment 6/60 completed. Train acc: 0.9798, Test acc: 0.9764                     | 5/60 [00:23<03:34,  3.91


Best scikit-learn result:
Test accuracy: 0.9942
Parameters: hidden_sizes=(50,), solver=adam, lr=0.001


In [5]:
!pip install tensorboard



In [6]:
# Add this cell to check PyTorch installation
try:
    import torch
    print(f"✅ PyTorch installed: {torch.__version__}")
    print(f"✅ CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"✅ GPU device: {torch.cuda.get_device_name(0)}")
    else:
        print("⚠️  CUDA not available - will use CPU")
except ImportError as e:
    print(f"❌ PyTorch not installed: {e}")
    print("Install with: pip install torch torchvision torchaudio")
except Exception as e:
    print(f"❌ PyTorch error: {e}")

✅ PyTorch installed: 2.8.0+cu128
✅ CUDA available: True
✅ GPU device: NVIDIA GeForce RTX 4050 Laptop GPU


In [7]:
# PyTorch experiments setup
if use_pytorch:
    try:
        import torch
        import torch.nn as nn
        import torch.optim as optim
        from torch.utils.data import TensorDataset, DataLoader
        from torch.utils.tensorboard import SummaryWriter
        
        # MLflow setup (optional)
        if use_mlflow:
            import mlflow
            import mlflow.pytorch
            mlflow.set_experiment("MLP_Hyperparameter_Search")
            logger.info("MLflow experiment tracking enabled")
        
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        logger.info(f'PyTorch device: {device}')
        
        if torch.cuda.is_available():
            logger.info(f'CUDA device: {torch.cuda.get_device_name(0)}')
            logger.info(f'CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB')
        
        def build_pytorch_model(input_size, hidden_sizes, dropout=0.0, n_classes=None):
            layers = []
            in_size = input_size
            for h in hidden_sizes:
                layers.append(nn.Linear(in_size, h))
                layers.append(nn.ReLU())
                if dropout and dropout > 0.0:
                    layers.append(nn.Dropout(dropout))
                in_size = h
            layers.append(nn.Linear(in_size, n_classes))
            return nn.Sequential(*layers)
        
        # Prepare PyTorch tensors
        X_train_t = torch.FloatTensor(X_train_s)
        y_train_t = torch.LongTensor(y_train.values if hasattr(y_train, 'values') else y_train)
        X_test_t = torch.FloatTensor(X_test_s)
        y_test_t = torch.LongTensor(y_test.values if hasattr(y_test, 'values') else y_test)
        
        logger.info("PyTorch setup completed successfully")
        
    except ImportError as e:
        logger.error(f"PyTorch import failed: {str(e)}")
        use_pytorch = False
    except Exception as e:
        logger.error(f"PyTorch setup failed: {str(e)}")
        logger.error(traceback.format_exc())
        use_pytorch = False
else:
    logger.info("PyTorch experiments disabled")

2025/08/10 20:07:44 INFO mlflow.tracking.fluent: Experiment with name 'MLP_Hyperparameter_Search' does not exist. Creating a new experiment.
2025-08-10 20:07:44,067 - INFO - MLflow experiment tracking enabled
2025-08-10 20:07:44,068 - INFO - PyTorch device: cuda
2025-08-10 20:07:44,068 - INFO - CUDA device: NVIDIA GeForce RTX 4050 Laptop GPU
2025-08-10 20:07:44,069 - INFO - CUDA memory: 6.0 GB
2025-08-10 20:07:44,076 - INFO - PyTorch setup completed successfully


In [None]:
# Enhanced PyTorch experiments with Cross-Validation and Comprehensive Metrics
if use_pytorch:
    pytorch_results = []
    combos = list(product(*pytorch_param_grid.values()))
    keys = list(pytorch_param_grid.keys())
    
    logger.info(f"Starting {len(combos)} PyTorch experiments")
    if use_cross_validation:
        logger.info(f"Using {n_folds}-fold cross-validation (this will increase training time by ~{n_folds}x)")
    
    run_counter = 0
    best_test_acc = 0.0
    best_pytorch_model = None
    
    # Prepare full dataset for cross-validation
    X_full_t = torch.FloatTensor(X_full)
    y_full_t = torch.LongTensor(y_full)
    
    for combo in tqdm(combos, desc="PyTorch experiments"):
        run_counter += 1
        
        try:
            params = dict(zip(keys, combo))
            logger.info(f"\nPyTorch experiment {run_counter}/{len(combos)}: {params}")
            
            # MLflow run start
            if use_mlflow:
                mlflow.start_run()
                mlflow.log_params(params)
            
            # Cross-validation scores storage
            cv_train_accs = []
            cv_val_accs = []
            cv_final_losses = []
            
            if use_cross_validation:
                # Cross-validation loop
                for fold, (train_idx, val_idx) in enumerate(cv_splits):
                    logger.debug(f"Training fold {fold + 1}/{n_folds}")
                    
                    # Split data for this fold
                    X_fold_train = X_full_t[train_idx]
                    y_fold_train = y_full_t[train_idx]
                    X_fold_val = X_full_t[val_idx]
                    y_fold_val = y_full_t[val_idx]
                    
                    # Model setup for this fold
                    input_size = X_full.shape[1]
                    n_classes = len(np.unique(y_full))
                    model = build_pytorch_model(
                        input_size, 
                        params['hidden_sizes'], 
                        dropout=params['dropout'], 
                        n_classes=n_classes
                    ).to(device)
                    
                    criterion = nn.CrossEntropyLoss()
                    optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])
                    
                    # Data loader for this fold
                    fold_train_ds = TensorDataset(X_fold_train, y_fold_train)
                    fold_train_loader = DataLoader(fold_train_ds, batch_size=params['batch_size'], shuffle=True)
                    
                    # Training loop for this fold
                    for epoch in range(params['epochs']):
                        model.train()
                        epoch_losses = []
                        
                        for batch_idx, (xb, yb) in enumerate(fold_train_loader):
                            xb = xb.to(device)
                            yb = yb.to(device)
                            
                            optimizer.zero_grad()
                            out = model(xb)
                            loss = criterion(out, yb)
                            loss.backward()
                            optimizer.step()
                            
                            epoch_losses.append(loss.item())
                    
                    # Evaluate this fold
                    model.eval()
                    with torch.no_grad():
                        # Training accuracy for this fold
                        fold_train_out = model(X_fold_train.to(device))
                        fold_train_pred = fold_train_out.argmax(dim=1).cpu().numpy()
                        fold_train_acc = accuracy_score(y_fold_train.cpu().numpy(), fold_train_pred)
                        
                        # Validation accuracy for this fold
                        fold_val_out = model(X_fold_val.to(device))
                        fold_val_pred = fold_val_out.argmax(dim=1).cpu().numpy()
                        fold_val_acc = accuracy_score(y_fold_val.cpu().numpy(), fold_val_pred)
                        fold_val_loss = criterion(fold_val_out, y_fold_val.to(device)).item()
                    
                    cv_train_accs.append(fold_train_acc)
                    cv_val_accs.append(fold_val_acc)
                    cv_final_losses.append(fold_val_loss)
                    
                    logger.debug(f"Fold {fold + 1}: train_acc={fold_train_acc:.4f}, val_acc={fold_val_acc:.4f}")
                
                # Calculate cross-validation statistics
                cv_train_mean = np.mean(cv_train_accs)
                cv_train_std = np.std(cv_train_accs)
                cv_val_mean = np.mean(cv_val_accs)
                cv_val_std = np.std(cv_val_accs)
                cv_loss_mean = np.mean(cv_final_losses)
            
            else:
                # No cross-validation
                cv_train_mean = cv_train_std = cv_val_mean = cv_val_std = cv_loss_mean = None
            
            # Train final model on original train/test split
            model = build_pytorch_model(
                X_train_s.shape[1], 
                params['hidden_sizes'], 
                dropout=params['dropout'], 
                n_classes=len(np.unique(y_train))
            ).to(device)
            
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])
            
            train_ds = TensorDataset(X_train_t, y_train_t)
            train_loader = DataLoader(train_ds, batch_size=params['batch_size'], shuffle=True)
            
            # Setup logging for final model
            log_dir = os.path.join(OUTDIR, f'tb_logs/run_{run_counter}')
            writer = SummaryWriter(log_dir=log_dir)
            
            train_losses = []
            val_losses = []
            val_accuracies = []
            start_time = datetime.now()
            
            # Training loop for final model
            for epoch in range(params['epochs']):
                model.train()
                epoch_losses = []
                
                for batch_idx, (xb, yb) in enumerate(train_loader):
                    xb = xb.to(device)
                    yb = yb.to(device)
                    
                    optimizer.zero_grad()
                    out = model(xb)
                    loss = criterion(out, yb)
                    loss.backward()
                    optimizer.step()
                    
                    epoch_losses.append(loss.item())
                
                avg_train_loss = np.mean(epoch_losses)
                train_losses.append(avg_train_loss)
                
                # Validation every 5 epochs
                if epoch % 5 == 0 or epoch == params['epochs'] - 1:
                    model.eval()
                    with torch.no_grad():
                        val_out = model(X_test_t.to(device))
                        val_loss = criterion(val_out, y_test_t.to(device))
                        val_pred = val_out.argmax(dim=1).cpu().numpy()
                        val_acc = accuracy_score(y_test, val_pred)
                    
                    val_losses.append(val_loss.item())
                    val_accuracies.append(val_acc)
                    
                    # TensorBoard logging
                    writer.add_scalar('Loss/train', avg_train_loss, epoch)
                    writer.add_scalar('Loss/validation', val_loss.item(), epoch)
                    writer.add_scalar('Accuracy/validation', val_acc, epoch)
                    
                    # MLflow logging
                    if use_mlflow:
                        mlflow.log_metric("train_loss", avg_train_loss, step=epoch)
                        mlflow.log_metric("val_loss", val_loss.item(), step=epoch)
                        mlflow.log_metric("val_accuracy", val_acc, step=epoch)
                    
                    # System resource monitoring
                    memory_usage = psutil.virtual_memory().percent
                    if torch.cuda.is_available():
                        gpu_memory = torch.cuda.memory_allocated() / 1024**3
                        writer.add_scalar('System/gpu_memory_gb', gpu_memory, epoch)
                    writer.add_scalar('System/memory_usage_percent', memory_usage, epoch)
                
                # Progress reporting
                if (epoch + 1) % 20 == 0 or epoch == 0:
                    logger.debug(f"Epoch {epoch+1}/{params['epochs']}: train_loss={avg_train_loss:.4f}")
            
            # Final evaluation
            model.eval()
            with torch.no_grad():
                out_train = model(X_train_t.to(device))
                pred_train = out_train.argmax(dim=1).cpu().numpy()
                out_test = model(X_test_t.to(device))
                pred_test = out_test.argmax(dim=1).cpu().numpy()
            
            train_acc = accuracy_score(y_train, pred_train)
            test_acc = accuracy_score(y_test, pred_test)
            
            end_time = datetime.now()
            duration = (end_time - start_time).total_seconds()
            
            # Calculate comprehensive metrics
            model_name = f"pytorch_mlp_{run_counter}"
            test_metrics = calculate_comprehensive_metrics(
                y_test, pred_test, model_name, run_counter, 'pytorch'
            )
            
            train_metrics = calculate_comprehensive_metrics(
                y_train, pred_train, f"{model_name}_train", f"{run_counter}_train", 'pytorch'
            )
            
            # Save model
            model_path = os.path.join(OUTDIR, 'pytorch_models', f'pytorch_model_{run_counter}.pth')
            torch.save({
                'model_state_dict': model.state_dict(),
                'params': params,
                'test_acc': test_acc,
                'test_metrics': test_metrics,
                'cv_val_mean': cv_val_mean if use_cross_validation else None,
                'run': run_counter
            }, model_path)
            
            # Track best model
            score_for_comparison = cv_val_mean if use_cross_validation else test_acc
            if score_for_comparison and score_for_comparison > best_test_acc:
                best_test_acc = score_for_comparison
                best_pytorch_model = {
                    'model': model,
                    'experiment_id': run_counter,
                    'params': params,
                    'test_metrics': test_metrics,
                    'train_metrics': train_metrics,
                    'model_path': model_path
                }
            
            # Log cross-validation metrics to MLflow
            if use_mlflow and use_cross_validation:
                mlflow.log_metric("cv_train_mean", cv_train_mean)
                mlflow.log_metric("cv_train_std", cv_train_std)
                mlflow.log_metric("cv_val_mean", cv_val_mean)
                mlflow.log_metric("cv_val_std", cv_val_std)
                mlflow.log_metric("final_train_acc", train_acc)
                mlflow.log_metric("final_test_acc", test_acc)
            
            # Store results
            result = {
                'run': run_counter,
                'params': params,
                'train_acc': train_acc,
                'test_acc': test_acc,
                'train_precision_macro': train_metrics['precision_macro'],
                'train_recall_macro': train_metrics['recall_macro'],
                'train_f1_macro': train_metrics['f1_macro'],
                'test_precision_macro': test_metrics['precision_macro'],
                'test_recall_macro': test_metrics['recall_macro'],
                'test_f1_macro': test_metrics['f1_macro'],
                'test_precision_weighted': test_metrics['precision_weighted'],
                'test_recall_weighted': test_metrics['recall_weighted'],
                'test_f1_weighted': test_metrics['f1_weighted'],
                'cv_train_mean': cv_train_mean,
                'cv_train_std': cv_train_std,
                'cv_val_mean': cv_val_mean,
                'cv_val_std': cv_val_std,
                'cv_loss_mean': cv_loss_mean,
                'final_train_loss': train_losses[-1] if train_losses else None,
                'final_val_loss': val_losses[-1] if val_losses else None,
                'duration_seconds': duration,
                'timestamp': start_time.isoformat(),
                'model_path': model_path
            }
            
            pytorch_results.append(result)
            
            # Confusion matrix logging
            cm = confusion_matrix(y_test, pred_test)
            fig, ax = plt.subplots(figsize=(8, 6))
            sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap='Blues')
            ax.set_title(f'Confusion Matrix - Run {run_counter}')
            ax.set_xlabel('Predicted')
            ax.set_ylabel('Actual')
            writer.add_figure('Confusion_Matrix', fig, global_step=run_counter)
            plt.close(fig)
            
            # Training loss curve plot
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
            
            ax1.plot(train_losses, label='Training Loss')
            if val_losses:
                val_epochs = list(range(0, params['epochs'], 5)) + [params['epochs'] - 1]
                ax1.plot(val_epochs[:len(val_losses)], val_losses, label='Validation Loss', marker='o')
            ax1.set_title(f"Loss Curves - Run {run_counter}")
            ax1.set_xlabel('Epoch')
            ax1.set_ylabel('Loss')
            ax1.legend()
            ax1.grid(True)
            
            if val_accuracies:
                ax2.plot(val_epochs[:len(val_accuracies)], val_accuracies, label='Validation Accuracy', marker='o', color='green')
                ax2.set_title(f"Validation Accuracy - Run {run_counter}")
                ax2.set_xlabel('Epoch')
                ax2.set_ylabel('Accuracy')
                ax2.legend()
                ax2.grid(True)
            
            plt.tight_layout()
            
            # Save plot
            fname = f"pytorch_metrics_run_{run_counter}.png"
            plt.savefig(os.path.join(OUTDIR, fname), dpi=300, bbox_inches='tight')
            plt.close()
            
            # Progress logging
            if use_cross_validation:
                logger.info(f"PyTorch experiment {run_counter}/{len(combos)} completed. CV val: {cv_val_mean:.4f}±{cv_val_std:.4f}, Test F1: {test_metrics['f1_macro']:.4f}")
            else:
                logger.info(f"PyTorch experiment {run_counter}/{len(combos)} completed. Train acc: {train_acc:.4f}, Test F1: {test_metrics['f1_macro']:.4f}")
            
            # Clean up
            writer.close()
            if use_mlflow:
                mlflow.end_run()
            
            # Memory cleanup
            del model
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        
        except Exception as e:
            logger.error(f"PyTorch experiment {run_counter} failed: {str(e)}")
            logger.error(traceback.format_exc())
            if use_mlflow:
                mlflow.end_run(status="FAILED")
            continue
    
    logger.info(f"Completed {len(pytorch_results)} successful PyTorch experiments")
    
    # Save best PyTorch model with special designation
    if best_pytorch_model:
        best_model_dir = os.path.join(OUTDIR, 'best_models')
        os.makedirs(best_model_dir, exist_ok=True)
        
        # Save best model
        best_model_path = os.path.join(best_model_dir, 'best_pytorch_model.pth')
        torch.save({
            'model_state_dict': best_pytorch_model['model'].state_dict(),
            'params': best_pytorch_model['params'],
            'test_metrics': best_pytorch_model['test_metrics'],
            'run': best_pytorch_model['experiment_id']
        }, best_model_path)
        
        # Save best model metrics with special naming
        with torch.no_grad():
            best_pred = best_pytorch_model['model'](X_test_t.to(device)).argmax(dim=1).cpu().numpy()
        
        best_test_metrics = calculate_comprehensive_metrics(
            y_test, best_pred, 'BEST_pytorch_model', 'BEST', 'pytorch_best'
        )
        
        # Save best model info
        best_info = {
            'experiment_id': best_pytorch_model['experiment_id'],
            'params': best_pytorch_model['params'],
            'metrics': best_test_metrics,
            'model_path': best_model_path,
            'timestamp': datetime.now().isoformat()
        }
        
        with open(os.path.join(best_model_dir, 'best_pytorch_model_info.json'), 'w') as f:
            json.dump(best_info, f, indent=2)
        
        logger.info(f"Best PyTorch model saved (Experiment {best_pytorch_model['experiment_id']}) with score: {best_test_acc:.4f}")
    
    # Save PyTorch results
    if pytorch_results:
        pytorch_df = pd.DataFrame([
            {
                'run': r['run'],
                'hidden_sizes': str(r['params']['hidden_sizes']),
                'learning_rate': r['params']['learning_rate'],
                'dropout': r['params']['dropout'],
                'batch_size': r['params']['batch_size'],
                'epochs': r['params']['epochs'],
                'train_acc': r['train_acc'],
                'test_acc': r['test_acc'],
                'train_f1_macro': r['train_f1_macro'],
                'test_f1_macro': r['test_f1_macro'],
                'test_precision_macro': r['test_precision_macro'],
                'test_recall_macro': r['test_recall_macro'],
                'test_f1_weighted': r['test_f1_weighted'],
                'cv_train_mean': r['cv_train_mean'],
                'cv_train_std': r['cv_train_std'],
                'cv_val_mean': r['cv_val_mean'],
                'cv_val_std': r['cv_val_std'],
                'cv_loss_mean': r['cv_loss_mean'],
                'final_train_loss': r['final_train_loss'],
                'final_val_loss': r['final_val_loss'],
                'duration_seconds': r['duration_seconds'],
                'timestamp': r['timestamp'],
                'model_path': r['model_path']
            } for r in pytorch_results
        ])
        pytorch_df.to_csv(os.path.join(OUTDIR, 'pytorch_results_summary.csv'), index=False)
        logger.info("PyTorch results saved to pytorch_results_summary.csv")
        
        # Display best results
        if use_cross_validation:
            best_pytorch = pytorch_df.loc[pytorch_df['cv_val_mean'].idxmax()]
            print(f"\nBest PyTorch result (by CV score):")
            print(f"CV val accuracy: {best_pytorch['cv_val_mean']:.4f}±{best_pytorch['cv_val_std']:.4f}")
            print(f"Test accuracy: {best_pytorch['test_acc']:.4f}")
            print(f"Test F1-macro: {best_pytorch['test_f1_macro']:.4f}")
            print(f"Test Precision-macro: {best_pytorch['test_precision_macro']:.4f}")
            print(f"Test Recall-macro: {best_pytorch['test_recall_macro']:.4f}")
            print(f"Parameters: hidden_sizes={best_pytorch['hidden_sizes']}, lr={best_pytorch['learning_rate']}, dropout={best_pytorch['dropout']}")
        else:
            best_pytorch = pytorch_df.loc[pytorch_df['test_acc'].idxmax()]
            print(f"\nBest PyTorch result:")
            print(f"Test accuracy: {best_pytorch['test_acc']:.4f}")
            print(f"Test F1-macro: {best_pytorch['test_f1_macro']:.4f}")
            print(f"Test Precision-macro: {best_pytorch['test_precision_macro']:.4f}")
            print(f"Test Recall-macro: {best_pytorch['test_recall_macro']:.4f}")
            print(f"Parameters: hidden_sizes={best_pytorch['hidden_sizes']}, lr={best_pytorch['learning_rate']}, dropout={best_pytorch['dropout']}")

else:
    logger.info("PyTorch experiments skipped")

2025-08-10 20:08:23,278 - INFO - Starting 405 PyTorch experiments
2025-08-10 20:08:23,287 - INFO -                                                                                                   | 0/405 [00:00<?, ?it/s]
PyTorch experiment 1/405: {'hidden_sizes': [64], 'learning_rate': 0.001, 'dropout': 0.0, 'batch_size': 32, 'epochs': 50}
2025-08-10 20:09:17,571 - INFO - New best model saved with test accuracy: 0.9956
2025-08-10 20:09:17,572 - INFO - PyTorch experiment 1 completed. Train acc: 1.0000, Test acc: 0.9956
2025-08-10 20:09:17,644 - INFO -                                                                                         | 1/405 [00:54<6:05:59, 54.36s/it]
PyTorch experiment 2/405: {'hidden_sizes': [64], 'learning_rate': 0.001, 'dropout': 0.0, 'batch_size': 32, 'epochs': 100}
2025-08-10 20:10:48,513 - INFO - New best model saved with test accuracy: 0.9960
2025-08-10 20:10:48,514 - INFO - PyTorch experiment 2 completed. Train acc: 1.0000, Test acc: 0.9960
2025-08-10 20:1


Best PyTorch result:
Test accuracy: 0.9976
Parameters: hidden_sizes=[64], lr=0.001, dropout=0.2


In [None]:
# Comprehensive Results Analysis with All Metrics
logger.info("Starting comprehensive analysis and plotting of all metrics")

def create_comprehensive_performance_plots(df, model_type):
    """Create comprehensive performance plots for all metrics"""
    
    # Prepare data for plotting
    if 'hidden_layer_sizes' in df.columns:
        df['arch_str'] = df['hidden_layer_sizes'].apply(lambda x: str(x).replace(' ', ''))
        x_col = 'arch_str'
        hue_col = 'solver'
    else:
        df['arch_str'] = df['hidden_sizes'].apply(lambda x: str(x).replace(' ', ''))
        x_col = 'arch_str'
        hue_col = 'dropout'
    
    # Create a 2x2 subplot for different metrics
    fig, axes = plt.subplots(2, 2, figsize=(20, 15))
    fig.suptitle(f'{model_type} - Comprehensive Performance Analysis', fontsize=16)
    
    # Plot 1: Accuracy
    sns.barplot(x=x_col, y='test_acc', hue=hue_col, data=df, ax=axes[0,0], palette='viridis')
    axes[0,0].set_title('Test Accuracy')
    axes[0,0].set_xlabel('Architecture')
    axes[0,0].set_ylabel('Accuracy')
    axes[0,0].tick_params(axis='x', rotation=45)
    axes[0,0].grid(axis='y', linestyle='--', alpha=0.7)
    
    # Plot 2: F1-Score (Macro)
    sns.barplot(x=x_col, y='test_f1_macro', hue=hue_col, data=df, ax=axes[0,1], palette='plasma')
    axes[0,1].set_title('Test F1-Score (Macro)')
    axes[0,1].set_xlabel('Architecture')
    axes[0,1].set_ylabel('F1-Score')
    axes[0,1].tick_params(axis='x', rotation=45)
    axes[0,1].grid(axis='y', linestyle='--', alpha=0.7)
    
    # Plot 3: Precision (Macro)
    sns.barplot(x=x_col, y='test_precision_macro', hue=hue_col, data=df, ax=axes[1,0], palette='cividis')
    axes[1,0].set_title('Test Precision (Macro)')
    axes[1,0].set_xlabel('Architecture')
    axes[1,0].set_ylabel('Precision')
    axes[1,0].tick_params(axis='x', rotation=45)
    axes[1,0].grid(axis='y', linestyle='--', alpha=0.7)
    
    # Plot 4: Recall (Macro)
    sns.barplot(x=x_col, y='test_recall_macro', hue=hue_col, data=df, ax=axes[1,1], palette='magma')
    axes[1,1].set_title('Test Recall (Macro)')
    axes[1,1].set_xlabel('Architecture')
    axes[1,1].set_ylabel('Recall')
    axes[1,1].tick_params(axis='x', rotation=45)
    axes[1,1].grid(axis='y', linestyle='--', alpha=0.7)
    
    plt.tight_layout()
    filename = f'{model_type.lower()}_comprehensive_metrics.png'
    plt.savefig(os.path.join(OUTDIR, filename), dpi=300, bbox_inches='tight')
    plt.close()
    logger.info(f"Saved comprehensive metrics plot: {filename}")

def create_cv_comparison_plots(df, model_type):
    """Create cross-validation comparison plots if CV was used"""
    if not use_cross_validation:
        return
    
    cv_col = 'cv_mean_acc' if model_type == 'Scikit-learn' else 'cv_val_mean'
    if cv_col not in df.columns:
        return
    
    # Prepare data for plotting
    if 'hidden_layer_sizes' in df.columns:
        df['arch_str'] = df['hidden_layer_sizes'].apply(lambda x: str(x).replace(' ', ''))
        x_col = 'arch_str'
        hue_col = 'solver'
    else:
        df['arch_str'] = df['hidden_sizes'].apply(lambda x: str(x).replace(' ', ''))
        x_col = 'arch_str'
        hue_col = 'dropout'
    
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    
    # CV results with error bars
    cv_std_col = cv_col.replace('mean', 'std')
    if cv_std_col in df.columns:
        # Group data for error bars
        grouped = df.groupby([x_col, hue_col]).agg({
            cv_col: 'mean',
            cv_std_col: 'mean'
        }).reset_index()
        
        sns.barplot(x=x_col, y=cv_col, hue=hue_col, data=grouped, ax=axes[0], palette='viridis')
        
        # Add error bars
        for i, (_, row) in enumerate(grouped.iterrows()):
            axes[0].errorbar(i, row[cv_col], yerr=row[cv_std_col], 
                           fmt='none', color='black', capsize=3, alpha=0.7)
        
        axes[0].set_title(f'{model_type} - Cross-Validation Results')
        axes[0].set_xlabel('Architecture')
        axes[0].set_ylabel('CV Accuracy')
        axes[0].tick_params(axis='x', rotation=45)
        axes[0].grid(axis='y', linestyle='--', alpha=0.7)
    
    # CV vs Test correlation
    axes[1].scatter(df[cv_col], df['test_acc'], alpha=0.7)
    axes[1].plot([0, 1], [0, 1], 'r--', alpha=0.8)
    correlation = df[cv_col].corr(df['test_acc'])
    axes[1].set_xlabel('Cross-Validation Accuracy')
    axes[1].set_ylabel('Test Set Accuracy')
    axes[1].set_title(f'{model_type} CV vs Test Correlation (r={correlation:.3f})')
    axes[1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    filename = f'{model_type.lower()}_cv_analysis.png'
    plt.savefig(os.path.join(OUTDIR, filename), dpi=300, bbox_inches='tight')
    plt.close()
    logger.info(f"Saved CV analysis plot: {filename}")

# Create comprehensive plots for both model types
if 'sklearn_df' in locals() and not sklearn_df.empty:
    create_comprehensive_performance_plots(sklearn_df, 'Scikit-learn')
    create_cv_comparison_plots(sklearn_df, 'Scikit-learn')
    
    print("\n" + "="*60)
    print("SCIKIT-LEARN COMPREHENSIVE RESULTS SUMMARY")
    print("="*60)
    print(f"Total experiments: {len(sklearn_df)}")
    
    # Best results summary
    metrics_to_show = ['test_acc', 'test_f1_macro', 'test_precision_macro', 'test_recall_macro']
    if use_cross_validation:
        cv_best = sklearn_df.loc[sklearn_df['cv_mean_acc'].idxmax()]
        print(f"\nBest model (by CV accuracy):")
        print(f"  CV accuracy: {cv_best['cv_mean_acc']:.4f}±{cv_best['cv_std_acc']:.4f}")
        for metric in metrics_to_show:
            print(f"  {metric}: {cv_best[metric]:.4f}")
        print(f"  Parameters: {cv_best['hidden_layer_sizes']}, {cv_best['solver']}, lr={cv_best['learning_rate_init']}")
    
    # Overall statistics
    print(f"\nOverall Performance Statistics:")
    for metric in metrics_to_show:
        print(f"  {metric}: {sklearn_df[metric].mean():.4f}±{sklearn_df[metric].std():.4f} (best: {sklearn_df[metric].max():.4f})")

if 'pytorch_df' in locals() and not pytorch_df.empty:
    create_comprehensive_performance_plots(pytorch_df, 'PyTorch')
    create_cv_comparison_plots(pytorch_df, 'PyTorch')
    
    print("\n" + "="*60)
    print("PYTORCH COMPREHENSIVE RESULTS SUMMARY")
    print("="*60)
    print(f"Total experiments: {len(pytorch_df)}")
    
    # Best results summary
    metrics_to_show = ['test_acc', 'test_f1_macro', 'test_precision_macro', 'test_recall_macro']
    if use_cross_validation:
        cv_best = pytorch_df.loc[pytorch_df['cv_val_mean'].idxmax()]
        print(f"\nBest model (by CV accuracy):")
        print(f"  CV accuracy: {cv_best['cv_val_mean']:.4f}±{cv_best['cv_val_std']:.4f}")
        for metric in metrics_to_show:
            print(f"  {metric}: {cv_best[metric]:.4f}")
        print(f"  Parameters: {cv_best['hidden_sizes']}, lr={cv_best['learning_rate']}, dropout={cv_best['dropout']}")
    
    # Overall statistics  
    print(f"\nOverall Performance Statistics:")
    for metric in metrics_to_show:
        print(f"  {metric}: {pytorch_df[metric].mean():.4f}±{pytorch_df[metric].std():.4f} (best: {pytorch_df[metric].max():.4f})")

# Model Comparison (if both models were run)
if 'sklearn_df' in locals() and 'pytorch_df' in locals() and not sklearn_df.empty and not pytorch_df.empty:
    print("\n" + "="*60)
    print("MODEL COMPARISON SUMMARY")
    print("="*60)
    
    comparison_metrics = ['test_acc', 'test_f1_macro', 'test_precision_macro', 'test_recall_macro']
    
    print("Best Performance Comparison:")
    for metric in comparison_metrics:
        sklearn_best = sklearn_df[metric].max()
        pytorch_best = pytorch_df[metric].max()
        winner = "Scikit-learn" if sklearn_best > pytorch_best else "PyTorch"
        print(f"  {metric}: Sklearn={sklearn_best:.4f}, PyTorch={pytorch_best:.4f} → Winner: {winner}")
    
    print("\nAverage Performance Comparison:")
    for metric in comparison_metrics:
        sklearn_avg = sklearn_df[metric].mean()
        pytorch_avg = pytorch_df[metric].mean()
        winner = "Scikit-learn" if sklearn_avg > pytorch_avg else "PyTorch"
        print(f"  {metric}: Sklearn={sklearn_avg:.4f}, PyTorch={pytorch_avg:.4f} → Winner: {winner}")

# Training time analysis
print("\n" + "="*60)
print("TRAINING TIME AND COMPUTATIONAL ANALYSIS")
print("="*60)

if 'sklearn_df' in locals() and not sklearn_df.empty:
    sklearn_time = sklearn_df['duration_seconds']
    print(f"Scikit-learn:")
    print(f"  Average time per experiment: {sklearn_time.mean():.2f}s")
    print(f"  Total time: {sklearn_time.sum():.2f}s ({sklearn_time.sum()/60:.1f} minutes)")

if 'pytorch_df' in locals() and not pytorch_df.empty:
    pytorch_time = pytorch_df['duration_seconds']
    print(f"PyTorch:")
    print(f"  Average time per experiment: {pytorch_time.mean():.2f}s")
    print(f"  Total time: {pytorch_time.sum():.2f}s ({pytorch_time.sum()/60:.1f} minutes)")

if use_cross_validation:
    print(f"\nCross-validation impact:")
    print(f"  Estimated {n_folds}x time increase due to CV")
    if 'sklearn_df' in locals():
        print(f"  Sklearn estimated time without CV: {sklearn_df['duration_seconds'].mean() / n_folds:.2f}s per experiment")
    if 'pytorch_df' in locals():
        print(f"  PyTorch estimated time without CV: {pytorch_df['duration_seconds'].mean() / n_folds:.2f}s per experiment")

print("\n" + "="*60)
print("FILES AND OUTPUTS SUMMARY")
print("="*60)
print("Generated files and directories:")
print("📁 experiment_outputs/")
print("  ├── 📁 sklearn_models/ - Individual sklearn models (.joblib)")
print("  ├── 📁 pytorch_models/ - Individual PyTorch models (.pth)")
print("  ├── 📁 best_models/ - Best models from each framework")
print("  ├── 📁 metrics/ - Comprehensive metrics for each model (.json)")
print("  ├── 📁 confusion_matrices/ - Confusion matrix plots (.png)")
print("  ├── 📁 tb_logs/ - TensorBoard logs (PyTorch only)")
print("  ├── 📄 sklearn_results_summary.csv - Sklearn experiment results")
print("  ├── 📄 pytorch_results_summary.csv - PyTorch experiment results")
print("  ├── 📄 experiment_config.yaml - Experiment configuration")
print("  └── 📄 *.png - Various performance plots")

logger.info("Comprehensive analysis complete.")

2025-08-11 09:14:53,253 - INFO - Starting final analysis and plotting of results
2025-08-11 09:14:54,116 - INFO - Saved plot: pytorch_test_accuracy.png
2025-08-11 09:14:54,117 - INFO - Final analysis and plotting complete.


In [None]:
# Comprehensive Experiment Summary with All Metrics
print("="*80)
print("COMPREHENSIVE ML EXPERIMENT SUMMARY")
print("="*80)

print(f"\n🔬 EXPERIMENT CONFIGURATION:")
print(f"Cross-validation enabled: {use_cross_validation}")
if use_cross_validation:
    print(f"Number of folds: {n_folds}")
    print(f"CV strategy: StratifiedKFold (maintains class distribution)")

print(f"PyTorch experiments: {use_pytorch}")
print(f"MLflow logging: {use_mlflow}")

print(f"\n? COMPREHENSIVE METRICS CALCULATED:")
print("For each model experiment, the following metrics were calculated and saved:")
print("✅ Accuracy (train and test)")
print("✅ Precision (macro and weighted averages)")
print("✅ Recall (macro and weighted averages)")
print("✅ F1-Score (macro and weighted averages)")
print("✅ Per-class precision, recall, and F1-score")
print("✅ Confusion Matrix (numerical and visual)")
print("✅ Classification Report (detailed breakdown)")

if use_cross_validation:
    print("✅ Cross-validation statistics (mean ± std for all metrics)")

print(f"\n📁 OUTPUT STRUCTURE:")
print("experiment_outputs/")
print("├── sklearn_models/          # All sklearn models (.joblib files)")
print("├── pytorch_models/          # All PyTorch models (.pth files)")
print("├── best_models/             # Best performing models")
print("│   ├── best_sklearn_model.joblib")
print("│   ├── best_pytorch_model.pth")
print("│   ├── best_sklearn_model_info.json")
print("│   └── best_pytorch_model_info.json")
print("├── metrics/                 # Comprehensive metrics for each model")
print("│   ├── sklearn_model_*_metrics.json")
print("│   ├── pytorch_model_*_metrics.json")
print("│   ├── *_classification_report.json")
print("│   └── BEST_*_metrics.json")
print("├── confusion_matrices/      # Confusion matrix plots")
print("│   ├── sklearn_model_*_confusion_matrix.png")
print("│   ├── pytorch_model_*_confusion_matrix.png")
print("│   └── *_best_confusion_matrix.png")
print("├── tb_logs/                 # TensorBoard logs (PyTorch)")
print("├── sklearn_results_summary.csv")
print("├── pytorch_results_summary.csv")
print("├── experiment_config.yaml")
print("└── Various performance plots (.png)")

print(f"\n? PERFORMANCE EVALUATION:")
print("🎯 Model Selection Criteria:")
if use_cross_validation:
    print("  - Primary: Cross-validation accuracy (most reliable)")
    print("  - Secondary: Test set performance for final evaluation")
    print("  - Consider: F1-score for imbalanced datasets")
    print("  - Stability: Low standard deviation in CV scores")
else:
    print("  - Primary: Test set accuracy")
    print("  - Consider: F1-score, precision, and recall")

print("\n🔍 METRICS INTERPRETATION GUIDE:")
print("📊 Accuracy: Overall correctness (TP+TN)/(TP+TN+FP+FN)")
print("📊 Precision: How many predicted positives were actually positive (TP/(TP+FP))")
print("📊 Recall: How many actual positives were correctly predicted (TP/(TP+FN))")
print("📊 F1-Score: Harmonic mean of precision and recall (2*P*R/(P+R))")
print("📊 Macro avg: Unweighted average across all classes")
print("📊 Weighted avg: Average weighted by class support")

print(f"\n⚡ COMPUTATIONAL PERFORMANCE:")
if use_cross_validation:
    print(f"Cross-validation overhead: ~{n_folds}x increase in training time")
    print("Benefits: More robust model selection, statistical confidence")
    print("Trade-off: Longer computation time for better reliability")

print("\n🏆 BEST MODEL IDENTIFICATION:")
print("Best models are automatically identified and saved separately:")
print("- Saved with special 'BEST_' prefix in metrics files")
print("- Stored in dedicated 'best_models/' directory")
print("- Include complete parameter configuration and performance metrics")

print(f"\n? USAGE RECOMMENDATIONS:")
print("1. 📈 For model selection: Use CV metrics (more reliable)")
print("2. 📊 For reporting: Use test set metrics (unbiased estimate)")
print("3. 🎯 For imbalanced data: Focus on F1-score and per-class metrics")
print("4. 📉 For further analysis: Examine confusion matrices and classification reports")
print("5. 🔄 For reproducibility: All configurations saved in experiment_config.yaml")

print(f"\n📋 NEXT STEPS:")
print("✅ Load best models: Use joblib.load() for sklearn, torch.load() for PyTorch")
print("✅ Analyze metrics: Review JSON files in metrics/ directory")
print("✅ Visualize results: Check confusion matrices and performance plots")
print("✅ Compare models: Use the comprehensive summary above")
print("✅ Deploy: Best models are ready for production use")

if use_cross_validation:
    print(f"\n📊 CROSS-VALIDATION INSIGHTS:")
    print("- CV provides estimate of model generalization performance")
    print("- Lower CV standard deviation indicates more stable model")
    print("- High correlation between CV and test scores validates methodology")
    print("- Use CV scores for hyperparameter selection, test scores for final evaluation")

print(f"\n🎉 EXPERIMENT STATUS: COMPLETED SUCCESSFULLY!")
print("All models trained, evaluated, and saved with comprehensive metrics.")

# Final model count summary
total_models = 0
if 'sklearn_df' in locals():
    total_models += len(sklearn_df)
    print(f"✅ Scikit-learn models: {len(sklearn_df)}")

if 'pytorch_df' in locals():
    total_models += len(pytorch_df)
    print(f"✅ PyTorch models: {len(pytorch_df)}")

print(f"📊 Total models evaluated: {total_models}")

logger.info("All experiments completed successfully with comprehensive metrics!")