# Enhanced MLP Architecture + Hyperparameter Exploration Notebook

**What this notebook does**
- Loads a CSV file that is already *categorically encoded* (user must provide path).
- Tries a configurable range of MLP architectures and hyperparameters (hidden layers, solvers, batch sizes, learning rates, epochs, and — for PyTorch models — dropout).
- Runs experiments using both **scikit-learn MLP** (fast, good for many grid search experiments) and an optional **PyTorch MLP** (to test dropout explicitly and obtain training loss/accuracy curves).
- **Enhanced logging**: Comprehensive logging with TensorBoard, MLflow, progress tracking, error handling, and resource monitoring.
- Saves several graphs (PNG files) showing how different parameters affect training/test performance and training loss curves.
- Saves a results summary CSV for each framework (`sklearn_results_summary.csv` and `pytorch_results_summary.csv`) for later inspection.

**Notes**
- The notebook assumes the CSV includes features and a target column (default target column is `risk_level`). If your target has string labels, the notebook maps `low/medium/high` to `0/1/2`; any other set of string labels is label-encoded automatically. Adjust the mapping if needed.
- The notebook uses `StandardScaler` to scale inputs (recommended for neural nets).
- The PyTorch implementation supports dropout and logs all metrics to TensorBoard and optionally MLflow.
- Enhanced with comprehensive logging, error handling, and resource monitoring.

You can run this notebook end-to-end; change the parameter grids near the top to expand or narrow the search.


In [None]:
# Enhanced imports with logging and monitoring
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product
from datetime import datetime
import logging
import traceback
import psutil
import yaml
from tqdm import tqdm

# ML imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Logging: every record is written to 'experiment.log' and echoed to the console.
_log_handlers = [logging.FileHandler('experiment.log'), logging.StreamHandler()]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)
logger = logging.getLogger(__name__)

# Output layout: a root folder plus sub-folders for model checkpoints and
# TensorBoard event files. Existing folders are reused.
OUTDIR = 'experiment_outputs'
for _folder in (
    OUTDIR,
    os.path.join(OUTDIR, 'models'),
    os.path.join(OUTDIR, 'tb_logs'),
):
    os.makedirs(_folder, exist_ok=True)

# Experiment switches
use_pytorch = True  # Set to False to skip PyTorch experiments
use_mlflow = False  # Set to True to enable MLflow logging
data_path = '/mnt/data/sample.csv'  # Change this to your CSV path

# Record the run start and the active switches at the top of the log.
for _startup_line in (
    f"Starting ML experiments at {datetime.now()}",
    f"Output directory: {OUTDIR}",
    f"PyTorch enabled: {use_pytorch}",
    f"MLflow enabled: {use_mlflow}",
):
    logger.info(_startup_line)

In [None]:
# Load the dataset from the configured `data_path`.
# Any failure is logged with its traceback and then re-raised, because nothing
# later in the notebook can run without `df`.
try:
    logger.info(f"Loading data from {data_path}")
    # Read the file named by `data_path` (the path that is logged and stored in
    # the experiment config) rather than a hard-coded file name.
    df = pd.read_csv(data_path)
    logger.info(f"Data loaded successfully. Shape: {df.shape}")
    logger.info(f"Columns: {list(df.columns)}")

    # Display basic info about the dataset
    print("Dataset Info:")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"Missing values: {df.isnull().sum().sum()}")

except Exception as e:
    logger.error(f"Failed to load data: {str(e)}")
    logger.error(traceback.format_exc())
    raise

# Split the frame into the feature matrix X and the target vector y, turning
# string-valued targets into integer class ids.
target_col = 'risk_level'  # Change this if your target column has a different name

try:
    # Fall back to asking the user when the default column name is absent.
    if target_col not in df.columns:
        logger.warning(f"Target column '{target_col}' not found. Available columns: {list(df.columns)}")
        target_col = input("Please enter the correct target column name: ")

    X, y = df.drop(columns=[target_col]), df[target_col]

    # Handle categorical target if needed
    if y.dtype == 'object':
        logger.info(f"Converting categorical target. Unique values: {y.unique()}")
        risk_labels = {'low': 0, 'medium': 1, 'high': 2}
        if not set(y.unique()) <= set(risk_labels):
            # Labels outside low/medium/high: fall back to generic label encoding.
            from sklearn.preprocessing import LabelEncoder
            le = LabelEncoder()
            y = le.fit_transform(y)
            logger.info(f"Label encoded target. Classes: {le.classes_}")
        else:
            # Keep the natural ordering of the risk levels.
            y = y.map(risk_labels)
            logger.info("Mapped risk levels: low->0, medium->1, high->2")

    logger.info(f"Target distribution: {np.bincount(y)}")

except Exception as e:
    logger.error(f"Error preparing target variable: {str(e)}")
    logger.error(traceback.format_exc())
    raise

# Hold out 20% of the rows for testing; the split is stratified on y and uses a
# fixed seed so repeated runs see the same partition.
_split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = _split
logger.info(f"Train-test split completed. Train: {X_train.shape}, Test: {X_test.shape}")

# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
logger.info("Features scaled using StandardScaler")

In [None]:
# Parameter grids: every combination of the listed values is tried.
sklearn_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (100, 100)],
    'solver': ['adam', 'lbfgs'],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'max_iter': [200, 500]
}

pytorch_param_grid = {
    'hidden_sizes': [[64], [128], [64, 32], [128, 64], [128, 64, 32]],
    'learning_rate': [0.001, 0.01, 0.1],
    'dropout': [0.0, 0.2, 0.5],
    'batch_size': [32, 64],
    'epochs': [50, 100, 150]
}

# Experiment configuration: the grids plus a description of the data split.
experiment_config = {
    'sklearn_param_grid': sklearn_param_grid,
    'pytorch_param_grid': pytorch_param_grid,
    'data_path': data_path,
    'target_column': target_col,
    'timestamp': datetime.now().isoformat(),
    'train_size': X_train.shape[0],
    'test_size': X_test.shape[0],
    'n_features': X_train.shape[1],
    'n_classes': len(np.unique(y)),
    'use_pytorch': use_pytorch,
    'use_mlflow': use_mlflow
}

# PyYAML's default dumper writes tuples with a Python-specific
# `!!python/tuple` tag, which `yaml.safe_load` will not read back. Dump a copy
# in which the tuple-valued layer sizes are plain lists; `experiment_config`
# and the grids themselves are left untouched for the rest of the notebook.
serializable_config = dict(experiment_config)
serializable_config['sklearn_param_grid'] = {
    name: [list(value) if isinstance(value, tuple) else value for value in values]
    for name, values in sklearn_param_grid.items()
}

config_path = os.path.join(OUTDIR, 'experiment_config.yaml')
with open(config_path, 'w') as f:
    yaml.dump(serializable_config, f, default_flow_style=False)

logger.info(f"Experiment configuration saved to {os.path.join(OUTDIR, 'experiment_config.yaml')}")
print(f"Total sklearn combinations: {len(list(product(*sklearn_param_grid.values())))}")
if use_pytorch:
    print(f"Total PyTorch combinations: {len(list(product(*pytorch_param_grid.values())))}")

In [None]:
# Scikit-learn MLP experiments: fit one MLPClassifier per grid combination and
# record accuracy, iteration count, duration and system load for each.
sklearn_results = []
combos = list(product(*sklearn_param_grid.values()))
keys = list(sklearn_param_grid.keys())

logger.info(f"Starting {len(combos)} scikit-learn experiments")

for i, combo in enumerate(tqdm(combos, desc="Scikit-learn experiments")):
    # Bind params before the try so the error handler always reports the
    # settings of the run that actually failed.
    params = dict(zip(keys, combo))
    try:
        logger.debug(f"Testing sklearn params: {params}")

        # Log system resources
        memory_usage = psutil.virtual_memory().percent
        # Non-blocking call that resets psutil's CPU counters; its return value
        # is discarded. The matching call after training then reports the
        # utilisation over the experiment itself, without a one-second sleep.
        psutil.cpu_percent(interval=None)

        start_time = datetime.now()

        clf = MLPClassifier(**params, random_state=42)
        clf.fit(X_train_s, y_train)

        train_pred = clf.predict(X_train_s)
        test_pred = clf.predict(X_test_s)
        train_acc = accuracy_score(y_train, train_pred)
        test_acc = accuracy_score(y_test, test_pred)

        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()
        cpu_usage = psutil.cpu_percent(interval=None)

        result = {
            'experiment_id': i + 1,
            'params': params,
            'train_acc': train_acc,
            'test_acc': test_acc,
            'n_iter': clf.n_iter_,
            # Stored as None when the fitted model exposes no loss curve.
            'loss_curve': getattr(clf, 'loss_curve_', None),
            'duration_seconds': duration,
            'memory_usage_percent': memory_usage,
            'cpu_usage_percent': cpu_usage,
            'timestamp': start_time.isoformat()
        }

        sklearn_results.append(result)

        logger.info(f"Sklearn experiment {i+1}/{len(combos)} completed. Train acc: {train_acc:.4f}, Test acc: {test_acc:.4f}")

    except Exception as e:
        # A single failing combination must not abort the sweep: log and move on.
        logger.error(f"Sklearn experiment {i+1} failed with params {params}: {str(e)}")
        logger.error(traceback.format_exc())

logger.info(f"Completed {len(sklearn_results)} successful scikit-learn experiments")

# Flatten the per-run result dicts into one table, write it to CSV and print
# the configuration with the highest test accuracy.
if sklearn_results:
    _param_cols = ('solver', 'learning_rate_init', 'max_iter')
    _metric_cols = (
        'train_acc', 'test_acc', 'n_iter', 'duration_seconds',
        'memory_usage_percent', 'cpu_usage_percent', 'timestamp',
    )
    sklearn_df = pd.DataFrame([
        {
            'experiment_id': r['experiment_id'],
            # Tuples are stored as text so each architecture is a single CSV cell.
            'hidden_layer_sizes': str(r['params']['hidden_layer_sizes']),
            **{col: r['params'][col] for col in _param_cols},
            **{col: r[col] for col in _metric_cols},
        }
        for r in sklearn_results
    ])
    summary_path = os.path.join(OUTDIR, 'sklearn_results_summary.csv')
    sklearn_df.to_csv(summary_path, index=False)
    logger.info("Scikit-learn results saved to sklearn_results_summary.csv")

    # Display best results
    best_idx = sklearn_df['test_acc'].idxmax()
    best_sklearn = sklearn_df.loc[best_idx]
    print(f"\nBest scikit-learn result:")
    print(f"Test accuracy: {best_sklearn['test_acc']:.4f}")
    print(f"Parameters: hidden_sizes={best_sklearn['hidden_layer_sizes']}, solver={best_sklearn['solver']}, lr={best_sklearn['learning_rate_init']}")

In [None]:
# PyTorch experiments setup: import torch, pick the device, define the model
# builder and convert the scaled arrays to tensors. If anything here fails,
# `use_pytorch` is switched off so the experiment cell is skipped.
if use_pytorch:
    try:
        import torch
        import torch.nn as nn
        import torch.optim as optim
        from torch.utils.data import TensorDataset, DataLoader
        from torch.utils.tensorboard import SummaryWriter

        # MLflow setup (optional). A missing or misconfigured MLflow only
        # disables MLflow tracking; it must not cancel the PyTorch experiments.
        if use_mlflow:
            try:
                import mlflow
                import mlflow.pytorch
                mlflow.set_experiment("MLP_Hyperparameter_Search")
                logger.info("MLflow experiment tracking enabled")
            except Exception as e:
                logger.warning(f"MLflow setup failed, continuing without MLflow: {str(e)}")
                logger.warning(traceback.format_exc())
                use_mlflow = False

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        logger.info(f'PyTorch device: {device}')

        if torch.cuda.is_available():
            logger.info(f'CUDA device: {torch.cuda.get_device_name(0)}')
            logger.info(f'CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB')

        def build_pytorch_model(input_size, hidden_sizes, dropout=0.0, n_classes=None):
            """Build a feed-forward classifier as an ``nn.Sequential``.

            Each entry of ``hidden_sizes`` adds a Linear layer followed by ReLU
            and, when ``dropout`` is greater than zero, a Dropout layer. A final
            Linear layer produces one output per class.

            Args:
                input_size: Number of input features.
                hidden_sizes: Iterable of hidden-layer widths, in order.
                dropout: Dropout probability applied after every hidden layer;
                    0.0 (or None) adds no Dropout layers.
                n_classes: Number of output units. Must be provided.

            Returns:
                The assembled ``nn.Sequential`` model.

            Raises:
                ValueError: If ``n_classes`` is not provided.
            """
            if n_classes is None:
                raise ValueError("n_classes must be provided to size the output layer")
            layers = []
            in_size = input_size
            for h in hidden_sizes:
                layers.append(nn.Linear(in_size, h))
                layers.append(nn.ReLU())
                if dropout and dropout > 0.0:
                    layers.append(nn.Dropout(dropout))
                in_size = h
            layers.append(nn.Linear(in_size, n_classes))
            return nn.Sequential(*layers)

        # Prepare PyTorch tensors. The targets may be a pandas Series or a
        # NumPy array depending on how they were encoded.
        X_train_t = torch.FloatTensor(X_train_s)
        y_train_t = torch.LongTensor(y_train.values if hasattr(y_train, 'values') else y_train)
        X_test_t = torch.FloatTensor(X_test_s)
        y_test_t = torch.LongTensor(y_test.values if hasattr(y_test, 'values') else y_test)

        logger.info("PyTorch setup completed successfully")

    except ImportError as e:
        logger.error(f"PyTorch import failed: {str(e)}")
        use_pytorch = False
    except Exception as e:
        logger.error(f"PyTorch setup failed: {str(e)}")
        logger.error(traceback.format_exc())
        use_pytorch = False
else:
    logger.info("PyTorch experiments disabled")

In [None]:
# Enhanced PyTorch experiments: train one MLP per grid combination, log metrics
# to TensorBoard (and MLflow when enabled), save per-run plots, checkpoint the
# best model and finally write a summary CSV.
if use_pytorch:
    pytorch_results = []
    combos = list(product(*pytorch_param_grid.values()))
    keys = list(pytorch_param_grid.keys())

    logger.info(f"Starting {len(combos)} PyTorch experiments")

    run_counter = 0
    best_test_acc = 0.0

    for combo in tqdm(combos, desc="PyTorch experiments"):
        run_counter += 1
        # Bind params before the try so the error handler always reports the
        # settings of the run that actually failed.
        params = dict(zip(keys, combo))
        writer = None

        try:
            # Setup logging
            log_dir = os.path.join(OUTDIR, f'tb_logs/run_{run_counter}')
            writer = SummaryWriter(log_dir=log_dir)

            logger.info(f"\nPyTorch experiment {run_counter}/{len(combos)}: {params}")

            # MLflow run start
            if use_mlflow:
                mlflow.start_run()
                mlflow.log_params(params)

            # Model setup
            input_size = X_train_s.shape[1]
            n_classes = len(np.unique(y_train))
            model = build_pytorch_model(
                input_size,
                params['hidden_sizes'],
                dropout=params['dropout'],
                n_classes=n_classes
            ).to(device)

            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])

            # Data loader
            train_ds = TensorDataset(X_train_t, y_train_t)
            train_loader = DataLoader(train_ds, batch_size=params['batch_size'], shuffle=True)

            # Training tracking. val_epochs records the epoch of every
            # validation point so the plots use the real x positions.
            train_losses = []
            val_losses = []
            val_accuracies = []
            val_epochs = []
            start_time = datetime.now()

            # Training loop with enhanced logging
            for epoch in range(params['epochs']):
                model.train()
                epoch_losses = []

                for batch_idx, (xb, yb) in enumerate(train_loader):
                    xb = xb.to(device)
                    yb = yb.to(device)

                    optimizer.zero_grad()
                    out = model(xb)
                    loss = criterion(out, yb)
                    loss.backward()
                    optimizer.step()

                    epoch_losses.append(loss.item())

                avg_train_loss = np.mean(epoch_losses)
                train_losses.append(avg_train_loss)

                # Evaluate on the held-out split every 5 epochs and on the last epoch
                if epoch % 5 == 0 or epoch == params['epochs'] - 1:
                    model.eval()
                    with torch.no_grad():
                        val_out = model(X_test_t.to(device))
                        val_loss = criterion(val_out, y_test_t.to(device))
                        val_pred = val_out.argmax(dim=1).cpu().numpy()
                        val_acc = accuracy_score(y_test, val_pred)

                    val_epochs.append(epoch)
                    val_losses.append(val_loss.item())
                    val_accuracies.append(val_acc)

                    # TensorBoard logging
                    writer.add_scalar('Loss/train', avg_train_loss, epoch)
                    writer.add_scalar('Loss/validation', val_loss.item(), epoch)
                    writer.add_scalar('Accuracy/validation', val_acc, epoch)

                    # MLflow logging
                    if use_mlflow:
                        mlflow.log_metric("train_loss", avg_train_loss, step=epoch)
                        mlflow.log_metric("val_loss", val_loss.item(), step=epoch)
                        mlflow.log_metric("val_accuracy", val_acc, step=epoch)

                    # System resource monitoring
                    memory_usage = psutil.virtual_memory().percent
                    if torch.cuda.is_available():
                        gpu_memory = torch.cuda.memory_allocated() / 1024**3
                        writer.add_scalar('System/gpu_memory_gb', gpu_memory, epoch)
                    writer.add_scalar('System/memory_usage_percent', memory_usage, epoch)

                # Progress reporting
                if (epoch + 1) % 20 == 0 or epoch == 0:
                    logger.debug(f"Epoch {epoch+1}/{params['epochs']}: train_loss={avg_train_loss:.4f}")

            # Final evaluation
            model.eval()
            with torch.no_grad():
                out_train = model(X_train_t.to(device))
                pred_train = out_train.argmax(dim=1).cpu().numpy()
                out_test = model(X_test_t.to(device))
                pred_test = out_test.argmax(dim=1).cpu().numpy()

            train_acc = accuracy_score(y_train, pred_train)
            test_acc = accuracy_score(y_test, pred_test)

            end_time = datetime.now()
            duration = (end_time - start_time).total_seconds()

            # Confusion matrix logging. The figure is closed even if drawing or
            # logging fails, so failed runs do not accumulate open figures.
            cm = confusion_matrix(y_test, pred_test)
            fig, ax = plt.subplots(figsize=(8, 6))
            try:
                sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap='Blues')
                ax.set_title(f'Confusion Matrix - Run {run_counter}')
                ax.set_xlabel('Predicted')
                ax.set_ylabel('Actual')
                writer.add_figure('Confusion_Matrix', fig, global_step=run_counter)
            finally:
                plt.close(fig)

            # Training loss curve plot (left) and validation accuracy (right)
            fname = f"pytorch_metrics_run_{run_counter}.png"
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
            try:
                ax1.plot(train_losses, label='Training Loss')
                if val_losses:
                    ax1.plot(val_epochs, val_losses, label='Validation Loss', marker='o')
                ax1.set_title(f"Loss Curves - Run {run_counter}")
                ax1.set_xlabel('Epoch')
                ax1.set_ylabel('Loss')
                ax1.legend()
                ax1.grid(True)

                if val_accuracies:
                    ax2.plot(val_epochs, val_accuracies, label='Validation Accuracy', marker='o', color='green')
                    ax2.set_title(f"Validation Accuracy - Run {run_counter}")
                    ax2.set_xlabel('Epoch')
                    ax2.set_ylabel('Accuracy')
                    ax2.legend()
                    ax2.grid(True)

                fig.tight_layout()

                # Save plot
                fig.savefig(os.path.join(OUTDIR, fname), dpi=300, bbox_inches='tight')
            finally:
                plt.close(fig)

            # Model checkpointing for best model
            if test_acc > best_test_acc:
                best_test_acc = test_acc
                checkpoint = {
                    'model_state_dict': model.state_dict(),
                    'params': params
                }
                torch.save(checkpoint, os.path.join(OUTDIR, 'models', 'best_model.pth'))
                logger.info(f"New best model saved with test accuracy: {test_acc:.4f}")

            result = {
                'experiment_id': run_counter,
                'params': params,
                'train_acc': train_acc,
                'test_acc': test_acc,
                'duration_seconds': duration,
                'final_train_loss': train_losses[-1] if train_losses else None,
                'final_val_loss': val_losses[-1] if val_losses else None,
                'final_val_accuracy': val_accuracies[-1] if val_accuracies else None,
                'timestamp': start_time.isoformat()
            }

            pytorch_results.append(result)

            logger.info(f"PyTorch experiment {run_counter} completed. Train acc: {train_acc:.4f}, Test acc: {test_acc:.4f}")

            if use_mlflow:
                mlflow.log_artifact(os.path.join(OUTDIR, fname), "plots")
                mlflow.log_metric("final_test_accuracy", test_acc)
                mlflow.log_metric("duration", duration)
                mlflow.end_run()

        except Exception as e:
            # A single failing combination must not abort the sweep: log and move on.
            logger.error(f"PyTorch experiment {run_counter} failed with params {params}: {str(e)}")
            logger.error(traceback.format_exc())
            if use_mlflow:
                mlflow.end_run(status="FAILED")
        finally:
            # Close the TensorBoard writer on success and on failure alike, so
            # failed runs do not leave event files open.
            if writer is not None:
                writer.close()

    logger.info(f"Completed {len(pytorch_results)} successful PyTorch experiments")

    # Save PyTorch results
    if pytorch_results:
        pytorch_df = pd.DataFrame([
            {
                'experiment_id': r['experiment_id'],
                # Lists are stored as text so each architecture is a single CSV cell.
                'hidden_sizes': str(r['params']['hidden_sizes']),
                'learning_rate': r['params']['learning_rate'],
                'dropout': r['params']['dropout'],
                'batch_size': r['params']['batch_size'],
                'epochs': r['params']['epochs'],
                'train_acc': r['train_acc'],
                'test_acc': r['test_acc'],
                'duration_seconds': r['duration_seconds'],
                'final_train_loss': r['final_train_loss'],
                'final_val_loss': r['final_val_loss'],
                'final_val_accuracy': r['final_val_accuracy'],
                'timestamp': r['timestamp']
            } for r in pytorch_results
        ])
        pytorch_df.to_csv(os.path.join(OUTDIR, 'pytorch_results_summary.csv'), index=False)
        logger.info("PyTorch results saved to pytorch_results_summary.csv")

        # Display best results
        best_pytorch = pytorch_df.loc[pytorch_df['test_acc'].idxmax()]
        print(f"\nBest PyTorch result:")
        print(f"Test accuracy: {best_pytorch['test_acc']:.4f}")
        print(f"Parameters: hidden_sizes={best_pytorch['hidden_sizes']}, lr={best_pytorch['learning_rate']}, dropout={best_pytorch['dropout']}")
    else:
        logger.warning("No successful PyTorch experiments to save")
else:
    logger.info("Skipping PyTorch experiments as requested")

In [None]:
# Plotting and final analysis: log that the summary-plot stage is starting.
logger.info("Starting final analysis and plotting of results")

def create_performance_plots(df, metric_col, plot_title, filename):
    """Save a grouped bar chart of ``metric_col`` per network architecture.

    Frames with a ``hidden_layer_sizes`` column are grouped by ``solver``;
    all other frames are expected to have ``hidden_sizes`` and ``dropout``
    columns and are grouped by ``dropout``. The chart is written to
    ``OUTDIR/filename`` as a PNG.

    Args:
        df: Results table. It is not modified.
        metric_col: Name of the column plotted on the y axis.
        plot_title: Title drawn above the chart.
        filename: File name (inside ``OUTDIR``) for the saved image.
    """
    if 'hidden_layer_sizes' in df.columns:
        arch_col, hue_col = 'hidden_layer_sizes', 'solver'
    else:
        arch_col, hue_col = 'hidden_sizes', 'dropout'
    x_col = f'{arch_col}_str'

    # Build the compact, space-free architecture label on a copy: assign()
    # returns a new frame, so the caller's DataFrame does not gain a helper
    # column as a side effect of plotting.
    plot_df = df.assign(**{x_col: df[arch_col].apply(lambda x: str(x).replace(' ', ''))})

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        sns.barplot(x=x_col, y=metric_col, hue=hue_col, data=plot_df, ax=ax, palette='viridis')
        ax.set_title(plot_title)
        ax.set_xlabel('Architecture')
        ax.set_ylabel(metric_col)
        ax.tick_params(axis='x', rotation=45)
        ax.grid(axis='y', linestyle='--')
        fig.tight_layout()
        fig.savefig(os.path.join(OUTDIR, filename), dpi=300)
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)
    logger.info(f"Saved plot: {filename}")

# Draw one summary chart per framework, but only for result tables that were
# actually created by earlier cells and contain at least one row.
for _df_name, _plot_title, _plot_file in (
    ('sklearn_df', 'Scikit-learn MLP Test Accuracy by Architecture and Solver', 'sklearn_test_accuracy.png'),
    ('pytorch_df', 'PyTorch MLP Test Accuracy by Architecture and Dropout', 'pytorch_test_accuracy.png'),
):
    if _df_name in locals() and not locals()[_df_name].empty:
        create_performance_plots(locals()[_df_name], 'test_acc', _plot_title, _plot_file)

logger.info("Final analysis and plotting complete.")