# THIS IS A FAILED ATTEMPT TO BUILD A NOTEBOOK RUNNABLE ON KAGGLE

In [1]:
%load_ext autoreload
%autoreload 2
%pip install git+https://github.com/PaoloGinefra/ACA_GraphML_Project.git
%pip install optuna-integration[pytorch_lightning]

yolo


In [None]:
# Import all necessary libraries
import os
import sys
import time
import psutil
import gc
from typing import Dict, Any, List, Tuple, Optional
import json
import warnings
warnings.filterwarnings('ignore')

# Core libraries
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

# PyTorch Geometric
import torch_geometric
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data, Batch

# PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import WandbLogger

# Optuna for hyperparameter optimization
import optuna
from optuna.integration import PyTorchLightningPruningCallback

# Weights & Biases for experiment tracking
try:
    import wandb
    WANDB_AVAILABLE = True
except ImportError:
    WANDB_AVAILABLE = False
    print("W&B not available - install with: pip install wandb")

# Kaggle Secrets (if available)
try:
    from kaggle_secrets import UserSecretsClient
    KAGGLE_SECRETS_AVAILABLE = True
except ImportError:
    KAGGLE_SECRETS_AVAILABLE = False
    print("Kaggle secrets not available - running outside Kaggle environment")

# Reproducibility: seed through Lightning's helper and pin cuDNN to
# deterministic kernels with the benchmark mode switched off.
pl.seed_everything(42)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Environment report: library versions first, then accelerator, then the
# optional integrations detected by the guarded imports above.
print("All imports successful!")
print(f"PyTorch version: {torch.__version__}")
print(f"PyTorch Geometric version: {torch_geometric.__version__}")
print(f"PyTorch Lightning version: {pl.__version__}")

_cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ready}")
if _cuda_ready:
    print(f"CUDA device: {torch.cuda.get_device_name()}")

print(f"Weights & Biases available: {WANDB_AVAILABLE}")
print(f"Kaggle Secrets available: {KAGGLE_SECRETS_AVAILABLE}")

# 📊 W&B Configuration and Login
# Logs in to Weights & Biases with credentials stored as Kaggle secrets and
# defines WANDB_API_KEY, WANDB_ENTITY and PROJECT_NAME for the cells below.
# Any failure is reported with a hint and then re-raised, so this cell aborts
# when W&B cannot be set up.
print("🔧 Setting up Weights & Biases...")

try:
    import wandb
    from kaggle_secrets import UserSecretsClient

    # Get W&B credentials from Kaggle secrets
    user_secrets = UserSecretsClient()

    # The API key is mandatory: without it there is nothing to log in with.
    try:
        WANDB_API_KEY = user_secrets.get_secret("WANDB_API_KEY")
    except Exception as e:
        print(f"⚠️ Could not load W&B secrets: {e}")
        print("💡 Make sure you've added WANDB_API_KEY and WANDB_ENTITY to Kaggle secrets")
        raise

    if not WANDB_API_KEY:
        raise ValueError("WANDB_API_KEY not found in Kaggle secrets")

    # The entity is optional: every later use falls back to the default
    # entity when it is empty, so a missing secret must not abort the setup.
    try:
        WANDB_ENTITY = user_secrets.get_secret("WANDB_ENTITY")
    except Exception as e:
        WANDB_ENTITY = None
        print(f"ℹ️ WANDB_ENTITY not available in Kaggle secrets ({e}) - the default entity will be used")

    # Enhanced W&B login with better error handling
    wandb.login(key=WANDB_API_KEY, force=True)  # Force re-login

    # Test W&B connection with a simple API call
    api = wandb.Api()
    user = api.viewer
    print(f"✅ W&B login successful as {user.username}")

    # Entity validation and debugging
    print(f"\n🔍 Entity Configuration:")
    print(f"   Personal Username: {user.username}")
    print(f"   Configured Entity: {WANDB_ENTITY if WANDB_ENTITY else 'Not set'}")

    # Important: Entity should be organization name, not personal username
    if WANDB_ENTITY == user.username:
        print(f"⚠️  WARNING: Entity is set to personal username!")
        print(f"   This often causes 403 'permission denied' errors.")
        print(f"   Update WANDB_ENTITY to your organization/team name instead.")

    # Configure W&B settings for better stability
    # (`os` is already imported at the top of this cell)
    os.environ["WANDB_MODE"] = "online"

    # Define project configuration
    PROJECT_NAME = "graph-regression-optimization"

    print(f"📋 W&B Project: {PROJECT_NAME}")
    print(f"🏢 W&B Entity: {WANDB_ENTITY if WANDB_ENTITY else 'Default (not set)'}")

except Exception as e:
    print(f"❌ W&B setup failed: {e}")
    # Pick a troubleshooting hint from the error text before re-raising.
    error_msg = str(e).lower()
    if "permission denied" in error_msg or "403" in error_msg:
        print("\n💡 SOLUTION for 403 Permission Denied Error:")
        print("   1. Check your W&B dashboard URL: https://wandb.ai/[ENTITY]/[PROJECT]")
        print("   2. The [ENTITY] part is what you need to use, not your username")
        print("   3. Update your Kaggle secret 'WANDB_ENTITY' with the organization name")
        print("   4. If you don't have an organization, create a personal project first")
    elif "network" in error_msg or "connection" in error_msg:
        print("💡 Check your internet connection or try again later")
    raise

In [None]:
# Configuration and Secrets Management for Kaggle
# Use Kaggle Secrets interface to set these values

# Kaggle UserSecrets bootstrap: the import only succeeds inside a Kaggle kernel.
try:
    import os
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    KAGGLE_SECRETS_AVAILABLE = True
    print("✅ Kaggle secrets interface available")
except ImportError:
    user_secrets = None
    KAGGLE_SECRETS_AVAILABLE = False
    print("⚠️ Kaggle secrets not available - running outside Kaggle environment")

# Defaults that stay in place whenever a secret is missing or secrets are
# unavailable altogether.
WANDB_API_KEY = None
WANDB_ENTITY = None
WANDB_PROJECT = 'zinc-graph-regression'  # Default project name

# Supabase database: the secret is expected to hold a complete PostgreSQL URL,
#   postgresql://postgres.{project_id}:{password}@{host}:{port}/{database}
SUPABASE_DB_URL = None

if KAGGLE_SECRETS_AVAILABLE:
    # Each secret is read independently so one missing entry does not block
    # the others.
    try:
        WANDB_API_KEY = user_secrets.get_secret("WANDB_API_KEY")
    except Exception as err:
        print(f"⚠️ Could not load WANDB_API_KEY from secrets: {err}")
    else:
        print("✅ W&B API key loaded from Kaggle secrets")

    try:
        WANDB_PROJECT = user_secrets.get_secret("WANDB_PROJECT")
    except Exception:
        print("ℹ️ Using default W&B project name (WANDB_PROJECT not set in secrets)")
    else:
        print("✅ W&B project name loaded from Kaggle secrets")

    try:
        WANDB_ENTITY = user_secrets.get_secret("WANDB_ENTITY")
    except Exception:
        print("ℹ️ W&B entity not set in secrets (will use default)")
    else:
        print("✅ W&B entity loaded from Kaggle secrets")

    try:
        SUPABASE_DB_URL = user_secrets.get_secret("SUPABASE_URL")
    except Exception as err:
        print(f"⚠️ Could not load SUPABASE_URL from secrets: {err}")
        print("ℹ️ If you want to use remote database, set SUPABASE_URL in Kaggle secrets")
        print("ℹ️ Format: postgresql://postgres.{project_id}:{password}@{host}:{port}/{database}")
    else:
        print("✅ Supabase database URL loaded from Kaggle secrets")

# W&B login: attempted only when the library imported and an API key was
# found; otherwise explain what is missing and carry on without logging.
if not WANDB_AVAILABLE:
    print("⚠️ W&B library not available")
elif not WANDB_API_KEY:
    print("⚠️ W&B not configured - set WANDB_API_KEY in Kaggle secrets")
    print("💡 To enable W&B logging:")
    print("   1. Go to https://wandb.ai/settings and copy your API key")
    print("   2. In Kaggle: Add-ons > Secrets > + Add Secret")
    print("   3. Label: WANDB_API_KEY, Value: your_api_key_here")
else:
    try:
        # force=True re-authenticates even if a session is already cached
        wandb.login(key=WANDB_API_KEY, force=True)  # Force re-login

        # A cheap API round-trip confirms the credentials actually work
        api = wandb.Api()
        user = api.viewer
        print(f"✅ W&B login successful as {user.username}")

        # Mode is set through the environment rather than wandb.settings
        import os
        os.environ["WANDB_MODE"] = "online"

    except Exception as login_error:
        print(f"⚠️ W&B login failed: {login_error}")

        # Choose one troubleshooting hint based on the error text
        _reason = str(login_error).lower()
        if "permission denied" in _reason:
            _hint = "💡 Try regenerating your W&B API key and updating Kaggle secrets"
        elif "network" in _reason or "connection" in _reason:
            _hint = "💡 Check your internet connection or try again later"
        else:
            _hint = "💡 Check your W&B API key in Kaggle secrets"
        print(_hint)

        # Clearing the key is what disables W&B for the rest of the notebook
        WANDB_API_KEY = None  # Disable W&B if login fails
        print("🔄 Continuing without W&B logging...")

# Optuna study configuration
STUDY_NAME = "zinc-graph-regression-multiobj"
OPTUNA_DB_URL = SUPABASE_DB_URL  # Use the complete URL directly

if OPTUNA_DB_URL:
    from urllib.parse import urlsplit

    # Only the host is printed because the URL embeds the database password.
    # urlsplit separates credentials from the host at the LAST '@', so a
    # password containing '@' or ':' no longer ends up in the output (the
    # previous split('@')[1] printed a fragment of such a password).
    try:
        _db_host = urlsplit(OPTUNA_DB_URL).hostname or 'unknown'
    except ValueError:
        # Malformed netloc (e.g. unbalanced IPv6 brackets)
        _db_host = 'unknown'

    print("✅ Optuna remote database configured")
    print(f"ℹ️ Database host: {_db_host}")
else:
    print("⚠️ Remote database not configured - will use local SQLite")
    # The secret read by this notebook is named SUPABASE_URL
    print("ℹ️ To use remote storage, set SUPABASE_URL in Kaggle secrets")

# Global run configuration; the debug cell may override entries in place.
CONFIG = dict(
    device='cuda' if torch.cuda.is_available() else 'cpu',
    num_trials=50,               # Number of hyperparameter trials
    timeout_hours=6,             # Kaggle notebook timeout consideration
    early_stopping_patience=15,
    max_epochs=100,
    val_check_interval=1.0,      # Check validation every epoch
    log_every_n_steps=10,
)

print(f"\n📊 Configuration:")
for _setting, _setting_value in CONFIG.items():
    print(f"  {_setting}: {_setting_value}")

# Summary of which secrets were found (values themselves are never printed)
print(f"\n🔧 Secrets Configuration Summary:")
print(f"  Kaggle Secrets Available: {'✅' if KAGGLE_SECRETS_AVAILABLE else '❌'}")
print(f"  W&B API Key: {'✅ Configured' if WANDB_API_KEY else '❌ Not set'}")
print(f"  W&B Project: {WANDB_PROJECT}")
print(f"  W&B Entity: {WANDB_ENTITY if WANDB_ENTITY else 'Default (not set)'}")
print(f"  Remote Database: {'✅ Configured' if OPTUNA_DB_URL else '❌ Not set'}")
print(f"  Study Name: {STUDY_NAME}")

In [None]:
# Debug and Production Mode Configuration
# Set DEBUG_MODE = True for quick testing, False for full optimization

DEBUG_MODE = False  # Change to True for testing

if not DEBUG_MODE:
    print("🚀 PRODUCTION MODE - Full optimization settings")
else:
    print("🐛 DEBUG MODE ENABLED - Using reduced settings for testing")
    # Shrink the search so a full pass finishes quickly
    CONFIG.update(
        num_trials=5,               # Reduced for testing
        timeout_hours=0.5,          # 30 minutes for testing
        max_epochs=15,              # Fewer epochs
        early_stopping_patience=8,
        val_check_interval=1.0,
        log_every_n_steps=5,
    )
    BATCH_SIZE = 16  # Smaller batch size for testing
    print("📝 Debug configuration applied")

# Echo the effective settings; BATCH_SIZE exists at this point only when the
# debug branch ran, hence the fallback to 32 in the last line.
print(f"📊 Current Configuration:")
for _setting, _setting_value in CONFIG.items():
    print(f"  {_setting}: {_setting_value}")
print(f"  batch_size: {BATCH_SIZE if 'BATCH_SIZE' in locals() else 32}")

# Environment validation
def validate_environment():
    """Print an environment report and say whether the setup is adequate.

    Checks GPU availability, W&B configuration (library importable and an
    API key present) and whether a remote Optuna database URL is set, then
    prints one line per check.

    Returns:
        bool: True when a GPU is available AND at least one of W&B or the
        remote Optuna database is configured; False otherwise.
    """
    checks = []

    # Status is tracked in explicit booleans rather than inferred from the
    # emoji in each label.
    gpu_ok = torch.cuda.is_available()
    wandb_ok = bool(WANDB_AVAILABLE and WANDB_API_KEY)
    db_ok = bool(OPTUNA_DB_URL)

    # GPU Check
    if gpu_ok:
        checks.append(("✅ GPU", f"Available: {torch.cuda.get_device_name()}"))
    else:
        checks.append(("⚠️ GPU", "Not available - will use CPU (much slower)"))

    # W&B Check
    if wandb_ok:
        checks.append(("✅ W&B", "Configured and available"))
    else:
        checks.append(("⚠️ W&B", "Not configured - no experiment tracking"))

    # Optuna DB Check
    if db_ok:
        checks.append(("✅ Optuna DB", "Remote database configured"))
    else:
        checks.append(("⚠️ Optuna DB", "Using local SQLite"))

    # Memory Check (informational only; device 0 is inspected)
    if gpu_ok:
        total_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
        checks.append(("📊 GPU Memory", f"{total_memory:.1f} GB"))

    print("\n🔍 Environment Validation:")
    for check_name, check_result in checks:
        print(f"  {check_name}: {check_result}")

    # GPU and one of W&B/DB required. The previous `checks[:2]` slice
    # demanded GPU *and* W&B and ignored the database check entirely.
    return gpu_ok and (wandb_ok or db_ok)

# Run the validation once; a negative result only prints a warning and does
# not stop the notebook.
env_ok = validate_environment()
print(
    "\n✅ Environment validation passed!"
    if env_ok
    else "\n⚠️ Environment check completed with warnings. You can still proceed, but some features may be limited."
)

In [None]:
class SystemMonitor:
    """
    Track wall-clock time, memory use and sample/epoch counters for a run.

    All measurement methods are best-effort: failures are printed as
    warnings and replaced by neutral values instead of being raised.

    NOTE(review): `start_monitoring` restarts the clock but does not clear
    `samples_processed`, `epoch_count` or `peak_memory_mb`; call `reset()`
    first when one instance is reused for several runs.
    """

    def __init__(self):
        self.reset()

    @staticmethod
    def _empty_metrics(samples_processed: int = 0, epochs_completed: int = 0) -> Dict[str, float]:
        """Return a neutral metrics dict with the same keys as a live one."""
        return {
            'training_time_minutes': 0.0,
            'memory_consumption_mb': 0.0,
            'peak_memory_mb': 0.0,
            'current_memory_mb': 0.0,
            'throughput_samples_per_sec': 0.0,
            'latency_ms_per_sample': float('inf'),
            'samples_processed': samples_processed,
            'epochs_completed': epochs_completed,
            'epochs_per_minute': 0.0,
        }

    def reset(self):
        """Reset all tracking variables"""
        self.start_time = None
        self.end_time = None
        self.peak_memory_mb = 0
        self.initial_memory_mb = 0
        self.samples_processed = 0
        self.training_start_time = None
        self.epoch_count = 0

    def start_monitoring(self):
        """Start the clock and record the memory baseline."""
        try:
            self.start_time = time.time()
            self.training_start_time = time.time()
            self.initial_memory_mb = self.get_current_memory_mb()

            # Reset peak memory tracking
            if torch.cuda.is_available():
                torch.cuda.reset_peak_memory_stats()
            gc.collect()

        except Exception as e:
            print(f"⚠️ Warning: Could not start monitoring properly: {e}")
            # At least track time; keep both timestamps populated so they
            # never disagree about whether monitoring has started.
            self.start_time = time.time()
            self.training_start_time = self.start_time

    def update_peak_memory(self):
        """Raise the stored peak to the current memory reading if it is higher."""
        try:
            current_memory = self.get_current_memory_mb()
            self.peak_memory_mb = max(self.peak_memory_mb, current_memory)
        except Exception as e:
            print(f"⚠️ Warning: Could not update memory tracking: {e}")

    def get_current_memory_mb(self) -> float:
        """
        Return the current memory usage in MB.

        This is the process RSS; when CUDA is available, the larger of RSS
        and the CUDA allocated memory is returned instead. Returns 0.0 (after
        printing a warning) if the reading fails.
        """
        try:
            process = psutil.Process(os.getpid())
            memory_info = process.memory_info()

            # CPU memory
            cpu_memory_mb = memory_info.rss / 1024 / 1024

            # GPU memory if available
            if torch.cuda.is_available():
                gpu_memory_mb = torch.cuda.memory_allocated() / 1024 / 1024
                return max(cpu_memory_mb, gpu_memory_mb)  # Use the higher value

            return cpu_memory_mb

        except Exception as e:
            print(f"⚠️ Warning: Could not get memory usage: {e}")
            return 0.0

    def record_batch_processed(self, batch_size: int):
        """Add `batch_size` to the sample counter and refresh the peak memory."""
        self.samples_processed += batch_size
        self.update_peak_memory()

    def record_epoch_completed(self):
        """Record that an epoch was completed"""
        self.epoch_count += 1

    def get_metrics(self) -> Dict[str, float]:
        """
        Return a snapshot of timing, memory and throughput metrics.

        The returned dict always has the same keys, whether monitoring has
        not started yet, the computation failed, or it succeeded, so callers
        can index it without checking which path produced it.
        """
        if self.start_time is None:
            # Monitoring never started: report zeros throughout
            return self._empty_metrics()

        try:
            current_time = time.time()
            elapsed_time = current_time - self.start_time

            # Memory metrics
            current_memory_mb = self.get_current_memory_mb()
            memory_consumption_mb = max(0, current_memory_mb - self.initial_memory_mb)
            # The peak is otherwise only refreshed per batch; fold in this
            # reading so the reported peak is never below the current value.
            self.peak_memory_mb = max(self.peak_memory_mb, current_memory_mb)

            # Performance metrics
            throughput = self.samples_processed / elapsed_time if elapsed_time > 0 else 0
            latency_ms = (elapsed_time * 1000) / self.samples_processed if self.samples_processed > 0 else float('inf')

            # Additional metrics
            epochs_per_minute = (self.epoch_count / elapsed_time * 60) if elapsed_time > 0 else 0

            return {
                'training_time_minutes': elapsed_time / 60,
                'memory_consumption_mb': memory_consumption_mb,
                'peak_memory_mb': self.peak_memory_mb,
                'current_memory_mb': current_memory_mb,
                'throughput_samples_per_sec': throughput,
                'latency_ms_per_sample': latency_ms,
                'samples_processed': self.samples_processed,
                'epochs_completed': self.epoch_count,
                'epochs_per_minute': epochs_per_minute,
            }

        except Exception as e:
            print(f"⚠️ Warning: Could not calculate metrics: {e}")
            # Keep the counters, which are still valid, and zero the rest
            return self._empty_metrics(self.samples_processed, self.epoch_count)

    def log_metrics(self, prefix: str = ""):
        """Print the current metrics grouped into time, memory and performance."""
        try:
            metrics = self.get_metrics()
            print(f"📊 {prefix} System Metrics:")

            # Group metrics by substring of the key for better readability
            time_metrics = {k: v for k, v in metrics.items() if 'time' in k or 'latency' in k}
            memory_metrics = {k: v for k, v in metrics.items() if 'memory' in k}
            performance_metrics = {k: v for k, v in metrics.items() if 'throughput' in k or 'samples' in k or 'epochs' in k}

            # Time metrics
            for key, value in time_metrics.items():
                if 'latency' in key:
                    print(f"  {key}: {value:.2f} ms")
                else:
                    print(f"  {key}: {value:.2f}")

            # Memory metrics
            for key, value in memory_metrics.items():
                print(f"  {key}: {value:.1f} MB")

            # Performance metrics
            for key, value in performance_metrics.items():
                if 'throughput' in key:
                    print(f"  {key}: {value:.1f}")
                elif 'epochs_per_minute' in key:
                    print(f"  {key}: {value:.2f}")
                else:
                    print(f"  {key}: {value}")

        except Exception as e:
            print(f"⚠️ Could not log metrics: {e}")

# Create global monitor instance
# (module-level object shared by the cells that follow)
system_monitor = SystemMonitor()
print("✅ Enhanced system monitoring initialized")

# Test the monitor to ensure it's working
# get_current_memory_mb already catches its own errors and returns 0.0, so
# this try/except is a second safety net around the smoke test.
try:
    test_memory = system_monitor.get_current_memory_mb()
    print(f"📊 Current memory usage: {test_memory:.1f} MB")
except Exception as e:
    print(f"⚠️ Warning: System monitor test failed: {e}")

In [None]:
# Data Preparation
from ACAgraphML.Dataset import ZINC_Dataset
from ACAgraphML.Transforms import OneHotEncodeFeat
from ACAgraphML.Pipeline import DataAugmenter
from ACAgraphML.Transforms import SteadyStateTransform
from torch_geometric.transforms import AddRandomWalkPE

# Constants for ZINC dataset
NUM_NODE_FEATS = 28
NUM_EDGE_FEATS = 4
# Batch size: 32 is optimized for Kaggle GPU memory. The debug cell selects
# 16 for quick tests; an unconditional assignment here used to overwrite that
# choice, so the value is derived from DEBUG_MODE instead (treated as False
# when the debug cell has not been run).
BATCH_SIZE = 16 if globals().get('DEBUG_MODE', False) else 32

def prepare_zinc_data():
    """
    Load the small ZINC splits and wrap them in data loaders.

    Every sample goes through a transform that one-hot encodes node
    features, casts node features, edge attributes and targets to float,
    and one-hot encodes edge attributes that arrive as a 1-D tensor.

    Returns:
        dict with the train/val/test loaders, the mean and std of the
        targets (computed over the training and validation splits
        together), and the number of samples in each split.
    """
    print("🔄 Loading ZINC dataset...")

    node_encoder = OneHotEncodeFeat(NUM_NODE_FEATS)

    def data_transform(sample):
        """Per-sample preprocessing applied by the dataset."""
        sample = node_encoder(sample)
        sample.x = sample.x.float()

        edge_attr = sample.edge_attr
        if edge_attr is not None:
            # 1-D edge attributes are taken to be class indices
            if edge_attr.dim() == 1:
                edge_attr = torch.nn.functional.one_hot(
                    edge_attr.long(),
                    num_classes=NUM_EDGE_FEATS
                ).float()
            sample.edge_attr = edge_attr.float()

        if sample.y is not None:
            sample.y = sample.y.float()

        return sample

    def make_loader(dataset, shuffle):
        """Build a loader with the settings shared by all three splits."""
        return DataLoader(
            dataset,
            batch_size=BATCH_SIZE,
            shuffle=shuffle,
            num_workers=2,
            pin_memory=torch.cuda.is_available()
        )

    # Small splits keep the run within Kaggle time limits
    train_set = ZINC_Dataset.SMALL_TRAIN.load(transform=data_transform)
    val_set = ZINC_Dataset.SMALL_VAL.load(transform=data_transform)
    test_set = ZINC_Dataset.SMALL_TEST.load(transform=data_transform)

    n_train, n_val, n_test = len(train_set), len(val_set), len(test_set)
    print(f"📊 Dataset sizes:")
    print(f"  Training: {n_train}")
    print(f"  Validation: {n_val}")
    print(f"  Test: {n_test}")

    # Normalization statistics over train + validation targets
    pooled_targets = torch.cat(
        [graph.y for split in (train_set, val_set) for graph in split]
    )
    target_mean = pooled_targets.mean().item()
    target_std = pooled_targets.std().item()

    print(f"📈 Target statistics:")
    print(f"  Mean: {target_mean:.4f}")
    print(f"  Std: {target_std:.4f}")
    print(f"  Min: {torch.min(pooled_targets).item():.4f}")
    print(f"  Max: {torch.max(pooled_targets).item():.4f}")

    # Only the training loader shuffles
    train_loader = make_loader(train_set, True)
    val_loader = make_loader(val_set, False)
    test_loader = make_loader(test_set, False)

    # Pull one batch to show the tensor shapes
    first_batch = next(iter(train_loader))
    print(f"🔍 Sample batch info:")
    print(f"  Node features shape: {first_batch.x.shape}")
    print(f"  Edge features shape: {first_batch.edge_attr.shape}")
    print(f"  Targets shape: {first_batch.y.shape}")
    print(f"  Batch size: {first_batch.y.shape[0]}")

    return {
        'train_loader': train_loader,
        'val_loader': val_loader,
        'test_loader': test_loader,
        'target_mean': target_mean,
        'target_std': target_std,
        'num_train_samples': n_train,
        'num_val_samples': n_val,
        'num_test_samples': n_test,
    }

# Prepare data
# Runs when the cell executes; the returned dictionary (loaders, target
# mean/std and split sizes) is kept in `data_info`.
data_info = prepare_zinc_data()
print("✅ Data preparation completed!")

In [None]:
# Multi-Objective Optimization Setup
from ACAgraphML.Pipeline.LightningModules.GDLPipelineLighningModule import (
    GDLPipelineLightningModule,
    create_lightning_custom
)
from ACAgraphML.Pipeline.Models.GDLPipeline import (
    GNNConfig,
    PoolingConfig, 
    RegressorConfig
)

class MultiObjectiveCallback(pl.Callback):
    """
    Lightning callback that feeds training progress into a SystemMonitor.

    Starts the monitor when training starts, records the size of every
    training and validation batch, and records each finished training epoch.
    """

    def __init__(self, system_monitor: SystemMonitor):
        """
        Args:
            system_monitor: Monitor instance that receives the events.
        """
        super().__init__()
        self.system_monitor = system_monitor
        # Ensures the "batch size unknown" warning is printed only once
        self._batch_size_warned = False

    def _record_batch(self, batch):
        """Best-effort: report the number of samples in `batch` to the monitor."""
        try:
            batch_size = batch.y.shape[0] if hasattr(batch, 'y') else len(batch)
            self.system_monitor.record_batch_processed(batch_size)
        except Exception as e:
            # Monitoring must never interrupt training, so the batch is
            # skipped, but say so once instead of failing silently.
            if not self._batch_size_warned:
                self._batch_size_warned = True
                print(f"⚠️ Warning: Could not record batch size, throughput may be under-reported: {e}")

    def on_train_start(self, trainer, pl_module):
        self.system_monitor.start_monitoring()

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        # Record batch processing
        self._record_batch(batch)

    def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx=0):
        # Record validation batch processing
        self._record_batch(batch)

    def on_train_epoch_end(self, trainer, pl_module):
        # Without this hook the monitor's epoch counter is never advanced, so
        # `epochs_completed` and `epochs_per_minute` would always read 0.
        self.system_monitor.record_epoch_completed()

def create_model_with_config(trial: optuna.Trial, data_info: Dict[str, Any]) -> GDLPipelineLightningModule:
    """
    Create a GDL Pipeline model with hyperparameters suggested by Optuna.

    Each hyperparameter is requested through the matching `trial.suggest_*`
    method when the trial object has one; otherwise it is read from
    `trial.params` with a fixed default.

    Args:
        trial: Optuna trial object for hyperparameter suggestions. Objects
            that only carry a `params` mapping are accepted as well.
        data_info: Dictionary containing dataset information and statistics;
            `target_mean` and `target_std` are read from it.

    Returns:
        Configured GDLPipelineLightningModule
    """

    def _param(method, name, *args, default, **kwargs):
        """Return `trial.<method>(name, ...)`, or the stored/default value if the method is missing."""
        suggest = getattr(trial, method, None)
        if suggest is not None:
            return suggest(name, *args, **kwargs)
        return trial.params.get(name, default)

    # Core architecture hyperparameters
    hidden_dim = _param('suggest_categorical', 'hidden_dim', [64, 128, 256, 512], default=128)
    layer_name = _param(
        'suggest_categorical', 'layer_name',
        ['GCN', 'GAT', 'GATv2', 'SAGE', 'GINEConv', 'GINConv', 'PNA'],
        default='GCN',
    )
    pooling_type = _param(
        'suggest_categorical', 'pooling_type',
        ['mean', 'max', 'attentional', 'set2set'],
        default='mean',
    )
    regressor_type = _param(
        'suggest_categorical', 'regressor_type',
        ['linear', 'mlp', 'residual_mlp', 'attention_mlp'],
        default='mlp',
    )
    num_layers = _param('suggest_int', 'num_layers', 3, 6, default=3)

    # Regularization hyperparameters
    dropout_rate = _param('suggest_float', 'dropout_rate', 0.0, 0.4, default=0.2)
    global_dropout = _param('suggest_float', 'global_dropout', 0.0, 0.3, default=0.1)

    # MLP specific parameters (if applicable)
    if regressor_type != 'linear':
        # The layout is encoded as an integer choice so the search space does
        # not depend on the sampled hidden_dim.
        layouts = {
            0: [hidden_dim // 2],
            1: [hidden_dim, hidden_dim // 2],
            2: [hidden_dim, hidden_dim // 2, hidden_dim // 4],
        }
        stored = getattr(trial, 'params', None) or {}

        if 'regressor_choice' in stored:
            # Re-evaluating a trial that already recorded its choice
            regressor_hidden_dims = layouts[stored['regressor_choice']]
        elif 'regressor_hidden_dims' in stored:
            # Very old trials that stored the layer sizes directly
            regressor_hidden_dims = stored['regressor_hidden_dims']
        elif isinstance(trial, optuna.Trial) or (
            hasattr(trial, 'suggest_int') and not hasattr(trial, 'params')
        ):
            # New trial: sample the layout. Live optuna.Trial objects expose
            # `params`, so the former `not hasattr(trial, 'params')` test
            # alone was never true for them and the layout was never searched.
            regressor_hidden_dims = layouts[trial.suggest_int('regressor_choice', 0, 2)]
        else:
            # Fallback for any other case
            regressor_hidden_dims = [hidden_dim, hidden_dim // 2]

        mlp_dropout = _param('suggest_float', 'mlp_dropout', 0.0, 0.3, default=0.1)
    else:
        regressor_hidden_dims = []
        mlp_dropout = 0.0

    # Optimization hyperparameters
    optimizer = _param('suggest_categorical', 'optimizer', ['adam', 'adamw', 'sgd'], default='adam')
    lr_scheduler = _param(
        'suggest_categorical', 'lr_scheduler',
        ['cosine', 'plateau', 'step', 'none'],
        default='cosine',
    )
    use_batch_norm = _param('suggest_categorical', 'use_batch_norm', [True, False], default=True)

    lr = _param('suggest_float', 'lr', 1e-5, 1e-2, log=True, default=1e-3)
    weight_decay = _param('suggest_float', 'weight_decay', 1e-6, 1e-2, log=True, default=1e-4)
    gradient_clip_val = _param('suggest_float', 'gradient_clip_val', 0.5, 2.0, default=1.0)

    # Create configuration objects
    gnn_config = GNNConfig(
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        layer_name=layer_name,
        dropout_rate=dropout_rate,
        use_residual=True,  # Always use residual connections
        use_layer_norm=True,  # Always use layer normalization
    )

    # Both values are fixed regardless of pooling type (the previous
    # conditionals had identical branches).
    pooling_config = PoolingConfig(
        pooling_type=pooling_type,
        processing_steps=3,
        attention_hidden_multiplier=1.0
    )

    regressor_config = RegressorConfig(
        regressor_type=regressor_type,
        hidden_dims=regressor_hidden_dims,
        mlp_dropout=mlp_dropout,
        normalization='batch' if use_batch_norm else 'none'
    )

    # Create the model
    model = create_lightning_custom(
        node_features=NUM_NODE_FEATS,
        edge_features=NUM_EDGE_FEATS,
        gnn_config=gnn_config,
        pooling_config=pooling_config,
        regressor_config=regressor_config,
        global_dropout=global_dropout,
        use_batch_norm=use_batch_norm,

        # Target normalization
        target_mean=data_info['target_mean'],
        target_std=data_info['target_std'],

        # Optimization
        optimizer=optimizer,
        lr=lr,
        weight_decay=weight_decay,
        lr_scheduler=lr_scheduler,
        gradient_clip_val=gradient_clip_val,

        # Monitoring
        monitor_metric='val_mae',
        log_embeddings=False,  # Disable for performance
        log_predictions=False,  # Disable for performance
    )

    return model

# 🎯 Optimized Objective Function with Enhanced W&B Support
def _finalize_wandb_run(run, payload, log_warning):
    """Log a last payload to `run` and close it, printing (never raising) W&B errors.

    Args:
        run: Object returned by ``wandb.init`` or ``None``; falsy values are ignored.
        payload: Dictionary passed to ``run.log``.
        log_warning: Prefix printed in front of the error when ``run.log`` fails.
    """
    if not run:
        return
    try:
        run.log(payload)
    except Exception as log_error:
        print(f"{log_warning}: {log_error}")
    # Finish under its own guard so a failed log cannot leave the run open.
    try:
        run.finish()
    except Exception as finish_error:
        print(f"⚠️ W&B run could not be finished: {finish_error}")


def objective(trial):
    """
    Robust objective function for Optuna optimization with W&B logging.
    Handles entity permission issues and provides detailed error information.

    Training is simulated: three epochs of randomly generated losses are
    reported to Optuna, no model is built.

    Args:
        trial: Optuna trial used to sample hyperparameters, report
            intermediate values and query pruning.

    Returns:
        The lowest simulated validation loss, or ``float('inf')`` when the
        trial fails with an unexpected error.

    Raises:
        optuna.TrialPruned: When ``trial.should_prune()`` returns True.
    """
    import random
    import time

    # Bound before the try block: the failure handler at the bottom reads it,
    # and hyperparameter suggestion can raise before W&B is ever touched.
    wandb_run = None

    try:
        # Suggest hyperparameters
        num_layers = trial.suggest_int('num_layers', 2, 6)
        hidden_dim = trial.suggest_categorical('hidden_dim', [64, 128, 256, 512])
        dropout = trial.suggest_float('dropout', 0.0, 0.5)
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)

        # Create trial config
        trial_config = {
            'num_layers': num_layers,
            'hidden_dim': hidden_dim,
            'dropout': dropout,
            'learning_rate': learning_rate,
            'optimizer': 'AdamW',
            'trial_number': trial.number
        }

        # Arguments shared by every wandb.init attempt for this trial
        init_kwargs = {
            'project': PROJECT_NAME,
            'config': trial_config,
            'name': f"trial_{trial.number}",
            'tags': ["optuna", "hyperparameter_tuning"],
            'reinit': True,
        }

        # Initialize W&B with enhanced error handling
        try:
            print(f"🔄 Trial {trial.number}: Initializing W&B...")

            # Validate entity before run creation
            if not WANDB_ENTITY:
                print("⚠️ WANDB_ENTITY not set, using default entity")
                wandb_run = wandb.init(**init_kwargs)
            else:
                print(f"🏢 Using entity: {WANDB_ENTITY}")
                wandb_run = wandb.init(entity=WANDB_ENTITY, **init_kwargs)

            print(f"✅ W&B run initialized: {wandb_run.id}")

        except Exception as wandb_error:
            print(f"⚠️ W&B initialization failed: {wandb_error}")
            error_msg = str(wandb_error).lower()
            wandb_run = None

            if "permission denied" in error_msg or "403" in error_msg:
                print(f"\n🚨 403 Permission Denied Error Detected!")
                print(f"   Current entity: {WANDB_ENTITY}")
                print(f"   This usually means the entity name is incorrect.")
                print(f"   Solution: Update WANDB_ENTITY to your organization name")

                # Try without entity as fallback
                print(f"   Attempting fallback without entity...")
                fallback_kwargs = dict(
                    init_kwargs,
                    name=f"trial_{trial.number}_fallback",
                    tags=["optuna", "hyperparameter_tuning", "entity_fallback"],
                )
                try:
                    wandb_run = wandb.init(**fallback_kwargs)
                    print(f"✅ Fallback W&B run successful: {wandb_run.id}")
                except Exception as fallback_error:
                    print(f"❌ Fallback also failed: {fallback_error}")
                    wandb_run = None

        # Setup model and data (simulated for now)
        print(f"🏗️ Trial {trial.number}: Building model with config: {trial_config}")

        # Simulate epochs (replace with actual model training)
        best_val_loss = float('inf')
        for epoch in range(3):  # Reduced for faster testing
            # Simulate training metrics
            train_loss = random.uniform(0.1, 1.0) * (0.9 ** epoch)
            val_loss = random.uniform(0.1, 1.0) * (0.9 ** epoch)

            metrics = {
                'epoch': epoch,
                'train_loss': train_loss,
                'val_loss': val_loss,
                'learning_rate': learning_rate
            }

            # Log to W&B if available
            if wandb_run:
                try:
                    wandb_run.log(metrics)
                except Exception as log_error:
                    print(f"⚠️ W&B logging failed: {log_error}")

            # Report to Optuna
            trial.report(val_loss, epoch)

            # Update best validation loss
            if val_loss < best_val_loss:
                best_val_loss = val_loss

            # Check for pruning
            if trial.should_prune():
                print(f"🔥 Trial {trial.number} pruned at epoch {epoch}")
                # Best effort only: a W&B error here must not be caught by the
                # generic handler below and turn the prune into a failed trial.
                _finalize_wandb_run(
                    wandb_run,
                    {'pruned': True, 'pruned_epoch': epoch},
                    "⚠️ W&B logging failed",
                )
                raise optuna.TrialPruned()

            time.sleep(0.1)  # Small delay to simulate training

        # Log final results
        _finalize_wandb_run(
            wandb_run,
            {'final_val_loss': best_val_loss, 'trial_completed': True},
            "⚠️ Final W&B logging failed",
        )

        print(f"✅ Trial {trial.number} completed with val_loss: {best_val_loss:.4f}")
        return best_val_loss

    except optuna.TrialPruned:
        raise  # Re-raise pruned trials
    except Exception as e:
        print(f"❌ Trial {trial.number} failed: {e}")
        _finalize_wandb_run(
            wandb_run,
            {'error': str(e), 'trial_failed': True},
            "⚠️ W&B cleanup failed",
        )

        # Return a high loss value for failed trials instead of crashing
        return float('inf')

print("✅ Multi-objective optimization setup completed!")

In [None]:
def objective(trial: optuna.Trial) -> Tuple[float, float, float, float, float]:
    """
    Multi-objective optimization function for Optuna.

    Trains one model configured from `trial` and reports five values, all of
    which the study minimizes:
    1. Validation MAE (primary objective)
    2. Memory consumption (MB)
    3. Training time (minutes)
    4. Inverse throughput (to maximize throughput)
    5. Latency (ms per sample)

    Args:
        trial: Optuna trial object

    Returns:
        Tuple of objectives to minimize. A trial that fails with an unexpected
        error returns five ``float('inf')`` values instead of raising.

    Raises:
        optuna.TrialPruned: If the validation MAE cannot be retrieved or an
            objective value is not finite.
    """

    # Reset system monitor for this trial
    system_monitor.reset()

    # Pre-bind everything the except/finally clauses touch so they never have
    # to probe locals() for names that may not exist yet.
    model = None
    trainer = None
    callbacks = None
    logger = None
    wandb_exit_code = None  # set to 1 when the trial fails unexpectedly

    try:
        # Create model with trial hyperparameters
        model = create_model_with_config(trial, data_info)

        # Initialize W&B logging for this trial if available
        if WANDB_AVAILABLE and WANDB_API_KEY:
            try:
                logger = WandbLogger(
                    project=WANDB_PROJECT,
                    entity=WANDB_ENTITY,
                    name=f"trial_{trial.number}",
                    group="optuna_optimization",
                    tags=["multi_objective", "zinc", "graph_regression"],
                    config={
                        **{f"hp_{k}": v for k, v in trial.params.items()},
                        **CONFIG,
                        "trial_number": trial.number,
                        "debug_mode": globals().get('DEBUG_MODE', False),
                    }
                    # Note: settings parameter doesn't exist in WandbLogger
                    # W&B settings are controlled via environment variables
                )
            except Exception as e:
                error_msg = str(e).lower()
                if "permission denied" in error_msg or "upsert bucket" in error_msg:
                    print(f"⚠️ W&B authentication issue, continuing without logging: {e}")
                elif "api_key" in error_msg or "login" in error_msg:
                    print(f"⚠️ W&B API key issue, continuing without logging: {e}")
                else:
                    print(f"⚠️ W&B logger failed to initialize: {e}")
                logger = None

        # Set up callbacks - fresh instances for each trial
        callbacks = [
            EarlyStopping(
                monitor='val_mae',
                patience=CONFIG['early_stopping_patience'],
                mode='min',
                verbose=False
            ),
            LearningRateMonitor(logging_interval='epoch'),
        ]

        # Skip Optuna pruning callback to avoid "Expected a parent" error
        # This callback can cause issues with PyTorch Lightning's callback system
        # Manual pruning can be implemented if needed

        # Create trainer with simplified configuration
        trainer_kwargs = {
            'max_epochs': CONFIG['max_epochs'],
            'accelerator': 'auto',
            'devices': 1,
            'logger': logger,
            'callbacks': callbacks,
            'enable_progress_bar': False,  # Disable for cleaner output
            'enable_checkpointing': False,  # Disable to save space
            'val_check_interval': CONFIG['val_check_interval'],
            'log_every_n_steps': CONFIG['log_every_n_steps'],
            'deterministic': True,
        }

        # Add gradient clipping if the model exposes it through its hparams
        clip_val = getattr(getattr(model, 'hparams', None), 'gradient_clip_val', None)
        if clip_val is not None:
            trainer_kwargs['gradient_clip_val'] = clip_val

        trainer = Trainer(**trainer_kwargs)

        # Start monitoring
        print(f"\n🚀 Starting trial {trial.number}")
        trial_start_time = time.time()

        # Start system monitoring manually
        system_monitor.start_monitoring()

        # Train the model
        trainer.fit(
            model,
            train_dataloaders=data_info['train_loader'],
            val_dataloaders=data_info['val_loader']
        )

        # Estimate samples processed (rough calculation)
        # NOTE(review): recent Lightning releases appear to report the number
        # of finished epochs in current_epoch after fit(); if so the +1
        # overcounts by one epoch - confirm against the installed version.
        num_epochs = trainer.current_epoch + 1
        samples_per_epoch = data_info['num_train_samples'] + data_info['num_val_samples']
        system_monitor.samples_processed = num_epochs * samples_per_epoch
        system_monitor.epoch_count = num_epochs

        # Get final validation metrics
        try:
            val_results = trainer.validate(model, data_info['val_loader'], verbose=False)
            val_mae = val_results[0]['val_mae']
        except Exception as e:
            print(f"⚠️ Could not get validation results: {e}")
            # Try to get from callback metrics
            val_mae = trainer.callback_metrics.get('val_mae', float('inf'))
            if val_mae == float('inf'):
                print("❌ Could not retrieve validation MAE")
                raise optuna.TrialPruned()

        # Get system metrics
        system_metrics = system_monitor.get_metrics()

        # Calculate objectives with fallbacks
        memory_consumption = max(0, system_metrics.get('memory_consumption_mb', 0))
        training_time = max(0.01, system_metrics.get('training_time_minutes', 0.01))  # Minimum 0.01 to avoid division by zero
        throughput = max(1e-6, system_metrics.get('throughput_samples_per_sec', 1e-6))
        latency = system_metrics.get('latency_ms_per_sample', float('inf'))

        # Inverse throughput (to minimize for maximizing throughput)
        inverse_throughput = 1.0 / throughput

        # Sanity checks for objectives
        if not torch.isfinite(torch.tensor(val_mae)):
            print(f"❌ Invalid validation MAE: {val_mae}")
            raise optuna.TrialPruned()

        # Cap extreme values to prevent optimizer confusion
        memory_consumption = min(memory_consumption, 10000)  # Max 10GB
        training_time = min(training_time, 120)  # Max 2 hours
        inverse_throughput = min(inverse_throughput, 1000)  # Min 0.001 throughput
        latency = min(latency, 10000)  # Max 10 seconds per sample

        # Log trial results
        trial_results = {
            'val_mae': float(val_mae),
            'memory_consumption_mb': memory_consumption,
            'training_time_minutes': training_time,
            'throughput_samples_per_sec': throughput,
            'latency_ms_per_sample': latency,
            'inverse_throughput': inverse_throughput,
            'model_parameters': sum(p.numel() for p in model.parameters()),
            'trial_duration_minutes': (time.time() - trial_start_time) / 60,
            'epochs_completed': system_metrics.get('epochs_completed', 0),
        }

        print(f"📊 Trial {trial.number} Results:")
        for key, value in trial_results.items():
            if isinstance(value, float):
                print(f"  {key}: {value:.4f}")
            else:
                print(f"  {key}: {value}")

        # Log to W&B if available (the run itself is closed in `finally`)
        if logger is not None:
            try:
                logger.experiment.log({
                    "trial_objectives/val_mae": val_mae,
                    "trial_objectives/memory_consumption_mb": memory_consumption,
                    "trial_objectives/training_time_minutes": training_time,
                    "trial_objectives/inverse_throughput": inverse_throughput,
                    "trial_objectives/latency_ms": latency,
                    **{f"trial_results/{k}": v for k, v in trial_results.items()},
                    **{f"system_metrics/{k}": v for k, v in system_metrics.items()},
                })
            except Exception as e:
                print(f"⚠️ Failed to log to W&B: {e}")

        # Return objectives to minimize
        objectives = (float(val_mae), memory_consumption, training_time, inverse_throughput, latency)

        # Final validation of objectives
        if any(not torch.isfinite(torch.tensor(obj)) for obj in objectives):
            print(f"❌ Invalid objectives detected: {objectives}")
            raise optuna.TrialPruned()

        return objectives

    except optuna.TrialPruned:
        print(f"✂️ Trial {trial.number} was pruned")
        raise  # Re-raise pruning exception

    except Exception as e:
        print(f"❌ Trial {trial.number} failed: {str(e)}")
        print(f"Error type: {type(e).__name__}")

        # Log error details for debugging
        import traceback
        error_details = traceback.format_exc()
        print(f"Error traceback: {error_details}")

        # Mark the W&B run (closed in `finally`) as failed
        wandb_exit_code = 1

        # Return poor objectives for failed trials
        return (float('inf'), float('inf'), float('inf'), float('inf'), float('inf'))

    finally:
        # Close the W&B run on every exit path (success, prune, failure).
        # A run left open would be picked up again by the next trial's
        # WandbLogger instead of a new run being started.
        if logger is not None:
            try:
                wandb.finish(exit_code=wandb_exit_code)
            except Exception as cleanup_error:
                print(f"⚠️ W&B cleanup failed: {cleanup_error}")

        # Clean up GPU memory and resources
        try:
            # Drop the references first: collecting and emptying the CUDA
            # cache can only release memory that nothing points to any more.
            trainer = None
            model = None
            callbacks = None
            logger = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            print(f"⚠️ Cleanup warning: {e}")

print("✅ Enhanced objective function with callback fix defined!")

# Placeholder smoke check: no trial is run and objective() is not called
# here; the guarded section only emits the two status lines below.
try:
    print(
        "🧪 Testing objective function setup...",
        "✅ Objective function setup test passed!",
        sep="\n",
    )
except Exception as setup_error:
    print(
        f"❌ Objective function setup test failed: {setup_error}",
        "Please check your configuration before proceeding.",
        sep="\n",
    )

In [None]:
# Create and Run Optuna Study

def create_optuna_study():
    """Build (or reopen) the five-objective Optuna study on an RDB storage.

    The remote database at ``OPTUNA_DB_URL`` is tried first when that setting
    is truthy. If it is unset, or the connection attempt raises, the local
    SQLite file ``optuna_study.db`` is used instead.

    Returns:
        The study returned by ``optuna.create_study``; an existing study named
        ``STUDY_NAME`` is loaded rather than recreated.
    """
    backend = None

    if OPTUNA_DB_URL:
        try:
            backend = optuna.storages.RDBStorage(
                url=OPTUNA_DB_URL,
                engine_kwargs={"pool_pre_ping": True},
            )
        except Exception as connect_error:
            print(f"⚠️ Failed to connect to remote database: {connect_error}")
        else:
            print("✅ Connected to remote Optuna database")

    # Reached when no URL is configured or the remote connection failed
    if backend is None:
        print("📁 Using local SQLite storage")
        backend = optuna.storages.RDBStorage("sqlite:///optuna_study.db")

    study_options = {
        'study_name': STUDY_NAME,
        'storage': backend,
        'directions': ['minimize'] * 5,  # every objective is minimized
        'sampler': optuna.samplers.NSGAIISampler(population_size=20),
        'pruner': optuna.pruners.MedianPruner(
            n_startup_trials=5,
            n_warmup_steps=10,
            interval_steps=5,
        ),
        'load_if_exists': True,
    }
    return optuna.create_study(**study_options)

def run_optimization():
    """Create the study, run the trials and print the result analysis.

    Interruptions and errors raised while optimizing or analyzing are
    reported on stdout rather than propagated; errors from study creation
    are not caught.

    Returns:
        The Optuna study, including whatever trials finished.
    """
    rule = "=" * 60
    print(
        "🎯 Starting Multi-Objective Hyperparameter Optimization",
        rule,
        "Objectives to minimize:",
        "  1. Validation MAE (primary)",
        "  2. Memory consumption (MB)",
        "  3. Training time (minutes)",
        "  4. Inverse throughput (maximize throughput)",
        "  5. Latency per sample (ms)",
        rule,
        sep="\n",
    )

    study = create_optuna_study()

    # Wall-clock budget for the whole search, derived from the configured hours
    budget_seconds = CONFIG['timeout_hours'] * 3600

    try:
        study.optimize(
            objective,
            n_trials=CONFIG['num_trials'],
            timeout=budget_seconds,
            show_progress_bar=True,
        )
        print("\n🎉 Optimization completed!")
        analyze_optimization_results(study)
    except KeyboardInterrupt:
        print("\n⏹️ Optimization interrupted by user")
    except Exception as run_error:
        print(f"\n❌ Optimization failed: {str(run_error)}")

    # Every path hands back the study so partial results stay usable
    return study

def _dominates(candidate, reference):
    """Return True if objective vector `candidate` Pareto-dominates `reference` (all minimized)."""
    no_worse = all(c <= r for c, r in zip(candidate, reference))
    strictly_better = any(c < r for c, r in zip(candidate, reference))
    return no_worse and strictly_better


def _save_trials_csv(trials, results_file="optuna_results.csv"):
    """Write one row per trial (five objective values plus sampled params) to `results_file`."""
    results_df = pd.DataFrame([
        {
            'trial_number': trial.number,
            'val_mae': trial.values[0],
            'memory_mb': trial.values[1],
            'training_time_min': trial.values[2],
            'inverse_throughput': trial.values[3],
            'latency_ms': trial.values[4],
            **trial.params
        }
        for trial in trials
    ])
    results_df.to_csv(results_file, index=False)
    print(f"\n💾 Results saved to {results_file}")


def analyze_optimization_results(study: optuna.Study):
    """Analyze and display optimization results.

    Prints trial counts by state, the size of the Pareto front, the best
    trial per objective and min/max/mean per objective, and writes all
    completed trials to ``optuna_results.csv``.

    Completed trials whose objective values are not all finite are kept in
    the CSV but left out of the Pareto front, the per-objective winners and
    the statistics; otherwise a single such trial turns every mean into
    ``inf``.

    Args:
        study: Study whose trials each carry five objective values.
    """

    print("\n📊 Optimization Results Analysis")
    print("="*50)

    # Get all trials
    trials = study.trials
    completed_trials = [t for t in trials if t.state == optuna.trial.TrialState.COMPLETE]

    print(f"Total trials: {len(trials)}")
    print(f"Completed trials: {len(completed_trials)}")
    print(f"Pruned trials: {len([t for t in trials if t.state == optuna.trial.TrialState.PRUNED])}")
    print(f"Failed trials: {len([t for t in trials if t.state == optuna.trial.TrialState.FAIL])}")

    if not completed_trials:
        print("⚠️ No completed trials to analyze")
        return

    # Trials with non-finite objectives (e.g. an all-inf failure tuple returned
    # by the objective) are stored as COMPLETE; keep them out of the analysis.
    valid_trials = [
        t for t in completed_trials
        if t.values is not None and np.all(np.isfinite(t.values))
    ]
    excluded_count = len(completed_trials) - len(valid_trials)
    if excluded_count:
        print(f"Completed trials with non-finite objectives (excluded from analysis): {excluded_count}")

    if not valid_trials:
        print("⚠️ No completed trials with finite objectives to analyze")
        _save_trials_csv(completed_trials)
        return

    # Find Pareto front (best trade-offs): trials no other trial dominates
    pareto_trials = [
        trial for trial in valid_trials
        if not any(
            _dominates(other.values, trial.values)
            for other in valid_trials
            if other is not trial
        )
    ]

    print(f"\n🏆 Pareto-optimal solutions: {len(pareto_trials)}")

    # Display best solutions for each objective
    objective_names = [
        "Validation MAE",
        "Memory Consumption (MB)",
        "Training Time (min)",
        "Inverse Throughput",
        "Latency (ms)"
    ]

    print("\n🥇 Best solutions for each objective:")
    for i, obj_name in enumerate(objective_names):
        best_trial = min(valid_trials, key=lambda t: t.values[i])
        print(f"\n{obj_name}:")
        print(f"  Value: {best_trial.values[i]:.4f}")
        print(f"  Trial: {best_trial.number}")
        print(f"  Key params: {dict(list(best_trial.params.items())[:3])}")

    # Save results (every completed trial, so the raw record stays complete)
    _save_trials_csv(completed_trials)

    # Display summary statistics
    print("\n📈 Summary Statistics:")
    for i, obj_name in enumerate(objective_names):
        values = [trial.values[i] for trial in valid_trials]
        print(f"{obj_name}: min={min(values):.4f}, max={max(values):.4f}, mean={np.mean(values):.4f}")

print("✅ Optimization functions ready!")

In [2]:
# 🚀 RUN OPTIMIZATION
# This is the main execution cell:
#   1. validate that data, system monitor and model creation work,
#   2. run the Optuna search,
#   3. pick the best trial by validation MAE and log it to W&B,
#   4. print a final summary.

if __name__ == "__main__":
    print("🎯 ZINC Graph Regression Multi-Objective Hyperparameter Optimization")
    print("="*70)
    print(f"Environment: {'Kaggle GPU' if torch.cuda.is_available() else 'CPU'}")
    print("Dataset: ZINC (Small subset)")
    print(f"Max trials: {CONFIG['num_trials']}")
    print(f"Timeout: {CONFIG['timeout_hours']} hours")
    print(f"Mode: {'DEBUG' if DEBUG_MODE else 'PRODUCTION'}")
    print("="*70)

    class DummyTrial:
        """Stand-in for an Optuna trial that replays a fixed parameter dict.

        Lets create_model_with_config be called outside of a study. The
        suggest_* methods ignore their ranges and return the stored value;
        extra keyword options such as `log` or `step` are accepted and ignored.
        """

        def __init__(self, params):
            self.params = params
            self.number = 0

        def suggest_categorical(self, name, choices):
            return self.params[name]

        def suggest_int(self, name, low, high, **_options):
            return self.params[name]

        def suggest_float(self, name, low, high, **_options):
            return self.params[name]

    # Pre-execution validation
    try:
        print("\n🔍 Pre-execution validation...")

        # Check data is loaded
        if 'data_info' not in locals():
            raise RuntimeError("Data not loaded. Please run data preparation cell first.")

        # Check system monitor is ready
        if 'system_monitor' not in locals():
            raise RuntimeError("System monitor not initialized. Please run system monitor cell first.")

        # Quick model creation test
        print("🧪 Testing model creation...")

        test_params = {
            'hidden_dim': 128,
            'num_layers': 3,
            'layer_name': 'GINEConv',
            'dropout_rate': 0.1,
            'global_dropout': 0.1,
            'pooling_type': 'mean',
            'regressor_type': 'mlp',
            'regressor_choice': 1,  # This will give [hidden_dim, hidden_dim//2]
            'mlp_dropout': 0.1,
            'optimizer': 'adamw',
            'lr': 1e-3,
            'weight_decay': 1e-4,
            'lr_scheduler': 'cosine',
            'use_batch_norm': True,
            'gradient_clip_val': 1.0,
        }

        dummy_trial = DummyTrial(test_params)
        test_model = create_model_with_config(dummy_trial, data_info)
        del test_model, dummy_trial, test_params
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print("✅ Pre-execution validation passed!")

    except Exception as e:
        print(f"❌ Pre-execution validation failed: {e}")
        print("Please check your setup and try again.")
        raise

    # Run the optimization with comprehensive error handling
    try:
        study = run_optimization()
        optimization_successful = True

    except KeyboardInterrupt:
        print("\n⏹️ Optimization interrupted by user")
        study = None
        optimization_successful = False

    except Exception as e:
        print(f"\n❌ Optimization failed with error: {str(e)}")
        import traceback
        print(f"Error details: {traceback.format_exc()}")
        study = None
        optimization_successful = False

    # Initialize the W&B summary run if available.
    # This happens only after the search: a run opened beforehand would be
    # the active run when the first trial builds its WandbLogger, get reused
    # for that trial and be finished together with it, so the summary logged
    # further down would target a closed run.
    project_run = None
    if WANDB_AVAILABLE and WANDB_API_KEY:
        try:
            wandb.finish()  # close any run a trial may have left open
            project_run = wandb.init(
                project=WANDB_PROJECT,
                entity=WANDB_ENTITY,
                name="multi_objective_optimization_master",
                job_type="hyperparameter_search",
                tags=["optuna", "multi_objective", "zinc", "graph_regression"],
                config={
                    **CONFIG,
                    "debug_mode": DEBUG_MODE,
                    "total_dataset_size": data_info['num_train_samples'] + data_info['num_val_samples'],
                    "batch_size": BATCH_SIZE if 'BATCH_SIZE' in locals() else 32,
                }
            )
            print("✅ W&B master experiment tracking initialized")
        except Exception as e:
            print(f"⚠️ W&B master initialization failed: {e}")

    # Analyze results if we have any
    best_trial_info = None
    if study and study.trials:
        try:
            # Selection uses validation metrics only; the test set is not touched.
            print("\n🧪 Selecting best model based on validation performance...")

            completed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

            # Trials with a non-finite MAE (e.g. an all-inf failure tuple
            # stored as COMPLETE) must never be picked as the best model.
            ranked_trials = [t for t in completed_trials if np.isfinite(t.values[0])]

            if ranked_trials:
                # Get best trial for primary objective (validation MAE)
                best_mae_trial = min(ranked_trials, key=lambda t: t.values[0])

                print(f"\n🏆 Best model (lowest validation MAE):")
                print(f"Trial: {best_mae_trial.number}")
                print(f"Validation MAE: {best_mae_trial.values[0]:.4f}")
                print(f"Memory consumption: {best_mae_trial.values[1]:.1f} MB")
                print(f"Training time: {best_mae_trial.values[2]:.2f} minutes")
                print(f"Throughput: {1.0/best_mae_trial.values[3]:.1f} samples/sec")
                print(f"Latency: {best_mae_trial.values[4]:.2f} ms/sample")

                print(f"\n✅ Best hyperparameters identified based on validation performance")
                print(f"📝 Note: Test set is reserved for final evaluation after optimization")

                # Calculate model parameters by rebuilding the best configuration
                try:
                    dummy_trial = DummyTrial(best_mae_trial.params)
                    temp_model = create_model_with_config(dummy_trial, data_info)
                    model_params = sum(p.numel() for p in temp_model.parameters())
                    del temp_model, dummy_trial
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                except Exception as e:
                    print(f"⚠️ Could not calculate model parameters: {e}")
                    model_params = 0

                # Store best trial info for summary (no test results)
                best_trial_info = {
                    "trial_number": best_mae_trial.number,
                    "val_mae": best_mae_trial.values[0],
                    "memory_mb": best_mae_trial.values[1],
                    "training_time_min": best_mae_trial.values[2],
                    "model_parameters": model_params,
                    "params": best_mae_trial.params
                }

                # Log validation results to W&B
                if project_run is not None:
                    try:
                        project_run.log({
                            "best_model/val_mae": best_mae_trial.values[0],
                            "best_model/memory_consumption_mb": best_mae_trial.values[1],
                            "best_model/training_time_minutes": best_mae_trial.values[2],
                            "best_model/model_parameters": model_params,
                            **{f"best_model/hp_{k}": v for k, v in best_mae_trial.params.items()},
                        })
                        print("✅ Best model validation results logged to W&B")
                    except Exception as e:
                        print(f"⚠️ Failed to log results to W&B: {e}")

                print("\n✅ Hyperparameter optimization analysis completed!")
                print("🔒 Test set remains untouched for unbiased final evaluation")

            else:
                print("❌ No completed trials to evaluate")

        except Exception as e:
            print(f"❌ Results analysis failed: {e}")
            import traceback
            print(f"Error details: {traceback.format_exc()}")

    # Final cleanup and summary
    try:
        # Finish W&B logging
        if project_run is not None:
            project_run.finish()

        print("\n📋 FINAL SUMMARY:")
        print("="*50)
        print(f"- Dataset: ZINC (training: {data_info['num_train_samples']}, validation: {data_info['num_val_samples']}, test: {data_info['num_test_samples']})")
        print(f"- Optimization: {'✅ Completed' if optimization_successful else '❌ Failed/Interrupted'}")
        print(f"- Total trials: {len(study.trials) if study else 0}")
        print(f"- Completed trials: {len([t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]) if study else 0}")
        print(f"- Results saved: {'✅ optuna_results.csv' if study and study.trials else '❌ No results to save'}")
        print(f"- W&B logging: {'✅' if WANDB_AVAILABLE and WANDB_API_KEY else '❌'}")
        print(f"- Remote database: {'✅' if OPTUNA_DB_URL else '❌ (local SQLite used)'}")

        if best_trial_info:
            # .get(): a trial need not have sampled every architecture
            # parameter, and a missing key must not abort the summary.
            best_params = best_trial_info['params']
            architecture = "-".join(
                str(best_params.get(key, '?'))
                for key in ('layer_name', 'hidden_dim', 'num_layers')
            )
            print(f"\n🏆 BEST MODEL SUMMARY (Validation-Based):")
            print(f"- Trial: {best_trial_info['trial_number']}")
            print(f"- Validation MAE: {best_trial_info['val_mae']:.4f}")
            print(f"- Architecture: {architecture}")
            print(f"- Parameters: {best_trial_info['model_parameters']:,}")
            print(f"- Memory: {best_trial_info['memory_mb']:.1f} MB")
            print(f"- Training time: {best_trial_info['training_time_min']:.2f} min")
            print(f"🔒 Test performance: Reserved for unbiased final evaluation")

        print("\n🎉 Multi-objective hyperparameter optimization completed!")
        print("Check the CSV file and W&B dashboard for detailed results.")
        print("\n🔒 IMPORTANT: Test set preserved for unbiased final evaluation!")
        print("   Use the best hyperparameters to train a final model and")
        print("   evaluate it ONCE on the test set for publication results.")

    except Exception as e:
        print(f"⚠️ Final cleanup/summary failed: {e}")

    print("\n" + "="*70)
    print("Notebook execution completed. Thank you for using ACA GraphML!")
    print("="*70)

🎯 ZINC Graph Regression Multi-Objective Hyperparameter Optimization


NameError: name 'torch' is not defined

In [None]:
# 🔍 W&B Entity Diagnostic
# Prints the logged-in W&B username, the configured WANDB_ENTITY and how many
# projects the default entity exposes, followed by instructions for finding
# the right entity name.

try:
    import wandb

    api = wandb.Api()
    user = api.viewer

    print("🔍 W&B Account Information:")
    print(f"   Personal Username: {user.username}")
    print(f"   Current Entity Setting: {WANDB_ENTITY or 'Not set'}")

    # Header only: the lookup below counts projects, it does not enumerate teams
    print(f"\n📋 Available Entities for {user.username}:")

    try:
        # Projects of the default entity (no entity argument is passed)
        projects = api.projects()
        print(f"   Default entity projects found: {sum(1 for _ in projects)}")

        print(
            "\n💡 Instructions:",
            "   1. Go to https://wandb.ai and check your dashboard URL",
            "   2. The entity name appears in URLs like: wandb.ai/[ENTITY]/[PROJECT]",
            f"   3. Use the organization/team name, not your personal username '{user.username}'",
            "   4. Update your Kaggle secret 'WANDB_ENTITY' with the correct entity name",
            sep="\n",
        )

    except Exception as listing_error:
        print(f"   Error listing entities: {listing_error}")

except Exception as diagnostic_error:
    print(f"❌ W&B diagnostic failed: {diagnostic_error}")
    print("Make sure W&B is logged in first")

In [None]:
# 🔧 W&B DEBUGGING AND SETUP
# Run this cell if you're having W&B authentication issues

def diagnose_wandb_issues():
    """Diagnose common W&B authentication problems.

    Prints a step-by-step report to stdout:

    1. whether the ``wandb`` package can be imported;
    2. whether the module-level ``WANDB_API_KEY`` is set (truthy);
    3. whether an offline ``wandb.init`` / ``wandb.finish`` round trip works;
    4. whether ``wandb.Api`` accepts the key and can list the projects of
       ``WANDB_ENTITY`` (a falsy entity is treated as "use the default").

    The function returns early when check 1 or 2 fails. Any exception raised
    during checks 3-4 is caught and reported together with a hint chosen from
    the text of the error message.

    The API key itself is never printed: notebook output is frequently saved
    and shared, so only the key length is shown.

    Returns:
        None. All results are reported via ``print``.
    """
    print("🔍 W&B Diagnostic Check")
    print("=" * 50)

    # Check if wandb is installed (only the import itself is guarded)
    try:
        import wandb
    except ImportError:
        print("❌ W&B library not found")
        print("💡 Install with: !pip install wandb")
        return
    print("✅ W&B library is installed")
    print(f"   Version: {wandb.__version__}")

    # Check if API key is configured
    if not WANDB_API_KEY:
        print("❌ W&B API key not configured")
        print("💡 Set WANDB_API_KEY in Kaggle secrets")
        return
    print("✅ W&B API key is configured")
    # Do not echo any characters of the secret into the notebook output;
    # the length is enough to spot an empty or truncated key.
    print(f"   Key preview: ******** ({len(WANDB_API_KEY)} characters, value hidden)")

    # Test API key validity
    try:
        print("\n🧪 Testing W&B API connection...")

        # Initialize W&B in offline mode first to test
        wandb.init(
            mode="offline",
            project="test-connection",
            name="diagnostic-test"
        )
        wandb.finish()
        print("✅ Offline mode works")

        # Now test online mode
        api = wandb.Api(api_key=WANDB_API_KEY)
        user = api.viewer
        print(f"✅ API key valid - logged in as: {user.username}")

        # Normalize an unset/empty entity to None so the API falls back to
        # the account's default entity instead of querying an empty name.
        entity = WANDB_ENTITY or None
        entity_label = entity if entity is not None else "<default entity>"

        # Test project access
        try:
            # list() consumes the result so that any access error is
            # (presumably) raised here rather than later
            list(api.projects(entity=entity))
            print(f"✅ Can access projects for entity: {entity_label}")
        except Exception as e:
            print(f"⚠️ Project access issue: {e}")
            print(f"💡 Check if entity '{entity_label}' exists and you have access")

    except Exception as e:
        print(f"❌ W&B API test failed: {e}")
        error_msg = str(e).lower()

        # Pick a hint based on keywords found in the error text
        if "permission denied" in error_msg or "unauthorized" in error_msg:
            print("💡 API key may be invalid or expired")
            print("   - Go to https://wandb.ai/settings")
            print("   - Generate a new API key")
            print("   - Update your Kaggle secret")
        elif "network" in error_msg or "connection" in error_msg:
            print("💡 Network connectivity issue")
            print("   - Check internet connection")
            print("   - Try again in a few minutes")
        else:
            print("💡 Unknown issue - check W&B status at https://status.wandb.ai")

def setup_wandb_properly():
    """Log in to W&B and switch this process to online logging.

    Reads the module-level ``WANDB_API_KEY``. When it is set, performs a
    forced ``wandb.login`` and then sets the ``WANDB_MODE`` and
    ``WANDB_START_METHOD`` environment variables.

    Returns:
        bool: ``True`` if login and configuration completed, ``False`` if the
        API key is missing or any step raised an exception.
    """
    import os

    print("🔧 Setting up W&B for optimization")
    print("=" * 50)

    # Guard clause: nothing can be done without a key
    if not WANDB_API_KEY:
        print("❌ Cannot setup W&B - API key not configured")
        return False

    try:
        # Forced login first; the environment is only touched if it succeeds
        wandb.login(key=WANDB_API_KEY, force=True)

        # Settings for the optimization workload are supplied through
        # environment variables (wandb.settings.update() is not available
        # in newer versions, per the original author's note)
        os.environ.update(
            WANDB_MODE="online",
            WANDB_START_METHOD="thread",
        )

        print("✅ W&B configured successfully for optimization")
        return True
    except Exception as err:
        print(f"❌ W&B setup failed: {err}")
        return False

# Entry point: run the diagnostic report, then attempt the W&B setup only
# when an API key is available.
if __name__ == "__main__":
    print("🔍 Running W&B diagnostic...")
    diagnose_wandb_issues()

    print("\n" + "="*50)
    if not WANDB_API_KEY:
        # No key: skip setup entirely
        print("⚠️ W&B not configured - optimization will run without logging")
    else:
        print("🔧 Setting up W&B for optimization...")
        success = setup_wandb_properly()
        print(
            "🎉 W&B ready for optimization!"
            if success
            else "❌ W&B setup failed - optimization will run without logging"
        )
    print("="*50)