# Notebook 3: Predictive Model Training and Validation (Optimized)

**Project:** `PharmaControl-Pro`  
**Goal:** Build, train, and validate the predictive 'ML kernel' with **optimized hyperparameter search** and enhanced Optuna configuration for faster convergence and better performance.

### Key Optimizations
- **Intelligent search space design** based on transformer best practices
- **Advanced pruning strategies** for faster convergence
- **Single-objective optimization** of validation loss, with per-trial timing recorded so performance can be weighed against efficiency
- **Adaptive sampling** with informed parameter ranges
- **Enhanced early stopping** with validation-based pruning

In [None]:
import optuna
import torch.optim as optim
import pandas as pd
import numpy as np
import os, sys
from sklearn.preprocessing import MinMaxScaler
import joblib
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import warnings
warnings.filterwarnings('ignore')

# Import our models and dataset
sys.path.append('..') 
from V1.src.model_architecture import GranulationPredictor
from V1.src.dataset import GranulationDataset

# Seed every random number generator used by this notebook so that repeated
# runs draw the same numbers.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)        # NumPy global generator
torch.manual_seed(RANDOM_SEED)     # PyTorch generator
if torch.cuda.is_available():
    # Seed the CUDA generator as well when a GPU is present
    torch.cuda.manual_seed(RANDOM_SEED)

print("🚀 Starting optimized hyperparameter search...")

In [None]:
# --- OPTIMIZED CONFIGURATION ---
# Module-level settings read by the later cells: data location, window
# lengths, feature column groups, and the torch device.
DATA_DIR = '../data'
# LOOKBACK and HORIZON are handed to GranulationDataset when the loaders are
# built; HORIZON is also the length of the loss's horizon-weight ramp.
LOOKBACK = 36
HORIZON = 72 
# CMA columns are the quantities the model predicts; CPP columns are fed to
# the model for both the past and the future window.
CMA_COLS = ['d50', 'lod']
CPP_COLS = ['spray_rate', 'air_flow', 'carousel_speed', 'specific_energy', 'froude_number_proxy']
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# OPTIMIZED: More trials with intelligent early stopping
# Settings for the Optuna hyperparameter study.
OPTUNA_CONFIG = {
    'n_trials': 50,              # Increased from 20 for better exploration
    'tuning_epochs': 8,          # Increased from 5 for better trial evaluation
    'tuning_batch_size': 256,    # Increased from 128 for faster training
    'study_direction': 'minimize',
    'timeout': 3600,             # 1 hour timeout for studies
    'gc_after_trial': True       # Garbage collection to prevent memory issues
}

# OPTIMIZED: Faster final training with better early stopping
# Settings for the final (post-search) training run.
TRAINING_CONFIG = {
    'final_epochs': 40,          # Reduced from 50 (early stopping handles this)
    'final_batch_size': 128,     # Increased from 64 for faster training
    'patience': 8,               # Reduced from 10 for faster convergence
    'gradient_clip_value': 1.0,
    'min_delta': 1e-5           # Slightly relaxed for better convergence
}

# OPTIMIZED: Smarter search space based on transformer best practices
# Lists are categorical choices; 2-tuples are (low, high) bounds.
OPTIMIZED_SEARCH_SPACE = {
    # Transformer dimensions: multiples of 32 (note 96 and 192 are not powers of two)
    'd_model': [32, 64, 96, 128, 192],  # Added 96, 192 for fine-grained search
    
    # Attention heads: Must divide d_model evenly
    # Each factor is multiplied by the base head counts [1, 2, 4, 8] in the
    # sampling function to form the candidate head counts.
    'nhead_factor': [1, 2, 3, 4, 6, 8], # Will multiply with base factors
    
    # Layer configurations: Asymmetric encoder/decoder often works better
    'encoder_layers': (2, 6),      # Deeper encoders for better context
    'decoder_layers': (1, 4),      # Shallower decoders to prevent overfitting
    
    # Learning rate: More focused range around optimal values
    'lr': (3e-4, 2e-3),           # Narrowed range around typical transformer LRs
    
    # Regularization: More granular control
    'dropout': (0.05, 0.25),      # Slightly narrowed based on task complexity
    'weight_decay': (1e-5, 5e-3), # Expanded upper range for better regularization
    
    # NEW: Feedforward dimension ratio (multiplicative factor of d_model)
    'ff_ratio': [1, 2, 4],        # 1x, 2x, 4x d_model for feedforward layers
    
    # NEW: Loss function parameters
    'horizon_start_weight': (0.3, 0.7),  # Start weight for horizon weighting
    'horizon_end_weight': (1.2, 2.0),    # End weight for horizon weighting
}

# OPTIMIZED: Enhanced loss configuration
LOSS_CONFIG = {
    'horizon_weighting': True,
    'adaptive_weighting': True,    # NEW: Adapt weights based on feature importance
    'loss_type': 'mse'
}

# File paths
# Every artifact is written under DATA_DIR.
PATHS = {
    'model_save': os.path.join(DATA_DIR, 'best_predictor_model_optimized.pth'),
    'scalers_save': os.path.join(DATA_DIR, 'model_scalers.joblib'),
    'optuna_study': os.path.join(DATA_DIR, 'optuna_study_optimized.pkl'),
    'training_log': os.path.join(DATA_DIR, 'training_log_optimized.csv'),
    'study_viz': os.path.join(DATA_DIR, 'optuna_study_plots.html')
}

print(f"📊 Optimized Configuration:")
print(f"  Device: {DEVICE}")
print(f"  Trials: {OPTUNA_CONFIG['n_trials']} (vs 20 original)")
print(f"  Epochs per trial: {OPTUNA_CONFIG['tuning_epochs']} (vs 5 original)")
print(f"  Batch size: {OPTUNA_CONFIG['tuning_batch_size']} (vs 128 original)")
# NOTE: the product below is a hard-coded estimate (it evaluates to 1,152,000);
# its factors are literals and are not derived from OPTIMIZED_SEARCH_SPACE, so
# it must be updated by hand if the search space changes.
print(f"  Search space combinations: ~{5*6*5*4*10*8*3*8:.0f} (vs ~720 original)")

In [None]:
# Read the raw train / validation / test splits from DATA_DIR (same files as
# the original notebook), in that order.
df_train_raw, df_val_raw, df_test_raw = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ('train_data_raw.csv', 'validation_data_raw.csv', 'test_data_raw.csv')
)

def create_scalers_and_scale_data(df_train, df_val, df_test, feature_cols):
    """Scale each feature column with its own MinMaxScaler fitted on the training split.

    The input frames are left untouched; scaled copies are returned.

    Args:
        df_train: Training frame; every scaler is fitted on this split only.
        df_val: Validation frame, transformed with the training-fitted scalers.
        df_test: Test frame, transformed with the training-fitted scalers.
        feature_cols: Names of the columns to scale.

    Returns:
        Tuple ``(train_scaled, val_scaled, test_scaled, scalers)`` where
        ``scalers`` maps each column name to its fitted MinMaxScaler.
    """
    scaled_train, scaled_val, scaled_test = df_train.copy(), df_val.copy(), df_test.copy()
    fitted_by_column = {}

    for name in feature_cols:
        column_scaler = MinMaxScaler()
        # Fit on training data only so no validation/test statistics leak in
        scaled_train[name] = column_scaler.fit_transform(df_train[[name]])
        for raw_frame, scaled_frame in ((df_val, scaled_val), (df_test, scaled_test)):
            scaled_frame[name] = column_scaler.transform(raw_frame[[name]])
        fitted_by_column[name] = column_scaler

    return scaled_train, scaled_val, scaled_test, fitted_by_column

# Scale every CMA and CPP column, then persist the fitted scalers to disk.
all_feature_cols = [*CMA_COLS, *CPP_COLS]
df_train, df_val, df_test, scalers = create_scalers_and_scale_data(
    df_train=df_train_raw,
    df_val=df_val_raw,
    df_test=df_test_raw,
    feature_cols=all_feature_cols,
)

joblib.dump(scalers, PATHS['scalers_save'])
print("✅ Data loaded and scaled")

In [None]:
# OPTIMIZED: Enhanced loss function with adaptive weighting
class AdaptiveWeightedHorizonMSELoss(nn.Module):
    """Mean squared error scaled per forecast step and, optionally, per feature.

    The per-step weights form a linear ramp from ``start_weight`` to
    ``end_weight`` over ``horizon`` steps and are stored as a buffer of shape
    ``(1, horizon, 1)``. Optional per-feature weights are stored as a buffer
    of shape ``(1, 1, n_features)``. Both are multiplied into the squared
    error by broadcasting (inputs are presumably
    ``(batch, horizon, features)`` — confirm against the dataset).
    """

    def __init__(self, horizon: int, start_weight: float = 0.5, end_weight: float = 1.5, 
                 feature_weights: list = None):
        super().__init__()

        # Linear ramp over the forecast horizon, registered as a buffer so it
        # follows the module across .to(device) calls.
        ramp = torch.linspace(start_weight, end_weight, horizon)
        self.register_buffer('horizon_weights', ramp.view(1, -1, 1))

        if feature_weights is None:
            # No per-feature scaling requested
            self.feature_weights = None
        else:
            per_feature = torch.tensor(feature_weights)
            self.register_buffer('feature_weights', per_feature.view(1, 1, -1))

    def forward(self, prediction, target):
        """Return the mean of the weighted element-wise squared error."""
        weighted_sq_err = (prediction - target).pow(2) * self.horizon_weights

        if self.feature_weights is not None:
            weighted_sq_err = weighted_sq_err * self.feature_weights

        return weighted_sq_err.mean()

print("✅ Enhanced loss function defined")

In [None]:
# OPTIMIZED: Intelligent hyperparameter sampling
def sample_intelligent_hyperparameters(trial):
    """Sample one hyperparameter configuration from OPTIMIZED_SEARCH_SPACE.

    The choice list for ``'nhead'`` is identical for every trial. Optuna
    rejects a categorical parameter whose choices change between trials
    ("CategoricalDistribution does not support dynamic value space"), so the
    head counts offered are those that divide *every* ``d_model`` option.
    Any sampled ``(d_model, nhead)`` pair is therefore valid.

    Args:
        trial: Optuna trial used to draw the values.

    Returns:
        dict with keys ``d_model``, ``nhead``, ``num_encoder_layers``,
        ``num_decoder_layers``, ``dim_feedforward`` (``d_model * ff_ratio``),
        ``lr``, ``dropout``, ``weight_decay``, ``horizon_start_weight`` and
        ``horizon_end_weight``.
    """
    space = OPTIMIZED_SEARCH_SPACE

    # Sample d_model first (drives other decisions)
    d_model = trial.suggest_categorical('d_model', space['d_model'])

    # Candidate head counts: every factor x base product, de-duplicated.
    candidate_nheads = sorted({
        factor * base
        for factor in space['nhead_factor']
        for base in (1, 2, 4, 8)  # Common attention head counts
    })
    # Keep only head counts that divide all d_model options so the choice
    # list does not depend on the d_model drawn for this trial.
    nhead_choices = [
        n for n in candidate_nheads
        if n >= 1 and all(dm % n == 0 for dm in space['d_model'])
    ]
    if not nhead_choices:
        nhead_choices = [1]  # A single head divides any d_model

    nhead = trial.suggest_categorical('nhead', nhead_choices)

    # Asymmetric encoder/decoder depth ranges
    enc_low, enc_high = space['encoder_layers']
    dec_low, dec_high = space['decoder_layers']
    num_encoder_layers = trial.suggest_int('num_encoder_layers', enc_low, enc_high)
    num_decoder_layers = trial.suggest_int('num_decoder_layers', dec_low, dec_high)

    # Feedforward dimension as a multiple of d_model
    ff_ratio = trial.suggest_categorical('ff_ratio', space['ff_ratio'])
    dim_feedforward = d_model * ff_ratio

    # Optimizer and regularization settings (lr and weight_decay on a log scale)
    lr = trial.suggest_float('lr', space['lr'][0], space['lr'][1], log=True)
    dropout = trial.suggest_float('dropout', space['dropout'][0], space['dropout'][1])
    weight_decay = trial.suggest_float('weight_decay',
                                       space['weight_decay'][0],
                                       space['weight_decay'][1], log=True)

    # Loss function parameters
    horizon_start_weight = trial.suggest_float('horizon_start_weight',
                                               space['horizon_start_weight'][0],
                                               space['horizon_start_weight'][1])
    horizon_end_weight = trial.suggest_float('horizon_end_weight',
                                             space['horizon_end_weight'][0],
                                             space['horizon_end_weight'][1])

    return {
        'd_model': d_model,
        'nhead': nhead,
        'num_encoder_layers': num_encoder_layers,
        'num_decoder_layers': num_decoder_layers,
        'dim_feedforward': dim_feedforward,
        'lr': lr,
        'dropout': dropout,
        'weight_decay': weight_decay,
        'horizon_start_weight': horizon_start_weight,
        'horizon_end_weight': horizon_end_weight
    }

print("✅ Intelligent hyperparameter sampling function defined")

In [None]:
# OPTIMIZED: Enhanced objective function with better pruning
def optimized_objective(trial):
    """Train one candidate model briefly and return its best validation loss.

    Differences from a naive loop that matter for the study:

    * Only the per-epoch validation loss is reported to Optuna, with the
      epoch index as ``step``. Optuna keeps the first value reported for a
      step and ignores later ones, so mixing batch-level training losses into
      the same step axis would shadow validation losses and push steps past
      the pruner's ``max_resource``.
    * Validation is scored with fixed reference horizon weights (0.5 -> 1.5).
      The tuned ``horizon_*_weight`` values only shape the training loss;
      scoring with them would let a trial lower its objective just by
      choosing smaller weights.
    * Cleanup runs in ``finally`` so pruned and failed trials also release
      cached GPU memory.

    Args:
        trial: Optuna trial supplying the hyperparameters.

    Returns:
        The lowest epoch-average validation loss observed during the trial.

    Raises:
        optuna.exceptions.TrialPruned: when the pruner stops the trial.
        ValueError: when the validation loader yields no batches.
    """
    # Sample hyperparameters intelligently
    params = sample_intelligent_hyperparameters(trial)

    # Create model with sampled parameters
    model = GranulationPredictor(
        cma_features=len(CMA_COLS),
        cpp_features=len(CPP_COLS),
        d_model=params['d_model'],
        nhead=params['nhead'],
        num_encoder_layers=params['num_encoder_layers'],
        num_decoder_layers=params['num_decoder_layers'],
        dim_feedforward=params['dim_feedforward'],
        dropout=params['dropout']
    ).to(DEVICE)

    # Slightly higher weight for the second CMA column (LOD) due to difficulty
    feature_weights = [1.0, 1.2]

    # Training loss uses the sampled horizon weights
    train_criterion = AdaptiveWeightedHorizonMSELoss(
        horizon=HORIZON,
        start_weight=params['horizon_start_weight'],
        end_weight=params['horizon_end_weight'],
        feature_weights=feature_weights
    ).to(DEVICE)

    # Validation metric uses fixed weights so trial values are comparable
    val_criterion = AdaptiveWeightedHorizonMSELoss(
        horizon=HORIZON,
        start_weight=0.5,
        end_weight=1.5,
        feature_weights=feature_weights
    ).to(DEVICE)

    optimizer = optim.AdamW(
        model.parameters(),
        lr=params['lr'],
        weight_decay=params['weight_decay'],
        eps=1e-8,
        betas=(0.9, 0.999)
    )

    # Cosine decay over the trial's epochs, down to 10% of the initial rate
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=OPTUNA_CONFIG['tuning_epochs'], eta_min=params['lr'] * 0.1
    )

    # Data loaders
    train_dataset = GranulationDataset(df_train, CMA_COLS, CPP_COLS, LOOKBACK, HORIZON)
    val_dataset = GranulationDataset(df_val, CMA_COLS, CPP_COLS, LOOKBACK, HORIZON)

    loader_kwargs = {
        'batch_size': OPTUNA_CONFIG['tuning_batch_size'],
        'num_workers': 2,
        'pin_memory': DEVICE == 'cuda',
    }
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

    best_val_loss = float('inf')       # true minimum, returned to Optuna
    stop_reference = float('inf')      # last loss that counted as an improvement
    no_improvement_count = 0
    patience_for_trials = 3            # Early stopping within trials

    try:
        for epoch in range(OPTUNA_CONFIG['tuning_epochs']):
            # Training phase
            model.train()
            for batch in train_loader:
                past_cmas, past_cpps, future_cpps, future_cmas_target = [
                    b.to(DEVICE) for b in batch
                ]

                optimizer.zero_grad()
                prediction = model(past_cmas, past_cpps, future_cpps)
                loss = train_criterion(prediction, future_cmas_target)
                loss.backward()

                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            # Validation phase
            model.eval()
            val_loss = 0.0
            val_batches = 0
            with torch.no_grad():
                for batch in val_loader:
                    past_cmas, past_cpps, future_cpps, future_cmas_target = [
                        b.to(DEVICE) for b in batch
                    ]
                    prediction = model(past_cmas, past_cpps, future_cpps)
                    val_loss += val_criterion(prediction, future_cmas_target).item()
                    val_batches += 1

            if val_batches == 0:
                raise ValueError("Validation loader produced no batches; cannot score the trial")

            avg_val_loss = val_loss / val_batches
            best_val_loss = min(best_val_loss, avg_val_loss)

            # Report once per epoch (step == epoch matches the pruner's resource axis)
            trial.report(avg_val_loss, epoch)
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()

            # Early stopping: an epoch counts as progress only if it beats the
            # last accepted loss by at least 0.5%
            if avg_val_loss < stop_reference * 0.995:
                stop_reference = avg_val_loss
                no_improvement_count = 0
            else:
                no_improvement_count += 1

            if no_improvement_count >= patience_for_trials and epoch >= 3:
                break

            # Learning rate scheduling
            scheduler.step()
    finally:
        # Drop references first so empty_cache() can actually return the memory
        if OPTUNA_CONFIG.get('gc_after_trial', False):
            del model, optimizer, scheduler, train_loader, val_loader
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    return best_val_loss

print("✅ Optimized objective function defined")

In [None]:
# OPTIMIZED: Advanced Optuna study configuration
print(f"🔍 Starting optimized hyperparameter search...")
print(f"  Trials: {OPTUNA_CONFIG['n_trials']}")
print(f"  Enhanced pruning and early stopping enabled")
print(f"  Intelligent search space with {len(OPTIMIZED_SEARCH_SPACE)} parameter types")

# Hyperband pruner: the resource axis is the reporting step, bounded by the
# number of tuning epochs per trial.
pruner = optuna.pruners.HyperbandPruner(
    min_resource=1,
    max_resource=OPTUNA_CONFIG['tuning_epochs'],
    reduction_factor=3
)

# TPE sampler, seeded for reproducibility
sampler = optuna.samplers.TPESampler(
    n_startup_trials=10,  # More startup trials for better initial exploration
    n_ei_candidates=24,   # More candidates for expected improvement
    seed=RANDOM_SEED,
    multivariate=True     # Consider parameter interactions
)

# Create optimized study; the direction comes from the configuration instead
# of a duplicated literal so the two cannot drift apart.
study = optuna.create_study(
    direction=OPTUNA_CONFIG.get('study_direction', 'minimize'),
    pruner=pruner,
    sampler=sampler,
    study_name='granulation_predictor_optimized'
)

# OPTIMIZED: Run study with timeout and progress tracking
def objective_with_logging(trial):
    """Run ``optimized_objective`` and attach timing / error metadata to the trial.

    * ``trial_time`` (wall-clock seconds) is recorded for every trial,
      whether it completes, is pruned, or fails, and on CPU as well as CUDA.
    * ``error`` is recorded only for genuine failures. Pruning is normal
      control flow and is re-raised untouched, so pruned trials are not
      labelled as errors.

    Args:
        trial: Optuna trial passed through to ``optimized_objective``.

    Returns:
        The value returned by ``optimized_objective``.

    Raises:
        Whatever ``optimized_objective`` raises (including TrialPruned).
    """
    import time  # local import keeps this cell self-contained

    started = time.perf_counter()
    try:
        return optimized_objective(trial)
    except optuna.exceptions.TrialPruned:
        # Not an error: let Optuna mark the trial as pruned
        raise
    except Exception as e:
        trial.set_user_attr('error', str(e))
        raise
    finally:
        trial.set_user_attr('trial_time', time.perf_counter() - started)

# Run the optimized study
study.optimize(
    objective_with_logging,
    n_trials=OPTUNA_CONFIG['n_trials'],
    timeout=OPTUNA_CONFIG.get('timeout'),
    # Hand the configured flag to Optuna so it garbage-collects between trials
    gc_after_trial=OPTUNA_CONFIG.get('gc_after_trial', False),
    show_progress_bar=True
)

# Save study
joblib.dump(study, PATHS['optuna_study'])

# Fetch the trial list once without deep-copying; it is only read for counting
all_trials = study.get_trials(deepcopy=False)
n_pruned = sum(t.state == optuna.trial.TrialState.PRUNED for t in all_trials)
n_complete = sum(t.state == optuna.trial.TrialState.COMPLETE for t in all_trials)

print(f"\n🎯 Optimized hyperparameter search completed!")
print(f"  Total trials: {len(all_trials)}")
print(f"  Pruned trials: {n_pruned}")
print(f"  Completed trials: {n_complete}")

# study.best_trial raises ValueError when no trial has completed
best_trial = study.best_trial
print(f"\n🏆 Best trial results:")
print(f"  Validation Loss: {best_trial.value:.6f}")
print(f"  Parameters:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

# Store optimized hyperparameters
OPTIMIZED_HPARAMS = best_trial.params

In [None]:
# OPTIMIZED: Enhanced study analysis and visualization
import json

print("\n📊 Study Analysis:")

# Performance statistics over trials that ran to completion
completed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
if len(completed_trials) > 1:
    values = [t.value for t in completed_trials]
    print(f"  Best loss: {min(values):.6f}")
    print(f"  Worst loss: {max(values):.6f}")
    print(f"  Mean loss: {np.mean(values):.6f}")
    print(f"  Std loss: {np.std(values):.6f}")

# Parameter importance (if enough trials completed)
if len(completed_trials) >= 10:
    try:
        importance = optuna.importance.get_param_importances(study)
    except Exception as exc:
        # Best-effort step: report the real cause instead of guessing at it,
        # and do not swallow KeyboardInterrupt / SystemExit.
        print(f"  Parameter importance analysis failed: {exc}")
    else:
        print(f"\n🎯 Parameter Importance:")
        for param, score in sorted(importance.items(), key=lambda x: x[1], reverse=True)[:5]:
            print(f"  {param}: {score:.3f}")

# Save study analysis
study_summary = {
    'best_params': best_trial.params,
    'best_value': best_trial.value,
    'n_trials': len(study.trials),
    'n_completed': len(completed_trials),
    # Sum of the per-trial 'trial_time' attributes; trials without one count as 0
    'optimization_time': sum(t.user_attrs.get('trial_time', 0) for t in completed_trials)
}

# Derive the summary path once so the file written and the path printed agree
summary_path = PATHS['optuna_study'].replace('.pkl', '_summary.json')
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(study_summary, f, indent=2)

print(f"\n✅ Study analysis completed and saved")
print(f"📁 Files saved:")
print(f"  Study: {PATHS['optuna_study']}")
print(f"  Summary: {summary_path}")