In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input mount and echo the full path of every file found,
# so the available datasets are visible in the cell output.
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

In [None]:
!pip install optuna schedulefree -q

In [None]:
# Read the three competition files (training rows, test rows and the sample
# submission) from the Kaggle input mount into DataFrames.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/train.csv",
        "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/test.csv",
        "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/sample_submission.csv",
    )
)

In [None]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
import numpy as np
import random

# Set random seeds for reproducibility (Python, NumPy and PyTorch CPU/GPU RNGs)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Drop columns that are not model inputs. errors="ignore" keeps this cell
# idempotent: `train` is overwritten here, so without it a second run of the
# cell would raise KeyError because the columns are already gone.
# NOTE: no missing-value imputation is performed anywhere in this cell.
train = train.drop(columns=["Unnamed: 12", "id"], errors="ignore")

# CRITICAL FIX: Rename TA1.x to TA1 to match test data
train = train.rename(columns={"TA1.x": "TA1"})

# ============================================
# Feature Engineering
# ============================================
print("="*60)
print("FEATURE ENGINEERING")
print("="*60)

# Every remaining column except the target 'DIC' is a feature; the same
# column list (and order) is used to select the test features.
feature_columns = [col for col in train.columns if col != 'DIC']
X = train[feature_columns].copy()
y = train['DIC'].values
X_test = test[feature_columns].copy()

# Remember the raw feature count so the summary below is derived from the
# data instead of being hard-coded.
n_original = X.shape[1]
print(f"\nOriginal features: {n_original} features")

# Generate polynomial features (degree=2) including interaction terms
# This will create: original features + squared terms + all cross-product terms
poly = PolynomialFeatures(degree=2, include_bias=False)

print(f"\nGenerating polynomial features (degree=2)...")
print(f"  - Original features")
print(f"  - Squared terms (x^2)")
print(f"  - Interaction terms (x_i * x_j)")

# Fit the expansion on the training features only, then reuse it for test.
X = poly.fit_transform(X)
X_test = poly.transform(X_test)

# Number of distinct pairs x_i * x_j with i < j.
n_interactions = n_original * (n_original - 1) // 2

print(f"\nPolynomial features generated:")
print(f"  Original: {n_original} features")
print(f"  After polynomial expansion: {X.shape[1]} features")
print(f"  ({n_original} original + {n_original} squared + {n_interactions} interaction terms)")

# ============================================
# Robust Scaling + Smooth Clipping
# ============================================
print(f"\n{'='*60}")
print("ROBUST SCALING + SMOOTH CLIPPING")
print(f"{'='*60}")

# Use RobustScaler (median and IQR-based, robust to outliers)
scaler_X = RobustScaler()
scaler_y = RobustScaler()

# Scalers are fitted on training data only; the test features reuse scaler_X.
X_scaled = scaler_X.fit_transform(X)
X_test_scaled = scaler_X.transform(X_test)
# RobustScaler expects a 2-D array, so the target goes in as a column and is
# flattened back to 1-D afterwards.
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

print(f"RobustScaler applied (median-based, robust to outliers)")

# Smooth clipping using tanh (softer than hard clipping)
clip_value = 3.0  # Soft clip at ±3 (similar to ±3 std in StandardScaler)

def smooth_clip(x, clip_val=3.0):
    """
    Soft-limit values to the open interval (-clip_val, clip_val) with tanh.

    Computes clip_val * tanh(x / clip_val): inputs close to 0 come back almost
    unchanged, while large magnitudes are squashed smoothly towards ±clip_val.

    Args:
        x: Scalar or NumPy array to compress.
        clip_val: Saturation level of the soft clip.

    Returns:
        The compressed value(s), same shape as `x`.
    """
    ratio = x / clip_val
    return clip_val * np.tanh(ratio)

# Pass the scaled train features, test features and target through the tanh
# soft clip, all with the same saturation level.
X_scaled, X_test_scaled, y_scaled = (
    smooth_clip(scaled_array, clip_value)
    for scaled_array in (X_scaled, X_test_scaled, y_scaled)
)

print(f"Smooth clipping applied (tanh-based, clip_value={clip_value})")
print(f"  X_scaled range: [{X_scaled.min():.3f}, {X_scaled.max():.3f}]")
print(f"  y_scaled range: [{y_scaled.min():.3f}, {y_scaled.max():.3f}]")

print(f"\nNormalized features:")
print(f"  X_scaled shape: {X_scaled.shape}")
print(f"  y_scaled shape: {y_scaled.shape}")

# Hold out 20% of the training rows for validation; the split is seeded.
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.2,
    random_state=SEED,
)

print(f"\nTrain/Val split:")
print(f"  X_train: {X_train.shape}")
print(f"  X_val:   {X_val.shape}")
print("="*60)

In [None]:
from torch.utils.data import Dataset, DataLoader

class OceanChemistryDataset(Dataset):
    """In-memory dataset of (features, target) pairs stored as float32 tensors.

    Args:
        X: Feature matrix. NumPy arrays are used directly; pandas objects are
            unwrapped through their `.values` array first.
        y: Targets aligned with the rows of `X`; same accepted types as `X`.
    """

    def __init__(self, X, y):
        # Both inputs go through the same unwrapping so that a DataFrame of
        # features works just like a Series of targets.
        self.X = torch.tensor(self._unwrap(X), dtype=torch.float32)
        self.y = torch.tensor(self._unwrap(y), dtype=torch.float32)

    @staticmethod
    def _unwrap(data):
        """Return `data.values` when it is a plain attribute, else `data` itself."""
        values = getattr(data, 'values', None)
        # Skip callables: some objects expose `.values` as a method rather
        # than as the underlying array, and that must not be unwrapped.
        if values is not None and not callable(values):
            return values
        return data

    def __len__(self):
        """Number of samples (rows of the feature tensor)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the `(features, target)` pair stored at position `idx`."""
        return self.X[idx], self.y[idx]

# Wrap the scaled train/validation splits as torch datasets.
train_dataset = OceanChemistryDataset(X=X_train, y=y_train)
val_dataset = OceanChemistryDataset(X=X_val, y=y_val)

# Dedicated, seeded generator so the training shuffle order is reproducible.
g = torch.Generator()
g.manual_seed(SEED)

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    generator=g,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=64,
    shuffle=False,
)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

In [None]:
class MLPModel(nn.Module):
    """MLP regressor with a linear shortcut from the input to the output.

    The prediction is `output(hidden_stack(x)) + shortcut(x)`, where each
    hidden stage is Linear -> [BatchNorm1d] -> activation -> Dropout and
    `shortcut` is a single Linear(input_size, 1).
    """

    def __init__(self, input_size, hidden_sizes=[128], dropout_rate=0.0, activation='relu', use_batchnorm=True):
        """
        MLP model with flexible number of hidden layers

        Args:
            input_size: Number of input features
            hidden_sizes: Sequence of hidden layer sizes (e.g., [128], (128, 64),
                [256, 128, 64]). An empty sequence yields no hidden layers, so
                the output layer reads the input directly.
            dropout_rate: Dropout probability
            activation: Activation function name ('relu', 'leaky_relu', 'elu',
                'gelu', 'silu', 'tanh'); unknown names fall back to ReLU.
            use_batchnorm: Whether to use batch normalization
        """
        super(MLPModel, self).__init__()

        # Work on a private list: this accepts tuples and other sequences and
        # never aliases the shared (mutable) default argument.
        hidden_sizes = list(hidden_sizes)

        self.use_batchnorm = use_batchnorm
        self.num_layers = len(hidden_sizes)

        # Map names to constructors so only the selected activation is built.
        activation_factories = {
            'relu': nn.ReLU,
            'leaky_relu': lambda: nn.LeakyReLU(0.1),
            'elu': nn.ELU,
            'gelu': nn.GELU,
            'silu': nn.SiLU,  # Swish
            'tanh': nn.Tanh,
        }

        # Select activation function (unknown names keep the ReLU fallback)
        self.activation = activation_factories.get(activation, nn.ReLU)()

        # Kaiming init for the ReLU family, Xavier init for everything else.
        use_kaiming = activation in ['relu', 'leaky_relu']

        # Build hidden layers dynamically
        self.hidden_layers = nn.ModuleList()
        self.batch_norms = nn.ModuleList() if use_batchnorm else None
        self.dropouts = nn.ModuleList()

        layer_sizes = [input_size] + hidden_sizes

        for in_features, out_features in zip(layer_sizes[:-1], layer_sizes[1:]):
            # Hidden layer
            layer = nn.Linear(in_features, out_features)
            if use_kaiming:
                nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='relu')
            else:
                nn.init.xavier_normal_(layer.weight)
            nn.init.constant_(layer.bias, 0)
            self.hidden_layers.append(layer)

            # Batch normalization
            if use_batchnorm:
                self.batch_norms.append(nn.BatchNorm1d(out_features))

            # Dropout
            self.dropouts.append(nn.Dropout(dropout_rate))

        # Output layer: reads the last hidden width, or the raw input width
        # when there are no hidden layers.
        self.output = nn.Linear(layer_sizes[-1], 1)
        nn.init.xavier_normal_(self.output.weight)
        nn.init.constant_(self.output.bias, 0)

        # Residual connection (shortcut from input to output)
        self.shortcut = nn.Linear(input_size, 1)
        nn.init.xavier_normal_(self.shortcut.weight)
        nn.init.constant_(self.shortcut.bias, 0)

    def forward(self, x):
        """Return predictions of shape (batch, 1) for inputs of shape (batch, input_size)."""
        # Main path: input -> hidden layers -> output
        h = x
        for i, layer in enumerate(self.hidden_layers):
            h = layer(h)
            if self.use_batchnorm:
                h = self.batch_norms[i](h)
            h = self.activation(h)
            h = self.dropouts[i](h)

        # Sum of the main path and the linear shortcut on the raw input.
        return self.output(h) + self.shortcut(x)

# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Human-readable summary of the MLPModel layout, emitted as one joined block.
print("\n".join([
    f"\nMLP Model Architecture:",
    f"  Input -> Hidden Layers (configurable) -> Output",
    f"  + Residual connection (Input -> Output)",
    f"  Each hidden layer: Linear -> [BatchNorm (optional)] -> Activation -> Dropout",
    f"  hidden_sizes: List of layer sizes (e.g., [128], [128, 64], [256, 128, 64])",
    f"  Dropout: Fixed to 0.0",
    f"  BatchNorm: Can be enabled/disabled",
]))

In [None]:
import torch.optim as optim
from schedulefree import RAdamScheduleFree, AdamWScheduleFree
import copy

# Hyperparameters
hidden_sizes = [2048]  # Example: [128] for 1 layer, [128, 64] for 2 layers, [256, 128, 64] for 3 layers
dropout_rate = 0.0
activation = 'gelu'
use_batchnorm = False
loss_function = 'mae'
optimizer_name = 'adamw_schedulefree'  # Options: 'adam', 'adamw', 'radam', 'adamw_schedulefree', 'radam_schedulefree'
lr = 0.001
weight_decay = 5e-4
batch_size = 16
scheduler_name = 'reduce_plateau'  # Ignored for the schedule-free optimizers
epochs = 1000
# Adam/AdamW/RAdam beta parameters
beta1 = 0.95
beta2 = 0.95
# Gaussian Noise Injection (Data Augmentation)
use_gaussian_noise = True
noise_std = 0.1 # Standard deviation of Gaussian noise (after scaling, so keep small)
noise_prob = 0.5  # Probability of applying noise (0.5 = 50% of batches)

# Both schedule-free optimizers need optimizer.train()/optimizer.eval() toggles
# and no LR scheduler, so every such decision below keys off this single flag
# instead of comparing against one optimizer name.
uses_schedule_free = optimizer_name in ('adamw_schedulefree', 'radam_schedulefree')

print("="*60)
print("TRAINING")
print("="*60)
print(f"Hidden layers: {hidden_sizes} ({len(hidden_sizes)} layers)")
print(f"Dropout: {dropout_rate}")
print(f"Activation: {activation}")
print(f"Batch Normalization: {use_batchnorm}")
print(f"Loss: {loss_function}")
print(f"Optimizer: {optimizer_name}")
print(f"Learning rate: {lr}")
print(f"Weight decay: {weight_decay}")
print(f"Beta1: {beta1}, Beta2: {beta2}")
print(f"Batch size: {batch_size}")
if not uses_schedule_free:
    print(f"Scheduler: {scheduler_name}")
else:
    print(f"Scheduler: None (SchedulerFree)")
if use_gaussian_noise:
    print(f"Gaussian Noise: std={noise_std}, prob={noise_prob} (enabled)")
else:
    print(f"Gaussian Noise: disabled")
print(f"Epochs: {epochs}")
print("="*60)

# Build the loaders with the batch size configured above, so the value that is
# printed is the value actually used. A fresh seeded generator per run keeps
# the shuffle order identical every time this cell is executed.
loader_generator = torch.Generator()
loader_generator.manual_seed(SEED)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=loader_generator)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Loss function
loss_map = {
    'mse': nn.MSELoss(),
    'mae': nn.L1Loss(),
    'smooth_l1': nn.SmoothL1Loss(),
    'huber': nn.HuberLoss()
}
criterion = loss_map[loss_function]

# Initialize model
model = MLPModel(input_size=X_train.shape[1], hidden_sizes=hidden_sizes, 
                 dropout_rate=dropout_rate, activation=activation, use_batchnorm=use_batchnorm)
model = model.to(device)

# Optimizer selection
if optimizer_name == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
elif optimizer_name == 'adamw':
    optimizer = optim.AdamW(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
elif optimizer_name == 'radam':
    optimizer = optim.RAdam(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
elif optimizer_name == 'adamw_schedulefree':
    optimizer = AdamWScheduleFree(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
elif optimizer_name == 'radam_schedulefree':
    optimizer = RAdamScheduleFree(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
else:
    # Keep the AdamW fallback for unknown names, but make it visible instead of
    # silently training with a different optimizer than the one requested.
    print(f"WARNING: unknown optimizer '{optimizer_name}', falling back to AdamW")
    optimizer = optim.AdamW(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)

# Scheduler (not used for SchedulerFree)
scheduler = None
if not uses_schedule_free:
    if scheduler_name == 'cosine':
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
    elif scheduler_name == 'reduce_plateau':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=20)
    elif scheduler_name == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=200, gamma=0.5)
    elif scheduler_name == 'exponential':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

# Inverse smooth clip function
def inverse_smooth_clip(x, clip_val=3.0):
    """
    Inverse of smooth clipping (arctanh)

    Computes clip_val * arctanh(x / clip_val). The ratio is first limited to
    [-0.9999, 0.9999], so inputs at or beyond ±clip_val map to a large finite
    value instead of ±inf/NaN.

    Args:
        x: NumPy array (or scalar) in the soft-clipped scale.
        clip_val: Saturation level that was used for the forward soft clip.

    Returns:
        Value(s) in the scale that preceded the soft clip.
    """
    # Clip to prevent arctanh overflow
    x_clipped = np.clip(x / clip_val, -0.9999, 0.9999)
    return np.arctanh(x_clipped) * clip_val

# Training loop
best_val_rmse = float('inf')
best_model_state = None  # Store best model parameters

for epoch in range(epochs):
    # Set optimizer to train mode (required by the schedule-free optimizers)
    if uses_schedule_free:
        optimizer.train()
    
    # Training
    model.train()
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        # Apply Gaussian Noise Injection for data augmentation; the coin flip
        # is per batch, so a batch is either fully noisy or fully clean.
        if use_gaussian_noise and np.random.rand() < noise_prob:
            noise = torch.randn_like(X_batch) * noise_std
            X_batch_noisy = X_batch + noise
        else:
            X_batch_noisy = X_batch
        
        optimizer.zero_grad()
        outputs = model(X_batch_noisy)
        # squeeze(-1) drops only the trailing size-1 output dim, so a final
        # batch holding a single sample keeps shape (1,) and matches y_batch.
        loss = criterion(outputs.squeeze(-1), y_batch)
        loss.backward()
        optimizer.step()
    
    # Set optimizer to eval mode (uses averaged weights for SchedulerFree)
    if uses_schedule_free:
        optimizer.eval()
    
    # Validation (no noise during evaluation)
    model.eval()
    val_predictions = []
    val_targets = []
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)  # No noise during validation
            # squeeze(-1) always yields a 1-D array, which extend() can
            # iterate; a bare squeeze() gives a 0-d array for a 1-sample batch.
            val_predictions.extend(outputs.squeeze(-1).cpu().numpy())
            val_targets.extend(y_batch.cpu().numpy())
    
    # Inverse transform to original scale
    # Step 1: Inverse smooth clip (arctanh)
    val_predictions_unclipped = inverse_smooth_clip(np.array(val_predictions), clip_value)
    val_targets_unclipped = inverse_smooth_clip(np.array(val_targets), clip_value)
    
    # Step 2: Inverse RobustScaler transform
    val_predictions_original = scaler_y.inverse_transform(val_predictions_unclipped.reshape(-1, 1)).flatten()
    val_targets_original = scaler_y.inverse_transform(val_targets_unclipped.reshape(-1, 1)).flatten()
    
    # Calculate RMSE in original scale
    val_rmse = np.sqrt(np.mean((val_predictions_original - val_targets_original)**2))
    
    # Update learning rate scheduler (not for SchedulerFree)
    if scheduler is not None:
        if scheduler_name == 'reduce_plateau':
            # ReduceLROnPlateau needs the monitored metric.
            scheduler.step(val_rmse)
        else:
            scheduler.step()
    
    # Track best and save model state
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        # deepcopy: state_dict() returns references to the live tensors, which
        # later optimizer steps would otherwise overwrite.
        best_model_state = copy.deepcopy(model.state_dict())
    
    # Print progress
    if epoch % 100 == 0:
        if not uses_schedule_free:
            current_lr = optimizer.param_groups[0]['lr']
            print(f"Epoch {epoch+1}/{epochs} | Val RMSE: {val_rmse:.4f} | Best: {best_val_rmse:.4f} | LR: {current_lr:.6f}")
        else:
            print(f"Epoch {epoch+1}/{epochs} | Val RMSE: {val_rmse:.4f} | Best: {best_val_rmse:.4f} | LR: {lr:.6f} (fixed)")

# Load best model parameters (the checkpoint with the lowest validation RMSE)
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f"\nTraining complete! Best Val RMSE: {best_val_rmse:.4f}")
    print(f"Loaded best model parameters from training.")

In [None]:
# ============================================
# Optuna Hyperparameter Optimization
# ============================================

import optuna
from optuna.trial import Trial
from optuna.pruners import HyperbandPruner
from optuna.samplers import TPESampler
from schedulefree import RAdamScheduleFree, AdamWScheduleFree
import copy

# Inverse smooth clip function for Optuna
def inverse_smooth_clip_optuna(x, clip_val=3.0):
    """
    Undo the tanh soft clip: clip_val * arctanh(x / clip_val).

    The ratio x / clip_val is bounded to [-0.9999, 0.9999] before arctanh is
    applied, so saturated inputs produce a large finite value, not inf/NaN.

    Args:
        x: NumPy array (or scalar) in the soft-clipped scale.
        clip_val: Saturation level used by the forward soft clip.

    Returns:
        Value(s) mapped back to the pre-clip scale.
    """
    ratio = x / clip_val
    # Keep the argument strictly inside arctanh's domain (-1, 1).
    bounded = np.clip(ratio, -0.9999, 0.9999)
    unclipped = np.arctanh(bounded)
    return unclipped * clip_val

def objective(trial: Trial):
    """Optuna objective: train one MLP configuration and score it.

    Samples the architecture (1-3 hidden layers and their widths), the
    schedule-free optimizer and its hyperparameters, batch size, activation
    and Gaussian-noise settings from `trial`. It then trains `MLPModel` on the
    notebook-level `train_dataset` with MAE loss and evaluates on
    `val_dataset` after every epoch.

    Each epoch's validation RMSE (in the original target scale) is reported to
    the trial so the study's pruner can stop it early. Training also stops
    after `patience` epochs without improvement.

    Args:
        trial: Optuna trial used to sample hyperparameters and report progress.

    Returns:
        The lowest validation RMSE (original scale) seen during training.

    Raises:
        optuna.TrialPruned: When the pruner decides to stop this trial.
        ValueError: If the sampled optimizer name has no matching branch.
    """
    # NOTE: the suggest_* calls keep their original names and order so the
    # search space (and seeded sampling sequence) is unchanged.

    # Number of layers selection (1, 2, or 3 layers)
    num_layers = trial.suggest_int('num_layers', 1, 3)
    
    # One categorical width per hidden layer
    hidden_sizes = [
        trial.suggest_categorical(f'hidden_size_layer_{i+1}', [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048])
        for i in range(num_layers)
    ]
    
    # Optimizer selection: AdamWScheduleFree or RAdamScheduleFree only
    optimizer_name = trial.suggest_categorical('optimizer', ['adamw_schedulefree', 'radam_schedulefree'])
    
    # Hyperparameters to optimize (no LR scheduler: schedule-free optimizers)
    lr = trial.suggest_float('lr', 1e-5, 1e-1, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-1, log=True)
    dropout_rate = 0.0  # Fixed to 0
    batch_size = trial.suggest_categorical('batch_size', [4, 8, 16, 32, 64, 128, 256, 512])
    
    # Batch Normalization: fixed to False
    use_batchnorm = False
    
    # Adam betas (momentum coefficients)
    beta1 = trial.suggest_float('beta1', 0.85, 0.95)
    beta2 = trial.suggest_float('beta2', 0.9, 0.9999)
    
    # Activation function selection
    activation = trial.suggest_categorical('activation', ['relu', 'gelu', 'silu'])
    
    # Gaussian Noise parameters (noise injection is always enabled)
    noise_std = trial.suggest_float('noise_std', 0.01, 0.15)
    noise_prob = trial.suggest_float('noise_prob', 0.3, 0.9)
    
    # Loss criterion (MAE fixed)
    criterion_trial = nn.L1Loss()
    
    # Create dataloaders with suggested batch size; the shuffle generator is
    # re-seeded per trial so every trial sees the same shuffle sequence.
    g_trial = torch.Generator()
    g_trial.manual_seed(SEED)
    train_loader_trial = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=g_trial)
    val_loader_trial = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    # Create model with hidden_sizes (list)
    model_trial = MLPModel(input_size=X_train.shape[1], hidden_sizes=hidden_sizes, 
                          dropout_rate=dropout_rate, activation=activation, use_batchnorm=use_batchnorm)
    model_trial = model_trial.to(device)
    
    # Create optimizer based on selection (SchedulerFree only)
    if optimizer_name == 'adamw_schedulefree':
        optimizer_trial = AdamWScheduleFree(
            model_trial.parameters(), 
            lr=lr, 
            betas=(beta1, beta2),
            weight_decay=weight_decay
        )
    elif optimizer_name == 'radam_schedulefree':
        optimizer_trial = RAdamScheduleFree(
            model_trial.parameters(), 
            lr=lr, 
            betas=(beta1, beta2),
            weight_decay=weight_decay
        )
    else:
        # Fail with a clear message rather than a NameError further down.
        raise ValueError(f"Unsupported optimizer: {optimizer_name!r}")
    
    # The validation loader is not shuffled, so the targets (and their
    # original-scale values) are identical every epoch: compute them once.
    val_targets = []
    for _, y_batch in val_loader_trial:
        val_targets.extend(y_batch.numpy())
    val_targets_unclipped = inverse_smooth_clip_optuna(np.array(val_targets), clip_value)
    val_targets_original = scaler_y.inverse_transform(val_targets_unclipped.reshape(-1, 1)).flatten()
    
    # Early stopping. No model checkpoint is kept: the trial's model is
    # discarded on return and only the best score is reported back.
    best_val_rmse = float('inf')
    patience = 750
    patience_counter = 0
    
    # Training loop
    max_epochs = 2000
    for epoch in range(max_epochs):
        # Set optimizer to train mode (important for SchedulerFree)
        optimizer_trial.train()
        
        # Training
        model_trial.train()
        for X_batch, y_batch in train_loader_trial:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            # Apply Gaussian Noise Injection (per-batch coin flip)
            if np.random.rand() < noise_prob:
                noise = torch.randn_like(X_batch) * noise_std
                X_batch_noisy = X_batch + noise
            else:
                X_batch_noisy = X_batch
            
            optimizer_trial.zero_grad()
            outputs = model_trial(X_batch_noisy)
            # squeeze(-1) keeps a 1-sample batch at shape (1,) to match y_batch.
            loss = criterion_trial(outputs.squeeze(-1), y_batch)
            loss.backward()
            optimizer_trial.step()
        
        # Set optimizer to eval mode (uses averaged weights for SchedulerFree)
        optimizer_trial.eval()
        
        # Validation
        model_trial.eval()
        val_predictions = []
        with torch.no_grad():
            for X_batch, _ in val_loader_trial:
                outputs = model_trial(X_batch.to(device))
                # Always 1-D, so extend() also works when the sampled batch
                # size leaves a single sample in the last batch.
                val_predictions.extend(outputs.squeeze(-1).cpu().numpy())
        
        # Inverse transform predictions to original scale
        # Step 1: Inverse smooth clip (arctanh)
        val_predictions_unclipped = inverse_smooth_clip_optuna(np.array(val_predictions), clip_value)
        
        # Step 2: Inverse RobustScaler transform
        val_predictions_original = scaler_y.inverse_transform(val_predictions_unclipped.reshape(-1, 1)).flatten()
        
        # Calculate RMSE in original scale
        val_rmse = np.sqrt(np.mean((val_predictions_original - val_targets_original)**2))
        
        # Early stopping bookkeeping
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break
        
        # Report intermediate value for Hyperband pruning
        trial.report(val_rmse, epoch)
        
        # Handle pruning
        if trial.should_prune():
            raise optuna.TrialPruned()
    
    return best_val_rmse

# Create Sampler with seed for reproducibility
sampler = TPESampler(seed=SEED)

# Create Pruner
pruner = HyperbandPruner(
    min_resource=50,
    max_resource=2000,
    reduction_factor=3
)

# Create study with sampler and pruner
study = optuna.create_study(
    direction='minimize', 
    study_name='mlp_optimization',
    sampler=sampler,  # Sampler with seed for reproducibility
    pruner=pruner
)

print("="*60)
print("Starting Optuna optimization (Focused Search)")
print("="*60)
print(f"Random seed: {SEED}")
print(f"Sampler: TPESampler (seed={SEED})")
print(f"Pruner: HyperbandPruner (min_resource=50, max_resource=2000, reduction_factor=3)")
print(f"\nOptimizing: num_layers (1-3),")
print(f"            hidden_size for each layer (4/8/16/32/64/128/256/512/1024/2048),")
print(f"            optimizer (AdamWScheduleFree/RAdamScheduleFree ONLY),")
print(f"            NO scheduler exploration (using SchedulerFree),")
print(f"            lr, weight_decay,")
print(f"            batch_size (4/8/16/32/64/128/256/512),")
print(f"            beta1 (0.85-0.95), beta2 (0.9-0.9999),")
print(f"            activation (relu/gelu/silu),")
print(f"            gaussian_noise params (std: 0.01-0.15, prob: 0.3-0.9)")
print(f"Loss function: MAE (FIXED)")
print(f"Batch Normalization: FALSE (FIXED)")
print(f"Gaussian Noise: ENABLED (FIXED)")
print(f"Model: Multi-layer MLP with Residual Connection")
# Report the real input width instead of a hard-coded number.
print(f"Input: Polynomial features (degree=2, {X_train.shape[1]} dimensions)")
print(f"Dropout: Fixed to 0.0")
print(f"Scaling: RobustScaler + Smooth Clipping (tanh)")
print(f"Best model checkpoint: Enabled (saves best validation RMSE)")
print(f"Early stopping patience: 750 epochs")
print("="*60)

# Long-running: stops after 1000 trials or 2 hours, whichever comes first.
study.optimize(objective, n_trials=1000, timeout=7200)  # 1000 trials or 2 hours

# Split the finished trials by outcome once; used for both reports below.
pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

# Print best parameters
print("\n" + "="*60)
print("OPTIMIZATION COMPLETE!")
print("="*60)
if complete_trials:
    print(f"\nBest trial:")
    print(f"  RMSE: {study.best_trial.value:.4f}")
    print(f"\n  Best Parameters:")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")
else:
    # study.best_trial raises when no trial has completed, so report that
    # explicitly instead of crashing at the end of a long run.
    print(f"\nNo trial completed; there is no best trial to report.")

# Print pruning statistics
total_trials = len(study.trials)
# Guard the percentage against an empty study.
pruning_efficiency = len(pruned_trials) / total_trials * 100 if total_trials else 0.0
print(f"\n  Statistics:")
print(f"    Completed trials: {len(complete_trials)}")
print(f"    Pruned trials: {len(pruned_trials)}")
print(f"    Total trials: {total_trials}")
print(f"    Pruning efficiency: {pruning_efficiency:.1f}%")
print("="*60)

In [None]:
# ============================================
# Test Time Augmentation (TTA) for Inference
# ============================================

# TTA parameters
tta_iterations = 20  # Number of augmented predictions to average
tta_noise_std = 0.03  # Standard deviation of Gaussian noise for TTA (lower than training)

print("="*60)
print("TEST TIME AUGMENTATION (TTA)")
print("="*60)
print(f"TTA iterations: {tta_iterations}")
print(f"TTA noise std: {tta_noise_std}")
print("="*60)

# Convert the test set into a torch tensor and move to device
test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

# Set the model to evaluation mode. `model` is the network trained in the
# manual training cell, not a model produced by the Optuna study.
model.eval()

# Collect predictions from multiple TTA iterations
all_predictions = []

with torch.no_grad():
    for tta_iter in range(tta_iterations):
        # Add Gaussian noise for TTA (except first iteration which is clean)
        if tta_iter == 0:
            # First iteration: clean prediction (no noise)
            test_tensor_augmented = test_tensor
        else:
            # Subsequent iterations: add Gaussian noise
            noise = torch.randn_like(test_tensor) * tta_noise_std
            test_tensor_augmented = test_tensor + noise
        
        # Make predictions; squeeze(-1) removes only the trailing output dim.
        predictions_scaled = model(test_tensor_augmented).squeeze(-1).cpu().numpy()
        
        # Inverse transform to get actual DIC values
        # Step 1: Inverse smooth clip (arctanh)
        predictions_unclipped = inverse_smooth_clip(predictions_scaled, clip_value)
        
        # Step 2: Inverse RobustScaler transform
        predictions = scaler_y.inverse_transform(predictions_unclipped.reshape(-1, 1)).flatten()
        
        all_predictions.append(predictions)
        
        if (tta_iter + 1) % 5 == 0:
            print(f"TTA iteration {tta_iter + 1}/{tta_iterations} complete")

# Average all TTA predictions (averaging happens in the original DIC scale)
final_predictions = np.mean(all_predictions, axis=0)

# Calculate statistics
print(f"\nTTA complete!")
print(f"Final predictions - Min: {final_predictions.min():.2f}, Max: {final_predictions.max():.2f}, Mean: {final_predictions.mean():.2f}")
print(f"Prediction std across TTA iterations: {np.std(all_predictions, axis=0).mean():.4f}")

# Prepare submission. Predictions are in the row order of `test`, so take the
# ids from that same frame when it has them; the previous hard-coded range is
# kept only as a fallback for a test file without an 'id' column.
if "id" in test.columns:
    submission_ids = test["id"].values
else:
    submission_ids = range(1455, 1455 + len(final_predictions))

submission = pd.DataFrame({"id": submission_ids, "DIC": final_predictions})
submission.to_csv("submission.csv", index=False)
print("\nSubmission saved!")