In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
!pip install schedulefree optuna -q

In [None]:
# Load the competition files: training table, test table and sample submission.
DATA_DIR = "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
sub = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import random

# Disable PyTorch compile to avoid sympy compatibility issues
import os
os.environ['TORCH_COMPILE_DISABLE'] = '1'

# Set random seeds for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Drop columns that are not features. errors="ignore" keeps this cell
# re-runnable: on a second run the columns are already gone and a plain
# drop() would raise KeyError.
# NOTE: no missing-value handling is done in this cell; NaNs (if any) pass
# through StandardScaler unchanged.
train = train.drop(columns=["Unnamed: 12", "id"], errors="ignore")

# Rename TA1.x to TA1 to match the test data
train = train.rename(columns={"TA1.x": "TA1"})

# Find common columns between train and test (excluding "DIC" from train)
common_columns = train.drop(columns=["DIC"]).columns.intersection(test.columns)

print(f"Common columns: {len(common_columns)}")
print(f"Common columns list: {sorted(common_columns.tolist())}")

# Select the common columns for both train and test
X = train[common_columns]
y = train["DIC"]
test = test[common_columns]

# Split BEFORE fitting the scalers so that validation rows do not contribute
# to the mean/std used for normalisation (avoids train/validation leakage).
# The split itself depends only on the row count and random_state.
X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Normalize X and y separately, fitting on the training split only
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train = scaler_X.fit_transform(X_train_raw)
X_val = scaler_X.transform(X_val_raw)
y_train = scaler_y.fit_transform(y_train_raw.to_numpy().reshape(-1, 1)).flatten()
y_val = scaler_y.transform(y_val_raw.to_numpy().reshape(-1, 1)).flatten()

# Full-table scaled arrays are kept under their original names for any code
# that still refers to them; they now use the training-split statistics.
X_scaled = scaler_X.transform(X)
y_scaled = scaler_y.transform(y.to_numpy().reshape(-1, 1)).flatten()
test_scaled = scaler_X.transform(test)

In [None]:
from torch.utils.data import Dataset, DataLoader

class OceanChemistryDataset(Dataset):
    """In-memory dataset of ``(features, target)`` float32 tensor pairs.

    Args:
        X: Feature table with one row per sample. Accepts anything
            ``np.asarray`` can convert (NumPy array, pandas DataFrame,
            nested list) or a ``torch.Tensor``.
        y: Targets, one per row of ``X`` (NumPy array, pandas Series, list
            or ``torch.Tensor``).

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same number of rows.
    """

    def __init__(self, X, y):
        self.X = self._as_float_tensor(X)
        self.y = self._as_float_tensor(y)
        # Fail here rather than with an IndexError deep inside a DataLoader.
        if self.X.shape[:1] != self.y.shape[:1]:
            raise ValueError(
                f"X and y must have the same length, got {self.X.shape[0]} and {self.y.shape[0]}"
            )

    @staticmethod
    def _as_float_tensor(data):
        """Return an independent float32 tensor copy of ``data``."""
        if isinstance(data, torch.Tensor):
            return data.detach().clone().to(torch.float32)
        # np.asarray handles DataFrames/Series as well as arrays and lists;
        # torch.tensor then copies the buffer.
        return torch.tensor(np.asarray(data, dtype=np.float32))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Wrap the scaled training / validation splits as tensor datasets.
train_dataset = OceanChemistryDataset(X_train, y_train)
val_dataset = OceanChemistryDataset(X_val, y_val)

# Seeded generator so the training shuffle order is reproducible.
g = torch.Generator().manual_seed(SEED)

# Only the training loader shuffles; validation keeps dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    generator=g,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=64,
    shuffle=False,
)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

In [None]:
class MLPModel(nn.Module):
    """Fully connected regressor: ``[Linear -> activation -> Dropout] * N -> Linear(1)``.

    Args:
        input_size: Number of input features.
        hidden_layers: Iterable with the width of each hidden layer, in order.
        dropout_rate: Dropout probability applied after every hidden activation.
        activation: One of ``'relu'``, ``'leaky_relu'``, ``'elu'``, ``'gelu'``,
            ``'silu'``, ``'tanh'``. Any other name falls back to ReLU and
            emits a warning.

    Hidden-layer weights use Kaiming-normal init when ``activation`` is
    ``'relu'`` or ``'leaky_relu'`` and Xavier-normal otherwise; the output
    layer uses Xavier-normal. All biases start at zero.
    """

    # Factories rather than instances, so each hidden layer gets its own
    # activation module instead of sharing a single object.
    _ACTIVATION_FACTORIES = {
        'relu': nn.ReLU,
        'leaky_relu': lambda: nn.LeakyReLU(0.1),
        'elu': nn.ELU,
        'gelu': nn.GELU,
        'silu': nn.SiLU,  # Swish
        'tanh': nn.Tanh,
    }

    def __init__(self, input_size, hidden_layers=(128, 64), dropout_rate=0.3, activation='relu'):
        super().__init__()

        make_activation = self._ACTIVATION_FACTORIES.get(activation)
        if make_activation is None:
            # Keep the ReLU fallback, but make a mistyped name visible.
            import warnings
            warnings.warn(
                f"Unknown activation {activation!r}; falling back to ReLU",
                stacklevel=2,
            )
            make_activation = nn.ReLU

        # The init scheme follows the requested activation name.
        use_kaiming = activation in ('relu', 'leaky_relu')

        self.layers = nn.ModuleList()
        self.activations = nn.ModuleList()
        self.dropouts = nn.ModuleList()

        prev_size = input_size
        for hidden_size in hidden_layers:
            fc = nn.Linear(prev_size, hidden_size)
            if use_kaiming:
                nn.init.kaiming_normal_(fc.weight, mode='fan_in', nonlinearity='relu')
            else:
                nn.init.xavier_normal_(fc.weight)
            nn.init.constant_(fc.bias, 0)

            self.layers.append(fc)
            self.activations.append(make_activation())
            self.dropouts.append(nn.Dropout(dropout_rate))

            prev_size = hidden_size

        # Output layer: single regression value, Xavier-normal weights.
        self.output = nn.Linear(prev_size, 1)
        nn.init.xavier_normal_(self.output.weight)
        nn.init.constant_(self.output.bias, 0)

    def forward(self, x):
        """Apply every hidden stage, then the output layer (no dropout on the output)."""
        for fc, activation, dropout in zip(self.layers, self.activations, self.dropouts):
            x = dropout(activation(fc(x)))
        return self.output(x)

# Pick the compute device: CUDA when available, otherwise CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
print(f"Using device: {device}")
if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Model hyperparameters.
hidden_layers = [128]  # e.g. [256, 128, 64] for a deeper network
dropout_rate = 0.11998368464894496  # dropout probability applied after each hidden layer
activation = 'relu'  # key into MLPModel's activation table

# Build the network for the training feature count and move it to the device.
n_features = X_train.shape[1]
model = MLPModel(
    input_size=n_features,
    hidden_layers=hidden_layers,
    dropout_rate=dropout_rate,
    activation=activation,
).to(device)

arch = ' -> '.join(str(width) for width in hidden_layers)
print(f"Input size: {n_features}")
print(f"Hidden layers: {hidden_layers}")
print(f"Dropout rate: {dropout_rate}")
print(f"Activation: {activation}")
print(f"Model architecture: {n_features} -> {arch} -> 1")

In [None]:
import torch.optim as optim
import schedulefree

# Optimizer choice.
# Options: 'adamw', 'radam', 'schedulefree_adamw', 'schedulefree_radam', 'schedulefree_sgd'
optimizer_name = 'adamw'

# Loss choice.
# Options: 'mse', 'mae', 'smooth_l1', 'huber'
loss_function = 'mae'

loss_map = {
    'mse': nn.MSELoss(),             # Mean Squared Error (L2)
    'mae': nn.L1Loss(),              # Mean Absolute Error (L1)
    'smooth_l1': nn.SmoothL1Loss(),  # Smooth L1
    'huber': nn.HuberLoss()          # Huber loss
}
criterion = loss_map[loss_function]

# Optimizer hyperparameters.
lr = 0.0012731358578182157
betas = (0.9333212328850029, 0.9115328074111416)
weight_decay = 0.09642207316306091

# Keyword arguments shared by every Adam-style optimizer below.
adam_kwargs = dict(lr=lr, betas=betas, weight_decay=weight_decay)

if optimizer_name == 'adamw':
    # fused=False: disable the fused implementation to avoid sympy issues
    optimizer = optim.AdamW(model.parameters(), fused=False, **adam_kwargs)
elif optimizer_name == 'radam':
    optimizer = optim.RAdam(model.parameters(), **adam_kwargs)
elif optimizer_name == 'schedulefree_adamw':
    optimizer = schedulefree.AdamWScheduleFree(model.parameters(), **adam_kwargs)
elif optimizer_name == 'schedulefree_radam':
    # Use RAdamScheduleFree when the installed package provides it,
    # otherwise fall back to AdamWScheduleFree.
    radam_cls = getattr(schedulefree, 'RAdamScheduleFree', None)
    if radam_cls is None:
        print("Warning: RAdamScheduleFree not found, using AdamWScheduleFree instead")
        radam_cls = schedulefree.AdamWScheduleFree
    optimizer = radam_cls(model.parameters(), **adam_kwargs)
elif optimizer_name == 'schedulefree_sgd':
    optimizer = schedulefree.SGDScheduleFree(
        model.parameters(),
        lr=lr,
        momentum=0.9,
        weight_decay=weight_decay,
    )
else:
    raise ValueError(f"Unknown optimizer: {optimizer_name}")

print(f"Using optimizer: {optimizer_name}")
print(f"Using loss function: {loss_function}")

# ScheduleFree optimizers need their own train()/eval() calls in the loops below.
is_schedulefree = optimizer_name.startswith('schedulefree')

# Training function
def train_model(model, train_loader, val_loader, epochs=5000):
    """Train ``model`` and print train/validation MAE and RMSE every 100 epochs.

    Uses the notebook-level ``optimizer``, ``criterion``, ``is_schedulefree``,
    ``scaler_y`` and ``device``. Metrics are reported in the target's original
    scale by passing predictions and targets through
    ``scaler_y.inverse_transform``.

    Training metrics are collected from the forward passes made during the
    optimisation steps, i.e. with the model in train mode and with weights
    that change from batch to batch.

    Args:
        model: Network to train in place; its output is expected to have a
            trailing dimension of size 1.
        train_loader: Iterable of ``(X_batch, y_batch)`` training batches.
        val_loader: Iterable of ``(X_batch, y_batch)`` validation batches.
        epochs: Number of passes over ``train_loader``.
    """

    def to_original_scale(batches):
        # Join the per-batch 1-D arrays and undo the target scaling.
        stacked = np.concatenate(batches).reshape(-1, 1)
        return scaler_y.inverse_transform(stacked).flatten()

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        if is_schedulefree:
            optimizer.train()  # ScheduleFree specific

        train_predictions = []
        train_targets = []
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            # squeeze(-1) rather than squeeze(): a final batch of size 1 must
            # stay 1-D, otherwise it becomes a 0-d tensor that mismatches
            # y_batch and cannot be collected below.
            outputs = model(X_batch).squeeze(-1)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            train_predictions.append(outputs.detach().cpu().numpy())
            train_targets.append(y_batch.cpu().numpy())

        train_predictions_original = to_original_scale(train_predictions)
        train_targets_original = to_original_scale(train_targets)
        train_mae = np.mean(np.abs(train_predictions_original - train_targets_original))
        train_rmse = np.sqrt(np.mean((train_predictions_original - train_targets_original)**2))

        # ---- validation pass ----
        model.eval()
        if is_schedulefree:
            optimizer.eval()  # ScheduleFree specific

        val_predictions = []
        val_targets = []
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch).squeeze(-1)
                val_predictions.append(outputs.cpu().numpy())
                val_targets.append(y_batch.cpu().numpy())

        val_predictions_original = to_original_scale(val_predictions)
        val_targets_original = to_original_scale(val_targets)
        val_mae = np.mean(np.abs(val_predictions_original - val_targets_original))
        val_rmse = np.sqrt(np.mean((val_predictions_original - val_targets_original)**2))

        if epoch % 100 == 0:
            print(f"Epoch {epoch+1}/{epochs} | "
                  f"Train MAE: {train_mae:.2f}, Train RMSE: {train_rmse:.2f} | "
                  f"Val MAE: {val_mae:.2f}, Val RMSE: {val_rmse:.2f}")

# Run training for a fixed number of epochs.
N_EPOCHS = 2000
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=N_EPOCHS,
)

In [None]:
# ============================================
# Optuna Hyperparameter Optimization
# ============================================
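# NOTE: this cell runs the search as written (up to 100 trials or 2 hours).
# Skip it to go straight to prediction with the model trained above.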

import optuna
from optuna.trial import Trial
from optuna.pruners import HyperbandPruner
import schedulefree

def objective(trial: Trial):
    """Optuna objective: train one sampled MLP configuration and score it.

    Samples the optimizer, learning rate, weight decay, dropout rate, batch
    size, optimizer-specific parameters, activation, loss function and hidden
    layer sizes from ``trial``. The model is trained for at most 2000 epochs
    with early stopping (patience of 100 epochs without improvement), and the
    validation RMSE is reported to the trial after each epoch so the study's
    pruner can stop it.

    Uses the notebook-level ``train_dataset``, ``val_dataset``, ``X_train``,
    ``scaler_y``, ``device``, ``SEED`` and ``MLPModel``.

    Args:
        trial: Optuna trial used to sample hyperparameters and report progress.

    Returns:
        The lowest validation RMSE observed, in the target's original scale
        (after ``scaler_y.inverse_transform``).

    Raises:
        optuna.TrialPruned: When ``trial.should_prune()`` is true.
        ValueError: If the sampled optimizer name is not handled.
    """
    # Optimizer selection
    optimizer_name = trial.suggest_categorical('optimizer',
        ['adamw', 'radam', 'schedulefree_adamw', 'schedulefree_radam', 'schedulefree_sgd'])

    # Hyperparameters to optimize
    lr = trial.suggest_float('lr', 1e-5, 1e-1, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-1, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])

    # Optimizer-specific parameters
    if optimizer_name in ['adamw', 'radam', 'schedulefree_adamw', 'schedulefree_radam']:
        beta1 = trial.suggest_float('beta1', 0.8, 0.95)
        beta2 = trial.suggest_float('beta2', 0.9, 0.9999)
    elif optimizer_name == 'schedulefree_sgd':
        momentum = trial.suggest_float('momentum', 0.8, 0.99)

    # Activation function selection
    activation = trial.suggest_categorical('activation', ['relu', 'leaky_relu', 'elu', 'gelu', 'silu', 'tanh'])

    # Loss function selection
    loss_function = trial.suggest_categorical('loss_function', ['mse', 'mae', 'smooth_l1', 'huber'])

    # Hidden layers configuration
    n_layers = trial.suggest_int('n_layers', 1, 4)
    hidden_layers = []
    for i in range(n_layers):
        hidden_size = trial.suggest_categorical(f'hidden_size_{i}', [32, 64, 128, 256, 512])
        hidden_layers.append(hidden_size)

    # Create dataloaders with the suggested batch size (seeded shuffle)
    g_trial = torch.Generator()
    g_trial.manual_seed(SEED)
    train_loader_trial = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=g_trial)
    val_loader_trial = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Create model with suggested hyperparameters
    model_trial = MLPModel(input_size=X_train.shape[1], hidden_layers=hidden_layers,
                          dropout_rate=dropout_rate, activation=activation)
    model_trial = model_trial.to(device)

    # Create optimizer with suggested hyperparameters
    if optimizer_name == 'adamw':
        optimizer_trial = optim.AdamW(
            model_trial.parameters(),
            lr=lr,
            betas=(beta1, beta2),
            weight_decay=weight_decay,
            fused=False  # Disable fused to avoid sympy issues
        )
    elif optimizer_name == 'radam':
        optimizer_trial = optim.RAdam(
            model_trial.parameters(),
            lr=lr,
            betas=(beta1, beta2),
            weight_decay=weight_decay
        )
    elif optimizer_name == 'schedulefree_adamw':
        optimizer_trial = schedulefree.AdamWScheduleFree(
            model_trial.parameters(),
            lr=lr,
            betas=(beta1, beta2),
            weight_decay=weight_decay
        )
    elif optimizer_name == 'schedulefree_radam':
        # Use RAdamScheduleFree when available, otherwise AdamWScheduleFree
        if hasattr(schedulefree, 'RAdamScheduleFree'):
            optimizer_trial = schedulefree.RAdamScheduleFree(
                model_trial.parameters(),
                lr=lr,
                betas=(beta1, beta2),
                weight_decay=weight_decay
            )
        else:
            optimizer_trial = schedulefree.AdamWScheduleFree(
                model_trial.parameters(),
                lr=lr,
                betas=(beta1, beta2),
                weight_decay=weight_decay
            )
    elif optimizer_name == 'schedulefree_sgd':
        optimizer_trial = schedulefree.SGDScheduleFree(
            model_trial.parameters(),
            lr=lr,
            momentum=momentum,
            weight_decay=weight_decay
        )
    else:
        # Fail with a clear message instead of a NameError further down.
        raise ValueError(f"Unknown optimizer: {optimizer_name}")

    # Select loss function
    loss_map = {
        'mse': nn.MSELoss(),
        'mae': nn.L1Loss(),
        'smooth_l1': nn.SmoothL1Loss(),
        'huber': nn.HuberLoss()
    }
    criterion_trial = loss_map[loss_function]

    # Early stopping state
    best_val_rmse = float('inf')
    patience = 100
    patience_counter = 0

    # ScheduleFree optimizers need their own train()/eval() calls
    is_schedulefree = optimizer_name.startswith('schedulefree')

    # Training loop
    max_epochs = 2000
    for epoch in range(max_epochs):
        # Training
        model_trial.train()
        if is_schedulefree:
            optimizer_trial.train()  # ScheduleFree specific

        for X_batch, y_batch in train_loader_trial:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer_trial.zero_grad()
            # squeeze(-1) rather than squeeze(): a final batch of size 1 must
            # stay 1-D so it matches y_batch.
            outputs = model_trial(X_batch).squeeze(-1)
            loss = criterion_trial(outputs, y_batch)
            loss.backward()
            optimizer_trial.step()

        # Validation
        model_trial.eval()
        if is_schedulefree:
            optimizer_trial.eval()  # ScheduleFree specific

        val_predictions = []
        val_targets = []
        with torch.no_grad():
            for X_batch, y_batch in val_loader_trial:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                # Keep the batch dimension so a single-sample batch is still
                # a 1-D array that can be concatenated.
                outputs = model_trial(X_batch).squeeze(-1)
                val_predictions.append(outputs.cpu().numpy())
                val_targets.append(y_batch.cpu().numpy())

        # Calculate RMSE in original scale
        val_predictions_original = scaler_y.inverse_transform(
            np.concatenate(val_predictions).reshape(-1, 1)).flatten()
        val_targets_original = scaler_y.inverse_transform(
            np.concatenate(val_targets).reshape(-1, 1)).flatten()
        val_rmse = np.sqrt(np.mean((val_predictions_original - val_targets_original)**2))

        # Early stopping
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break

        # Report intermediate value for Hyperband pruning
        trial.report(val_rmse, epoch)

        # Handle pruning
        if trial.should_prune():
            raise optuna.TrialPruned()

    return best_val_rmse

# Hyperband pruner: resource range of 50 to 2000 epochs, reduction factor 3.
pruner = HyperbandPruner(min_resource=50, max_resource=2000, reduction_factor=3)

# Minimisation study; the objective returns the best validation RMSE of a trial.
study = optuna.create_study(
    study_name='mlp_hyperband_optimization',
    direction='minimize',
    pruner=pruner,
)

banner = "="*60

print("Starting Optuna optimization with Hyperband Pruner...")
print(f"Settings: min_resource=50, max_resource=2000, reduction_factor=3")
print(f"Optimizing: optimizer (AdamW/RAdam/ScheduleFree variants), lr, weight_decay, dropout_rate, batch_size,")
print(f"            beta1, beta2, activation, loss_function, n_layers, hidden_sizes")
print(banner)

# Stop after 100 trials or 7200 seconds (2 hours), whichever comes first.
study.optimize(objective, n_trials=100, timeout=7200)

# Report the best trial and its parameters.
print("\n" + banner)
print("OPTIMIZATION COMPLETE!")
print(banner)
best_trial = study.best_trial
print(f"\nBest trial:")
print(f"  RMSE: {best_trial.value:.4f}")
print(f"\n  Best Parameters:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

# Summarise how many trials were pruned versus completed.
all_trials = study.trials
trial_states = optuna.trial.TrialState
pruned_trials = [t for t in all_trials if t.state == trial_states.PRUNED]
complete_trials = [t for t in all_trials if t.state == trial_states.COMPLETE]
print(f"\n  Statistics:")
print(f"    Completed trials: {len(complete_trials)}")
print(f"    Pruned trials: {len(pruned_trials)}")
print(f"    Total trials: {len(all_trials)}")
print(f"    Pruning efficiency: {len(pruned_trials)/len(all_trials)*100:.1f}%")
print(banner)

In [None]:
# Convert the scaled test set into a float32 tensor on the training device
test_tensor = torch.tensor(test_scaled, dtype=torch.float32).to(device)

# Switch to evaluation mode. For ScheduleFree optimizers the optimizer is
# switched as well, mirroring what train_model does before validation.
model.eval()
if is_schedulefree:
    optimizer.eval()

# Make predictions
with torch.no_grad():
    # squeeze(-1) drops only the output dimension, so a single-row test set
    # still yields a 1-D array.
    predictions_scaled = model(test_tensor).squeeze(-1).cpu().numpy()
    # Inverse transform to get actual DIC values
    predictions = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1)).flatten()

print(f"Predictions - Min: {predictions.min():.2f}, Max: {predictions.max():.2f}, Mean: {predictions.mean():.2f}")

# Prepare submission. Prefer the ids shipped in the sample submission over a
# hard-coded start value.
# NOTE(review): assumes sample_submission rows are in the same order as
# test.csv - confirm against the competition data.
if "id" in sub.columns and len(sub) == len(predictions):
    submission_ids = sub["id"].to_numpy()
else:
    # Fallback: consecutive ids starting at 1455, as originally hard-coded.
    submission_ids = range(1455, 1455 + len(predictions))

submission = pd.DataFrame({"id": submission_ids, "DIC": predictions})
submission.to_csv("submission.csv", index=False)
print("Submission saved!")