In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Show every file Kaggle mounted under the input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
!pip install schedulefree -q

In [None]:
# Load the competition tables: training data, test data and the sample submission.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/train.csv",
        "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/test.csv",
        "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/sample_submission.csv",
    )
)

In [None]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, PolynomialFeatures, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from scipy import stats
import numpy as np
import pandas as pd
import random

# Set random seeds for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Drop columns that are not model inputs.
# errors="ignore" keeps this cell idempotent: `train` is reassigned here, so a
# second run of the cell would otherwise raise KeyError on the already-dropped
# columns. (No missing-value handling happens in this cell.)
train = train.drop(columns=["Unnamed: 12", "id"], errors="ignore")

# CRITICAL FIX: Rename TA1.x to TA1 to match test data
train = train.rename(columns={"TA1.x": "TA1"})

# ============================================
# Extract Raw Features and Target
# ============================================
print(f"\n{'='*60}")
print("DATA PREPARATION")
print(f"{'='*60}")

# Every remaining column except the target 'DIC' is used as a feature.
feature_columns = [col for col in train.columns if col != 'DIC']

# Fail early with a readable message if the test table lacks a training feature.
missing_in_test = [col for col in feature_columns if col not in test.columns]
if missing_in_test:
    raise KeyError(f"Test data is missing feature columns: {missing_in_test}")

X_raw = train[feature_columns].copy()
y_raw = train['DIC'].values.copy()
X_test_raw = test[feature_columns].copy()

# The two messages below print the original training-data and test-data shapes.
print(f"元の訓練データ: {X_raw.shape}")
print(f"テストデータ: {X_test_raw.shape}")

# ============================================
# Holdout Validation Setup
# ============================================
print(f"\n{'='*60}")
print("HOLDOUT VALIDATION SETUP")
print(f"{'='*60}")

# 80% train, 20% validation
X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=SEED
)

print(f"Train set: {X_train_raw.shape[0]} samples")
print(f"Validation set: {X_val_raw.shape[0]} samples")
print(f"Test set: {X_test_raw.shape[0]} samples")
print("="*60)


In [None]:
# ============================================
# ResNet Model for Tabular Data
# Based on "Revisiting Deep Learning Models for Tabular Data" (NeurIPS 2021)
# ============================================

class ResNetBlock(nn.Module):
    """
    Pre-normalised residual MLP block for tabular inputs.

    Computes ``x + Dropout(Linear(Dropout(ReLU(Linear(BatchNorm(x))))))``;
    input and output both have ``d`` features.
    """
    def __init__(self, d, hidden_factor=2, dropout_rate=0.1):
        """
        Args:
            d: Width of the block's input and output.
            hidden_factor: Multiplier for the inner width (``int(d * hidden_factor)``).
            dropout_rate: Probability used by both dropout layers.
        """
        super().__init__()

        inner_width = int(d * hidden_factor)

        # Sub-modules are created in the order they are applied in forward().
        self.norm = nn.BatchNorm1d(d)
        self.linear1 = nn.Linear(d, inner_width)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)
        self.linear2 = nn.Linear(inner_width, d)
        self.dropout2 = nn.Dropout(dropout_rate)

        # Kaiming-normal weights first (linear1, then linear2), then zero biases.
        for layer in (self.linear1, self.linear2):
            nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='relu')
        for layer in (self.linear1, self.linear2):
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return ``x`` plus the output of the normalised two-layer branch."""
        branch = self.linear1(self.norm(x))
        branch = self.dropout1(self.relu(branch))
        branch = self.dropout2(self.linear2(branch))
        return x + branch


class ResNetModel(nn.Module):
    """
    Tabular ResNet regressor.

    A linear projection to width ``d``, a stack of ``ResNetBlock`` modules and
    a BatchNorm -> ReLU -> Linear head producing one value per row.
    """
    def __init__(self, input_size, d=256, n_blocks=4, hidden_factor=2, dropout_rate=0.1):
        """
        Args:
            input_size: Number of input features.
            d: Width used by the projection and every residual block.
            n_blocks: How many residual blocks to stack.
            hidden_factor: Inner-width multiplier forwarded to each block.
            dropout_rate: Dropout probability forwarded to each block.
        """
        super().__init__()

        # Input projection: Kaiming-normal weights, zero bias.
        self.input_layer = nn.Linear(input_size, d)
        nn.init.kaiming_normal_(self.input_layer.weight, mode='fan_in', nonlinearity='relu')
        nn.init.zeros_(self.input_layer.bias)

        # Residual trunk.
        residual_stack = []
        for _ in range(n_blocks):
            residual_stack.append(ResNetBlock(d, hidden_factor, dropout_rate))
        self.blocks = nn.ModuleList(residual_stack)

        # Regression head: BatchNorm -> ReLU -> Linear (Xavier-normal weights, zero bias).
        self.final_norm = nn.BatchNorm1d(d)
        self.final_relu = nn.ReLU()
        self.output = nn.Linear(d, 1)
        nn.init.xavier_normal_(self.output.weight)
        nn.init.zeros_(self.output.bias)

    def forward(self, x):
        """Map a ``(batch, input_size)`` tensor to ``(batch, 1)`` predictions."""
        hidden = self.input_layer(x)
        for residual_block in self.blocks:
            hidden = residual_block(hidden)
        return self.output(self.final_relu(self.final_norm(hidden)))


# Pick the compute device: CUDA when available, otherwise CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"\n{'='*60}")
print(f"Using device: {device}")
if device.type == 'cuda':
    # Report which GPU (index 0) will be used.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"{'='*60}")

In [None]:
import torch.optim as optim
from schedulefree import RAdamScheduleFree, AdamWScheduleFree
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.amp import autocast, GradScaler
from sklearn.preprocessing import RobustScaler
import copy
import optuna

# ============================================
# Configuration
# ============================================
# --- Optuna search ---
USE_OPTUNA = True    # run the hyperparameter search; False falls back to defaults
N_TRIALS = 100       # number of Optuna trials
TIMEOUT = None       # search time limit in seconds (None = unlimited)
OPTUNA_SEED = 42     # single training seed shared by all search trials

# --- Settings held fixed during the search ---
use_c_mixup = True
c_mixup_alpha = 1.0
c_mixup_sigma = 1.0
c_mixup_factor = 2
epochs = 100000
early_stopping_patience = 500

# --- Multi-seed training that follows the search ---
num_seeds = 5
start_seed = 1000

# Print the configuration summary in one go.
config_summary = [
    "="*60,
    "ResNet for Tabular Data - Optuna Hyperparameter Search",
    "="*60,
    f"USE_OPTUNA: {USE_OPTUNA}",
]
if USE_OPTUNA:
    config_summary += [
        f"N_TRIALS: {N_TRIALS}",
        f"TIMEOUT: {TIMEOUT}",
        f"OPTUNA_SEED: {OPTUNA_SEED} (fixed for search)",
    ]
config_summary += [
    f"epochs: {epochs}",
    f"early_stopping_patience: {early_stopping_patience}",
    f"num_seeds: {num_seeds} (seeds {start_seed} to {start_seed + num_seeds - 1})",
    "="*60,
]
print("\n".join(config_summary))


class EMA:
    """
    Exponential Moving Average (EMA) of a model's trainable parameters.

    Typical use: call ``update()`` after each optimizer step; wrap evaluation
    in ``apply_shadow()`` / ``restore()`` to temporarily swap the averaged
    weights into the model.

    Only parameters with ``requires_grad=True`` at construction time are
    tracked; module buffers (e.g. BatchNorm running statistics) are not.
    """
    def __init__(self, model, decay=0.999):
        """
        Args:
            model: Module whose parameters are averaged.
            decay: Weight kept from the previous average on every update.
        """
        self.model = model
        self.decay = decay
        self.shadow = {}   # parameter name -> averaged tensor
        self.backup = {}   # parameter name -> raw tensor saved by apply_shadow()
        self.register()

    def register(self):
        """Initialise the averages with a copy of the current parameter values."""
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def update(self):
        """Blend the current parameters into the running averages."""
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                # The arithmetic already allocates a fresh tensor; no extra clone needed.
                self.shadow[name] = self.decay * self.shadow[name] + (1.0 - self.decay) * param.data

    def apply_shadow(self):
        """
        Copy the averaged weights into the model, saving the raw weights first.

        Values are copied rather than rebound, so in-place writes to the
        parameters while the averages are applied cannot corrupt ``shadow``.
        A second call before ``restore()`` is a no-op, so the saved raw
        weights are never overwritten by averaged ones.
        """
        if self.backup:
            return
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.backup[name] = param.data.clone()
                param.data.copy_(self.shadow[name])

    def restore(self):
        """Put back the raw weights saved by ``apply_shadow()``; no-op if none are saved."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and name in self.backup:
                param.data.copy_(self.backup[name])
        self.backup = {}


def smooth_clip(x, clip_val=3.0):
    """Squash values into the open interval (-clip_val, clip_val) with a scaled tanh."""
    squashed = np.tanh(x / clip_val)
    return clip_val * squashed


def inverse_smooth_clip(x, clip_val=3.0):
    """
    Undo ``smooth_clip``: ``clip_val * arctanh(x / clip_val)``.

    The arctanh argument is clamped to [-0.995, 0.995] because arctanh
    diverges at ±1. Any entry that is still non-finite afterwards is replaced
    by ``sign(x) * clip_val * 10``.
    """
    bounded = np.clip(x / clip_val, -0.995, 0.995)
    restored = clip_val * np.arctanh(bounded)
    fallback = np.sign(x) * clip_val * 10
    return np.where(np.isfinite(restored), restored, fallback)


def c_mixup(X, y, alpha=1.0, sigma=1.0, augment_factor=2):
    """
    C-Mixup (Calibrated Mixup) data augmentation for regression.

    Each sample i is mixed with a partner j drawn with probability
    proportional to ``exp(-(y_i - y_j)^2 / (2 * sigma^2))`` (never itself),
    using a mixing weight drawn from ``Beta(alpha, alpha)``.

    Args:
        X: Feature matrix of shape (n_samples, n_features).
        y: Target vector of length n_samples.
        alpha: Both shape parameters of the Beta distribution.
        sigma: Bandwidth of the label-similarity kernel.
        augment_factor: Number of mixed samples generated per original sample.

    Returns:
        (X_aug, y_aug): the original samples followed by
        ``augment_factor * n_samples`` mixed samples. When ``augment_factor``
        is not positive or fewer than two samples are given, copies of the
        inputs are returned because no mixing is possible.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]

    # No partner exists for a lone sample, and a factor of 0 means "no augmentation".
    if augment_factor <= 0 or n_samples < 2:
        return X.copy(), y.copy()

    y_expanded = y.reshape(-1, 1)
    label_distances = (y_expanded - y_expanded.T) ** 2

    sampling_probs = np.exp(-label_distances / (2 * sigma ** 2))
    np.fill_diagonal(sampling_probs, 0)  # a sample is never mixed with itself
    row_sums = sampling_probs.sum(axis=1, keepdims=True)

    # A label far from every other label underflows to an all-zero row, which
    # np.random.choice rejects. Such rows fall back to a uniform choice over
    # the other samples.
    isolated = np.flatnonzero(row_sums.ravel() == 0)
    if isolated.size:
        sampling_probs[isolated, :] = 1.0
        sampling_probs[isolated, isolated] = 0.0
        row_sums = sampling_probs.sum(axis=1, keepdims=True)

    sampling_probs = sampling_probs / row_sums

    X_augmented = []
    y_augmented = []

    # The draw order (partner, then mixing weight) is kept so results for a
    # given NumPy seed stay the same on inputs the function already handled.
    for _ in range(augment_factor):
        for i in range(n_samples):
            j = np.random.choice(n_samples, p=sampling_probs[i])
            lambda_mix = np.random.beta(alpha, alpha)

            X_augmented.append(lambda_mix * X[i] + (1 - lambda_mix) * X[j])
            y_augmented.append(lambda_mix * y[i] + (1 - lambda_mix) * y[j])

    X_aug = np.vstack([X, np.array(X_augmented)])
    y_aug = np.hstack([y, np.array(y_augmented)])

    return X_aug, y_aug


def train_and_evaluate(params, X_train_raw, X_val_raw, y_train_raw, y_val_raw, seed, verbose=False):
    """
    Train one ResNet with the given hyperparameters and return its best validation RMSE.

    Pipeline: RobustScaler on features and target -> smooth tanh clipping ->
    optional C-Mixup augmentation of the training set -> training with early
    stopping on the validation RMSE, measured in original target units.

    Args:
        params: Dict with keys 'd', 'n_blocks', 'hidden_factor',
            'hidden_dropout', 'residual_dropout', 'learning_rate',
            'weight_decay', 'batch_size', 'optimizer', 'loss_function',
            'use_ema' and optionally 'ema_decay' (default 0.999).
        X_train_raw, X_val_raw: Feature tables exposing ``.values``.
        y_train_raw, y_val_raw: 1-D NumPy arrays of targets in original units.
        seed: Seed applied to ``random``, NumPy and PyTorch.
        verbose: Print the validation RMSE every 10 epochs and the stop epoch.

    Returns:
        float: Lowest validation RMSE observed before training stopped.

    Raises:
        ValueError: If ``params['optimizer']`` is not a supported name.
        KeyError: If ``params['loss_function']`` is not a supported name.
    """
    # Set random seeds
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Preprocessing: scalers are fitted on the training split only.
    scaler_X = RobustScaler()
    scaler_y = RobustScaler()

    X_train_scaled = scaler_X.fit_transform(X_train_raw.values)
    X_val_scaled = scaler_X.transform(X_val_raw.values)
    y_train_scaled = scaler_y.fit_transform(y_train_raw.reshape(-1, 1)).flatten()

    # Smooth Clipping
    clip_val = 3.0
    X_train_clipped = smooth_clip(X_train_scaled, clip_val=clip_val)
    X_val_final = smooth_clip(X_val_scaled, clip_val=clip_val)
    y_train_clipped = smooth_clip(y_train_scaled, clip_val=clip_val)

    # Predictions are scored against the untouched validation targets.
    # Sending the targets through scaling -> clipping -> float32 -> clamped
    # inverse clipping would alter them and bias the reported RMSE.
    y_val_true = np.asarray(y_val_raw, dtype=np.float64).reshape(-1)

    # C-Mixup augmentation
    if use_c_mixup:
        X_train_final, y_train_final = c_mixup(
            X_train_clipped,
            y_train_clipped,
            alpha=c_mixup_alpha,
            sigma=c_mixup_sigma,
            augment_factor=c_mixup_factor
        )
    else:
        X_train_final = X_train_clipped
        y_train_final = y_train_clipped

    # Create DataLoaders
    train_dataset = TensorDataset(
        torch.tensor(X_train_final, dtype=torch.float32),
        torch.tensor(y_train_final, dtype=torch.float32)
    )
    val_dataset = TensorDataset(torch.tensor(X_val_final, dtype=torch.float32))

    batch_size = params['batch_size']
    # BatchNorm1d cannot run in training mode on a single-sample batch, so a
    # trailing batch of exactly one sample is dropped. Nothing else changes.
    drop_single_tail = len(train_dataset) > 1 and len(train_dataset) % batch_size == 1
    # Pinned memory is only requested when a GPU is present.
    pin = torch.cuda.is_available()

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              pin_memory=pin, drop_last=drop_single_tail)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=pin)

    # Initialize Model
    model = ResNetModel(
        input_size=X_train_final.shape[1],
        d=params['d'],
        n_blocks=params['n_blocks'],
        hidden_factor=params['hidden_factor'],
        dropout_rate=params['residual_dropout']
    )
    model = model.to(device)

    # The constructor gives both dropouts the residual rate; the hidden
    # dropout of every block is replaced with its own rate here.
    for block in model.blocks:
        block.dropout1 = nn.Dropout(params['hidden_dropout'])

    # Initialize EMA (conditional based on params)
    use_ema = params['use_ema']
    ema_decay = params.get('ema_decay', 0.999)
    ema = EMA(model, decay=ema_decay) if use_ema else None

    # Loss function
    loss_map = {
        'mse': nn.MSELoss(),
        'mae': nn.L1Loss(),
        'smooth_l1': nn.SmoothL1Loss(),
        'huber': nn.HuberLoss()
    }
    criterion = loss_map[params['loss_function']]

    # Initialize Optimizer
    optimizer_name = params['optimizer']
    lr = params['learning_rate']
    weight_decay = params['weight_decay']

    is_schedulefree = optimizer_name.endswith('_schedulefree')
    if optimizer_name == 'adamw_schedulefree':
        optimizer = AdamWScheduleFree(model.parameters(), lr=lr, weight_decay=weight_decay)
    elif optimizer_name == 'radam_schedulefree':
        optimizer = RAdamScheduleFree(model.parameters(), lr=lr, weight_decay=weight_decay)
    elif optimizer_name == 'adamw':
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    else:
        raise ValueError(f"Unknown optimizer: {optimizer_name}")

    # Mixed precision training
    use_amp = torch.cuda.is_available()
    scaler = GradScaler('cuda') if use_amp else None

    # Training Loop
    best_val_rmse = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        # Schedule-free optimizers are switched between train and eval mode explicitly.
        if is_schedulefree:
            optimizer.train()

        model.train()

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()

            if use_amp:
                with autocast('cuda'):
                    outputs = model(X_batch)
                    loss = criterion(outputs.squeeze(-1), y_batch)

                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(X_batch)
                loss = criterion(outputs.squeeze(-1), y_batch)
                loss.backward()
                optimizer.step()

            if ema is not None:
                ema.update()

        # Evaluation mode
        if is_schedulefree:
            optimizer.eval()

        if ema is not None:
            ema.apply_shadow()

        model.eval()

        # Validation
        val_predictions = []

        with torch.no_grad():
            for (X_batch,) in val_loader:
                X_batch = X_batch.to(device)

                if use_amp:
                    with autocast('cuda'):
                        outputs = model(X_batch)
                else:
                    outputs = model(X_batch)

                # .float(): autocast may return half precision; the inverse
                # transforms below are done in at least float32.
                val_predictions.append(outputs.squeeze(-1).float().cpu().numpy())

        if ema is not None:
            ema.restore()

        # Map predictions back to original units: inverse clipping, then inverse scaling.
        val_predictions_unclipped = inverse_smooth_clip(np.concatenate(val_predictions), clip_val=clip_val)
        val_predictions_original = scaler_y.inverse_transform(val_predictions_unclipped.reshape(-1, 1)).flatten()

        val_rmse = float(np.sqrt(np.mean((val_predictions_original - y_val_true) ** 2)))

        # Print progress every 10 epochs if verbose
        if verbose and (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}/{epochs} - val_RMSE: {val_rmse:.4f} (best: {best_val_rmse:.4f})")

        # Early stopping. Only the best score is tracked: the function returns
        # nothing else, so no model snapshots are kept.
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= early_stopping_patience:
            if verbose:
                print(f"Early stopping at epoch {epoch + 1}")
            break

    return best_val_rmse


def objective(trial):
    """
    Optuna objective: sample one hyperparameter set and return its validation RMSE.

    Every trial is trained with the same seed (``OPTUNA_SEED``), so trials
    differ only in their hyperparameters.
    """
    params = {
        # Model architecture
        'd': trial.suggest_int('d', 64, 512),
        'n_blocks': trial.suggest_int('n_blocks', 1, 8),
        'hidden_factor': trial.suggest_float('hidden_factor', 1.0, 4.0),
        'hidden_dropout': trial.suggest_float('hidden_dropout', 0.0, 0.5),
    }

    # Residual dropout: either disabled (0.0) or sampled from [0, 0.5].
    params['residual_dropout'] = (
        trial.suggest_float('residual_dropout', 0.0, 0.5)
        if trial.suggest_categorical('use_residual_dropout', [True, False])
        else 0.0
    )

    # Training hyperparameters
    params['learning_rate'] = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)

    # Weight decay: either disabled (0.0) or log-sampled from [1e-6, 1e-3].
    params['weight_decay'] = (
        trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True)
        if trial.suggest_categorical('use_weight_decay', [True, False])
        else 0.0
    )

    params['batch_size'] = trial.suggest_categorical('batch_size', [16, 32, 64, 128])
    params['optimizer'] = trial.suggest_categorical('optimizer', ['adamw', 'adamw_schedulefree', 'radam_schedulefree'])
    params['loss_function'] = trial.suggest_categorical('loss_function', ['mse', 'mae', 'smooth_l1', 'huber'])

    # EMA: the decay is only sampled when EMA is enabled; otherwise a placeholder is stored.
    ema_enabled = trial.suggest_categorical('use_ema', [True, False])
    params['use_ema'] = ema_enabled
    params['ema_decay'] = trial.suggest_float('ema_decay', 0.99, 0.9999) if ema_enabled else 0.999

    # Train with the fixed search seed and report the holdout RMSE.
    return train_and_evaluate(
        params, X_train_raw, X_val_raw, y_train_raw, y_val_raw,
        seed=OPTUNA_SEED, verbose=False,
    )


# ============================================
# OPTUNA HYPERPARAMETER SEARCH
# ============================================
if USE_OPTUNA:
    print("\n" + "="*60)
    print("STARTING OPTUNA HYPERPARAMETER SEARCH")
    print(f"Using fixed seed: {OPTUNA_SEED}")
    print("="*60)

    # Seed the sampler as well as the training runs. Without this the sequence
    # of sampled hyperparameters differs on every execution, even though each
    # trial trains with OPTUNA_SEED. TPE is also Optuna's default sampler.
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=OPTUNA_SEED),
    )
    study.optimize(objective, n_trials=N_TRIALS, timeout=TIMEOUT, show_progress_bar=True)

    tuned = study.best_trial.params

    print("\n" + "="*60)
    print("OPTUNA SEARCH COMPLETE")
    print("="*60)
    print(f"Best trial:")
    print(f"  Value (val_RMSE): {study.best_trial.value:.4f}")
    print(f"  Params:")
    for key, value in tuned.items():
        print(f"    {key}: {value}")
    print("="*60)

    # Rebuild the params dict expected by train_and_evaluate(). Conditional
    # values exist in the trial only when their switch was sampled as True;
    # otherwise the same fallback as in objective() is used.
    best_params = {
        'd': tuned['d'],
        'n_blocks': tuned['n_blocks'],
        'hidden_factor': tuned['hidden_factor'],
        'hidden_dropout': tuned['hidden_dropout'],
        'residual_dropout': (
            tuned.get('residual_dropout', 0.0) if tuned.get('use_residual_dropout', False) else 0.0
        ),
        'learning_rate': tuned['learning_rate'],
        'weight_decay': (
            tuned.get('weight_decay', 0.0) if tuned.get('use_weight_decay', False) else 0.0
        ),
        'batch_size': tuned['batch_size'],
        'optimizer': tuned['optimizer'],
        'loss_function': tuned['loss_function'],
        'use_ema': tuned['use_ema'],
        # 0.999 is a placeholder that is not used when EMA is disabled.
        'ema_decay': tuned.get('ema_decay', 0.999) if tuned['use_ema'] else 0.999,
    }

else:
    # Default hyperparameters (if not using Optuna)
    best_params = {
        'd': 256,
        'n_blocks': 4,
        'hidden_factor': 2.0,
        'hidden_dropout': 0.1,
        'residual_dropout': 0.1,
        'learning_rate': 1e-3,
        'weight_decay': 1e-4,
        'batch_size': 64,
        'optimizer': 'adamw_schedulefree',
        'loss_function': 'mae',
        'use_ema': True,
        'ema_decay': 0.999,
    }
    print("\nUsing default hyperparameters (Optuna disabled)")


# ============================================
# TRAIN WITH BEST HYPERPARAMETERS (MULTIPLE SEEDS)
# ============================================
def _predict_dic(net, inputs, target_scaler, clip_val, use_amp):
    """Run `net` on `inputs` and return predictions mapped back to original target units."""
    with torch.no_grad():
        if use_amp:
            with autocast('cuda'):
                raw_outputs = net(inputs)
        else:
            raw_outputs = net(inputs)
    # squeeze(-1) keeps a 1-D result even for a single row; .float() avoids
    # doing the inverse transforms in half precision under autocast.
    clipped = raw_outputs.squeeze(-1).float().cpu().numpy()
    unclipped = inverse_smooth_clip(clipped, clip_val=clip_val)
    return target_scaler.inverse_transform(unclipped.reshape(-1, 1)).flatten()


print("\n" + "#"*60)
print("# TRAINING WITH BEST HYPERPARAMETERS (MULTIPLE SEEDS)")
print("#"*60)
print(f"\nBest hyperparameters:")
for key, value in best_params.items():
    print(f"  {key}: {value}")
print()

best_rmses = []

for seed_idx in range(num_seeds):
    # A dedicated name is used so the global SEED constant is not overwritten.
    run_seed = start_seed + seed_idx

    print(f"\n{'#'*60}")
    print(f"# SEED {run_seed} ({seed_idx + 1}/{num_seeds})")
    print(f"{'#'*60}\n")

    val_rmse = train_and_evaluate(best_params, X_train_raw, X_val_raw, y_train_raw, y_val_raw, run_seed, verbose=True)
    best_rmses.append(val_rmse)

    print(f"\nValidation RMSE: {val_rmse:.4f}")

    # Train the model used for the test predictions. It is fitted on the
    # training split (X_train_raw), not on the full data; the holdout split
    # supplies the early-stopping signal.
    random.seed(run_seed)
    np.random.seed(run_seed)
    torch.manual_seed(run_seed)
    torch.cuda.manual_seed_all(run_seed)

    # Preprocessing
    scaler_X = RobustScaler()
    scaler_y = RobustScaler()

    X_train_scaled = scaler_X.fit_transform(X_train_raw.values)
    X_val_scaled = scaler_X.transform(X_val_raw.values)
    X_test_scaled = scaler_X.transform(X_test_raw.values)

    y_train_scaled = scaler_y.fit_transform(y_train_raw.reshape(-1, 1)).flatten()

    # Smooth Clipping
    clip_val = 3.0
    X_train_clipped = smooth_clip(X_train_scaled, clip_val=clip_val)
    X_val_clipped = smooth_clip(X_val_scaled, clip_val=clip_val)
    X_test_clipped = smooth_clip(X_test_scaled, clip_val=clip_val)
    y_train_clipped = smooth_clip(y_train_scaled, clip_val=clip_val)

    # Holdout targets stay in original units for scoring.
    y_val_true = np.asarray(y_val_raw, dtype=np.float64).reshape(-1)

    if use_c_mixup:
        X_train_final, y_train_final = c_mixup(
            X_train_clipped,
            y_train_clipped,
            alpha=c_mixup_alpha,
            sigma=c_mixup_sigma,
            augment_factor=c_mixup_factor
        )
    else:
        X_train_final = X_train_clipped
        y_train_final = y_train_clipped

    # Create DataLoader
    train_dataset = TensorDataset(
        torch.tensor(X_train_final, dtype=torch.float32),
        torch.tensor(y_train_final, dtype=torch.float32)
    )
    # BatchNorm1d cannot run in training mode on a single-sample batch, so a
    # trailing batch of exactly one sample is dropped.
    drop_single_tail = (
        len(train_dataset) > 1 and len(train_dataset) % best_params['batch_size'] == 1
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=best_params['batch_size'],
        shuffle=True,
        pin_memory=torch.cuda.is_available(),
        drop_last=drop_single_tail,
    )

    # Initialize Model
    model = ResNetModel(
        input_size=X_train_final.shape[1],
        d=best_params['d'],
        n_blocks=best_params['n_blocks'],
        hidden_factor=best_params['hidden_factor'],
        dropout_rate=best_params['residual_dropout']
    )
    model = model.to(device)

    # Give the hidden dropout of every block its own rate.
    for block in model.blocks:
        block.dropout1 = nn.Dropout(best_params['hidden_dropout'])

    # Initialize EMA and Optimizer
    use_ema = best_params['use_ema']
    ema_decay = best_params['ema_decay']
    ema = EMA(model, decay=ema_decay) if use_ema else None

    loss_map = {
        'mse': nn.MSELoss(),
        'mae': nn.L1Loss(),
        'smooth_l1': nn.SmoothL1Loss(),
        'huber': nn.HuberLoss()
    }
    criterion = loss_map[best_params['loss_function']]

    optimizer_name = best_params['optimizer']
    is_schedulefree = optimizer_name.endswith('_schedulefree')
    if optimizer_name == 'adamw_schedulefree':
        optimizer = AdamWScheduleFree(model.parameters(), lr=best_params['learning_rate'], weight_decay=best_params['weight_decay'])
    elif optimizer_name == 'radam_schedulefree':
        optimizer = RAdamScheduleFree(model.parameters(), lr=best_params['learning_rate'], weight_decay=best_params['weight_decay'])
    else:
        optimizer = optim.AdamW(model.parameters(), lr=best_params['learning_rate'], weight_decay=best_params['weight_decay'])

    use_amp = torch.cuda.is_available()
    scaler_grad = GradScaler('cuda') if use_amp else None

    val_tensor = torch.tensor(X_val_clipped, dtype=torch.float32).to(device)
    test_tensor = torch.tensor(X_test_clipped, dtype=torch.float32).to(device)

    # Train with the same early-stopping rule as train_and_evaluate(). Without
    # a stopping rule this loop would always run the full `epochs` budget.
    print("Training final model...")
    final_best_rmse = float('inf')
    final_best_epoch = 0
    final_best_state = None
    stale_epochs = 0

    for epoch in range(epochs):
        if is_schedulefree:
            optimizer.train()

        model.train()

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()

            if use_amp:
                with autocast('cuda'):
                    outputs = model(X_batch)
                    loss = criterion(outputs.squeeze(-1), y_batch)

                scaler_grad.scale(loss).backward()
                scaler_grad.step(optimizer)
                scaler_grad.update()
            else:
                outputs = model(X_batch)
                loss = criterion(outputs.squeeze(-1), y_batch)
                loss.backward()
                optimizer.step()

            if ema is not None:
                ema.update()

        # Score the weights that would be used for inference: schedule-free
        # optimizer in eval mode and, when enabled, the EMA weights swapped in.
        if is_schedulefree:
            optimizer.eval()
        if ema is not None:
            ema.apply_shadow()
        model.eval()

        val_pred = _predict_dic(model, val_tensor, scaler_y, clip_val, use_amp)
        epoch_rmse = float(np.sqrt(np.mean((val_pred - y_val_true) ** 2)))

        if epoch_rmse < final_best_rmse:
            final_best_rmse = epoch_rmse
            final_best_epoch = epoch + 1
            # Snapshot the evaluation-time weights (and BatchNorm buffers).
            final_best_state = copy.deepcopy(model.state_dict())
            stale_epochs = 0
        else:
            stale_epochs += 1

        if ema is not None:
            ema.restore()

        if stale_epochs >= early_stopping_patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    if final_best_state is not None:
        model.load_state_dict(final_best_state)
        print(f"Using weights from epoch {final_best_epoch} (val_RMSE: {final_best_rmse:.4f})")
    elif ema is not None:
        # No epoch produced a finite score; fall back to the last EMA weights.
        ema.apply_shadow()

    # Predict
    model.eval()
    predictions = _predict_dic(model, test_tensor, scaler_y, clip_val, use_amp)

    # Save submission. Ids come from the sample submission when it has an
    # 'id' column of matching length; otherwise fall back to ids from 1455.
    # NOTE(review): assumes sample_submission.csv rows are in test-row order — confirm.
    if "id" in sub.columns and len(sub) == len(predictions):
        submission_ids = sub["id"].values
    else:
        submission_ids = range(1455, 1455 + len(predictions))

    submission = pd.DataFrame({
        "id": submission_ids,
        "DIC": predictions
    })
    submission_filename = f"submission_resnet_seed_{run_seed}.csv"
    submission.to_csv(submission_filename, index=False)

    print(f"Saved: {submission_filename}")

print(f"\n{'#'*60}")
print(f"# ALL SEEDS COMPLETE")
print(f"{'#'*60}")
print(f"Validation RMSEs: {best_rmses}")
print(f"Mean RMSE: {np.mean(best_rmses):.4f} ± {np.std(best_rmses):.4f}")
print(f"{'#'*60}")

In [None]:

# name: 坂田煌翔
# student_id: 62408940