In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
!pip install schedulefree -q

In [None]:
# Competition files, in the order: training set, test set, sample submission.
_DATA_FILES = (
    "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/train.csv",
    "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/test.csv",
    "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/sample_submission.csv",
)
train, test, sub = (pd.read_csv(path) for path in _DATA_FILES)

In [None]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, PolynomialFeatures, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from scipy import stats
import numpy as np
import pandas as pd
import random

# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Drop columns that are not used as features.
# errors="ignore" keeps this cell idempotent: `train` is reassigned here, so
# re-running the cell would otherwise raise KeyError because the columns are
# already gone.
# NOTE: no missing-value handling is performed in this cell.
train = train.drop(columns=["Unnamed: 12", "id"], errors="ignore")

# The training file calls this column "TA1.x"; rename it to "TA1" so it
# matches the test data and one feature list can index both frames.
train = train.rename(columns={"TA1.x": "TA1"})

# ============================================
# Extract Raw Features and Target
# ============================================
print(f"\n{'='*60}")
print("DATA PREPARATION")
print(f"{'='*60}")

# Every remaining column except the target 'DIC' is a feature.
feature_columns = [col for col in train.columns if col != 'DIC']
X_raw = train[feature_columns].copy()
y_raw = train['DIC'].values.copy()
# Select the same columns, in the same order, from the test frame.
X_test_raw = test[feature_columns].copy()

# The two messages below read "original training data" and "test data".
print(f"元の訓練データ: {X_raw.shape}")
print(f"テストデータ: {X_test_raw.shape}")

# ============================================
# Holdout Validation Setup
# ============================================
print(f"\n{'='*60}")
print("HOLDOUT VALIDATION SETUP")
print(f"{'='*60}")

# 80% train, 20% validation; the split is fixed by SEED.
X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=SEED
)

print(f"Train set: {X_train_raw.shape[0]} samples")
print(f"Validation set: {X_val_raw.shape[0]} samples")
print(f"Test set: {X_test_raw.shape[0]} samples")
print("="*60)


In [None]:
# ============================================
# MLP Model
# ============================================

class FeatureScalingLayer(nn.Module):
    """Element-wise learnable scaling of the input features.

    Every input feature is multiplied by its own trainable coefficient, which
    gives the network a soft form of feature selection.
    """

    def __init__(self, num_features):
        """
        Args:
            num_features: Number of input features (one coefficient per feature).
        """
        super().__init__()
        # All-ones start: the layer initially passes features through unchanged.
        initial_scale = torch.ones(num_features)
        self.scale = nn.Parameter(initial_scale)

    def forward(self, x):
        """Return ``x`` with each feature multiplied by its coefficient."""
        scaled = x * self.scale
        return scaled


class LearnableActivation(nn.Module):
    """Activation that blends the identity with a base non-linearity.

    Computes sigma_alpha(x) = (1 - alpha) * x + alpha * sigma(x), with one
    trainable alpha per feature.
    """

    def __init__(self, num_features, activation_fn):
        """
        Args:
            num_features: Number of features (each neuron gets its own alpha).
            activation_fn: Base activation sigma to blend with the identity.
        """
        super().__init__()
        self.activation_fn = activation_fn
        # alpha starts at 1, i.e. the layer initially equals the base activation.
        self.alpha = nn.Parameter(torch.ones(num_features))

    def forward(self, x):
        """Apply the blended activation to ``x``."""
        # Only values of alpha inside [0, 1] are used for the blend.
        gate = torch.clamp(self.alpha, 0.0, 1.0)
        identity_part = (1 - gate) * x
        activated_part = gate * self.activation_fn(x)
        return identity_part + activated_part


class MLPModel(nn.Module):
    """Single-output MLP regressor with a linear shortcut.

    The input first passes through a FeatureScalingLayer. The scaled input then
    feeds (a) a stack of Linear -> [BatchNorm1d] -> LearnableActivation ->
    Dropout blocks followed by a linear output head, and (b) a linear shortcut
    that maps the scaled input directly to one output. The two outputs are
    summed.
    """

    def __init__(self, input_size, hidden_sizes=(128,), dropout_rate=0.0, activation='relu', use_batchnorm=True):
        """
        Args:
            input_size: Number of input features.
            hidden_sizes: Sequence (list or tuple) of hidden layer widths. An
                empty sequence yields a purely linear model.
            dropout_rate: Dropout probability applied after every hidden layer.
            activation: Name of the base activation: 'relu', 'leaky_relu',
                'elu', 'gelu', 'silu', 'tanh' or 'mish'. Unknown names fall
                back to ReLU and emit a warning.
            use_batchnorm: Whether to apply BatchNorm1d after each hidden
                Linear layer.
        """
        super(MLPModel, self).__init__()

        # Work on a private list: accepts tuples as well as lists and avoids
        # sharing a mutable default argument between instances.
        hidden_sizes = list(hidden_sizes)

        self.use_batchnorm = use_batchnorm
        self.num_layers = len(hidden_sizes)

        # Activation function mapping
        activation_map = {
            'relu': nn.ReLU(),
            'leaky_relu': nn.LeakyReLU(0.1),
            'elu': nn.ELU(),
            'gelu': nn.GELU(),
            'silu': nn.SiLU(),
            'tanh': nn.Tanh(),
            'mish': nn.Mish()
        }

        if activation not in activation_map:
            # Keep the best-effort fallback, but make a typo in the name visible.
            import warnings
            warnings.warn(
                f"Unknown activation '{activation}'; falling back to ReLU.",
                stacklevel=2,
            )
        base_activation = activation_map.get(activation, nn.ReLU())

        # Kaiming init for the ReLU family, Xavier init for everything else.
        nonlinearity = 'relu' if activation in ['relu', 'leaky_relu'] else 'linear'

        # Feature scaling layer, applied before the first hidden layer.
        self.feature_scaling = FeatureScalingLayer(input_size)

        # Build hidden layers dynamically
        self.hidden_layers = nn.ModuleList()
        self.batch_norms = nn.ModuleList() if use_batchnorm else None
        self.learnable_activations = nn.ModuleList()  # one learnable activation per layer
        self.dropouts = nn.ModuleList()

        layer_sizes = [input_size] + hidden_sizes

        for i in range(len(hidden_sizes)):
            # Hidden layer
            layer = nn.Linear(layer_sizes[i], layer_sizes[i+1])
            if nonlinearity == 'relu':
                nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='relu')
            else:
                nn.init.xavier_normal_(layer.weight)
            nn.init.constant_(layer.bias, 0)
            self.hidden_layers.append(layer)

            # Batch normalization
            if use_batchnorm:
                self.batch_norms.append(nn.BatchNorm1d(layer_sizes[i+1]))

            # Learnable activation for this layer (all layers share one base
            # activation module).
            self.learnable_activations.append(
                LearnableActivation(layer_sizes[i+1], base_activation)
            )

            # Dropout
            self.dropouts.append(nn.Dropout(dropout_rate))

        # Output layer. layer_sizes[-1] is the last hidden width, or input_size
        # when there are no hidden layers.
        self.output = nn.Linear(layer_sizes[-1], 1)
        nn.init.xavier_normal_(self.output.weight)
        nn.init.constant_(self.output.bias, 0)

        # Linear shortcut from the (scaled) input straight to the output.
        self.shortcut = nn.Linear(input_size, 1)
        nn.init.xavier_normal_(self.shortcut.weight)
        nn.init.constant_(self.shortcut.bias, 0)

    def forward(self, x):
        """Return the sum of the MLP output and the linear shortcut output.

        The result keeps a trailing dimension of size 1 (the width of both
        output heads).
        """
        # Feature scaling is applied first and shared by both paths.
        x_scaled = self.feature_scaling(x)

        # Main path
        h = x_scaled
        for i in range(self.num_layers):
            h = self.hidden_layers[i](h)
            if self.use_batchnorm:
                h = self.batch_norms[i](h)
            h = self.learnable_activations[i](h)
            h = self.dropouts[i](h)

        main_output = self.output(h)

        # Shortcut path, fed with the scaled input.
        residual = self.shortcut(x_scaled)

        return main_output + residual


# Run on the GPU when PyTorch can see one, otherwise on the CPU.
_cuda_ready = torch.cuda.is_available()
device = torch.device('cuda') if _cuda_ready else torch.device('cpu')

print(f"\n{'='*60}")
print(f"Using device: {device}")
if _cuda_ready:
    # Report the name of the first CUDA device.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"{'='*60}")

In [None]:
import torch.optim as optim
from schedulefree import RAdamScheduleFree, AdamWScheduleFree
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.cuda.amp import autocast, GradScaler
import copy

# --- Fixed settings ---
activation = 'mish'
use_batchnorm = False
loss_function = 'mae'
optimizer_name = 'adamw_schedulefree'
beta1, beta2 = 0.9, 0.999
clip_value = 3.0
early_stopping_patience = 1000
dropout_rate = 0.0
use_ema, ema_decay = True, 0.999

# --- Model and training settings ---
hidden_size = 2048
hidden_sizes = [hidden_size]  # a single hidden layer
lr, weight_decay = 6.5e-05, 1.8e-05
batch_size, epochs = 16, 2500

# Echo the configuration so it is recorded in the cell output.
print("="*60)
print("HOLDOUT TRAINING")
print("="*60)
print(f"Early stopping: patience={early_stopping_patience} epochs (based on RMSE)")
print(f"\nHyperparameters:")
print(
    f"  hidden_size: {hidden_size}",
    f"  lr: {lr:.6f}",
    f"  weight_decay: {weight_decay:.6e}",
    f"  batch_size: {batch_size}",
    f"  epochs: {epochs}",
    f"  loss_function: {loss_function}",
    f"  use_ema: {use_ema}, ema_decay: {ema_decay}",
    f"  mixed_precision: {torch.cuda.is_available()}",
    sep="\n",
)
print("="*60)


class EMA:
    """Exponential Moving Average (EMA) of a model's trainable parameters.

    A "shadow" copy of every parameter with ``requires_grad=True`` is kept and
    updated as ``shadow = decay * shadow + (1 - decay) * param``.
    ``apply_shadow`` temporarily loads the averaged values into the model and
    ``restore`` puts the live values back.
    """

    def __init__(self, model, decay=0.999):
        """
        Args:
            model: Module whose trainable parameters are tracked.
            decay: Weight given to the previous average on each update.
        """
        self.model = model
        self.decay = decay
        self.shadow = {}   # parameter name -> averaged tensor
        self.backup = {}   # parameter name -> live tensor saved by apply_shadow
        self.register()

    def register(self):
        """Initialise the shadow copy from the model's current parameters."""
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def update(self):
        """Fold the model's current parameters into the running average."""
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                # The arithmetic already yields a fresh tensor, so the shadow
                # never shares storage with the parameter.
                self.shadow[name] = self.decay * self.shadow[name] + (1.0 - self.decay) * param.data

    def apply_shadow(self):
        """Save the live parameters and load the averaged values in their place."""
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.backup[name] = param.data.clone()
                # Copy values instead of rebinding ``param.data`` to the shadow
                # tensor: rebinding would alias the two, so any in-place write
                # to the parameter would silently corrupt the average.
                param.data.copy_(self.shadow[name])

    def restore(self):
        """Put back the parameters saved by the last ``apply_shadow`` call.

        Parameters without a saved backup are left untouched, so calling this
        without a preceding ``apply_shadow`` is a no-op.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and name in self.backup:
                param.data.copy_(self.backup[name])
        self.backup = {}


# Loss functions selectable by name; `criterion` is the one chosen by the
# `loss_function` setting.
loss_map = {
    loss_name: loss_cls()
    for loss_name, loss_cls in (
        ('mse', nn.MSELoss),
        ('mae', nn.L1Loss),
        ('smooth_l1', nn.SmoothL1Loss),
        ('huber', nn.HuberLoss),
    )
}
criterion = loss_map[loss_function]

# Smooth clipping functions
def smooth_clip(x, clip_val=3.0):
    """Softly bound ``x`` with a scaled tanh: ``tanh(x / clip_val) * clip_val``.

    Values near zero pass through almost unchanged; large magnitudes are
    squashed towards ``+/- clip_val``.
    """
    ratio = x / clip_val
    squashed = np.tanh(ratio)
    return squashed * clip_val

def inverse_smooth_clip(x, clip_val=3.0):
    """Undo ``smooth_clip``: ``arctanh(x / clip_val) * clip_val``.

    The ratio is first limited to [-0.9999, 0.9999] so that arctanh stays
    finite for inputs at or beyond ``+/- clip_val``.
    """
    limit = 0.9999
    ratio = np.clip(x / clip_val, -limit, limit)
    return np.arctanh(ratio) * clip_val


# ============================================
# Preprocessing
# ============================================
print("\nPreprocessing data...")


def _column(values):
    """Reshape a 1-D array into a single-column 2-D array."""
    return values.reshape(-1, 1)


# --- Yeo-Johnson on skewed input features ---
# Skewness is measured on the training split only.
skewness_dict = {name: stats.skew(X_train_raw[name]) for name in X_train_raw.columns}
high_skew_features = [name for name, value in skewness_dict.items() if abs(value) > 0.5]

if high_skew_features:
    # Fit on the training split, then reuse the fitted transform for validation.
    pt = PowerTransformer(method='yeo-johnson', standardize=False)
    X_train_transformed = X_train_raw.copy()
    X_val_transformed = X_val_raw.copy()
    X_train_transformed[high_skew_features] = pt.fit_transform(X_train_raw[high_skew_features])
    X_val_transformed[high_skew_features] = pt.transform(X_val_raw[high_skew_features])
else:
    pt = None
    X_train_transformed, X_val_transformed = X_train_raw, X_val_raw

# --- Yeo-Johnson on the target, only when it is skewed ---
y_skew = stats.skew(y_train_raw)
print(f"Target skewness: {y_skew:.4f}")

if abs(y_skew) > 0.5:
    pt_y = PowerTransformer(method='yeo-johnson', standardize=False)
    y_train_transformed = pt_y.fit_transform(_column(y_train_raw)).flatten()
    y_val_transformed = pt_y.transform(_column(y_val_raw)).flatten()
    print(f"Applied Yeo-Johnson to target")
else:
    pt_y = None
    y_train_transformed, y_val_transformed = y_train_raw, y_val_raw

# --- Degree-2 polynomial expansion (no bias column) ---
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_expanded = poly.fit_transform(X_train_transformed.values)
X_val_expanded = poly.transform(X_val_transformed.values)

# --- Robust scaling followed by smooth clipping ---
# Both scalers are fitted on the training split only.
scaler_X, scaler_y = RobustScaler(), RobustScaler()

X_train_scaled = smooth_clip(scaler_X.fit_transform(X_train_expanded), clip_value)
X_val_scaled = smooth_clip(scaler_X.transform(X_val_expanded), clip_value)

y_train_scaled = smooth_clip(
    scaler_y.fit_transform(_column(y_train_transformed)).flatten(), clip_value
)
y_val_scaled = smooth_clip(
    scaler_y.transform(_column(y_val_transformed)).flatten(), clip_value
)

# ============================================
# Test data preprocessing (same fitted transforms as the training data)
# ============================================
X_test_transformed = X_test_raw.copy()
if pt is not None:
    # pt is only set when at least one high-skew feature exists.
    X_test_transformed[high_skew_features] = pt.transform(X_test_raw[high_skew_features])

X_test_expanded = poly.transform(X_test_transformed.values)
X_test_scaled = scaler_X.transform(X_test_expanded)
X_test_scaled_clipped = smooth_clip(X_test_scaled, clip_value)

# ============================================
# C-Mixup Data Augmentation (Based on NeurIPS 2022 Paper)
# ============================================
print(f"\n{'='*60}")
print("C-MIXUP DATA AUGMENTATION")
print(f"{'='*60}")

def c_mixup(X, y, alpha=1.0, sigma=1.0, augment_factor=2):
    """
    C-Mixup data augmentation for regression, after the NeurIPS 2022 paper
    "C-Mixup: Improving Generalization in Regression".

    For each example i a partner j is drawn with probability proportional to
    a Gaussian kernel on the label distance,

        P(j | i)  ∝  exp(-d(i, j) / (2σ²)),   d(i, j) = (yi - yj)²,   j != i,

    and the pair is interpolated with λ ~ Beta(α, α):

        x̃ = λ·xi + (1-λ)·xj,    ỹ = λ·yi + (1-λ)·yj.

    An example whose kernel weights towards every other example are all zero
    (a single-sample dataset, or a label so isolated that the kernel
    underflows) is paired with its nearest neighbour in label space; with a
    single sample that neighbour is the sample itself.

    Random numbers are drawn from NumPy's global generator (``np.random``).

    Args:
        X: Feature matrix (n_samples, n_features)
        y: Target values (n_samples,)
        alpha: Beta distribution parameter for mixing ratio
        sigma: Bandwidth for Gaussian kernel
        augment_factor: Number of mixed samples generated per original sample

    Returns:
        X_aug: Original features followed by the mixed features,
            shape (n_samples * (1 + augment_factor), n_features)
        y_aug: Original targets followed by the mixed targets
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]

    # Nothing to mix: return copies of the input unchanged. (Stacking an empty
    # augmentation block onto X would otherwise fail on mismatched shapes.)
    if n_samples == 0 or augment_factor <= 0:
        return X.copy(), y.copy()

    # Step 1: pairwise squared label distances, shape (n_samples, n_samples).
    y_expanded = y.reshape(-1, 1)
    label_distances = (y_expanded - y_expanded.T) ** 2

    # Step 2: Gaussian-kernel sampling weights; the diagonal is zeroed so an
    # example is not paired with itself.
    sampling_probs = np.exp(-label_distances / (2 * sigma ** 2))
    np.fill_diagonal(sampling_probs, 0)
    row_sums = sampling_probs.sum(axis=1, keepdims=True)

    # Rows whose weights are all zero cannot be normalised into a valid
    # distribution (np.random.choice would reject them). Put all mass on the
    # nearest label neighbour instead.
    isolated = np.flatnonzero(row_sums.ravel() == 0)
    if isolated.size:
        masked = label_distances[isolated].astype(float)
        masked[np.arange(isolated.size), isolated] = np.inf  # exclude self
        nearest = masked.argmin(axis=1)
        sampling_probs[isolated, nearest] = 1.0
        row_sums[isolated] = 1.0

    sampling_probs = sampling_probs / row_sums

    X_augmented = []
    y_augmented = []

    # Step 3: draw a partner and a mixing ratio for every example,
    # augment_factor times over.
    for _ in range(augment_factor):
        for i in range(n_samples):
            # Sample (xj, yj) according to P(·|(xi, yi))
            j = np.random.choice(n_samples, p=sampling_probs[i])

            # Sample λ from Beta(α, α)
            lambda_mix = np.random.beta(alpha, alpha)

            x_mix = lambda_mix * X[i] + (1 - lambda_mix) * X[j]
            y_mix = lambda_mix * y[i] + (1 - lambda_mix) * y[j]

            X_augmented.append(x_mix)
            y_augmented.append(y_mix)

    # Original data first, mixed samples appended after it.
    X_aug = np.vstack([X, np.array(X_augmented)])
    y_aug = np.hstack([y, np.array(y_augmented)])

    return X_aug, y_aug

# Augment the (scaled, clipped) training split with C-Mixup.
print("Applying C-Mixup to training data...")
print(f"Original training size: {X_train_scaled.shape[0]}")

# C-Mixup settings
c_mixup_alpha = 1.0      # Beta distribution parameter
c_mixup_sigma = 1.0      # Gaussian kernel bandwidth
c_mixup_factor = 2       # mixed samples generated per original sample

X_train_mixup, y_train_mixup = c_mixup(
    X_train_scaled,
    y_train_scaled,
    alpha=c_mixup_alpha,
    sigma=c_mixup_sigma,
    augment_factor=c_mixup_factor,
)

print(f"Augmented training size: {X_train_mixup.shape[0]}")
print(f"Augmentation ratio: {X_train_mixup.shape[0] / X_train_scaled.shape[0]:.1f}x")
print(f"C-Mixup parameters: alpha={c_mixup_alpha}, sigma={c_mixup_sigma}, factor={c_mixup_factor}")
print(f"{'='*60}")

print(f"\nFinal training shape: {X_train_mixup.shape}")
print(f"Final validation shape: {X_val_scaled.shape}")
print(f"{'='*60}")


def _as_float32(array):
    """Copy an array into a new float32 tensor."""
    return torch.tensor(array, dtype=torch.float32)


# Datasets: augmented data for training, untouched holdout for validation.
train_dataset = TensorDataset(_as_float32(X_train_mixup), _as_float32(y_train_mixup))
val_dataset = TensorDataset(_as_float32(X_val_scaled), _as_float32(y_val_scaled))

# Shared DataLoader options; memory pinning is enabled only when CUDA is available.
# num_workers is deliberately left at its default: the original author notes
# that worker processes can cause issues inside Kaggle/Jupyter. In a regular
# Python script, 'num_workers': 2 and 'persistent_workers': True may be added.
dataloader_kwargs = dict(
    batch_size=batch_size,
    pin_memory=torch.cuda.is_available(),
)

# Only the training loader shuffles.
train_loader = DataLoader(train_dataset, shuffle=True, **dataloader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **dataloader_kwargs)

print(f"\nDataLoader settings: {dataloader_kwargs}")

# ============================================
# Model Training
# ============================================
print("\nInitializing model...")

# Build the network and place it on the selected device in one step.
model = MLPModel(
    input_size=X_train_mixup.shape[1],
    hidden_sizes=hidden_sizes,
    dropout_rate=dropout_rate,
    activation=activation,
    use_batchnorm=use_batchnorm,
).to(device)

# Weight averaging is optional.
ema = EMA(model, decay=ema_decay) if use_ema else None

# Pick the optimizer class by name; anything unrecognised uses plain AdamW.
is_schedulefree = optimizer_name.endswith('_schedulefree')
_optimizer_classes = {
    'adamw_schedulefree': AdamWScheduleFree,
    'radam_schedulefree': RAdamScheduleFree,
}
_optimizer_cls = _optimizer_classes.get(optimizer_name, optim.AdamW)
optimizer = _optimizer_cls(
    model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay
)

# Mixed precision (and its gradient scaler) is used only when CUDA is available.
use_amp = torch.cuda.is_available()
scaler = None
if use_amp:
    scaler = GradScaler()
    print("Using Automatic Mixed Precision (AMP) for faster training")

# Training Loop
# Tracks the best validation RMSE (in the original target scale) and keeps a
# copy of the model weights and EMA shadow from that epoch.
best_val_rmse = float('inf')
best_model_state = None
best_ema_shadow = None
best_epoch = 0
patience_counter = 0

print("\nTraining...")
for epoch in range(epochs):
    # Schedule-free optimizers have their own train/eval switch, toggled
    # alongside the model's.
    if is_schedulefree:
        optimizer.train()

    model.train()

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()

        # Mixed precision training
        if use_amp:
            with autocast():
                outputs = model(X_batch)
                loss = criterion(outputs.squeeze(-1), y_batch)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = model(X_batch)
            loss = criterion(outputs.squeeze(-1), y_batch)
            loss.backward()
            optimizer.step()

        # Update the weight average after every optimizer step.
        if ema is not None:
            ema.update()

    if is_schedulefree:
        optimizer.eval()

    # Validation is done with the EMA weights when EMA is enabled.
    if ema is not None:
        ema.apply_shadow()

    model.eval()

    val_predictions = []
    val_targets = []

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            # Validate in full precision (no autocast). Half-precision outputs
            # would add quantisation noise to the RMSE that drives checkpoint
            # selection, and test-time inference is fp32 as well.
            outputs = model(X_batch)

            val_predictions.extend(outputs.squeeze(-1).float().cpu().numpy())
            val_targets.extend(y_batch.cpu().numpy())

    # Switch back to the live training weights.
    if ema is not None:
        ema.restore()

    # Map predictions and targets back to the original target scale:
    # undo smooth clipping, then robust scaling, then Yeo-Johnson (if used).
    val_predictions_unclipped = inverse_smooth_clip(np.array(val_predictions), clip_value)
    val_targets_unclipped = inverse_smooth_clip(np.array(val_targets), clip_value)

    val_predictions_original = scaler_y.inverse_transform(val_predictions_unclipped.reshape(-1, 1)).flatten()
    val_targets_original = scaler_y.inverse_transform(val_targets_unclipped.reshape(-1, 1)).flatten()

    if pt_y is not None:
        val_predictions_original = pt_y.inverse_transform(val_predictions_original.reshape(-1, 1)).flatten()
        val_targets_original = pt_y.inverse_transform(val_targets_original.reshape(-1, 1)).flatten()

    val_rmse = np.sqrt(np.mean((val_predictions_original - val_targets_original)**2))
    val_mae = np.mean(np.abs(val_predictions_original - val_targets_original))

    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch + 1}/{epochs} - MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}")

    # Checkpoint on improvement; otherwise count towards early stopping.
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_model_state = copy.deepcopy(model.state_dict())
        if ema is not None:
            best_ema_shadow = copy.deepcopy(ema.shadow)
        best_epoch = epoch + 1
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= early_stopping_patience:
        print(f"Early stopping at epoch {epoch + 1}")
        break

# Load best model
# best_model_state stays None when no epoch ever improved on the initial
# infinite RMSE (e.g. the validation RMSE was NaN every epoch, or no epoch
# ran). Fail with a clear message instead of an obscure error from
# load_state_dict(None).
if best_model_state is None:
    raise RuntimeError(
        "No checkpoint was recorded during training: the validation RMSE "
        "never improved on its initial value."
    )

model.load_state_dict(best_model_state)
if ema is not None and best_ema_shadow is not None:
    # Put the EMA weights from the best epoch into the model; they are the
    # weights used from here on.
    ema.shadow = best_ema_shadow
    ema.apply_shadow()

print(f"\n{'='*60}")
print("TRAINING COMPLETE")
print(f"{'='*60}")
print(f"Best Validation RMSE: {best_val_rmse:.4f} at epoch {best_epoch}")
if use_ema:
    print(f"Using EMA weights (decay={ema_decay})")
if use_amp:
    print(f"Mixed precision training was enabled")
print(f"{'='*60}")


In [None]:
# ============================================
# Test Prediction (Holdout Method)
# ============================================

print("="*60)
print("TEST PREDICTION")
print("="*60)

# The test features were already preprocessed (X_test_scaled_clipped).
test_tensor = torch.tensor(X_test_scaled_clipped, dtype=torch.float32).to(device)

# Predict in the scaled/clipped target space.
model.eval()
with torch.no_grad():
    # squeeze(-1) drops only the output dimension, so a single-row test set
    # still yields a 1-D array.
    predictions_scaled = model(test_tensor).squeeze(-1).cpu().numpy()

# Map back to the original target scale: undo smooth clipping, then robust
# scaling, then Yeo-Johnson (only if it was applied to the target).
predictions_unclipped = inverse_smooth_clip(predictions_scaled, clip_value)
predictions = scaler_y.inverse_transform(predictions_unclipped.reshape(-1, 1)).flatten()

if pt_y is not None:
    predictions = pt_y.inverse_transform(predictions.reshape(-1, 1)).flatten()

print(f"\nPrediction complete!")
print(f"Prediction range: [{predictions.min():.2f}, {predictions.max():.2f}]")
print(f"Prediction mean: {predictions.mean():.2f}")
print(f"Prediction std: {predictions.std():.2f}")

# Prepare submission.
# Take the ids from the test file when it has an "id" column, so each
# prediction stays attached to its own row; otherwise fall back to the
# consecutive ids starting at 1455 that this notebook assumed originally.
if "id" in test.columns:
    submission_ids = test["id"].to_numpy()
else:
    submission_ids = range(1455, 1455 + len(predictions))

submission = pd.DataFrame({
    "id": submission_ids,
    "DIC": predictions
})
submission.to_csv("submission_holdout.csv", index=False)

print(f"\n{'='*60}")
print("SUBMISSION COMPLETE")
print(f"{'='*60}")
print(f"File saved: submission_holdout.csv")
print(f"Validation RMSE: {best_val_rmse:.4f}")
print(f"{'='*60}")


In [None]:

# name: 坂田煌翔
# student_id: 62408940