In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found below the Kaggle input directory
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
!pip install schedulefree -q

In [None]:
# Read the three competition tables in one pass: training data, test data and
# the sample submission file
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/train.csv",
        "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/test.csv",
        "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/sample_submission.csv",
    )
)

In [None]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, PolynomialFeatures, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from scipy import stats
import numpy as np
import pandas as pd
import random

# Seed every random number generator in use (Python, NumPy, PyTorch CPU and
# CUDA) with the same value so repeated runs start from the same state
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(SEED)
del _seed_rng

# cuDNN flags: deterministic mode on, benchmark mode off
torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False

# Drop the columns that are not used as model inputs.
# errors="ignore" keeps this cell re-runnable: `train` is reassigned here, so on
# a second execution the columns are already gone and a plain drop would raise
# KeyError. (No missing-value handling is performed in this cell.)
train = train.drop(columns=["Unnamed: 12", "id"], errors="ignore")

# CRITICAL FIX: Rename TA1.x to TA1 to match test data
train = train.rename(columns={"TA1.x": "TA1"})

# ============================================
# Extract Raw Features and Target
# ============================================
print(f"\n{'='*60}")
print("DATA PREPARATION")
print(f"{'='*60}")

# Every remaining column except the target 'DIC' is a feature; the same column
# list (in the same order) is used to select the test features
feature_columns = [col for col in train.columns if col != 'DIC']
X_raw = train[feature_columns].copy()
y_raw = train['DIC'].values.copy()
X_test_raw = test[feature_columns].copy()

print(f"元の訓練データ: {X_raw.shape}")
print(f"テストデータ: {X_test_raw.shape}")

# ============================================
# Holdout Validation Setup
# ============================================
print(f"\n{'='*60}")
print("HOLDOUT VALIDATION SETUP")
print(f"{'='*60}")

# 80% train, 20% validation; the split is fixed by SEED
X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=SEED
)

print(f"Train set: {X_train_raw.shape[0]} samples")
print(f"Validation set: {X_val_raw.shape[0]} samples")
print(f"Test set: {X_test_raw.shape[0]} samples")
print("="*60)


In [None]:
# ============================================
# ResNet Model for Tabular Data
# Based on "Revisiting Deep Learning Models for Tabular Data" (NeurIPS 2021)
# ============================================

class ResNetBlock(nn.Module):
    """
    Residual block for tabular inputs.

    Computes x + Dropout(Linear(Dropout(ReLU(Linear(BatchNorm(x)))))), so the
    output has the same width ``d`` as the input.
    """
    def __init__(self, d, hidden_factor=2, dropout_rate=0.1):
        """
        Args:
            d: Width of the block's input and output
            hidden_factor: Multiplier for the inner width, int(d * hidden_factor)
            dropout_rate: Probability used by both dropout layers
        """
        super().__init__()

        inner_width = int(d * hidden_factor)

        # Submodules, declared in the order forward() applies them
        self.norm = nn.BatchNorm1d(d)
        self.linear1 = nn.Linear(d, inner_width)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)
        self.linear2 = nn.Linear(inner_width, d)
        self.dropout2 = nn.Dropout(dropout_rate)

        # Kaiming-normal weights for both linear layers (linear1 is drawn
        # before linear2), then all biases set to zero
        linear_layers = (self.linear1, self.linear2)
        for layer in linear_layers:
            nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='relu')
        for layer in linear_layers:
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        # Transform branch: BatchNorm -> Linear -> ReLU -> Dropout -> Linear -> Dropout
        branch = self.dropout1(self.relu(self.linear1(self.norm(x))))
        branch = self.dropout2(self.linear2(branch))

        # Skip connection: add the untouched input back onto the branch output
        return x + branch


class ResNetModel(nn.Module):
    """
    ResNet regressor for tabular data.

    ResNet(x) = Prediction(ResNetBlock(...(ResNetBlock(Linear(x))))), where
    Prediction is BatchNorm -> ReLU -> Linear with a single output unit.
    """
    def __init__(self, input_size, d=256, n_blocks=4, hidden_factor=2, dropout_rate=0.1):
        """
        Args:
            input_size: Number of input features
            d: Width shared by all ResNet blocks
            n_blocks: How many ResNet blocks are stacked
            hidden_factor: Inner-width multiplier passed to every block
            dropout_rate: Dropout probability passed to every block
        """
        super().__init__()

        # Stem: project the raw features to the block width
        self.input_layer = nn.Linear(input_size, d)
        nn.init.kaiming_normal_(self.input_layer.weight, mode='fan_in', nonlinearity='relu')
        nn.init.zeros_(self.input_layer.bias)

        # Body: n_blocks residual blocks with identical settings
        residual_stack = [
            ResNetBlock(d, hidden_factor, dropout_rate)
            for _ in range(n_blocks)
        ]
        self.blocks = nn.ModuleList(residual_stack)

        # Head: BatchNorm -> ReLU -> Linear down to one value per sample
        self.final_norm = nn.BatchNorm1d(d)
        self.final_relu = nn.ReLU()
        self.output = nn.Linear(d, 1)
        nn.init.xavier_normal_(self.output.weight)
        nn.init.zeros_(self.output.bias)

    def forward(self, x):
        # Stem
        hidden = self.input_layer(x)

        # Body: apply the residual blocks one after another
        for block in self.blocks:
            hidden = block(hidden)

        # Head
        return self.output(self.final_relu(self.final_norm(hidden)))


# Pick the compute device: CUDA when PyTorch reports it as available, else CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(f"\n{'='*60}")
print(f"Using device: {device}")
if device.type == 'cuda':
    # Report the name of GPU 0
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"{'='*60}")

In [None]:
import torch.optim as optim
from schedulefree import RAdamScheduleFree, AdamWScheduleFree
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.amp import autocast, GradScaler
from sklearn.preprocessing import QuantileTransformer, StandardScaler
import copy

# ============================================
# ResNet Hyperparameters (Prototype)
# ============================================
# --- Network shape ---
d = 256                 # width of every ResNet block
n_blocks = 4            # how many ResNet blocks are stacked
hidden_factor = 2       # inner width of a block is d * hidden_factor
dropout_rate = 0.1      # probability for the dropout layers

# --- Optimisation ---
lr = 1e-3
weight_decay = 1e-4
batch_size = 64
epochs = 2000
early_stopping_patience = 200   # epochs without a better validation RMSE before stopping

# --- Optimizer ---
# 'adamw_schedulefree' / 'radam_schedulefree' select the schedule-free
# optimizers; any other value (including '') falls back to torch's AdamW
optimizer_name = ''
beta1 = 0.9
beta2 = 0.999

# --- Loss ---
loss_function = 'mse'  # must be a key of loss_map: 'mse', 'mae', 'smooth_l1', 'huber'

# --- Exponential moving average of the weights ---
use_ema = False
ema_decay = 0.999

# --- C-Mixup augmentation ---
use_c_mixup = False
c_mixup_alpha = 1.0     # Beta(alpha, alpha) parameter for the mixing weight
c_mixup_sigma = 1.0     # bandwidth of the label-distance kernel
c_mixup_factor = 2      # mixed samples generated per original sample

# --- Seeds ---
num_seeds = 1           # one model is trained per seed
start_seed = 42

# Echo the configuration, one report line per entry
_config_report = (
    "="*60,
    "ResNet for Tabular Data - Training Configuration",
    "="*60,
    f"Model:",
    f"  d (block dimension): {d}",
    f"  n_blocks: {n_blocks}",
    f"  hidden_factor: {hidden_factor}",
    f"  dropout_rate: {dropout_rate}",
    f"\nTraining:",
    f"  lr: {lr:.6f}",
    f"  weight_decay: {weight_decay:.6e}",
    f"  batch_size: {batch_size}",
    f"  epochs: {epochs}",
    f"  early_stopping_patience: {early_stopping_patience}",
    f"  loss_function: {loss_function}",
    f"  use_ema: {use_ema}, ema_decay: {ema_decay}",
    f"  use_c_mixup: {use_c_mixup}",
    f"\nSeeds: {start_seed} to {start_seed + num_seeds - 1}",
    "="*60,
)
print(*_config_report, sep="\n")
del _config_report


class EMA:
    """
    Exponential Moving Average (EMA) of a model's trainable parameters.

    ``shadow`` maps parameter names to the averaged tensors; ``backup`` holds
    the live weights while the averaged ones are swapped into the model.
    Typical use: ``update()`` after each optimizer step, ``apply_shadow()``
    before evaluation and ``restore()`` afterwards.

    Args:
        model: Module whose parameters with ``requires_grad=True`` are tracked
        decay: Weight kept from the previous average on every ``update()``
    """
    def __init__(self, model, decay=0.999):
        self.model = model
        self.decay = decay
        self.shadow = {}
        self.backup = {}
        self.register()

    def register(self):
        """Start the average from a copy of the current parameter values."""
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def update(self):
        """Blend the current parameters into the average: s = decay*s + (1-decay)*p."""
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                # The arithmetic already yields a fresh tensor, so no extra clone is needed
                self.shadow[name] = self.decay * self.shadow[name] + (1.0 - self.decay) * param.data

    def apply_shadow(self):
        """Swap the averaged weights into the model, saving the live ones in ``backup``."""
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                # Keep the first backup if called twice in a row; otherwise the
                # second call would overwrite the live weights with EMA values
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                # Copy values instead of rebinding param.data: rebinding would make
                # the parameter share storage with the shadow tensor, so any later
                # in-place change to the parameter would corrupt the average
                param.data.copy_(self.shadow[name])

    def restore(self):
        """Put the saved live weights back; does nothing when no backup exists."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and name in self.backup:
                param.data.copy_(self.backup[name])
        self.backup = {}


def c_mixup(X, y, alpha=1.0, sigma=1.0, augment_factor=2):
    """
    C-Mixup (Calibrated Mixup) data augmentation.

    For every sample i a partner j is drawn with probability proportional to
    exp(-(y_i - y_j)^2 / (2 * sigma^2)) (never i itself when another sample is
    available), and the pair is blended with a weight drawn from
    Beta(alpha, alpha).

    Args:
        X: Feature array with one row per sample
        y: 1-D target array, one value per row of X
        alpha: Parameter of the Beta(alpha, alpha) distribution for the mixing weight
        sigma: Bandwidth of the Gaussian kernel on label distance
        augment_factor: Number of mixed samples generated per original sample

    Returns:
        (X_aug, y_aug): the original samples followed by
        n_samples * augment_factor mixed samples. With no samples or a
        non-positive augment_factor, copies of the inputs are returned.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]

    # Nothing to mix: stacking an empty augmentation list would fail
    if n_samples == 0 or augment_factor <= 0:
        return X.copy(), y.copy()

    y_expanded = y.reshape(-1, 1)
    label_distances = (y_expanded - y_expanded.T) ** 2

    sampling_probs = np.exp(-label_distances / (2 * sigma ** 2))
    np.fill_diagonal(sampling_probs, 0)

    # A row can be all zeros when every kernel weight underflows (label far from
    # all others) or when there is only one sample. np.random.choice rejects
    # such a row, so fall back to a uniform choice among the other samples, or
    # to the sample itself when it is the only one.
    isolated = np.flatnonzero(sampling_probs.sum(axis=1) == 0)
    if isolated.size:
        sampling_probs[isolated] = 1.0
        if n_samples > 1:
            sampling_probs[isolated, isolated] = 0.0

    sampling_probs = sampling_probs / sampling_probs.sum(axis=1, keepdims=True)

    X_augmented = []
    y_augmented = []

    for _ in range(augment_factor):
        for i in range(n_samples):
            # Partner first, then mixing weight: keeps the order of RNG draws fixed
            j = np.random.choice(n_samples, p=sampling_probs[i])
            lambda_mix = np.random.beta(alpha, alpha)

            X_augmented.append(lambda_mix * X[i] + (1 - lambda_mix) * X[j])
            y_augmented.append(lambda_mix * y[i] + (1 - lambda_mix) * y[j])

    X_aug = np.vstack([X, np.array(X_augmented)])
    y_aug = np.hstack([y, np.array(y_augmented)])

    return X_aug, y_aug


# Available regression losses, keyed by the names accepted in `loss_function`
loss_map = dict(
    mse=nn.MSELoss(),
    mae=nn.L1Loss(),
    smooth_l1=nn.SmoothL1Loss(),
    huber=nn.HuberLoss(),
)

# Loss used for training, looked up from the configured name
criterion = loss_map[loss_function]


# ============================================
# TRAINING LOOP OVER MULTIPLE SEEDS
# ============================================
# One full run per seed: preprocess, train with early stopping on the holdout
# RMSE, predict the test set and write a submission file.
for seed_idx in range(num_seeds):
    SEED = start_seed + seed_idx
    
    print(f"\n{'#'*60}")
    print(f"# TRAINING WITH SEED {SEED} ({seed_idx + 1}/{num_seeds})")
    print(f"{'#'*60}\n")
    
    # Set random seeds
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
    # ============================================
    # Preprocessing: Quantile Transformation Only
    # ============================================
    print("Preprocessing data with Quantile Transformation...")
    
    # Quantile Transformation for features (fitted on the training split only)
    qt_features = QuantileTransformer(
        output_distribution='normal',
        random_state=SEED,
        n_quantiles=min(1000, X_train_raw.shape[0])  # Adjust based on sample size
    )
    
    X_train_transformed = qt_features.fit_transform(X_train_raw.values)
    X_val_transformed = qt_features.transform(X_val_raw.values)
    X_test_transformed = qt_features.transform(X_test_raw.values)
    
    print(f"  Applied Quantile Transformation to features")
    print(f"  Train shape: {X_train_transformed.shape}")
    print(f"  Val shape: {X_val_transformed.shape}")
    print(f"  Test shape: {X_test_transformed.shape}")
    
    # Standardization for target (simple mean/std, fitted on the training split)
    scaler_y = StandardScaler()
    y_train_transformed = scaler_y.fit_transform(y_train_raw.reshape(-1, 1)).flatten()
    y_val_transformed = scaler_y.transform(y_val_raw.reshape(-1, 1)).flatten()
    
    print(f"  Applied StandardScaler to target")
    
    # ============================================
    # Optional: C-Mixup Data Augmentation
    # ============================================
    if use_c_mixup:
        print(f"\n{'='*60}")
        print("C-MIXUP DATA AUGMENTATION")
        print(f"{'='*60}")
        
        print(f"Original training size: {X_train_transformed.shape[0]}")
        
        X_train_final, y_train_final = c_mixup(
            X_train_transformed, 
            y_train_transformed, 
            alpha=c_mixup_alpha, 
            sigma=c_mixup_sigma,
            augment_factor=c_mixup_factor
        )
        
        print(f"Augmented training size: {X_train_final.shape[0]}")
        print(f"Augmentation ratio: {X_train_final.shape[0] / X_train_transformed.shape[0]:.1f}x")
        print(f"{'='*60}")
    else:
        X_train_final = X_train_transformed
        y_train_final = y_train_transformed
    
    X_val_final = X_val_transformed
    y_val_final = y_val_transformed
    X_test_final = X_test_transformed
    
    # ============================================
    # Create DataLoaders
    # ============================================
    train_dataset = TensorDataset(
        torch.tensor(X_train_final, dtype=torch.float32), 
        torch.tensor(y_train_final, dtype=torch.float32)
    )
    val_dataset = TensorDataset(
        torch.tensor(X_val_final, dtype=torch.float32), 
        torch.tensor(y_val_final, dtype=torch.float32)
    )
    
    # Pinned memory only helps host-to-GPU copies, so skip it on CPU
    pin_memory = torch.cuda.is_available()
    # BatchNorm1d cannot run in training mode on a batch of one sample, so drop
    # the trailing batch only in the case where it would contain a single row
    drop_single_tail = len(train_dataset) > batch_size and len(train_dataset) % batch_size == 1
    
    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True,
        pin_memory=pin_memory, drop_last=drop_single_tail
    )
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)
    
    # ============================================
    # Initialize Model
    # ============================================
    print("\nInitializing ResNet model...")
    
    model = ResNetModel(
        input_size=X_train_final.shape[1],
        d=d,
        n_blocks=n_blocks,
        hidden_factor=hidden_factor,
        dropout_rate=dropout_rate
    )
    model = model.to(device)
    
    # Count parameters
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"  Model parameters: {n_params:,}")
    
    # Initialize EMA
    ema = EMA(model, decay=ema_decay) if use_ema else None
    
    # Initialize Optimizer (anything that is not a schedule-free name uses AdamW)
    is_schedulefree = optimizer_name.endswith('_schedulefree')
    if optimizer_name == 'adamw_schedulefree':
        optimizer = AdamWScheduleFree(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
    elif optimizer_name == 'radam_schedulefree':
        optimizer = RAdamScheduleFree(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
    else:
        optimizer = optim.AdamW(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
    
    # Mixed precision training (only when CUDA is available)
    use_amp = torch.cuda.is_available()
    scaler = GradScaler('cuda') if use_amp else None
    
    if use_amp:
        print("  Using Automatic Mixed Precision (AMP)")
    
    # ============================================
    # Training Loop
    # ============================================
    print("\nTraining...")
    best_val_rmse = float('inf')
    best_model_state = None
    best_ema_shadow = None
    best_epoch = 0
    patience_counter = 0
    
    for epoch in range(epochs):
        # Training mode
        if is_schedulefree:
            optimizer.train()
        
        model.train()
        
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            optimizer.zero_grad()
            
            if use_amp:
                with autocast('cuda'):
                    outputs = model(X_batch)
                    loss = criterion(outputs.squeeze(-1), y_batch)
                
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(X_batch)
                loss = criterion(outputs.squeeze(-1), y_batch)
                loss.backward()
                optimizer.step()
            
            if ema is not None:
                ema.update()
        
        # Evaluation mode
        if is_schedulefree:
            optimizer.eval()
        
        # Validate with the averaged weights when EMA is enabled
        if ema is not None:
            ema.apply_shadow()
        
        model.eval()
        
        # Validation
        val_predictions = []
        val_targets = []
        
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                
                if use_amp:
                    with autocast('cuda'):
                        outputs = model(X_batch)
                else:
                    outputs = model(X_batch)
                
                # autocast can return half-precision outputs; cast to float32 so
                # the inverse scaling and the metrics are not computed in fp16
                val_predictions.append(outputs.squeeze(-1).float().cpu().numpy())
                val_targets.append(y_batch.float().cpu().numpy())
        
        if ema is not None:
            ema.restore()
        
        # Calculate metrics in original scale
        val_predictions_original = scaler_y.inverse_transform(
            np.concatenate(val_predictions).reshape(-1, 1)
        ).flatten()
        val_targets_original = scaler_y.inverse_transform(
            np.concatenate(val_targets).reshape(-1, 1)
        ).flatten()
        
        val_rmse = np.sqrt(np.mean((val_predictions_original - val_targets_original)**2))
        val_mae = np.mean(np.abs(val_predictions_original - val_targets_original))
        
        if (epoch + 1) % 100 == 0:
            print(f"Epoch {epoch + 1}/{epochs} - MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}")
        
        # Early stopping: snapshot the weights whenever the validation RMSE improves
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            best_model_state = copy.deepcopy(model.state_dict())
            if ema is not None:
                best_ema_shadow = copy.deepcopy(ema.shadow)
            best_epoch = epoch + 1
            patience_counter = 0
        else:
            patience_counter += 1
        
        if patience_counter >= early_stopping_patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break
    
    # Load best model. No snapshot exists if no epoch ever improved on the
    # initial infinite RMSE (e.g. epochs == 0 or a NaN metric); in that case the
    # current weights are kept instead of crashing on load_state_dict(None).
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    if ema is not None and best_ema_shadow is not None:
        ema.shadow = best_ema_shadow
        ema.apply_shadow()
    
    print(f"\n{'='*60}")
    print("TRAINING COMPLETE")
    print(f"{'='*60}")
    print(f"Seed: {SEED}")
    print(f"Best Validation RMSE: {best_val_rmse:.4f} at epoch {best_epoch}")
    if use_ema:
        print(f"Using EMA weights (decay={ema_decay})")
    print(f"{'='*60}")
    
    # ============================================
    # Test Prediction
    # ============================================
    print("\nGenerating test predictions...")
    
    test_tensor = torch.tensor(X_test_final, dtype=torch.float32).to(device)
    
    model.eval()
    with torch.no_grad():
        if use_amp:
            with autocast('cuda'):
                test_outputs = model(test_tensor)
        else:
            test_outputs = model(test_tensor)
    
    # Cast to float32 before leaving torch (autocast can return half precision),
    # and squeeze only the output dimension so a single-row test set stays 1-D
    predictions_scaled = test_outputs.squeeze(-1).float().cpu().numpy()
    
    # Inverse scaling
    predictions = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1)).flatten()
    
    print(f"Prediction range: [{predictions.min():.2f}, {predictions.max():.2f}]")
    print(f"Prediction mean: {predictions.mean():.2f}")
    print(f"Prediction std: {predictions.std():.2f}")
    
    # Save submission. Take the ids from the test table when it has an "id"
    # column (rows are in the same order as X_test_raw); otherwise fall back to
    # the consecutive range starting at 1455.
    if "id" in test.columns:
        submission_ids = test["id"].to_numpy()
    else:
        submission_ids = np.arange(1455, 1455 + len(predictions))
    
    submission = pd.DataFrame({
        "id": submission_ids, 
        "DIC": predictions
    })
    submission_filename = f"submission_resnet_seed_{SEED}.csv"
    submission.to_csv(submission_filename, index=False)
    
    print(f"\n{'='*60}")
    print("SUBMISSION SAVED")
    print(f"{'='*60}")
    print(f"File: {submission_filename}")
    print(f"Validation RMSE: {best_val_rmse:.4f}")
    print(f"{'='*60}\n")

# Closing banner: how many models were trained and over which seed range
print(
    f"\n{'#'*60}",
    f"# ALL SEEDS TRAINING COMPLETE",
    f"{'#'*60}",
    f"Trained {num_seeds} models with seeds {start_seed} to {start_seed + num_seeds - 1}",
    f"Submission files created:",
    sep="\n",
)
# One line per submission file written by the training loop
for i in range(num_seeds):
    print(f"  - submission_resnet_seed_{start_seed + i}.csv")
print(f"{'#'*60}")

In [None]:

# name: 坂田煌翔
# student_id: 62408940