In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
!pip install schedulefree -q

In [None]:
# Load the competition tables: training data, test data and the sample submission.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/train.csv",
        "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/test.csv",
        "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/sample_submission.csv",
    )
)

In [None]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, PolynomialFeatures, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from scipy import stats
import numpy as np
import pandas as pd
import random

# Set random seeds for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Drop columns that are not used as features. `errors="ignore"` keeps this cell
# idempotent: `train` is rebound here, so re-running the cell would otherwise
# raise a KeyError because the columns are already gone.
train = train.drop(columns=["Unnamed: 12", "id"], errors="ignore")

# CRITICAL FIX: Rename TA1.x to TA1 to match test data
train = train.rename(columns={"TA1.x": "TA1"})

# ============================================
# Extract Raw Features and Target
# ============================================
print(f"\n{'='*60}")
print("DATA PREPARATION")
print(f"{'='*60}")

# Every remaining column except the target 'DIC' is a feature
feature_columns = [col for col in train.columns if col != 'DIC']

# Fail early, with a readable message, if the test table lacks a feature column
missing_in_test = [col for col in feature_columns if col not in test.columns]
if missing_in_test:
    raise KeyError(f"Test data is missing feature columns: {missing_in_test}")

X_raw = train[feature_columns].copy()
y_raw = train['DIC'].values.copy()
X_test_raw = test[feature_columns].copy()

print(f"元の訓練データ: {X_raw.shape}")
print(f"テストデータ: {X_test_raw.shape}")

# ============================================
# Holdout Validation Setup
# ============================================
print(f"\n{'='*60}")
print("HOLDOUT VALIDATION SETUP")
print(f"{'='*60}")

# 80% train, 20% validation
X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=SEED
)

print(f"Train set: {X_train_raw.shape[0]} samples")
print(f"Validation set: {X_val_raw.shape[0]} samples")
print(f"Test set: {X_test_raw.shape[0]} samples")
print("="*60)


In [None]:
# ============================================
# ResNet Model for Tabular Data
# Based on "Revisiting Deep Learning Models for Tabular Data" (NeurIPS 2021)
# ============================================

class ResNetBlock(nn.Module):
    """
    Pre-normalised residual block for tabular inputs.

    Computes ``x + Dropout(Linear(Dropout(ReLU(Linear(BatchNorm(x))))))``;
    input and output both have width ``d``.
    """
    def __init__(self, d, hidden_factor=2, dropout_rate=0.1):
        """
        Args:
            d: Width of the block's input and output.
            hidden_factor: Multiplier for the inner width, int(d * hidden_factor).
            dropout_rate: Probability used by both dropout layers.
        """
        super().__init__()

        inner_width = int(d * hidden_factor)

        # Sub-modules are registered in a fixed order so state_dict keys stay stable.
        self.norm = nn.BatchNorm1d(d)
        self.linear1 = nn.Linear(d, inner_width)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)
        self.linear2 = nn.Linear(inner_width, d)
        self.dropout2 = nn.Dropout(dropout_rate)

        # Kaiming-normal weights for both linear layers, then zero biases.
        dense_layers = (self.linear1, self.linear2)
        for layer in dense_layers:
            nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='relu')
        for layer in dense_layers:
            nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        # Transform branch: BatchNorm -> Linear -> ReLU -> Dropout -> Linear -> Dropout
        hidden = self.dropout1(self.relu(self.linear1(self.norm(x))))
        update = self.dropout2(self.linear2(hidden))

        # Skip connection adds the untouched input back in
        return x + update


class ResNetModel(nn.Module):
    """
    Tabular ResNet regressor.

    ResNet(x) = Prediction(ResNetBlock(...(ResNetBlock(Linear(x))))), where the
    prediction head is BatchNorm -> ReLU -> Linear producing one value per row.
    """
    def __init__(self, input_size, d=256, n_blocks=4, hidden_factor=2, dropout_rate=0.1):
        """
        Args:
            input_size: Number of input features.
            d: Width shared by all residual blocks.
            n_blocks: How many residual blocks to stack.
            hidden_factor: Inner-width multiplier passed to each block.
            dropout_rate: Dropout probability passed to each block.
        """
        super().__init__()

        # Project raw features up to the block width
        self.input_layer = nn.Linear(input_size, d)
        nn.init.kaiming_normal_(self.input_layer.weight, mode='fan_in', nonlinearity='relu')
        nn.init.constant_(self.input_layer.bias, 0)

        # Stack of residual blocks
        residual_stack = []
        for _ in range(n_blocks):
            residual_stack.append(ResNetBlock(d, hidden_factor, dropout_rate))
        self.blocks = nn.ModuleList(residual_stack)

        # Prediction head: BatchNorm -> ReLU -> Linear
        self.final_norm = nn.BatchNorm1d(d)
        self.final_relu = nn.ReLU()
        self.output = nn.Linear(d, 1)
        nn.init.xavier_normal_(self.output.weight)
        nn.init.constant_(self.output.bias, 0)

    def forward(self, x):
        hidden = self.input_layer(x)

        for block in self.blocks:
            hidden = block(hidden)

        # Head applied as one composed expression
        return self.output(self.final_relu(self.final_norm(hidden)))


# Choose the compute device (CUDA when present, otherwise CPU) and report it
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"\n{'='*60}")
print(f"Using device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"{'='*60}")

In [None]:
# ============================================
# Denoising Autoencoder (DAE) for Tabular Data
# Built on the same ResNetBlock structure throughout
# ============================================

class DAEEncoder(nn.Module):
    """
    Encoder half of the denoising autoencoder, built from ResNetBlock layers.

    It mirrors the ResNet body (input projection followed by residual blocks)
    so that its weights can be transferred easily.
    """
    def __init__(self, input_size, d, n_blocks, hidden_factor=2, dropout_rate=0.1):
        """
        Args:
            input_size: Number of input features.
            d: Width shared by all residual blocks.
            n_blocks: How many residual blocks to stack.
            hidden_factor: Inner-width multiplier passed to each block.
            dropout_rate: Dropout probability passed to each block.
        """
        super().__init__()

        # Input projection (same as in the ResNet)
        self.input_layer = nn.Linear(input_size, d)
        nn.init.kaiming_normal_(self.input_layer.weight, mode='fan_in', nonlinearity='relu')
        nn.init.constant_(self.input_layer.bias, 0)

        # Residual blocks (identical structure to the ResNet)
        residual_stack = []
        for _ in range(n_blocks):
            residual_stack.append(ResNetBlock(d, hidden_factor, dropout_rate))
        self.blocks = nn.ModuleList(residual_stack)

        # Normalise and rectify the encoder output
        self.final_norm = nn.BatchNorm1d(d)
        self.final_relu = nn.ReLU()

    def forward(self, x):
        latent = self.input_layer(x)

        for block in self.blocks:
            latent = block(latent)

        # BatchNorm followed by ReLU on the final representation
        return self.final_relu(self.final_norm(latent))


class DAEDecoder(nn.Module):
    """
    Decoder half of the denoising autoencoder: maps a latent vector back to
    the input feature space with a single linear layer.
    """
    def __init__(self, d, output_size):
        """
        Args:
            d: Width of the latent representation.
            output_size: Number of features to reconstruct (same as the input).
        """
        super().__init__()

        # One linear projection, Xavier-normal weights and zero bias
        projection = nn.Linear(d, output_size)
        nn.init.xavier_normal_(projection.weight)
        nn.init.constant_(projection.bias, 0)
        self.output_layer = projection

    def forward(self, x):
        reconstruction = self.output_layer(x)
        return reconstruction


class DenoisingAutoencoder(nn.Module):
    """
    Denoising autoencoder with a ResNetBlock-based encoder.

    The encoder has the same structure as the ResNet body, which makes it easy
    to reuse its weights in the downstream model.
    """
    def __init__(self, input_size, d=256, n_blocks=4, hidden_factor=2, dropout_rate=0.1):
        """
        Args:
            input_size: Number of input features.
            d: Width of the encoder's residual blocks.
            n_blocks: Number of residual blocks in the encoder.
            hidden_factor: Inner-width multiplier for each block.
            dropout_rate: Dropout probability for each block.
        """
        super().__init__()

        self.encoder = DAEEncoder(input_size, d, n_blocks, hidden_factor, dropout_rate)
        self.decoder = DAEDecoder(d, input_size)

    def forward(self, x):
        # Encode, then reconstruct in one pass
        return self.decoder(self.encoder(x))

    def encode(self, x):
        """Return the latent representation without decoding it"""
        latent = self.encoder(x)
        return latent


class ResNetWithDAE(nn.Module):
    """
    ResNet regressor that can reuse a pretrained DAE encoder as its body.

    With ``use_dae_encoder=True`` the whole DAE encoder (input layer + blocks)
    is used as-is, optionally followed by extra residual blocks. Otherwise a
    standard ResNet body with ``n_blocks + n_additional_blocks`` blocks is built.
    """
    def __init__(self, input_size, d=256, n_blocks=4, hidden_factor=2, 
                 hidden_dropout=0.1, residual_dropout=0.1,
                 use_dae_encoder=False, n_additional_blocks=0):
        """
        Args:
            input_size: Number of input features
            d: Dimension of ResNet blocks
            n_blocks: Number of blocks in the DAE encoder (the pretrained part)
            hidden_factor: Hidden layer factor for each block
            hidden_dropout: Dropout used in the additional blocks
            residual_dropout: Dropout used in the DAE encoder blocks (and in
                every block of the standard, non-DAE body)
            use_dae_encoder: If True, use a DAE encoder as the body
            n_additional_blocks: Number of extra ResNet blocks after the encoder
        """
        super(ResNetWithDAE, self).__init__()
        
        self.use_dae_encoder = use_dae_encoder
        self.n_blocks = n_blocks
        self.n_additional_blocks = n_additional_blocks
        
        if use_dae_encoder:
            # DAE encoder (weights are expected to be loaded via load_dae_encoder)
            self.dae_encoder = DAEEncoder(input_size, d, n_blocks, hidden_factor, residual_dropout)
            
            # Additional ResNet blocks (optional, for further processing)
            if n_additional_blocks > 0:
                self.additional_blocks = nn.ModuleList([
                    ResNetBlock(d, hidden_factor, hidden_dropout) 
                    for _ in range(n_additional_blocks)
                ])
            else:
                self.additional_blocks = None
        else:
            # Standard ResNet without DAE
            self.input_layer = nn.Linear(input_size, d)
            nn.init.kaiming_normal_(self.input_layer.weight, mode='fan_in', nonlinearity='relu')
            nn.init.constant_(self.input_layer.bias, 0)
            
            total_blocks = n_blocks + n_additional_blocks
            self.blocks = nn.ModuleList([
                ResNetBlock(d, hidden_factor, residual_dropout) 
                for _ in range(total_blocks)
            ])
        
        # Prediction head (shared by both variants): BatchNorm -> ReLU -> Linear
        self.final_norm = nn.BatchNorm1d(d)
        self.final_relu = nn.ReLU()
        self.output = nn.Linear(d, 1)
        nn.init.xavier_normal_(self.output.weight)
        nn.init.constant_(self.output.bias, 0)
    
    def load_dae_encoder(self, dae_state_dict):
        """
        Load pretrained DAE encoder weights into ``self.dae_encoder``.

        Args:
            dae_state_dict: State dict of a full DAE; only entries whose key
                starts with 'encoder.' are used.

        Raises:
            ValueError: If the model was built with use_dae_encoder=False, or
                if ``dae_state_dict`` is None.
            RuntimeError: Raised by ``load_state_dict(strict=True)`` when the
                extracted keys do not match the encoder exactly.
        """
        if not self.use_dae_encoder:
            raise ValueError("Must be initialized with use_dae_encoder=True")
        if dae_state_dict is None:
            raise ValueError("dae_state_dict is None; no pretrained DAE state is available")
        
        # Strip only the LEADING 'encoder.' prefix. str.replace would remove
        # every occurrence of the substring and mangle any nested key that
        # happens to contain 'encoder.' again.
        prefix = 'encoder.'
        encoder_state_dict = {
            key[len(prefix):]: value
            for key, value in dae_state_dict.items()
            if key.startswith(prefix)
        }
        
        # strict=True: every encoder tensor must be present, and nothing extra
        self.dae_encoder.load_state_dict(encoder_state_dict, strict=True)
    
    def forward(self, x):
        if self.use_dae_encoder:
            # Body = DAE encoder
            x = self.dae_encoder(x)
            
            # Optional extra residual blocks
            if self.additional_blocks is not None:
                for block in self.additional_blocks:
                    x = block(x)
        else:
            # Standard ResNet body
            x = self.input_layer(x)
            for block in self.blocks:
                x = block(x)
        
        # Prediction head
        x = self.final_norm(x)
        x = self.final_relu(x)
        x = self.output(x)
        
        return x


def add_noise(x, noise_type='gaussian', noise_level=0.1):
    """
    Return a corrupted copy of ``x`` for denoising training.

    Args:
        x: Input data (torch tensor); the 'swap' mode indexes it as 2-D.
        noise_type: 'gaussian' (additive normal noise scaled by noise_level),
            'masking' (zero each entry with probability noise_level), or
            'swap' (with probability noise_level per column, shuffle that
            column across rows).
        noise_level: Noise strength / probability, depending on noise_type.

    Raises:
        ValueError: If noise_type is not one of the three supported modes.
    """
    if noise_type == 'gaussian':
        return x + torch.randn_like(x) * noise_level

    if noise_type == 'masking':
        # Entries whose uniform draw exceeds noise_level are kept, the rest zeroed
        keep = (torch.rand_like(x) > noise_level).float()
        return x * keep

    if noise_type == 'swap':
        corrupted = x.clone()
        n_cols = x.shape[1]
        for col in range(n_cols):
            draw = torch.rand(1).item()
            if not (draw < noise_level):
                continue
            # Permute this column's values across the rows
            perm = torch.randperm(x.shape[0])
            corrupted[:, col] = x[perm, col]
        return corrupted

    raise ValueError(f"Unknown noise type: {noise_type}")


def pretrain_dae(dae, X_train, X_val, epochs=100, batch_size=64, 
                 noise_type='gaussian', noise_level=0.1, 
                 learning_rate=1e-3, patience=20, device='cuda', seed=42, verbose=False):
    """
    Pretrain a denoising autoencoder with early stopping on validation loss.

    Each batch is corrupted with ``add_noise`` and the model is trained (MSE)
    to reconstruct the clean batch. Validation also corrupts its inputs, so
    the validation loss depends on the random draws.

    Args:
        dae: The autoencoder module; it is moved to ``device``.
        X_train: Training features, convertible by ``torch.tensor``.
        X_val: Validation features, convertible by ``torch.tensor``.
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size for both loaders.
        noise_type: Corruption mode forwarded to ``add_noise``.
        noise_level: Corruption strength forwarded to ``add_noise``.
        learning_rate: AdamW learning rate (weight decay is fixed at 1e-5).
        patience: Stop after this many consecutive epochs without improvement.
        device: Device string or ``torch.device`` to train on.
        seed: Seed for python, numpy, torch and the shuffling generator.
        verbose: If True, print progress every 10 epochs and on early stop.

    Returns:
        best_dae_state: Deep copy of the state dict at the best validation
            loss, or None if no epoch ever improved on ``inf`` (for example
            with epochs=0).
        best_val_loss: The best validation loss seen.
    """
    # Set random seeds
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    dae = dae.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(dae.parameters(), lr=learning_rate, weight_decay=1e-5)
    
    # Create DataLoaders with explicit generator so shuffling is reproducible
    g = torch.Generator()
    g.manual_seed(seed)
    
    train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32))
    val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32))
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=g)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    best_val_loss = float('inf')
    best_dae_state = None
    patience_counter = 0
    
    # Enable CUDA mixed precision only when training really runs on CUDA.
    # Deciding this from torch.cuda.is_available() alone would pair a CUDA
    # GradScaler with CPU tensors whenever the caller passes device='cpu' on a
    # machine that has a GPU.
    use_amp = torch.device(device).type == 'cuda' and torch.cuda.is_available()
    scaler = GradScaler('cuda') if use_amp else None

    def _reconstruction_loss(clean_batch):
        # Corrupt the batch, reconstruct it, and score against the clean input.
        noisy_batch = add_noise(clean_batch, noise_type=noise_type, noise_level=noise_level)
        if use_amp:
            with autocast('cuda'):
                return criterion(dae(noisy_batch), clean_batch)
        return criterion(dae(noisy_batch), clean_batch)
    
    for epoch in range(epochs):
        # Training
        dae.train()
        train_loss = 0
        for (X_batch,) in train_loader:
            X_batch = X_batch.to(device)
            
            optimizer.zero_grad()
            loss = _reconstruction_loss(X_batch)
            
            if use_amp:
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                optimizer.step()
            
            # Weight by batch size so the epoch loss is a per-sample mean
            train_loss += loss.item() * X_batch.size(0)
        
        train_loss /= len(train_dataset)
        
        # Validation
        dae.eval()
        val_loss = 0
        with torch.no_grad():
            for (X_batch,) in val_loader:
                X_batch = X_batch.to(device)
                loss = _reconstruction_loss(X_batch)
                val_loss += loss.item() * X_batch.size(0)
        
        val_loss /= len(val_dataset)
        
        # Early stopping bookkeeping: keep a copy of the best weights so far
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_dae_state = copy.deepcopy(dae.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1
        
        if verbose and (epoch + 1) % 10 == 0:
            print(f"  DAE Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}")
        
        if patience_counter >= patience:
            if verbose:
                print(f"  DAE early stopping at epoch {epoch+1}")
            break
    
    return best_dae_state, best_val_loss

# Confirmation message printed once the DAE definitions in this cell have been executed.
print("DAE classes and functions loaded successfully (ResNetBlock-based)")

In [None]:
import torch.optim as optim
from schedulefree import RAdamScheduleFree, AdamWScheduleFree
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.amp import autocast, GradScaler
from sklearn.preprocessing import RobustScaler
import copy

# ============================================
# Configuration
# ============================================
# Global seed reused for python, numpy, torch and the DataLoader generators below.
SEED = 42

# Fixed settings
use_c_mixup = True          # toggles the c_mixup() augmentation of the training split
c_mixup_alpha = 1.0         # passed as `alpha` to c_mixup (Beta(alpha, alpha) mixing weight)
c_mixup_sigma = 1.0         # passed as `sigma` to c_mixup (width of the label-distance kernel)
c_mixup_factor = 2          # passed as `augment_factor` to c_mixup (augmentation passes over the data)
epochs = 100000             # upper bound on epochs; the training loop also stops via early stopping
early_stopping_patience = 500  # epochs without validation-RMSE improvement before stopping

# DAE settings
dae_pretrain_epochs = 200   # maximum epochs for pretrain_dae
dae_patience = 30           # early-stopping patience for pretrain_dae

# Echo the run configuration
print("="*60)
print("ResNet for Tabular Data with DAE Support (Fixed Architecture)")
print("="*60)
print(f"SEED: {SEED}")
print(f"epochs: {epochs}")
print(f"early_stopping_patience: {early_stopping_patience}")
print(f"dae_pretrain_epochs: {dae_pretrain_epochs}")
print("="*60)


class EMA:
    """
    Exponential Moving Average (EMA) of a model's trainable parameters.

    ``shadow`` holds the averaged weights, keyed by parameter name. Call
    ``update()`` after each optimizer step, ``apply_shadow()`` to evaluate
    with the averaged weights, and ``restore()`` to switch back.

    Only parameters with ``requires_grad=True`` at construction time are
    tracked; buffers (e.g. BatchNorm running statistics) are not averaged.
    """
    def __init__(self, model, decay=0.999):
        """
        Args:
            model: Module whose parameters are averaged.
            decay: Weight given to the previous average on each update.
        """
        self.model = model
        self.decay = decay
        self.shadow = {}
        self.backup = {}
        self.register()
    
    def register(self):
        """Initialise the shadow weights as copies of the current parameters."""
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.detach().clone()
    
    @torch.no_grad()
    def update(self):
        """shadow <- decay * shadow + (1 - decay) * param, for every tracked parameter."""
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                # The arithmetic already yields a fresh tensor; no extra clone needed.
                self.shadow[name] = (
                    self.decay * self.shadow[name] + (1.0 - self.decay) * param.detach()
                )
    
    @torch.no_grad()
    def apply_shadow(self):
        """Back up the live weights and overwrite them with the averaged ones."""
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.backup[name] = param.detach().clone()
                # Copy values instead of rebinding `.data`: rebinding would make
                # the parameter share storage with the shadow tensor, so any
                # in-place change to the parameter would corrupt the average.
                param.copy_(self.shadow[name])
    
    @torch.no_grad()
    def restore(self):
        """Put back the weights saved by apply_shadow(); no-op if nothing was saved."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and name in self.backup:
                param.copy_(self.backup[name])
        self.backup = {}


def smooth_clip(x, clip_val=3.0):
    """Softly bound values to (-clip_val, clip_val) with a scaled tanh"""
    scaled = x / clip_val
    return clip_val * np.tanh(scaled)


def inverse_smooth_clip(x, clip_val=3.0):
    """
    Undo smooth_clip via arctanh.

    The normalised input is clamped to [-0.995, 0.995] before arctanh so the
    result stays finite; any non-finite result is replaced by
    sign(x) * clip_val * 10.
    """
    bounded = np.clip(x / clip_val, -0.995, 0.995)
    restored = clip_val * np.arctanh(bounded)
    fallback = np.sign(x) * clip_val * 10
    return np.where(np.isfinite(restored), restored, fallback)


def c_mixup(X, y, alpha=1.0, sigma=1.0, augment_factor=2):
    """
    C-Mixup (Calibrated Mixup) data augmentation for regression.

    For every sample i a partner j is drawn with probability proportional to
    exp(-(y_i - y_j)^2 / (2 * sigma^2)) (never i itself), and the pair is
    mixed with a weight drawn from Beta(alpha, alpha).

    Args:
        X: 2-D feature array, one row per sample.
        y: 1-D target array aligned with X.
        alpha: Beta distribution parameter for the mixing weight.
        sigma: Width of the Gaussian kernel on label distance.
        augment_factor: Number of passes over the data; each pass adds one
            mixed sample per original sample.

    Returns:
        (X_aug, y_aug): the original samples followed by the mixed ones. When
        there is nothing to add (no samples or augment_factor <= 0), copies
        of the inputs are returned.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]

    # Nothing to mix: np.vstack would otherwise fail on an empty augmentation list.
    if n_samples == 0 or augment_factor <= 0:
        return X.copy(), y.copy()

    y_expanded = y.reshape(-1, 1)
    label_distances = (y_expanded - y_expanded.T) ** 2

    sampling_probs = np.exp(-label_distances / (2 * sigma ** 2))
    np.fill_diagonal(sampling_probs, 0)
    row_sums = sampling_probs.sum(axis=1, keepdims=True)

    # A row sums to zero when every kernel weight underflowed (label far from
    # all others) or when there is only one sample. An all-zero row is not a
    # valid distribution for np.random.choice, so fall back to a uniform choice
    # over the other samples (or to the sample itself if it is the only one).
    isolated = row_sums.ravel() == 0
    if isolated.any():
        if n_samples > 1:
            sampling_probs[isolated] = 1.0
            np.fill_diagonal(sampling_probs, 0)
        else:
            sampling_probs[0, 0] = 1.0
        row_sums = sampling_probs.sum(axis=1, keepdims=True)
    sampling_probs = sampling_probs / row_sums

    X_augmented = []
    y_augmented = []

    # Draw order (partner, then mixing weight) is kept fixed so results are
    # reproducible for a given numpy seed.
    for _ in range(augment_factor):
        for i in range(n_samples):
            j = np.random.choice(n_samples, p=sampling_probs[i])
            lambda_mix = np.random.beta(alpha, alpha)

            X_augmented.append(lambda_mix * X[i] + (1 - lambda_mix) * X[j])
            y_augmented.append(lambda_mix * y[i] + (1 - lambda_mix) * y[j])

    X_aug = np.vstack([X, np.array(X_augmented)])
    y_aug = np.hstack([y, np.array(y_augmented)])

    return X_aug, y_aug


# ============================================
# Hyperparameters (matching the new design)
# ============================================
params = {
    # DAE settings
    'use_dae': True,                    # whether to use the DAE
    'dae_noise_type': 'gaussian',       # noise type: 'gaussian', 'masking', 'swap'
    'dae_noise_level': 0.1,             # noise strength
    'dae_lr': 1e-3,                     # learning rate for DAE pretraining
    'freeze_dae': False,                # whether to freeze the DAE encoder
    
    # Model architecture (shared by the DAE and the ResNet)
    'd': 256,                           # model dimension
    'n_blocks': 4,                      # number of blocks in the DAE encoder (pretrained)
    'n_additional_blocks': 2,           # number of extra blocks after the DAE encoder
    'hidden_factor': 2.0,               # hidden-layer width multiplier
    'hidden_dropout': 0.1,              # dropout in the additional blocks
    'residual_dropout': 0.1,            # dropout in the DAE encoder blocks
    
    # Training
    'learning_rate': 1e-4,
    'weight_decay': 1e-5,
    'batch_size': 64,
    'optimizer': 'adamw_schedulefree',  # 'adamw', 'adamw_schedulefree', 'radam_schedulefree'
    'loss_function': 'mae',             # 'mse', 'mae', 'smooth_l1', 'huber'
    
    # EMA
    'use_ema': True,
    'ema_decay': 0.999,
}

# Echo every hyperparameter
print("\nHyperparameters:")
for key, value in params.items():
    print(f"  {key}: {value}")
print()

# Summarise the resulting architecture. Both branches compute the same
# total_blocks; only the printed description differs.
if params['use_dae']:
    total_blocks = params['n_blocks'] + params['n_additional_blocks']
    print(f"Model structure:")
    print(f"  DAE encoder: {params['n_blocks']} blocks (pretrained)")
    print(f"  Additional blocks: {params['n_additional_blocks']} blocks")
    print(f"  Total: {total_blocks} ResNet blocks")
else:
    total_blocks = params['n_blocks'] + params['n_additional_blocks']
    print(f"Model structure:")
    print(f"  Standard ResNet: {total_blocks} blocks")
print()


# ============================================
# Set random seeds
# ============================================
# Re-seed python, numpy and torch (CPU and all CUDA devices) with SEED and
# request deterministic cuDNN behaviour before preprocessing and training.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


# ============================================
# Preprocessing
# ============================================
print("\n" + "="*60)
print("PREPROCESSING")
print("="*60)

# Separate scalers for features and target
scaler_X = RobustScaler()
scaler_y = RobustScaler()

# Fit on the training split only, then reuse the fitted scaler for validation and test
X_train_scaled = scaler_X.fit_transform(X_train_raw.values)
X_val_scaled = scaler_X.transform(X_val_raw.values)
X_test_scaled = scaler_X.transform(X_test_raw.values)

# The target scaler expects a 2-D column, so reshape in and flatten back out
y_train_scaled = scaler_y.fit_transform(y_train_raw.reshape(-1, 1)).flatten()
y_val_scaled = scaler_y.transform(y_val_raw.reshape(-1, 1)).flatten()

# Smooth Clipping: tanh-based squashing of scaled values into (-clip_val, clip_val)
# NOTE(review): inverse_smooth_clip clamps its normalised input to +/-0.995, so
# scaled targets far out in the tails are not recovered exactly -- confirm this is acceptable.
clip_val = 3.0
X_train_clipped = smooth_clip(X_train_scaled, clip_val=clip_val)
X_val_clipped = smooth_clip(X_val_scaled, clip_val=clip_val)
X_test_clipped = smooth_clip(X_test_scaled, clip_val=clip_val)
y_train_clipped = smooth_clip(y_train_scaled, clip_val=clip_val)
y_val_clipped = smooth_clip(y_val_scaled, clip_val=clip_val)

print("Preprocessing complete")


# ============================================
# DAE Pretraining (if enabled)
# ============================================
# dae_state stays None when the DAE is disabled; otherwise it receives the
# best state dict returned by pretrain_dae.
dae_state = None
if params['use_dae']:
    print("\n" + "="*60)
    print("DAE PRETRAINING")
    print("="*60)
    
    # Initialise the DAE (new design); it shares d / n_blocks / hidden_factor /
    # residual_dropout with the encoder later built inside ResNetWithDAE.
    dae = DenoisingAutoencoder(
        input_size=X_train_clipped.shape[1],
        d=params['d'],
        n_blocks=params['n_blocks'],
        hidden_factor=params['hidden_factor'],
        dropout_rate=params['residual_dropout']
    )
    
    print(f"DAE structure: {params['n_blocks']} ResNet blocks, d={params['d']}")
    
    # Pretrain on the (unaugmented) clipped training features; validation
    # features drive early stopping.
    dae_state, dae_val_loss = pretrain_dae(
        dae,
        X_train_clipped,
        X_val_clipped,
        epochs=dae_pretrain_epochs,
        batch_size=params['batch_size'],
        noise_type=params['dae_noise_type'],
        noise_level=params['dae_noise_level'],
        learning_rate=params['dae_lr'],
        patience=dae_patience,
        device=device,
        seed=SEED,
        verbose=True
    )
    
    print(f"\nDAE pretraining complete. Best Val Loss: {dae_val_loss:.6f}")


# ============================================
# C-Mixup augmentation
# ============================================
print("\n" + "="*60)
print("DATA AUGMENTATION")
print("="*60)

# Augment the training split only; c_mixup returns the original samples
# followed by the mixed ones.
if use_c_mixup:
    X_train_final, y_train_final = c_mixup(
        X_train_clipped, 
        y_train_clipped, 
        alpha=c_mixup_alpha, 
        sigma=c_mixup_sigma,
        augment_factor=c_mixup_factor
    )
    print(f"C-Mixup applied: {X_train_clipped.shape[0]} -> {X_train_final.shape[0]} samples")
else:
    X_train_final = X_train_clipped
    y_train_final = y_train_clipped
    print("No augmentation")

# Validation data is used as-is (no augmentation)
X_val_final = X_val_clipped
y_val_final = y_val_clipped


# ============================================
# Create DataLoaders
# ============================================
# Dedicated generator so the training shuffle order is reproducible for SEED
g = torch.Generator()
g.manual_seed(SEED)

# Wrap features and targets as float32 tensors
train_dataset = TensorDataset(
    torch.tensor(X_train_final, dtype=torch.float32), 
    torch.tensor(y_train_final, dtype=torch.float32)
)
val_dataset = TensorDataset(
    torch.tensor(X_val_final, dtype=torch.float32), 
    torch.tensor(y_val_final, dtype=torch.float32)
)

# Training batches are shuffled; validation batches keep their order.
# NOTE(review): pin_memory=True is set unconditionally; on a CPU-only run it
# presumably brings no benefit -- confirm whether it should depend on `device`.
train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], 
                         shuffle=True, pin_memory=True, generator=g)
val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], 
                       shuffle=False, pin_memory=True)


# ============================================
# Initialize Model (new design)
# ============================================
print("\n" + "="*60)
print("MODEL INITIALIZATION")
print("="*60)

if params['use_dae']:
    # ResNetWithDAE (new design): DAE encoder body + optional extra blocks + head
    model = ResNetWithDAE(
        input_size=X_train_final.shape[1],
        d=params['d'],
        n_blocks=params['n_blocks'],
        hidden_factor=params['hidden_factor'],
        hidden_dropout=params['hidden_dropout'],
        residual_dropout=params['residual_dropout'],
        use_dae_encoder=True,
        n_additional_blocks=params['n_additional_blocks']
    )
    
    # Load pretrained DAE encoder (dae_state comes from the pretraining step above)
    model.load_dae_encoder(dae_state)
    print(f"Loaded pretrained DAE encoder ({params['n_blocks']} blocks)")
    
    # Optionally freeze DAE encoder so only the extra blocks and the head train
    if params['freeze_dae']:
        for param in model.dae_encoder.parameters():
            param.requires_grad = False
        print("DAE encoder frozen")
    else:
        print("DAE encoder will be fine-tuned")
    
    if params['n_additional_blocks'] > 0:
        print(f"Additional {params['n_additional_blocks']} blocks added after DAE encoder")
else:
    # Standard ResNet with the same total number of blocks, trained from scratch
    total_blocks = params['n_blocks'] + params['n_additional_blocks']
    model = ResNetModel(
        input_size=X_train_final.shape[1],
        d=params['d'],
        n_blocks=total_blocks,
        hidden_factor=params['hidden_factor'],
        dropout_rate=params['residual_dropout']
    )
    print(f"Standard ResNet with {total_blocks} blocks")

model = model.to(device)
print(f"Model initialized on {device}")


# ============================================
# Initialize Training Components
# ============================================
# EMA: created after any freezing above, so it tracks exactly the parameters
# that currently require gradients.
use_ema = params['use_ema']
ema = EMA(model, decay=params['ema_decay']) if use_ema else None
if use_ema:
    print(f"EMA enabled with decay={params['ema_decay']}")

# Loss function: looked up by name from params['loss_function']
loss_map = {
    'mse': nn.MSELoss(), 
    'mae': nn.L1Loss(), 
    'smooth_l1': nn.SmoothL1Loss(), 
    'huber': nn.HuberLoss()
}
criterion = loss_map[params['loss_function']]
print(f"Loss function: {params['loss_function']}")

# Optimizer: schedule-free variants need explicit optimizer.train()/eval()
# calls, which the training loop issues when is_schedulefree is True.
optimizer_name = params['optimizer']
is_schedulefree = optimizer_name.endswith('_schedulefree')
if optimizer_name == 'adamw_schedulefree':
    optimizer = AdamWScheduleFree(model.parameters(), lr=params['learning_rate'], 
                                 weight_decay=params['weight_decay'])
elif optimizer_name == 'radam_schedulefree':
    optimizer = RAdamScheduleFree(model.parameters(), lr=params['learning_rate'], 
                                 weight_decay=params['weight_decay'])
elif optimizer_name == 'adamw':
    optimizer = optim.AdamW(model.parameters(), lr=params['learning_rate'], 
                           weight_decay=params['weight_decay'])
else:
    raise ValueError(f"Unknown optimizer: {optimizer_name}")
print(f"Optimizer: {optimizer_name}")

# Mixed precision training: enabled whenever CUDA is available
use_amp = torch.cuda.is_available()
scaler = GradScaler('cuda') if use_amp else None
if use_amp:
    print("Mixed precision training enabled")


# ============================================
# Training Loop
# ============================================
print("\n" + "="*60)
print("TRAINING")
print("="*60)

# Early-stopping state: best metric so far plus snapshots of the raw weights
# and of the EMA shadow taken at that epoch.
best_val_rmse = float('inf')
best_model_state = None
best_ema_shadow = None
patience_counter = 0

for epoch in range(epochs):
    # Training mode (schedule-free optimizers must be switched explicitly)
    if is_schedulefree:
        optimizer.train()
    
    model.train()
    train_loss = 0
    
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        
        if use_amp:
            # Forward pass and loss under autocast; backward/step go through the GradScaler
            with autocast('cuda'):
                outputs = model(X_batch)
                loss = criterion(outputs.squeeze(-1), y_batch)
            
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = model(X_batch)
            loss = criterion(outputs.squeeze(-1), y_batch)
            loss.backward()
            optimizer.step()
        
        # Weight by batch size so the epoch loss is a per-sample mean
        train_loss += loss.item() * X_batch.size(0)
        
        # Fold the freshly updated weights into the moving average
        if ema is not None:
            ema.update()
    
    train_loss /= len(train_dataset)
    
    # Evaluation mode
    if is_schedulefree:
        optimizer.eval()
    
    # Validate with the EMA weights; the live weights are put back after validation
    if ema is not None:
        ema.apply_shadow()
    
    model.eval()
    
    # Validation
    val_predictions = []
    val_targets = []
    
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            if use_amp:
                with autocast('cuda'):
                    outputs = model(X_batch)
            else:
                outputs = model(X_batch)
            
            # NOTE(review): under autocast these outputs are presumably half
            # precision when converted to numpy -- confirm the metric precision is acceptable.
            val_predictions.extend(outputs.squeeze(-1).cpu().numpy())
            val_targets.extend(y_batch.cpu().numpy())
    
    if ema is not None:
        ema.restore()
    
    # Calculate RMSE in original scale: undo the tanh clipping, then the target scaler
    val_predictions_unclipped = inverse_smooth_clip(np.array(val_predictions), clip_val=clip_val)
    val_targets_unclipped = inverse_smooth_clip(np.array(val_targets), clip_val=clip_val)
    
    val_predictions_original = scaler_y.inverse_transform(val_predictions_unclipped.reshape(-1, 1)).flatten()
    val_targets_original = scaler_y.inverse_transform(val_targets_unclipped.reshape(-1, 1)).flatten()
    
    val_rmse = np.sqrt(np.mean((val_predictions_original - val_targets_original)**2))
    
    # Print progress every 10 epochs ("best" is the value before this epoch's update)
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}/{epochs} - train_loss: {train_loss:.6f}, val_RMSE: {val_rmse:.4f} (best: {best_val_rmse:.4f})")
    
    # Early stopping. The state dict is captured after ema.restore(), so it
    # holds the non-EMA weights; the EMA shadow is snapshotted separately.
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_model_state = copy.deepcopy(model.state_dict())
        if ema is not None:
            best_ema_shadow = copy.deepcopy(ema.shadow)
        patience_counter = 0
    else:
        patience_counter += 1
    
    if patience_counter >= early_stopping_patience:
        print(f"\nEarly stopping at epoch {epoch + 1}")
        break

print(f"\n{'='*60}")
print(f"TRAINING COMPLETE")
print(f"Best Validation RMSE: {best_val_rmse:.4f}")
print(f"{'='*60}")


# ============================================
# Load best model and make predictions
# ============================================
print("\n" + "="*60)
print("MAKING PREDICTIONS")
print("="*60)

# best_model_state is only set when some epoch improved the validation RMSE;
# fail with a clear message instead of an obscure load_state_dict error.
if best_model_state is None:
    raise RuntimeError("No best model state was recorded during training; cannot predict")

model.load_state_dict(best_model_state)
if ema is not None and best_ema_shadow is not None:
    # Overwrite the tracked parameters with the EMA weights of the best epoch.
    # Values are copied (not rebound via .data) so the model never shares
    # storage with the saved snapshot.
    with torch.no_grad():
        for name, param in model.named_parameters():
            if name in best_ema_shadow:
                param.copy_(best_ema_shadow[name])

model.eval()
test_tensor = torch.tensor(X_test_clipped, dtype=torch.float32).to(device)

with torch.no_grad():
    if use_amp:
        with autocast('cuda'):
            raw_output = model(test_tensor)
    else:
        raw_output = model(test_tensor)

# squeeze(-1) removes only the output dimension (a one-row test set keeps its
# batch axis); .float() ensures the inverse transforms below run in float32
# even when the forward pass produced half precision under autocast.
predictions_clipped = raw_output.squeeze(-1).float().cpu().numpy()

# Inverse smooth clipping
predictions_unclipped = inverse_smooth_clip(predictions_clipped, clip_val=clip_val)

# Inverse scaling back to the original DIC units
predictions = scaler_y.inverse_transform(predictions_unclipped.reshape(-1, 1)).flatten()

# Prefer the ids that ship with the test file (predictions are in test-row
# order); fall back to the previously hard-coded range when there is no id column.
if "id" in test.columns:
    submission_ids = test["id"].to_numpy()
else:
    submission_ids = range(1455, 1455 + len(predictions))

# Save submission
submission = pd.DataFrame({
    "id": submission_ids, 
    "DIC": predictions
})
submission_filename = "submission_resnet_dae.csv"
submission.to_csv(submission_filename, index=False)

print(f"Saved: {submission_filename}")
print(f"{'='*60}")

In [None]:

# name: 坂田煌翔
# student_id: 62408940