In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    input_paths = (os.path.join(dirname, filename) for filename in filenames)
    for input_path in input_paths:
        print(input_path)

In [None]:
!pip install schedulefree -q

In [None]:
# Read the three competition CSVs (train, test, sample submission) from one directory.
DATA_DIR = "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi"
train, test, sub = (
    pd.read_csv(f"{DATA_DIR}/{csv_name}")
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
import torch
import torch.nn as nn
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, PolynomialFeatures, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from scipy import stats
import numpy as np
import pandas as pd
import random

# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and CUDA) so repeated runs draw the same random numbers.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic algorithms and turn off its benchmark autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Drop the columns that are not model inputs. No missing-value handling is done here.
# errors="ignore" keeps this cell re-runnable: `train` is overwritten in place, so on
# a second execution the columns are already gone and a plain drop would raise KeyError.
train = train.drop(columns=["Unnamed: 12", "id"], errors="ignore")

# The training file calls this column "TA1.x"; rename it to "TA1" to match the test data.
train = train.rename(columns={"TA1.x": "TA1"})

# ============================================
# Feature Engineering Configuration
# ============================================
print("=" * 60, "FEATURE ENGINEERING CONFIGURATION", "=" * 60, sep="\n")

# Pipeline switch: True -> PBLD embeddings learned inside the network,
# False -> degree-2 polynomial feature expansion done up front.
use_pbld = True

mode_lines = (
    ("Mode: PBLD Embeddings (Periodic Bias Linear DenseNet)",
     "  → Neural embeddings with periodic patterns")
    if use_pbld else
    ("Mode: Polynomial Features (degree=2)",
     "  → Traditional feature engineering")
)
for mode_line in mode_lines:
    print(mode_line)

# ============================================
# Feature Engineering
# ============================================
print("\n" + "=" * 60, "FEATURE ENGINEERING", "=" * 60, sep="\n")

# Every training column except the target 'DIC' is used as a model input;
# the same columns are selected from the test frame.
feature_columns = []
for column_name in train.columns:
    if column_name != 'DIC':
        feature_columns.append(column_name)

X = train[feature_columns].copy()
y = train['DIC'].to_numpy()
X_test = test[feature_columns].copy()

print(f"\nOriginal features: {X.shape[1]} features")

# ============================================
# Yeo-Johnson Transformation (X features only)
# ============================================
print("\n" + "=" * 60, "YEO-JOHNSON TRANSFORMATION (X features)", "=" * 60, sep="\n")

# Measure the sample skewness of every input column.
print("\nOriginal feature skewness:")
skewness_dict = {name: stats.skew(X[name]) for name in feature_columns}
for name, skew_before in skewness_dict.items():
    print(f"  {name:15s}: {skew_before:7.4f}")

# Columns whose absolute skewness exceeds 0.5 are treated as "high-skew".
high_skew_features = [name for name in feature_columns if abs(skewness_dict[name]) > 0.5]

print(f"\nHigh-skew features (|skew| > 0.5): {len(high_skew_features)}")
for name in high_skew_features:
    print(f"  {name}: {skewness_dict[name]:.4f}")

if not high_skew_features:
    print("\nNo high-skew features found. Skipping transformation.")
else:
    # Fit the power transform on the training columns only, then apply the
    # fitted transform to the matching test columns.
    pt = PowerTransformer(method='yeo-johnson', standardize=False)

    X_transformed, X_test_transformed = X.copy(), X_test.copy()
    X_transformed[high_skew_features] = pt.fit_transform(X[high_skew_features])
    X_test_transformed[high_skew_features] = pt.transform(X_test[high_skew_features])

    # Report before -> after skewness for each transformed column.
    print("\nSkewness after Yeo-Johnson transformation:")
    for name in high_skew_features:
        skew_after = stats.skew(X_transformed[name])
        print(f"  {name:15s}: {skewness_dict[name]:7.4f} → {skew_after:7.4f}")

    X, X_test = X_transformed, X_test_transformed

    for benefit_line in (
        "\nYeo-Johnson benefits:",
        f"  ✓ Normalized {len(high_skew_features)} high-skew features",
        "  ✓ Better gradient flow in neural network",
        "  ✓ Reduced impact of outliers",
    ):
        print(benefit_line)

# ============================================
# Feature Expansion (PBLD or Polynomial)
# ============================================
print(f"\n{'='*60}")
print("FEATURE EXPANSION")
print(f"{'='*60}")

# Number of input columns before any expansion; used for the report below so the
# printed counts stay correct if the feature set changes.
n_input_features = X.shape[1]

if not use_pbld:
    # Traditional polynomial features: originals, squares and pairwise products.
    poly = PolynomialFeatures(degree=2, include_bias=False)

    print(f"\nGenerating polynomial features (degree=2)...")
    print(f"  - Original features")
    print(f"  - Squared terms (x^2)")
    print(f"  - Interaction terms (x_i * x_j)")

    # Fit the expansion on the training matrix and reuse it for the test matrix.
    X = poly.fit_transform(X.values)
    X_test = poly.transform(X_test.values)

    # n choose 2 pairwise interaction terms for n input features.
    n_interactions = n_input_features * (n_input_features - 1) // 2

    print(f"\nPolynomial features generated:")
    print(f"  Original: {n_input_features} features")
    print(f"  After polynomial expansion: {X.shape[1]} features")
    print(f"  ({n_input_features} original + {n_input_features} squared + {n_interactions} interaction terms)")
else:
    # PBLD embeddings are applied inside the model, so the features are only
    # converted from DataFrames to NumPy arrays here.
    print(f"\nPBLD embeddings will be applied in neural network")
    print(f"  Input features: {n_input_features}")
    print(f"  Embeddings will be learned during training")
    X = X.values
    X_test = X_test.values

# ============================================
# Robust Scaling + Smooth Clipping
# ============================================
print("\n" + "=" * 60, "ROBUST SCALING + SMOOTH CLIPPING", "=" * 60, sep="\n")

# Separate scalers for inputs and target. The input scaler is fitted on the
# training matrix and then reused, unchanged, on the test matrix.
scaler_X, scaler_y = RobustScaler(), RobustScaler()

X_scaled = scaler_X.fit_transform(X)
X_test_scaled = scaler_X.transform(X_test)

# The scaler expects a 2-D array, so the target goes in as a single column
# and is flattened back to 1-D afterwards.
y_column = y.reshape(-1, 1)
y_scaled = scaler_y.fit_transform(y_column).flatten()

print("RobustScaler applied (median-based, robust to outliers)")

# Bound used by the tanh-based soft clipping applied to the scaled data.
clip_value = 3.0

def smooth_clip(x, clip_val=3.0):
    """
    Softly bound values to the range ±clip_val with a scaled tanh.

    Computes ``clip_val * tanh(x / clip_val)``: inputs near 0 pass through
    almost unchanged, while large magnitudes are squashed towards ±clip_val.

    Args:
        x: Scalar or NumPy array to clip.
        clip_val: Saturation bound of the soft clip.
    Returns:
        The soft-clipped value(s), same shape as ``x``.
    """
    scaled = x / clip_val
    return np.tanh(scaled) * clip_val

# Soft-clip the scaled train inputs, test inputs and target with the same bound.
X_scaled, X_test_scaled, y_scaled = (
    smooth_clip(scaled_array, clip_value)
    for scaled_array in (X_scaled, X_test_scaled, y_scaled)
)

print(f"Smooth clipping applied (tanh-based, clip_value={clip_value})")
print(f"  X_scaled range: [{X_scaled.min():.3f}, {X_scaled.max():.3f}]")
print(f"  y_scaled range: [{y_scaled.min():.3f}, {y_scaled.max():.3f}]")

print("\nNormalized features:")
print(f"  X_scaled shape: {X_scaled.shape}")
print(f"  y_scaled shape: {y_scaled.shape}")

# ============================================
# K-Fold Cross Validation Setup
# ============================================
print("\n" + "=" * 60, "K-FOLD CROSS VALIDATION SETUP", "=" * 60, sep="\n")

# Shuffled K-fold splitter, seeded with the notebook-wide SEED.
n_splits = 5
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

print(
    f"K-Fold CV: {n_splits} folds",
    "Each fold will be trained separately",
    f"Final prediction: ensemble of all {n_splits} models",
    "=" * 60,
    sep="\n",
)

In [None]:
# ============================================
# PBLD Embedding (Periodic Bias Linear DenseNet)
# ============================================

class PBLDEmbedding(nn.Module):
    """
    Periodic Bias Linear DenseNet (PBLD) embedding.

    Each scalar input feature is mapped to an ``embed_dim`` vector by its own
    linear layer plus a learnable sinusoidal term. The concatenated feature
    embeddings are then passed through densely connected ReLU layers, and the
    output is the concatenation of the initial embedding with every dense
    layer's output.
    """

    def __init__(self, num_features, embed_dim=32, num_dense_layers=3):
        """
        Args:
            num_features: Number of input features
            embed_dim: Dimension of embedding per feature
            num_dense_layers: Number of DenseNet layers
        """
        super().__init__()
        self.num_features = num_features
        self.embed_dim = embed_dim
        self.num_dense_layers = num_dense_layers

        # Width of the concatenated per-feature embeddings; every dense layer
        # also emits this many units.
        width = num_features * embed_dim

        # One independent Linear(1 -> embed_dim) per input feature.
        self.feature_linears = nn.ModuleList(
            [nn.Linear(1, embed_dim) for _ in range(num_features)]
        )

        # Learnable frequency and phase per (feature, embedding unit) for the sine term.
        self.frequencies = nn.Parameter(torch.randn(num_features, embed_dim) * 0.1)
        self.phases = nn.Parameter(torch.randn(num_features, embed_dim) * 0.1)

        # Dense layer number `depth` consumes the initial embedding plus the
        # outputs of all earlier dense layers, i.e. (depth + 1) * width inputs.
        self.dense_layers = nn.ModuleList(
            [nn.Linear(width * (depth + 1), width) for depth in range(num_dense_layers)]
        )

        # Final output concatenates the embedding and every dense layer's output.
        self.output_dim = width * (num_dense_layers + 1)

        print(f"\nPBLD Embedding Architecture:")
        print(f"  Input features: {num_features}")
        print(f"  Embedding dim per feature: {embed_dim}")
        print(f"  Dense layers: {num_dense_layers}")
        print(f"  Output dimension: {self.output_dim}")

    def forward(self, x):
        """
        Args:
            x: [batch_size, num_features]
        Returns:
            embedded: [batch_size, output_dim]
        """
        two_pi = 2 * np.pi

        # Stage 1: per-feature linear embedding plus sin(2π * freq * x + phase).
        per_feature = []
        for idx, linear in enumerate(self.feature_linears):
            column = x[:, idx:idx + 1]  # [batch, 1]
            angle = (
                two_pi * self.frequencies[idx].unsqueeze(0) * column
                + self.phases[idx].unsqueeze(0)
            )
            per_feature.append(linear(column) + torch.sin(angle))  # [batch, embed_dim]

        # Stage 2: each dense layer sees the concatenation of everything before it.
        stages = [torch.cat(per_feature, dim=1)]  # [batch, num_features * embed_dim]
        for dense in self.dense_layers:
            stacked = torch.cat(stages, dim=1)
            stages.append(torch.relu(dense(stacked)))

        # Stage 3: DenseNet-style concatenation of all stages -> [batch, output_dim].
        return torch.cat(stages, dim=1)


# ============================================
# MLP Model with PBLD Embeddings
# ============================================

class MLPWithPBLD(nn.Module):
    """
    Regression MLP that sits on top of a PBLD embedding.

    The embedded input feeds two paths whose scalar outputs are summed: a stack
    of Linear -> activation -> Dropout blocks followed by a linear head, and a
    single linear "residual" layer applied directly to the embedding.
    """

    def __init__(self, num_features, embed_dim=32, num_dense_layers=3, 
                 hidden_sizes=[512], dropout_rate=0.0, activation='gelu'):
        """
        Args:
            num_features: Number of input features (16 for CalCOFI)
            embed_dim: Embedding dimension per feature
            num_dense_layers: Number of dense layers in PBLD
            hidden_sizes: List of hidden layer sizes for MLP
            dropout_rate: Dropout probability
            activation: Activation function name ('relu', 'gelu', 'silu' or
                'tanh'); any other value falls back to GELU
        """
        super().__init__()

        # Embedding front-end; its output width drives the MLP input width.
        self.pbld = PBLDEmbedding(num_features, embed_dim, num_dense_layers)

        # Resolve the activation by name, defaulting to GELU for unknown names.
        activation_classes = {
            'relu': nn.ReLU,
            'gelu': nn.GELU,
            'silu': nn.SiLU,
            'tanh': nn.Tanh,
        }
        self.activation = activation_classes.get(activation, nn.GELU)()

        # mlp_layers alternates [Linear, Dropout, Linear, Dropout, ...];
        # forward() relies on that pairing.
        dims = [self.pbld.output_dim, *hidden_sizes]
        self.mlp_layers = nn.ModuleList()
        for fan_in, fan_out in zip(dims[:-1], dims[1:]):
            self.mlp_layers.append(nn.Linear(fan_in, fan_out))
            self.mlp_layers.append(nn.Dropout(dropout_rate))

        # Scalar head on the last hidden width (or on the embedding if no hidden layers).
        self.output = nn.Linear(dims[-1], 1)

        # Linear shortcut from the PBLD embedding straight to the scalar output.
        self.residual = nn.Linear(self.pbld.output_dim, 1)

        print(f"\nMLP Architecture:")
        print(f"  PBLD output → {self.pbld.output_dim} dims")
        print(f"  Hidden layers: {hidden_sizes}")
        print(f"  Activation: {activation}")
        print(f"  Dropout: {dropout_rate}")
        print(f"  + Residual connection")

    def forward(self, x):
        """Return the sum of the MLP-path and residual-path predictions for ``x``."""
        x_embedded = self.pbld(x)

        # Walk the (Linear, Dropout) pairs: Linear -> activation -> Dropout.
        layers = list(self.mlp_layers)
        hidden = x_embedded
        for linear, dropout in zip(layers[0::2], layers[1::2]):
            hidden = dropout(self.activation(linear(hidden)))

        return self.output(hidden) + self.residual(x_embedded)


# ============================================
# Original MLP Model (for Polynomial features)
# ============================================

class MLPModel(nn.Module):
    """
    Plain MLP regressor with a linear shortcut from the input to the output.

    The main path is a stack of Linear -> (BatchNorm) -> activation -> Dropout
    blocks followed by a scalar linear head; its output is added to a single
    linear layer applied directly to the raw input.
    """

    def __init__(self, input_size, hidden_sizes=[128], dropout_rate=0.0, activation='relu', use_batchnorm=True):
        """
        MLP model with flexible number of hidden layers

        Args:
            input_size: Number of input features
            hidden_sizes: Sequence (list or tuple) of hidden layer sizes. May be
                empty, in which case the output head is applied directly to
                the input.
            dropout_rate: Dropout probability
            activation: Activation function name; unknown names fall back to ReLU
            use_batchnorm: Whether to use batch normalization
        """
        super(MLPModel, self).__init__()

        self.use_batchnorm = use_batchnorm
        self.num_layers = len(hidden_sizes)

        # Activation function mapping
        activation_map = {
            'relu': nn.ReLU(),
            'leaky_relu': nn.LeakyReLU(0.1),
            'elu': nn.ELU(),
            'gelu': nn.GELU(),
            'silu': nn.SiLU(),
            'tanh': nn.Tanh()
        }

        self.activation = activation_map.get(activation, nn.ReLU())

        # Kaiming-normal init for the ReLU family, Xavier-normal for everything else.
        nonlinearity = 'relu' if activation in ['relu', 'leaky_relu'] else 'linear'

        # Build hidden layers dynamically
        self.hidden_layers = nn.ModuleList()
        self.batch_norms = nn.ModuleList() if use_batchnorm else None
        self.dropouts = nn.ModuleList()

        # Unpacking (rather than list concatenation) accepts tuples as well as lists.
        layer_sizes = [input_size, *hidden_sizes]

        for i in range(self.num_layers):
            # Hidden layer
            layer = nn.Linear(layer_sizes[i], layer_sizes[i+1])
            if nonlinearity == 'relu':
                nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='relu')
            else:
                nn.init.xavier_normal_(layer.weight)
            nn.init.constant_(layer.bias, 0)
            self.hidden_layers.append(layer)

            # Batch normalization
            if use_batchnorm:
                self.batch_norms.append(nn.BatchNorm1d(layer_sizes[i+1]))

            # Dropout
            self.dropouts.append(nn.Dropout(dropout_rate))

        # Output layer. layer_sizes[-1] is the last hidden width, or input_size
        # when hidden_sizes is empty (hidden_sizes[-1] would raise IndexError there).
        self.output = nn.Linear(layer_sizes[-1], 1)
        nn.init.xavier_normal_(self.output.weight)
        nn.init.constant_(self.output.bias, 0)

        # Residual connection: linear map from the raw input to the scalar output.
        self.shortcut = nn.Linear(input_size, 1)
        nn.init.xavier_normal_(self.shortcut.weight)
        nn.init.constant_(self.shortcut.bias, 0)

    def forward(self, x):
        """
        Args:
            x: [batch_size, input_size]
        Returns:
            [batch_size, 1] prediction (main path + linear shortcut)
        """
        # Main path
        h = x
        for i in range(self.num_layers):
            h = self.hidden_layers[i](h)
            if self.use_batchnorm:
                h = self.batch_norms[i](h)
            h = self.activation(h)
            h = self.dropouts[i](h)

        main_output = self.output(h)

        # Residual path
        residual = self.shortcut(x)

        return main_output + residual


# Pick the compute device: CUDA when PyTorch reports one available, otherwise CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda' if has_cuda else 'cpu')

print("\n" + "=" * 60)
print(f"Using device: {device}")
if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print("=" * 60)

In [None]:
import torch.optim as optim
from schedulefree import RAdamScheduleFree, AdamWScheduleFree
from torch.utils.data import Dataset, DataLoader, TensorDataset
import copy

# Hyperparameters
# Architecture: hidden widths depend on which feature pipeline is active.
if use_pbld:
    # PBLD-specific settings (embedding size and number of dense layers).
    embed_dim, num_dense_layers = 32, 3
    hidden_sizes = [1024, 512]
else:
    # Polynomial-feature pipeline: one wide hidden layer.
    hidden_sizes = [2048]

# Model options
dropout_rate, activation, use_batchnorm = 0.0, 'gelu', False

# Optimisation
loss_function = 'mae'
optimizer_name = 'adamw_schedulefree'
lr, weight_decay = 0.001, 1e-5
batch_size, epochs = 64, 1000
beta1, beta2 = 0.9, 0.999

# Gaussian noise injection on training batches
use_gaussian_noise, noise_std, noise_prob = True, 0.05, 0.5

# SMOTER synthetic-sample parameters
use_smoter, smoter_k, smoter_percentage = True, 5, 0.2

# Print the run configuration as a single report.
rule = "=" * 60
config_lines = [
    rule,
    "K-FOLD CROSS VALIDATION TRAINING",
    rule,
    f"Model: {'PBLD' if use_pbld else 'Standard MLP'}",
]
if use_pbld:
    config_lines.append(f"PBLD config: embed_dim={embed_dim}, dense_layers={num_dense_layers}")
config_lines += [
    f"Folds: {n_splits}",
    f"Hidden layers: {hidden_sizes}",
    f"Activation: {activation}",
    f"Loss: {loss_function}",
    f"Optimizer: {optimizer_name}",
    f"LR: {lr}, Weight decay: {weight_decay}",
    f"Batch size: {batch_size}, Epochs: {epochs}",
    f"Gaussian Noise: std={noise_std}, prob={noise_prob}",
    f"SMOTER: k={smoter_k}, percentage={smoter_percentage:.0%}",
    rule,
]
print("\n".join(config_lines))

# Loss function, selected by name from the supported criteria.
loss_map = {
    'mse': nn.MSELoss(),
    'mae': nn.L1Loss(),
    'smooth_l1': nn.SmoothL1Loss(),
    'huber': nn.HuberLoss(),
}
criterion = loss_map[loss_function]

# Inverse smooth clip function
def inverse_smooth_clip(x, clip_val=3.0):
    """
    Undo a tanh-based soft clip: returns ``clip_val * arctanh(x / clip_val)``.

    The ratio ``x / clip_val`` is first limited to [-0.9999, 0.9999] so that
    arctanh stays finite for inputs at or beyond ±clip_val.

    Args:
        x: Scalar or NumPy array of soft-clipped values.
        clip_val: Bound that was used for the soft clip.
    Returns:
        The un-clipped value(s), same shape as ``x``.
    """
    ratio = np.clip(x / clip_val, -0.9999, 0.9999)
    return clip_val * np.arctanh(ratio)

# Per-fold results: the best-checkpoint model, its validation RMSE and the
# (1-based) epoch at which that checkpoint was taken.
fold_models = []
fold_metrics = []
fold_best_epochs = []

# K-Fold Cross Validation Loop
for fold, (train_idx, val_idx) in enumerate(kfold.split(X_scaled)):
    print(f"\n{'='*60}")
    print(f"FOLD {fold + 1}/{n_splits}")
    print(f"{'='*60}")
    
    # Split data
    X_train_fold = X_scaled[train_idx]
    y_train_fold = y_scaled[train_idx]
    X_val_fold = X_scaled[val_idx]
    y_val_fold = y_scaled[val_idx]
    
    print(f"Fold {fold + 1} - Train: {len(X_train_fold)}, Val: {len(X_val_fold)}")
    
    # Apply SMOTER to the training fold only: each synthetic sample is a random
    # point on the segment between a random training row and one of its
    # smoter_k nearest neighbours, with the target interpolated the same way.
    if use_smoter:
        original_size = len(X_train_fold)
        n_synthetic = int(original_size * smoter_percentage)
        
        # k + 1 neighbours are requested because the first returned neighbour
        # is skipped below (it is expected to be the query point itself).
        knn = NearestNeighbors(n_neighbors=smoter_k + 1, metric='euclidean')
        knn.fit(X_train_fold)
        
        X_synthetic = []
        y_synthetic = []
        
        for _ in range(n_synthetic):
            idx = np.random.randint(0, len(X_train_fold))
            sample_X = X_train_fold[idx]
            sample_y = y_train_fold[idx]
            
            indices = knn.kneighbors([sample_X], return_distance=False)
            neighbor_indices = indices[0][1:]
            neighbor_idx = np.random.choice(neighbor_indices)
            neighbor_X = X_train_fold[neighbor_idx]
            neighbor_y = y_train_fold[neighbor_idx]
            
            # Same interpolation weight for features and target.
            alpha = np.random.random()
            synthetic_X = sample_X + alpha * (neighbor_X - sample_X)
            synthetic_y = sample_y + alpha * (neighbor_y - sample_y)
            
            X_synthetic.append(synthetic_X)
            y_synthetic.append(synthetic_y)
        
        # Skip stacking when nothing was generated: an empty array has the
        # wrong shape for np.vstack and would raise.
        if X_synthetic:
            X_train_fold = np.vstack([X_train_fold, np.array(X_synthetic)])
            y_train_fold = np.concatenate([y_train_fold, np.array(y_synthetic)])
        
        print(f"SMOTER: {original_size} -> {len(X_train_fold)} (+{len(X_synthetic)})")
    
    # Create DataLoaders
    train_dataset = TensorDataset(torch.tensor(X_train_fold, dtype=torch.float32), 
                                   torch.tensor(y_train_fold, dtype=torch.float32))
    val_dataset = TensorDataset(torch.tensor(X_val_fold, dtype=torch.float32), 
                                 torch.tensor(y_val_fold, dtype=torch.float32))
    
    # Per-fold generator so the training shuffle order is seeded per fold.
    g = torch.Generator().manual_seed(SEED + fold)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=g)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    # Validation targets in original units. They do not change between epochs,
    # so the inverse transform (undo soft clip, then undo scaling) is done once
    # here, on the same float32 values the validation loader serves.
    val_targets_scaled = val_dataset.tensors[1].numpy()
    val_targets_unclipped = inverse_smooth_clip(val_targets_scaled, clip_value)
    val_targets_original = scaler_y.inverse_transform(val_targets_unclipped.reshape(-1, 1)).flatten()
    
    # Initialize model (PBLD or standard MLP)
    if use_pbld:
        model = MLPWithPBLD(
            num_features=X_train_fold.shape[1],
            embed_dim=embed_dim,
            num_dense_layers=num_dense_layers,
            hidden_sizes=hidden_sizes,
            dropout_rate=dropout_rate,
            activation=activation
        )
    else:
        model = MLPModel(
            input_size=X_train_fold.shape[1],
            hidden_sizes=hidden_sizes, 
            dropout_rate=dropout_rate,
            activation=activation,
            use_batchnorm=use_batchnorm
        )
    
    model = model.to(device)
    
    # Optimizer. The schedule-free optimizers have their own train()/eval()
    # switch, which is flipped alongside the model's mode below.
    is_schedulefree = optimizer_name.endswith('_schedulefree')
    if optimizer_name == 'adamw_schedulefree':
        optimizer = AdamWScheduleFree(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
    elif optimizer_name == 'radam_schedulefree':
        optimizer = RAdamScheduleFree(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
    else:
        optimizer = optim.AdamW(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
    
    # Training loop: keep the weights from the epoch with the lowest validation RMSE.
    best_val_rmse = float('inf')
    best_model_state = None
    best_epoch = 0
    
    for epoch in range(epochs):
        if is_schedulefree:
            optimizer.train()
        
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            # With probability noise_prob, add Gaussian noise to the whole input batch.
            if use_gaussian_noise and np.random.rand() < noise_prob:
                noise = torch.randn_like(X_batch) * noise_std
                X_batch = X_batch + noise
            
            optimizer.zero_grad()
            outputs = model(X_batch)
            # squeeze(-1) removes only the trailing output dimension, so a final
            # batch of size 1 stays 1-D and matches y_batch's shape.
            loss = criterion(outputs.squeeze(-1), y_batch)
            loss.backward()
            optimizer.step()
        
        if is_schedulefree:
            optimizer.eval()
        
        # Validation
        model.eval()
        val_predictions = []
        with torch.no_grad():
            for X_batch, _ in val_loader:
                X_batch = X_batch.to(device)
                outputs = model(X_batch)
                # squeeze(-1) keeps a size-1 batch iterable for extend().
                val_predictions.extend(outputs.squeeze(-1).cpu().numpy())
        
        # Inverse transformation of the predictions back to original units.
        val_predictions_unclipped = inverse_smooth_clip(np.array(val_predictions), clip_value)
        val_predictions_original = scaler_y.inverse_transform(val_predictions_unclipped.reshape(-1, 1)).flatten()
        
        val_rmse = np.sqrt(np.mean((val_predictions_original - val_targets_original)**2))
        
        # Checkpoint on improvement. This runs after optimizer.eval(), so the
        # saved weights are the ones that were just validated.
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            best_model_state = copy.deepcopy(model.state_dict())
            best_epoch = epoch + 1
        
        if epoch % 200 == 0:
            print(f"Fold {fold + 1} - Epoch {epoch + 1}/{epochs} | Val RMSE: {val_rmse:.4f} | Best: {best_val_rmse:.4f}")
    
    # No checkpoint exists if no epoch produced a validation RMSE below infinity
    # (e.g. NaN predictions, or epochs == 0); fail with a clear message.
    if best_model_state is None:
        raise RuntimeError(
            f"Fold {fold + 1}: no epoch produced a finite validation RMSE, "
            "so there is no checkpoint to restore. Check the inputs for NaN/inf values."
        )
    
    # Load best model
    model.load_state_dict(best_model_state)
    fold_models.append(model)
    fold_metrics.append(best_val_rmse)
    fold_best_epochs.append(best_epoch)
    
    print(f"Fold {fold + 1} complete - Best Val RMSE: {best_val_rmse:.4f} at epoch {best_epoch}")

# Summary: per-fold best RMSE and epoch, then the mean ± std across folds.
print("\n" + "=" * 60, "K-FOLD CV SUMMARY", "=" * 60, sep="\n")

for fold_number, (fold_rmse, fold_epoch) in enumerate(zip(fold_metrics, fold_best_epochs), start=1):
    print(f"Fold {fold_number}: RMSE {fold_rmse:.4f} at epoch {fold_epoch}")

cv_mean, cv_std = np.mean(fold_metrics), np.std(fold_metrics)
print(f"Mean CV RMSE: {cv_mean:.4f} ± {cv_std:.4f}")
print(f"Mean best epoch: {int(np.mean(fold_best_epochs))}")
print("=" * 60)

In [None]:
# ============================================
# K-Fold Ensemble Prediction with TTA
# ============================================

# TTA (test-time augmentation) parameters: number of forward passes per model
# and the std of the Gaussian noise added to the inputs on all but the first pass.
tta_iterations = 20
tta_noise_std = 0.03

print("="*60)
print("K-FOLD ENSEMBLE PREDICTION WITH TTA")
print("="*60)
print(f"Number of models: {len(fold_models)}")
print(f"TTA iterations: {tta_iterations}")
print(f"TTA noise std: {tta_noise_std}")
print("="*60)

# Convert test data to tensor
test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

# One TTA-averaged prediction vector per fold model.
all_fold_predictions = []

for fold_idx, model in enumerate(fold_models):
    print(f"\nFold {fold_idx + 1}/{len(fold_models)} predicting...")
    model.eval()
    
    # TTA for this fold
    fold_tta_predictions = []
    
    with torch.no_grad():
        for tta_iter in range(tta_iterations):
            # The first pass uses the clean inputs; later passes add Gaussian noise.
            if tta_iter == 0:
                test_tensor_augmented = test_tensor
            else:
                noise = torch.randn_like(test_tensor) * tta_noise_std
                test_tensor_augmented = test_tensor + noise
            
            # squeeze(-1) drops only the trailing output dimension, so a
            # single-row test set still yields a 1-D prediction array.
            predictions_scaled = model(test_tensor_augmented).squeeze(-1).cpu().numpy()
            
            # Inverse transformation: undo the soft clip, then undo the target scaling.
            predictions_unclipped = inverse_smooth_clip(predictions_scaled, clip_value)
            predictions = scaler_y.inverse_transform(predictions_unclipped.reshape(-1, 1)).flatten()
            
            fold_tta_predictions.append(predictions)
    
    # Average TTA predictions for this fold
    fold_avg_prediction = np.mean(fold_tta_predictions, axis=0)
    all_fold_predictions.append(fold_avg_prediction)
    
    print(f"Fold {fold_idx + 1} complete - Prediction range: [{fold_avg_prediction.min():.2f}, {fold_avg_prediction.max():.2f}]")

# Ensemble: Average predictions across all folds
final_predictions = np.mean(all_fold_predictions, axis=0)

# Per-sample standard deviation of the fold predictions (spread between fold models).
fold_std = np.std(all_fold_predictions, axis=0)

print(f"\n{'='*60}")
print("ENSEMBLE COMPLETE")
print(f"{'='*60}")
print(f"Final predictions - Min: {final_predictions.min():.2f}, Max: {final_predictions.max():.2f}, Mean: {final_predictions.mean():.2f}")
print(f"Average std across folds: {fold_std.mean():.4f}")
print(f"(Low std = high agreement between folds)")
print(f"{'='*60}")

# Prepare submission. Prefer the ids that ship with the test rows, since the
# predictions are in test-row order; fall back to the previously hard-coded
# contiguous range starting at 1455 when the test frame has no "id" column.
if "id" in test.columns:
    submission_ids = test["id"].values
else:
    submission_ids = range(1455, 1455 + len(final_predictions))

submission = pd.DataFrame({"id": submission_ids, "DIC": final_predictions})
submission.to_csv("submission.csv", index=False)
print("\nSubmission saved to submission.csv!")
print(f"CV RMSE estimate: {np.mean(fold_metrics):.4f} ± {np.std(fold_metrics):.4f}")

In [None]:
# score: 3.98009
# name: 坂田煌翔
# student_id: 62408940