In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

In [None]:
!pip install schedulefree -q

In [None]:
# Read the three competition tables in one pass: training set, test set and
# the sample submission (in that order).
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/train.csv",
        "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/test.csv",
        "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/sample_submission.csv",
    )
)

In [None]:
import torch
import torch.nn as nn
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, PolynomialFeatures, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from scipy import stats
import numpy as np
import pandas as pd
import random

# Seed every random number generator used in this notebook (Python, NumPy,
# torch CPU and all CUDA devices) with one shared value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# Ask cuDNN for deterministic kernels and switch off its benchmark autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Drop the columns that are not used as model inputs: "Unnamed: 12" and the row "id".
# No missing-value handling happens here.
# errors="ignore" keeps the cell idempotent: re-running it after the columns have
# already been removed would otherwise raise a KeyError.
train = train.drop(columns=["Unnamed: 12", "id"], errors="ignore")

# Rename TA1.x to TA1 so the training columns match the test data.
# (rename silently skips labels that are absent, so this is also safe to re-run.)
train = train.rename(columns={"TA1.x": "TA1"})

# ============================================
# Feature Engineering Configuration / Data Preparation banners
# ============================================
print("\n".join([
    "="*60,
    "FEATURE ENGINEERING CONFIGURATION",
    "="*60,
    "Mode: Polynomial Features (degree=2)",
    "  → Traditional feature engineering",
    f"\n{'='*60}",
    "DATA PREPARATION",
    f"{'='*60}",
]))

# ============================================
# Extract Raw Features and Target
# ============================================
# Every column except the DIC target is a model input.
feature_columns = list(filter(lambda name: name != 'DIC', train.columns))

# Keep untouched copies: SMOTER and all preprocessing are applied later,
# separately inside each cross-validation fold.
X_raw = train.loc[:, feature_columns].copy()
y_raw = train['DIC'].values.copy()
X_test_raw = test.loc[:, feature_columns].copy()

print(
    f"Raw training data: {X_raw.shape}",
    f"Raw test data: {X_test_raw.shape}",
    f"Original features: {X_raw.shape[1]} features",
    sep="\n",
)

# ============================================
# K-Fold Cross Validation Setup
# ============================================
print("\n".join([
    f"\n{'='*60}",
    "K-FOLD CROSS VALIDATION SETUP",
    f"{'='*60}",
]))

# Shuffled splitter, seeded so the fold assignment is the same on every run.
n_splits = 5
kfold = KFold(shuffle=True, random_state=SEED, n_splits=n_splits)

print("\n".join([
    f"K-Fold CV: {n_splits} folds",
    f"Each fold will be trained separately",
    f"Final prediction: ensemble of all {n_splits} models",
    f"\nNOTE: SMOTER will be applied to RAW data before preprocessing",
    f"      Validation folds will contain ONLY original samples",
    "="*60,
]))

In [None]:
# ============================================
# MLP Model
# ============================================

class MLPModel(nn.Module):
    """Fully connected regressor with a linear shortcut from input to output.

    The network output is ``output(hidden_stack(x)) + shortcut(x)``, where
    ``hidden_stack`` is a sequence of Linear -> (BatchNorm1d) -> activation ->
    Dropout stages and ``shortcut`` is a single Linear layer applied to the raw
    input. All biases start at zero.
    """

    def __init__(self, input_size, hidden_sizes=(128,), dropout_rate=0.0, activation='relu', use_batchnorm=True):
        """
        Build the layers and initialise their weights.

        Args:
            input_size: Number of input features.
            hidden_sizes: Iterable of hidden layer widths (list or tuple). An
                empty iterable gives a model with no hidden layer, i.e. the
                output layer is applied directly to the input.
            dropout_rate: Dropout probability applied after every hidden layer.
            activation: One of 'relu', 'leaky_relu', 'elu', 'gelu', 'silu',
                'tanh'. Any other name falls back to ReLU.
            use_batchnorm: Whether to insert BatchNorm1d after each hidden
                Linear layer.
        """
        super().__init__()

        # Work on a private list: accepts tuples and never aliases the caller's object.
        hidden_sizes = list(hidden_sizes)

        self.use_batchnorm = use_batchnorm
        self.num_layers = len(hidden_sizes)

        # Activation function mapping
        activation_map = {
            'relu': nn.ReLU(),
            'leaky_relu': nn.LeakyReLU(0.1),
            'elu': nn.ELU(),
            'gelu': nn.GELU(),
            'silu': nn.SiLU(),
            'tanh': nn.Tanh()
        }
        self.activation = activation_map.get(activation, nn.ReLU())

        # He (Kaiming) init for the ReLU family, Xavier for everything else.
        # The decision is taken from the resolved module rather than the name so
        # that the ReLU fallback for unknown names also receives He init.
        use_kaiming = isinstance(self.activation, (nn.ReLU, nn.LeakyReLU))

        # Build hidden layers dynamically
        self.hidden_layers = nn.ModuleList()
        self.batch_norms = nn.ModuleList() if use_batchnorm else None
        self.dropouts = nn.ModuleList()

        layer_sizes = [input_size] + hidden_sizes

        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            layer = nn.Linear(fan_in, fan_out)
            if use_kaiming:
                nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='relu')
            else:
                nn.init.xavier_normal_(layer.weight)
            nn.init.constant_(layer.bias, 0)
            self.hidden_layers.append(layer)

            if use_batchnorm:
                self.batch_norms.append(nn.BatchNorm1d(fan_out))

            self.dropouts.append(nn.Dropout(dropout_rate))

        # Output layer; layer_sizes[-1] is the last hidden width, or input_size
        # when there are no hidden layers.
        self.output = nn.Linear(layer_sizes[-1], 1)
        nn.init.xavier_normal_(self.output.weight)
        nn.init.constant_(self.output.bias, 0)

        # Linear shortcut from the raw input straight to the prediction
        self.shortcut = nn.Linear(input_size, 1)
        nn.init.xavier_normal_(self.shortcut.weight)
        nn.init.constant_(self.shortcut.bias, 0)

    def forward(self, x):
        """Return predictions with a trailing singleton axis: main path plus shortcut."""
        h = x
        for i, layer in enumerate(self.hidden_layers):
            h = layer(h)
            if self.use_batchnorm:
                h = self.batch_norms[i](h)
            h = self.activation(h)
            h = self.dropouts[i](h)

        return self.output(h) + self.shortcut(x)


# Choose the compute device: CUDA when torch reports it as available, else CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Report the choice (and the GPU name when running on CUDA).
print(f"\n{'='*60}", f"Using device: {device}", sep="\n")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"{'='*60}")

In [None]:
import torch.optim as optim
from schedulefree import RAdamScheduleFree, AdamWScheduleFree
from torch.utils.data import Dataset, DataLoader, TensorDataset
import copy

# --- Network architecture (passed to MLPModel) ---
hidden_sizes, activation = [2048], 'gelu'
dropout_rate, use_batchnorm = 0.0, False

# --- Optimisation ---
loss_function, optimizer_name = 'mae', 'adamw_schedulefree'   # keys into loss_map / optimizer selection
lr, weight_decay = 0.001, 1e-5
beta1, beta2 = 0.9, 0.999
batch_size, epochs = 64, 1000

# --- Gaussian noise injection on training batches ---
# With probability noise_prob a batch gets N(0, noise_std^2) noise added to its inputs.
use_gaussian_noise, noise_std, noise_prob = True, 0.05, 0.5

# --- SMOTER augmentation ---
# smoter_k nearest neighbours are considered; the number of synthetic rows is
# smoter_percentage times the size of the fold's training split.
use_smoter, smoter_k, smoter_percentage = True, 5, 0.2

# --- Smooth (tanh) clipping bound for scaled features and targets ---
clip_value = 3.0

# Echo the full training configuration in a single banner.
print("\n".join([
    "="*60,
    "K-FOLD CROSS VALIDATION TRAINING",
    "="*60,
    f"Model: Standard MLP",
    f"Folds: {n_splits}",
    f"Hidden layers: {hidden_sizes}",
    f"Activation: {activation}",
    f"Loss: {loss_function}",
    f"Optimizer: {optimizer_name}",
    f"LR: {lr}, Weight decay: {weight_decay}",
    f"Batch size: {batch_size}, Epochs: {epochs}",
    f"Gaussian Noise: std={noise_std}, prob={noise_prob}",
    f"SMOTER: k={smoter_k}, percentage={smoter_percentage:.0%}",
    f"SMOTER ORDER: Raw data → SMOTER → Yeo-Johnson → Polynomial → RobustScaling → Clipping",
    "="*60,
]))

# Map each supported loss name to its torch criterion, then pick the configured one.
loss_map = dict(
    mse=nn.MSELoss(),
    mae=nn.L1Loss(),
    smooth_l1=nn.SmoothL1Loss(),
    huber=nn.HuberLoss(),
)
criterion = loss_map[loss_function]

# Smooth clipping functions
def smooth_clip(x, clip_val=3.0):
    """Squash x smoothly into (-clip_val, clip_val) via clip_val * tanh(x / clip_val)."""
    squashed = np.tanh(x / clip_val)
    return squashed * clip_val

def inverse_smooth_clip(x, clip_val=3.0):
    """Undo smooth_clip: clip_val * arctanh(x / clip_val).

    The ratio x / clip_val is first limited to [-0.9999, 0.9999] so that arctanh
    stays finite; inputs at or beyond the clipping bound therefore all map to
    the same saturated value.
    """
    ratio = np.clip(x / clip_val, -0.9999, 0.9999)
    return np.arctanh(ratio) * clip_val

# Per-fold results: the trained model with its fitted preprocessing objects (for the
# ensemble), the best validation RMSE and the epoch at which it was reached.
fold_models = []
fold_metrics = []
fold_best_epochs = []

# K-Fold Cross Validation Loop
# For every fold: augment the raw training split with SMOTER-style interpolated rows,
# fit all preprocessing on that (augmented) training split only, train an MLP and keep
# the weights of the epoch with the lowest validation RMSE in original target units.
for fold, (train_idx, val_idx) in enumerate(kfold.split(X_raw)):
    print(f"\n{'='*60}")
    print(f"FOLD {fold + 1}/{n_splits}")
    print(f"{'='*60}")

    # ============================================
    # Step 1: Get raw data for this fold
    # ============================================
    X_train_raw_fold = X_raw.iloc[train_idx].copy()
    y_train_raw_fold = y_raw[train_idx].copy()
    X_val_raw_fold = X_raw.iloc[val_idx].copy()
    y_val_raw_fold = y_raw[val_idx].copy()

    print(f"Raw data - Train: {len(X_train_raw_fold)}, Val: {len(X_val_raw_fold)}")

    # ============================================
    # Step 2: Apply SMOTER to RAW training data
    # ============================================
    if use_smoter:
        original_size = len(X_train_raw_fold)
        n_synthetic = int(original_size * smoter_percentage)

        # Materialise the feature matrix once instead of on every loop iteration
        train_values = X_train_raw_fold.values

        # Fit k-NN on raw training data (k + 1 because each query point is
        # expected to come back as its own nearest neighbour)
        knn = NearestNeighbors(n_neighbors=smoter_k + 1, metric='euclidean')
        knn.fit(train_values)

        X_synthetic = []
        y_synthetic = []

        # NOTE: the order of the np.random calls below (randint, choice, random)
        # determines the synthetic rows for a given seed; keep it unchanged.
        for _ in range(n_synthetic):
            idx = np.random.randint(0, original_size)
            sample_X = train_values[idx]
            sample_y = y_train_raw_fold[idx]

            distances, indices = knn.kneighbors([sample_X])
            neighbor_indices = indices[0][1:]  # Drop the first hit (assumed to be the sample itself)
            neighbor_idx = np.random.choice(neighbor_indices)
            neighbor_X = train_values[neighbor_idx]
            neighbor_y = y_train_raw_fold[neighbor_idx]

            # Linear interpolation between the sample and the chosen neighbour,
            # using the same weight for features and target
            alpha = np.random.random()
            X_synthetic.append(sample_X + alpha * (neighbor_X - sample_X))
            y_synthetic.append(sample_y + alpha * (neighbor_y - sample_y))

        # Combine original + synthetic RAW data
        X_synthetic_df = pd.DataFrame(X_synthetic, columns=X_train_raw_fold.columns)
        X_train_augmented_raw = pd.concat([X_train_raw_fold, X_synthetic_df], ignore_index=True)
        y_train_augmented_raw = np.concatenate([y_train_raw_fold, np.array(y_synthetic)])

        print(f"SMOTER applied: {original_size} → {len(X_train_augmented_raw)} (+{len(X_synthetic)} synthetic)")
    else:
        X_train_augmented_raw = X_train_raw_fold
        y_train_augmented_raw = y_train_raw_fold

    # ============================================
    # Step 3: Yeo-Johnson Transformation
    # ============================================
    print(f"\nApplying Yeo-Johnson transformation...")

    # Skewness of every feature, measured on the (augmented) training data only
    skewness_dict = {
        col: stats.skew(X_train_augmented_raw[col])
        for col in X_train_augmented_raw.columns
    }
    high_skew_features = [col for col, skew_val in skewness_dict.items() if abs(skew_val) > 0.5]

    # Reset per fold so a transformer fitted in an earlier fold can never leak in
    pt = None
    if len(high_skew_features) > 0:
        pt = PowerTransformer(method='yeo-johnson', standardize=False)

        X_train_transformed = X_train_augmented_raw.copy()
        X_val_transformed = X_val_raw_fold.copy()

        # Fit on augmented training data, transform both train and val
        X_train_transformed[high_skew_features] = pt.fit_transform(X_train_augmented_raw[high_skew_features])
        X_val_transformed[high_skew_features] = pt.transform(X_val_raw_fold[high_skew_features])

        print(f"  Yeo-Johnson: {len(high_skew_features)} high-skew features transformed")
    else:
        X_train_transformed = X_train_augmented_raw
        X_val_transformed = X_val_raw_fold
        print(f"  Yeo-Johnson: No high-skew features found")

    # The target is not power-transformed
    y_train_transformed = y_train_augmented_raw
    y_val_transformed = y_val_raw_fold

    # ============================================
    # Step 4: Polynomial Features
    # ============================================
    print(f"Applying Polynomial features...")

    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_train_expanded = poly.fit_transform(X_train_transformed.values)
    X_val_expanded = poly.transform(X_val_transformed.values)
    print(f"  Polynomial: {X_train_transformed.shape[1]} → {X_train_expanded.shape[1]} features")

    # ============================================
    # Step 5: RobustScaling + Smooth Clipping
    # ============================================
    print(f"Applying RobustScaling and smooth clipping...")

    scaler_X = RobustScaler()
    scaler_y = RobustScaler()

    # Fit on augmented training data
    X_train_scaled = scaler_X.fit_transform(X_train_expanded)
    X_val_scaled = scaler_X.transform(X_val_expanded)

    y_train_scaled = scaler_y.fit_transform(y_train_transformed.reshape(-1, 1)).flatten()
    y_val_scaled = scaler_y.transform(y_val_transformed.reshape(-1, 1)).flatten()

    # Smooth clipping
    X_train_scaled = smooth_clip(X_train_scaled, clip_value)
    X_val_scaled = smooth_clip(X_val_scaled, clip_value)
    y_train_scaled = smooth_clip(y_train_scaled, clip_value)
    y_val_scaled = smooth_clip(y_val_scaled, clip_value)

    print(f"  RobustScaling + Clipping complete")
    print(f"  Final train shape: {X_train_scaled.shape}, val shape: {X_val_scaled.shape}")

    # ============================================
    # Step 6: Create DataLoaders
    # ============================================
    train_dataset = TensorDataset(torch.tensor(X_train_scaled, dtype=torch.float32),
                                   torch.tensor(y_train_scaled, dtype=torch.float32))
    val_dataset = TensorDataset(torch.tensor(X_val_scaled, dtype=torch.float32),
                                 torch.tensor(y_val_scaled, dtype=torch.float32))

    # Dedicated, fold-specific generator so the batch order is reproducible
    g = torch.Generator().manual_seed(SEED + fold)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=g)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # ============================================
    # Step 7: Initialize Model
    # ============================================
    model = MLPModel(
        input_size=X_train_scaled.shape[1],
        hidden_sizes=hidden_sizes,
        dropout_rate=dropout_rate,
        activation=activation,
        use_batchnorm=use_batchnorm
    )

    model = model.to(device)

    # ============================================
    # Step 8: Initialize Optimizer
    # ============================================
    is_schedulefree = optimizer_name.endswith('_schedulefree')
    if optimizer_name == 'adamw_schedulefree':
        optimizer = AdamWScheduleFree(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
    elif optimizer_name == 'radam_schedulefree':
        optimizer = RAdamScheduleFree(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
    else:
        optimizer = optim.AdamW(model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)

    # ============================================
    # Step 9: Training Loop
    # ============================================
    best_val_rmse = float('inf')
    best_model_state = None
    best_epoch = 0

    for epoch in range(epochs):
        # Schedule-free optimizers have their own train/eval switch
        if is_schedulefree:
            optimizer.train()

        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            # Randomly perturb whole batches with Gaussian input noise
            if use_gaussian_noise and np.random.rand() < noise_prob:
                noise = torch.randn_like(X_batch) * noise_std
                X_batch = X_batch + noise

            optimizer.zero_grad()
            outputs = model(X_batch)
            # squeeze(-1) removes only the output axis; a bare squeeze() would
            # also drop the batch axis when a batch contains a single row.
            loss = criterion(outputs.squeeze(-1), y_batch)
            loss.backward()
            optimizer.step()

        if is_schedulefree:
            optimizer.eval()

        # Validation
        model.eval()
        val_batches = []
        with torch.no_grad():
            for X_batch, _ in val_loader:
                X_batch = X_batch.to(device)
                val_batches.append(model(X_batch).squeeze(-1).cpu().numpy())
        val_predictions = np.concatenate(val_batches)

        # Inverse transformation of the predictions: unclip → unscale
        val_predictions_unclipped = inverse_smooth_clip(val_predictions, clip_value)
        val_predictions_original = scaler_y.inverse_transform(val_predictions_unclipped.reshape(-1, 1)).flatten()

        # Score against the true fold targets. Round-tripping the targets through
        # scale → clip → float32 → unclip would distort (and, for extreme values,
        # saturate) the reference the RMSE is measured against.
        val_rmse = np.sqrt(np.mean((val_predictions_original - y_val_raw_fold)**2))

        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            best_model_state = copy.deepcopy(model.state_dict())
            best_epoch = epoch + 1

        if epoch % 200 == 0:
            print(f"Epoch {epoch + 1}/{epochs} | Val RMSE: {val_rmse:.4f} | Best: {best_val_rmse:.4f}")

    # No epoch ever improved on +inf: either epochs == 0 or every validation RMSE was NaN
    if best_model_state is None:
        raise RuntimeError(
            f"Fold {fold + 1}: no finite validation RMSE was produced; "
            "check the inputs for NaN values or diverging training."
        )

    # Load best model
    model.load_state_dict(best_model_state)

    # Store for ensemble prediction
    fold_models.append({
        'model': model,
        'scaler_X': scaler_X,
        'scaler_y': scaler_y,
        'pt': pt,
        'high_skew_features': high_skew_features,
        'poly': poly
    })
    fold_metrics.append(best_val_rmse)
    fold_best_epochs.append(best_epoch)

    print(f"Fold {fold + 1} complete - Best Val RMSE: {best_val_rmse:.4f} at epoch {best_epoch}")

# Summary: one line per fold, then the aggregate RMSE and the mean best epoch.
print(f"\n{'='*60}", "K-FOLD CV SUMMARY", f"{'='*60}", sep="\n")
for fold_no, (fold_rmse, fold_epoch) in enumerate(zip(fold_metrics, fold_best_epochs), start=1):
    print(f"Fold {fold_no}: RMSE {fold_rmse:.4f} at epoch {fold_epoch}")
print(
    f"Mean CV RMSE: {np.mean(fold_metrics):.4f} ± {np.std(fold_metrics):.4f}",
    f"Mean best epoch: {int(np.mean(fold_best_epochs))}",
    f"{'='*60}",
    sep="\n",
)

In [None]:
# ============================================
# K-Fold Ensemble Prediction with TTA
# ============================================

# Test-time augmentation settings: number of forward passes per model and the
# standard deviation of the Gaussian noise added to the scaled test inputs.
tta_iterations, tta_noise_std = 20, 0.03

print("\n".join([
    "="*60,
    "K-FOLD ENSEMBLE PREDICTION WITH TTA",
    "="*60,
    f"Number of models: {len(fold_models)}",
    f"TTA iterations: {tta_iterations}",
    f"TTA noise std: {tta_noise_std}",
    "="*60,
]))

# One TTA-averaged test prediction vector per fold model
all_fold_predictions = []

for fold_idx, fold_data in enumerate(fold_models):
    print(f"\nFold {fold_idx + 1}/{len(fold_models)} predicting...")

    # Unpack the model together with the preprocessing objects fitted for this fold
    model, scaler_X, scaler_y, pt, high_skew_features, poly = (
        fold_data[key]
        for key in ('model', 'scaler_X', 'scaler_y', 'pt', 'high_skew_features', 'poly')
    )

    # ============================================
    # Replay the training-time preprocessing on the test set
    # ============================================

    # Yeo-Johnson on the columns this fold flagged as skewed (if any)
    X_test_transformed = X_test_raw.copy()
    if not (pt is None or len(high_skew_features) == 0):
        X_test_transformed[high_skew_features] = pt.transform(X_test_raw[high_skew_features])

    # Polynomial expansion → robust scaling → smooth clipping
    X_test_scaled = smooth_clip(
        scaler_X.transform(poly.transform(X_test_transformed.values)),
        clip_value,
    )

    test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

    # ============================================
    # TTA: first pass on the clean inputs, later passes on noisy copies
    # ============================================
    model.eval()
    fold_tta_predictions = []

    with torch.no_grad():
        for tta_iter in range(tta_iterations):
            test_tensor_augmented = (
                test_tensor
                if tta_iter == 0
                else test_tensor + torch.randn_like(test_tensor) * tta_noise_std
            )

            predictions_scaled = model(test_tensor_augmented).squeeze().cpu().numpy()

            # Back to original target units: unclip → unscale
            predictions_unclipped = inverse_smooth_clip(predictions_scaled, clip_value)
            fold_tta_predictions.append(
                scaler_y.inverse_transform(predictions_unclipped.reshape(-1, 1)).flatten()
            )

    # Collapse the TTA passes into a single prediction per test row
    fold_avg_prediction = np.mean(fold_tta_predictions, axis=0)
    all_fold_predictions.append(fold_avg_prediction)

    print(f"Fold {fold_idx + 1} complete - Prediction range: [{fold_avg_prediction.min():.2f}, {fold_avg_prediction.max():.2f}]")

# Ensemble: the final prediction is the plain mean over the fold models;
# the per-row standard deviation across folds measures their disagreement.
final_predictions = np.asarray(all_fold_predictions).mean(axis=0)
fold_std = np.asarray(all_fold_predictions).std(axis=0)

print("\n".join([
    f"\n{'='*60}",
    "ENSEMBLE COMPLETE",
    f"{'='*60}",
    f"Final predictions - Min: {final_predictions.min():.2f}, Max: {final_predictions.max():.2f}, Mean: {final_predictions.mean():.2f}",
    f"Average std across folds: {fold_std.mean():.4f}",
    f"(Low std = high agreement between folds)",
    f"{'='*60}",
]))

# Prepare submission
# Take the ids from the test table when it has an "id" column, so every prediction
# is paired with the row it was computed from (X_test_raw keeps the row order of
# `test`). Fall back to the previously hard-coded consecutive range starting at
# 1455 otherwise — NOTE(review): 1455 is presumably the first test id; confirm
# against sample_submission.csv.
if "id" in test.columns and len(test) == len(final_predictions):
    submission_ids = test["id"].to_numpy()
else:
    submission_ids = range(1455, 1455 + len(final_predictions))

submission = pd.DataFrame({"id": submission_ids, "DIC": final_predictions})
submission.to_csv("submission.csv", index=False)
print("\nSubmission saved to submission.csv!")
print(f"CV RMSE estimate: {np.mean(fold_metrics):.4f} ± {np.std(fold_metrics):.4f}")

In [None]:
# score: 3.98009
# name: 坂田煌翔
# student_id: 62408940