In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

In [None]:
!pip install schedulefree -q

In [None]:
# Competition data directory, defined once so the three reads below cannot drift apart.
DATA_DIR = "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi"

# train: labelled rows, test: rows to predict, sub: sample submission file.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
sub = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
import torch
import torch.nn as nn
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, PolynomialFeatures, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from scipy import stats
import numpy as np
import pandas as pd
import random

# Set random seeds for reproducibility (Python, NumPy and PyTorch CPU/CUDA generators)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Drop columns that are not model inputs.
# errors="ignore" keeps this cell idempotent: `train` is overwritten here, so a
# second run of the cell would otherwise fail with KeyError on the already
# removed columns. No missing-value handling is performed in this cell.
train = train.drop(columns=["Unnamed: 12", "id"], errors="ignore")

# CRITICAL FIX: Rename TA1.x to TA1 to match test data
# (rename silently skips the column when it has already been renamed)
train = train.rename(columns={"TA1.x": "TA1"})

# ============================================
# Feature Engineering Configuration
# ============================================
print("="*60)
print("FEATURE ENGINEERING CONFIGURATION")
print("="*60)
print("Mode: Polynomial Features (degree=2)")
print("  → Traditional feature engineering")

# ============================================
# Extract Raw Features and Target
# ============================================
print(f"\n{'='*60}")
print("DATA PREPARATION")
print(f"{'='*60}")

# Every column except the target 'DIC' is used as an input feature
feature_columns = [col for col in train.columns if col != 'DIC']

# Fail early, with a readable message, when train and test columns do not line up
missing_in_test = [col for col in feature_columns if col not in test.columns]
if missing_in_test:
    raise KeyError(f"Test data is missing feature columns: {missing_in_test}")

X_raw = train[feature_columns].copy()  # Keep raw data for SMOTER in K-Fold
y_raw = train['DIC'].values.copy()  # Keep raw target for SMOTER in K-Fold
X_test_raw = test[feature_columns].copy()

print(f"Raw training data: {X_raw.shape}")
print(f"Raw test data: {X_test_raw.shape}")
print(f"Original features: {X_raw.shape[1]} features")

# Store for use in K-Fold loop
# We'll apply preprocessing separately for each fold after SMOTER

# ============================================
# K-Fold Cross Validation Setup
# ============================================
print(f"\n{'='*60}")
print("K-FOLD CROSS VALIDATION SETUP")
print(f"{'='*60}")

n_splits = 3  # Number of folds
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

print(f"K-Fold CV: {n_splits} folds")
print(f"Each fold will be trained separately")
print(f"Final prediction: ensemble of all {n_splits} models")
print(f"\nNOTE: SMOTER will be applied to RAW data before preprocessing")
print(f"      Validation folds will contain ONLY original samples")
print("="*60)

In [None]:
# ============================================
# MLP Model
# ============================================

class FeatureScalingLayer(nn.Module):
    """Element-wise learnable rescaling of the input features.

    Every input feature is multiplied by its own trainable coefficient,
    which gives the network a soft form of feature selection.
    """

    def __init__(self, num_features):
        """
        Args:
            num_features: Number of input features (one coefficient per feature).
        """
        super().__init__()
        # All coefficients start at 1.0, so the layer initially passes features through unchanged.
        initial_scale = torch.ones(num_features)
        self.scale = nn.Parameter(initial_scale)

    def forward(self, x):
        """Return ``x`` with each feature multiplied by its scaling coefficient."""
        scaled = x * self.scale
        return scaled


class LearnableActivation(nn.Module):
    """Activation whose strength is learned per feature.

    Computes  σ_α(x) = (1 - α) x + α σ(x),  a blend of the identity and the
    base activation σ, with one mixing coefficient α per feature.
    """

    def __init__(self, num_features, activation_fn):
        """
        Args:
            num_features: Number of features (each neuron gets its own α).
            activation_fn: Base activation function σ.
        """
        super().__init__()
        self.activation_fn = activation_fn
        # α starts at 1.0, i.e. the plain base activation.
        self.alpha = nn.Parameter(torch.ones(num_features))

    def forward(self, x):
        """Blend ``x`` and ``activation_fn(x)`` using α limited to [0, 1]."""
        # Only the value used in the blend is clamped; the stored parameter is untouched.
        mix = self.alpha.clamp(0.0, 1.0)
        identity_part = (1 - mix) * x
        activated_part = mix * self.activation_fn(x)
        return identity_part + activated_part


class MLPModel(nn.Module):
    def __init__(self, input_size, hidden_sizes=None, dropout_rate=0.0, activation='relu', use_batchnorm=True):
        """
        MLP regressor with a flexible number of hidden layers, a learnable
        per-feature input scaling and a linear shortcut from input to output.

        Args:
            input_size: Number of input features
            hidden_sizes: Sequence of hidden layer sizes (list or tuple).
                Defaults to a single hidden layer of 128 units.
            dropout_rate: Dropout probability
            activation: Activation function name. Unknown names fall back to ReLU.
            use_batchnorm: Whether to use batch normalization

        Raises:
            ValueError: If hidden_sizes is empty.
        """
        super().__init__()

        # Copy into a fresh list: avoids a shared mutable default argument and
        # lets callers pass a tuple (list + tuple concatenation would fail below).
        hidden_sizes = [128] if hidden_sizes is None else list(hidden_sizes)
        if not hidden_sizes:
            raise ValueError("hidden_sizes must contain at least one layer size")

        self.use_batchnorm = use_batchnorm
        self.num_layers = len(hidden_sizes)

        # Activation name -> factory; only the selected activation is instantiated
        activation_factories = {
            'relu': nn.ReLU,
            'leaky_relu': lambda: nn.LeakyReLU(0.1),
            'elu': nn.ELU,
            'gelu': nn.GELU,
            'silu': nn.SiLU,
            'tanh': nn.Tanh,
            'mish': nn.Mish,
        }

        base_activation = activation_factories.get(activation, nn.ReLU)()

        # Kaiming init for the ReLU family, Xavier init for everything else
        use_kaiming = activation in ('relu', 'leaky_relu')

        # Feature scaling layer (placed before the first hidden layer)
        self.feature_scaling = FeatureScalingLayer(input_size)

        # Build hidden layers dynamically
        self.hidden_layers = nn.ModuleList()
        self.batch_norms = nn.ModuleList() if use_batchnorm else None
        self.learnable_activations = nn.ModuleList()  # learnable activation per layer
        self.dropouts = nn.ModuleList()

        layer_sizes = [input_size] + hidden_sizes

        for in_features, out_features in zip(layer_sizes[:-1], layer_sizes[1:]):
            # Hidden layer
            layer = nn.Linear(in_features, out_features)
            if use_kaiming:
                nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='relu')
            else:
                nn.init.xavier_normal_(layer.weight)
            nn.init.constant_(layer.bias, 0)
            self.hidden_layers.append(layer)

            # Batch normalization
            if use_batchnorm:
                self.batch_norms.append(nn.BatchNorm1d(out_features))

            # Learnable activation for this layer (the base activation module is shared)
            self.learnable_activations.append(
                LearnableActivation(out_features, base_activation)
            )

            # Dropout
            self.dropouts.append(nn.Dropout(dropout_rate))

        # Output layer
        self.output = nn.Linear(hidden_sizes[-1], 1)
        nn.init.xavier_normal_(self.output.weight)
        nn.init.constant_(self.output.bias, 0)

        # Residual connection: linear map straight from the (scaled) input to the output
        self.shortcut = nn.Linear(input_size, 1)
        nn.init.xavier_normal_(self.shortcut.weight)
        nn.init.constant_(self.shortcut.bias, 0)

    def forward(self, x):
        """Return the sum of the hidden-layer path and the linear shortcut (last dimension is 1)."""
        # Feature scaling (applied first)
        x_scaled = self.feature_scaling(x)

        # Main path: Linear -> (BatchNorm) -> learnable activation -> Dropout per layer
        h = x_scaled
        for i in range(self.num_layers):
            h = self.hidden_layers[i](h)
            if self.use_batchnorm:
                h = self.batch_norms[i](h)
            h = self.learnable_activations[i](h)
            h = self.dropouts[i](h)

        main_output = self.output(h)

        # Residual path (uses the scaled input)
        residual = self.shortcut(x_scaled)

        return main_output + residual


# Pick the compute device: CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(f"\n{'='*60}")
print(f"Using device: {device}")
if device.type == 'cuda':
    # Report the name of the first GPU
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"{'='*60}")

In [None]:
import torch.optim as optim
from schedulefree import RAdamScheduleFree, AdamWScheduleFree
from torch.utils.data import Dataset, DataLoader, TensorDataset
import copy

# ---- Fixed settings -------------------------------------------------------
activation = 'mish'
use_batchnorm = False
loss_function = 'mae'
optimizer_name = 'adamw_schedulefree'
beta1, beta2 = 0.9, 0.999
use_gaussian_noise = True
use_smoter = True
clip_value = 3.0
early_stopping_patience = 1000
dropout_rate = 0.0  # Fixed at 0
use_ema = True  # use an Exponential Moving Average (EMA) of the weights
ema_decay = 0.999  # EMA decay rate

# ---- Model and training hyperparameters -----------------------------------
hidden_size = 2048
lr = 6.524599513245179e-05
weight_decay = 1.7808427134495707e-05
batch_size = 32
noise_std = 0.01679418654429153
noise_prob = 0.5849751972790036
smoter_k = 6
smoter_percentage = 0.14018900109301446
epochs = 2500

# ---- Pseudo labeling hyperparameters --------------------------------------
pseudo_label_iterations = 0
pseudo_label_percentile = 20.0

# Single hidden layer of `hidden_size` units
hidden_sizes = [hidden_size]

# ---- Configuration summary (one output line per entry) --------------------
_rule = "="*60
_summary_lines = [
    _rule,
    "K-FOLD TRAINING WITH PSEUDO LABELING",
    _rule,
    f"K-Fold: {n_splits} folds",
    f"Early stopping: patience={early_stopping_patience} epochs (based on RMSE)",
    f"\nHyperparameters:",
    f"  hidden_size: {hidden_size}",
    f"  lr: {lr:.6f}",
    f"  weight_decay: {weight_decay:.6e}",
    f"  batch_size: {batch_size}",
    f"  noise_std: {noise_std:.3f}",
    f"  noise_prob: {noise_prob:.3f}",
    f"  smoter_k: {smoter_k}",
    f"  smoter_percentage: {smoter_percentage:.2f}",
    f"  epochs: {epochs}",
    f"  pseudo_label_iterations: {pseudo_label_iterations}",
    f"  pseudo_label_percentile: {pseudo_label_percentile:.1f}%",
    f"  loss_function: {loss_function}",
    f"  use_ema: {use_ema}, ema_decay: {ema_decay}",
    _rule,
]
print(*_summary_lines, sep="\n")


class EMA:
    """
    Exponential Moving Average (EMA) of a model's trainable parameters.

    Keeps a "shadow" copy of every parameter with ``requires_grad=True`` and
    can temporarily swap those averaged values into the model for evaluation.

    Attributes:
        model: The wrapped model.
        decay: EMA decay factor.
        shadow: Parameter name -> averaged tensor.
        backup: Parameter name -> live weights saved by ``apply_shadow``.
    """
    def __init__(self, model, decay=0.999):
        self.model = model
        self.decay = decay
        self.shadow = {}
        self.backup = {}
        self.register()

    def register(self):
        """Store a copy of the current trainable parameters as the shadow."""
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def update(self):
        """EMA step: shadow = decay * shadow + (1 - decay) * current."""
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                # The arithmetic already allocates a fresh tensor; no extra clone needed.
                self.shadow[name] = self.decay * self.shadow[name] + (1.0 - self.decay) * param.data

    def apply_shadow(self):
        """Inference: write the EMA weights into the model, saving the live weights.

        Values are copied into the existing parameter storage instead of
        re-pointing ``param.data`` at the shadow tensor, so later in-place
        changes to the parameters cannot corrupt the shadow. Calling this
        twice without ``restore`` keeps the first backup.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                param.data.copy_(self.shadow[name])

    def restore(self):
        """Training: put back the weights saved by ``apply_shadow``.

        Does nothing for parameters without a saved backup, so an extra call
        is harmless.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and name in self.backup:
                param.data.copy_(self.backup[name])
        self.backup = {}


# Loss registry: maps the configured loss name to its torch criterion instance.
loss_map = {
    'mse': nn.MSELoss(),
    'mae': nn.L1Loss(),
    'smooth_l1': nn.SmoothL1Loss(),
    'huber': nn.HuberLoss(),
}
# Criterion selected by the `loss_function` setting
criterion = loss_map[loss_function]

# Smooth clipping functions
def smooth_clip(x, clip_val=3.0):
    """Softly limit ``x`` to the open interval (-clip_val, clip_val) using tanh.

    Args:
        x: Scalar or NumPy array.
        clip_val: Saturation level of the soft clip.

    Returns:
        ``clip_val * tanh(x / clip_val)``, same shape as ``x``.
    """
    squashed = np.tanh(x / clip_val)
    return squashed * clip_val

def inverse_smooth_clip(x, clip_val=3.0):
    """Undo the tanh soft clip: ``clip_val * arctanh(x / clip_val)``.

    The ratio ``x / clip_val`` is limited to [-0.9999, 0.9999] before arctanh
    is applied, so inputs at or beyond ±clip_val give a finite result.

    Args:
        x: Scalar or NumPy array of soft-clipped values.
        clip_val: Saturation level that was used for the soft clip.
    """
    ratio = x / clip_val
    bounded = np.clip(ratio, -0.9999, 0.9999)
    return np.arctanh(bounded) * clip_val

# Function to train K-Fold models with pseudo labels
def train_kfold(X_original, y_original, X_pseudo=None, y_pseudo=None):
    """Train one MLP per K-fold split and return the models with their CV metrics.

    For every split produced by the module-level ``kfold`` object the function:
      1. takes the fold's train/validation rows from the original data,
      2. optionally appends the pseudo-labelled rows to the training part only,
      3. optionally augments the training part with SMOTER-style interpolated samples,
      4. fits the preprocessing on the (augmented) training part: Yeo-Johnson on
         columns with |skew| > 0.5, degree-2 polynomial features, RobustScaler on
         X and y, then tanh soft clipping,
      5. trains an ``MLPModel`` with early stopping on the validation RMSE
         measured in original target units.

    Besides its arguments the function reads module-level settings (``kfold``,
    ``n_splits``, ``use_smoter``, ``smoter_k``, ``smoter_percentage``,
    ``clip_value``, ``batch_size``, ``hidden_sizes``, ``dropout_rate``,
    ``activation``, ``use_batchnorm``, ``use_ema``, ``ema_decay``,
    ``optimizer_name``, ``lr``, ``beta1``, ``beta2``, ``weight_decay``,
    ``epochs``, ``use_gaussian_noise``, ``noise_prob``, ``noise_std``,
    ``early_stopping_patience``, ``criterion``, ``device``, ``SEED``).

    Args:
        X_original: Feature table of the original training data; indexed with
            ``.iloc``, so a pandas DataFrame is expected.
        y_original: Targets aligned with ``X_original``; indexed with integer
            index arrays and reshaped, so a NumPy array is expected.
        X_pseudo: Optional pseudo-labelled feature rows (added to training folds only).
        y_pseudo: Targets for ``X_pseudo``.

    Returns:
        Tuple ``(fold_models, fold_metrics, fold_best_epochs)``:
        ``fold_models`` is a list of dicts with keys 'model', 'scaler_X',
        'scaler_y', 'pt', 'high_skew_features' and 'poly';
        ``fold_metrics`` holds each fold's best validation RMSE;
        ``fold_best_epochs`` holds the 1-based epoch at which it occurred.
    """
    fold_models = []
    fold_metrics = []
    fold_best_epochs = []

    # K-Fold Cross Validation Loop
    # IMPORTANT: Split ONLY on original data to prevent test data leakage
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_original)):
        print(f"\nFold {fold + 1}/{n_splits}")

        # Get original data for this fold
        X_train_raw_fold = X_original.iloc[train_idx].copy()
        y_train_raw_fold = y_original[train_idx].copy()
        X_val_raw_fold = X_original.iloc[val_idx].copy()
        y_val_raw_fold = y_original[val_idx].copy()

        # Add pseudo labels ONLY to training fold (not validation!)
        if X_pseudo is not None and len(X_pseudo) > 0:
            X_train_raw_fold = pd.concat([X_train_raw_fold, X_pseudo], ignore_index=True)
            y_train_raw_fold = np.concatenate([y_train_raw_fold, y_pseudo])
            print(f"  Added {len(X_pseudo)} pseudo labels to training (validation remains original)")

        # Apply SMOTER to RAW training data
        if use_smoter:
            original_size = len(X_train_raw_fold)
            # Number of synthetic rows is a fixed fraction of the fold's training size
            n_synthetic = int(original_size * smoter_percentage)

            # One extra neighbour is requested because the first returned
            # neighbour is skipped below
            knn = NearestNeighbors(n_neighbors=smoter_k + 1, metric='euclidean')
            knn.fit(X_train_raw_fold.values)

            X_synthetic = []
            y_synthetic = []

            for _ in range(n_synthetic):
                # Pick a random seed sample from the training fold
                idx = np.random.randint(0, len(X_train_raw_fold))
                sample_X = X_train_raw_fold.values[idx]
                sample_y = y_train_raw_fold[idx]

                distances, indices = knn.kneighbors([sample_X])
                # Drop the first returned neighbour (presumably the query sample itself)
                neighbor_indices = indices[0][1:]
                neighbor_idx = np.random.choice(neighbor_indices)
                neighbor_X = X_train_raw_fold.values[neighbor_idx]
                neighbor_y = y_train_raw_fold[neighbor_idx]

                # Interpolate features and target with the same random factor in [0, 1)
                alpha = np.random.random()
                synthetic_X = sample_X + alpha * (neighbor_X - sample_X)
                synthetic_y = sample_y + alpha * (neighbor_y - sample_y)

                X_synthetic.append(synthetic_X)
                y_synthetic.append(synthetic_y)

            X_synthetic_df = pd.DataFrame(X_synthetic, columns=X_train_raw_fold.columns)
            X_train_augmented_raw = pd.concat([X_train_raw_fold, X_synthetic_df], ignore_index=True)
            y_train_augmented_raw = np.concatenate([y_train_raw_fold, np.array(y_synthetic)])
        else:
            X_train_augmented_raw = X_train_raw_fold
            y_train_augmented_raw = y_train_raw_fold

        # Yeo-Johnson Transformation
        # Skewness is measured on the (augmented) training data of this fold only
        skewness_dict = {}
        for col in X_train_augmented_raw.columns:
            skew_val = stats.skew(X_train_augmented_raw[col])
            skewness_dict[col] = skew_val

        # Only columns with |skew| > 0.5 are transformed
        high_skew_features = [col for col, skew_val in skewness_dict.items() if abs(skew_val) > 0.5]

        if len(high_skew_features) > 0:
            pt = PowerTransformer(method='yeo-johnson', standardize=False)
            X_train_transformed = X_train_augmented_raw.copy()
            X_val_transformed = X_val_raw_fold.copy()
            # Fit on training data, then apply the fitted transform to validation data
            X_train_transformed[high_skew_features] = pt.fit_transform(X_train_augmented_raw[high_skew_features])
            X_val_transformed[high_skew_features] = pt.transform(X_val_raw_fold[high_skew_features])
        else:
            X_train_transformed = X_train_augmented_raw
            X_val_transformed = X_val_raw_fold

        # The target is not power-transformed
        y_train_transformed = y_train_augmented_raw
        y_val_transformed = y_val_raw_fold

        # Polynomial Features
        poly = PolynomialFeatures(degree=2, include_bias=False)
        X_train_expanded = poly.fit_transform(X_train_transformed.values)
        X_val_expanded = poly.transform(X_val_transformed.values)

        # RobustScaling + Smooth Clipping
        scaler_X = RobustScaler()
        scaler_y = RobustScaler()

        X_train_scaled = scaler_X.fit_transform(X_train_expanded)
        X_val_scaled = scaler_X.transform(X_val_expanded)

        y_train_scaled = scaler_y.fit_transform(y_train_transformed.reshape(-1, 1)).flatten()
        y_val_scaled = scaler_y.transform(y_val_transformed.reshape(-1, 1)).flatten()

        # Soft-clip both features and targets with the tanh-based smooth_clip
        X_train_scaled = smooth_clip(X_train_scaled, clip_value)
        X_val_scaled = smooth_clip(X_val_scaled, clip_value)
        y_train_scaled = smooth_clip(y_train_scaled, clip_value)
        y_val_scaled = smooth_clip(y_val_scaled, clip_value)

        # Create DataLoaders
        train_dataset = TensorDataset(torch.tensor(X_train_scaled, dtype=torch.float32),
                                       torch.tensor(y_train_scaled, dtype=torch.float32))
        val_dataset = TensorDataset(torch.tensor(X_val_scaled, dtype=torch.float32),
                                     torch.tensor(y_val_scaled, dtype=torch.float32))

        # Fold-specific generator for the training shuffle
        g = torch.Generator().manual_seed(SEED + fold)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=g)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Initialize Model
        model = MLPModel(
            input_size=X_train_scaled.shape[1],
            hidden_sizes=hidden_sizes,
            dropout_rate=dropout_rate,
            activation=activation,
            use_batchnorm=use_batchnorm
        )
        model = model.to(device)

        # Initialize EMA
        ema = EMA(model, decay=ema_decay) if use_ema else None

        # Initialize Optimizer
        all_params = model.parameters()

        # Schedule-free optimizers are switched between train/eval mode each epoch
        is_schedulefree = optimizer_name.endswith('_schedulefree')
        if optimizer_name == 'adamw_schedulefree':
            optimizer = AdamWScheduleFree(all_params, lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
        elif optimizer_name == 'radam_schedulefree':
            optimizer = RAdamScheduleFree(all_params, lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)
        else:
            optimizer = optim.AdamW(all_params, lr=lr, betas=(beta1, beta2), weight_decay=weight_decay)

        # Training Loop with Early Stopping
        best_val_rmse = float('inf')
        best_val_loss = float('inf')
        best_model_state = None
        best_ema_shadow = None
        best_epoch = 0
        patience_counter = 0

        for epoch in range(epochs):
            if is_schedulefree:
                optimizer.train()

            model.train()

            for X_batch, y_batch in train_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)

                # With probability noise_prob, add Gaussian noise to the whole batch
                if use_gaussian_noise and np.random.rand() < noise_prob:
                    noise = torch.randn_like(X_batch) * noise_std
                    X_batch = X_batch + noise

                optimizer.zero_grad()
                outputs = model(X_batch)

                # Calculate loss
                loss = criterion(outputs.squeeze(-1), y_batch)

                loss.backward()
                optimizer.step()

                # Update EMA after each batch
                if ema is not None:
                    ema.update()

            if is_schedulefree:
                optimizer.eval()

            # Validation with EMA weights
            if ema is not None:
                ema.apply_shadow()  # Apply EMA weights for validation

            model.eval()

            val_predictions = []
            val_targets = []
            val_loss_sum = 0.0
            val_batches = 0

            with torch.no_grad():
                for X_batch, y_batch in val_loader:
                    X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                    outputs = model(X_batch)

                    # Calculate loss
                    batch_loss = criterion(outputs.squeeze(-1), y_batch)

                    val_loss_sum += batch_loss.item()
                    val_batches += 1

                    val_predictions.extend(outputs.squeeze(-1).cpu().numpy())
                    val_targets.extend(y_batch.cpu().numpy())

            if ema is not None:
                ema.restore()  # Restore original weights for training

            # Average of the per-batch losses (in scaled, clipped target space)
            val_loss = val_loss_sum / val_batches
            # Undo the soft clip and the scaling so RMSE is in original target units.
            # NOTE(review): inverse_smooth_clip limits its input to ±0.9999 * clip_value,
            # so model outputs beyond that range are saturated here.
            val_predictions_unclipped = inverse_smooth_clip(np.array(val_predictions), clip_value)
            val_targets_unclipped = inverse_smooth_clip(np.array(val_targets), clip_value)

            val_predictions_original = scaler_y.inverse_transform(val_predictions_unclipped.reshape(-1, 1)).flatten()
            val_targets_original = scaler_y.inverse_transform(val_targets_unclipped.reshape(-1, 1)).flatten()

            val_rmse = np.sqrt(np.mean((val_predictions_original - val_targets_original)**2))

            # Keep a snapshot of the weights (and EMA shadow) at the best validation RMSE
            if val_rmse < best_val_rmse:
                best_val_rmse = val_rmse
                best_val_loss = val_loss
                best_model_state = copy.deepcopy(model.state_dict())
                if ema is not None:
                    best_ema_shadow = copy.deepcopy(ema.shadow)
                best_epoch = epoch + 1
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= early_stopping_patience:
                break

        # Load best model
        # NOTE(review): best_model_state stays None if val_rmse never improved on inf
        # (e.g. NaN on every epoch); load_state_dict would then fail.
        model.load_state_dict(best_model_state)
        if ema is not None and best_ema_shadow is not None:
            ema.shadow = best_ema_shadow
            ema.apply_shadow()  # Apply best EMA weights for inference

        print(f"  Best RMSE: {best_val_rmse:.4f} at epoch {best_epoch}")
        if use_ema:
            print(f"  Using EMA weights (decay={ema_decay})")

        # Store the model together with its fitted preprocessing for ensemble prediction
        fold_models.append({
            'model': model,
            'scaler_X': scaler_X,
            'scaler_y': scaler_y,
            'pt': pt if len(high_skew_features) > 0 else None,
            'high_skew_features': high_skew_features,
            'poly': poly
        })
        fold_metrics.append(best_val_rmse)
        fold_best_epochs.append(best_epoch)

    return fold_models, fold_metrics, fold_best_epochs

# Pseudo labeling iterations
def _predict_test_with_fold(fold_data, X_test):
    """Apply one fold's fitted preprocessing and model to X_test.

    Args:
        fold_data: One entry of the list returned by train_kfold (dict with the
            keys 'model', 'scaler_X', 'scaler_y', 'pt', 'high_skew_features', 'poly').
        X_test: Raw feature DataFrame with the training feature columns.

    Returns:
        1-D NumPy array of predictions in original target units.
    """
    pt = fold_data['pt']
    high_skew_features = fold_data['high_skew_features']

    # Same preprocessing chain as in training: Yeo-Johnson -> polynomial -> scale -> soft clip
    X_test_transformed = X_test.copy()
    if pt is not None and len(high_skew_features) > 0:
        X_test_transformed[high_skew_features] = pt.transform(X_test[high_skew_features])

    X_test_expanded = fold_data['poly'].transform(X_test_transformed.values)
    X_test_scaled = fold_data['scaler_X'].transform(X_test_expanded)
    X_test_scaled = smooth_clip(X_test_scaled, clip_value)

    test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

    model = fold_data['model']
    model.eval()
    with torch.no_grad():
        predictions_scaled = model(test_tensor).squeeze(-1).cpu().numpy()

    # Undo soft clipping and target scaling
    predictions_unclipped = inverse_smooth_clip(predictions_scaled, clip_value)
    return fold_data['scaler_y'].inverse_transform(predictions_unclipped.reshape(-1, 1)).flatten()


X_pseudo_accumulated = None
y_pseudo_accumulated = None
# Marks test rows that already carry a pseudo label. Without it, every iteration
# would re-select from the full test set and append the same rows again,
# duplicating them (with stale labels) in the training data.
pseudo_labeled_mask = np.zeros(len(X_test_raw), dtype=bool)

for iteration in range(pseudo_label_iterations + 1):
    if iteration == 0:
        print(f"\n{'='*60}")
        print(f"INITIAL TRAINING (no pseudo labels)")
        print(f"{'='*60}")
    else:
        print(f"\n{'='*60}")
        print(f"PSEUDO LABELING ITERATION {iteration}/{pseudo_label_iterations}")
        print(f"{'='*60}")

    # Train K-Fold models
    # K-Fold split is ALWAYS done on original data only
    fold_models, fold_metrics, fold_best_epochs = train_kfold(
        X_raw, y_raw, X_pseudo_accumulated, y_pseudo_accumulated
    )

    print(f"\nIteration {iteration} CV Results:")
    for i, (rmse, epoch) in enumerate(zip(fold_metrics, fold_best_epochs)):
        print(f"  Fold {i + 1}: RMSE {rmse:.4f} at epoch {epoch}")
    print(f"  Mean CV RMSE: {np.mean(fold_metrics):.4f} ± {np.std(fold_metrics):.4f}")

    # Pseudo labeling (skip on last iteration)
    if iteration < pseudo_label_iterations:
        print(f"\nGenerating pseudo labels for iteration {iteration + 1}...")

        # Predict on test data with all folds
        all_fold_predictions = [
            _predict_test_with_fold(fold_data, X_test_raw) for fold_data in fold_models
        ]

        # Disagreement between folds (std) is used as the (inverse) confidence measure
        fold_std = np.std(all_fold_predictions, axis=0)
        fold_mean = np.mean(all_fold_predictions, axis=0)

        # Only rows without a pseudo label yet are candidates; on the first
        # iteration this is the whole test set.
        candidate_mask = ~pseudo_labeled_mask
        if candidate_mask.any():
            confidence_threshold = np.percentile(fold_std[candidate_mask], pseudo_label_percentile)
            high_confidence_mask = candidate_mask & (fold_std < confidence_threshold)
        else:
            confidence_threshold = float('nan')
            high_confidence_mask = np.zeros(len(X_test_raw), dtype=bool)
        n_pseudo = high_confidence_mask.sum()

        print(f"  Confidence threshold (std): {confidence_threshold:.4f}")
        print(f"  High-confidence samples: {n_pseudo} ({n_pseudo/len(X_test_raw)*100:.1f}%)")

        if n_pseudo > 0:
            # Add pseudo labels to accumulator
            X_pseudo_new = X_test_raw.iloc[high_confidence_mask].copy()
            y_pseudo_new = fold_mean[high_confidence_mask]
            pseudo_labeled_mask |= high_confidence_mask

            if X_pseudo_accumulated is None:
                X_pseudo_accumulated = X_pseudo_new
                y_pseudo_accumulated = y_pseudo_new
            else:
                X_pseudo_accumulated = pd.concat([X_pseudo_accumulated, X_pseudo_new], ignore_index=True)
                y_pseudo_accumulated = np.concatenate([y_pseudo_accumulated, y_pseudo_new])

            print(f"  Total pseudo labels: {len(X_pseudo_accumulated)}")

# Final CV RMSE (from the last training round)
mean_cv_rmse = np.mean(fold_metrics)
std_cv_rmse = np.std(fold_metrics)

print(f"\n{'='*60}")
print("TRAINING COMPLETE")
print(f"{'='*60}")
print(f"Final Mean CV RMSE: {mean_cv_rmse:.4f} ± {std_cv_rmse:.4f}")
print(f"Validation set: ONLY original data (no test data leakage)")
print(f"{'='*60}")


In [None]:
# ============================================
# K-Fold Ensemble Prediction with Advanced Weighting
# ============================================

print("="*60)
print("K-FOLD ADVANCED ENSEMBLE PREDICTION")
print("="*60)
print(f"Number of models: {len(fold_models)}")
print(f"Weighting: Adaptive per-sample confidence + diversity bonus")
print("="*60)

# ============================================
# Step 1: Calculate global fold weights from validation RMSE
# ============================================
fold_rmse_array = np.array(fold_metrics)
# Inverse of the cubed RMSE (emphasises performance differences between folds)
global_weights = 1.0 / (fold_rmse_array ** 3)
global_weights = global_weights / global_weights.sum()

# The label now matches the exponent actually used above (cube, not square)
print(f"\nGlobal fold weights (based on inverse-cubed validation RMSE):")
for i, (rmse, weight) in enumerate(zip(fold_metrics, global_weights)):
    print(f"  Fold {i + 1}: RMSE={rmse:.4f} → Global Weight={weight:.4f}")

# ============================================
# Step 2: Collect predictions and calculate diversity
# ============================================
all_fold_predictions = []

for fold_idx, fold_data in enumerate(fold_models):
    print(f"\nFold {fold_idx + 1}/{len(fold_models)} predicting...")

    # Extract fold components
    model = fold_data['model']
    scaler_X = fold_data['scaler_X']
    scaler_y = fold_data['scaler_y']
    pt = fold_data['pt']
    high_skew_features = fold_data['high_skew_features']
    poly = fold_data['poly']

    # Apply same preprocessing as training
    X_test_transformed = X_test_raw.copy()
    if pt is not None and len(high_skew_features) > 0:
        X_test_transformed[high_skew_features] = pt.transform(X_test_raw[high_skew_features])

    X_test_expanded = poly.transform(X_test_transformed.values)
    X_test_scaled = scaler_X.transform(X_test_expanded)
    X_test_scaled = smooth_clip(X_test_scaled, clip_value)

    test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

    # Single prediction
    model.eval()
    with torch.no_grad():
        # squeeze(-1) removes only the output dimension, so a one-row test set stays 1-D
        predictions_scaled = model(test_tensor).squeeze(-1).cpu().numpy()
        predictions_unclipped = inverse_smooth_clip(predictions_scaled, clip_value)
        predictions = scaler_y.inverse_transform(predictions_unclipped.reshape(-1, 1)).flatten()

    all_fold_predictions.append(predictions)
    print(f"Fold {fold_idx + 1} complete - Prediction range: [{predictions.min():.2f}, {predictions.max():.2f}]")

all_fold_predictions = np.array(all_fold_predictions)  # shape: (n_folds, n_samples)

# ============================================
# Step 3: Calculate per-sample adaptive weights
# ============================================
print(f"\nCalculating adaptive per-sample weights...")

# Per-sample standard deviation across folds (disagreement between models)
sample_std = np.std(all_fold_predictions, axis=0)  # shape: (n_samples,)

# Per-sample mean across folds; loop-invariant, so computed once
mean_pred = np.mean(all_fold_predictions, axis=0)

# Per-fold, per-sample adaptive weights
adaptive_weights = np.zeros_like(all_fold_predictions)  # shape: (n_folds, n_samples)

for fold_idx in range(len(fold_models)):
    # How far this fold's prediction is from the cross-fold mean, per sample
    deviation = np.abs(all_fold_predictions[fold_idx] - mean_pred)

    # Smaller deviation (closer to consensus) -> larger weight
    consensus_weight = np.exp(-deviation / (np.std(deviation) + 1e-8))

    # Global weight x consensus weight.
    # A per-sample confidence factor exp(-std / mean(std)) used to be multiplied
    # in here as well. It is identical for all folds of a sample, so it cancels
    # in the normalisation below; its only effect was to shrink the weights
    # towards the 1e-8 epsilon (biasing uncertain samples low) and to yield NaN
    # when all folds agree exactly.
    adaptive_weights[fold_idx] = global_weights[fold_idx] * consensus_weight

# Normalise per sample so the weights sum to exactly 1. If every consensus
# weight of a sample underflowed to zero, fall back to the global weights.
weight_totals = adaptive_weights.sum(axis=0, keepdims=True)
has_weight = weight_totals > 0
adaptive_weights = np.where(
    has_weight,
    adaptive_weights / np.where(has_weight, weight_totals, 1.0),
    global_weights[:, None],
)

# ============================================
# Step 4: Apply adaptive weighted ensemble
# ============================================
final_predictions = np.sum(all_fold_predictions * adaptive_weights, axis=0)

# ============================================
# Step 5: Post-processing with diversity consideration
# ============================================
# For samples with very large disagreement (top 1% of std) use the median instead
high_uncertainty_mask = sample_std > np.percentile(sample_std, 99)
median_predictions = np.median(all_fold_predictions, axis=0)

if high_uncertainty_mask.sum() > 0:
    print(f"\nApplying median for high-uncertainty samples: {high_uncertainty_mask.sum()} samples")
    final_predictions[high_uncertainty_mask] = median_predictions[high_uncertainty_mask]

# Calculate statistics
fold_std = sample_std  # same quantity; name kept for the report below
avg_adaptive_weight_std = np.std(adaptive_weights, axis=1).mean()

print(f"\n{'='*60}")
print("ADVANCED ENSEMBLE COMPLETE")
print(f"{'='*60}")
print(f"Final predictions - Min: {final_predictions.min():.2f}, Max: {final_predictions.max():.2f}, Mean: {final_predictions.mean():.2f}")
print(f"Average std across folds: {fold_std.mean():.4f}")
print(f"High confidence samples (std < 1.0): {(sample_std < 1.0).sum()} / {len(sample_std)}")
print(f"Low confidence samples (std > 2.0): {(sample_std > 2.0).sum()} / {len(sample_std)}")
print(f"\nWeighting summary:")
print(f"  Global weight range: [{global_weights.min():.4f}, {global_weights.max():.4f}]")
print(f"  Adaptive weight variation: {avg_adaptive_weight_std:.4f}")
print(f"  Median-adjusted samples: {high_uncertainty_mask.sum()}")
print(f"{'='*60}")

# Prepare submission
# Predictions follow the row order of `test`, so use its ids when it has an id
# column; otherwise fall back to the previously hard-coded id range.
if "id" in test.columns:
    submission_ids = test["id"].to_numpy()
else:
    submission_ids = range(1455, 1455 + len(final_predictions))
submission = pd.DataFrame({"id": submission_ids, "DIC": final_predictions})
submission.to_csv("submission.csv", index=False)
print("\nSubmission saved to submission.csv!")
print(f"CV RMSE estimate: {np.mean(fold_metrics):.4f} ± {np.std(fold_metrics):.4f}")

In [None]:
# score: 3.68794
# name: 坂田煌翔
# student_id: 62408940