# In [None]:
# ==============================
# ULTIMATE CATECHOL PREDICTION SYSTEM
# Advanced Hybrid Ensemble with Neural Networks
# ==============================

import os
import sys
import gc
import copy
import warnings
from pathlib import Path
from typing import Tuple, List, Dict, Optional
from dataclasses import dataclass

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.preprocessing import RobustScaler, QuantileTransformer, OneHotEncoder

# Deep Learning
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Tree models
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

# Silence every warning category process-wide so library chatter does not
# flood the output (this also hides deprecation and convergence warnings).
warnings.filterwarnings('ignore')

# Reproducibility: seed NumPy and PyTorch (CPU, plus every CUDA device when present).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here can only influence child processes, not hashing in this process —
# confirm that is the intent.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Kaggle utils
sys.path.append('/kaggle/input/catechol-benchmark-hackathon/')
from utils import (
    INPUT_LABELS_NUMERIC, load_data, load_features,
    generate_leave_one_out_splits, generate_leave_one_ramp_out_splits
)

# ============================
# CONFIGURATION
# ============================
@dataclass
class Config:
    """Single source of hyper-parameters shared by every model in the pipeline."""

    # --- General ------------------------------------------------------------
    seed: int = 42
    val_size: float = 0.12
    # Resolved once, when the class body is evaluated.
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'

    # --- Feature engineering ------------------------------------------------
    use_robust_scaler: bool = True
    use_quantile_transform: bool = False
    add_poly_features: bool = True
    add_interaction_features: bool = True

    # --- CatBoost -----------------------------------------------------------
    cb_iterations: int = 12000
    cb_lr: float = 0.012
    cb_depth: int = 9
    cb_l2: float = 2.5
    cb_subsample: float = 0.88
    cb_early_stop: int = 250

    # --- XGBoost ------------------------------------------------------------
    xgb_rounds: int = 12000
    xgb_eta: float = 0.02
    xgb_depth: int = 9
    xgb_subsample: float = 0.88
    xgb_colsample: float = 0.78
    xgb_early_stop: int = 250

    # --- LightGBM -----------------------------------------------------------
    lgb_rounds: int = 12000
    lgb_lr: float = 0.015
    lgb_leaves: int = 127
    lgb_depth: int = 10
    lgb_subsample: float = 0.88
    lgb_early_stop: int = 250

    # --- Neural network -----------------------------------------------------
    nn_epochs: int = 800
    nn_batch_size: int = 128
    nn_lr: float = 0.001
    # None is a placeholder; __post_init__ swaps in the default layer widths.
    nn_hidden: List[int] = None
    nn_dropout: float = 0.35
    nn_patience: int = 80

    # --- Ensemble -----------------------------------------------------------
    # 'adaptive' and 'geometric' have dedicated code paths in
    # run_ultimate_pipeline; any other value falls back to a plain mean there.
    ensemble_strategy: str = 'adaptive'
    power_weights: float = 2.5
    nn_weight_boost: float = 1.15  # multiplier applied to the NN's raw weight

    def __post_init__(self):
        """Install the default hidden-layer widths when none were supplied."""
        default_widths = [768, 512, 384, 256, 128]
        self.nn_hidden = default_widths if self.nn_hidden is None else self.nn_hidden


# Module-wide default configuration used as the default argument everywhere.
CFG = Config()

# ============================
# ADVANCED FEATURE ENGINEERING
# ============================
class UltraFeaturizer:
    """Turn raw experiment rows into a scaled float32 feature matrix.

    Each row contributes its numeric inputs (``INPUT_LABELS_NUMERIC``) plus a
    solvent descriptor vector looked up in the table returned by
    ``load_features``.  The scalers are fitted on the data of the *first*
    ``featurize`` call and reused unchanged on every later call.
    """

    def __init__(self, features='spange_descriptors', mixed=False, config=CFG):
        self.config = config
        self.mixed = mixed
        self.features_name = features
        self.featurizer_df = load_features(features)

        # Both scalers get fitted; only ``active_scaler`` is applied.
        self.scalers = {
            'robust': RobustScaler(quantile_range=(3, 97)),
            'quantile': QuantileTransformer(n_quantiles=1000, output_distribution='normal')
        }
        if config.use_robust_scaler:
            self.active_scaler = 'robust'
        else:
            self.active_scaler = 'quantile'

        self._fitted = False
        self._cache = {}
        self._feature_importance = None

    def _descriptor(self, name):
        """Return the descriptor vector for ``name``, memoised in ``_cache``."""
        if name not in self._cache:
            self._cache[name] = self.featurizer_df.loc[name].values
        return self._cache[name]

    def _get_molecular(self, row):
        """Return the descriptor vector for one row.

        Single-solvent rows use the "SOLVENT NAME" lookup directly.  Mixed rows
        blend solvents A and B by "SolventB%" and add a small cross term.
        """
        if not self.mixed:
            return self._descriptor(row["SOLVENT NAME"])

        name_a = row["SOLVENT A NAME"]
        name_b = row["SOLVENT B NAME"]
        # presumably a fraction in [0, 1] — TODO confirm against the dataset
        r = row["SolventB%"]

        A = self._descriptor(name_a)
        B = self._descriptor(name_b)

        linear_blend = A * (1 - r) + B * r
        # Cross term vanishes at r == 0 and r == 1 (pure solvents).
        cross_term = 0.05 * A * B * r * (1 - r)
        return linear_blend + cross_term

    def _create_advanced_features(self, numeric_feat, mol_feat):
        """Concatenate raw inputs with polynomial, interaction and summary columns."""
        n_numeric = numeric_feat.shape[1]
        blocks = [numeric_feat, mol_feat]

        if self.config.add_poly_features and n_numeric > 0:
            # Square and (abs-guarded) square root of every numeric input.
            blocks += [
                numeric_feat ** 2,
                np.sqrt(np.abs(numeric_feat) + 1e-8),
            ]

        if self.config.add_interaction_features and n_numeric >= 2:
            # Pairwise products among the first two (or three) numeric columns.
            pairs = [(0, 1)]
            if n_numeric >= 3:
                pairs += [(0, 2), (1, 2)]
            for left, right in pairs:
                product = numeric_feat[:, left] * numeric_feat[:, right]
                blocks.append(product.reshape(-1, 1))

        # Per-row summary statistics across the descriptor vector.
        blocks.append(np.column_stack([
            mol_feat.mean(axis=1),
            mol_feat.std(axis=1),
            mol_feat.max(axis=1),
            mol_feat.min(axis=1)
        ]))

        return np.concatenate(blocks, axis=1)

    def featurize(self, X: pd.DataFrame, return_torch=False):
        """Build the scaled feature matrix for ``X``.

        Returns a float32 NumPy array, or a float32 tensor when
        ``return_torch`` is true.  The first call fits the scalers.
        """
        numeric = X[INPUT_LABELS_NUMERIC].to_numpy(dtype=np.float32)
        per_row = [self._get_molecular(X.iloc[pos]) for pos in range(len(X))]
        mol = np.vstack(per_row).astype(np.float32)

        matrix = self._create_advanced_features(numeric, mol)
        # Replace NaN / infinities before scaling.
        matrix = np.nan_to_num(matrix, nan=0.0, posinf=1e6, neginf=-1e6)

        if not self._fitted:
            for scaler in self.scalers.values():
                scaler.fit(matrix)
            self._fitted = True

        scaled = self.scalers[self.active_scaler].transform(matrix).astype(np.float32)

        if not return_torch:
            return scaled
        return torch.tensor(scaled, dtype=torch.float32)

# ============================
# NEURAL NETWORK ARCHITECTURE
# ============================
class SEBlock(nn.Module):
    """Squeeze-and-Excitation style gate that rescales each feature.

    The input is passed through a bottleneck MLP ending in a sigmoid, and the
    resulting per-feature gate multiplies the input element-wise.

    Args:
        channels: width of the incoming feature vector.
        reduction: bottleneck ratio; the hidden width is
            ``channels // reduction``, floored at 1.
    """
    def __init__(self, channels, reduction=16):
        super().__init__()
        # Without the floor, channels < reduction would create a zero-width
        # bottleneck and the gate would collapse to a constant.
        bottleneck = max(1, channels // reduction)
        self.fc = nn.Sequential(
            nn.Linear(channels, bottleneck, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(bottleneck, channels, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return ``x`` multiplied by its learned sigmoid gate."""
        return x * self.fc(x)


class ResidualBlock(nn.Module):
    """Width-preserving residual unit: two Linear+LayerNorm layers, SE gate, skip."""
    def __init__(self, dim, dropout=0.3):
        super().__init__()
        transform = [
            nn.Linear(dim, dim),
            nn.LayerNorm(dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(dim, dim),
            nn.LayerNorm(dim),
        ]
        self.block = nn.Sequential(*transform)
        self.se = SEBlock(dim)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.GELU()

    def forward(self, x):
        """Apply transform -> SE gate, add the skip path, then GELU and dropout."""
        gated = self.se(self.block(x))
        return self.dropout(self.activation(x + gated))


class UltimateNeuralNetwork(nn.Module):
    """Residual MLP backbone followed by one small regression head per output.

    Args:
        input_dim: number of input features.
        hidden_dims: backbone widths; defaults to ``[768, 512, 384, 256, 128]``.
        output_dim: number of heads (each emits one column).
        dropout: dropout rate in the backbone; the heads use half of it.
    """
    def __init__(self, input_dim, hidden_dims=None, output_dim=3, dropout=0.35):
        super().__init__()
        widths = [768, 512, 384, 256, 128] if hidden_dims is None else hidden_dims

        def dense_stage(n_in, n_out):
            # Linear -> LayerNorm -> GELU -> Dropout
            return [
                nn.Linear(n_in, n_out),
                nn.LayerNorm(n_out),
                nn.GELU(),
                nn.Dropout(dropout),
            ]

        # Input projection, then one narrowing stage + residual block per step.
        stack = dense_stage(input_dim, widths[0])
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            stack += dense_stage(n_in, n_out)
            stack.append(ResidualBlock(n_out, dropout))
        self.backbone = nn.Sequential(*stack)

        head_in = widths[-1]
        head_mid = head_in // 2
        heads = []
        for _ in range(output_dim):
            heads.append(nn.Sequential(
                nn.Linear(head_in, head_mid),
                nn.GELU(),
                nn.Dropout(dropout * 0.5),
                nn.Linear(head_mid, 1)
            ))
        self.output_heads = nn.ModuleList(heads)

        self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-normal for every Linear; unit scale / zero shift for norm layers."""
        for module in self.modules():
            if isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                nn.init.kaiming_normal_(module.weight, mode='fan_in', nonlinearity='relu')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return a ``(batch, output_dim)`` tensor, one column per head."""
        shared = self.backbone(x)
        return torch.cat([head(shared) for head in self.output_heads], dim=1)

# ============================
# TRAINING UTILITIES
# ============================
def prepare_targets(y_df, smoothing=3e-7):
    """Convert a target frame into a float32 array with at least three columns.

    A single-column frame is treated as class labels: it is one-hot encoded
    (categories inferred from this frame alone) and optionally label-smoothed.
    Any other frame is converted to float32 as-is.  In both cases zero columns
    are appended on the right until the array is three columns wide.
    """
    def pad_to_three(matrix):
        missing = 3 - matrix.shape[1]
        if missing <= 0:
            return matrix
        filler = np.zeros((matrix.shape[0], missing), dtype=np.float32)
        return np.concatenate([matrix, filler], axis=1)

    if y_df.shape[1] != 1:
        values = y_df.to_numpy(dtype=np.float32)
        if values.ndim == 1:
            values = values.reshape(-1, 1)
        return pad_to_three(values)

    labels = y_df.iloc[:, 0].to_numpy().reshape(-1)
    encoder = OneHotEncoder(sparse_output=False, categories='auto')
    onehot = encoder.fit_transform(labels.reshape(-1, 1)).astype(np.float32)

    if smoothing > 0:
        # Move a little mass from the hot class to a uniform distribution.
        n_classes = onehot.shape[1]
        onehot = onehot * (1 - smoothing) + smoothing / n_classes

    return pad_to_three(onehot)


class FocalMSELoss(nn.Module):
    """Squared error re-weighted by its own magnitude raised to ``gamma``.

    Elements with a larger error get a larger weight.  The weight is detached,
    so gradients flow only through the plain squared-error factor.
    """
    def __init__(self, gamma=2.0):
        super().__init__()
        self.gamma = gamma
        self.mse = nn.MSELoss(reduction='none')

    def forward(self, pred, target):
        """Return the mean of ``detach(err) ** gamma * err`` over all elements."""
        per_element = self.mse(pred, target)
        weights = per_element.detach().pow(self.gamma)
        return torch.mean(weights * per_element)


def train_neural_network(model, X_train, y_train, X_val, y_val, config=CFG):
    """Fit ``model`` with AdamW + OneCycleLR and early stopping on validation MSE.

    Args:
        model: network to train; it is moved to ``config.device``.
        X_train, y_train: training features and targets.
        X_val, y_val: hold-out data, used only for early stopping.
        config: source of epochs, batch size, learning rate, patience, device.

    Returns:
        ``(model, best_val_loss)``.  The model carries the weights of the epoch
        with the lowest validation MSE; if no epoch ever improved (e.g. the
        loss was NaN throughout) the last weights are kept and the loss is
        ``inf``.
    """
    device = torch.device(config.device)
    model.to(device)

    # Data preparation
    train_dataset = TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32)
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.nn_batch_size,
        shuffle=True,
        num_workers=0,
        # Pinned host memory only helps host-to-GPU copies; skip it on CPU.
        pin_memory=(device.type == 'cuda')
    )

    X_val_t = torch.tensor(X_val, dtype=torch.float32).to(device)
    y_val_t = torch.tensor(y_val, dtype=torch.float32).to(device)

    optimizer = optim.AdamW(
        model.parameters(),
        lr=config.nn_lr,
        weight_decay=5e-5,
        betas=(0.9, 0.999)
    )

    # The schedule is stepped once per batch, hence steps_per_epoch.
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=config.nn_lr,
        epochs=config.nn_epochs,
        steps_per_epoch=len(train_loader),
        pct_start=0.1,
        anneal_strategy='cos'
    )

    # Training loss is a 70/30 blend; validation uses plain MSE only.
    criterion_mse = nn.MSELoss()
    criterion_focal = FocalMSELoss(gamma=1.5)

    best_val_loss = float('inf')
    best_model_state = None
    patience_counter = 0

    for epoch in range(config.nn_epochs):
        # Training
        model.train()
        train_loss = 0.0

        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device, non_blocking=True)
            batch_y = batch_y.to(device, non_blocking=True)

            optimizer.zero_grad()
            outputs = model(batch_X)

            loss = 0.7 * criterion_mse(outputs, batch_y) + 0.3 * criterion_focal(outputs, batch_y)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            train_loss += loss.item()

        # Validation
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val_t)
            val_loss = criterion_mse(val_outputs, y_val_t).item()

        # Early stopping: snapshot the weights on every improvement.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = copy.deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= config.nn_patience:
            print(f"  Early stop at epoch {epoch}")
            break

        if epoch % 200 == 0:
            print(f"  Epoch {epoch}: Train={train_loss/len(train_loader):.6f}, Val={val_loss:.6f}")

    # Explicit None check: restore the snapshot whenever one was taken.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return model, best_val_loss

# ============================
# TREE MODELS
# ============================
def train_catboost(X_tr, y_tr, X_val, y_val, config=CFG, seed=SEED):
    """Fit one CatBoost regressor per target column with early stopping.

    Args:
        X_tr, y_tr: training features and a 2-D target array.
        X_val, y_val: validation data used as the eval set.
        config: source of the ``cb_*`` hyper-parameters.
        seed: random seed passed to CatBoost.

    Returns:
        ``(models, preds)`` — the fitted regressors (one per target column)
        and their float32 predictions on ``X_val``, shaped like ``y_val``.
    """
    models = []
    preds = np.zeros_like(y_val, dtype=np.float32)

    for k in range(y_tr.shape[1]):
        cb = CatBoostRegressor(
            iterations=config.cb_iterations,
            learning_rate=config.cb_lr,
            depth=config.cb_depth,
            l2_leaf_reg=config.cb_l2,
            # CatBoost rejects `subsample` with the Bayesian bootstrap, so use
            # Bernoulli, which honours config.cb_subsample.  The Bayesian-only
            # `bagging_temperature` is dropped for the same reason.
            bootstrap_type='Bernoulli',
            subsample=config.cb_subsample,
            loss_function='RMSE',
            border_count=254,
            od_type='Iter',
            od_wait=config.cb_early_stop,
            random_seed=seed,
            verbose=False,
            thread_count=-1
        )
        # use_best_model trims the model to the best eval-set iteration.
        cb.fit(X_tr, y_tr[:, k], eval_set=(X_val, y_val[:, k]), use_best_model=True)
        preds[:, k] = cb.predict(X_val)
        models.append(cb)

    return models, preds


def train_xgboost(X_tr, y_tr, X_val, y_val, config=CFG, seed=SEED):
    """Fit one XGBoost booster per target column with early stopping.

    Args:
        X_tr, y_tr: training features and a 2-D target array.
        X_val, y_val: validation data; the early-stopping metric is tracked
            on it because it is the last entry in ``evals``.
        config: source of the ``xgb_*`` hyper-parameters.
        seed: random seed passed to XGBoost.

    Returns:
        ``(models, preds)`` — the boosters (one per target column) and their
        float32 validation predictions taken at the best iteration.
    """
    # Identical for every target, so build it once.
    params = {
        "objective": "reg:squarederror",
        "tree_method": "hist",
        "device": "cpu",
        "eta": config.xgb_eta,
        "max_depth": config.xgb_depth,
        "subsample": config.xgb_subsample,
        "colsample_bytree": config.xgb_colsample,
        "gamma": 0.03,
        "reg_alpha": 0.03,
        "reg_lambda": 2.0,
        "min_child_weight": 2,
        "max_bin": 256,
        "seed": seed,
        "verbosity": 0
    }

    models = []
    preds = np.zeros_like(y_val, dtype=np.float32)

    for k in range(y_tr.shape[1]):
        dtrain = xgb.DMatrix(X_tr, label=y_tr[:, k])
        dval = xgb.DMatrix(X_val, label=y_val[:, k])

        bst = xgb.train(
            dict(params), dtrain,
            num_boost_round=config.xgb_rounds,
            evals=[(dtrain, "train"), (dval, "valid")],
            early_stopping_rounds=config.xgb_early_stop,
            verbose_eval=False
        )
        # The returned booster keeps the rounds trained after the best one.
        # Pin prediction to the best iteration, as the LightGBM trainer does.
        preds[:, k] = bst.predict(dval, iteration_range=(0, bst.best_iteration + 1))
        models.append(bst)

    return models, preds


def train_lightgbm(X_tr, y_tr, X_val, y_val, config=CFG, seed=SEED):
    """Fit one LightGBM booster per target column with early stopping.

    Returns ``(models, preds)``: the boosters and their float32 predictions on
    ``X_val`` taken at each booster's best iteration.
    """
    booster_params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting": "gbdt",
        "learning_rate": config.lgb_lr,
        "num_leaves": config.lgb_leaves,
        "max_depth": config.lgb_depth,
        "bagging_fraction": config.lgb_subsample,
        "bagging_freq": 5,
        "feature_fraction": 0.78,
        "lambda_l1": 0.03,
        "lambda_l2": 2.0,
        "min_data_in_leaf": 15,
        "max_bin": 255,
        "seed": seed,
        "verbosity": -1,
        "device": "cpu"
    }

    boosters = []
    val_preds = np.zeros_like(y_val, dtype=np.float32)
    n_targets = y_tr.shape[1]

    for col in range(n_targets):
        dtrain = lgb.Dataset(X_tr, label=y_tr[:, col])
        dvalid = lgb.Dataset(X_val, label=y_val[:, col], reference=dtrain)

        stop_callback = lgb.early_stopping(config.lgb_early_stop)
        quiet_callback = lgb.log_evaluation(0)

        # A fresh dict per target keeps each train() call independent.
        booster = lgb.train(
            dict(booster_params), dtrain,
            num_boost_round=config.lgb_rounds,
            valid_sets=[dvalid],
            callbacks=[stop_callback, quiet_callback]
        )

        val_preds[:, col] = booster.predict(X_val, num_iteration=booster.best_iteration)
        boosters.append(booster)

    return boosters, val_preds

# ============================
# PREDICTION FUNCTIONS
# ============================
def predict_models(models_dict, X, model_type):
    """Predict with a trained model family and return a 2-D NumPy array.

    Args:
        models_dict: for 'catboost' / 'xgboost' / 'lightgbm', an iterable of
            per-target models (despite the name, not a dict); for 'neural',
            a single ``nn.Module``.
        X: feature matrix.
        model_type: one of 'catboost', 'xgboost', 'lightgbm', 'neural'.

    Returns:
        Array with one column per target.

    Raises:
        ValueError: if ``model_type`` is not recognised.
    """
    if model_type == 'catboost':
        return np.column_stack([m.predict(X) for m in models_dict])

    if model_type == 'xgboost':
        dm = xgb.DMatrix(X)
        columns = []
        for booster in models_dict:
            # Use the early-stopped best iteration when the booster has one,
            # so test-time predictions match the scored validation model.
            best = getattr(booster, 'best_iteration', None)
            if best is None:
                columns.append(booster.predict(dm))
            else:
                columns.append(booster.predict(dm, iteration_range=(0, best + 1)))
        return np.column_stack(columns)

    if model_type == 'lightgbm':
        return np.column_stack([m.predict(X, num_iteration=m.best_iteration) for m in models_dict])

    if model_type == 'neural':
        # Send the inputs to wherever the model lives instead of assuming a
        # global device setting.
        first_param = next(models_dict.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
        models_dict.eval()
        with torch.no_grad():
            X_t = torch.tensor(X, dtype=torch.float32).to(device)
            return models_dict(X_t).cpu().numpy()

    raise ValueError(f"Unknown model type: {model_type}")

# ============================
# ADVANCED ENSEMBLE
# ============================
def adaptive_ensemble(predictions, val_scores, config=CFG):
    """Blend predictions with weights proportional to inverse validation error.

    Each weight starts as ``(1 / score) ** config.power_weights``.  When
    exactly four models are given, the last one (the neural network in the
    CB, XGB, LGB, NN ordering) is additionally multiplied by
    ``config.nn_weight_boost``.  Weights are then normalised to sum to 1.

    Returns:
        ``(ensemble, weights)``.
    """
    raw = np.array([1.0 / (score + 1e-9) for score in val_scores])
    raw = raw ** config.power_weights

    if len(raw) == 4:
        raw[-1] *= config.nn_weight_boost

    weights = raw / raw.sum()

    # Accumulate in place so the result keeps the dtype of the first prediction.
    blended = np.zeros_like(predictions[0])
    for model_pred, weight in zip(predictions, weights):
        np.add(blended, model_pred * weight, out=blended)

    return blended, weights


def geometric_ensemble(predictions, epsilon=1e-9):
    """Return the element-wise geometric mean of the predictions.

    Every array is first clipped to ``[epsilon, 1.0]``, so values above 1 are
    capped and non-positive values are raised to ``epsilon``.
    """
    clipped = [np.clip(arr, epsilon, 1.0) for arr in predictions]
    running = np.ones_like(clipped[0])
    for arr in clipped:
        np.multiply(running, arr, out=running)
    exponent = 1.0 / len(clipped)
    return np.power(running, exponent)

# ============================
# MAIN PIPELINE
# ============================
def run_ultimate_pipeline(data_key, split_gen, featurizer_kwargs, task_id):
    """Train the four-model ensemble on every split and collect test predictions.

    For each split: featurize (scalers fitted on the training part), carve a
    validation set from the training data, train CatBoost / XGBoost /
    LightGBM / the neural network, derive ensemble weights on validation, and
    predict the held-out test rows.

    Args:
        data_key: dataset label, used only in the banner.
        split_gen: iterable of ``((train_X, train_Y), (test_X, test_Y))``.
        featurizer_kwargs: keyword arguments forwarded to ``UltraFeaturizer``.
        task_id: task number, used only in the banner.

    Returns:
        List of ``(fold_idx, test_row_index, test_predictions)`` tuples.
    """
    print(f"\n{'='*70}")
    print(f"TASK {task_id}: {data_key.upper()}")
    print('='*70)

    all_preds = []
    fold_stats = []

    for fold_idx, split in enumerate(split_gen):
        print(f"\n--- Fold {fold_idx} ---")
        # Test targets are never used here; only the test features are needed.
        (train_X, train_Y), (test_X, _test_Y) = split

        # A fresh featurizer per fold: the first call fits the scalers on the
        # training rows, the second reuses them for the test rows.
        featurizer = UltraFeaturizer(**featurizer_kwargs)
        X_train = featurizer.featurize(train_X)
        X_test = featurizer.featurize(test_X)
        print(f"Features: {X_train.shape[1]}")

        y_train = prepare_targets(train_Y)

        # Train/Val split
        X_tr, X_val, y_tr, y_val = train_test_split(
            X_train, y_train,
            test_size=CFG.val_size,
            random_state=SEED + fold_idx,
            shuffle=True
        )

        print(f"Train: {X_tr.shape[0]}, Val: {X_val.shape[0]}, Test: {X_test.shape[0]}")

        # Train all models
        print("[1/4] CatBoost...")
        cb_models, cb_val = train_catboost(X_tr, y_tr, X_val, y_val, seed=SEED+fold_idx)
        cb_mse = mean_squared_error(y_val, cb_val)
        print(f"  MSE: {cb_mse:.7f}")

        print("[2/4] XGBoost...")
        xgb_models, xgb_val = train_xgboost(X_tr, y_tr, X_val, y_val, seed=SEED+fold_idx)
        xgb_mse = mean_squared_error(y_val, xgb_val)
        print(f"  MSE: {xgb_mse:.7f}")

        print("[3/4] LightGBM...")
        lgb_models, lgb_val = train_lightgbm(X_tr, y_tr, X_val, y_val, seed=SEED+fold_idx)
        lgb_mse = mean_squared_error(y_val, lgb_val)
        print(f"  MSE: {lgb_mse:.7f}")

        print("[4/4] Neural Network...")
        nn_model = UltimateNeuralNetwork(
            input_dim=X_tr.shape[1],
            hidden_dims=CFG.nn_hidden,
            dropout=CFG.nn_dropout
        )
        nn_model, nn_mse = train_neural_network(nn_model, X_tr, y_tr, X_val, y_val)
        nn_val = predict_models(nn_model, X_val, 'neural')
        print(f"  MSE: {nn_mse:.7f}")

        # Ensemble on validation
        val_scores = [cb_mse, xgb_mse, lgb_mse, nn_mse]
        val_preds = [cb_val, xgb_val, lgb_val, nn_val]
        use_geometric = CFG.ensemble_strategy == 'geometric'

        if CFG.ensemble_strategy == 'adaptive':
            val_ens, weights = adaptive_ensemble(val_preds, val_scores)
        elif use_geometric:
            val_ens = geometric_ensemble(val_preds)
            weights = [0.25] * 4
        else:  # equal
            val_ens = np.mean(val_preds, axis=0)
            weights = [0.25] * 4

        ens_mse = mean_squared_error(y_val, val_ens)
        print(f"\nEnsemble MSE: {ens_mse:.7f}")
        print(f"Weights: CB={weights[0]:.4f}, XGB={weights[1]:.4f}, LGB={weights[2]:.4f}, NN={weights[3]:.4f}")

        fold_stats.append({
            'fold': fold_idx,
            'cb_mse': cb_mse,
            'xgb_mse': xgb_mse,
            'lgb_mse': lgb_mse,
            'nn_mse': nn_mse,
            'ens_mse': ens_mse
        })

        # Test predictions
        test_preds = [
            predict_models(cb_models, X_test, 'catboost'),
            predict_models(xgb_models, X_test, 'xgboost'),
            predict_models(lgb_models, X_test, 'lightgbm'),
            predict_models(nn_model, X_test, 'neural'),
        ]

        # Combine the test predictions the same way validation was scored, so
        # the reported ensemble MSE describes what is actually submitted.
        if use_geometric:
            test_ens = geometric_ensemble(test_preds)
        else:
            test_ens = np.zeros_like(test_preds[0])
            for pred, w in zip(test_preds, weights):
                test_ens += pred * w

        # Clip to positive values and rescale every row to sum to 1.
        # NOTE(review): this assumes the three targets form a composition —
        # confirm against the task definition.
        test_ens = np.clip(test_ens, 1e-9, None)
        test_ens = test_ens / test_ens.sum(axis=1, keepdims=True)

        all_preds.append((fold_idx, test_X.index.to_numpy(), test_ens))

        # Free the models before the next fold.
        del cb_models, xgb_models, lgb_models, nn_model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Summary statistics
    print(f"\n{'='*70}")
    print("VALIDATION SUMMARY")
    df_stats = pd.DataFrame(fold_stats)
    print(df_stats.to_string(index=False))
    print(f"\nEnsemble: {df_stats['ens_mse'].mean():.7f} ± {df_stats['ens_mse'].std():.7f}")
    print('='*70)

    return all_preds


def assemble_submission(preds_list, task_id):
    """Flatten per-fold predictions into one row per predicted sample.

    ``preds_list`` holds ``(fold_idx, indices, preds)`` tuples.  The "row"
    column is the position of the sample within its fold; the values in
    ``indices`` only determine how many rows each fold contributes.
    """
    records = [
        {
            "task": int(task_id),
            "fold": int(fold_idx),
            "row": int(position),
            "target_1": float(preds[position][0]),
            "target_2": float(preds[position][1]),
            "target_3": float(preds[position][2])
        }
        for fold_idx, indices, preds in preds_list
        for position, _ in enumerate(indices)
    ]
    return pd.DataFrame(records)

# ============================
# MAIN EXECUTION
# ============================
def main():
    """Run both tasks end to end, write ``submission.csv`` and return the frame.

    Task 0 uses the single-solvent data with leave-one-out splits; task 1 uses
    the full data with leave-one-ramp-out splits.  The two prediction frames
    are concatenated, the target columns are clipped and renormalised so each
    row sums to 1, the result is sanity-checked, saved and summarised.
    """
    rule = "=" * 70
    target_cols = ['target_1', 'target_2', 'target_3']

    def banner(*lines):
        # Section header: blank line + rule, the title line(s), closing rule.
        print("\n" + rule)
        for line in lines:
            print(line)
        print(rule)

    banner("ULTIMATE CATECHOL PREDICTION SYSTEM",
           "Hybrid Ensemble: Trees + Neural Networks")
    architecture = ' → '.join(str(width) for width in CFG.nn_hidden)
    print(f"\nDevice: {CFG.device}")
    print(f"Ensemble: {CFG.ensemble_strategy}")
    print(f"Neural Network: {architecture} → 3")

    # (task id, banner title, dataset key, split generator, mixed-solvent flag)
    task_specs = [
        (0, "STARTING TASK 0: SINGLE SOLVENT", "single_solvent",
         generate_leave_one_out_splits, False),
        (1, "STARTING TASK 1: MIXED SOLVENTS", "full",
         generate_leave_one_ramp_out_splits, True),
    ]

    parts = []
    for task_id, title, data_key, make_splits, is_mixed in task_specs:
        banner(title)
        X_task, Y_task = load_data(data_key)
        task_preds = run_ultimate_pipeline(
            data_key=data_key,
            split_gen=make_splits(X_task, Y_task),
            featurizer_kwargs={
                'features': 'spange_descriptors',
                'mixed': is_mixed,
                'config': CFG
            },
            task_id=task_id
        )
        task_frame = assemble_submission(task_preds, task_id=task_id)
        print(f"\n✓ Task {task_id} complete: {len(task_frame)} rows")
        parts.append(task_frame)

    banner("FINALIZING SUBMISSION")

    submission = pd.concat(parts, ignore_index=True).reset_index(drop=True)
    submission.index.name = "id"
    submission['id'] = submission.index
    submission['index'] = submission.index

    # Put the bookkeeping columns first, then the targets.
    submission = submission[['id', 'index', 'task', 'fold', 'row'] + target_cols]

    # Two clip-and-renormalise passes: the second clip re-bounds anything the
    # first rescale moved, then rows are rescaled to sum to 1 again.
    probs = submission[target_cols].to_numpy(dtype=np.float64)
    for _ in range(2):
        probs = np.clip(probs, 1e-10, 1.0)
        probs = probs / probs.sum(axis=1, keepdims=True)

    submission[target_cols] = probs

    # Report on the final frame.
    row_sums = probs.sum(axis=1)
    task_column = submission['task']
    n_task0 = (task_column == 0).sum()
    n_task1 = (task_column == 1).sum()
    print(f"\nSubmission Validation:")
    print(f"  Total rows: {len(submission):,}")
    print(f"  Tasks: {task_column.nunique()} (0: {n_task0}, 1: {n_task1})")
    print(f"  Folds: {submission['fold'].nunique()}")
    print(f"  Probability range:")
    for position, col_name in enumerate(target_cols):
        column = probs[:, position]
        print(f"    {col_name}: [{column.min():.8f}, {column.max():.8f}]")
    print(f"  Row sums: [{row_sums.min():.10f}, {row_sums.max():.10f}]")
    print(f"  Max deviation from 1.0: {np.abs(row_sums - 1.0).max():.2e}")

    # Hard checks before anything is written to disk.
    assert len(submission) > 0, "Empty submission!"
    assert np.all(probs >= 0), "Negative probabilities detected!"
    assert np.allclose(row_sums, 1.0, atol=1e-6), "Row sums don't equal 1.0!"
    assert np.all(np.isfinite(probs)), "Non-finite values detected!"

    submission.to_csv("submission.csv", index=False)
    print(f"\n✓ Submission saved: submission.csv")

    banner("SAMPLE PREDICTIONS")
    print(submission.head(10).to_string(index=False))

    banner("STATISTICS")
    print(submission[target_cols].describe())

    # How often each target column holds the row maximum.
    banner("PREDICTED CLASS DISTRIBUTION")
    predicted_classes = np.argmax(probs, axis=1)
    n_rows = len(predicted_classes)
    for class_id in range(3):
        count = (predicted_classes == class_id).sum()
        pct = count / n_rows * 100
        print(f"  Class {class_id+1}: {count:,} ({pct:.2f}%)")

    print("\n" + rule)
    print("SUCCESS! Pipeline completed successfully.")
    print(rule + "\n")

    return submission


# Script entry point: run the whole pipeline when executed directly and keep
# the resulting frame in a module-level name for later inspection.
if __name__ == "__main__":
    submission = main()