# Experiment 050: CatBoost + XGBoost Ensemble (FIXED CV Scheme)

**Goal:** Fix the submission bug from exp_049 by using the CORRECT CV scheme:
- Single solvent: Leave-one-solvent-out (24 folds) ✓
- Full data: Leave-one-solvent-PAIR-out (13 folds) — this is the part exp_049 got wrong (✗): it used 87 RAMP NUM folds instead

**The Bug:** exp_049 used RAMP NUM (87 folds) instead of solvent PAIRS (13 folds) for full data CV.

**The Fix:** Use the official `generate_leave_one_ramp_out_splits` function which splits by solvent PAIRS.

In [1]:
import sys
import numpy as np
import pandas as pd
import torch
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from abc import ABC, abstractmethod
import tqdm

# Column-name constants shared by the loaders, CV splitters and featurizer.
# Process-condition columns present in both datasets.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Solvent-describing columns of each dataset.
INPUT_LABELS_SINGLE_FEATURES = ["SOLVENT NAME"]
INPUT_LABELS_FULL_FEATURES = ["SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
# Complete input column lists: process conditions followed by solvent columns.
INPUT_LABELS_SINGLE_SOLVENT = INPUT_LABELS_NUMERIC + INPUT_LABELS_SINGLE_FEATURES
INPUT_LABELS_FULL_SOLVENT = INPUT_LABELS_NUMERIC + INPUT_LABELS_FULL_FEATURES
# Columns the models predict.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

# Data loading functions read every CSV from this directory.
DATA_PATH = "/home/data"

def load_data_local(name="full"):
    """Read one yield dataset and split it into inputs and targets.

    ``name == "full"`` selects the mixture file and the full-solvent input
    columns; any other value selects the single-solvent file and its input
    columns.

    Returns:
        ``(X, Y)`` where ``X`` holds the input columns and ``Y`` the
        ``TARGET_LABELS`` columns of the same CSV.
    """
    use_full = name == "full"
    csv_name = (
        "catechol_full_data_yields.csv"
        if use_full
        else "catechol_single_solvent_yields.csv"
    )
    input_cols = INPUT_LABELS_FULL_SOLVENT if use_full else INPUT_LABELS_SINGLE_SOLVENT

    frame = pd.read_csv(f'{DATA_PATH}/{csv_name}')
    return frame[input_cols], frame[TARGET_LABELS]

def load_features_local(name="spange_descriptors"):
    """Read ``<DATA_PATH>/<name>_lookup.csv`` using its first column as the index."""
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

# OFFICIAL CV FUNCTIONS (from utils.py)
def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per distinct solvent, in sorted name order.

    Each item is ``((X_train, Y_train), (X_test, Y_test))`` where the test part
    holds every row whose ``SOLVENT NAME`` equals the held-out solvent and the
    train part holds all remaining rows.
    """
    solvent_col = X["SOLVENT NAME"]
    for held_out in sorted(solvent_col.unique()):
        is_test = solvent_col == held_out
        is_train = ~is_test
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct solvent PAIR, in sorted order.

    CRITICAL: folds are defined by the (SOLVENT A NAME, SOLVENT B NAME)
    combination, NOT by RAMP NUM.

    Each item is ``((X_train, Y_train), (X_test, Y_test))``; the test part
    holds the rows matching the held-out pair on both columns, the train part
    holds every row that differs in at least one of them.
    """
    pair_cols = ["SOLVENT A NAME", "SOLVENT B NAME"]
    unique_pairs = X[pair_cols].drop_duplicates().sort_values(by=pair_cols)

    for name_a, name_b in unique_pairs.itertuples(index=False, name=None):
        # A row stays in training as soon as either solvent differs.
        is_train = (X["SOLVENT A NAME"] != name_a) | (X["SOLVENT B NAME"] != name_b)
        is_test = ~is_train
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

# Read both datasets once; the CV and submission loops below reuse them.
X_full, Y_full = load_data_local("full")
X_single, Y_single = load_data_local("single_solvent")

print(f"Single solvent: X={X_single.shape}, Y={Y_single.shape}")
print(f"Full data: X={X_full.shape}, Y={Y_full.shape}")

# Each distinct solvent / solvent pair becomes one CV fold.
single_solvent_names = X_single['SOLVENT NAME'].unique()
solvent_pairs = X_full[['SOLVENT A NAME', 'SOLVENT B NAME']].drop_duplicates()
print(f"\nSingle solvent folds: {len(single_solvent_names)}")
print(f"Full data folds (solvent PAIRS): {len(solvent_pairs)}")

Single solvent: X=(656, 3), Y=(656, 3)
Full data: X=(1227, 5), Y=(1227, 3)

Single solvent folds: 24
Full data folds (solvent PAIRS): 13


In [2]:
# Feature engineering functions from ens-model kernel

def feature_priority(name: str) -> int:
    """Rank a feature column by its source prefix; higher means keep first.

    Names with none of the known prefixes get 0.
    """
    ranked_prefixes = (
        ("spange_", 5),
        ("acs_", 4),
        ("drfps_", 3),
        ("frag_", 2),
        ("smiles_", 1),
    )
    for prefix, score in ranked_prefixes:
        if name.startswith(prefix):
            return score
    return 0

def filter_correlated_features(df: pd.DataFrame, threshold: float = 0.90):
    """Remove constant and highly correlated numeric columns from *df*.

    Non-numeric columns are never dropped. For every pair of numeric columns
    whose absolute correlation exceeds *threshold*, the column with the lower
    ``feature_priority`` is dropped; on equal priority the column positioned
    later in *df* is dropped. Pairs involving an already-dropped column are
    skipped.

    Returns:
        ``(filtered_df, dropped_columns)``. When *df* has no numeric columns it
        is returned unchanged together with an empty list.
    """
    numeric = df.select_dtypes(include=[np.number])
    if numeric.shape[1] == 0:
        return df, []

    # Zero-spread columns are removed before the correlation step.
    spread = numeric.std(axis=0)
    constant_cols = spread[spread == 0].index.tolist()
    if constant_cols:
        numeric = numeric.drop(columns=constant_cols)

    abs_corr = numeric.corr().abs()
    strict_upper = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(strict_upper).fillna(0.0)
    names = upper.columns.tolist()

    def original_position(col):
        return df.columns.get_loc(col) if col in df.columns else 999

    to_drop = set()
    # argwhere reports hits row by row, i.e. in the same (i, j) order a nested
    # loop over the strict upper triangle would visit them.
    for i, j in np.argwhere((upper.values > threshold) & strict_upper):
        first, second = names[i], names[j]
        if first in to_drop or second in to_drop:
            continue
        rank_first = feature_priority(first)
        rank_second = feature_priority(second)
        if rank_first > rank_second:
            to_drop.add(second)
        elif rank_second > rank_first:
            to_drop.add(first)
        else:
            pos_first = original_position(first)
            pos_second = original_position(second)
            to_drop.add(first if pos_first > pos_second else second)

    all_to_drop = list(set(constant_cols).union(to_drop))
    return df.drop(columns=all_to_drop, errors="ignore"), all_to_drop

def add_numeric_features(X_numeric: pd.DataFrame, rt_mean: "float | None" = None) -> pd.DataFrame:
    """Return a copy of *X_numeric* with engineered process-condition features.

    When both ``Temperature`` and ``Residence Time`` are present, the copy has
    ``Temperature`` shifted by 273.15 (Celsius to Kelvin) and gains four
    columns computed from the shifted temperature ``T`` and residence time
    ``rt``: ``T_x_RT`` (product), ``RT_log`` (``log(rt + 1e-6)``), ``T_inv``
    (``1 / T``) and ``RT_scaled`` (``rt`` divided by a reference mean).
    If either column is missing the copy is returned without changes.

    Args:
        X_numeric: numeric feature frame; it is not modified.
        rt_mean: reference mean used for ``RT_scaled``. ``None`` (the default)
            keeps the historical behaviour of dividing by the mean residence
            time of the batch passed in. That makes the same residence time map
            to different ``RT_scaled`` values in differently composed batches
            (for example a training fold versus its held-out fold); pass the
            training mean to keep the feature consistent across batches.

    Returns:
        A new DataFrame with the original columns followed by the new ones.
    """
    X_num = X_numeric.copy()

    if not {"Temperature", "Residence Time"} <= set(X_num.columns):
        return X_num

    # Convert Temperature to Kelvin so the inverse-temperature term is well defined.
    X_num["Temperature"] = X_num["Temperature"] + 273.15

    T = X_num["Temperature"]
    rt = X_num["Residence Time"]

    X_num["T_x_RT"] = T * rt
    # The small offset keeps the logarithm finite for a zero residence time.
    X_num["RT_log"] = np.log(rt + 1e-6)
    X_num["T_inv"] = 1 / T

    scale = rt.mean() if rt_mean is None else rt_mean
    X_num["RT_scaled"] = rt / scale

    return X_num

print("Feature engineering functions defined.")

Feature engineering functions defined.


In [3]:
# Build combined solvent feature table

def build_solvent_feature_table(threshold: float = 0.90):
    """Assemble one per-solvent feature table from all descriptor lookups.

    Every lookup is loaded, keyed by a ``SOLVENT NAME`` column, has its feature
    columns prefixed by source, and is outer-merged with the others on
    ``SOLVENT NAME``. The two fingerprint sources are additionally stripped of
    all-zero columns, all-one columns and columns whose values sum to 1. The
    merged table is finally passed through ``filter_correlated_features``.

    Args:
        threshold: absolute-correlation cut-off forwarded to the filter.

    Returns:
        The merged table after correlation filtering.
    """
    # lookup-file stem -> prefix stamped on that source's feature columns
    source_prefixes = {
        "spange_descriptors": "spange",
        "acs_pca_descriptors": "acs",
        "drfps_catechol": "drfps",
        "fragprints": "frag",
        "smiles": "smiles",
    }
    fingerprint_sources = ("drfps_catechol", "fragprints")

    tables = []
    for src, prefix in source_prefixes.items():
        table = load_features_local(src).copy()

        # The loader puts the lookup's first column in the index; expose it as a column.
        if "SOLVENT NAME" not in table.columns:
            table = table.reset_index().rename(columns={"index": "SOLVENT NAME"})

        if src in fingerprint_sources:
            # Discard bits that are 0 everywhere or 1 everywhere.
            table = table.loc[:, (table != 0).any(axis=0)]
            table = table.loc[:, (table != 1).any(axis=0)]

            # Discard bits whose column total is exactly 1.
            bit_totals = table.drop(columns=["SOLVENT NAME"], errors="ignore").sum(axis=0)
            rare_bits = bit_totals[bit_totals == 1].index
            table = table.drop(columns=rare_bits, errors="ignore")

        renames = {c: f"{prefix}_{c}" for c in table.columns if c != "SOLVENT NAME"}
        tables.append(table.rename(columns=renames))

    # Outer-merge so a solvent missing from one source is still kept.
    merged = tables[0]
    for table in tables[1:]:
        merged = pd.merge(merged, table, on="SOLVENT NAME", how="outer")

    print(f"Combined features before filtering: {merged.shape}")

    merged_filtered, dropped = filter_correlated_features(merged, threshold=threshold)

    print(f"Combined features after filtering: {merged_filtered.shape}")
    print(f"Dropped {len(dropped)} columns")

    return merged_filtered

# Module-level table shared by every featurizer created below.
solvent_table = build_solvent_feature_table(threshold=0.90)
print(f"\nFinal solvent table shape: {solvent_table.shape}")

Combined features before filtering: (26, 115)
Combined features after filtering: (26, 68)
Dropped 47 columns

Final solvent table shape: (26, 68)


In [4]:
# Create featurizer class

class CombinedFeaturizer:
    """Turn raw experiment rows into a numeric feature tensor.

    Rows are joined with the per-solvent feature table (once for ``'single'``
    data, once per solvent column for mixture data), reduced to their numeric
    columns, extended by ``add_numeric_features`` and optionally standardized.

    Args:
        solvent_table: DataFrame with a ``SOLVENT NAME`` column plus feature columns.
        data: ``'single'`` for single-solvent rows; any other value selects the
            mixture layout with ``SOLVENT A NAME`` / ``SOLVENT B NAME`` columns.
    """

    def __init__(self, solvent_table, data='single'):
        self.solvent_table = solvent_table
        self.data_mode = data
        self.scaler = None        # StandardScaler, created on featurize(fit_scaler=True)
        self.feature_cols = None  # feature column names recorded on the first featurize call

    def _solvent_table_for(self, key_col, suffix):
        """Return the solvent table keyed by *key_col* with *suffix* appended to every feature column.

        Columns are renamed by name, so the result is correct wherever
        ``SOLVENT NAME`` sits in the table.
        """
        return self.solvent_table.rename(
            columns=lambda c: key_col if c == 'SOLVENT NAME' else f'{c}{suffix}'
        )

    def featurize(self, X, fit_scaler=False):
        """Convert input DataFrame to feature matrix.

        Args:
            X: rows holding the process conditions and the solvent name column(s).
            fit_scaler: when True a new ``StandardScaler`` is fitted on this
                batch and applied; when False a previously fitted scaler is
                applied, or the features are returned unscaled if none exists.

        Returns:
            A ``torch.double`` tensor with one row per input row; NaNs
            (for example from a solvent missing in the table) are replaced by 0.
        """
        X = X.copy()

        if self.data_mode == 'single':
            key_cols = ['SOLVENT NAME']
            X_merged = X.merge(self.solvent_table, on='SOLVENT NAME', how='left')
        else:
            # Mixture rows get one copy of the solvent features per component.
            key_cols = ['SOLVENT A NAME', 'SOLVENT B NAME']
            X_merged = X.merge(
                self._solvent_table_for('SOLVENT A NAME', '_A'),
                on='SOLVENT A NAME', how='left',
            )
            X_merged = X_merged.merge(
                self._solvent_table_for('SOLVENT B NAME', '_B'),
                on='SOLVENT B NAME', how='left',
            )

        # Keep only plain float/int columns; name columns and other text are excluded.
        numeric_cols = [
            c for c in X_merged.columns
            if c not in key_cols
            and X_merged[c].dtype in [np.float64, np.int64, np.float32, np.int32]
        ]
        X_numeric = X_merged[numeric_cols].copy()

        # NOTE(review): add_numeric_features divides Residence Time by the mean
        # of the batch it is given, so RT_scaled is derived from a different
        # constant for a training batch than for a held-out batch.
        X_numeric = add_numeric_features(X_numeric)

        # Remember the feature layout of the first batch seen.
        if self.feature_cols is None:
            self.feature_cols = list(X_numeric.columns)

        X_np = X_numeric.values.astype(np.float64)
        X_np = np.nan_to_num(X_np, nan=0.0)

        if fit_scaler:
            self.scaler = StandardScaler()
            X_np = self.scaler.fit_transform(X_np)
        elif self.scaler is not None:
            X_np = self.scaler.transform(X_np)

        return torch.tensor(X_np, dtype=torch.double)

print("CombinedFeaturizer class defined.")

CombinedFeaturizer class defined.


In [5]:
# CatBoost + XGBoost Ensemble Model

class CatBoostXGBEnsemble:
    """Weighted blend of one multi-target CatBoost model and per-target XGBoost models.

    Hyper-parameters and blend weights depend on the data mode: ``'single'``
    uses one set, any other value uses the mixture ("full") set.
    """

    def __init__(self, data='single', verbose=False):
        self.data_mode = data
        self.verbose = verbose
        self.featurizer = CombinedFeaturizer(solvent_table, data=data)

        single_mode = data == 'single'

        # Raw blend weights (CatBoost, XGBoost), normalized to sum to 1.
        raw_cat, raw_xgb = (7.0, 6.0) if single_mode else (1.0, 2.0)
        w_sum = raw_cat + raw_xgb
        self.cat_weight = raw_cat / w_sum
        self.xgb_weight = raw_xgb / w_sum

        # CatBoost: settings shared by both modes, then the per-mode values.
        self.cat_params = dict(
            random_seed=42,
            loss_function="MultiRMSE",
            depth=3,
            bootstrap_type="Bayesian",
            grow_policy="SymmetricTree",
            rsm=0.75,
            verbose=False,
        )
        if single_mode:
            self.cat_params.update(
                learning_rate=0.07,
                n_estimators=1050,
                l2_leaf_reg=3.5,
                bagging_temperature=0.225,
            )
        else:
            self.cat_params.update(
                learning_rate=0.06,
                n_estimators=1100,
                l2_leaf_reg=2.5,
                bagging_temperature=0.25,
            )

        # XGBoost: only the tree construction method and growth policy differ by mode.
        self.xgb_params = dict(
            random_state=42,
            objective="reg:squarederror",
            subsample=0.5,
            reg_lambda=0.6,
            reg_alpha=0.0,
            n_estimators=1000,
            min_child_weight=1,
            max_depth=4,
            max_delta_step=1,
            learning_rate=0.02,
            gamma=0.0,
            colsample_bytree=0.3,
            colsample_bylevel=0.6,
        )
        if single_mode:
            self.xgb_params.update(tree_method="hist", grow_policy="depthwise")
        else:
            self.xgb_params.update(tree_method="approx", grow_policy="lossguide")

        self.cat_model = None
        self.xgb_models = None
        self.n_targets = None

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Fit the featurizer's scaler, the CatBoost model and one XGBoost model per target.

        ``device`` is accepted but not used by this method.
        """
        features = self.featurizer.featurize(train_X, fit_scaler=True).numpy()
        targets = train_Y.values
        self.n_targets = targets.shape[1]

        # CatBoost learns all targets jointly.
        self.cat_model = CatBoostRegressor(**self.cat_params)
        self.cat_model.fit(features, targets)

        # XGBoost gets an independent regressor per target column.
        self.xgb_models = []
        for target_idx in range(self.n_targets):
            regressor = XGBRegressor(**self.xgb_params)
            regressor.fit(features, targets[:, target_idx])
            self.xgb_models.append(regressor)

        if verbose or self.verbose:
            print(f"[CatBoostXGBEnsemble] Trained in '{self.data_mode}' mode")

    def predict(self, X):
        """Return blended predictions as a ``torch.double`` tensor.

        Negative values are clipped to 0. With more than one target, rows whose
        total exceeds 1 are rescaled to sum to 1; rows totalling 1 or less are
        left as they are.
        """
        features = self.featurizer.featurize(X, fit_scaler=False).numpy()

        cat_pred = np.asarray(self.cat_model.predict(features))
        if cat_pred.ndim == 1:
            cat_pred = cat_pred.reshape(-1, 1)

        xgb_pred = np.column_stack([m.predict(features) for m in self.xgb_models])

        blended = self.cat_weight * cat_pred + self.xgb_weight * xgb_pred
        blended = np.clip(blended, a_min=0.0, a_max=None)

        if blended.shape[1] > 1:
            row_totals = blended.sum(axis=1, keepdims=True)
            # Dividing by max(total, 1) only shrinks rows that overshoot 1.
            blended = blended / np.maximum(row_totals, 1.0)

        return torch.tensor(blended, dtype=torch.double)

print("CatBoostXGBEnsemble class defined.")
print(f"Single solvent weights: CatBoost={7/13:.2f}, XGBoost={6/13:.2f}")
print(f"Full data weights: CatBoost={1/3:.2f}, XGBoost={2/3:.2f}")

CatBoostXGBEnsemble class defined.
Single solvent weights: CatBoost=0.54, XGBoost=0.46
Full data weights: CatBoost=0.33, XGBoost=0.67


In [None]:
# Leave-One-Solvent-Out CV for single solvents (24 folds)
print("Running Leave-One-Solvent-Out CV for single solvents (24 folds)...")
print()

fold_results = []

single_splits = generate_leave_one_out_splits(X_single, Y_single)
for fold_idx, (train_split, test_split) in enumerate(single_splits):
    train_X, train_Y = train_split
    test_X, test_Y = test_split
    test_solvent = test_X["SOLVENT NAME"].iloc[0]

    # A fresh model per fold keeps scaler and trees free of held-out data.
    model = CatBoostXGBEnsemble(data='single')
    model.train_model(train_X, train_Y)

    preds = model.predict(test_X).numpy()
    actuals = test_Y.values

    # Fold score: squared error averaged over all rows and targets.
    mse = np.mean((preds - actuals) ** 2)
    fold_results.append({'solvent': test_solvent, 'mse': mse})

    print(f"Fold {fold_idx}: {test_solvent}: MSE = {mse:.6f}")

fold_mses = [result['mse'] for result in fold_results]
mean_mse_single = np.mean(fold_mses)
std_mse_single = np.std(fold_mses)
print(f"\nSingle Solvent CV MSE: {mean_mse_single:.6f} +/- {std_mse_single:.6f}")
print(f"Baseline (exp_030): CV = 0.008298")

# Relative change against the baseline, reported under the matching label.
if mean_mse_single < 0.008298:
    change_label = "IMPROVEMENT"
    change_pct = (0.008298 - mean_mse_single) / 0.008298 * 100
else:
    change_label = "Degradation"
    change_pct = (mean_mse_single - 0.008298) / 0.008298 * 100
print(f"{change_label}: {change_pct:.2f}%")

In [None]:
# Leave-One-Solvent-PAIR-Out CV for full data (13 folds)
# CRITICAL: folds are solvent PAIRS here, not RAMP NUM.
print("\nRunning Leave-One-Solvent-PAIR-Out CV for full data (13 folds)...")
print("CRITICAL: Using solvent PAIRS, not RAMP NUM!")
print()

mix_fold_results = []

pair_splits = generate_leave_one_ramp_out_splits(X_full, Y_full)
for fold_idx, (train_split, test_split) in enumerate(pair_splits):
    train_X, train_Y = train_split
    test_X, test_Y = test_split
    solvent_a = test_X["SOLVENT A NAME"].iloc[0]
    solvent_b = test_X["SOLVENT B NAME"].iloc[0]
    n_test = len(test_X)

    # A fresh model per fold keeps scaler and trees free of held-out data.
    model = CatBoostXGBEnsemble(data='full')
    model.train_model(train_X, train_Y)

    preds = model.predict(test_X).numpy()
    actuals = test_Y.values

    # Fold score: squared error averaged over all rows and targets.
    mse = np.mean((preds - actuals) ** 2)
    mix_fold_results.append(
        {'solvent_a': solvent_a, 'solvent_b': solvent_b, 'mse': mse, 'n_samples': n_test}
    )

    print(f"Fold {fold_idx}: {solvent_a} + {solvent_b} (n={n_test}): MSE = {mse:.6f}")

mix_fold_mses = [result['mse'] for result in mix_fold_results]
mean_mse_full = np.mean(mix_fold_mses)
std_mse_full = np.std(mix_fold_mses)
print(f"\nFull Data CV MSE: {mean_mse_full:.6f} +/- {std_mse_full:.6f}")
print(f"Number of folds: {len(mix_fold_mses)} (should be 13)")

In [None]:
# Combined CV score
banner = "="*60
print("\n" + banner)
print("COMBINED CV SCORE")
print(banner)

# Each task's mean fold MSE is weighted by its number of rows.
n_single, n_full = len(X_single), len(X_full)
total = n_single + n_full

weighted_cv = (n_single * mean_mse_single + n_full * mean_mse_full) / total

print(f"Single solvent CV: {mean_mse_single:.6f} (n={n_single}, 24 folds)")
print(f"Full data CV: {mean_mse_full:.6f} (n={n_full}, 13 folds)")
print(f"Weighted combined CV: {weighted_cv:.6f}")
print(f"\nBaseline (exp_030): CV = 0.008298")

# Only the single-solvent score is compared with the baseline figure.
if mean_mse_single < 0.008298:
    change_label = "Single solvent improvement"
    change_pct = (0.008298 - mean_mse_single) / 0.008298 * 100
else:
    change_label = "Single solvent degradation"
    change_pct = (mean_mse_single - 0.008298) / 0.008298 * 100
print(f"{change_label}: {change_pct:.2f}%")

In [None]:
# Analyze per-solvent-pair results
print("\nPer-solvent-pair MSE analysis:")
print("="*60)

# Worst folds first, so head() holds the hardest pairs and tail() the easiest.
mix_df = pd.DataFrame(mix_fold_results).sort_values('mse', ascending=False)

sections = (
    ("\nHardest solvent pairs:", mix_df.head(5)),
    ("\nEasiest solvent pairs:", mix_df.tail(5)),
)
for heading, subset in sections:
    print(heading)
    for _, row in subset.iterrows():
        print(f"  {row['solvent_a']} + {row['solvent_b']} (n={row['n_samples']}): MSE = {row['mse']:.6f}")

In [None]:
# Generate submission in the CORRECT format
print("\n" + "="*60)
print("GENERATING SUBMISSION")
print("="*60)

# Single solvent predictions (24 folds); every row is tagged with task 0.
print("\nGenerating single solvent predictions (24 folds)...")
all_predictions_single = []

# The splits are materialized so tqdm knows the total number of folds.
single_splits = list(generate_leave_one_out_splits(X_single, Y_single))
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(tqdm.tqdm(single_splits)):
    model = CatBoostXGBEnsemble(data='single')
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()

    # One record per held-out row, numbered within its fold.
    all_predictions_single.extend(
        {
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2],
        }
        for row_idx, row in enumerate(predictions_np)
    )

submission_single_solvent = pd.DataFrame(all_predictions_single)
print(f"Single solvent predictions: {len(submission_single_solvent)}")

In [None]:
# Full data predictions (13 folds by solvent PAIRS); every row is tagged with task 1.
print("\nGenerating full data predictions (13 folds by solvent PAIRS)...")
all_predictions_full = []

# The splits are materialized so tqdm knows the total number of folds.
pair_splits = list(generate_leave_one_ramp_out_splits(X_full, Y_full))
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(tqdm.tqdm(pair_splits)):
    model = CatBoostXGBEnsemble(data='full')
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()

    # One record per held-out row, numbered within its fold.
    all_predictions_full.extend(
        {
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2],
        }
        for row_idx, row in enumerate(predictions_np)
    )

submission_full_data = pd.DataFrame(all_predictions_full)
print(f"Full data predictions: {len(submission_full_data)}")

In [None]:
# Combine and save submission
import os

# reset_index() keeps each frame's old row index as an "index" column and
# gives the combined frame a fresh running index, which is written out as "id".
combined_frames = [submission_single_solvent, submission_full_data]
submission = pd.concat(combined_frames).reset_index()
submission.index.name = "id"

# Save to submission directory
os.makedirs('/home/submission', exist_ok=True)
submission.to_csv("/home/submission/submission.csv", index=True)

print(f"\nSubmission saved to /home/submission/submission.csv")
print(f"Total rows: {len(submission)}")
for caption, preview in (("head", submission.head()), ("tail", submission.tail())):
    print(f"\nSubmission {caption}:")
    print(preview)

In [None]:
# Summary
# Prints a recap of the run: the CV-scheme fix, the CV scores computed in the
# earlier cells, the baseline comparison, and the submission row counts.
print("="*60)
print("EXPERIMENT 050: CatBoost + XGBoost Ensemble (FIXED CV Scheme)")
print("="*60)
print()
print("THE FIX:")
print("  - exp_049 used RAMP NUM (87 folds) for full data CV")
print("  - exp_050 uses solvent PAIRS (13 folds) - the CORRECT scheme")
print()
print("RESULTS:")
print(f"  Single solvent CV MSE: {mean_mse_single:.6f} +/- {std_mse_single:.6f} (24 folds)")
print(f"  Full data CV MSE: {mean_mse_full:.6f} +/- {std_mse_full:.6f} (13 folds)")
print(f"  Weighted combined CV: {weighted_cv:.6f}")
print()
print("COMPARISON TO BASELINE (exp_030):")
print(f"  Baseline CV: 0.008298")
# NOTE(review): "This experiment CV" reports the single-solvent score only, not
# the weighted combined score — confirm the baseline figure is also single-solvent.
print(f"  This experiment CV: {mean_mse_single:.6f}")
if mean_mse_single < 0.008298:
    print(f"  Improvement: {(0.008298 - mean_mse_single) / 0.008298 * 100:.2f}%")
else:
    print(f"  Degradation: {(mean_mse_single - 0.008298) / 0.008298 * 100:.2f}%")
print()
print("SUBMISSION:")
print(f"  Saved to: /home/submission/submission.csv")
print(f"  Total rows: {len(submission)}")
print(f"  Single solvent rows: {len(submission_single_solvent)}")
print(f"  Full data rows: {len(submission_full_data)}")
print()
print("EXPECTED LB (based on CV-LB relationship):")
# NOTE(review): the slope 4.29 and intercept 0.0528 are hard-coded and their
# origin is not shown in this notebook; the estimate is driven by the
# single-solvent CV only — verify both before relying on the number.
print(f"  LB = 4.29 * {mean_mse_single:.6f} + 0.0528 = {4.29 * mean_mse_single + 0.0528:.4f}")
print(f"  exp_030 LB: 0.0877")