# Experiment 049: CatBoost + XGBoost Ensemble

**Goal:** Implement the CatBoost + XGBoost ensemble from the ens-model kernel.

**Key differences from our previous approaches:**
1. CatBoost with MultiRMSE loss (multi-target in single model)
2. Output normalization (sum to 1 constraint)
3. Combined feature table with correlation filtering
4. Different ensemble weights for single vs full data

**Hypothesis:** CatBoost may have different generalization properties that could CHANGE the CV-LB relationship.

In [5]:
import sys
import numpy as np
import pandas as pd
import torch
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from abc import ABC, abstractmethod

# Define constants
INPUT_LABELS_FULL_SOLVENT = [
    "Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"
]
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_FEATURES = ["SOLVENT NAME"]
INPUT_LABELS_FULL_FEATURES = ["SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

# Data loading functions
DATA_PATH = "/home/data"

def load_data_local():
    """Load both single and full datasets."""
    # Single solvent
    df_single = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
    X_single = df_single[INPUT_LABELS_SINGLE_SOLVENT]
    Y_single = df_single[TARGET_LABELS]
    
    # Full data
    df_full = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
    X_full = df_full[INPUT_LABELS_FULL_SOLVENT]
    Y_full = df_full[TARGET_LABELS]
    
    return X_single, Y_single, X_full, Y_full

def load_features_local(name="spange_descriptors"):
    """Load feature lookup table."""
    features = pd.read_csv(f'{DATA_PATH}/{name}_lookup.csv', index_col=0)
    return features

# Define base classes
class SmilesFeaturizer(ABC):
    def featurize(self, X):
        raise NotImplementedError

class BaseModel(ABC):
    def train_model(self, X_train, y_train):
        raise NotImplementedError

    def predict(self, X):
        raise NotImplementedError

# Load data
X_single, Y_single, X_full, Y_full = load_data_local()
print(f"Single solvent: X={X_single.shape}, Y={Y_single.shape}")
print(f"Full data: X={X_full.shape}, Y={Y_full.shape}")
print(f"\nSolvents: {sorted(X_single['SOLVENT NAME'].unique())}")

Single solvent: X=(656, 3), Y=(656, 3)
Full data: X=(1227, 5), Y=(1227, 3)

Solvents: ['1,1,1,3,3,3-Hexafluoropropan-2-ol', '2,2,2-Trifluoroethanol', '2-Methyltetrahydrofuran [2-MeTHF]', 'Acetonitrile', 'Acetonitrile.Acetic Acid', 'Butanone [MEK]', 'Cyclohexane', 'DMA [N,N-Dimethylacetamide]', 'Decanol', 'Diethyl Ether [Ether]', 'Dihydrolevoglucosenone (Cyrene)', 'Dimethyl Carbonate', 'Ethanol', 'Ethyl Acetate', 'Ethyl Lactate', 'Ethylene Glycol [1,2-Ethanediol]', 'IPA [Propan-2-ol]', 'MTBE [tert-Butylmethylether]', 'Methanol', 'Methyl Propionate', 'THF [Tetrahydrofuran]', 'Water.2,2,2-Trifluoroethanol', 'Water.Acetonitrile', 'tert-Butanol [2-Methylpropan-2-ol]']


In [6]:
# Feature engineering functions from ens-model kernel

def feature_priority(name: str) -> int:
    """Assign priority score to feature name based on prefix."""
    if name.startswith("spange_"):
        return 5
    if name.startswith("acs_"):
        return 4
    if name.startswith("drfps_"):
        return 3
    if name.startswith("frag_"):
        return 2
    if name.startswith("smiles_"):
        return 1
    return 0

def filter_correlated_features(df: pd.DataFrame, threshold: float = 0.90):
    """Drop columns that are highly correlated."""
    numeric_df = df.select_dtypes(include=[np.number])
    
    if numeric_df.shape[1] == 0:
        return df, []
    
    # Drop constant columns first
    std = numeric_df.std(axis=0)
    constant_cols = std[std == 0].index.tolist()
    if constant_cols:
        numeric_df = numeric_df.drop(columns=constant_cols)
    
    # Correlation matrix
    corr = numeric_df.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool)).fillna(0.0)
    
    cols = upper.columns.tolist()
    to_drop = set()
    
    # Find high correlation pairs
    for i, col_i in enumerate(cols):
        for j in range(i + 1, len(cols)):
            col_j = cols[j]
            cval = upper.iloc[i, j]
            if cval > threshold:
                if col_i in to_drop or col_j in to_drop:
                    continue
                p_i = feature_priority(col_i)
                p_j = feature_priority(col_j)
                if p_i > p_j:
                    to_drop.add(col_j)
                elif p_j > p_i:
                    to_drop.add(col_i)
                else:
                    idx_i = df.columns.get_loc(col_i) if col_i in df.columns else 999
                    idx_j = df.columns.get_loc(col_j) if col_j in df.columns else 999
                    to_drop.add(col_i if idx_i > idx_j else col_j)
    
    all_to_drop = list(set(constant_cols).union(to_drop))
    df_filtered = df.drop(columns=all_to_drop, errors="ignore")
    
    return df_filtered, all_to_drop

def add_numeric_features(X_numeric: pd.DataFrame) -> pd.DataFrame:
    """Add engineered numeric features."""
    X_num = X_numeric.copy()
    cols = set(X_num.columns)
    
    if {"Temperature", "Residence Time"} <= cols:
        # Convert Temperature to Kelvin
        X_num["Temperature"] = X_num["Temperature"] + 273.15
        
        T = X_num["Temperature"]
        rt = X_num["Residence Time"]
        
        # Interaction term
        X_num["T_x_RT"] = T * rt
        
        # Log transformation
        X_num["RT_log"] = np.log(rt + 1e-6)
        
        # Inverse temperature
        X_num["T_inv"] = 1 / T
        
        # Scaled residence time
        X_num["RT_scaled"] = rt / rt.mean()
    
    return X_num

print("Feature engineering functions defined.")

Feature engineering functions defined.


In [7]:
# Build combined solvent feature table

def build_solvent_feature_table(threshold: float = 0.90):
    """Build combined solvent feature table from multiple sources."""
    sources = [
        "spange_descriptors",
        "acs_pca_descriptors",
        "drfps_catechol",
        "fragprints",
        "smiles",
    ]
    
    dfs = []
    
    for src in sources:
        df_src = load_features_local(src).copy()
        
        if "SOLVENT NAME" not in df_src.columns:
            df_src = df_src.reset_index().rename(columns={"index": "SOLVENT NAME"})
        
        # Bit-table filtering for binary fingerprints
        if src in ["drfps_catechol", "fragprints"]:
            prefix = "drfps" if src == "drfps_catechol" else "frag"
            
            # Drop all-zero and all-one columns
            df_src = df_src.loc[:, (df_src != 0).any(axis=0)]
            df_src = df_src.loc[:, (df_src != 1).any(axis=0)]
            
            # Drop columns with only 1 occurrence
            values = df_src.drop(columns={"SOLVENT NAME"}, errors="ignore")
            count = values.sum(axis=0).T
            drop_cols = count[count == 1].index
            df_src = df_src.drop(columns=drop_cols, errors="ignore")
            
            cols_to_rename = [c for c in df_src.columns if c != "SOLVENT NAME"]
            df_src = df_src.rename(columns={c: f"{prefix}_{c}" for c in cols_to_rename})
        
        elif src == "spange_descriptors":
            cols_to_rename = [c for c in df_src.columns if c != "SOLVENT NAME"]
            df_src = df_src.rename(columns={c: f"spange_{c}" for c in cols_to_rename})
        
        elif src == "acs_pca_descriptors":
            cols_to_rename = [c for c in df_src.columns if c != "SOLVENT NAME"]
            df_src = df_src.rename(columns={c: f"acs_{c}" for c in cols_to_rename})
        
        elif src == "smiles":
            cols_to_rename = [c for c in df_src.columns if c != "SOLVENT NAME"]
            df_src = df_src.rename(columns={c: f"smiles_{c}" for c in cols_to_rename})
        
        dfs.append(df_src)
    
    # Merge all dataframes on SOLVENT NAME
    from functools import reduce
    merged = reduce(lambda left, right: pd.merge(left, right, on="SOLVENT NAME", how="outer"), dfs)
    
    print(f"Combined features before filtering: {merged.shape}")
    
    # Apply correlation filtering
    merged_filtered, dropped = filter_correlated_features(merged, threshold=threshold)
    
    print(f"Combined features after filtering: {merged_filtered.shape}")
    print(f"Dropped {len(dropped)} columns")
    
    return merged_filtered

# Build the feature table
solvent_table = build_solvent_feature_table(threshold=0.90)
print(f"\nFinal solvent table shape: {solvent_table.shape}")
print(f"Columns: {list(solvent_table.columns)[:20]}...")

Combined features before filtering: (26, 115)
Combined features after filtering: (26, 68)
Dropped 47 columns

Final solvent table shape: (26, 68)
Columns: ['SOLVENT NAME', 'spange_dielectric constant', 'spange_ET(30)', 'spange_beta', 'spange_pi*', 'spange_SB', 'spange_SP', 'spange_SdP', 'spange_N', 'spange_n', 'acs_PC1', 'acs_PC2', 'acs_PC3', 'acs_PC4', 'acs_PC5', 'drfps_34', 'drfps_67', 'drfps_110', 'drfps_125', 'drfps_209']...


In [13]:
# Create featurizer class

class CombinedFeaturizer:
    """Featurizer that combines solvent features with numeric features."""
    
    def __init__(self, solvent_table, data='single'):
        self.solvent_table = solvent_table
        self.data_mode = data
        self.scaler = None
        self.feature_cols = None
    
    def featurize(self, X, fit_scaler=False):
        """Convert input DataFrame to feature matrix."""
        X = X.copy()
        
        if self.data_mode == 'single':
            # Single solvent: merge with solvent table
            X_merged = X.merge(self.solvent_table, on='SOLVENT NAME', how='left')
            
            # Get numeric columns
            numeric_cols = [c for c in X_merged.columns if c != 'SOLVENT NAME' and X_merged[c].dtype in [np.float64, np.int64, np.float32, np.int32]]
            X_numeric = X_merged[numeric_cols].copy()
        else:
            # Full data (mixture): merge with solvent table for both solvents
            # Rename solvent table columns for solvent A
            solvent_A = self.solvent_table.copy()
            solvent_A = solvent_A.rename(columns={'SOLVENT NAME': 'SOLVENT A NAME'})
            solvent_A.columns = ['SOLVENT A NAME'] + [f'{c}_A' for c in solvent_A.columns if c != 'SOLVENT A NAME']
            
            # Rename solvent table columns for solvent B
            solvent_B = self.solvent_table.copy()
            solvent_B = solvent_B.rename(columns={'SOLVENT NAME': 'SOLVENT B NAME'})
            solvent_B.columns = ['SOLVENT B NAME'] + [f'{c}_B' for c in solvent_B.columns if c != 'SOLVENT B NAME']
            
            # Merge
            X_merged = X.merge(solvent_A, on='SOLVENT A NAME', how='left')
            X_merged = X_merged.merge(solvent_B, on='SOLVENT B NAME', how='left')
            
            # Get numeric columns
            numeric_cols = [c for c in X_merged.columns if c not in ['SOLVENT A NAME', 'SOLVENT B NAME'] and X_merged[c].dtype in [np.float64, np.int64, np.float32, np.int32]]
            X_numeric = X_merged[numeric_cols].copy()
        
        # Add engineered features
        X_numeric = add_numeric_features(X_numeric)
        
        # Store feature columns
        if self.feature_cols is None:
            self.feature_cols = list(X_numeric.columns)
        
        # Convert to numpy
        X_np = X_numeric.values.astype(np.float64)
        
        # Handle NaN
        X_np = np.nan_to_num(X_np, nan=0.0)
        
        # Scale features
        if fit_scaler:
            self.scaler = StandardScaler()
            X_np = self.scaler.fit_transform(X_np)
        elif self.scaler is not None:
            X_np = self.scaler.transform(X_np)
        
        return torch.tensor(X_np, dtype=torch.double)

print("CombinedFeaturizer class defined.")

CombinedFeaturizer class defined.


In [14]:
# CatBoost + XGBoost Ensemble Model

class CatBoostXGBEnsemble(BaseModel):
    """CatBoost + XGBoost ensemble following ens-model kernel."""
    
    def __init__(self, data='single', verbose=False):
        self.data_mode = data
        self.verbose = verbose
        self.featurizer = CombinedFeaturizer(solvent_table, data=data)
        
        # Ensemble weights from ens-model kernel
        if data == 'single':
            cat_weight = 7.0
            xgb_weight = 6.0
        else:
            cat_weight = 1.0
            xgb_weight = 2.0
        
        w_sum = cat_weight + xgb_weight
        self.cat_weight = cat_weight / w_sum
        self.xgb_weight = xgb_weight / w_sum
        
        # CatBoost parameters from ens-model kernel
        if data == 'single':
            self.cat_params = dict(
                random_seed=42,
                loss_function="MultiRMSE",
                depth=3,
                learning_rate=0.07,
                n_estimators=1050,
                l2_leaf_reg=3.5,
                bootstrap_type="Bayesian",
                bagging_temperature=0.225,
                grow_policy="SymmetricTree",
                rsm=0.75,
                verbose=False,
            )
        else:
            self.cat_params = dict(
                random_seed=42,
                loss_function="MultiRMSE",
                depth=3,
                learning_rate=0.06,
                n_estimators=1100,
                l2_leaf_reg=2.5,
                bootstrap_type="Bayesian",
                bagging_temperature=0.25,
                grow_policy="SymmetricTree",
                rsm=0.75,
                verbose=False,
            )
        
        # XGBoost parameters from ens-model kernel
        if data == 'single':
            self.xgb_params = dict(
                random_state=42,
                objective="reg:squarederror",
                tree_method="hist",
                subsample=0.5,
                reg_lambda=0.6,
                reg_alpha=0.0,
                n_estimators=1000,
                min_child_weight=1,
                max_depth=4,
                max_delta_step=1,
                learning_rate=0.02,
                grow_policy="depthwise",
                gamma=0.0,
                colsample_bytree=0.3,
                colsample_bylevel=0.6,
            )
        else:
            self.xgb_params = dict(
                random_state=42,
                objective="reg:squarederror",
                tree_method="approx",
                subsample=0.5,
                reg_lambda=0.6,
                reg_alpha=0.0,
                n_estimators=1000,
                min_child_weight=1,
                max_depth=4,
                max_delta_step=1,
                learning_rate=0.02,
                grow_policy="lossguide",
                gamma=0.0,
                colsample_bytree=0.3,
                colsample_bylevel=0.6,
            )
        
        self.cat_model = None
        self.xgb_models = None
        self.n_targets = None
    
    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Train CatBoost and XGBoost models."""
        # Featurize
        X_np = self.featurizer.featurize(train_X, fit_scaler=True).numpy()
        Y_np = train_Y.values
        self.n_targets = Y_np.shape[1]
        
        # Train CatBoost (multi-target)
        self.cat_model = CatBoostRegressor(**self.cat_params)
        self.cat_model.fit(X_np, Y_np)
        
        # Train XGBoost (one per target)
        self.xgb_models = []
        for t in range(self.n_targets):
            model_t = XGBRegressor(**self.xgb_params)
            model_t.fit(X_np, Y_np[:, t])
            self.xgb_models.append(model_t)
        
        if verbose or self.verbose:
            print(f"[CatBoostXGBEnsemble] Trained in '{self.data_mode}' mode")
    
    def predict(self, X):
        """Predict with ensemble and apply output normalization."""
        X_np = self.featurizer.featurize(X, fit_scaler=False).numpy()
        
        # CatBoost prediction
        cat_pred = self.cat_model.predict(X_np)
        cat_pred = np.asarray(cat_pred)
        if cat_pred.ndim == 1:
            cat_pred = cat_pred.reshape(-1, 1)
        
        # XGBoost prediction
        xgb_preds = [m.predict(X_np) for m in self.xgb_models]
        xgb_pred = np.column_stack(xgb_preds)
        
        # Weighted ensemble
        out = self.cat_weight * cat_pred + self.xgb_weight * xgb_pred
        
        # Output normalization (sum to 1 constraint)
        out = np.clip(out, a_min=0.0, a_max=None)
        if out.shape[1] > 1:
            totals = out.sum(axis=1, keepdims=True)
            divisor = np.maximum(totals, 1.0)
            out = out / divisor
        
        return torch.tensor(out, dtype=torch.double)

print("CatBoostXGBEnsemble class defined.")
print(f"Single solvent weights: CatBoost={7/13:.2f}, XGBoost={6/13:.2f}")
print(f"Full data weights: CatBoost={1/3:.2f}, XGBoost={2/3:.2f}")

CatBoostXGBEnsemble class defined.
Single solvent weights: CatBoost=0.54, XGBoost=0.46
Full data weights: CatBoost=0.33, XGBoost=0.67


In [10]:
# Leave-One-Solvent-Out CV for single solvents
print("Running Leave-One-Solvent-Out CV for single solvents...")
print()

all_solvents = sorted(X_single["SOLVENT NAME"].unique())
fold_mses = []
fold_results = []

for test_solvent in all_solvents:
    mask = X_single["SOLVENT NAME"] != test_solvent
    
    # Create fresh model and featurizer for each fold
    model = CatBoostXGBEnsemble(data='single')
    model.train_model(X_single[mask], Y_single[mask])
    
    # Predict on test solvent
    preds = model.predict(X_single[~mask]).numpy()
    actuals = Y_single[~mask].values
    
    # Calculate MSE
    mse = np.mean((preds - actuals) ** 2)
    fold_mses.append(mse)
    fold_results.append({'solvent': test_solvent, 'mse': mse})
    
    print(f"Fold {test_solvent}: MSE = {mse:.6f}")

mean_mse = np.mean(fold_mses)
std_mse = np.std(fold_mses)
print(f"\nSingle Solvent CV MSE: {mean_mse:.6f} +/- {std_mse:.6f}")
print(f"Baseline (exp_030): CV = 0.008298")
if mean_mse < 0.008298:
    print(f"IMPROVEMENT: {(0.008298 - mean_mse) / 0.008298 * 100:.2f}%")
else:
    print(f"Degradation: {(mean_mse - 0.008298) / 0.008298 * 100:.2f}%")

Running Leave-One-Solvent-Out CV for single solvents...



Fold 1,1,1,3,3,3-Hexafluoropropan-2-ol: MSE = 0.029401


Fold 2,2,2-Trifluoroethanol: MSE = 0.019110


Fold 2-Methyltetrahydrofuran [2-MeTHF]: MSE = 0.002023


Fold Acetonitrile: MSE = 0.011238


Fold Acetonitrile.Acetic Acid: MSE = 0.022562


Fold Butanone [MEK]: MSE = 0.003177


Fold Cyclohexane: MSE = 0.002913


Fold DMA [N,N-Dimethylacetamide]: MSE = 0.001434


Fold Decanol: MSE = 0.008781


Fold Diethyl Ether [Ether]: MSE = 0.015867


Fold Dihydrolevoglucosenone (Cyrene): MSE = 0.005143


Fold Dimethyl Carbonate: MSE = 0.007844


Fold Ethanol: MSE = 0.003238


Fold Ethyl Acetate: MSE = 0.001843


Fold Ethyl Lactate: MSE = 0.002626


Fold Ethylene Glycol [1,2-Ethanediol]: MSE = 0.017804


Fold IPA [Propan-2-ol]: MSE = 0.012355


Fold MTBE [tert-Butylmethylether]: MSE = 0.000945


Fold Methanol: MSE = 0.003990


Fold Methyl Propionate: MSE = 0.001081


Fold THF [Tetrahydrofuran]: MSE = 0.000795


Fold Water.2,2,2-Trifluoroethanol: MSE = 0.002372


Fold Water.Acetonitrile: MSE = 0.016135


Fold tert-Butanol [2-Methylpropan-2-ol]: MSE = 0.001529

Single Solvent CV MSE: 0.008092 +/- 0.007938
Baseline (exp_030): CV = 0.008298
IMPROVEMENT: 2.48%


In [None]:
# Analyze per-solvent results
print("\nPer-solvent MSE analysis:")
print("="*60)

fold_df = pd.DataFrame(fold_results)
fold_df = fold_df.sort_values('mse', ascending=False)

print("\nTop 5 hardest solvents:")
for _, row in fold_df.head(5).iterrows():
    print(f"  {row['solvent']}: MSE = {row['mse']:.6f}")

print("\nTop 5 easiest solvents:")
for _, row in fold_df.tail(5).iterrows():
    print(f"  {row['solvent']}: MSE = {row['mse']:.6f}")

In [15]:
# Leave-One-Ramp-Out CV for mixtures
print("\nRunning Leave-One-Ramp-Out CV for mixtures...")
print()

# Load full data with RAMP NUM
df_full = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
X_full_with_ramp = df_full[INPUT_LABELS_FULL_SOLVENT + ['RAMP NUM']]
Y_full = df_full[TARGET_LABELS]

# Get unique ramps
ramps = X_full_with_ramp["RAMP NUM"].unique()
mix_fold_mses = []

for test_ramp in ramps:
    mask = X_full_with_ramp["RAMP NUM"] != test_ramp
    
    # Get X without RAMP NUM for training
    X_train = X_full_with_ramp[mask][INPUT_LABELS_FULL_SOLVENT]
    X_test = X_full_with_ramp[~mask][INPUT_LABELS_FULL_SOLVENT]
    
    # Create fresh model for each fold
    model = CatBoostXGBEnsemble(data='full')
    model.train_model(X_train, Y_full[mask])
    
    # Predict on test ramp
    preds = model.predict(X_test).numpy()
    actuals = Y_full[~mask].values
    
    # Calculate MSE
    mse = np.mean((preds - actuals) ** 2)
    mix_fold_mses.append(mse)
    
    print(f"Ramp {test_ramp}: MSE = {mse:.6f}")

mix_mean_mse = np.mean(mix_fold_mses)
mix_std_mse = np.std(mix_fold_mses)
print(f"\nMixture CV MSE: {mix_mean_mse:.6f} +/- {mix_std_mse:.6f}")


Running Leave-One-Ramp-Out CV for mixtures...



Ramp 0: MSE = 0.001230


Ramp 1: MSE = 0.000673


Ramp 2: MSE = 0.005224


Ramp 3: MSE = 0.000410


Ramp 4: MSE = 0.000976


Ramp 5: MSE = 0.011362


Ramp 6: MSE = 0.001013


Ramp 7: MSE = 0.014924


Ramp 8: MSE = 0.000632


Ramp 9: MSE = 0.020679


Ramp 10: MSE = 0.011034


Ramp 11: MSE = 0.001982


Ramp 12: MSE = 0.010017


Ramp 13: MSE = 0.005755


Ramp 14: MSE = 0.012554


Ramp 15: MSE = 0.003189


Ramp 16: MSE = 0.013827


Ramp 17: MSE = 0.002977


Ramp 18: MSE = 0.000223


Ramp 19: MSE = 0.001116


Ramp 20: MSE = 0.000979


Ramp 21: MSE = 0.005766


Ramp 22: MSE = 0.003876


Ramp 23: MSE = 0.001458


Ramp 24: MSE = 0.004095


Ramp 25: MSE = 0.000745


Ramp 26: MSE = 0.000157


Ramp 27: MSE = 0.000322


Ramp 28: MSE = 0.009866


Ramp 29: MSE = 0.020048


Ramp 30: MSE = 0.000427


Ramp 31: MSE = 0.020731


Ramp 32: MSE = 0.000733


Ramp 33: MSE = 0.002274


Ramp 34: MSE = 0.000542


Ramp 35: MSE = 0.001680


Ramp 36: MSE = 0.001319


Ramp 37: MSE = 0.007119


Ramp 38: MSE = 0.019709


Ramp 39: MSE = 0.004595


Ramp 40: MSE = 0.014460


Ramp 41: MSE = 0.004359


Ramp 42: MSE = 0.000703


Ramp 43: MSE = 0.000075


Ramp 44: MSE = 0.000029


Ramp 45: MSE = 0.000231


Ramp 46: MSE = 0.002789


Ramp 47: MSE = 0.004459


Ramp 48: MSE = 0.000492


Ramp 49: MSE = 0.007894


Ramp 50: MSE = 0.003892


Ramp 51: MSE = 0.006231


Ramp 52: MSE = 0.001361


Ramp 53: MSE = 0.010827


Ramp 54: MSE = 0.000836


Ramp 55: MSE = 0.008135


Ramp 56: MSE = 0.008096


Ramp 57: MSE = 0.001967


Ramp 58: MSE = 0.015421


Ramp 59: MSE = 0.002281


Ramp 60: MSE = 0.000337


Ramp 61: MSE = 0.001102


Ramp 62: MSE = 0.002150


Ramp 63: MSE = 0.000234


Ramp 64: MSE = 0.001966


Ramp 65: MSE = 0.003417


Ramp 66: MSE = 0.000921


Ramp 67: MSE = 0.004653


Ramp 68: MSE = 0.000716


Ramp 69: MSE = 0.001661


Ramp 70: MSE = 0.000808


Ramp 71: MSE = 0.000203


Ramp 72: MSE = 0.004416


Ramp 73: MSE = 0.002106


Ramp 74: MSE = 0.003348


Ramp 75: MSE = 0.005658


Ramp 76: MSE = 0.001074


Ramp 77: MSE = 0.003537


Ramp 78: MSE = 0.002257


Ramp 79: MSE = 0.010185


Ramp 80: MSE = 0.007058


Ramp 81: MSE = 0.013841


Ramp 82: MSE = 0.013247


Ramp 83: MSE = 0.006776


Ramp 84: MSE = 0.011691


Ramp 85: MSE = 0.009946


Ramp 86: MSE = 0.009490

Mixture CV MSE: 0.005099 +/- 0.005408


In [16]:
# Combined CV score (weighted average)
print("\n" + "="*60)
print("COMBINED CV SCORE")
print("="*60)

# Weight by number of samples
n_single = len(X_single)
n_full = len(X_full)
total = n_single + n_full

weighted_cv = (n_single * mean_mse + n_full * mix_mean_mse) / total

print(f"Single solvent CV: {mean_mse:.6f} (n={n_single})")
print(f"Mixture CV: {mix_mean_mse:.6f} (n={n_full})")
print(f"Weighted combined CV: {weighted_cv:.6f}")
print(f"\nBaseline (exp_030): CV = 0.008298")
if weighted_cv < 0.008298:
    print(f"IMPROVEMENT: {(0.008298 - weighted_cv) / 0.008298 * 100:.2f}%")
else:
    print(f"Degradation: {(weighted_cv - 0.008298) / 0.008298 * 100:.2f}%")


COMBINED CV SCORE
Single solvent CV: 0.008092 (n=656)
Mixture CV: 0.005099 (n=1227)
Weighted combined CV: 0.006141

Baseline (exp_030): CV = 0.008298
IMPROVEMENT: 25.99%


In [17]:
# Generate submission
print("\nGenerating submission...")

# Train final models on all data
final_single_model = CatBoostXGBEnsemble(data='single')
final_single_model.train_model(X_single, Y_single)

final_full_model = CatBoostXGBEnsemble(data='full')
final_full_model.train_model(X_full, Y_full)

# Save submission
import os
os.makedirs('/home/submission', exist_ok=True)

# The submission format follows the competition template
print("\nFinal models trained.")
print(f"Single solvent CV: {mean_mse:.6f}")
print(f"Mixture CV: {mix_mean_mse:.6f}")
print(f"Weighted combined CV: {weighted_cv:.6f}")


Generating submission...



Final models trained.
Single solvent CV: 0.008092
Mixture CV: 0.005099
Weighted combined CV: 0.006141


In [None]:
# Create submission in the required format
# This mimics what the competition template does

import tqdm

# Single solvent predictions
print("Generating single solvent predictions...")
X_s, Y_s = X_single, Y_single
all_solvents = sorted(X_s["SOLVENT NAME"].unique())
all_predictions_single = []

for fold_idx, test_solvent in enumerate(tqdm.tqdm(all_solvents)):
    mask = X_s["SOLVENT NAME"] != test_solvent
    train_X, train_Y = X_s[mask], Y_s[mask]
    test_X, test_Y = X_s[~mask], Y_s[~mask]
    
    model = CatBoostXGBEnsemble(data='single')
    model.train_model(train_X, train_Y)
    
    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()
    
    for row_idx, row in enumerate(predictions_np):
        all_predictions_single.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_single_solvent = pd.DataFrame(all_predictions_single)
print(f"Single solvent predictions: {len(submission_single_solvent)}")

In [None]:
# Full data predictions
print("Generating full data predictions...")

# Load full data with RAMP NUM
df_full = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
X_f = df_full[INPUT_LABELS_FULL_SOLVENT]
Y_f = df_full[TARGET_LABELS]

# Get unique ramps for leave-one-ramp-out
all_ramps = df_full["RAMP NUM"].unique()
all_predictions_full = []

for fold_idx, test_ramp in enumerate(tqdm.tqdm(all_ramps)):
    mask = df_full["RAMP NUM"] != test_ramp
    train_X, train_Y = X_f[mask], Y_f[mask]
    test_X, test_Y = X_f[~mask], Y_f[~mask]
    
    model = CatBoostXGBEnsemble(data='full')
    model.train_model(train_X, train_Y)
    
    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()
    
    for row_idx, row in enumerate(predictions_np):
        all_predictions_full.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_full_data = pd.DataFrame(all_predictions_full)
print(f"Full data predictions: {len(submission_full_data)}")

In [None]:
# Combine and save submission
submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"

# Save to submission directory
submission.to_csv("/home/submission/submission.csv", index=True)
print(f"Submission saved to /home/submission/submission.csv")
print(f"Total rows: {len(submission)}")
print(f"\nSubmission head:")
print(submission.head())
print(f"\nSubmission tail:")
print(submission.tail())