# Experiment 055: CatBoost + XGBoost with Official Leave-One-Out CV

**Goal:** Fix the submission format by using the official Leave-One-Out CV scheme.

**Key Changes from exp_054:**
1. Use the official leave-one-out CV: leave-one-solvent-out (24 folds) for the single-solvent data and leave-one-ramp-out (13 folds) for the full data
2. Do NOT use GroupKFold (5 folds)
3. Same CatBoost + XGBoost ensemble model

**Expected:** This should produce a valid submission that the evaluation system accepts.

In [1]:
import numpy as np
import pandas as pd
import torch
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Global numeric setup: torch defaults to float64 and NumPy's global RNG is seeded.
torch.set_default_dtype(torch.float64)
np.random.seed(42)

# Directory that holds the input CSV files read by this notebook.
DATA_PATH = '/home/data'

# Input column names. The single-solvent and full (mixture) layouts both start
# with the two numeric process columns and then add their solvent columns.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = [*INPUT_LABELS_NUMERIC, "SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = [
    *INPUT_LABELS_NUMERIC,
    "SOLVENT A NAME",
    "SOLVENT B NAME",
    "SolventB%",
]

print('Imports complete.')

Imports complete.


In [2]:
# Data loading functions
def load_data(name="full"):
    """Read one of the two yield CSVs and split it into inputs and targets.

    Args:
        name: ``"full"`` selects ``catechol_full_data_yields.csv`` together
            with the ``INPUT_LABELS_FULL_SOLVENT`` columns. Any other value
            selects ``catechol_single_solvent_yields.csv`` together with the
            ``INPUT_LABELS_SINGLE_SOLVENT`` columns.

    Returns:
        A ``(X, Y)`` pair of DataFrames: the selected input columns, and the
        ``"Product 2"``, ``"Product 3"`` and ``"SM"`` target columns.
    """
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = INPUT_LABELS_FULL_SOLVENT
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = INPUT_LABELS_SINGLE_SOLVENT

    table = pd.read_csv(csv_path)
    return table[input_columns], table[["Product 2", "Product 3", "SM"]]

# OFFICIAL Leave-One-Out CV functions (NOT GroupKFold!)
def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per distinct value of ``X["SOLVENT NAME"]``.

    Solvents are held out one at a time, in sorted order. Rows whose solvent
    equals the held-out one form the test part; all remaining rows form the
    training part. (The notebook header states this gives 24 folds on the
    single-solvent dataset.)

    Args:
        X: inputs with a ``"SOLVENT NAME"`` column.
        Y: targets, row-aligned with ``X``.

    Yields:
        ``((X_train, Y_train), (X_test, Y_test))`` for each held-out solvent.
    """
    solvent_names = list(X["SOLVENT NAME"].unique())
    solvent_names.sort()
    for held_out in solvent_names:
        keep_for_training = X["SOLVENT NAME"] != held_out
        hold_out_rows = ~keep_for_training
        training_part = (X[keep_for_training], Y[keep_for_training])
        test_part = (X[hold_out_rows], Y[hold_out_rows])
        yield training_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    A "ramp" is every row sharing the same ``"SOLVENT A NAME"`` and
    ``"SOLVENT B NAME"``. Ramps are held out in order of first appearance in
    ``X``. (The notebook header states this gives 13 folds on the full
    dataset.)

    Args:
        X: inputs with ``"SOLVENT A NAME"`` and ``"SOLVENT B NAME"`` columns.
        Y: targets, row-aligned with ``X``.

    Yields:
        ``((X_train, Y_train), (X_test, Y_test))`` for each held-out ramp.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    distinct_pairs = X[pair_columns].drop_duplicates()
    for solvent_a, solvent_b in distinct_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        outside_ramp = ~in_ramp
        yield (X[outside_ramp], Y[outside_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined (OFFICIAL Leave-One-Out CV).')

Data loading functions defined (OFFICIAL Leave-One-Out CV).


In [3]:
# Load feature lookups (the first CSV column becomes the row index of each table)
_LOOKUP_PATHS = (
    f'{DATA_PATH}/spange_descriptors_lookup.csv',
    f'{DATA_PATH}/acs_pca_descriptors_lookup.csv',
    f'{DATA_PATH}/drfps_catechol_lookup.csv',
)
SPANGE_DF, ACS_PCA_DF, DRFP_DF = (
    pd.read_csv(lookup_path, index_col=0) for lookup_path in _LOOKUP_PATHS
)

# Keep only the DRFP columns whose variance is strictly positive, i.e. drop
# columns that are constant across all rows of the lookup table.
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = [
    column for column, variance in drfp_variance.items() if variance > 0
]
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]

print(f'Spange: {SPANGE_DF.shape}, ACS PCA: {ACS_PCA_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}')

Spange: (26, 13), ACS PCA: (24, 5), DRFP filtered: (24, 122)


In [4]:
# Featurizer with Arrhenius kinetics + combined features
class CombinedFeaturizer:
    """Build a flat feature matrix from process conditions and solvent descriptors.

    Each row is the concatenation of:
      * the two raw numeric inputs (``INPUT_LABELS_NUMERIC``),
      * three derived kinetic terms: ``1000 / T[K]``, ``log(time)`` and their
        product,
      * the Spange, ACS-PCA and filtered DRFP descriptor rows of the solvent.

    With ``mixed=True`` the descriptors of solvent A and solvent B are blended
    linearly using ``SolventB% / 100`` as the weight of solvent B.
    """

    def __init__(self, mixed=False):
        """Store the mode flag and bind the module-level lookup tables.

        Args:
            mixed: ``True`` for two-solvent (full) data, ``False`` for
                single-solvent data.
        """
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.acs_pca_df = ACS_PCA_DF
        self.drfp_df = DRFP_FILTERED
        # 2 raw numeric inputs + 3 kinetic terms + one column block per lookup
        # table (145 with the tables loaded in this notebook).
        self.feats_dim = 2 + 3 + self.spange_df.shape[1] + self.acs_pca_df.shape[1] + self.drfp_df.shape[1]

    def featurize(self, X, flip=False):
        """Return the feature matrix for the rows of ``X`` as a NumPy array.

        Args:
            X: DataFrame holding the ``INPUT_LABELS_NUMERIC`` columns plus
                either ``"SOLVENT NAME"`` (single mode) or
                ``"SOLVENT A NAME"``, ``"SOLVENT B NAME"`` and ``"SolventB%"``
                (mixed mode).
            flip: only read in mixed mode; selects the alternative way of
                writing the solvent-B weight (see the note in the body).

        Returns:
            2-D array with one row per row of ``X``.
        """
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        residence_time = numeric[:, 0:1]  # first numeric column: "Residence Time"
        temperature_c = numeric[:, 1:2]   # second numeric column: "Temperature"

        # 273.15 is added, i.e. the temperature column is treated as Celsius.
        inv_temp = 1000.0 / (temperature_c + 273.15)
        # The small offset keeps log() finite for a zero residence time.
        log_time = np.log(residence_time + 1e-6)
        feature_blocks = [numeric, inv_temp, log_time, inv_temp * log_time]

        lookup_tables = (self.spange_df, self.acs_pca_df, self.drfp_df)
        if self.mixed:
            descriptor_pairs = [
                (
                    table.loc[X["SOLVENT A NAME"]].values,
                    table.loc[X["SOLVENT B NAME"]].values,
                )
                for table in lookup_tables
            ]
            frac_b = X["SolventB%"].values.reshape(-1, 1) / 100.0
            weight_a = 1 - frac_b
            # NOTE(review): 1 - (1 - p) is algebraically p, so flip=True gives
            # the same blend as flip=False up to floating-point rounding. The
            # expression is kept as written to preserve the exact values.
            weight_b = 1 - (1 - frac_b) if flip else frac_b
            feature_blocks.extend(
                desc_a * weight_a + desc_b * weight_b
                for desc_a, desc_b in descriptor_pairs
            )
        else:
            feature_blocks.extend(
                table.loc[X["SOLVENT NAME"]].values for table in lookup_tables
            )

        return np.hstack(feature_blocks)

print(f'CombinedFeaturizer defined. Feature dimension: {CombinedFeaturizer().feats_dim}')

CombinedFeaturizer defined. Feature dimension: 145


In [5]:
# CatBoost + XGBoost Ensemble Model
class CatBoostXGBoostEnsemble:
    """Weighted blend of per-target CatBoost and XGBoost regressors.

    Three CatBoost and three XGBoost models are fitted, one per target column,
    on standardized ``CombinedFeaturizer`` features. Predictions are blended
    with fixed weights (0.6 CatBoost, 0.4 XGBoost) and clipped to ``[0, 1]``.
    """

    def __init__(self, data='single'):
        """Create an untrained ensemble.

        Args:
            data: ``'full'`` enables the mixed-solvent featurizer and the
                flipped-feature stacking in ``train_model``; any other value
                is handled as single-solvent data.
        """
        self.data_type = data
        self.featurizer = CombinedFeaturizer(mixed=(data=='full'))
        self.scaler = StandardScaler()
        self.catboost_models = []
        self.xgb_models = []
        # Ensemble weights
        self.catboost_weight = 0.6
        self.xgb_weight = 0.4

    def train_model(self, X_train, y_train):
        """Fit the scaler and all six regressors on the given training data.

        Args:
            X_train: input DataFrame accepted by the featurizer.
            y_train: DataFrame whose first three columns are the targets.
        """
        features = self.featurizer.featurize(X_train, flip=False)
        targets = y_train.values

        if self.data_type == 'full':
            # NOTE(review): CombinedFeaturizer.featurize weights solvent B by
            # 1 - (1 - pct) when flip=True, which is algebraically pct, so this
            # appears to duplicate the training rows rather than add a
            # swapped-solvent view - confirm intent.
            flipped = self.featurizer.featurize(X_train, flip=True)
            features = np.vstack([features, flipped])
            targets = np.vstack([targets, targets])

        # Scale features
        scaled = self.scaler.fit_transform(features)

        # Train CatBoost models (one per target)
        catboost_settings = {
            'iterations': 500,
            'learning_rate': 0.05,
            'depth': 6,
            'l2_leaf_reg': 3,
            'random_seed': 42,
            'verbose': False,
        }
        self.catboost_models = []
        for target_idx in range(3):
            regressor = CatBoostRegressor(**catboost_settings)
            regressor.fit(scaled, targets[:, target_idx])
            self.catboost_models.append(regressor)

        # Train XGBoost models (one per target)
        xgb_settings = {
            'n_estimators': 500,
            'learning_rate': 0.05,
            'max_depth': 6,
            'reg_alpha': 0.1,
            'reg_lambda': 1.0,
            'random_state': 42,
            'verbosity': 0,
        }
        self.xgb_models = []
        for target_idx in range(3):
            regressor = xgb.XGBRegressor(**xgb_settings)
            regressor.fit(scaled, targets[:, target_idx])
            self.xgb_models.append(regressor)

    def predict(self, X_test):
        """Predict the three targets for ``X_test``.

        Returns:
            A ``torch.double`` tensor with one row per input row and one
            column per fitted target model, with values clipped to ``[0, 1]``.
        """
        scaled = self.scaler.transform(self.featurizer.featurize(X_test, flip=False))

        # One prediction column per fitted model, in target order.
        catboost_part = np.column_stack(
            [regressor.predict(scaled) for regressor in self.catboost_models]
        )
        xgb_part = np.column_stack(
            [regressor.predict(scaled) for regressor in self.xgb_models]
        )

        # Weighted blend, then clip to [0, 1]
        blended = self.catboost_weight * catboost_part + self.xgb_weight * xgb_part
        blended = np.clip(blended, 0, 1)

        return torch.tensor(blended, dtype=torch.double)

print('CatBoostXGBoostEnsemble defined.')

CatBoostXGBoostEnsemble defined.


In [6]:
# Quick test: fit the ensemble on the first leave-one-solvent-out fold only
print('Testing model...')
X, Y = load_data('single_solvent')
print(f'Single solvent data: X={X.shape}, Y={Y.shape}')

# Only the first split is pulled from the generator; later folds are never built.
split_gen = generate_leave_one_out_splits(X, Y)
train_fold, test_fold = next(split_gen)
train_X, train_Y = train_fold
test_X, test_Y = test_fold

print(f'Train: {len(train_X)}, Test: {len(test_X)}')

# Train on the fold's training part and predict its held-out part.
model = CatBoostXGBoostEnsemble(data='single')
model.train_model(train_X, train_Y)
preds = model.predict(test_X)

print(f'Predictions shape: {preds.shape}')
print(f'Predictions range: [{preds.min():.4f}, {preds.max():.4f}]')
print('Model test passed!')

Testing model...
Single solvent data: X=(656, 3), Y=(656, 3)
Train: 619, Test: 37


Predictions shape: torch.Size([37, 3])
Predictions range: [0.0165, 0.9186]
Model test passed!


In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = CatBoostXGBoostEnsemble(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# NOTE: everything between the two banners is the fixed submission template and
# is deliberately left untouched apart from the model line. It trains a fresh
# model per leave-one-solvent-out fold and stores one record per test row,
# tagged with task=0, the fold number and the row position inside the fold.
# Sanity check placed after the template: row count and number of distinct
# folds (the notebook header expects 24 folds for the single-solvent data).
print(f'Single solvent predictions: {len(submission_single_solvent)}')
print(f'Unique folds: {submission_single_solvent["fold"].nunique()}')

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = CatBoostXGBoostEnsemble(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# NOTE: everything between the two banners is the fixed submission template and
# is deliberately left untouched apart from the model line. It trains a fresh
# model per leave-one-ramp-out fold and stores one record per test row, tagged
# with task=1, the fold number and the row position inside the fold.
# Sanity check placed after the template: row count and number of distinct
# folds (the notebook header expects 13 folds for the full data).
print(f'Full data predictions: {len(submission_full_data)}')
print(f'Unique folds: {submission_full_data["fold"].nunique()}')

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# NOTE: the template above stacks the single-solvent and full-data predictions
# and writes them to CSV. reset_index() is called without drop=True, so the
# per-frame row index is kept as an "index" column next to the new "id" index.
# NOTE(review): the banner requires this to be the final cell, but this
# notebook continues with a verification cell and a CV-score cell after it -
# confirm those are removed (or moved above the submission cells) before
# submitting.
print(f'Submission saved to /home/submission/submission.csv')
print(f'Total rows: {len(submission)}')

In [None]:
# Verify submission format: re-read the CSV written by the submission cell and
# print its columns, per-task row/fold counts, target ranges and NaN/Inf flags.
# NOTE(review): the submission template states that the submission-writing cell
# must be the final cell of the notebook; this cell comes after it, so confirm
# it is removed (or moved earlier) before submitting.
print('\n' + '='*60)
print('SUBMISSION VERIFICATION')
print('='*60)

df = pd.read_csv('/home/submission/submission.csv')

print(f'\nColumns: {df.columns.tolist()}')
print(f'Total rows: {len(df)}')

task0 = df[df['task'] == 0]
task1 = df[df['task'] == 1]
for task_header, task_rows in (
    ('Task 0 (single solvent)', task0),
    ('Task 1 (full data)', task1),
):
    print(f'\n{task_header}:')
    print(f'  Rows: {len(task_rows)}')
    print(f'  Folds: {task_rows["fold"].nunique()}')
    print(f'  Fold range: {task_rows["fold"].min()} to {task_rows["fold"].max()}')

print(f'\nTarget statistics:')
for col in ['target_1', 'target_2', 'target_3']:
    print(f'  {col}: min={df[col].min():.6f}, max={df[col].max():.6f}')

print(f'\nAny NaN: {df.isna().any().any()}')
print(f'Any Inf: {(df == float("inf")).any().any() or (df == float("-inf")).any().any()}')

# Only claim the official CV scheme when the fold counts actually match it
# (24 leave-one-solvent-out folds, 13 leave-one-ramp-out folds). The check mark
# is written as an escape so it cannot be mangled by a file-encoding mismatch.
if task0["fold"].nunique() == 24 and task1["fold"].nunique() == 13:
    print('\n\u2713 This uses OFFICIAL Leave-One-Out CV (24 folds for single, 13 folds for full)')
else:
    print('\n\u2717 Fold counts do not match the OFFICIAL Leave-One-Out CV (expected 24 folds for single, 13 folds for full)')

In [None]:
# Calculate CV score: mean squared error of the stored out-of-fold predictions
# against the true targets, per task and row-weighted overall.
# NOTE(review): the submission template states that the submission-writing cell
# must be the final cell of the notebook; this cell comes after it, so confirm
# it is removed (or moved earlier) before submitting.
print('\n' + '='*60)
print('CV SCORE CALCULATION')
print('='*60)

X_single, Y_single = load_data('single_solvent')
X_full, Y_full = load_data('full')

# Get actuals in same order as predictions. The held-out targets are taken from
# the same split generators that produced the predictions, so the fold order
# and the row order inside each fold cannot drift apart from the submission.
actuals_single = np.vstack([
    test_Y.values
    for _, (_, test_Y) in generate_leave_one_out_splits(X_single, Y_single)
])
actuals_full = np.vstack([
    test_Y.values
    for _, (_, test_Y) in generate_leave_one_ramp_out_splits(X_full, Y_full)
])

# Get predictions
target_columns = ['target_1', 'target_2', 'target_3']
preds_single = submission_single_solvent[target_columns].values
preds_full = submission_full_data[target_columns].values

# Fail loudly on a size mismatch instead of relying on NumPy broadcasting rules.
for task_label, actuals, preds in (
    ('single solvent', actuals_single, preds_single),
    ('full data', actuals_full, preds_full),
):
    if actuals.shape != preds.shape:
        raise ValueError(
            f'{task_label}: actuals {actuals.shape} and predictions {preds.shape} are not aligned'
        )

# Calculate MSE (overall score is weighted by the number of rows per task)
mse_single = np.mean((actuals_single - preds_single) ** 2)
mse_full = np.mean((actuals_full - preds_full) ** 2)
mse_overall = (mse_single * len(actuals_single) + mse_full * len(actuals_full)) / (len(actuals_single) + len(actuals_full))

print(f'\nSingle Solvent MSE: {mse_single:.6f} (n={len(actuals_single)})')
print(f'Full Data MSE: {mse_full:.6f} (n={len(actuals_full)})')
print(f'Overall MSE: {mse_overall:.6f}')

# Reference numbers from earlier experiments, hard-coded for comparison.
print(f'\nBest previous CV: 0.008092 (exp_049-053)')
print(f'Best previous LB: 0.0877 (exp_030)')
print(f'Target: 0.0347')