# Experiment 060: Clean CatBoost + XGBoost Ensemble

**Goal:** Create a clean, simple submission using our best-performing model.

**Approach:**
- CatBoost + XGBoost ensemble (blend weights: 0.6 CatBoost, 0.4 XGBoost)
- Spange descriptors + Arrhenius kinetics features
- NO extrapolation detection (it hurt CV)
- Minimal code, following official template exactly

In [1]:
import pandas as pd
import numpy as np
import torch
from abc import ABC, abstractmethod
import tqdm
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler

torch.set_default_dtype(torch.double)
DATA_PATH = "/home/data"

print("Imports complete.")

Imports complete.


In [2]:
# Constants
# Numeric input columns; the order matters because the feature code indexes
# them positionally (column 0 = residence time, column 1 = temperature).
INPUT_LABELS_NUMERIC = [
    "Residence Time",
    "Temperature",
]
# Target columns, in the order they are written out as target_1..target_3.
TARGET_LABELS = [
    "Product 2",
    "Product 3",
    "SM",
]

# Data loading
def load_data(name="full"):
    """Load one of the two yield datasets and split it into inputs and targets.

    Args:
        name: ``"full"`` for the solvent-mixture file (inputs include both
            solvent names and ``SolventB%``) or ``"single_solvent"`` for the
            single-solvent file (inputs include ``SOLVENT NAME``).

    Returns:
        Tuple ``(X, Y)`` of DataFrames: ``X`` holds the input columns for the
        chosen dataset, ``Y`` holds the ``TARGET_LABELS`` columns.

    Raises:
        ValueError: if ``name`` is not one of the two supported dataset names.
            Previously any other value silently loaded the single-solvent
            file, so a typo such as ``"Full"`` returned the wrong dataset.
    """
    if name == "full":
        csv_name = "catechol_full_data_yields.csv"
        input_columns = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
    elif name == "single_solvent":
        csv_name = "catechol_single_solvent_yields.csv"
        input_columns = ["Residence Time", "Temperature", "SOLVENT NAME"]
    else:
        # Validate before touching the filesystem so the error is immediate.
        raise ValueError(
            f"Unknown dataset name {name!r}; expected 'full' or 'single_solvent'."
        )

    df = pd.read_csv(f'{DATA_PATH}/{csv_name}')
    X = df[input_columns]
    Y = df[TARGET_LABELS]
    return X, Y

# CV functions
def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out splits, one per unique ``SOLVENT NAME``.

    Solvents are visited in sorted order. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part holds every
    row of the held-out solvent and the train part holds all remaining rows.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        is_train = solvent_names != held_out
        is_test = ~is_train
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield leave-one-ramp-out splits, one per unique (solvent A, solvent B) pair.

    Pairs are visited sorted by solvent A name, then solvent B name. Each item
    is ``((X_train, Y_train), (X_test, Y_test))`` where the test part holds the
    rows matching the held-out pair and the train part holds all other rows.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    unique_pairs = X[pair_columns].drop_duplicates().sort_values(by=pair_columns)
    for solvent_a, solvent_b in zip(unique_pairs["SOLVENT A NAME"], unique_pairs["SOLVENT B NAME"]):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        outside_ramp = ~in_ramp
        yield (X[outside_ramp], Y[outside_ramp]), (X[in_ramp], Y[in_ramp])

print("Data functions defined.")

Data functions defined.


In [3]:
# Load Spange descriptors
# index_col=0 turns the file's first column into the row index; the model
# later looks rows up by label (.loc) using the solvent-name columns.
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
# Print (rows, columns) as a quick check that the lookup table loaded.
print(f'Spange: {SPANGE_DF.shape}')

Spange: (26, 13)


In [4]:
# Base classes
class SmilesFeaturizer(ABC):
    """Placeholder featurizer interface.

    Both the constructor and ``featurize`` raise ``NotImplementedError``, so a
    usable featurizer has to override both of them.
    """

    def __init__(self):
        raise NotImplementedError

    def featurize(self, X):
        """Turn ``X`` into features; must be provided by a subclass."""
        raise NotImplementedError

class BaseModel(ABC):
    """Minimal model interface: fit with ``train_model``, infer with ``predict``.

    The base class is instantiable; its two methods raise
    ``NotImplementedError`` until a subclass overrides them.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs and targets (subclass responsibility)."""
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for ``X`` (subclass responsibility).

        ``X`` is accepted here so the base signature agrees with how the
        subclass and the calling cells use it (``model.predict(test_X)``).
        Without it, calling the base method with data raised ``TypeError``
        instead of the intended ``NotImplementedError``. The default keeps
        the old zero-argument call working.
        """
        raise NotImplementedError

print("Base classes defined.")

Base classes defined.


In [5]:
# Simple CatBoost + XGBoost Ensemble
class CatBoostXGBoostEnsemble(BaseModel):
    """Weighted blend of per-target CatBoost and XGBoost regressors.

    One CatBoost and one XGBoost regressor are fitted per target column on
    standardized features built from the numeric process inputs, a few
    kinetics-style transforms of them, and Spange solvent descriptors.
    Predictions are ``cat_weight * catboost + xgb_weight * xgboost``, clipped
    to ``[0, 1]``.

    Args:
        data: ``'single'`` to look descriptors up by ``SOLVENT NAME``; any
            other value is treated as the solvent-mixture layout
            (``SOLVENT A NAME`` / ``SOLVENT B NAME`` / ``SolventB%``).
        cat_weight: blend weight applied to the CatBoost predictions.
        xgb_weight: blend weight applied to the XGBoost predictions.
    """

    def __init__(self, data='single', cat_weight=0.6, xgb_weight=0.4):
        self.data = data
        self.cat_weight = cat_weight
        self.xgb_weight = xgb_weight
        self.spange_df = SPANGE_DF
        # Populated by train_model(); None means "not trained yet".
        self.cat_models = None
        self.xgb_models = None
        self.scaler = None

    def _prepare_features(self, X):
        """Build the dense feature matrix for the rows of ``X``.

        Columns are: the raw numeric inputs, 1000 / (temperature + 273.15),
        log(residence time + 1e-6), their product, then (mixture layout only)
        the solvent-B fraction, and finally the Spange descriptors.
        """
        # Numeric + Arrhenius
        X_vals = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        # Column 1 is "Temperature"; presumably in degrees Celsius, hence the
        # Kelvin offset — TODO confirm units against the data file.
        temp_k = X_vals[:, 1:2] + 273.15
        inv_temp = 1000.0 / temp_k
        # Small epsilon keeps the log finite for a zero residence time.
        log_time = np.log(X_vals[:, 0:1] + 1e-6)
        X_kinetic = np.hstack([X_vals, inv_temp, log_time, inv_temp * log_time])

        if self.data == 'single':
            X_spange = self.spange_df.loc[X["SOLVENT NAME"]].values
            return np.hstack([X_kinetic, X_spange])

        A = self.spange_df.loc[X["SOLVENT A NAME"]].values
        B = self.spange_df.loc[X["SOLVENT B NAME"]].values
        # NOTE(review): dividing by 100 assumes SolventB% is stored on a
        # 0-100 scale — confirm against the data file.
        pct = X["SolventB%"].values.reshape(-1, 1) / 100.0
        # Linear interpolation of the two solvents' descriptors.
        X_spange = A * (1 - pct) + B * pct
        return np.hstack([X_kinetic, pct, X_spange])

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Fit the scaler and one CatBoost + one XGBoost regressor per target.

        Args:
            train_X: input DataFrame in the layout selected by ``data``.
            train_Y: targets, one column per target (DataFrame or 2-D array).
            device: accepted for interface compatibility; unused.
            verbose: accepted for interface compatibility; unused.
        """
        X = self._prepare_features(train_X)
        Y = np.asarray(train_Y)
        if Y.ndim == 1:
            # Treat a single target vector as one target column.
            Y = Y.reshape(-1, 1)
        # Derive the target count from the data instead of assuming three.
        n_targets = Y.shape[1]

        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X)

        # CatBoost
        self.cat_models = []
        for i in range(n_targets):
            m = CatBoostRegressor(iterations=500, depth=6, learning_rate=0.05, l2_leaf_reg=3.0, random_seed=42, verbose=False)
            m.fit(X_scaled, Y[:, i])
            self.cat_models.append(m)

        # XGBoost
        self.xgb_models = []
        for i in range(n_targets):
            m = XGBRegressor(n_estimators=400, max_depth=5, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, random_state=42, verbosity=0)
            m.fit(X_scaled, Y[:, i])
            self.xgb_models.append(m)

    def predict(self, X):
        """Return blended predictions as a double tensor of shape [N, n_targets].

        Raises:
            RuntimeError: if called before ``train_model``.
        """
        if self.scaler is None or self.cat_models is None or self.xgb_models is None:
            # Fail with a clear message rather than an AttributeError on None.
            raise RuntimeError("CatBoostXGBoostEnsemble.predict() called before train_model().")

        X_feat = self._prepare_features(X)
        X_scaled = self.scaler.transform(X_feat)

        cat_preds = np.column_stack([m.predict(X_scaled) for m in self.cat_models])
        xgb_preds = np.column_stack([m.predict(X_scaled) for m in self.xgb_models])

        preds = self.cat_weight * cat_preds + self.xgb_weight * xgb_preds
        # Clip the blend to the [0, 1] range.
        preds = np.clip(preds, 0.0, 1.0)
        return torch.tensor(preds, dtype=torch.double)

print("CatBoostXGBoostEnsemble defined.")

CatBoostXGBoostEnsemble defined.


In [6]:
# Quick test
# Smoke test: fit on the first leave-one-solvent-out fold and check the
# prediction contract before running the full submission loops.
X, Y = load_data("single_solvent")
split_gen = generate_leave_one_out_splits(X, Y)
(train_X, train_Y), (test_X, test_Y) = next(split_gen)

model = CatBoostXGBoostEnsemble()
model.train_model(train_X, train_Y)
preds = model.predict(test_X)

# Actually verify something before announcing success: one row per test
# sample, one column per target, and all values inside the clipped range.
assert tuple(preds.shape) == (len(test_X), len(TARGET_LABELS)), preds.shape
assert bool(((preds >= 0.0) & (preds <= 1.0)).all()), "predictions outside [0, 1]"
print(f"Test passed! Predictions shape: {preds.shape}")

Test passed! Predictions shape: torch.Size([37, 3])


In [7]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = CatBoostXGBoostEnsemble() # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:00,  1.15it/s]

2it [00:01,  1.16it/s]

3it [00:02,  1.16it/s]

4it [00:03,  1.17it/s]

5it [00:04,  1.18it/s]

6it [00:05,  1.17it/s]

7it [00:06,  1.15it/s]

8it [00:06,  1.11it/s]

9it [00:07,  1.12it/s]

10it [00:08,  1.14it/s]

11it [00:09,  1.15it/s]

12it [00:10,  1.16it/s]

13it [00:11,  1.11it/s]

14it [00:12,  1.09it/s]

15it [00:13,  1.10it/s]

16it [00:14,  1.07it/s]

17it [00:15,  1.05it/s]

18it [00:16,  1.05it/s]

19it [00:17,  1.04it/s]

20it [00:18,  1.05it/s]

21it [00:18,  1.08it/s]

22it [00:19,  1.10it/s]

23it [00:20,  1.12it/s]

24it [00:21,  1.09it/s]

24it [00:21,  1.11it/s]




In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = CatBoostXGBoostEnsemble(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:01,  1.27s/it]

2it [00:02,  1.22s/it]

3it [00:03,  1.19s/it]

4it [00:04,  1.18s/it]

5it [00:05,  1.19s/it]

6it [00:07,  1.22s/it]

7it [00:08,  1.21s/it]

8it [00:09,  1.20s/it]

9it [00:10,  1.21s/it]

10it [00:12,  1.20s/it]

11it [00:13,  1.19s/it]

12it [00:14,  1.19s/it]

13it [00:15,  1.18s/it]

13it [00:15,  1.20s/it]




In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [10]:
# Calculate CV
# NOTE(review): the preceding cell is marked as having to be the final cell of
# a valid submission, yet this diagnostic cell follows it — confirm it is
# removed or moved before submitting.

# Coefficients of the linear CV -> leaderboard estimate printed at the end.
# NOTE(review): their origin is not shown in this notebook — presumably fitted
# on earlier submissions; confirm before relying on the estimate.
LB_SLOPE = 4.31
LB_INTERCEPT = 0.0525


def _cv_fold_mses(X, Y, split_fn, **model_kwargs):
    """Return the per-fold MSE of a freshly trained ensemble for each split.

    ``split_fn(X, Y)`` must yield ``((train_X, train_Y), (test_X, test_Y))``;
    ``model_kwargs`` are forwarded to ``CatBoostXGBoostEnsemble``. Each fold's
    MSE is the mean squared error over all test rows and target columns.
    """
    fold_mses = []
    for (train_X, train_Y), (test_X, test_Y) in split_fn(X, Y):
        model = CatBoostXGBoostEnsemble(**model_kwargs)
        model.train_model(train_X, train_Y)
        preds = model.predict(test_X).numpy()
        fold_mses.append(np.mean((preds - test_Y.values) ** 2))
    return fold_mses


print("\nCalculating CV...")
X, Y = load_data("single_solvent")
fold_mses = _cv_fold_mses(X, Y, generate_leave_one_out_splits)
# Unweighted mean over folds: every held-out solvent counts equally.
single_cv = np.mean(fold_mses)
print(f"Single solvent CV: {single_cv:.6f}")

X, Y = load_data("full")
full_fold_mses = _cv_fold_mses(X, Y, generate_leave_one_ramp_out_splits, data='full')
full_cv = np.mean(full_fold_mses)
print(f"Full data CV: {full_cv:.6f}")

# The headline figure and the leaderboard estimate use the single-solvent CV only.
print(f"\nFINAL CV: {single_cv:.6f}")
print(f"Predicted LB: {LB_SLOPE * single_cv + LB_INTERCEPT:.4f}")


Calculating CV...


Single solvent CV: 0.011171


Full data CV: 0.013677

FINAL CV: 0.011171
Predicted LB: 0.1006
