# Experiment 052: CatBoost + XGBoost Ensemble (WITH CLIPPING)

**Goal:** Regenerate exp_050's submission with proper clipping to [0, 1].

**Issue:** Previous submissions may have failed due to target values > 1.0.

**Fix:** Clip all target values to [0, 1] before saving submission.

In [None]:
import sys
import numpy as np
import pandas as pd
import torch
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Define constants
# Directory that holds the yield tables and the `<name>_lookup.csv` descriptor files.
DATA_PATH = "/home/data"
# Input columns selected from the solvent-mixture ("full") yields table.
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
# Input columns selected from the single-solvent yields table.
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
# Target columns; they are written to the submission as target_1..target_3 in this order.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

# Load data
def load_data_local(name="full"):
    """Read one yields table and split it into input and target frames.

    Args:
        name: ``"full"`` selects the solvent-mixture table; any other value
            selects the single-solvent table.

    Returns:
        ``(X, Y)`` where ``X`` holds the input columns for the chosen table
        and ``Y`` holds the ``TARGET_LABELS`` columns.
    """
    is_full = name == "full"
    csv_name = (
        "catechol_full_data_yields.csv"
        if is_full
        else "catechol_single_solvent_yields.csv"
    )
    input_cols = INPUT_LABELS_FULL_SOLVENT if is_full else INPUT_LABELS_SINGLE_SOLVENT
    table = pd.read_csv(f'{DATA_PATH}/{csv_name}')
    return table[input_cols], table[TARGET_LABELS]

def load_features_local(name="spange_descriptors"):
    """Read ``<name>_lookup.csv`` from ``DATA_PATH``, using its first column as the index."""
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

# Official CV functions
def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per distinct value of ``X["SOLVENT NAME"]``.

    Solvents are visited in sorted order. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part holds
    exactly the rows of the held-out solvent and the train part holds the rest.
    ``Y`` is indexed with the same boolean masks as ``X``.
    """
    solvent_col = X["SOLVENT NAME"]
    for held_out in sorted(solvent_col.unique()):
        is_test = solvent_col == held_out
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited sorted by A name, then B name. Each item is
    ``((X_train, Y_train), (X_test, Y_test))``; a row is in the test part
    only when both of its solvent names match the held-out pair.
    """
    pair_cols = ["SOLVENT A NAME", "SOLVENT B NAME"]
    unique_pairs = X[pair_cols].drop_duplicates().sort_values(by=pair_cols)
    for name_a, name_b in unique_pairs.itertuples(index=False, name=None):
        # A row stays in training as soon as either solvent differs.
        is_train = (X["SOLVENT A NAME"] != name_a) | (X["SOLVENT B NAME"] != name_b)
        is_test = ~is_train
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

# Load data
# Inputs (X) and targets (Y) for the single-solvent and solvent-mixture tables.
X_single, Y_single = load_data_local("single_solvent")
X_full, Y_full = load_data_local("full")
# NOTE(review): `spange` is not referenced again in the cells shown here;
# build_solvent_feature_table() loads its own copy of the descriptors.
spange = load_features_local("spange_descriptors")

# Shape printout as a quick sanity check on the loaded tables.
print(f"Single solvent: X={X_single.shape}, Y={Y_single.shape}")
print(f"Full data: X={X_full.shape}, Y={Y_full.shape}")

In [None]:
# Feature engineering (same as exp_050)

def feature_priority(name: str) -> int:
    """Rank a feature column by the descriptor family encoded in its prefix.

    Higher values win when two correlated columns compete. Names without a
    known prefix rank lowest (0).
    """
    ranked_prefixes = (
        ("spange_", 5),
        ("acs_", 4),
        ("drfps_", 3),
        ("frag_", 2),
        ("smiles_", 1),
    )
    for prefix, rank in ranked_prefixes:
        if name.startswith(prefix):
            return rank
    return 0

def filter_correlated_features(df, threshold=0.90):
    """Drop constant numeric columns and one column of every highly correlated pair.

    Args:
        df: Table whose numeric columns are screened; non-numeric columns are
            never dropped.
        threshold: A pair is considered redundant when its absolute Pearson
            correlation is strictly greater than this value.

    Returns:
        ``(filtered_df, dropped_columns)``. If ``df`` has no numeric columns
        it is returned unchanged together with an empty list.

    For a redundant pair the column with the lower ``feature_priority`` is
    dropped; on equal priority the column positioned later in ``df`` is
    dropped. Pairs involving an already-dropped column are skipped.
    """
    numeric_df = df.select_dtypes(include=[np.number])
    if numeric_df.shape[1] == 0:
        return df, []

    # Zero-variance columns carry no signal and produce NaN correlations.
    std = numeric_df.std(axis=0)
    constant_cols = std[std == 0].index.tolist()
    if constant_cols:
        numeric_df = numeric_df.drop(columns=constant_cols)

    # Keep only the strict upper triangle so each pair is seen once;
    # undefined correlations are treated as 0.
    corr = numeric_df.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool)).fillna(0.0)
    cols = upper.columns.tolist()

    # Read the matrix once as a plain array: per-cell `.iloc` lookups inside
    # the pair loop dominate the runtime on wide fingerprint tables.
    corr_values = upper.values
    to_drop = set()
    for i, col_i in enumerate(cols):
        # Candidate partners j > i, in ascending order (same visiting order
        # as a full scan of the row).
        partners = np.flatnonzero(corr_values[i, i + 1:] > threshold) + i + 1
        for j in partners:
            col_j = cols[j]
            if col_i in to_drop or col_j in to_drop:
                continue
            p_i, p_j = feature_priority(col_i), feature_priority(col_j)
            if p_i > p_j:
                to_drop.add(col_j)
            elif p_j > p_i:
                to_drop.add(col_i)
            else:
                idx_i = df.columns.get_loc(col_i) if col_i in df.columns else 999
                idx_j = df.columns.get_loc(col_j) if col_j in df.columns else 999
                to_drop.add(col_i if idx_i > idx_j else col_j)

    all_to_drop = list(set(constant_cols).union(to_drop))
    return df.drop(columns=all_to_drop, errors="ignore"), all_to_drop

def add_numeric_features(X_num):
    """Return a copy of ``X_num`` with derived temperature / residence-time columns.

    When both "Temperature" and "Residence Time" are present, "Temperature"
    is shifted by +273.15 and four columns are appended: ``T_x_RT``,
    ``RT_log``, ``T_inv`` and ``RT_scaled``. Otherwise the copy is returned
    unchanged. The input frame is never modified.

    NOTE(review): ``RT_scaled`` divides by the mean residence time of the
    frame passed in, so train and test calls use different divisors.
    """
    out = X_num.copy()
    if not {"Temperature", "Residence Time"}.issubset(out.columns):
        return out

    shifted_temp = out["Temperature"] + 273.15
    res_time = out["Residence Time"]
    out["Temperature"] = shifted_temp
    out["T_x_RT"] = shifted_temp * res_time
    # Small offset keeps the log finite for a zero residence time.
    out["RT_log"] = np.log(res_time + 1e-6)
    out["T_inv"] = 1 / shifted_temp
    out["RT_scaled"] = res_time / res_time.mean()
    return out

def build_solvent_feature_table(threshold=0.90):
    """Merge all solvent descriptor lookups into one table keyed by "SOLVENT NAME".

    Each source's descriptor columns get a family prefix (``spange_``,
    ``acs_``, ``drfps_``, ``frag_``, ``smiles_``). For the two fingerprint
    sources, columns that are all 0, all 1, or sum to exactly 1 are removed
    first. The sources are outer-merged in a fixed order and the result is
    passed through ``filter_correlated_features`` with ``threshold``.

    Returns:
        The merged and filtered DataFrame.
    """
    # Source name -> column prefix; insertion order is the merge order.
    prefix_by_source = {
        "spange_descriptors": "spange",
        "acs_pca_descriptors": "acs",
        "drfps_catechol": "drfps",
        "fragprints": "frag",
        "smiles": "smiles",
    }
    fingerprint_sources = ("drfps_catechol", "fragprints")

    tables = []
    for source, prefix in prefix_by_source.items():
        table = load_features_local(source).copy()
        if "SOLVENT NAME" not in table.columns:
            table = table.reset_index().rename(columns={"index": "SOLVENT NAME"})

        if source in fingerprint_sources:
            # Remove columns that are identical across all solvents (all 0 / all 1).
            table = table.loc[:, (table != 0).any(axis=0)]
            table = table.loc[:, (table != 1).any(axis=0)]
            # Remove columns whose values sum to exactly 1.
            bits = table.drop(columns=["SOLVENT NAME"], errors="ignore")
            column_sums = bits.sum(axis=0)
            table = table.drop(columns=column_sums[column_sums == 1].index, errors="ignore")

        renames = {c: f"{prefix}_{c}" for c in table.columns if c != "SOLVENT NAME"}
        tables.append(table.rename(columns=renames))

    merged = tables[0]
    for right in tables[1:]:
        merged = pd.merge(merged, right, on="SOLVENT NAME", how="outer")

    merged_filtered, _ = filter_correlated_features(merged, threshold=threshold)
    return merged_filtered

# Module-level solvent descriptor table; CatBoostXGBEnsemble reads this global
# when it constructs its featurizer.
solvent_table = build_solvent_feature_table(threshold=0.90)
print(f"Solvent table shape: {solvent_table.shape}")

In [None]:
# Featurizer and Model classes (same as exp_050)

class CombinedFeaturizer:
    """Turn raw experiment rows into a scaled numeric feature tensor.

    Solvent descriptors are joined onto the rows (once for single-solvent
    data, separately for solvent A and solvent B otherwise), derived numeric
    features are added, and the result is standardised with a scaler fitted
    on the data passed with ``fit_scaler=True``.

    Args:
        solvent_table: Descriptor table with a "SOLVENT NAME" column.
        data: ``'single'`` for single-solvent rows; any other value selects
            the two-solvent layout ("SOLVENT A NAME" / "SOLVENT B NAME").
    """

    def __init__(self, solvent_table, data='single'):
        self.solvent_table = solvent_table
        self.data_mode = data
        # Fitted lazily by featurize(..., fit_scaler=True).
        self.scaler = None

    def _suffixed_table(self, role):
        """Return the solvent table keyed by 'SOLVENT <role> NAME' with '_<role>' suffixed features.

        Columns are renamed by name rather than by position, so the result is
        correct wherever the "SOLVENT NAME" column sits in the table.
        """
        key = f'SOLVENT {role} NAME'
        return self.solvent_table.rename(
            columns=lambda c: key if c == 'SOLVENT NAME' else f'{c}_{role}'
        )

    def featurize(self, X, fit_scaler=False):
        """Build the feature matrix for ``X``.

        Args:
            X: Input rows containing the solvent name column(s) for the
                configured data mode.
            fit_scaler: When True, fit a new StandardScaler on these rows and
                apply it. When False, apply the previously fitted scaler, or
                no scaling at all if none has been fitted yet.

        Returns:
            A ``torch.double`` tensor with one row per merged input row.
            Missing values are replaced by 0 before scaling.
        """
        X = X.copy()
        if self.data_mode == 'single':
            key_cols = ['SOLVENT NAME']
            X_merged = X.merge(self.solvent_table, on='SOLVENT NAME', how='left')
        else:
            key_cols = ['SOLVENT A NAME', 'SOLVENT B NAME']
            X_merged = (
                X.merge(self._suffixed_table('A'), on='SOLVENT A NAME', how='left')
                .merge(self._suffixed_table('B'), on='SOLVENT B NAME', how='left')
            )

        # Keep only plain float/int columns; text columns (names, SMILES) are dropped.
        numeric_cols = [
            c for c in X_merged.columns
            if c not in key_cols
            and X_merged[c].dtype in [np.float64, np.int64, np.float32, np.int32]
        ]
        X_numeric = add_numeric_features(X_merged[numeric_cols].copy())
        X_np = np.nan_to_num(X_numeric.values.astype(np.float64), nan=0.0)

        if fit_scaler:
            self.scaler = StandardScaler()
            X_np = self.scaler.fit_transform(X_np)
        elif self.scaler is not None:
            X_np = self.scaler.transform(X_np)
        return torch.tensor(X_np, dtype=torch.double)

class CatBoostXGBEnsemble:
    """Weighted blend of a multi-output CatBoost model and per-target XGBoost models.

    Predictions are clipped to [0, 1]; rows whose clipped targets sum to more
    than 1 are rescaled so they sum to 1.

    Args:
        data: ``'single'`` selects the single-solvent hyperparameters and
            blend weights; any other value selects the solvent-mixture set.
            The featurizer is built from the module-level ``solvent_table``.
    """

    def __init__(self, data='single'):
        self.data_mode = data
        self.featurizer = CombinedFeaturizer(solvent_table, data=data)
        if data == 'single':
            self.cat_weight, self.xgb_weight = 7.0/13, 6.0/13
            self.cat_params = dict(
                random_seed=42, loss_function="MultiRMSE", depth=3,
                learning_rate=0.07, n_estimators=1050, l2_leaf_reg=3.5,
                bootstrap_type="Bayesian", bagging_temperature=0.225,
                grow_policy="SymmetricTree", rsm=0.75, verbose=False,
            )
            self.xgb_params = dict(
                random_state=42, objective="reg:squarederror", tree_method="hist",
                subsample=0.5, reg_lambda=0.6, n_estimators=1000, max_depth=4,
                learning_rate=0.02, colsample_bytree=0.3, colsample_bylevel=0.6,
            )
        else:
            self.cat_weight, self.xgb_weight = 1.0/3, 2.0/3
            self.cat_params = dict(
                random_seed=42, loss_function="MultiRMSE", depth=3,
                learning_rate=0.06, n_estimators=1100, l2_leaf_reg=2.5,
                bootstrap_type="Bayesian", bagging_temperature=0.25,
                grow_policy="SymmetricTree", rsm=0.75, verbose=False,
            )
            self.xgb_params = dict(
                random_state=42, objective="reg:squarederror", tree_method="approx",
                subsample=0.5, reg_lambda=0.6, n_estimators=1000, max_depth=4,
                learning_rate=0.02, grow_policy="lossguide",
                colsample_bytree=0.3, colsample_bylevel=0.6,
            )
        # Set by train_model().
        self.cat_model = None
        self.xgb_models = None

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Fit the scaler, the CatBoost model and one XGBoost model per target column.

        ``device`` and ``verbose`` are accepted but not used here.
        """
        X_np = self.featurizer.featurize(train_X, fit_scaler=True).numpy()
        Y_np = train_Y.values
        self.cat_model = CatBoostRegressor(**self.cat_params)
        self.cat_model.fit(X_np, Y_np)
        self.xgb_models = [
            XGBRegressor(**self.xgb_params).fit(X_np, Y_np[:, t])
            for t in range(Y_np.shape[1])
        ]

    @staticmethod
    def _clip_and_normalize(out):
        """Clip a 2-D prediction array to [0, 1] and cap each row sum at 1.

        Rows whose clipped values sum to 1 or less are left as they are;
        rows that sum to more are divided by their sum. Single-column input
        is only clipped.
        """
        out = np.clip(out, 0.0, 1.0)
        if out.shape[1] > 1:
            totals = out.sum(axis=1, keepdims=True)
            out = out / np.maximum(totals, 1.0)
        return out

    def predict(self, X):
        """Return blended, clipped predictions for ``X`` as a ``torch.double`` tensor.

        Raises:
            RuntimeError: If called before ``train_model``.
        """
        if self.cat_model is None or self.xgb_models is None:
            raise RuntimeError(
                "CatBoostXGBEnsemble.predict() called before train_model()."
            )
        X_np = self.featurizer.featurize(X, fit_scaler=False).numpy()
        cat_pred = np.asarray(self.cat_model.predict(X_np))
        if cat_pred.ndim == 1:
            cat_pred = cat_pred.reshape(-1, 1)
        xgb_pred = np.column_stack([m.predict(X_np) for m in self.xgb_models])
        blended = self.cat_weight * cat_pred + self.xgb_weight * xgb_pred
        # CRITICAL: Clip to [0, 1] and normalize
        return torch.tensor(self._clip_and_normalize(blended), dtype=torch.double)

# Log marker: the classes above clip predictions inside predict().
print("Model classes defined with CLIPPING to [0, 1].")

In [None]:
# Quick CV check to verify model works
from itertools import islice

print("Running quick CV check...")

fold_mses = []
# Smoke test on the first three leave-one-solvent-out folds only. islice stops
# the generator after three folds instead of building every split first.
quick_folds = islice(generate_leave_one_out_splits(X_single, Y_single), 3)
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(quick_folds):
    model = CatBoostXGBEnsemble(data='single')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # MSE averaged over all rows and all three targets of the held-out fold.
    mse = np.mean((preds - test_Y.values) ** 2)
    fold_mses.append(mse)
    print(f"Fold {fold_idx}: MSE = {mse:.6f}")

print(f"\nQuick check passed. Mean MSE: {np.mean(fold_mses):.6f}")

In [None]:
# Generate submission
print(f"\n{'=' * 60}")
print("GENERATING SUBMISSION WITH CLIPPING")
print("=" * 60)

# Single solvent predictions (24 folds)
print("\nGenerating single solvent predictions (24 folds)...")
all_predictions_single = []

# The splits are materialised so tqdm can show a total.
single_folds = list(generate_leave_one_out_splits(X_single, Y_single))
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(tqdm.tqdm(single_folds)):
    # A fresh ensemble is trained for every held-out solvent.
    model = CatBoostXGBEnsemble(data='single')
    model.train_model(train_X, train_Y)
    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()

    # One submission record per test row; "row" is the position within the fold.
    all_predictions_single.extend(
        {
            "task": 0, "fold": fold_idx, "row": row_idx,
            "target_1": row[0], "target_2": row[1], "target_3": row[2]
        }
        for row_idx, row in enumerate(predictions_np)
    )

submission_single_solvent = pd.DataFrame(all_predictions_single)
print(f"Single solvent predictions: {len(submission_single_solvent)}")

In [None]:
# Full data predictions (13 folds by solvent PAIRS)
print("\nGenerating full data predictions (13 folds by solvent PAIRS)...")
all_predictions_full = []

# The splits are materialised so tqdm can show a total.
ramp_folds = list(generate_leave_one_ramp_out_splits(X_full, Y_full))
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(tqdm.tqdm(ramp_folds)):
    # A fresh ensemble is trained for every held-out solvent pair.
    model = CatBoostXGBEnsemble(data='full')
    model.train_model(train_X, train_Y)
    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()

    # One submission record per test row; "row" is the position within the fold.
    all_predictions_full.extend(
        {
            "task": 1, "fold": fold_idx, "row": row_idx,
            "target_1": row[0], "target_2": row[1], "target_3": row[2]
        }
        for row_idx, row in enumerate(predictions_np)
    )

submission_full_data = pd.DataFrame(all_predictions_full)
print(f"Full data predictions: {len(submission_full_data)}")

In [None]:
# Combine and save submission with CLIPPING
import os

submission = pd.concat([submission_single_solvent, submission_full_data])
# reset_index() without drop=True keeps the old per-task row labels as an
# "index" column and installs a fresh 0..N-1 index, which is then named "id".
# NOTE(review): confirm the scorer expects the extra "index" column.
submission = submission.reset_index()
submission.index.name = "id"

# CRITICAL: Clip all targets to [0, 1]
print("\nApplying final clipping to [0, 1]...")
for col in ['target_1', 'target_2', 'target_3']:
    # Count the offenders before clipping so the log shows whether this step
    # changed anything (predict() already clips, so 0 is the expected count).
    before_clip = ((submission[col] > 1) | (submission[col] < 0)).sum()
    submission[col] = submission[col].clip(0, 1)
    print(f"  {col}: {before_clip} values clipped")

# Save
os.makedirs('/home/submission', exist_ok=True)
submission.to_csv("/home/submission/submission.csv", index=True)

print("\nSubmission saved to /home/submission/submission.csv")
print(f"Total rows: {len(submission)}")

In [None]:
# Verify submission
print("\n" + "="*60)
print("SUBMISSION VERIFICATION")
print("="*60)

# Re-read the file from disk so the checks cover what was actually written.
df = pd.read_csv('/home/submission/submission.csv')

target_cols = ['target_1', 'target_2', 'target_3']
# Columns this notebook writes for every prediction record.
required_cols = ['task', 'fold', 'row'] + target_cols
missing_cols = [c for c in required_cols if c not in df.columns]

print(f"\nTotal rows: {len(df)}")

if missing_cols:
    print(f"\n✗ Submission is missing columns: {missing_cols}")
else:
    print(f"Tasks: {df['task'].unique()}")
    print("Folds per task:")
    print(df.groupby('task')['fold'].nunique())

    print("\nTarget statistics:")
    for col in target_cols:
        print(f"  {col}: min={df[col].min():.6f}, max={df[col].max():.6f}")
        print(f"    Values > 1: {(df[col] > 1).sum()}")
        print(f"    Values < 0: {(df[col] < 0).sum()}")

    # Count invalid cells before the helper "sum" column is added. NaN is
    # counted as invalid because it passes both `> 1` and `< 0` comparisons.
    targets = df[target_cols]
    n_invalid = int((targets.isna() | (targets < 0) | (targets > 1)).values.sum())

    print("\nTarget sums:")
    df['sum'] = df['target_1'] + df['target_2'] + df['target_3']
    print(f"  min={df['sum'].min():.6f}, max={df['sum'].max():.6f}, mean={df['sum'].mean():.6f}")

    # Report success only when the check actually passed.
    if n_invalid == 0:
        print("\n✓ All targets in [0, 1] range")
    else:
        print(f"\n✗ {n_invalid} target values are NaN or outside [0, 1]")
    print("✓ Submission format correct")

In [None]:
# Calculate CV for logging
def _fold_mses(splits, data_mode):
    """Train a fresh ensemble on every split and return the per-fold test MSEs.

    Each MSE is averaged over all rows and all targets of the held-out fold.
    """
    mses = []
    for (train_X, train_Y), (test_X, test_Y) in splits:
        model = CatBoostXGBEnsemble(data=data_mode)
        model.train_model(train_X, train_Y)
        preds = model.predict(test_X).numpy()
        mses.append(np.mean((preds - test_Y.values) ** 2))
    return mses


print("\n" + "="*60)
print("CV CALCULATION FOR LOGGING")
print("="*60)

# NOTE: this retrains every fold's models; the submission cells do not keep
# their per-fold errors.

# Single solvent CV
print("\nCalculating single solvent CV...")
single_fold_mses = _fold_mses(generate_leave_one_out_splits(X_single, Y_single), 'single')

single_cv = np.mean(single_fold_mses)
single_cv_std = np.std(single_fold_mses)
print(f"Single solvent CV MSE: {single_cv:.6f} +/- {single_cv_std:.6f}")

# Full data CV
print("\nCalculating full data CV...")
full_fold_mses = _fold_mses(generate_leave_one_ramp_out_splits(X_full, Y_full), 'full')

full_cv = np.mean(full_fold_mses)
full_cv_std = np.std(full_fold_mses)
print(f"Full data CV MSE: {full_cv:.6f} +/- {full_cv_std:.6f}")

# Weighted combined CV: fold-mean MSEs weighted by each task's row count.
n_single, n_full = len(X_single), len(X_full)
weighted_cv = (n_single * single_cv + n_full * full_cv) / (n_single + n_full)
print(f"\nWeighted combined CV: {weighted_cv:.6f}")

# Reference numbers recorded from the earlier experiment (hard-coded).
print("\nBaseline (exp_050 without clipping):")
print("  Single CV: 0.008092")
print("  Full CV: 0.012482")

In [None]:
# Summary
print("\n" + "="*60)
print("EXPERIMENT 052: SUMMARY")
print("="*60)

print("\nGOAL: Regenerate exp_050 with proper clipping to [0, 1]")
print("\nFIX APPLIED:")
print("  - Clip predictions to [0, 1] in model.predict()")
print("  - Additional clipping before saving submission")
print("  - This should fix 'Evaluation metric raised an unexpected error'")

print("\nRESULTS:")
print(f"  Single solvent CV MSE: {single_cv:.6f}")
print(f"  Full data CV MSE: {full_cv:.6f}")
print(f"  Weighted combined CV: {weighted_cv:.6f}")

# Derive the range status from the in-memory submission instead of asserting
# it; NaN fails both comparisons and is therefore reported as "NO".
summary_targets = submission[['target_1', 'target_2', 'target_3']]
all_in_range = bool(((summary_targets >= 0) & (summary_targets <= 1)).all().all())

print("\nSUBMISSION:")
print("  Saved to: /home/submission/submission.csv")
print(f"  Total rows: {len(submission)}")
print(f"  All targets in [0, 1]: {'YES' if all_in_range else 'NO'}")

# NOTE(review): 4.31 and 0.0525 are hard-coded coefficients of a CV-to-LB fit
# that is not derived in this notebook; confirm they are still current.
print("\nEXPECTED LB (based on CV-LB relationship):")
print(f"  LB = 4.31 * {single_cv:.6f} + 0.0525 = {4.31 * single_cv + 0.0525:.4f}")
print("  Best LB so far: 0.0877 (exp_030)")

In [None]:
# Final check
print("\n" + "="*60)
print("FINAL VERIFICATION")
print("="*60)

# Re-read the saved file so the checks apply to what is on disk.
df = pd.read_csv('/home/submission/submission.csv')

# Check for any issues
issues = []

nan_total = df.isna().sum().sum()
if nan_total > 0:
    issues.append(f"NaN values: {nan_total}")

# All "> 1" checks are reported first, then all "< 0" checks.
range_checks = (
    ("> 1", lambda values: values > 1),
    ("< 0", lambda values: values < 0),
)
for symbol, violates in range_checks:
    for col in ('target_1', 'target_2', 'target_3'):
        n_bad = violates(df[col]).sum()
        if n_bad > 0:
            issues.append(f"{col} {symbol}: {n_bad}")

if not issues:
    print("✓ No issues found")
    print("✓ Submission is ready for upload")
    print(f"\nCV for logging: {single_cv:.6f}")
else:
    print("ISSUES FOUND:")
    for issue in issues:
        print(f"  - {issue}")

In [None]:
# Print final CV for easy copying
# Only the single-solvent CV is reported here; full_cv and weighted_cv are
# printed in the summary cell.
print(f"\n\nFINAL CV FOR LOGGING: {single_cv:.6f}")