# Yield Normalization on CatBoost+XGBoost Ensemble

**Hypothesis**: The "Ens Model" kernel applies yield normalization to its predictions (clip to [0, ∞), then rescale any row whose sum exceeds 1 so that it sums to 1). This is a physics-based constraint that should help with distribution shift.

**Implementation**:
1. Use CatBoost + XGBoost ensemble (best CV from exp_049/050/053)
2. Add yield normalization: clip predictions to [0, ∞), then rescale any row whose sum exceeds 1 so that it sums to 1
3. Compare with and without normalization

**Expected outcome**: Yield normalization should reduce distribution shift error by enforcing domain constraints.

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
from catboost import CatBoostRegressor
import xgboost as xgb
import tqdm
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
torch.set_default_dtype(torch.double)

print('Imports complete')

Imports complete


In [2]:
# Data loading functions
# Directory that holds every CSV this notebook reads (yield tables and
# solvent-descriptor lookup tables).
DATA_PATH = '/home/data'

# Numeric process-condition columns. Order matters: Featurizer.featurize reads
# position 0 as the residence time and position 1 as the temperature.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]

def load_data(name="full"):
    """Read one of the two yield tables and split it into inputs and targets.

    Args:
        name: "full" selects the full-data table (two named solvents plus
            "SolventB%"); any other value selects the single-solvent table.

    Returns:
        Tuple (X, Y). X holds "Residence Time", "Temperature" and the solvent
        columns of the chosen table; Y holds "Product 2", "Product 3", "SM".
    """
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = ["Residence Time", "Temperature", "SOLVENT NAME"]

    table = pd.read_csv(csv_path)
    return table[input_columns], table[["Product 2", "Product 3", "SM"]]

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per distinct value of X["SOLVENT NAME"].

    Solvents are visited in sorted order. For each one, the rows with that
    solvent form the test part and all remaining rows form the train part.

    Yields:
        ((train_X, train_Y), (test_X, test_Y))
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        is_test = solvent_names == held_out
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in X. For each pair, the
    rows matching both names form the test part; all other rows form the
    train part.

    Yields:
        ((train_X, train_Y), (test_X, test_Y))
    """
    unique_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for solvent_a, solvent_b in unique_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        out_of_ramp = ~in_ramp
        yield (X[out_of_ramp], Y[out_of_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Load the three solvent-descriptor lookup tables; the first CSV column becomes the row index
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFP_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

# Keep only the DRFP columns whose variance is strictly positive
# (columns that are constant across all rows are dropped)
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = drfp_variance.index[(drfp_variance > 0).to_numpy()].tolist()
DRFP_FILTERED = DRFP_DF.loc[:, nonzero_variance_cols]

print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')

Spange: (26, 13), DRFP filtered: (24, 122), ACS PCA: (24, 5)


In [4]:
# Featurizer
class Featurizer:
    """Turn process conditions and solvent names into one numeric feature matrix.

    Each row consists of five kinetic columns (residence time, temperature,
    1000 / (temperature + 273.15), log(residence time + 1e-6), and the product
    of the last two) followed by the Spange, filtered-DRFP and ACS-PCA
    descriptor columns, in that order.

    With mixed=True the descriptors of "SOLVENT A NAME" and "SOLVENT B NAME"
    are blended linearly with weight "SolventB%" on solvent B; otherwise the
    descriptors of "SOLVENT NAME" are used directly.
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.drfp_df = DRFP_FILTERED
        self.acs_pca_df = ACS_PCA_DF
        # 2 raw numeric inputs + 3 derived kinetic terms, then every descriptor column
        n_kinetic = 2 + 3
        n_descriptor = self.spange_df.shape[1] + self.drfp_df.shape[1] + self.acs_pca_df.shape[1]
        self.feats_dim = n_kinetic + n_descriptor

    def featurize(self, X):
        """Return the feature matrix for the rows of DataFrame X as a NumPy array."""
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        residence_time = numeric[:, 0:1]
        temperature = numeric[:, 1:2]

        inv_temp = 1000.0 / (temperature + 273.15)
        log_time = np.log(residence_time + 1e-6)  # small offset keeps log finite at time 0
        feature_blocks = [numeric, inv_temp, log_time, inv_temp * log_time]

        descriptor_tables = (self.spange_df, self.drfp_df, self.acs_pca_df)
        if self.mixed:
            share_b = X["SolventB%"].values.reshape(-1, 1)
            for table in descriptor_tables:
                desc_a = table.loc[X["SOLVENT A NAME"]].values
                desc_b = table.loc[X["SOLVENT B NAME"]].values
                feature_blocks.append(desc_a * (1 - share_b) + desc_b * share_b)
        else:
            for table in descriptor_tables:
                feature_blocks.append(table.loc[X["SOLVENT NAME"]].values)

        return np.hstack(feature_blocks)

print(f'Feature dimension: {Featurizer().feats_dim}')

Feature dimension: 145


In [5]:
# Yield normalization function (from Ens Model kernel)
def normalize_yields(predictions):
    """Clip predictions at zero and rescale rows whose total exceeds 1.

    Physical motivation: a yield cannot be negative, and the yields of one
    sample cannot add up to more than 100%.

    Args:
        predictions: 2-D array, one row per sample and one column per target.

    Returns:
        A new array of the same shape. Negative entries become 0. When there
        is more than one column, each row whose clipped sum is above 1 is
        divided by that sum; rows summing to at most 1 are left as clipped.
    """
    clipped = np.clip(predictions, 0.0, None)

    # A single column has nothing to be normalized against.
    if clipped.shape[1] <= 1:
        return clipped

    row_totals = clipped.sum(axis=1, keepdims=True)
    # Flooring the divisor at 1 leaves rows that already satisfy sum <= 1 untouched.
    return clipped / np.maximum(row_totals, 1.0)

# Test: worked example covering the three cases
#   row 0 - sum > 1, gets rescaled
#   row 1 - negative entry, clipped to 0 and then rescaled
#   row 2 - already valid, must come back unchanged
test_preds = np.array([[0.3, 0.4, 0.5], [-0.1, 0.6, 0.7], [0.2, 0.3, 0.4]])
test_normalized = normalize_yields(test_preds)
print(f'Before normalization:\n{test_preds}')
print(f'After normalization:\n{test_normalized}')

# Fail loudly on Restart & Run All if the constraints are ever violated,
# instead of relying on someone eyeballing the printed arrays.
assert (test_normalized >= 0).all(), "normalized yields must be non-negative"
assert (test_normalized.sum(axis=1) <= 1.0 + 1e-12).all(), "row sums must not exceed 1"
np.testing.assert_allclose(test_normalized[2], test_preds[2])

Before normalization:
[[ 0.3  0.4  0.5]
 [-0.1  0.6  0.7]
 [ 0.2  0.3  0.4]]
After normalization:
[[0.25       0.33333333 0.41666667]
 [0.         0.46153846 0.53846154]
 [0.2        0.3        0.4       ]]


In [6]:
# CatBoost + XGBoost Ensemble with Yield Normalization
class CatXGBEnsemble:
    """Weighted blend of per-target CatBoost and XGBoost regressors.

    One CatBoost and one XGBoost model is fitted for every target column.
    Predictions are combined as ``cat_weight * catboost + xgb_weight * xgboost``
    and, if requested, passed through ``normalize_yields``.

    Args:
        data: 'single' for the single-solvent task or 'full' for the
            full-data task. Selects both the featurizer mode and the blend
            weights. Any other value raises ValueError.
        use_yield_norm: when True, ``predict`` applies ``normalize_yields``
            to the blended output.

    Raises:
        ValueError: if ``data`` is not 'single' or 'full'.
    """

    # (CatBoost weight, XGBoost weight) per task, from the Ens Model kernel.
    _BLEND_WEIGHTS = {
        'single': (7.0 / 13.0, 6.0 / 13.0),  # 0.538 / 0.462
        'full': (1.0 / 3.0, 2.0 / 3.0),      # 0.333 / 0.667
    }

    def __init__(self, data='single', use_yield_norm=True):
        # Validate before doing anything else. Previously an unexpected value
        # such as "single_solvent" (the name load_data uses) silently combined
        # the single-solvent featurizer with the full-data blend weights.
        if data not in self._BLEND_WEIGHTS:
            raise ValueError(
                f"data must be one of {sorted(self._BLEND_WEIGHTS)}, got {data!r}"
            )

        self.data = data
        self.use_yield_norm = use_yield_norm
        self.featurizer = Featurizer(mixed=(data == 'full'))

        # Task-specific weights (from Ens Model kernel)
        self.cat_weight, self.xgb_weight = self._BLEND_WEIGHTS[data]

        self.cat_models = []
        self.xgb_models = []

    def train_model(self, X, Y):
        """Fit one CatBoost and one XGBoost regressor per column of Y.

        Args:
            X: DataFrame accepted by the featurizer.
            Y: DataFrame (or Series) of targets; a 1-D target is treated as a
                single column.

        Any previously fitted models are replaced only after every fit has
        succeeded, so a failure part-way through leaves the old state intact.
        """
        X_feat = self.featurizer.featurize(X)
        Y_np = Y.values
        if Y_np.ndim == 1:
            Y_np = Y_np.reshape(-1, 1)

        cat_models = []
        xgb_models = []

        # The number of targets follows the data instead of being fixed at 3.
        for i in range(Y_np.shape[1]):
            # CatBoost
            cat = CatBoostRegressor(
                iterations=500,
                learning_rate=0.05,
                depth=6,
                l2_leaf_reg=3.0,
                random_seed=42,
                verbose=False
            )
            cat.fit(X_feat, Y_np[:, i])
            cat_models.append(cat)

            # XGBoost
            xgb_model = xgb.XGBRegressor(
                n_estimators=400,
                learning_rate=0.05,
                max_depth=5,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0.1,
                reg_lambda=1.0,
                random_state=42,
                verbosity=0
            )
            xgb_model.fit(X_feat, Y_np[:, i])
            xgb_models.append(xgb_model)

        self.cat_models = cat_models
        self.xgb_models = xgb_models

    def predict(self, X):
        """Return blended predictions as a double-precision tensor, one column per target.

        Raises:
            RuntimeError: if called before ``train_model``.
        """
        if not self.cat_models or not self.xgb_models:
            raise RuntimeError("CatXGBEnsemble.predict called before train_model")

        X_feat = self.featurizer.featurize(X)

        # Get predictions from both models
        cat_preds = np.column_stack([m.predict(X_feat) for m in self.cat_models])
        xgb_preds = np.column_stack([m.predict(X_feat) for m in self.xgb_models])

        # Weighted ensemble
        out = self.cat_weight * cat_preds + self.xgb_weight * xgb_preds

        # Apply yield normalization if enabled
        if self.use_yield_norm:
            out = normalize_yields(out)

        return torch.tensor(out, dtype=torch.double)

print('CatXGBEnsemble defined')

CatXGBEnsemble defined


In [None]:
# Cross-validation comparison: with vs without yield normalization
print("="*60)
print("Cross-validation: Single Solvent Data")
print("="*60)

X_single, Y_single = load_data("single_solvent")
print(f"Single solvent data: X={X_single.shape}, Y={Y_single.shape}")

# One fold per held-out solvent; take the progress-bar length from the data
# rather than hard-coding it.
n_single_folds = X_single["SOLVENT NAME"].nunique()

# Yield normalization is pure post-processing of the blended prediction, and
# the underlying models use fixed seeds. Train each fold once, then score the
# raw and the normalized predictions from the same fitted model: this halves
# the training cost and isolates the effect of the normalization step.
all_mse_with_norm = []
all_mse_without_norm = []
for (train_X, train_Y), (test_X, test_Y) in tqdm.tqdm(
    generate_leave_one_out_splits(X_single, Y_single), total=n_single_folds
):
    model = CatXGBEnsemble(data='single', use_yield_norm=False)
    model.train_model(train_X, train_Y)
    raw_preds = model.predict(test_X).numpy()
    targets = test_Y.values
    all_mse_without_norm.append(np.mean((raw_preds - targets) ** 2))
    all_mse_with_norm.append(np.mean((normalize_yields(raw_preds) - targets) ** 2))

# Unweighted mean of the per-fold MSEs
mse_single_with_norm = np.mean(all_mse_with_norm)
print(f"\nSingle Solvent MSE WITH yield norm: {mse_single_with_norm:.6f}")

mse_single_without_norm = np.mean(all_mse_without_norm)
print(f"Single Solvent MSE WITHOUT yield norm: {mse_single_without_norm:.6f}")
print(f"Difference: {(mse_single_with_norm - mse_single_without_norm) / mse_single_without_norm * 100:.2f}%")

In [None]:
# Cross-validation for full data
print("="*60)
print("Cross-validation: Full Data")
print("="*60)

X_full, Y_full = load_data("full")
print(f"Full data: X={X_full.shape}, Y={Y_full.shape}")

# One fold per distinct (solvent A, solvent B) pair; take the progress-bar
# length from the data rather than hard-coding it.
n_full_folds = len(X_full[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates())

# Yield normalization is pure post-processing of the blended prediction, and
# the underlying models use fixed seeds. Train each fold once, then score the
# raw and the normalized predictions from the same fitted model: this halves
# the training cost and isolates the effect of the normalization step.
all_mse_full_with_norm = []
all_mse_full_without_norm = []
for (train_X, train_Y), (test_X, test_Y) in tqdm.tqdm(
    generate_leave_one_ramp_out_splits(X_full, Y_full), total=n_full_folds
):
    model = CatXGBEnsemble(data='full', use_yield_norm=False)
    model.train_model(train_X, train_Y)
    raw_preds = model.predict(test_X).numpy()
    targets = test_Y.values
    all_mse_full_without_norm.append(np.mean((raw_preds - targets) ** 2))
    all_mse_full_with_norm.append(np.mean((normalize_yields(raw_preds) - targets) ** 2))

# Unweighted mean of the per-fold MSEs
mse_full_with_norm = np.mean(all_mse_full_with_norm)
print(f"\nFull Data MSE WITH yield norm: {mse_full_with_norm:.6f}")

mse_full_without_norm = np.mean(all_mse_full_without_norm)
print(f"Full Data MSE WITHOUT yield norm: {mse_full_without_norm:.6f}")
print(f"Difference: {(mse_full_with_norm - mse_full_without_norm) / mse_full_without_norm * 100:.2f}%")

In [None]:
# Calculate overall MSE
# Each task's CV score is weighted by that task's number of rows.
N_single, N_full = len(X_single), len(X_full)
N_total = N_single + N_full

overall_mse_with_norm = (N_single * mse_single_with_norm + N_full * mse_full_with_norm) / N_total
overall_mse_without_norm = (N_single * mse_single_without_norm + N_full * mse_full_without_norm) / N_total

# Relative changes in percent; negative means the normalized variant is better.
pct_with_vs_without = (overall_mse_with_norm - overall_mse_without_norm) / overall_mse_without_norm * 100
pct_with_vs_best = (overall_mse_with_norm - 0.008092) / 0.008092 * 100

# Expected LB based on CV-LB relationship (linear fit: slope 4.31, intercept 0.0525)
expected_lb_with = 4.31 * overall_mse_with_norm + 0.0525
expected_lb_without = 4.31 * overall_mse_without_norm + 0.0525

separator = "=" * 60
print(separator)
print("SUMMARY")
print(separator)

print(f"\nWITH Yield Normalization:")
print(f"  Single Solvent MSE: {mse_single_with_norm:.6f}")
print(f"  Full Data MSE: {mse_full_with_norm:.6f}")
print(f"  Overall MSE: {overall_mse_with_norm:.6f}")

print(f"\nWITHOUT Yield Normalization:")
print(f"  Single Solvent MSE: {mse_single_without_norm:.6f}")
print(f"  Full Data MSE: {mse_full_without_norm:.6f}")
print(f"  Overall MSE: {overall_mse_without_norm:.6f}")

print(f"\nComparison:")
print(f"  With vs Without: {pct_with_vs_without:.2f}%")
print(f"  Best CV (exp_049): 0.008092")
print(f"  This vs Best: {pct_with_vs_best:.2f}%")

print(f"\nExpected LB (with norm): {expected_lb_with:.4f}")
print(f"Expected LB (without norm): {expected_lb_without:.4f}")
print(f"Target LB: 0.0347")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = CatXGBEnsemble(data='single', use_yield_norm=True)  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = CatXGBEnsemble(data='full', use_yield_norm=True)  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################