# Experiment 069: Exact Ens Model Approach

Implementing the exact "Ens Model" kernel approach:
1. CatBoost with MultiRMSE (multi-output)
2. XGBoost with separate models per target
3. Different CatBoost:XGBoost ensemble weights for single-solvent data (7:6) vs full (mixed-solvent) data (1:2)
4. Feature priority-based correlation filtering
5. Multi-target normalization (clip + renormalize if sum > 1)

In [1]:
import numpy as np
import pandas as pd
import torch
from catboost import CatBoostRegressor
import xgboost as xgb
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG. The boosted models defined later also receive their
# own explicit seeds (random_seed=42 / random_state=42) in their parameter dicts.
np.random.seed(42)
# Make float64 the default torch floating dtype, matching the explicit
# dtype=torch.double used wherever this notebook builds tensors.
torch.set_default_dtype(torch.double)

# Directory that holds the descriptor lookup CSVs and the yield CSVs read below.
DATA_PATH = '/home/data'
print('Libraries loaded')

Libraries loaded


In [2]:
# Load all feature sources
# Each lookup table is read with its first CSV column as the row index
# (index_col=0); build_solvent_table later selects rows from these frames by
# that index and keeps only the labels present in all four.
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFPS_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)
FRAGPRINTS_DF = pd.read_csv(f'{DATA_PATH}/fragprints_lookup.csv', index_col=0)

# Shape printout as a sanity check: (rows, descriptor columns) per source.
print(f'Spange: {SPANGE_DF.shape}')
print(f'DRFPS: {DRFPS_DF.shape}')
print(f'ACS PCA: {ACS_PCA_DF.shape}')
print(f'Fragprints: {FRAGPRINTS_DF.shape}')

Spange: (26, 13)
DRFPS: (24, 2048)
ACS PCA: (24, 5)
Fragprints: (24, 2133)


In [3]:
# Feature priority function (from Ens Model kernel)
def feature_priority(name):
    """Rank a feature column by the descriptor family its name prefix encodes.

    Higher numbers win when two correlated columns compete:
    ``spange_`` (5) > ``acs_`` (4) > ``drfps_`` (3) > ``frag_`` (2).
    Any other name gets the lowest rank, 0.
    """
    ranked_prefixes = (
        ('spange_', 5),
        ('acs_', 4),
        ('drfps_', 3),
        ('frag_', 2),
    )
    for prefix, rank in ranked_prefixes:
        if name.startswith(prefix):
            return rank
    return 0

def filter_correlated_features(df, threshold=0.8):
    """Drop constant columns and greedily prune highly correlated numeric columns.

    Only numeric columns are examined; non-numeric columns pass through
    untouched. Constant columns (standard deviation exactly 0) are always
    removed. The remaining columns are then scanned pairwise in column order:
    whenever two columns that are both still kept have an absolute Pearson
    correlation above ``threshold``, the one with the lower
    ``feature_priority`` is dropped; on equal priority the later column is
    dropped.

    Args:
        df: Feature frame (rows are samples, columns are features).
        threshold: Absolute-correlation cut-off above which a pair counts as
            redundant.

    Returns:
        ``(df_filtered, all_to_drop)`` where ``df_filtered`` is ``df`` without
        the removed columns and ``all_to_drop`` lists the removed column names
        in their original column order.
    """
    numeric_df = df.select_dtypes(include=[np.number])

    # Constant columns carry no information and have an undefined correlation.
    std = numeric_df.std(axis=0)
    constant_cols = std[std == 0].index.tolist()
    numeric_df = numeric_df.drop(columns=constant_cols, errors='ignore')

    cols = numeric_df.columns.tolist()
    dropped = np.zeros(len(cols), dtype=bool)

    # With fewer than two candidates there are no pairs to compare, but any
    # constant columns found above must still be removed and reported.
    if len(cols) > 1:
        # Work on a plain array: scalar DataFrame.iloc lookups in a double loop
        # are orders of magnitude slower than NumPy row scans.
        corr = numeric_df.corr().abs().fillna(0.0).to_numpy()
        priorities = [feature_priority(c) for c in cols]

        for i in range(len(cols)):
            if dropped[i]:
                continue
            # Later columns correlated with column i above the threshold.
            partners = np.flatnonzero(corr[i, i + 1:] > threshold) + i + 1
            for j in partners:
                if dropped[j]:
                    continue
                if priorities[j] > priorities[i]:
                    # Column i loses; none of its remaining pairs matter now.
                    dropped[i] = True
                    break
                # Column i has higher priority, or it is a tie: drop the later one.
                dropped[j] = True

    drop_set = set(constant_cols)
    drop_set.update(c for c, is_dropped in zip(cols, dropped) if is_dropped)
    # Report in column order so the result is reproducible between runs.
    all_to_drop = [c for c in dict.fromkeys(df.columns) if c in drop_set]
    df_filtered = df.drop(columns=all_to_drop, errors='ignore')

    print(f'Dropped {len(all_to_drop)} features (threshold={threshold})')
    return df_filtered, all_to_drop

print('Feature filtering functions defined')

Feature filtering functions defined


In [4]:
# Build combined feature table
def build_solvent_table(threshold=0.8):
    """Assemble one descriptor row per solvent from all four feature sources.

    Keeps only the solvents present in every lookup table, prefixes each
    source's columns (``spange_``, ``acs_``, ``drfps_``, ``frag_``) so that
    ``feature_priority`` can rank them, concatenates the sources column-wise
    and prunes constant / highly correlated columns.

    Args:
        threshold: Absolute-correlation cut-off forwarded to
            ``filter_correlated_features``.

    Returns:
        DataFrame indexed by solvent label holding the surviving features.
    """
    # Concatenation order matters: on equal priority the filter drops the later column.
    sources = (
        ('spange', SPANGE_DF),
        ('acs', ACS_PCA_DF),
        ('drfps', DRFPS_DF),
        ('frag', FRAGPRINTS_DF),
    )

    # Sort the intersection: a bare set has no stable order between interpreter
    # runs, which made the row order of the table unreproducible.
    shared = set(sources[0][1].index)
    for _, table in sources[1:]:
        shared &= set(table.index)
    common_solvents = sorted(shared)

    blocks = []
    for prefix, table in sources:
        block = table.loc[common_solvents].copy()
        block.columns = [f'{prefix}_{c}' for c in block.columns]
        blocks.append(block)

    # Combine
    combined = pd.concat(blocks, axis=1)
    print(f'Combined features: {combined.shape}')

    # Filter correlated features
    filtered, _ = filter_correlated_features(combined, threshold=threshold)
    print(f'After filtering: {filtered.shape}')

    return filtered

# Built once at module level; the featurizers created inside CatBoostModel and
# XGBModel are all constructed from this shared table.
SOLVENT_TABLE = build_solvent_table()

Combined features: (24, 4199)


Dropped 4130 features (threshold=0.8)
After filtering: (24, 69)


In [5]:
# Data loading functions
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]

def load_data(name="full"):
    """Read one of the two yield CSVs and split it into inputs and targets.

    ``name == "full"`` selects the full (two-solvent) file; any other value
    selects the single-solvent file.

    Returns:
        ``(X, Y)`` where ``X`` holds the input columns for the chosen dataset and
        ``Y`` holds the three target columns "Product 2", "Product 3" and "SM".
    """
    target_cols = ["Product 2", "Product 3", "SM"]

    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_cols = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%", "RAMP NUM"]
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_cols = ["Residence Time", "Temperature", "SOLVENT NAME"]

    df = pd.read_csv(csv_path)
    return df[input_cols], df[target_cols]

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out folds, one per distinct "SOLVENT NAME".

    Solvents are visited in sorted order. Each item is
    ``((train_X, train_Y), (test_X, test_Y))`` where the test part contains
    exactly the rows of the held-out solvent.
    """
    solvent_col = X["SOLVENT NAME"]
    for held_out in sorted(solvent_col.unique()):
        in_test = solvent_col == held_out
        in_train = ~in_test
        train_part = (X[in_train], Y[in_train])
        test_part = (X[in_test], Y[in_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield leave-one-ramp-out folds, one per distinct "RAMP NUM" value.

    Ramps are visited in sorted order. Each item is
    ``((train_X, train_Y), (test_X, test_Y))`` where the test part contains
    exactly the rows of the held-out ramp.
    """
    ramp_col = X["RAMP NUM"]
    for held_out in sorted(ramp_col.unique()):
        in_test = ramp_col == held_out
        in_train = ~in_test
        train_part = (X[in_train], Y[in_train])
        test_part = (X[in_test], Y[in_test])
        yield train_part, test_part

print('Data loading functions defined')

Data loading functions defined


In [6]:
# Featurizer for single solvent
class PrecomputedFeaturizer:
    """Featurizer for single-solvent rows.

    Each row becomes five process features followed by the descriptor row
    looked up in ``solvent_table`` for that row's "SOLVENT NAME".
    """

    def __init__(self, solvent_table):
        # Descriptor frame whose index labels are matched against "SOLVENT NAME".
        self.solvent_table = solvent_table

    def featurize(self, X):
        """Return a double-precision tensor of shape (len(X), 5 + n_descriptors).

        Process columns, in order: Temperature, Residence Time,
        1000 / (Temperature + 273.15), log(Residence Time + 1e-6) and
        Temperature * Residence Time.
        """
        temperature = X["Temperature"].values
        residence = X["Residence Time"].values

        # Kinetics-inspired transforms: the 273.15 shift treats Temperature as °C,
        # and the 1e-6 offset keeps the log finite at a zero residence time.
        process_block = np.column_stack([
            temperature,
            residence,
            1000.0 / (temperature + 273.15),
            np.log(residence + 1e-6),
            temperature * residence,
        ])

        # One descriptor row per sample, in sample order.
        descriptor_block = self.solvent_table.loc[X["SOLVENT NAME"]].values

        stacked = np.hstack([process_block, descriptor_block])
        return torch.tensor(stacked, dtype=torch.double)

# Featurizer for mixed solvents
class PrecomputedFeaturizerMixed:
    """Featurizer for two-solvent (mixture) rows.

    Each row becomes six process features followed by a linear blend of the
    descriptor rows of solvent A and solvent B, weighted by the B fraction.
    """

    def __init__(self, solvent_table):
        # Descriptor frame whose index labels are matched against the A/B solvent names.
        self.solvent_table = solvent_table

    def featurize(self, X):
        """Return a double-precision tensor of shape (len(X), 6 + n_descriptors).

        Process columns, in order: Temperature, Residence Time,
        1000 / (Temperature + 273.15), log(Residence Time + 1e-6),
        Temperature * Residence Time and the solvent-B fraction.
        """
        temperature = X["Temperature"].values
        residence = X["Residence Time"].values
        # NOTE(review): the /100 assumes "SolventB%" is stored on a 0-100 scale.
        # Confirm against the CSV; if it is already a 0-1 fraction, the blend
        # below would be almost entirely solvent A.
        frac_b = X["SolventB%"].values / 100.0

        # Kinetics-inspired transforms: the 273.15 shift treats Temperature as °C,
        # and the 1e-6 offset keeps the log finite at a zero residence time.
        process_block = np.column_stack([
            temperature,
            residence,
            1000.0 / (temperature + 273.15),
            np.log(residence + 1e-6),
            temperature * residence,
            frac_b,
        ])

        # Per-row mixing weights as column vectors so they broadcast over descriptors.
        weight_b = frac_b.reshape(-1, 1)
        weight_a = 1 - weight_b
        descriptors_a = self.solvent_table.loc[X["SOLVENT A NAME"]].values
        descriptors_b = self.solvent_table.loc[X["SOLVENT B NAME"]].values
        blended = descriptors_a * weight_a + descriptors_b * weight_b

        stacked = np.hstack([process_block, blended])
        return torch.tensor(stacked, dtype=torch.double)

print('Featurizers defined')

Featurizers defined


In [7]:
# Multi-target normalization (from Ens Model kernel)
def multi_target_normalize(out):
    """Make each prediction row non-negative with a total of at most 1.

    Negative entries are clipped to 0. Rows whose clipped total exceeds 1 are
    divided by that total; rows totalling 1 or less are returned unscaled.
    The input array is not modified.
    """
    non_negative = np.clip(out, 0.0, None)
    row_totals = non_negative.sum(axis=1, keepdims=True)
    # Flooring the divisor at 1 means only over-full rows are rescaled.
    return non_negative / np.maximum(row_totals, 1.0)

print('Multi-target normalization defined')

Multi-target normalization defined


In [8]:
# CatBoost Model (from Ens Model kernel)
class CatBoostModel:
    """Multi-output CatBoost regressor (MultiRMSE loss) behind the notebook's model interface.

    ``data='single'`` pairs the single-solvent featurizer with one set of
    hyper-parameters; any other value pairs the mixed-solvent featurizer with a
    slightly different set.
    """

    def __init__(self, data='single'):
        self.data_mode = data

        # Start from the single-solvent settings; the mixed regime overrides four of them.
        params = dict(
            random_seed=42,
            loss_function='MultiRMSE',
            depth=3,
            learning_rate=0.07,
            n_estimators=1050,
            l2_leaf_reg=3.5,
            bootstrap_type='Bayesian',
            bagging_temperature=0.225,
            grow_policy='SymmetricTree',
            rsm=0.75,
            verbose=False,
        )
        if data == 'single':
            featurizer_cls = PrecomputedFeaturizer
        else:
            featurizer_cls = PrecomputedFeaturizerMixed
            params.update(
                learning_rate=0.06,
                n_estimators=1100,
                l2_leaf_reg=2.5,
                bagging_temperature=0.25,
            )

        self.featurizer = featurizer_cls(SOLVENT_TABLE)
        self.cat_params = params

        # Created by train_model; predict requires it.
        self.model = None

    def train_model(self, train_X, train_Y):
        """Fit a fresh CatBoostRegressor on the featurized inputs and all target columns."""
        features = self.featurizer.featurize(train_X).numpy()
        targets = train_Y.values

        self.model = CatBoostRegressor(**self.cat_params)
        self.model.fit(features, targets)

    def predict(self, X):
        """Predict all targets for ``X`` and return them, normalized, as a double tensor."""
        features = self.featurizer.featurize(X).numpy()
        raw = self.model.predict(features)
        return torch.tensor(multi_target_normalize(raw), dtype=torch.double)

print('CatBoost model defined')

CatBoost model defined


In [9]:
# XGBoost Model (from Ens Model kernel)
class XGBModel:
    """One independent XGBoost regressor per target column behind the notebook's model interface.

    ``data='single'`` pairs the single-solvent featurizer with the
    ``hist`` / ``depthwise`` tree settings; any other value pairs the
    mixed-solvent featurizer with ``approx`` / ``lossguide``.
    """

    def __init__(self, data='single'):
        self.data_mode = data

        # Start from the single-solvent settings; the mixed regime only changes
        # how trees are built (tree_method and grow_policy).
        params = dict(
            random_state=42,
            objective='reg:squarederror',
            tree_method='hist',
            subsample=0.5,
            reg_lambda=0.6,
            reg_alpha=0.0,
            n_estimators=1000,
            min_child_weight=1,
            max_depth=4,
            max_delta_step=1,
            learning_rate=0.02,
            grow_policy='depthwise',
            gamma=0.0,
            colsample_bytree=0.3,
            colsample_bylevel=0.6,
        )
        if data == 'single':
            featurizer_cls = PrecomputedFeaturizer
        else:
            featurizer_cls = PrecomputedFeaturizerMixed
            params.update(tree_method='approx', grow_policy='lossguide')

        self.featurizer = featurizer_cls(SOLVENT_TABLE)
        self.xgb_params = params

        # Filled by train_model with one fitted regressor per target column.
        self.models = None

    def train_model(self, train_X, train_Y):
        """Fit a separate XGBRegressor for every column of ``train_Y``."""
        features = self.featurizer.featurize(train_X).numpy()
        targets = train_Y.values

        self.models = []
        for target_idx in range(targets.shape[1]):
            regressor = xgb.XGBRegressor(**self.xgb_params)
            regressor.fit(features, targets[:, target_idx])
            self.models.append(regressor)

    def predict(self, X):
        """Predict every target for ``X`` and return them, normalized, as a double tensor."""
        features = self.featurizer.featurize(X).numpy()
        per_target = [regressor.predict(features) for regressor in self.models]
        stacked = np.column_stack(per_target)
        return torch.tensor(multi_target_normalize(stacked), dtype=torch.double)

print('XGBoost model defined')

XGBoost model defined


In [10]:
# Ensemble Model (from Ens Model kernel)
class EnsembleModel:
    """Fixed-weight blend of a CatBoostModel and an XGBModel.

    The CatBoost:XGBoost ratio is 7:6 for ``data='single'`` and 1:2 for any
    other value; the ratios are normalized so the two weights sum to 1.
    """

    def __init__(self, data='single'):
        self.data_mode = data

        # Raw (CatBoost, XGBoost) ratio for the chosen regime.
        raw_cat, raw_xgb = (7.0, 6.0) if data == 'single' else (1.0, 2.0)
        ratio_total = raw_cat + raw_xgb
        self.cat_weight = raw_cat / ratio_total
        self.xgb_weight = raw_xgb / ratio_total

        self.cat_model = CatBoostModel(data=data)
        self.xgb_model = XGBModel(data=data)

    def train_model(self, train_X, train_Y):
        """Train both members on the same data, CatBoost first."""
        for member in (self.cat_model, self.xgb_model):
            member.train_model(train_X, train_Y)

    def predict(self, X):
        """Return the weighted sum of the two members' predictions."""
        cat_part = self.cat_weight * self.cat_model.predict(X)
        xgb_part = self.xgb_weight * self.xgb_model.predict(X)
        return cat_part + xgb_part

print(f'Ensemble model defined:')
print(f'  Single: CatBoost {7/13:.3f}, XGBoost {6/13:.3f}')
print(f'  Full: CatBoost {1/3:.3f}, XGBoost {2/3:.3f}')

Ensemble model defined:
  Single: CatBoost 0.538, XGBoost 0.462
  Full: CatBoost 0.333, XGBoost 0.667


In [11]:
# Run Single Solvent CV
# Leave-one-solvent-out: every fold trains a fresh ensemble on all other
# solvents and predicts the held-out one; predictions are pooled before scoring.
print('Running Single Solvent CV...')
print('='*60)

X, Y = load_data("single_solvent")
split_generator = generate_leave_one_out_splits(X, Y)

all_preds_single = []
all_actuals_single = []

# The generator yields one fold per distinct solvent, so take the progress-bar
# total from the data instead of hard-coding it.
for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=len(X["SOLVENT NAME"].unique())):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='single')
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)
    all_preds_single.append(predictions.numpy())
    all_actuals_single.append(test_Y.values)

all_preds_single = np.vstack(all_preds_single)
all_actuals_single = np.vstack(all_actuals_single)
# Mean squared error over every (row, target) entry of the pooled predictions.
mse_single = np.mean((all_preds_single - all_actuals_single) ** 2)
print(f'\nSingle Solvent MSE: {mse_single:.6f} (n={len(all_preds_single)})')

Running Single Solvent CV...


  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [00:01<00:30,  1.31s/it]

  8%|▊         | 2/24 [00:02<00:25,  1.14s/it]

 12%|█▎        | 3/24 [00:03<00:22,  1.08s/it]

 17%|█▋        | 4/24 [00:04<00:21,  1.06s/it]

 21%|██        | 5/24 [00:05<00:19,  1.05s/it]

 25%|██▌       | 6/24 [00:06<00:18,  1.04s/it]

 29%|██▉       | 7/24 [00:07<00:17,  1.04s/it]

 33%|███▎      | 8/24 [00:08<00:16,  1.03s/it]

 38%|███▊      | 9/24 [00:09<00:15,  1.03s/it]

 42%|████▏     | 10/24 [00:10<00:14,  1.03s/it]

 46%|████▌     | 11/24 [00:11<00:13,  1.03s/it]

 50%|█████     | 12/24 [00:12<00:12,  1.03s/it]

 54%|█████▍    | 13/24 [00:13<00:11,  1.03s/it]

 58%|█████▊    | 14/24 [00:14<00:10,  1.03s/it]

 62%|██████▎   | 15/24 [00:15<00:09,  1.04s/it]

 67%|██████▋   | 16/24 [00:16<00:08,  1.04s/it]

 71%|███████   | 17/24 [00:17<00:07,  1.05s/it]

 75%|███████▌  | 18/24 [00:18<00:06,  1.04s/it]

 79%|███████▉  | 19/24 [00:19<00:05,  1.04s/it]

 83%|████████▎ | 20/24 [00:20<00:04,  1.05s/it]

 88%|████████▊ | 21/24 [00:21<00:03,  1.04s/it]

 92%|█████████▏| 22/24 [00:23<00:02,  1.04s/it]

 96%|█████████▌| 23/24 [00:24<00:01,  1.03s/it]

100%|██████████| 24/24 [00:25<00:00,  1.03s/it]

100%|██████████| 24/24 [00:25<00:00,  1.04s/it]


Single Solvent MSE: 0.009175 (n=656)





In [None]:
# Run Full Data CV (one fold per held-out ramp)
# Leave-one-ramp-out: every fold trains a fresh ensemble on all other ramps and
# predicts the held-out one; predictions are pooled before scoring.
X_full, Y_full = load_data("full")
split_generator = generate_leave_one_ramp_out_splits(X_full, Y_full)

# Count the folds from the data so the banner and the progress bar always match
# what the split generator yields, rather than relying on a hard-coded 87.
n_ramps = len(X_full["RAMP NUM"].unique())

print(f'\nRunning Full Data CV ({n_ramps} ramps)...')
print('='*60)

all_preds_full = []
all_actuals_full = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=n_ramps):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='full')
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)
    all_preds_full.append(predictions.numpy())
    all_actuals_full.append(test_Y.values)

all_preds_full = np.vstack(all_preds_full)
all_actuals_full = np.vstack(all_actuals_full)
# Mean squared error over every (row, target) entry of the pooled predictions.
mse_full = np.mean((all_preds_full - all_actuals_full) ** 2)
print(f'\nFull Data MSE: {mse_full:.6f} (n={len(all_preds_full)})')

In [None]:
# Calculate combined CV score
# Reference score this experiment is compared against (labelled exp_068 in the
# printout). Named once so the printed baseline and the improvement figure
# cannot drift apart.
BASELINE_CV_MSE = 0.007938

n_single = len(all_preds_single)
n_full = len(all_preds_full)

# Row-count-weighted mean of the two per-task MSEs.
weighted_cv = (mse_single * n_single + mse_full * n_full) / (n_single + n_full)

print('\n' + '='*60)
print('FINAL RESULTS')
print('='*60)
print(f'Single Solvent MSE: {mse_single:.6f} (n={n_single})')
print(f'Full Data MSE: {mse_full:.6f} (n={n_full})')
print(f'Weighted Combined MSE: {weighted_cv:.6f}')
print(f'\nBest baseline CV (exp_068): {BASELINE_CV_MSE:.6f}')
# Positive means this experiment scored lower (better) than the baseline.
print(f'Improvement: {(BASELINE_CV_MSE - weighted_cv) / BASELINE_CV_MSE * 100:.2f}%')