# Experiment 094: Exact ens-model Kernel Replication

**Goal**: Replicate the ens-model kernel exactly with:
1. Correlation filtering with threshold=0.90
2. Feature priority: spange > acs > drfps > frag > smiles
3. Dataset-specific ensemble weights: Single (CatBoost=7/13, XGB=6/13), Full (CatBoost=1/3, XGB=2/3)
4. Numeric features: T_x_RT, RT_log, T_inv, RT_scaled
5. CatBoost + XGBoost ensemble

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Global reproducibility / precision setup for the notebook:
# new floating-point tensors default to float64, and NumPy's global RNG is seeded.
torch.set_default_dtype(torch.float64)
np.random.seed(42)

print('Imports complete')

Imports complete


In [2]:
# Data loading functions
# Directory that holds the yield CSVs and the descriptor lookup tables.
DATA_PATH = '/home/data'

# Process-condition columns shared by both datasets.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Single-solvent inputs: process conditions plus one solvent name.
INPUT_LABELS_SINGLE_SOLVENT = [*INPUT_LABELS_NUMERIC, "SOLVENT NAME"]
# Mixed-solvent inputs: process conditions, both solvent names and the B share.
INPUT_LABELS_FULL_SOLVENT = [
    *INPUT_LABELS_NUMERIC,
    "SOLVENT A NAME",
    "SOLVENT B NAME",
    "SolventB%",
]

def load_data(name="full"):
    """Read one of the two yield datasets and split it into inputs and targets.

    Args:
        name: ``"full"`` selects the mixed-solvent CSV; any other value
            selects the single-solvent CSV.

    Returns:
        ``(X, Y)`` where ``X`` holds the input columns for the chosen dataset
        and ``Y`` holds the ``"Product 2"``, ``"Product 3"`` and ``"SM"``
        columns.
    """
    use_full = name == "full"
    if use_full:
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'

    frame = pd.read_csv(csv_path)
    inputs = frame[INPUT_LABELS_FULL_SOLVENT if use_full else INPUT_LABELS_SINGLE_SOLVENT]
    targets = frame[["Product 2", "Product 3", "SM"]]
    return inputs, targets

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out folds, one per distinct ``"SOLVENT NAME"``.

    Folds are produced in sorted solvent-name order. Each item is
    ``((train_X, train_Y), (test_X, test_Y))`` where the test part contains
    exactly the rows of the held-out solvent.
    """
    for held_out in sorted(X["SOLVENT NAME"].unique()):
        is_test = X["SOLVENT NAME"] == held_out
        train_part = (X[~is_test], Y[~is_test])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield folds that hold out one (solvent A, solvent B) pair at a time.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((train_X, train_Y), (test_X, test_Y))`` where the test part contains
    every row whose two solvent names match the held-out pair.
    """
    unique_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for solvent_a, solvent_b in unique_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        yield (X[~in_ramp], Y[~in_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Load the per-solvent descriptor lookup tables; the first CSV column becomes the index.
SPANGE_DF, DRFP_DF, ACS_PCA_DF = (
    pd.read_csv(f'{DATA_PATH}/{lookup_file}', index_col=0)
    for lookup_file in (
        'spange_descriptors_lookup.csv',
        'drfps_catechol_lookup.csv',
        'acs_pca_descriptors_lookup.csv',
    )
)

print(f'Spange: {SPANGE_DF.shape}, DRFP: {DRFP_DF.shape}, ACS PCA: {ACS_PCA_DF.shape}')

Spange: (26, 13), DRFP: (24, 2048), ACS PCA: (24, 5)


In [4]:
# Feature priority function (from ens-model kernel)
def feature_priority(name):
    """Rank a feature column by its family prefix (larger = keep in preference).

    Used to decide which column of a highly correlated pair survives.
    Names with none of the known prefixes rank lowest (0).
    """
    ranked_prefixes = (
        ("spange_", 5),
        ("acs_", 4),
        ("drfps_", 3),
        ("frag_", 2),
        ("smiles_", 1),
    )
    return next(
        (rank for prefix, rank in ranked_prefixes if name.startswith(prefix)),
        0,
    )

def filter_correlated_features(df, threshold=0.90):
    """Remove constant columns and one column of every highly correlated pair.

    Only numeric columns are inspected; non-numeric columns are always kept.
    Pairs are scanned in column order (row-major over the upper triangle of
    the absolute correlation matrix). A pair is skipped when either member
    has already been marked for removal. Otherwise the member with the lower
    ``feature_priority`` is dropped; on a tie the later column is dropped.

    Args:
        df: Feature table, one column per feature.
        threshold: A pair is treated as redundant when its absolute
            correlation is strictly greater than this value. Undefined (NaN)
            correlations are treated as 0.

    Returns:
        ``(df_filtered, dropped)`` where ``dropped`` lists the removed column
        names in the order they appear in ``df`` (deterministic across runs).
        When ``df`` has no numeric columns it is returned unchanged with an
        empty list.
    """
    numeric_df = df.select_dtypes(include=[np.number])
    if numeric_df.shape[1] == 0:
        return df, []

    # Zero-variance columns carry no signal and have undefined correlations.
    std = numeric_df.std(axis=0)
    constant_cols = std[std == 0].index.tolist()
    if constant_cols:
        numeric_df = numeric_df.drop(columns=constant_cols)

    corr = numeric_df.corr().abs()
    cols = corr.columns.tolist()

    # Read the strict upper triangle in one vectorised step instead of
    # O(n^2) scalar .iloc accesses; triu_indices yields row-major order,
    # which keeps the pair-processing order (and thus the result) unchanged.
    row_idx, col_idx = np.triu_indices(len(cols), k=1)
    pair_corr = corr.to_numpy()[row_idx, col_idx]
    pair_corr = np.where(np.isnan(pair_corr), 0.0, pair_corr)

    to_drop = set()
    for k in np.flatnonzero(pair_corr > threshold):
        col_i = cols[row_idx[k]]
        col_j = cols[col_idx[k]]
        if col_i in to_drop or col_j in to_drop:
            continue
        # col_i always precedes col_j, so "drop the later one" on a tie
        # means dropping col_j; col_i only goes when it ranks strictly lower.
        if feature_priority(col_i) < feature_priority(col_j):
            to_drop.add(col_i)
        else:
            to_drop.add(col_j)

    # Report dropped columns in frame order rather than arbitrary set order.
    drop_set = to_drop.union(constant_cols)
    all_to_drop = [col for col in df.columns if col in drop_set]
    df_filtered = df.drop(columns=all_to_drop, errors="ignore")

    return df_filtered, all_to_drop

print('Correlation filtering functions defined')

Correlation filtering functions defined


In [5]:
# Build solvent feature table with correlation filtering
def build_solvent_feature_table(threshold=0.90):
    """Assemble the per-solvent descriptor table and prune redundant columns.

    The Spange, ACS-PCA and DRFP lookup tables are each given a family prefix
    (``spange_``, ``acs_``, ``drfps_``), outer-merged on ``"SOLVENT NAME"``
    and then passed through ``filter_correlated_features``.

    Args:
        threshold: Correlation threshold forwarded to
            ``filter_correlated_features``.

    Returns:
        The filtered feature table indexed by ``"SOLVENT NAME"``.
    """

    def _as_prefixed_block(table, prefix):
        # Turn the index into a regular column (a column called "index" is
        # renamed to "SOLVENT NAME") and prefix every other column name.
        block = table.reset_index().rename(columns={"index": "SOLVENT NAME"})
        renames = {c: f"{prefix}{c}" for c in block.columns if c != "SOLVENT NAME"}
        return block.rename(columns=renames)

    # DRFP: keep only columns that are neither all-zero nor all-one.
    not_all_zero = (DRFP_DF != 0).any(axis=0)
    not_all_one = (DRFP_DF != 1).any(axis=0)
    informative_drfp = DRFP_DF.loc[:, not_all_zero & not_all_one]

    spange_block = _as_prefixed_block(SPANGE_DF, "spange_")
    acs_block = _as_prefixed_block(ACS_PCA_DF, "acs_")
    drfp_block = _as_prefixed_block(informative_drfp, "drfps_")

    # Outer merges keep solvents that are missing from some lookup tables.
    combined = (
        spange_block
        .merge(acs_block, on="SOLVENT NAME", how="outer")
        .merge(drfp_block, on="SOLVENT NAME", how="outer")
        .set_index("SOLVENT NAME")
    )

    combined_filtered, dropped = filter_correlated_features(combined, threshold=threshold)

    print(f"Original features: {combined.shape[1]}, After filtering: {combined_filtered.shape[1]}")
    print(f"Dropped {len(dropped)} features")

    return combined_filtered

# Build the table
# Shared descriptor table used by both featurizers (built once at cell run time).
SOLVENT_TABLE = build_solvent_feature_table(0.90)
print(f"Final solvent table shape: {SOLVENT_TABLE.shape}")

Original features: 140, After filtering: 56
Dropped 84 features
Final solvent table shape: (26, 56)


In [None]:
# Numeric feature engineering (from ens-model kernel)
def add_numeric_features(X_numeric, rt_mean=None):
    """Return a copy of ``X_numeric`` with four engineered columns appended.

    Added columns (``T`` is ``"Temperature"`` + 273.15, i.e. the temperature
    converted to Kelvin; ``rt`` is ``"Residence Time"``):

    * ``T_x_RT``   – interaction term ``T * rt``
    * ``RT_log``   – ``log(rt + 1e-6)`` (the offset guards against ``log(0)``)
    * ``T_inv``    – ``1 / T``
    * ``RT_scaled`` – ``rt / rt_mean``

    Args:
        X_numeric: Frame with ``"Residence Time"`` and ``"Temperature"``
            columns. It is not modified.
        rt_mean: Reference residence-time mean for ``RT_scaled``. When
            ``None`` (the default, matching the original kernel) the mean of
            the rows in ``X_numeric`` is used. NOTE: with the default, a
            held-out batch is scaled by its own mean rather than the training
            mean; pass the training-set mean here to keep train and test
            features on the same scale.

    Returns:
        A new DataFrame: the input columns followed by the four new ones.
    """
    X_num = X_numeric.copy()

    # Convert Temperature to Kelvin
    T = X_num["Temperature"] + 273.15
    rt = X_num["Residence Time"]

    if rt_mean is None:
        rt_mean = rt.mean()

    X_num["T_x_RT"] = T * rt
    X_num["RT_log"] = np.log(rt + 1e-6)
    X_num["T_inv"] = 1 / T
    X_num["RT_scaled"] = rt / rt_mean

    return X_num

print('Numeric feature engineering defined')

In [None]:
# Featurizer class (from ens-model kernel)
class PrecomputedFeaturizer:
    """Build scaled model inputs for single-solvent rows.

    Each row becomes the engineered numeric features followed by the
    descriptor row of its solvent from ``SOLVENT_TABLE``. The instance is
    stateful: the first ``featurize`` call fits the scaler on that batch,
    every later call only applies the already-fitted scaler.
    """

    def __init__(self):
        self.solvent_table = SOLVENT_TABLE
        self.scaler = StandardScaler()
        # Flipped to True once the scaler has been fitted by featurize().
        self.fitted = False

    def featurize(self, X):
        """Return a float64 tensor of standardized features for ``X``.

        ``X`` must provide the ``"Residence Time"``, ``"Temperature"`` and
        ``"SOLVENT NAME"`` columns.
        """
        process_feats = add_numeric_features(X[["Residence Time", "Temperature"]].copy())
        descriptor_feats = self.solvent_table.loc[X["SOLVENT NAME"]].values

        features = np.hstack([process_feats.values, descriptor_feats])

        # First call fits (expected to be the training batch); later calls reuse the fit.
        if self.fitted:
            features = self.scaler.transform(features)
        else:
            features = self.scaler.fit_transform(features)
            self.fitted = True

        return torch.tensor(features, dtype=torch.double)

class PrecomputedFeaturizerMixed:
    """Build scaled model inputs for two-solvent (mixed) rows.

    Solvent descriptors of A and B are blended linearly by ``"SolventB%"``
    and appended to the engineered numeric features. The instance is
    stateful: the first ``featurize`` call fits the scaler on that batch,
    every later call only applies the already-fitted scaler.
    """

    def __init__(self):
        self.solvent_table = SOLVENT_TABLE
        self.scaler = StandardScaler()
        # Flipped to True once the scaler has been fitted by featurize().
        self.fitted = False

    def featurize(self, X):
        """Return a float64 tensor of standardized features for ``X``.

        ``X`` must provide ``"Residence Time"``, ``"Temperature"``,
        ``"SOLVENT A NAME"``, ``"SOLVENT B NAME"`` and ``"SolventB%"``.
        """
        process_feats = add_numeric_features(X[["Residence Time", "Temperature"]].copy())

        descriptors_a = self.solvent_table.loc[X["SOLVENT A NAME"]].values
        descriptors_b = self.solvent_table.loc[X["SOLVENT B NAME"]].values

        # Share of solvent B as a column vector so it broadcasts across descriptors.
        # NOTE(review): the original notebook states this is already in [0, 1] — confirm against the data.
        share_b = X["SolventB%"].values.reshape(-1, 1)
        blended = descriptors_a * (1 - share_b) + descriptors_b * share_b

        features = np.hstack([process_feats.values, blended])

        # First call fits (expected to be the training batch); later calls reuse the fit.
        if self.fitted:
            features = self.scaler.transform(features)
        else:
            features = self.scaler.fit_transform(features)
            self.fitted = True

        return torch.tensor(features, dtype=torch.double)

print('Featurizers defined')

In [None]:
# CatBoost Model (from ens-model kernel)
class CatBoostModel:
    """One CatBoost regressor per target column, with dataset-specific settings.

    ``data='single'`` uses the single-solvent featurizer and hyper-parameters;
    any other value uses the mixed-solvent ones.
    """

    # Hyper-parameters for the single-solvent dataset.
    _SINGLE_PARAMS = {
        "iterations": 700,
        "learning_rate": 0.05,
        "depth": 6,
        "l2_leaf_reg": 3.0,
        "random_seed": 42,
        "verbose": False,
    }
    # Hyper-parameters for the full (mixed-solvent) dataset.
    _MIXED_PARAMS = {
        "iterations": 500,
        "learning_rate": 0.03,
        "depth": 5,
        "l2_leaf_reg": 5.0,
        "random_seed": 42,
        "verbose": False,
    }

    def __init__(self, data='single'):
        self.data_mode = data
        is_single = data == 'single'
        self.featurizer = PrecomputedFeaturizer() if is_single else PrecomputedFeaturizerMixed()
        # Copy so each instance owns its own parameter dict.
        self.cat_params = dict(self._SINGLE_PARAMS if is_single else self._MIXED_PARAMS)
        self.models = None

    def train_model(self, train_X, train_Y):
        """Fit one regressor per column of ``train_Y`` on the featurized ``train_X``."""
        features = self.featurizer.featurize(train_X).detach().cpu().numpy()
        targets = train_Y.values

        self.models = []
        for target_idx in range(targets.shape[1]):
            regressor = CatBoostRegressor(**self.cat_params)
            regressor.fit(features, targets[:, target_idx])
            self.models.append(regressor)

    def predict(self, X):
        """Return an ``[N, n_targets]`` float64 tensor of post-processed predictions.

        Predictions are clipped at zero, and rows whose sum exceeds 1 are
        rescaled to sum to 1; other rows are left as they are.
        """
        features = self.featurizer.featurize(X).detach().cpu().numpy()

        stacked = np.column_stack([model.predict(features) for model in self.models])
        non_negative = np.clip(stacked, 0.0, None)

        # Dividing by max(row_sum, 1) only shrinks rows that sum to more than 1.
        row_sums = non_negative.sum(axis=1, keepdims=True)
        normalized = non_negative / np.maximum(row_sums, 1.0)

        return torch.tensor(normalized, dtype=torch.double)

print('CatBoostModel defined')

In [None]:
# XGBoost Model (from ens-model kernel)
class XGBModel:
    """One XGBoost regressor per target column, with dataset-specific settings.

    ``data='single'`` uses the single-solvent featurizer and hyper-parameters;
    any other value uses the mixed-solvent ones.
    """

    # Hyper-parameters for the single-solvent dataset.
    _SINGLE_PARAMS = {
        "n_estimators": 600,
        "learning_rate": 0.05,
        "max_depth": 5,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "reg_alpha": 0.1,
        "reg_lambda": 1.0,
        "random_state": 42,
        "verbosity": 0,
    }
    # Hyper-parameters for the full (mixed-solvent) dataset.
    _MIXED_PARAMS = {
        "n_estimators": 400,
        "learning_rate": 0.03,
        "max_depth": 4,
        "subsample": 0.7,
        "colsample_bytree": 0.7,
        "reg_alpha": 0.2,
        "reg_lambda": 2.0,
        "random_state": 42,
        "verbosity": 0,
    }

    def __init__(self, data='single'):
        self.data_mode = data
        is_single = data == 'single'
        self.featurizer = PrecomputedFeaturizer() if is_single else PrecomputedFeaturizerMixed()
        # Copy so each instance owns its own parameter dict.
        self.xgb_params = dict(self._SINGLE_PARAMS if is_single else self._MIXED_PARAMS)
        self.models = None

    def train_model(self, train_X, train_Y):
        """Fit one regressor per column of ``train_Y`` on the featurized ``train_X``."""
        features = self.featurizer.featurize(train_X).detach().cpu().numpy()
        targets = train_Y.values

        self.models = []
        for target_idx in range(targets.shape[1]):
            regressor = xgb.XGBRegressor(**self.xgb_params)
            regressor.fit(features, targets[:, target_idx])
            self.models.append(regressor)

    def predict(self, X):
        """Return an ``[N, n_targets]`` float64 tensor of post-processed predictions.

        Predictions are clipped at zero, and rows whose sum exceeds 1 are
        rescaled to sum to 1; other rows are left as they are.
        """
        features = self.featurizer.featurize(X).detach().cpu().numpy()

        stacked = np.column_stack([model.predict(features) for model in self.models])
        non_negative = np.clip(stacked, 0.0, None)

        # Dividing by max(row_sum, 1) only shrinks rows that sum to more than 1.
        row_sums = non_negative.sum(axis=1, keepdims=True)
        normalized = non_negative / np.maximum(row_sums, 1.0)

        return torch.tensor(normalized, dtype=torch.double)

print('XGBModel defined')

In [None]:
# Ensemble Model (from ens-model kernel)
class EnsembleModel:
    """Fixed-weight blend of a ``CatBoostModel`` and an ``XGBModel``.

    Raw weights are 7:6 (CatBoost:XGB) for ``data='single'`` and 1:2 for any
    other value; they are normalized to sum to 1 at construction time.
    """

    def __init__(self, data='single'):
        self.data_mode = data

        # (CatBoost, XGBoost) raw weights per dataset, taken from the ens-model kernel.
        raw_cat, raw_xgb = (7.0, 6.0) if data == 'single' else (1.0, 2.0)

        total = raw_cat + raw_xgb
        self.cat_weight = raw_cat / total
        self.xgb_weight = raw_xgb / total

        self.cat_model = CatBoostModel(data=data)
        self.xgb_model = XGBModel(data=data)

    def train_model(self, train_X, train_Y):
        """Train both base models on the same data (CatBoost first)."""
        for base_model in (self.cat_model, self.xgb_model):
            base_model.train_model(train_X, train_Y)

    def predict(self, X):
        """Return the weighted sum of the two base models' predictions."""
        cat_pred = self.cat_model.predict(X)
        xgb_pred = self.xgb_model.predict(X)
        return self.cat_weight * cat_pred + self.xgb_weight * xgb_pred

print('EnsembleModel defined')
print(f'Single weights: CatBoost={7/13:.3f}, XGB={6/13:.3f}')
print(f'Full weights: CatBoost={1/3:.3f}, XGB={2/3:.3f}')

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
# Calculate CV score (for verification only - NOT part of submission)
# Reload the ground truth for both tasks.
X_single, Y_single = load_data("single_solvent")
X_full, Y_full = load_data("full")

# Stack held-out targets fold by fold, visiting solvents in the same sorted
# order as generate_leave_one_out_splits so rows line up with the predictions.
actuals_single = np.vstack([
    Y_single[X_single["SOLVENT NAME"] == solvent].values
    for solvent in sorted(X_single["SOLVENT NAME"].unique())
])

# Same idea for the mixed data: ramps in first-appearance order, as in
# generate_leave_one_ramp_out_splits.
ramps = X_full[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
actuals_full = np.vstack([
    Y_full[
        (X_full["SOLVENT A NAME"] == row["SOLVENT A NAME"])
        & (X_full["SOLVENT B NAME"] == row["SOLVENT B NAME"])
    ].values
    for _, row in ramps.iterrows()
])

# Predictions collected by the two submission cells.
target_cols = ['target_1', 'target_2', 'target_3']
preds_single = submission_single_solvent[target_cols].values
preds_full = submission_full_data[target_cols].values

# Per-task MSE, then a row-count-weighted average across both tasks.
mse_single = np.mean((actuals_single - preds_single) ** 2)
mse_full = np.mean((actuals_full - preds_full) ** 2)
n_single = len(actuals_single)
n_full = len(actuals_full)
overall_mse = (mse_single * n_single + mse_full * n_full) / (n_single + n_full)

print(f'\n=== CV SCORE VERIFICATION ===')
print(f'Single Solvent MSE: {mse_single:.6f} (n={n_single})')
print(f'Full Data MSE: {mse_full:.6f} (n={n_full})')
print(f'Overall MSE: {overall_mse:.6f}')
print(f'\nBest previous CV: 0.008092 (CatBoost+XGBoost)')
print(f'Best previous LB: 0.0877 (GP+MLP+LGBM)')
print(f'exp_030 baseline (GP+MLP+LGBM): CV 0.008298')
print(f'exp_090 (ens-model attempt): CV 0.010878')
print(f'\nThis (Exact ens-model replication): CV {overall_mse:.6f}')

# Reference scores to compare against (same numbers as printed above).
best_cv = 0.008092
exp030_cv = 0.008298

if overall_mse < best_cv:
    improvement = (best_cv - overall_mse) / best_cv * 100
    print(f'\n✓ IMPROVEMENT: {improvement:.2f}% better than best CV!')
elif overall_mse < exp030_cv:
    improvement = (exp030_cv - overall_mse) / exp030_cv * 100
    print(f'\n✓ IMPROVEMENT: {improvement:.2f}% better than exp_030!')
else:
    degradation = (overall_mse - exp030_cv) / exp030_cv * 100
    print(f'\n✗ WORSE: {degradation:.2f}% worse than exp_030')