# Experiment 061: Exact ens-model Kernel Replication

This notebook exactly replicates the matthewmaree ens-model kernel:
- ALL feature sources (spange, acs_pca, drfps, fragprints, smiles)
- Correlation filtering with threshold=0.80, priority-based
- CatBoost with MultiRMSE loss
- XGBoost per-target regressors
- Ensemble weights (CatBoost:XGBoost): Single (7:6), Full (1:2)
- Multi-target normalization

In [None]:
import numpy as np
import pandas as pd
import torch
import warnings
warnings.filterwarnings('ignore')

torch.set_default_dtype(torch.double)

# Define constants and functions locally (adapted from utils.py for local paths)
# Directory prefix for every CSV read below. It is concatenated directly with
# the file name, so the trailing slash is required.
DATA_PATH = '/home/data/'

# Input columns selected by load_data("full").
INPUT_LABELS_FULL_SOLVENT = [
    "Residence Time",
    "Temperature",
    "SOLVENT A NAME",
    "SOLVENT B NAME",
    "SolventB%",
]

# Input columns selected by load_data("single_solvent").
INPUT_LABELS_SINGLE_SOLVENT = [
    "Residence Time",
    "Temperature",
    "SOLVENT NAME",
]

# Numeric process columns that the featurizers hand to add_numeric_features.
INPUT_LABELS_NUMERIC = [
    "Residence Time",
    "Temperature",
]

# Target columns returned as Y by load_data; the order fixes the column order
# of every prediction array.
TARGET_LABELS = [
    "Product 2",
    "Product 3",
    "SM",
]

def load_data(name="full"):
    """Read one of the catechol yield tables and split it into inputs/targets.

    Args:
        name: ``"full"`` for the mixed-solvent table or ``"single_solvent"``
            for the single-solvent table. Any other value fails the assert.

    Returns:
        ``(X, Y)`` where ``X`` holds the input columns for the chosen table and
        ``Y`` holds the ``TARGET_LABELS`` columns.
    """
    assert name in ["full", "single_solvent"]

    use_full_table = name == "full"
    if use_full_table:
        table = pd.read_csv(f'{DATA_PATH}catechol_full_data_yields.csv')
        input_labels = INPUT_LABELS_FULL_SOLVENT
    else:
        table = pd.read_csv(f'{DATA_PATH}catechol_single_solvent_yields.csv')
        input_labels = INPUT_LABELS_SINGLE_SOLVENT

    inputs = table[input_labels]
    targets = table[TARGET_LABELS]
    return inputs, targets

def load_features(name="spange_descriptors"):
    """Read the ``<name>_lookup.csv`` descriptor table from ``DATA_PATH``.

    The first CSV column becomes the index of the returned frame. ``name`` must
    be one of the known lookup tables, otherwise the assert fails.
    """
    known_tables = [
        "spange_descriptors",
        "acs_pca_descriptors",
        "drfps_catechol",
        "fragprints",
        "smiles",
    ]
    assert name in known_tables
    return pd.read_csv(f'{DATA_PATH}{name}_lookup.csv', index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of ``X["SOLVENT NAME"]``. Each item is
    ``((X_train, Y_train), (X_test, Y_test))``, where the test part contains
    exactly the rows of the held-out solvent.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        keep_for_training = solvent_names != held_out
        held_out_rows = ~keep_for_training
        training_part = (X[keep_for_training], Y[keep_for_training])
        testing_part = (X[held_out_rows], Y[held_out_rows])
        yield training_part, testing_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited sorted by solvent A name, then solvent B name. A row is
    kept for training only when BOTH its solvent A and its solvent B differ
    from the held-out pair's values; every other row (including rows that
    share just one of the two solvents with the pair) goes to the test part.

    Each item is ``((X_train, Y_train), (X_test, Y_test))``.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    row_pairs = X[pair_columns]
    distinct_ramps = row_pairs.drop_duplicates().sort_values(by=pair_columns)

    for _, held_out_pair in distinct_ramps.iterrows():
        # Column-wise comparison against the held-out pair, then require a
        # mismatch in every column.
        in_train = (row_pairs != held_out_pair).all(axis=1)
        in_test = ~in_train
        yield (X[in_train], Y[in_train]), (X[in_test], Y[in_test])

print('Imports and data loading functions defined')

In [None]:
# ============== FEATURE ENGINEERING (EXACT COPY FROM ENS-MODEL) ==============

_SOLVENT_TABLE_CACHE = None

def feature_priority(name: str) -> int:
    """Return the keep-priority of a feature column from its source prefix.

    Higher numbers win when two correlated features compete: ``spange_`` (5),
    ``acs_`` (4), ``drfps_`` (3), ``frag_`` (2), ``smiles_`` (1). Names with
    none of these prefixes get 0. Matching is case-sensitive.
    """
    ranked_prefixes = (
        ("spange_", 5),
        ("acs_", 4),
        ("drfps_", 3),
        ("frag_", 2),
        ("smiles_", 1),
    )
    matches = (rank for prefix, rank in ranked_prefixes if name.startswith(prefix))
    return next(matches, 0)


def filter_correlated_features(df: pd.DataFrame, threshold: float = 0.8):
    """Drop constant columns and one column of every highly correlated pair.

    Only numeric columns are examined; non-numeric columns are always kept.
    Pairs are processed in column order (first column ascending, then second
    column ascending). A pair is skipped when either member has already been
    marked for removal. Otherwise the member with the lower
    ``feature_priority`` is dropped; on equal priority the later column is
    dropped.

    Args:
        df: Feature table to filter.
        threshold: A pair counts as correlated when the absolute Pearson
            correlation is strictly greater than this value.

    Returns:
        ``(df_filtered, dropped)`` where ``dropped`` lists the removed column
        names in the order they appear in ``df`` (constant columns included).
        When ``df`` has no numeric columns, ``df`` itself and an empty list are
        returned.
    """
    numeric_df = df.select_dtypes(include=[np.number])

    if numeric_df.shape[1] == 0:
        return df, []

    # Constant columns have undefined correlation; remove them up front.
    std = numeric_df.std(axis=0)
    constant_cols = std[std == 0].index.tolist()
    if constant_cols:
        numeric_df = numeric_df.drop(columns=constant_cols)

    cols = numeric_df.columns.tolist()

    # Undefined correlations (NaN) are treated as "not correlated".
    abs_corr = numeric_df.corr().abs().fillna(0.0).to_numpy()

    # Strict upper triangle -> each unordered pair once, with i < j.
    # np.nonzero reports hits in row-major order, i.e. the same order a nested
    # "for i ... for j > i" loop would visit them.
    first_idx, second_idx = np.nonzero(np.triu(abs_corr > threshold, k=1))

    priorities = [feature_priority(col) for col in cols]
    to_drop = set()

    for i, j in zip(first_idx.tolist(), second_idx.tolist()):
        col_i, col_j = cols[i], cols[j]
        if col_i in to_drop or col_j in to_drop:
            continue

        # col_i always precedes col_j, so a priority tie drops col_j.
        to_drop.add(col_i if priorities[j] > priorities[i] else col_j)

    # Report removed columns in a stable, reproducible order.
    removed = to_drop.union(constant_cols)
    all_to_drop = [col for col in df.columns if col in removed]
    df_filtered = df.drop(columns=all_to_drop, errors="ignore")

    return df_filtered, all_to_drop


def add_numeric_features(X_numeric: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X_numeric`` with derived process features appended.

    When both ``"Temperature"`` and ``"Residence Time"`` are present:

    * ``Temperature`` is shifted by +273.15,
    * ``T_x_RT``   = shifted temperature * residence time,
    * ``RT_log``   = log(residence time + 1e-6),
    * ``T_inv``    = 1 / shifted temperature,
    * ``RT_scaled`` = residence time / mean residence time of *this* frame
      (so the value depends on which rows are passed in).

    If either column is missing the copy is returned unchanged. The input
    frame is never modified.
    """
    enriched = X_numeric.copy()

    required = {"Temperature", "Residence Time"}
    if not required.issubset(enriched.columns):
        return enriched

    shifted_temp = enriched["Temperature"] + 273.15
    res_time = enriched["Residence Time"]

    enriched["Temperature"] = shifted_temp
    enriched["T_x_RT"] = shifted_temp * res_time
    enriched["RT_log"] = np.log(res_time + 1e-6)
    enriched["T_inv"] = 1 / shifted_temp
    enriched["RT_scaled"] = res_time / res_time.mean()

    return enriched


def build_solvent_feature_table(threshold: float = 0.80):
    """Build the combined, correlation-filtered solvent descriptor table.

    Every lookup table (spange, acs_pca, drfps, fragprints, smiles) is loaded
    with ``load_features``, its feature columns are prefixed with a short
    source tag (``spange_``, ``acs_``, ``drfps_``, ``frag_``, ``smiles_``),
    the tables are outer-joined on ``"SOLVENT NAME"`` and the result is passed
    through ``filter_correlated_features``.

    For the two binary fingerprint sources, columns that are all zero, all
    one, or whose values sum to exactly 1 are removed before merging.

    Results are memoised per ``threshold`` in the module-level
    ``_SOLVENT_TABLE_CACHE``, so asking for a different threshold rebuilds the
    table instead of silently returning one filtered with another threshold.

    Args:
        threshold: Absolute-correlation cut-off forwarded to
            ``filter_correlated_features``.

    Returns:
        DataFrame with a ``"SOLVENT NAME"`` column plus the surviving feature
        columns. The same object is returned on every cache hit, so callers
        should treat it as read-only.
    """
    global _SOLVENT_TABLE_CACHE

    # The cache starts out as None; turn it into a {threshold: table} mapping
    # on first use.
    if not isinstance(_SOLVENT_TABLE_CACHE, dict):
        _SOLVENT_TABLE_CACHE = {}
    if threshold in _SOLVENT_TABLE_CACHE:
        return _SOLVENT_TABLE_CACHE[threshold]

    from functools import reduce

    print(">>> Building solvent feature table...")

    # Lookup table name -> column prefix (the prefixes are what
    # feature_priority keys on).
    source_prefixes = {
        "spange_descriptors": "spange",
        "acs_pca_descriptors": "acs",
        "drfps_catechol": "drfps",
        "fragprints": "frag",
        "smiles": "smiles",
    }
    binary_sources = ("drfps_catechol", "fragprints")

    dfs = []

    for src, prefix in source_prefixes.items():
        df_src = load_features(src).copy()

        if "SOLVENT NAME" not in df_src.columns:
            # The solvent name lives in the index; name it explicitly so the
            # merge key exists whatever the CSV's first header was called.
            df_src = df_src.rename_axis("SOLVENT NAME").reset_index()

        # Bit-table filtering for binary fingerprints
        if src in binary_sources:
            # Drop all-zero and all-one columns
            df_src = df_src.loc[:, (df_src != 0).any(axis=0)]
            df_src = df_src.loc[:, (df_src != 1).any(axis=0)]

            # Drop bits that are set for exactly one solvent
            values = df_src.drop(columns=["SOLVENT NAME"])
            count = values.sum(axis=0)
            drop_cols = count[count == 1].index
            df_src = df_src.drop(columns=drop_cols)

        cols_to_rename = [c for c in df_src.columns if c != "SOLVENT NAME"]
        df_src = df_src.rename(columns={c: f"{prefix}_{c}" for c in cols_to_rename})

        dfs.append(df_src)

    # Merge all feature sources
    merged = reduce(
        lambda left, right: pd.merge(left, right, on="SOLVENT NAME", how="outer"),
        dfs
    )

    print(f"Merged shape before filtering: {merged.shape}")

    # Apply correlation filtering
    merged_filtered, dropped = filter_correlated_features(merged, threshold=threshold)

    print(f"Merged shape after filtering: {merged_filtered.shape}")
    print(f"Dropped {len(dropped)} features")

    _SOLVENT_TABLE_CACHE[threshold] = merged_filtered
    return merged_filtered

print('Feature engineering functions defined')

In [None]:
# ============== FEATURIZERS (EXACT COPY FROM ENS-MODEL) ==============

from abc import ABC, abstractmethod

class BaseModel(ABC):
    """Common interface for the models in this notebook.

    Subclasses override ``train_model`` and ``predict``; the base versions
    only raise ``NotImplementedError``. The methods are not marked abstract,
    so ``BaseModel`` itself can still be instantiated.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs and targets (must be overridden)."""
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for ``X`` (must be overridden).

        ``X`` is accepted here so the base signature matches the subclasses,
        which all take the input frame; it defaults to ``None`` so existing
        zero-argument calls keep working.
        """
        raise NotImplementedError


class PrecomputedFeaturizer:
    """Turns single-solvent rows into a dense feature tensor.

    The feature vector is the engineered numeric process features followed by
    the precomputed descriptor columns of the row's solvent.
    """

    def __init__(self):
        table = build_solvent_feature_table(threshold=0.80)
        self.solvent_table = table
        # Every table column except the join key is a feature.
        self.feature_cols = [col for col in table.columns if col != "SOLVENT NAME"]

    def featurize(self, X: pd.DataFrame) -> torch.Tensor:
        """Return a double tensor with one feature row per row of ``X``.

        The solvent is read from ``"Solvent"`` when that column exists,
        otherwise from ``"SOLVENT NAME"``. Missing values after the left join
        (e.g. a solvent absent from the table) are filled with 0.0.
        """
        solvent_col = "SOLVENT NAME"
        if "Solvent" in X.columns:
            solvent_col = "Solvent"

        joined = X.merge(
            self.solvent_table,
            left_on=solvent_col,
            right_on="SOLVENT NAME",
            how="left",
        )

        process_part = add_numeric_features(joined[INPUT_LABELS_NUMERIC].copy())
        descriptor_part = joined[self.feature_cols].copy()

        features = pd.concat([process_part, descriptor_part], axis=1).fillna(0.0)
        return torch.tensor(features.values, dtype=torch.double)


class PrecomputedFeaturizerMixed:
    """Turns mixed-solvent (full data) rows into a dense feature tensor.

    The feature vector is: engineered numeric process features, descriptors of
    the first solvent, descriptors of the second solvent, then the solvent
    ratio column.
    """

    def __init__(self):
        table = build_solvent_feature_table(threshold=0.80)
        self.solvent_table = table
        # Every table column except the join key is a feature.
        self.feature_cols = [col for col in table.columns if col != "SOLVENT NAME"]

    def featurize(self, X: pd.DataFrame) -> torch.Tensor:
        """Return a double tensor with one feature row per row of ``X``.

        Column names ``"Solvent 1"``, ``"Solvent 2"`` and ``"Solvent Ratio"``
        are used when present; otherwise ``"SOLVENT A NAME"``,
        ``"SOLVENT B NAME"`` and ``"SolventB%"``. Missing values after the
        joins are filled with 0.0.
        """
        available = X.columns
        solvent1_col = "Solvent 1" if "Solvent 1" in available else "SOLVENT A NAME"
        solvent2_col = "Solvent 2" if "Solvent 2" in available else "SOLVENT B NAME"
        ratio_col = "Solvent Ratio" if "Solvent Ratio" in available else "SolventB%"

        # Join the descriptor table twice: once per solvent. Columns that
        # clash on the second join get the "_s2" suffix on the right side.
        joined = X
        for key_col, right_suffix in ((solvent1_col, "_s1"), (solvent2_col, "_s2")):
            joined = joined.merge(
                self.solvent_table,
                left_on=key_col,
                right_on="SOLVENT NAME",
                how="left",
                suffixes=("", right_suffix),
            )

        process_part = add_numeric_features(joined[INPUT_LABELS_NUMERIC].copy())

        first_cols = self.feature_cols
        second_cols = [f"{col}_s2" for col in self.feature_cols]

        first_part = joined[first_cols].copy()
        if all(col in joined.columns for col in second_cols):
            second_part = joined[second_cols].copy()
        else:
            second_part = pd.DataFrame()

        ratio_values = joined[ratio_col].values.reshape(-1, 1)
        ratio_part = pd.DataFrame(ratio_values, columns=["Solvent_Ratio"])

        pieces = [process_part, first_part]
        if not second_part.empty:
            pieces.append(second_part)
        pieces.append(ratio_part)

        features = pd.concat(pieces, axis=1).fillna(0.0)
        return torch.tensor(features.values, dtype=torch.double)

print('Featurizers defined')

In [None]:
# ============== CATBOOST MODEL (EXACT COPY FROM ENS-MODEL) ==============

from catboost import CatBoostRegressor

class CatBoostModel(BaseModel):
    """Single CatBoost regressor trained on all targets with MultiRMSE loss.

    ``data="single"`` selects the single-solvent featurizer and its
    hyper-parameters; any other value selects the mixed-solvent ones. The
    ``verbose`` argument is stored, but the CatBoost parameters always use
    ``verbose=False``.
    """

    def __init__(self, data: str = "single", random_state: int = 42, verbose: bool = False):
        self.data_mode = data
        self.verbose = verbose
        self.random_state = random_state

        if data == "single":
            featurizer_cls = PrecomputedFeaturizer
            learning_rate, n_estimators, l2_leaf_reg = 0.07, 1050, 3.5
        else:
            featurizer_cls = PrecomputedFeaturizerMixed
            learning_rate, n_estimators, l2_leaf_reg = 0.06, 1100, 2.5

        self.smiles_featurizer = featurizer_cls()
        self.cat_params = {
            "random_state": random_state,
            "loss_function": "MultiRMSE",
            "depth": 3,
            "learning_rate": learning_rate,
            "n_estimators": n_estimators,
            "l2_leaf_reg": l2_leaf_reg,
            "verbose": False,
        }

        self.model = None

    def _features_as_numpy(self, X):
        """Featurize ``X`` and return the result as a NumPy array."""
        return self.smiles_featurizer.featurize(X).detach().cpu().numpy()

    def train_model(self, train_X, train_Y, device=None, verbose: bool = False):
        """Fit a fresh CatBoostRegressor; ``device`` and ``verbose`` are unused."""
        features = self._features_as_numpy(train_X)
        targets = train_Y.values

        self.model = CatBoostRegressor(**self.cat_params)
        self.model.fit(features, targets)

    def predict(self, X):
        """Predict, clip at zero and rescale rows whose targets sum above 1.

        Raises:
            RuntimeError: if ``train_model`` has not been called yet.
        """
        if self.model is None:
            raise RuntimeError("CatBoostModel not trained")

        raw = self.model.predict(self._features_as_numpy(X))
        clipped = np.clip(raw, a_min=0.0, a_max=None)

        # Multi-target normalization: rows summing to more than 1 are divided
        # by their sum; rows summing to 1 or less are left as they are.
        if clipped.ndim == 2 and clipped.shape[1] > 1:
            row_totals = clipped.sum(axis=1, keepdims=True)
            clipped = clipped / np.maximum(row_totals, 1.0)

        return torch.tensor(clipped, dtype=torch.double)

print('CatBoostModel defined')

In [None]:
# ============== XGBOOST MODEL (EXACT COPY FROM ENS-MODEL) ==============

from xgboost import XGBRegressor

class XGBModel(BaseModel):
    """One independent XGBoost regressor per target column.

    ``data="single"`` selects the single-solvent featurizer; any other value
    selects the mixed-solvent one. Both modes use the same XGBoost
    hyper-parameters. The ``verbose`` argument is stored but not forwarded.
    """

    def __init__(self, data: str = "single", random_state: int = 42, verbose: bool = False):
        self.data_mode = data
        self.verbose = verbose
        self.random_state = random_state

        featurizer_cls = PrecomputedFeaturizer if data == "single" else PrecomputedFeaturizerMixed
        self.smiles_featurizer = featurizer_cls()
        self.xgb_params = {
            "random_state": random_state,
            "n_estimators": 1000,
            "max_depth": 4,
            "learning_rate": 0.02,
            "subsample": 0.5,
            "colsample_bytree": 0.8,
            "verbosity": 0,
        }

        self.models = None
        self.n_targets = None

    def _features_as_numpy(self, X):
        """Featurize ``X`` and return the result as a NumPy array."""
        return self.smiles_featurizer.featurize(X).detach().cpu().numpy()

    def train_model(self, train_X, train_Y, device=None, verbose: bool = False):
        """Fit one regressor per target; ``device`` and ``verbose`` are unused."""
        features = self._features_as_numpy(train_X)
        targets = train_Y.values
        self.n_targets = targets.shape[1]

        self.models = []
        for target_idx in range(self.n_targets):
            regressor = XGBRegressor(**self.xgb_params)
            regressor.fit(features, targets[:, target_idx])
            self.models.append(regressor)

    def predict(self, X):
        """Predict every target, clip at zero and rescale rows summing above 1.

        Raises:
            RuntimeError: if ``train_model`` has not been called yet.
        """
        if self.models is None:
            raise RuntimeError("XGBModel not trained")

        features = self._features_as_numpy(X)
        stacked = np.column_stack([regressor.predict(features) for regressor in self.models])
        clipped = np.clip(stacked, a_min=0.0, a_max=None)

        # Multi-target normalization: rows summing to more than 1 are divided
        # by their sum; rows summing to 1 or less are left as they are.
        if clipped.shape[1] > 1:
            row_totals = clipped.sum(axis=1, keepdims=True)
            clipped = clipped / np.maximum(row_totals, 1.0)

        return torch.tensor(clipped, dtype=torch.double)

print('XGBModel defined')

In [None]:
# ============== ENSEMBLE MODEL (EXACT COPY FROM ENS-MODEL) ==============

class EnsembleModel(BaseModel):
    """Fixed-weight blend of ``CatBoostModel`` and ``XGBModel`` predictions.

    Raw CatBoost:XGBoost weights are 7:6 for ``data="single"`` and 1:2 for any
    other value; they are normalised to sum to 1 before use.
    """

    def __init__(self, data: str = "single", verbose: bool = False):
        self.data_mode = data
        self.verbose = verbose

        # Optimised fixed weights per dataset (CatBoost, XGBoost)
        raw_cat, raw_xgb = (7.0, 6.0) if data == "single" else (1.0, 2.0)

        weight_total = raw_cat + raw_xgb
        self.cat_weight = raw_cat / weight_total
        self.xgb_weight = raw_xgb / weight_total

        # Base learners share the same data mode as the ensemble.
        self.cat_model = CatBoostModel(data=data)
        self.xgb_model = XGBModel(data=data)

    def train_model(self, train_X, train_Y, device=None, verbose: bool = False):
        """Train both base models on the same data; extra arguments are unused."""
        for base_model in (self.cat_model, self.xgb_model):
            base_model.train_model(train_X, train_Y)

    def predict(self, X):
        """Return the weighted sum of the two base models' predictions."""
        cat_part = self.cat_model.predict(X)
        xgb_part = self.xgb_model.predict(X)
        return self.cat_weight * cat_part + self.xgb_weight * xgb_part

print('EnsembleModel defined')

In [None]:
# Build the feature table once up front: build_solvent_feature_table caches
# its result, so the featurizers created inside the CV loops below reuse it
# instead of reloading and refiltering the lookup tables for every fold.
_ = build_solvent_feature_table(threshold=0.80)

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel() # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
# Calculate CV score
import os
os.makedirs('/home/submission', exist_ok=True)
submission.to_csv('/home/submission/submission.csv', index=True)

# Load ground truth and calculate MSE
from sklearn.metrics import mean_squared_error


def _cv_mse(fold_predictions, split_gen):
    """Return the MSE between stored fold predictions and the held-out targets.

    ``split_gen`` is replayed in the same order that produced
    ``fold_predictions``; fold ``k`` of the generator is compared against the
    rows of ``fold_predictions`` whose ``'fold'`` value is ``k``.
    """
    target_cols = ['target_1', 'target_2', 'target_3']
    truths, preds = [], []
    for fold_idx, (_, (_, test_Y)) in enumerate(split_gen):
        in_fold = fold_predictions[fold_predictions['fold'] == fold_idx]
        truths.append(test_Y.values)
        preds.append(in_fold[target_cols].values)
    return mean_squared_error(np.vstack(truths), np.vstack(preds))


# Calculate CV for single solvent
X_single, Y_single = load_data("single_solvent")
mse_single = _cv_mse(
    submission_single_solvent,
    generate_leave_one_out_splits(X_single, Y_single),
)
print(f'Single Solvent CV MSE: {mse_single:.6f}')

# Calculate CV for full data
X_full, Y_full = load_data("full")
mse_full = _cv_mse(
    submission_full_data,
    generate_leave_one_ramp_out_splits(X_full, Y_full),
)
print(f'Full Data CV MSE: {mse_full:.6f}')

print(f'\n=== FINAL CV SCORES ===')
print(f'Single Solvent MSE: {mse_single:.6f}')
print(f'Full Data MSE: {mse_full:.6f}')
print(f'\nSubmission saved to /home/submission/submission.csv')