# Experiment 090: ens-model Kernel Replication

**Rationale**: The ens-model kernel (matthewmaree) uses techniques we haven't fully implemented:
1. Correlation-based feature filtering (threshold=0.90)
2. Feature priority: Spange > ACS > DRFP > Fragprints
3. Different ensemble weights: single (7:6 CatBoost:XGBoost), full (1:2 CatBoost:XGBoost)
4. Clipping predictions to be non-negative, then renormalizing any row whose predictions sum to more than 1

**Hypothesis**: These techniques might change the relationship between the cross-validation (CV) score and the leaderboard (LB) score.

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
import tqdm
import warnings
# Silence every warning for the rest of the session. This keeps the fold-loop output
# readable, but it also hides deprecation and numerical warnings from all libraries.
warnings.filterwarnings('ignore')

# Fix the NumPy and torch global seeds so repeated runs start from the same RNG state.
np.random.seed(42)
torch.manual_seed(42)
# Make float64 the default torch floating dtype; the prediction tensors built in this
# notebook are created explicitly as torch.double as well.
torch.set_default_dtype(torch.double)

# Select CUDA when torch can see a GPU, otherwise fall back to the CPU.
# NOTE(review): `device` is only printed here; no other cell shown in this notebook uses it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading functions
# Directory that holds every CSV read by this notebook (yield tables and descriptor lookups).
DATA_PATH = '/home/data'

# Columns used as numeric process inputs by the featurizer.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Input columns selected by load_data() for the single-solvent yield table.
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
# Input columns selected by load_data() for the full (two-solvent) yield table.
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Read one yield table and split it into input and target frames.

    Args:
        name: ``"full"`` selects the full (two-solvent) table. Any other value
            selects the single-solvent table.

    Returns:
        ``(X, Y)``: ``X`` holds the input columns for the chosen table and
        ``Y`` holds the ``"Product 2"``, ``"Product 3"`` and ``"SM"`` columns.
    """
    wants_full = (name == "full")

    if wants_full:
        table = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        input_columns = INPUT_LABELS_FULL_SOLVENT
    else:
        table = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        input_columns = INPUT_LABELS_SINGLE_SOLVENT

    inputs = table[input_columns]
    targets = table[["Product 2", "Product 3", "SM"]]
    return inputs, targets

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of the ``"SOLVENT NAME"`` column.
    Each item is ``((train_X, train_Y), (test_X, test_Y))``, where the test
    part contains exactly the rows of the held-out solvent.
    """
    held_out_order = sorted(X["SOLVENT NAME"].unique())
    for held_out in held_out_order:
        is_train = X["SOLVENT NAME"] != held_out
        is_test = ~is_train
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((train_X, train_Y), (test_X, test_Y))``, where the test part contains
    exactly the rows whose solvent pair matches the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    distinct_pairs = X[pair_columns].drop_duplicates()
    for solvent_a, solvent_b in distinct_pairs.itertuples(index=False, name=None):
        same_a = X["SOLVENT A NAME"] == solvent_a
        same_b = X["SOLVENT B NAME"] == solvent_b
        in_ramp = same_a & same_b
        out_of_ramp = ~in_ramp
        yield (X[out_of_ramp], Y[out_of_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Load feature lookups
# Each table uses its first CSV column as the row index (index_col=0); the featurizer
# later selects rows from the combined table by solvent name with `.loc`.
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFP_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)
FRAGPRINTS_DF = pd.read_csv(f'{DATA_PATH}/fragprints_lookup.csv', index_col=0)

# Report (rows, columns) of every lookup as a quick sanity check.
print(f'Spange: {SPANGE_DF.shape}, DRFP: {DRFP_DF.shape}, ACS PCA: {ACS_PCA_DF.shape}, Fragprints: {FRAGPRINTS_DF.shape}')

Spange: (26, 13), DRFP: (24, 2048), ACS PCA: (24, 5), Fragprints: (24, 2133)


In [4]:
# Feature priority function (from ens-model kernel)
def feature_priority(name):
    """Rank a feature column by its name prefix; a larger value is kept in preference.

    Returns 5 for ``spange_``, 4 for ``acs_``, 3 for ``drfps_``, 2 for
    ``frag_`` and 0 for any other name.
    """
    ranked_prefixes = (
        ("spange_", 5),
        ("acs_", 4),
        ("drfps_", 3),
        ("frag_", 2),
    )
    for prefix, rank in ranked_prefixes:
        if name.startswith(prefix):
            return rank
    return 0

def filter_correlated_features(df, threshold=0.90):
    """Remove constant columns and one member of every highly correlated pair.

    Only numeric columns are examined; non-numeric columns are always kept.
    Column pairs are visited in column order (row-major over the upper
    triangle of the absolute Pearson correlation matrix). For each pair whose
    absolute correlation is strictly greater than ``threshold`` and whose
    members are both still retained, the column with the lower
    ``feature_priority`` is dropped. On equal priority the column that
    appears later in ``df`` is dropped.

    Args:
        df: Feature table (rows are samples, columns are features).
        threshold: Absolute-correlation cutoff.

    Returns:
        ``(df_filtered, dropped)``: ``df_filtered`` is ``df`` without the
        removed columns, and ``dropped`` lists the removed column names in
        the order they appear in ``df`` (deterministic across runs).
    """
    numeric_df = df.select_dtypes(include=[np.number])

    if numeric_df.shape[1] == 0:
        return df, []

    # Constant columns carry no signal and have an undefined correlation,
    # so they are removed before the correlation matrix is computed.
    std = numeric_df.std(axis=0)
    constant_cols = std[std == 0].index.tolist()
    if constant_cols:
        numeric_df = numeric_df.drop(columns=constant_cols)

    cols = numeric_df.columns.tolist()
    to_drop = set()

    if len(cols) > 1:
        abs_corr = numeric_df.corr().abs().to_numpy()

        # Upper-triangle index pairs (i < j) in row-major order. This is the
        # same visiting order as a nested ``for i ... for j > i`` loop.
        first_idx, second_idx = np.triu_indices(len(cols), k=1)
        pair_corr = abs_corr[first_idx, second_idx]
        # Undefined correlations are treated as 0, i.e. "not correlated".
        pair_corr = np.where(np.isnan(pair_corr), 0.0, pair_corr)
        hits = np.flatnonzero(pair_corr > threshold)

        for i, j in zip(first_idx[hits].tolist(), second_idx[hits].tolist()):
            col_i, col_j = cols[i], cols[j]
            # A pair is skipped once either member has already been removed.
            if col_i in to_drop or col_j in to_drop:
                continue

            # Lower priority loses. On a tie the later column (col_j, since
            # i < j follows the column order of ``df``) is dropped.
            if feature_priority(col_j) > feature_priority(col_i):
                to_drop.add(col_i)
            else:
                to_drop.add(col_j)

    removed = set(constant_cols) | to_drop
    # Report the dropped names in column order so the result is reproducible.
    all_to_drop = list(dict.fromkeys(c for c in df.columns if c in removed))
    df_filtered = df.drop(columns=all_to_drop, errors="ignore")

    return df_filtered, all_to_drop

print('Feature filtering functions defined')

Feature filtering functions defined


In [5]:
# Build combined solvent feature table with correlation filtering
def build_solvent_feature_table(threshold=0.90):
    """Assemble one per-solvent feature table from the four descriptor lookups.

    Columns are prefixed by source (``spange_``, ``acs_``, ``drfps_``,
    ``frag_``). The tables are outer-joined on their index and missing
    entries are filled with 0. The result is then passed through
    ``filter_correlated_features`` with the given ``threshold``.

    Returns:
        The filtered feature table.
    """

    def _drop_uniform_columns(table):
        # Remove columns that are entirely 0, then columns that are entirely 1.
        table = table.loc[:, (table != 0).any(axis=0)]
        return table.loc[:, (table != 1).any(axis=0)]

    # Spange and ACS PCA descriptors are used as-is, only renamed.
    spange = SPANGE_DF.rename(columns=lambda c: f'spange_{c}')
    acs = ACS_PCA_DF.rename(columns=lambda c: f'acs_{c}')

    # Fingerprint tables first lose their all-zero / all-one columns.
    drfp = _drop_uniform_columns(DRFP_DF).rename(columns=lambda c: f'drfps_{c}')
    frag = _drop_uniform_columns(FRAGPRINTS_DF).rename(columns=lambda c: f'frag_{c}')

    # Outer-join everything on the solvent index; absent descriptors become 0.
    combined = spange
    for block in (acs, drfp, frag):
        combined = combined.join(block, how='outer')
    combined = combined.fillna(0)

    print(f'Combined features before filtering: {combined.shape}')

    combined_filtered, dropped = filter_correlated_features(combined, threshold=threshold)

    print(f'Combined features after filtering: {combined_filtered.shape}')
    print(f'Dropped {len(dropped)} features')

    return combined_filtered

# Build the feature table
# SOLVENT_FEATURES is a module-level table; Featurizer.__init__ reads it, so this cell
# must run before any model is constructed.
SOLVENT_FEATURES = build_solvent_feature_table(threshold=0.90)
print(f'\nFinal solvent feature table: {SOLVENT_FEATURES.shape}')

Combined features before filtering: (26, 284)


Combined features after filtering: (26, 84)
Dropped 200 features

Final solvent feature table: (26, 84)


In [6]:
# Featurizer class
class Featurizer:
    """Turn raw experiment rows into a numeric feature matrix.

    The output columns are, in order: residence time, temperature in Kelvin,
    temperature x residence time, log residence time, inverse temperature,
    residence time divided by a reference mean, and then the solvent
    descriptor columns.

    The reference mean for ``RT_scaled`` is learned from the first batch that
    is featurized (normally the training data) and reused for later batches,
    so training rows and held-out rows are scaled by the same constant.

    Args:
        mixed: If True, rows describe a blend of two solvents
            (``"SOLVENT A NAME"``, ``"SOLVENT B NAME"``, ``"SolventB%"``) and
            descriptors are the percentage-weighted average of the two.
            Otherwise rows carry a single ``"SOLVENT NAME"``.
        solvent_features: Optional descriptor table indexed by solvent name.
            Defaults to the module-level ``SOLVENT_FEATURES``.
    """

    def __init__(self, mixed=False, solvent_features=None):
        self.mixed = mixed
        self.solvent_features = SOLVENT_FEATURES if solvent_features is None else solvent_features
        # Mean residence time of the fitting batch; None until first use.
        self.rt_reference_mean = None

    def featurize(self, X, fit=None):
        """Build the feature tensor for ``X``.

        Args:
            X: DataFrame with the numeric input columns and the solvent
                column(s) required by the chosen mode.
            fit: Controls the ``RT_scaled`` reference mean. ``True``
                recomputes it from ``X``; ``False`` reuses the stored value;
                ``None`` (default) computes it only if none is stored yet.

        Returns:
            A ``torch.double`` tensor with one row per row of ``X``.

        Raises:
            ValueError: If ``fit=False`` is requested before any reference
                mean has been stored.
        """
        # Numeric features with engineering
        X_num = X[INPUT_LABELS_NUMERIC].copy()
        X_num['Temperature'] = X_num['Temperature'] + 273.15  # Convert to Kelvin
        T = X_num['Temperature']
        rt = X_num['Residence Time']

        if fit is None:
            fit = self.rt_reference_mean is None
        if fit:
            self.rt_reference_mean = float(rt.mean())
        elif self.rt_reference_mean is None:
            raise ValueError("featurize(fit=False) called before a reference mean was fitted")

        X_num['T_x_RT'] = T * rt
        X_num['RT_log'] = np.log(rt + 1e-6)  # small offset keeps log finite at rt == 0
        X_num['T_inv'] = 1 / T
        # Scale by the stored reference, not by this batch's own mean, so the
        # feature means the same thing at fit time and at predict time.
        X_num['RT_scaled'] = rt / self.rt_reference_mean

        if self.mixed:
            # Descriptor rows for both solvents, blended by the fraction of B.
            A_feats = self.solvent_features.loc[X['SOLVENT A NAME']].values
            B_feats = self.solvent_features.loc[X['SOLVENT B NAME']].values
            pct = X['SolventB%'].values.reshape(-1, 1) / 100.0
            solvent_feats = A_feats * (1 - pct) + B_feats * pct
        else:
            solvent_feats = self.solvent_features.loc[X['SOLVENT NAME']].values

        # Combine numeric and solvent features
        features = np.hstack([X_num.values, solvent_feats])
        return torch.tensor(features, dtype=torch.double)

print('Featurizer defined')

Featurizer defined


In [7]:
# CatBoost Model
class CatBoostModel:
    """One CatBoost regressor per target column behind a train/predict interface.

    Predictions are clipped to be non-negative, and any row whose predictions
    sum to more than 1 is rescaled to sum to 1.
    """

    def __init__(self, data='single'):
        self.data_mode = data
        self.mixed = (data == 'full')
        self.featurizer = Featurizer(mixed=self.mixed)
        self.models = None

        # The 'single' and 'full' modes currently share one hyper-parameter set.
        self.params = {
            'iterations': 500,
            'learning_rate': 0.05,
            'depth': 6,
            'l2_leaf_reg': 3,
            'random_seed': 42,
            'verbose': False,
        }

    @staticmethod
    def _clip_and_renormalize(raw):
        # Negative outputs become 0; rows summing above 1 are divided by their
        # sum, while rows summing to 1 or less are left unchanged.
        nonneg = np.clip(raw, 0, None)
        row_totals = nonneg.sum(axis=1, keepdims=True)
        return nonneg / np.maximum(row_totals, 1.0)

    def train_model(self, train_X, train_Y):
        """Fit a separate regressor for every column of ``train_Y``."""
        features = self.featurizer.featurize(train_X).numpy()
        targets = train_Y.values

        self.models = []
        n_targets = targets.shape[1]
        for target_idx in range(n_targets):
            regressor = CatBoostRegressor(**self.params)
            regressor.fit(features, targets[:, target_idx])
            self.models.append(regressor)

    def predict(self, test_X):
        """Return a ``torch.double`` tensor with one column per fitted regressor."""
        features = self.featurizer.featurize(test_X).numpy()

        raw = np.zeros((len(test_X), len(self.models)))
        for column, regressor in enumerate(self.models):
            raw[:, column] = regressor.predict(features)

        return torch.tensor(self._clip_and_renormalize(raw), dtype=torch.double)

print('CatBoostModel defined')

CatBoostModel defined


In [8]:
# XGBoost Model
class XGBModel:
    """One XGBoost regressor per target column behind a train/predict interface.

    Predictions are clipped to be non-negative, and any row whose predictions
    sum to more than 1 is rescaled to sum to 1.
    """

    def __init__(self, data='single'):
        self.data_mode = data
        self.mixed = (data == 'full')
        self.featurizer = Featurizer(mixed=self.mixed)
        self.models = None

        # The 'single' and 'full' modes currently share one hyper-parameter set.
        self.params = {
            'n_estimators': 500,
            'learning_rate': 0.05,
            'max_depth': 6,
            'reg_lambda': 3,
            'random_state': 42,
            'verbosity': 0,
        }

    @staticmethod
    def _clip_and_renormalize(raw):
        # Negative outputs become 0; rows summing above 1 are divided by their
        # sum, while rows summing to 1 or less are left unchanged.
        nonneg = np.clip(raw, 0, None)
        row_totals = nonneg.sum(axis=1, keepdims=True)
        return nonneg / np.maximum(row_totals, 1.0)

    def train_model(self, train_X, train_Y):
        """Fit a separate regressor for every column of ``train_Y``."""
        features = self.featurizer.featurize(train_X).numpy()
        targets = train_Y.values

        self.models = []
        n_targets = targets.shape[1]
        for target_idx in range(n_targets):
            regressor = xgb.XGBRegressor(**self.params)
            regressor.fit(features, targets[:, target_idx])
            self.models.append(regressor)

    def predict(self, test_X):
        """Return a ``torch.double`` tensor with one column per fitted regressor."""
        features = self.featurizer.featurize(test_X).numpy()

        # The float64 buffer upcasts whatever dtype the regressors return.
        raw = np.zeros((len(test_X), len(self.models)))
        for column, regressor in enumerate(self.models):
            raw[:, column] = regressor.predict(features)

        return torch.tensor(self._clip_and_renormalize(raw), dtype=torch.double)

print('XGBModel defined')

XGBModel defined


In [9]:
# Ensemble Model with different weights for single vs full data
class EnsembleModel:
    """Fixed-weight blend of a CatBoostModel and an XGBModel.

    The CatBoost:XGBoost weight ratio is 7:6 when ``data == 'single'`` and
    1:2 for any other ``data`` value. The stored weights are normalised to
    sum to 1.
    """

    def __init__(self, data='single'):
        self.data_mode = data

        # Fixed per-dataset ratios (taken from the ens-model kernel).
        raw_cat, raw_xgb = (7.0, 6.0) if data == 'single' else (1.0, 2.0)

        total = raw_cat + raw_xgb
        self.cat_weight = raw_cat / total
        self.xgb_weight = raw_xgb / total

        self.cat_model = CatBoostModel(data=data)
        self.xgb_model = XGBModel(data=data)

    def train_model(self, train_X, train_Y):
        """Fit both base models on the same training data."""
        for member in (self.cat_model, self.xgb_model):
            member.train_model(train_X, train_Y)

    def predict(self, test_X):
        """Return the weighted sum of the two base models' predictions."""
        from_cat = self.cat_model.predict(test_X)
        from_xgb = self.xgb_model.predict(test_X)
        return self.cat_weight * from_cat + self.xgb_weight * from_xgb

print('EnsembleModel defined')
print(f'Single weights: CatBoost={7/13:.3f}, XGBoost={6/13:.3f}')
print(f'Full weights: CatBoost={1/3:.3f}, XGBoost={2/3:.3f}')

EnsembleModel defined
Single weights: CatBoost=0.538, XGBoost=0.462
Full weights: CatBoost=0.333, XGBoost=0.667


In [10]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:01,  1.59s/it]

2it [00:03,  1.51s/it]

3it [00:04,  1.54s/it]

4it [00:06,  1.51s/it]

5it [00:07,  1.52s/it]

6it [00:09,  1.52s/it]

7it [00:10,  1.53s/it]

8it [00:12,  1.51s/it]

9it [00:13,  1.50s/it]

10it [00:15,  1.49s/it]

11it [00:16,  1.47s/it]

12it [00:18,  1.47s/it]

13it [00:19,  1.49s/it]

14it [00:21,  1.50s/it]

15it [00:22,  1.48s/it]

16it [00:23,  1.48s/it]

17it [00:25,  1.52s/it]

18it [00:27,  1.51s/it]

19it [00:28,  1.55s/it]

20it [00:30,  1.56s/it]

21it [00:31,  1.55s/it]

22it [00:33,  1.52s/it]

23it [00:34,  1.50s/it]

24it [00:36,  1.49s/it]

24it [00:36,  1.51s/it]




In [11]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:02,  2.49s/it]

2it [00:04,  2.43s/it]

3it [00:07,  2.41s/it]

4it [00:09,  2.47s/it]

5it [00:12,  2.49s/it]

6it [00:14,  2.47s/it]

7it [00:17,  2.49s/it]

8it [00:19,  2.47s/it]

9it [00:22,  2.47s/it]

10it [00:24,  2.51s/it]

11it [00:27,  2.54s/it]

12it [00:29,  2.54s/it]

13it [00:32,  2.54s/it]

13it [00:32,  2.50s/it]




In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
# Calculate CV score (for verification only - NOT part of submission)
# NOTE(review): this cell sits after the cell marked as the required final cell;
# confirm whether it has to be removed before submitting.

# Best cross-validation MSE from earlier experiments; the baseline for the comparison below.
BEST_PREVIOUS_CV = 0.008092
TARGET_COLUMNS = ['target_1', 'target_2', 'target_3']


def _stack_held_out_targets(split_generator):
    """Return the test-fold targets of every split, stacked in generator order."""
    return np.vstack([test_Y.values for _, (_, test_Y) in split_generator])


X_single, Y_single = load_data("single_solvent")
X_full, Y_full = load_data("full")

# Reuse the generators that produced the prediction folds, so the actuals are
# in the same fold/row order as the predictions.
actuals_single = _stack_held_out_targets(generate_leave_one_out_splits(X_single, Y_single))
actuals_full = _stack_held_out_targets(generate_leave_one_ramp_out_splits(X_full, Y_full))

# Get predictions
preds_single = submission_single_solvent[TARGET_COLUMNS].values
preds_full = submission_full_data[TARGET_COLUMNS].values

# Fail loudly on a row-count mismatch instead of relying on broadcasting rules.
assert actuals_single.shape == preds_single.shape, (actuals_single.shape, preds_single.shape)
assert actuals_full.shape == preds_full.shape, (actuals_full.shape, preds_full.shape)

# Calculate MSE (per task, then a row-count-weighted average of the two)
mse_single = np.mean((actuals_single - preds_single) ** 2)
mse_full = np.mean((actuals_full - preds_full) ** 2)
n_single = len(actuals_single)
n_full = len(actuals_full)
overall_mse = (mse_single * n_single + mse_full * n_full) / (n_single + n_full)

print('\n=== CV SCORE VERIFICATION ===')
print(f'Single Solvent MSE: {mse_single:.6f} (n={n_single})')
print(f'Full Data MSE: {mse_full:.6f} (n={n_full})')
print(f'Overall MSE: {overall_mse:.6f}')
print(f'\nBest previous CV: {BEST_PREVIOUS_CV:.6f} (CatBoost+XGBoost)')
print('Best previous LB: 0.0877 (GP+MLP+LGBM)')
print(f'\nThis (ens-model replication): CV {overall_mse:.6f}')

if overall_mse < BEST_PREVIOUS_CV:
    improvement = (BEST_PREVIOUS_CV - overall_mse) / BEST_PREVIOUS_CV * 100
    print(f'\n✓ IMPROVEMENT: {improvement:.2f}% better than best CV!')
else:
    degradation = (overall_mse - BEST_PREVIOUS_CV) / BEST_PREVIOUS_CV * 100
    print(f'\n✗ WORSE: {degradation:.2f}% worse than best CV')