# Experiment 009: Diverse Ensemble

**Key Insight from Loop 8**: More regularization made LB WORSE (0.0956 → 0.0991). This means:
1. The problem is NOT traditional overfitting
2. We need BETTER features and model diversity
3. Top kernels use MLP + XGBoost + RF + LightGBM ensemble

**Implementation**:
- Combine PerTarget (HGB+ETR) + RandomForest + XGBoost + LightGBM
- Weighted averaging with weights [0.4, 0.2, 0.2, 0.2]
- Combined features: Spange + ACS_PCA + Arrhenius kinetics
- NO TTA

In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
import lightgbm as lgb
from abc import ABC
import tqdm
import warnings
warnings.filterwarnings('ignore')

DATA_PATH = '/home/data'
torch.set_default_dtype(torch.double)
print("Setup complete")

Setup complete


In [2]:
# --- UTILITY FUNCTIONS ---
# Names of the numeric process-condition columns (not referenced elsewhere in
# the visible code).
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Target columns that load_data selects into Y, in this order.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Load one of the catechol yield datasets as an (X, Y) pair.

    Args:
        name: "full" for the solvent-mixture dataset, or "single_solvent"
            for the single-solvent dataset.

    Returns:
        Tuple (X, Y) of DataFrames: X holds the input columns of the chosen
        dataset, Y holds the TARGET_LABELS columns.

    Raises:
        ValueError: if ``name`` is not one of the two supported dataset names.
    """
    # Explicit raise instead of `assert`: assertions are stripped under
    # `python -O`, which would let an invalid name silently fall through to
    # the single-solvent branch.
    if name == "full":
        csv_file = 'catechol_full_data_yields.csv'
        input_columns = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
    elif name == "single_solvent":
        csv_file = 'catechol_single_solvent_yields.csv'
        input_columns = ["Residence Time", "Temperature", "SOLVENT NAME"]
    else:
        raise ValueError(
            f"Unknown dataset name {name!r}; expected 'full' or 'single_solvent'"
        )

    df = pd.read_csv(f'{DATA_PATH}/{csv_file}')
    X = df[input_columns]
    Y = df[TARGET_LABELS]
    return X, Y

def load_features(name="spange_descriptors"):
    """Read the ``<name>_lookup.csv`` table from DATA_PATH.

    The first CSV column is used as the DataFrame index.
    """
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out splits, one per unique 'SOLVENT NAME'.

    Solvents are visited in sorted order. Each item is
    ``((train_X, train_Y), (test_X, test_Y))`` where the test part contains
    exactly the rows of the held-out solvent.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        in_test = solvent_names == held_out
        in_train = ~in_test
        train_part = (X[in_train], Y[in_train])
        test_part = (X[in_test], Y[in_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield leave-one-ramp-out splits over unique (solvent A, solvent B) pairs.

    Pairs are visited in order of first appearance in X. Each item is
    ``((train_X, train_Y), (test_X, test_Y))`` where the test part contains
    exactly the rows whose solvent pair matches the held-out pair.
    """
    solvent_a = X["SOLVENT A NAME"]
    solvent_b = X["SOLVENT B NAME"]
    unique_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for name_a, name_b in unique_pairs.itertuples(index=False, name=None):
        in_test = (solvent_a == name_a) & (solvent_b == name_b)
        in_train = ~in_test
        yield (X[in_train], Y[in_train]), (X[in_test], Y[in_test])

# Load the two solvent-descriptor lookup tables used for featurization
SPANGE_DF, ACS_PCA_DF = map(load_features, ('spange_descriptors', 'acs_pca_descriptors'))
print(f"Spange: {SPANGE_DF.shape}, ACS_PCA: {ACS_PCA_DF.shape}")

Spange: (26, 13), ACS_PCA: (24, 5)


In [3]:
# --- BASE CLASSES ---
class SmilesFeaturizer(ABC):
    """Featurizer interface stub: both methods raise until overridden."""

    def __init__(self):
        raise NotImplementedError

    def featurize(self, X):
        raise NotImplementedError

class BaseModel(ABC):
    """Minimal model interface: ``train_model`` then ``predict``.

    Subclasses are expected to override both methods; the base versions
    raise NotImplementedError.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs and targets."""
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for ``X``.

        ``X`` is accepted so the base signature matches subclasses that are
        called as ``model.predict(test_X)``; it defaults to None so a
        zero-argument call still reaches NotImplementedError as before.
        """
        raise NotImplementedError

In [4]:
# --- DIVERSE ENSEMBLE MODEL ---
class DiverseEnsemble(BaseModel):
    """Weighted blend of four tree-based regressor families.

    Members, in the order used by ``self.weights``:
    - PerTarget: HistGradientBoosting for 'SM', ExtraTrees for the products
    - RandomForest
    - XGBoost
    - LightGBM

    Every member is fitted on the same standardized feature matrix built by
    ``_build_features``. Predictions are blended with weights
    [0.4, 0.2, 0.2, 0.2] and clipped to [0, 1].

    Args:
        data: 'full' selects the solvent-mixture feature layout (reads
            'SOLVENT A NAME', 'SOLVENT B NAME' and 'SolventB%'); any other
            value selects the single-solvent layout (reads 'SOLVENT NAME').
    """

    def __init__(self, data='single'):
        super().__init__()
        self.data_type = data
        self.mixed = (data == 'full')
        self.targets = ['Product 2', 'Product 3', 'SM']

        # Solvent descriptor lookup tables (module-level DataFrames)
        self.spange = SPANGE_DF
        self.acs_pca = ACS_PCA_DF

        # Fitted in train_model and reused unchanged in predict
        self.scaler = StandardScaler()

        # Ensemble members, populated by train_model
        self.per_target_models = {}  # HGB for SM, ETR for Products
        self.rf_model = None
        self.xgb_model = None
        self.lgb_model = None

        # Blend weights, in order: PerTarget, RF, XGB, LGB
        self.weights = [0.4, 0.2, 0.2, 0.2]

        # Set once train_model has completed; checked by predict
        self._is_fitted = False

    def _build_features(self, X):
        """Build the unscaled feature matrix for the rows of ``X``.

        Column layout: residence time, temperature, 1000/T_K, log(time),
        their product; then 'SolventB%' (mixture layout only); then the
        Spange and ACS_PCA descriptors. For mixtures the descriptors of
        solvents A and B are blended linearly by 'SolventB%'.

        Returns:
            2-D float array with one row per row of ``X``.
        """
        rt = X['Residence Time'].values.astype(np.float64).reshape(-1, 1)
        temp = X['Temperature'].values.astype(np.float64).reshape(-1, 1)

        # Arrhenius-style kinetic features
        temp_k = temp + 273.15  # assumes Temperature is in Celsius — TODO confirm
        inv_temp = 1000.0 / temp_k
        log_time = np.log(rt + 1e-6)  # small offset keeps log finite at rt == 0
        interaction = inv_temp * log_time

        process_feats = np.hstack([rt, temp, inv_temp, log_time, interaction])

        if not self.mixed:
            solvent = X['SOLVENT NAME']
            spange_feats = self.spange.loc[solvent].values
            acs_feats = self.acs_pca.loc[solvent].values
            return np.hstack([process_feats, spange_feats, acs_feats])

        # Cast like the other numeric inputs so the blend below is always
        # floating-point arithmetic.
        # NOTE(review): the blend treats SolventB% as a fraction in [0, 1],
        # not a 0-100 percentage — confirm against the dataset.
        pct = X['SolventB%'].values.astype(np.float64).reshape(-1, 1)
        solvent_a = X['SOLVENT A NAME']
        solvent_b = X['SOLVENT B NAME']

        # Spange features (weighted mix)
        spange_feats = (
            self.spange.loc[solvent_a].values * (1 - pct)
            + self.spange.loc[solvent_b].values * pct
        )

        # ACS_PCA features (weighted mix)
        acs_feats = (
            self.acs_pca.loc[solvent_a].values * (1 - pct)
            + self.acs_pca.loc[solvent_b].values * pct
        )

        return np.hstack([process_feats, pct, spange_feats, acs_feats])

    def train_model(self, X_train, y_train):
        """Fit the scaler and all four ensemble members.

        Args:
            X_train: DataFrame with the input columns of the chosen layout.
            y_train: DataFrame whose columns are the targets in the order
                of ``self.targets``.

        Raises:
            ValueError: if ``y_train`` does not have one column per target.
        """
        X_feat = self._build_features(X_train)
        X_scaled = self.scaler.fit_transform(X_feat)
        y = y_train.values

        # Per-target models index y by position, so a wrong column count
        # would otherwise fail late or train on the wrong column.
        if y.ndim != 2 or y.shape[1] != len(self.targets):
            raise ValueError(
                f"y_train must have {len(self.targets)} columns "
                f"({self.targets}), got shape {y.shape}"
            )

        # 1. Train per-target models (HGB for SM, ETR for Products)
        for i, target in enumerate(self.targets):
            if target == 'SM':
                model = HistGradientBoostingRegressor(
                    max_depth=7, max_iter=700, learning_rate=0.04, random_state=42
                )
            else:
                model = ExtraTreesRegressor(
                    n_estimators=500, max_depth=10, min_samples_leaf=2,
                    random_state=42, n_jobs=-1
                )
            model.fit(X_scaled, y[:, i])
            self.per_target_models[target] = model

        # 2. Train RandomForest
        self.rf_model = MultiOutputRegressor(
            RandomForestRegressor(
                n_estimators=200, max_depth=10, min_samples_leaf=2,
                random_state=42, n_jobs=-1
            )
        )
        self.rf_model.fit(X_scaled, y)

        # 3. Train XGBoost
        self.xgb_model = MultiOutputRegressor(
            xgb.XGBRegressor(
                n_estimators=300, max_depth=6, learning_rate=0.05,
                subsample=0.8, colsample_bytree=0.8,
                random_state=42, verbosity=0
            )
        )
        self.xgb_model.fit(X_scaled, y)

        # 4. Train LightGBM
        # NOTE(review): per the LightGBM parameter docs, row subsampling is
        # only active when subsample_freq (bagging_freq) is non-zero, so
        # subsample=0.8 is presumably a no-op here — confirm. Left unchanged
        # so CV scores stay comparable with earlier runs.
        self.lgb_model = MultiOutputRegressor(
            lgb.LGBMRegressor(
                n_estimators=300, max_depth=6, learning_rate=0.05,
                subsample=0.8, colsample_bytree=0.8,
                random_state=42, verbosity=-1
            )
        )
        self.lgb_model.fit(X_scaled, y)

        self._is_fitted = True

    def predict(self, X):
        """Predict all targets for the rows of ``X``.

        Returns:
            torch.double tensor with one row per input row and one column
            per entry of ``self.targets``, clipped to [0, 1].

        Raises:
            RuntimeError: if called before ``train_model``.
        """
        if not self._is_fitted:
            raise RuntimeError(
                "DiverseEnsemble.predict() called before train_model()"
            )

        X_feat = self._build_features(X)
        X_scaled = self.scaler.transform(X_feat)

        # 1. Per-target predictions, stacked column-wise in target order
        per_target_preds = np.hstack([
            self.per_target_models[target].predict(X_scaled).reshape(-1, 1)
            for target in self.targets
        ])

        # 2-4. Multi-output members
        rf_preds = self.rf_model.predict(X_scaled)
        xgb_preds = self.xgb_model.predict(X_scaled)
        lgb_preds = self.lgb_model.predict(X_scaled)

        # Weighted ensemble
        final_preds = (
            self.weights[0] * per_target_preds +
            self.weights[1] * rf_preds +
            self.weights[2] * xgb_preds +
            self.weights[3] * lgb_preds
        )

        final_preds = np.clip(final_preds, 0, 1)
        return torch.tensor(final_preds, dtype=torch.double)

In [5]:
# --- QUICK VALIDATION TEST ---
# Smoke test only: scores the ensemble on the first few CV folds of each task.
print("Testing DiverseEnsemble...")
X_single, Y_single = load_data("single_solvent")

# Single-solvent task: first 5 leave-one-solvent-out folds
errors = []
single_folds = generate_leave_one_out_splits(X_single, Y_single)
for i, (train_part, test_part) in zip(range(5), single_folds):
    fold_X, fold_Y = test_part
    ensemble = DiverseEnsemble(data='single')
    ensemble.train_model(*train_part)
    fold_preds = ensemble.predict(fold_X).numpy()
    mae = np.abs(fold_preds - fold_Y.values).mean()
    errors.append(mae)
    solvent = fold_X['SOLVENT NAME'].iloc[0]
    print(f"Fold {i} ({solvent}): MAE = {mae:.4f}")

print(f"\nQuick test MAE (single): {np.mean(errors):.4f}")

# Mixture task: first 3 leave-one-ramp-out folds
print("\nTesting on full data...")
X_full, Y_full = load_data("full")
errors_full = []
ramp_folds = generate_leave_one_ramp_out_splits(X_full, Y_full)
for i, (train_part, test_part) in zip(range(3), ramp_folds):
    fold_X, fold_Y = test_part
    ensemble = DiverseEnsemble(data='full')
    ensemble.train_model(*train_part)
    fold_preds = ensemble.predict(fold_X).numpy()
    mae = np.abs(fold_preds - fold_Y.values).mean()
    errors_full.append(mae)
    print(f"Fold {i}: MAE = {mae:.4f}")

print(f"\nQuick test MAE (full): {np.mean(errors_full):.4f}")

Testing DiverseEnsemble...


Fold 0 (1,1,1,3,3,3-Hexafluoropropan-2-ol): MAE = 0.1717


Fold 1 (2,2,2-Trifluoroethanol): MAE = 0.1180


Fold 2 (2-Methyltetrahydrofuran [2-MeTHF]): MAE = 0.0335


Fold 3 (Acetonitrile): MAE = 0.1149


Fold 4 (Acetonitrile.Acetic Acid): MAE = 0.1098

Quick test MAE (single): 0.1096

Testing on full data...


Fold 0: MAE = 0.0626


Fold 1: MAE = 0.1038


Fold 2: MAE = 0.0598

Quick test MAE (full): 0.0754


## Template-Compliant Cross-Validation

The following 3 cells are the FINAL 3 cells - EXACTLY as in the template.

In [6]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = DiverseEnsemble(data='single') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:02,  2.05s/it]

2it [00:04,  2.06s/it]

3it [00:06,  2.08s/it]

4it [00:08,  2.05s/it]

5it [00:10,  2.05s/it]

6it [00:12,  2.04s/it]

7it [00:14,  2.05s/it]

8it [00:16,  2.05s/it]

9it [00:18,  2.03s/it]

10it [00:20,  2.05s/it]

11it [00:22,  2.02s/it]

12it [00:24,  2.01s/it]

13it [00:26,  2.02s/it]

14it [00:28,  2.05s/it]

15it [00:30,  2.08s/it]

16it [00:32,  2.11s/it]

17it [00:34,  2.10s/it]

18it [00:37,  2.08s/it]

19it [00:39,  2.07s/it]

20it [00:41,  2.05s/it]

21it [00:43,  2.03s/it]

22it [00:45,  2.03s/it]

23it [00:47,  2.01s/it]

24it [00:49,  2.01s/it]

24it [00:49,  2.05s/it]




In [7]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = DiverseEnsemble(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:02,  2.45s/it]

2it [00:04,  2.44s/it]

3it [00:07,  2.45s/it]

4it [00:09,  2.40s/it]

5it [00:12,  2.43s/it]

6it [00:14,  2.42s/it]

7it [00:17,  2.43s/it]

8it [00:19,  2.41s/it]

9it [00:21,  2.43s/it]

10it [00:24,  2.42s/it]

11it [00:26,  2.42s/it]

12it [00:29,  2.42s/it]

13it [00:31,  2.41s/it]

13it [00:31,  2.42s/it]




In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################