# Stacking Ensemble: MLP + Tree-Based Models

This notebook combines:
1. MLP with Arrhenius features, BatchNorm, Dropout, HuberLoss (from exp_000)
2. Tree-based per-target ensemble (from exp_001)

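The two components' predictions are blended with a fixed weighted average (no meta-learner is trained); the goal is to leverage diversity between the model types to reduce variance.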

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from abc import ABC, abstractmethod
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed every random number generator used in this notebook (Python, NumPy, PyTorch)
import random

for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(42)

# Query CUDA once and reuse the answer for both seeding and reporting
_HAS_CUDA = torch.cuda.is_available()
if _HAS_CUDA:
    torch.cuda.manual_seed_all(42)
    torch.backends.cudnn.deterministic = True

# All tensors created from here on default to float64
torch.set_default_dtype(torch.double)

print(f"CUDA available: {_HAS_CUDA}")
if _HAS_CUDA:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA available: True
GPU: NVIDIA H100 80GB HBM3


In [2]:
# Data loading utilities
# Directory holding the yield CSVs and the descriptor lookup tables
DATA_PATH = '/home/data'

# Numeric process conditions present in both datasets
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Single-solvent runs: process conditions plus one solvent name
INPUT_LABELS_SINGLE_SOLVENT = [*INPUT_LABELS_NUMERIC, "SOLVENT NAME"]
# Mixture runs: process conditions plus both solvent names and the share of solvent B
INPUT_LABELS_FULL_SOLVENT = [
    *INPUT_LABELS_NUMERIC,
    "SOLVENT A NAME",
    "SOLVENT B NAME",
    "SolventB%",
]
# Regression targets, in the column order used for predictions
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one of the two yield datasets and split it into inputs and targets.

    Args:
        name: "full" for the solvent-mixture data or "single_solvent" for the
            single-solvent data. Any other value trips the assertion.

    Returns:
        Tuple ``(X, Y)`` of DataFrames: the input columns for the chosen
        dataset and the TARGET_LABELS columns, both taken from the same CSV.
    """
    assert name in ["full", "single_solvent"]
    if name == "full":
        csv_name = 'catechol_full_data_yields.csv'
        input_columns = INPUT_LABELS_FULL_SOLVENT
    else:
        csv_name = 'catechol_single_solvent_yields.csv'
        input_columns = INPUT_LABELS_SINGLE_SOLVENT
    frame = pd.read_csv(f'{DATA_PATH}/{csv_name}')
    return frame[input_columns], frame[TARGET_LABELS]

def load_features(name="spange_descriptors"):
    """Load the descriptor table ``<name>_lookup.csv`` from DATA_PATH.

    The first CSV column becomes the index, which the featurizers use for
    ``.loc`` lookups by solvent name.
    """
    lookup_csv = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_csv, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of the unique "SOLVENT NAME" values.
    Y is filtered with the boolean mask built from X, so both frames must
    share the same index.

    Yields:
        ``((X_train, Y_train), (X_test, Y_test))`` where the test part holds
        every row of the held-out solvent and the train part holds the rest.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        keep = solvent_names != held_out
        train_part = (X[keep], Y[keep])
        test_part = (X[~keep], Y[~keep])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in X. The pair is ordered,
    so (A, B) and (B, A) are held out as separate folds. Y is filtered with
    the mask built from X, so both frames must share the same index.

    Yields:
        ``((X_train, Y_train), (X_test, Y_test))`` where the test part holds
        every row of the held-out pair and the train part holds the rest.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    distinct_pairs = X[pair_columns].drop_duplicates()
    for solvent_a, solvent_b in distinct_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        outside_ramp = ~in_ramp
        yield (X[outside_ramp], Y[outside_ramp]), (X[~outside_ramp], Y[~outside_ramp])

# Read both solvent descriptor tables once; the featurizers share these frames
SPANGE_DF, ACS_PCA_DF = (
    load_features(table_name)
    for table_name in ('spange_descriptors', 'acs_pca_descriptors')
)
print(f"Spange: {SPANGE_DF.shape}, ACS PCA: {ACS_PCA_DF.shape}")

Spange: (26, 13), ACS PCA: (24, 5)


In [3]:
# Base class
class BaseModel(ABC):
    """Minimal interface shared by the models in this notebook.

    Subclasses override ``train_model`` and ``predict``; the base versions
    only raise ``NotImplementedError``. The methods are intentionally not
    marked abstract, so the class itself remains instantiable.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs and targets."""
        raise NotImplementedError

    def predict(self, X_test=None):
        """Return predictions for ``X_test``.

        The parameter mirrors the ``predict(self, X)`` signature that every
        subclass implements. It defaults to ``None`` so that a bare
        ``predict()`` call still reaches the ``NotImplementedError``.
        """
        raise NotImplementedError

In [4]:
# ============ MLP COMPONENT ============
# Kinetic Featurizer with Arrhenius features
class KineticMixingFeaturizer:
    """Turn raw experiment rows into a tensor of kinetic + solvent features.

    Each output row is laid out as
    ``[residence time, temperature, 1000/T_K, log(time), (1000/T_K)*log(time), <solvent descriptors>]``
    where the descriptors come from SPANGE_DF. For mixtures the descriptors of
    solvents A and B are blended linearly using the "SolventB%" column.

    NOTE(review): the ``flip=True`` branch evaluates
    ``B * (1 - (1 - pct)) + A * (1 - pct)``, which is algebraically the same
    blend as the unflipped ``A * (1 - pct) + B * pct``. Flipped features
    therefore match the unflipped ones up to floating-point rounding.
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.featurizer = SPANGE_DF
        # descriptor columns + 2 raw numeric inputs + 3 derived kinetic terms
        self.feats_dim = self.featurizer.shape[1] + 2 + 3

    def featurize(self, X, flip=False):
        """Build the feature tensor for the rows of ``X``.

        Args:
            X: DataFrame with the INPUT_LABELS_NUMERIC columns plus either
                "SOLVENT NAME" (single solvent) or "SOLVENT A NAME",
                "SOLVENT B NAME" and "SolventB%" (mixtures).
            flip: Only used for mixtures; selects the alternate form of the
                blend expression (see the class note).

        Returns:
            2-D tensor with kinetic columns first, solvent descriptors after.
        """
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)

        # Column 0 is "Residence Time", column 1 is "Temperature".
        # The +273.15 offset presumably converts Celsius to Kelvin — confirm units.
        residence_time = numeric[:, 0:1]
        temperature = numeric[:, 1:2]
        inv_temp = 1000.0 / (temperature + 273.15)
        # The small offset keeps log() finite for a zero residence time
        log_time = np.log(residence_time + 1e-6)
        kinetic_block = torch.tensor(
            np.hstack([numeric, inv_temp, log_time, inv_temp * log_time])
        )

        if not self.mixed:
            solvent_block = torch.tensor(self.featurizer.loc[X["SOLVENT NAME"]].values)
            return torch.cat([kinetic_block, solvent_block], dim=1)

        desc_a = torch.tensor(self.featurizer.loc[X["SOLVENT A NAME"]].values)
        desc_b = torch.tensor(self.featurizer.loc[X["SOLVENT B NAME"]].values)
        # assumes "SolventB%" is a fraction in [0, 1] rather than 0-100 — TODO confirm
        frac_b = torch.tensor(X["SolventB%"].values.reshape(-1, 1))
        if flip:
            solvent_block = desc_b * (1 - (1 - frac_b)) + desc_a * (1 - frac_b)
        else:
            solvent_block = desc_a * (1 - frac_b) + desc_b * frac_b

        return torch.cat([kinetic_block, solvent_block], dim=1)

# MLP Internal Model
class MLPInternal(nn.Module):
    """Feed-forward network mapping a feature vector to three values in (0, 1).

    Layout: input BatchNorm, then three hidden stages of widths 128, 128 and
    64 (each Linear -> BatchNorm -> ReLU -> Dropout(0.2)), then a Linear head
    with 3 outputs followed by a Sigmoid.
    """

    def __init__(self, input_dim):
        super().__init__()
        stack = [nn.BatchNorm1d(input_dim)]
        fan_in = input_dim
        for fan_out in (128, 128, 64):
            stack.extend([
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(0.2),
            ])
            fan_in = fan_out
        stack.extend([nn.Linear(fan_in, 3), nn.Sigmoid()])
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` (batch, input_dim) through the layer stack."""
        return self.net(x)

# MLP Model with bagging
class MLPWithArrhenius(nn.Module):
    """Bagged ensemble of ``MLPInternal`` networks on kinetic + solvent features.

    ``train_model`` fits ``n_models`` independently initialised networks on the
    same data; ``predict`` averages their outputs. For ``data='full'`` the
    training set is extended with the featurizer's ``flip=True`` view and
    predictions are averaged over both views.

    NOTE(review): KineticMixingFeaturizer's flip branch is algebraically the
    same blend as the unflipped one, so the "flipped" rows are (up to rounding)
    duplicates of the originals. The augmentation mainly doubles the number of
    training rows.

    Args:
        data: 'single' for single-solvent inputs, 'full' for solvent mixtures.
        n_models: Number of networks trained per ``train_model`` call.
        epochs: Training epochs per network.
    """

    def __init__(self, data='single', n_models=3, epochs=200):
        super().__init__()
        self.data_type = data
        self.featurizer = KineticMixingFeaturizer(mixed=(data=='full'))
        self.n_models = n_models
        self.epochs = epochs
        self.models = nn.ModuleList()

    def train_model(self, X_train, y_train):
        """Fit a fresh ensemble on ``X_train`` / ``y_train`` (DataFrames).

        Any members left over from a previous call are discarded first, so the
        ensemble always holds exactly the networks trained by the latest call.
        """
        X_std = self.featurizer.featurize(X_train, flip=False)
        y_vals = torch.tensor(y_train.values)

        if self.data_type == 'full':
            X_flip = self.featurizer.featurize(X_train, flip=True)
            X_all = torch.cat([X_std, X_flip], dim=0)
            y_all = torch.cat([y_vals, y_vals], dim=0)
        else:
            X_all = X_std
            y_all = y_vals

        input_dim = X_all.shape[1]
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        batch_size = 32
        dataset = TensorDataset(X_all, y_all)
        # BatchNorm1d cannot compute batch statistics from a single sample in
        # training mode, so a trailing mini-batch of size 1 would raise. Drop
        # it only in that case; every other dataset size batches as before.
        drop_last = len(dataset) > batch_size and len(dataset) % batch_size == 1

        # Start from an empty ensemble so repeated calls do not pile new
        # members on top of stale ones.
        self.models = nn.ModuleList()

        for _ in range(self.n_models):
            model = MLPInternal(input_dim).to(device)
            model.train()
            self.models.append(model)

            loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last)

            optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-5)
            criterion = nn.HuberLoss()
            # Halve the learning rate when the mean training loss stalls for 20 epochs
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=20)

            for _epoch in range(self.epochs):
                epoch_loss = 0.0
                n_seen = 0
                for inputs, targets in loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    epoch_loss += loss.item() * inputs.size(0)
                    n_seen += inputs.size(0)
                # Normalise by the samples actually visited; this equals
                # len(dataset) unless the trailing batch was dropped.
                scheduler.step(epoch_loss / max(n_seen, 1))

    def predict(self, X):
        """Return the ensemble-averaged prediction for ``X`` as a CPU tensor.

        Raises:
            RuntimeError: if called before ``train_model``.
        """
        if len(self.models) == 0:
            raise RuntimeError("MLPWithArrhenius.predict called before train_model")

        device = next(self.models[0].parameters()).device

        views = [self.featurizer.featurize(X, flip=False).to(device)]
        if self.data_type == 'full':
            views.append(self.featurizer.featurize(X, flip=True).to(device))

        pred_sum = None
        with torch.no_grad():
            for model in self.models:
                model.eval()
                member_pred = model(views[0])
                if len(views) == 2:
                    member_pred = (member_pred + model(views[1])) * 0.5
                pred_sum = member_pred if pred_sum is None else pred_sum + member_pred

        # Divide by the number of members actually held, not the configured count
        avg_pred = pred_sum / len(self.models)
        return avg_pred.cpu()

print("MLP component defined")

MLP component defined


In [5]:
# ============ TREE COMPONENT ============
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler

class TreeEnsembleModel(BaseModel):
    """Per-target tree regressors on kinetic + Spange + ACS-PCA features.

    "SM" is fitted with HistGradientBoostingRegressor; "Product 2" and
    "Product 3" each get an ExtraTreesRegressor. Features are standardised
    with a StandardScaler fitted on the training rows. For ``data='full'`` the
    training set is extended with the ``flip=True`` feature view and
    predictions are averaged over both views.

    NOTE(review): with ``flip=True`` the solvents and the fraction are swapped
    together, which leaves the linear blend algebraically unchanged. The
    flipped features therefore equal the originals up to rounding.
    """

    def __init__(self, data='single'):
        super().__init__()
        self.data_type = data
        self.spange_df = SPANGE_DF
        self.acs_pca_df = ACS_PCA_DF
        self.scaler = StandardScaler()
        self.models = {}

    def _create_features(self, X, flip=False):
        """Return the 2-D feature array for ``X``.

        Column order: residence time, temperature, 1000/T_K, log(time), their
        product, Spange descriptors, ACS-PCA descriptors.
        """
        residence_time = X["Residence Time"].values.reshape(-1, 1)
        temperature = X["Temperature"].values.reshape(-1, 1)

        inv_temp = 1000.0 / (temperature + 273.15)
        log_time = np.log(residence_time + 1e-6)
        cross_term = inv_temp * log_time

        if self.data_type != 'full':
            spange_feat = self.spange_df.loc[X["SOLVENT NAME"]].values
            acs_feat = self.acs_pca_df.loc[X["SOLVENT NAME"]].values
        else:
            weight_second = X["SolventB%"].values.reshape(-1, 1)
            first_col, second_col = "SOLVENT A NAME", "SOLVENT B NAME"
            if flip:
                # Swap the solvent roles and the mixing fraction together
                first_col, second_col = second_col, first_col
                weight_second = 1 - weight_second

            spange_first = self.spange_df.loc[X[first_col]].values
            spange_second = self.spange_df.loc[X[second_col]].values
            acs_first = self.acs_pca_df.loc[X[first_col]].values
            acs_second = self.acs_pca_df.loc[X[second_col]].values

            spange_feat = spange_first * (1 - weight_second) + spange_second * weight_second
            acs_feat = acs_first * (1 - weight_second) + acs_second * weight_second

        return np.hstack([
            residence_time, temperature, inv_temp, log_time, cross_term,
            spange_feat, acs_feat,
        ])

    def train_model(self, X_train, y_train):
        """Fit the scaler and one regressor per target column of ``y_train``."""
        design = self._create_features(X_train, flip=False)
        targets = y_train

        if self.data_type == 'full':
            flipped = self._create_features(X_train, flip=True)
            design = np.vstack([design, flipped])
            targets = pd.concat([y_train, y_train], ignore_index=True)

        X_scaled = self.scaler.fit_transform(design)

        # Estimators are registered and fitted in this order: SM, Product 2, Product 3
        estimators = {
            'SM': HistGradientBoostingRegressor(
                max_depth=7, max_iter=700, learning_rate=0.04, random_state=42, early_stopping=False
            ),
            'Product 2': ExtraTreesRegressor(n_estimators=500, min_samples_leaf=2, random_state=42, n_jobs=-1),
            'Product 3': ExtraTreesRegressor(n_estimators=500, min_samples_leaf=2, random_state=42, n_jobs=-1),
        }
        for target_name, estimator in estimators.items():
            self.models[target_name] = estimator
            estimator.fit(X_scaled, targets[target_name].values)

    def predict(self, X_test):
        """Predict all targets for ``X_test``.

        Returns:
            Tensor with columns (Product 2, Product 3, SM), each clipped to [0, 1].
        """
        scaled_views = [self.scaler.transform(self._create_features(X_test, flip=False))]
        if self.data_type == 'full':
            scaled_views.append(self.scaler.transform(self._create_features(X_test, flip=True)))

        columns = []
        for target_name in ('Product 2', 'Product 3', 'SM'):
            estimator = self.models[target_name]
            pred = estimator.predict(scaled_views[0])
            if len(scaled_views) == 2:
                # Average the predictions from the two feature views
                pred = (pred + estimator.predict(scaled_views[1])) / 2
            columns.append(np.clip(pred, 0, 1))

        return torch.tensor(np.column_stack(columns))

print("Tree component defined")

Tree component defined


In [6]:
# ============ STACKING ENSEMBLE ============
class StackingEnsembleModel(BaseModel):
    """Fixed-weight blend of the MLP ensemble and the tree ensemble.

    Despite the name, no meta-learner is trained: the prediction is
    ``mlp_weight * mlp + (1 - mlp_weight) * trees``, clamped to [0, 1].

    Args:
        data: 'single' for single-solvent inputs, 'full' for solvent mixtures;
            forwarded to both components.
        mlp_weight: Weight of the MLP prediction; the trees get ``1 - mlp_weight``.
        n_models: Number of bagged networks in the MLP component.
        epochs: Training epochs per network in the MLP component.
    """

    def __init__(self, data='single', mlp_weight=0.5, n_models=3, epochs=200):
        super().__init__()
        self.data_type = data
        self.mlp_weight = mlp_weight
        self.tree_weight = 1 - mlp_weight

        # Initialize components
        self.mlp = MLPWithArrhenius(data=data, n_models=n_models, epochs=epochs)
        self.trees = TreeEnsembleModel(data=data)

    def train_model(self, X_train, y_train):
        """Train both components on the same training data (MLP first, then trees)."""
        self.mlp.train_model(X_train, y_train)
        self.trees.train_model(X_train, y_train)

    def predict(self, X_test):
        """Return the weighted average of both components' predictions.

        Returns:
            Tensor with one row per row of ``X_test`` and values clamped to [0, 1].
        """
        pred_mlp = self.mlp.predict(X_test)
        pred_trees = self.trees.predict(X_test)

        # Weighted average
        combined = self.mlp_weight * pred_mlp + self.tree_weight * pred_trees

        # Keep the blended values inside the valid [0, 1] range
        return torch.clamp(combined, 0, 1)

print("StackingEnsembleModel defined")

StackingEnsembleModel defined


In [7]:
# Smoke test: fit the blended model on a single leave-one-solvent-out fold
print("Testing stacking ensemble...")
X, Y = load_data("single_solvent")
print(f"Single solvent: {X.shape}, {Y.shape}")

# Only the first fold is needed, so pull one item straight from the generator
(train_X, train_Y), (test_X, test_Y) = next(generate_leave_one_out_splits(X, Y))

model = StackingEnsembleModel(data='single', mlp_weight=0.5)
model.train_model(train_X, train_Y)
preds = model.predict(test_X)
print(f"Predictions shape: {preds.shape}")
print(f"Sample predictions: {preds[:3]}")

Testing stacking ensemble...
Single solvent: (656, 3), (656, 3)


Predictions shape: torch.Size([37, 3])
Sample predictions: tensor([[0.0039, 0.0044, 0.8770],
        [0.0075, 0.0085, 0.8817],
        [0.0256, 0.0289, 0.8124]])


In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []
all_actuals = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=24):
    (train_X, train_Y), (test_X, test_Y) = split

    model = StackingEnsembleModel(data='single', mlp_weight=0.5)  # CHANGE THIS LINE ONLY (mlp_weight=0.5 weights the MLP and tree predictions equally)
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()
    all_actuals.append(test_Y.values)

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

# Calculate CV score
all_actuals_np = np.vstack(all_actuals)
all_preds_np = np.array([[p['target_1'], p['target_2'], p['target_3']] for p in all_predictions])
single_mse = np.mean((all_actuals_np - all_preds_np) ** 2)
print(f"\nSingle Solvent CV MSE: {single_mse:.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [00:26<10:04, 26.28s/it]

  8%|▊         | 2/24 [00:52<09:43, 26.54s/it]

 12%|█▎        | 3/24 [01:18<09:03, 25.87s/it]

 17%|█▋        | 4/24 [01:43<08:32, 25.61s/it]

 21%|██        | 5/24 [02:09<08:10, 25.83s/it]

 25%|██▌       | 6/24 [02:35<07:47, 25.94s/it]

 29%|██▉       | 7/24 [03:01<07:22, 26.04s/it]

 33%|███▎      | 8/24 [03:28<06:57, 26.08s/it]

 38%|███▊      | 9/24 [03:54<06:34, 26.27s/it]

 42%|████▏     | 10/24 [04:21<06:10, 26.43s/it]

 46%|████▌     | 11/24 [04:48<05:44, 26.46s/it]

 50%|█████     | 12/24 [05:14<05:17, 26.44s/it]

 54%|█████▍    | 13/24 [05:40<04:50, 26.44s/it]

 58%|█████▊    | 14/24 [06:07<04:24, 26.43s/it]

 62%|██████▎   | 15/24 [06:33<03:57, 26.43s/it]

 67%|██████▋   | 16/24 [07:00<03:31, 26.39s/it]

 71%|███████   | 17/24 [07:27<03:07, 26.74s/it]

 75%|███████▌  | 18/24 [07:54<02:39, 26.66s/it]

 79%|███████▉  | 19/24 [08:20<02:12, 26.55s/it]

 83%|████████▎ | 20/24 [08:46<01:45, 26.47s/it]

 88%|████████▊ | 21/24 [09:13<01:19, 26.53s/it]

 92%|█████████▏| 22/24 [09:39<00:53, 26.56s/it]

 96%|█████████▌| 23/24 [10:06<00:26, 26.47s/it]

100%|██████████| 24/24 [10:32<00:00, 26.42s/it]

100%|██████████| 24/24 [10:32<00:00, 26.36s/it]


Single Solvent CV MSE: 0.009713





In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions_full = []
all_actuals_full = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=13):
    (train_X, train_Y), (test_X, test_Y) = split

    model = StackingEnsembleModel(data='full', mlp_weight=0.5)  # CHANGE THIS LINE ONLY (mlp_weight=0.5 weights the MLP and tree predictions equally)
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()
    all_actuals_full.append(test_Y.values)

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions_full.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions_full)

# Calculate CV score
all_actuals_full_np = np.vstack(all_actuals_full)
all_preds_full_np = np.array([[p['target_1'], p['target_2'], p['target_3']] for p in all_predictions_full])
full_mse = np.mean((all_actuals_full_np - all_preds_full_np) ** 2)
print(f"\nFull Data CV MSE: {full_mse:.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

  0%|          | 0/13 [00:00<?, ?it/s]

  8%|▊         | 1/13 [01:33<18:36, 93.03s/it]

 15%|█▌        | 2/13 [03:01<16:34, 90.37s/it]

 23%|██▎       | 3/13 [04:31<15:04, 90.41s/it]

 31%|███       | 4/13 [06:00<13:25, 89.54s/it]

 38%|███▊      | 5/13 [07:28<11:52, 89.06s/it]

 46%|████▌     | 6/13 [08:57<10:22, 88.94s/it]

 54%|█████▍    | 7/13 [10:26<08:53, 88.92s/it]

 62%|██████▏   | 8/13 [11:55<07:25, 89.19s/it]

 69%|██████▉   | 9/13 [13:24<05:56, 89.08s/it]

 77%|███████▋  | 10/13 [15:00<04:33, 91.10s/it]

 85%|████████▍ | 11/13 [16:36<03:05, 92.66s/it]

 92%|█████████▏| 12/13 [18:12<01:33, 93.65s/it]

100%|██████████| 13/13 [19:48<00:00, 94.29s/it]

100%|██████████| 13/13 [19:48<00:00, 91.39s/it]


Full Data CV MSE: 0.010610





In [10]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

# Calculate overall CV score
total_samples = len(all_actuals_np) + len(all_actuals_full_np)
overall_mse = (single_mse * len(all_actuals_np) + full_mse * len(all_actuals_full_np)) / total_samples

print(f"\n=== FINAL RESULTS ===")
print(f"Single Solvent MSE: {single_mse:.6f}")
print(f"Full Data MSE: {full_mse:.6f}")
print(f"Overall CV MSE: {overall_mse:.6f}")
print(f"\nSubmission saved to /home/submission/submission.csv")
print(f"Submission shape: {submission.shape}")

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################


=== FINAL RESULTS ===
Single Solvent MSE: 0.009713
Full Data MSE: 0.010610
Overall CV MSE: 0.010298

Submission saved to /home/submission/submission.csv
Submission shape: (1883, 7)
