# Experiment 010: MLP + GBDT Ensemble (Like Top Kernel)

**Key Insight from Loop 9**: The top kernel (lishellliang) uses an MLP + XGBoost + RF + LightGBM ensemble.
Our diverse ensemble (exp_009) had a WORSE CV score than the per-target model (exp_004) because it was missing an MLP.

**Implementation**:
- MLP: [128, 64, 32] with BatchNorm + ReLU + Dropout(0.2), Sigmoid output
- XGBoost: n_estimators=200, max_depth=6, learning_rate=0.05
- RandomForest: n_estimators=200, max_depth=10
- LightGBM: n_estimators=200, max_depth=6, learning_rate=0.05
- Ensemble Weights: [0.35, 0.25, 0.25, 0.15] for MLP, XGB, RF, LGB
- Features: Spange descriptors only (simpler may generalize better)

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
import lightgbm as lgb
from abc import ABC
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Directory that load_data / load_features read their CSV files from.
DATA_PATH = '/home/data'
# Make float64 the default floating-point dtype for tensors and module
# parameters created after this point.
torch.set_default_dtype(torch.double)
# Report whether a CUDA device is visible and, if so, the name of device 0.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA available: True
GPU: NVIDIA H100 80GB HBM3


In [2]:
# --- UTILITY FUNCTIONS ---
# Names of the numeric input columns.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Names of the target columns; load_data selects exactly these as Y.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one of the two yield tables and split it into inputs and targets.

    Args:
        name: "full" for the solvent-mixture table or "single_solvent" for
            the single-solvent table. Any other value fails the assertion.

    Returns:
        A pair ``(X, Y)`` of DataFrames: ``X`` holds the input columns for the
        chosen table and ``Y`` holds the ``TARGET_LABELS`` columns.
    """
    assert name in ["full", "single_solvent"]
    # Each dataset is described by its file name and the input columns to keep.
    csv_name, input_columns = {
        "full": (
            "catechol_full_data_yields.csv",
            ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"],
        ),
        "single_solvent": (
            "catechol_single_solvent_yields.csv",
            ["Residence Time", "Temperature", "SOLVENT NAME"],
        ),
    }[name]
    table = pd.read_csv(f'{DATA_PATH}/{csv_name}')
    inputs = table[input_columns]
    targets = table[TARGET_LABELS]
    return inputs, targets

def load_features(name="spange_descriptors"):
    """Read the ``<name>_lookup.csv`` table from ``DATA_PATH``.

    Args:
        name: Prefix of the lookup file (the ``_lookup.csv`` suffix is added).

    Returns:
        A DataFrame whose index is the first column of the CSV file.
    """
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per distinct solvent, in sorted name order.

    Args:
        X: DataFrame with a "SOLVENT NAME" column.
        Y: Targets, row-aligned with ``X``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
        every row of the held-out solvent and the train part holds the rest.
    """
    solvent_names = sorted(X["SOLVENT NAME"].unique())
    for held_out in solvent_names:
        is_test = X["SOLVENT NAME"] == held_out
        is_train = ~is_test
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``.

    Args:
        X: DataFrame with "SOLVENT A NAME" and "SOLVENT B NAME" columns.
        Y: Targets, row-aligned with ``X``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
        every row of the held-out pair and the train part holds the rest.
    """
    unique_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for solvent_a, solvent_b in unique_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        outside_ramp = ~in_ramp
        yield (X[outside_ramp], Y[outside_ramp]), (X[in_ramp], Y[in_ramp])

# Load Spange descriptors only (like top kernel)
# The table is indexed by the first CSV column (index_col=0 in load_features);
# MLPGBDTEnsemble looks descriptor rows up by solvent name via .loc.
SPANGE_DF = load_features('spange_descriptors')
print(f"Spange: {SPANGE_DF.shape}")

Spange: (26, 13)


In [3]:
# --- BASE CLASSES ---
class SmilesFeaturizer(ABC):
    """Placeholder featurizer interface.

    Both the constructor and ``featurize`` raise ``NotImplementedError``, so a
    usable featurizer has to override each of them.
    """

    def __init__(self):
        raise NotImplementedError

    def featurize(self, X):
        """Turn ``X`` into features; must be provided by a subclass."""
        raise NotImplementedError

class BaseModel(ABC):
    """Minimal model interface used by the cross-validation cells.

    Subclasses override ``train_model`` and ``predict``; the base versions
    only raise ``NotImplementedError``.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on ``X_train`` / ``y_train``; must be overridden."""
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for ``X``; must be overridden.

        ``X`` is accepted here so the base signature agrees with how
        ``predict`` is called and overridden (``model.predict(test_X)``).
        It defaults to ``None`` so an argument-less call still reaches the
        ``NotImplementedError`` as before.
        """
        raise NotImplementedError

In [4]:
# --- MLP ARCHITECTURE (from top kernel) ---
class EnhancedMLP(nn.Module):
    """MLP with BatchNorm + ReLU + Dropout, Sigmoid output.

    Layer order: BatchNorm1d on the raw input, then for every entry of
    ``hidden_dims`` a Linear -> BatchNorm1d -> ReLU -> Dropout group, then a
    final Linear followed by Sigmoid.

    Args:
        input_dim: Number of input features.
        hidden_dims: Width of each hidden layer, in order. Any iterable of
            ints works; the default is a tuple so that no mutable object is
            shared between calls.
        output_dim: Number of outputs.
        dropout: Dropout probability applied after every hidden ReLU.
    """

    def __init__(self, input_dim, hidden_dims=(128, 64, 32), output_dim=3, dropout=0.2):
        super().__init__()

        # Input BatchNorm
        layers = [nn.BatchNorm1d(input_dim)]

        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            prev_dim = h_dim

        layers.append(nn.Linear(prev_dim, output_dim))
        # Sigmoid keeps every output inside (0, 1); the targets are yields in 0-1.
        layers.append(nn.Sigmoid())

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the network to ``x`` and return the sigmoid outputs."""
        return self.net(x)

In [5]:
# --- MLP + GBDT ENSEMBLE MODEL ---
class MLPGBDTEnsemble(BaseModel):
    """Ensemble of MLP + XGBoost + RandomForest + LightGBM.

    Key components:
    - MLP: neural network (EnhancedMLP)
    - XGBoost: gradient boosting
    - RandomForest: bagging ensemble
    - LightGBM: gradient boosting

    Ensemble weights: [0.35, 0.25, 0.25, 0.15] for MLP, XGB, RF, LGB
    Features: residence time, temperature and Spange descriptors only.

    Args:
        data: 'full' selects the solvent-mixture feature layout; any other
            value (default 'single') selects the single-solvent layout.
    """

    def __init__(self, data='single'):
        super().__init__()
        self.data_type = data
        self.mixed = (data == 'full')
        self.spange = SPANGE_DF
        self.scaler = StandardScaler()

        # Sub-models are created by train_model().
        self.mlp = None
        self.xgb_model = None
        self.rf_model = None
        self.lgb_model = None

        # Ensemble weights
        self.weights = [0.35, 0.25, 0.25, 0.15]  # MLP, XGB, RF, LGB

    def _build_features(self, X):
        """Build the feature matrix: Time + Temp (+ SolventB%) + Spange descriptors.

        For mixtures the descriptors of solvents A and B are blended linearly
        as ``A * (1 - pct) + B * pct`` and ``pct`` itself is kept as a column.
        This assumes "SolventB%" is a fraction in [0, 1] - TODO confirm
        against the data file.

        Args:
            X: DataFrame with the input columns of the selected layout.

        Returns:
            2-D numpy array with one row per row of ``X``.
        """
        rt = X['Residence Time'].values.astype(np.float64).reshape(-1, 1)
        temp = X['Temperature'].values.astype(np.float64).reshape(-1, 1)

        if self.mixed:
            # Cast like the other numeric inputs so the blend below is always
            # done in float64, whatever dtype the CSV column was parsed as.
            pct = X['SolventB%'].values.astype(np.float64).reshape(-1, 1)
            A_spange = self.spange.loc[X['SOLVENT A NAME']].values
            B_spange = self.spange.loc[X['SOLVENT B NAME']].values
            spange_feats = A_spange * (1 - pct) + B_spange * pct
            return np.hstack([rt, temp, pct, spange_feats])

        spange_feats = self.spange.loc[X['SOLVENT NAME']].values
        return np.hstack([rt, temp, spange_feats])

    def _fit_mlp(self, X_scaled, y, epochs=200, batch_size=32):
        """Train ``self.mlp`` on already-scaled features.

        Args:
            X_scaled: 2-D array of scaled features.
            y: 2-D array of targets, row-aligned with ``X_scaled``.
            epochs: Number of passes over the training data.
            batch_size: Mini-batch size.

        Raises:
            ValueError: If fewer than two training rows are given; BatchNorm
                cannot compute batch statistics from a single row.
        """
        n_samples = X_scaled.shape[0]
        if n_samples < 2:
            raise ValueError("MLP training needs at least 2 samples (BatchNorm requires batches of size > 1)")

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.mlp = EnhancedMLP(
            X_scaled.shape[1], hidden_dims=[128, 64, 32], output_dim=3, dropout=0.2
        ).to(device)
        self.mlp.train()

        X_tensor = torch.tensor(X_scaled, dtype=torch.double).to(device)
        y_tensor = torch.tensor(y, dtype=torch.double).to(device)
        dataset = TensorDataset(X_tensor, y_tensor)

        # When the row count is one more than a multiple of batch_size, the
        # final shuffled batch holds a single row, which BatchNorm1d rejects
        # in training mode. Drop that trailing batch only in this case; for
        # every other row count the loader behaves exactly as before.
        lone_last_batch = (n_samples % batch_size == 1)
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=lone_last_batch)

        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=5e-4, weight_decay=1e-5)
        criterion = nn.HuberLoss()
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=20)

        for _ in range(epochs):
            epoch_loss = 0.0
            n_seen = 0
            for inputs, targets in loader:
                optimizer.zero_grad()
                outputs = self.mlp(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.mlp.parameters(), 1.0)
                optimizer.step()
                epoch_loss += loss.item() * inputs.size(0)
                n_seen += inputs.size(0)
            # Mean training loss over the rows actually used this epoch
            # (equals len(dataset) unless a trailing batch was dropped).
            scheduler.step(epoch_loss / n_seen)

    def train_model(self, X_train, y_train):
        """Fit the scaler and all four sub-models.

        Args:
            X_train: Input DataFrame in the layout selected at construction.
            y_train: Target DataFrame, row-aligned with ``X_train``.
        """
        X_feat = self._build_features(X_train)
        X_scaled = self.scaler.fit_transform(X_feat)
        y = y_train.values

        # 1. Train MLP
        self._fit_mlp(X_scaled, y)

        # 2. Train XGBoost
        self.xgb_model = MultiOutputRegressor(
            xgb.XGBRegressor(
                n_estimators=200, max_depth=6, learning_rate=0.05,
                subsample=0.8, colsample_bytree=0.8,
                random_state=42, verbosity=0
            )
        )
        self.xgb_model.fit(X_scaled, y)

        # 3. Train RandomForest
        self.rf_model = MultiOutputRegressor(
            RandomForestRegressor(
                n_estimators=200, max_depth=10, min_samples_leaf=2,
                random_state=42, n_jobs=-1
            )
        )
        self.rf_model.fit(X_scaled, y)

        # 4. Train LightGBM
        # NOTE(review): LightGBM documents that row subsampling only takes
        # effect when subsample_freq (bagging_freq) is non-zero; with the
        # default of 0, subsample=0.8 is presumably ignored here. Left as-is
        # so recorded results stay comparable - confirm before relying on it.
        self.lgb_model = MultiOutputRegressor(
            lgb.LGBMRegressor(
                n_estimators=200, max_depth=6, learning_rate=0.05,
                subsample=0.8, colsample_bytree=0.8,
                random_state=42, verbosity=-1
            )
        )
        self.lgb_model.fit(X_scaled, y)

    def predict(self, X):
        """Return the weighted ensemble prediction for ``X``.

        Args:
            X: Input DataFrame in the layout selected at construction.

        Returns:
            A ``torch.double`` tensor of predictions clipped to [0, 1], one
            row per row of ``X``.
        """
        X_feat = self._build_features(X)
        X_scaled = self.scaler.transform(X_feat)

        # Run the MLP on whichever device it was trained on.
        device = next(self.mlp.parameters()).device

        # MLP predictions (eval mode: BatchNorm uses running stats, Dropout is off)
        self.mlp.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X_scaled, dtype=torch.double).to(device)
            mlp_preds = self.mlp(X_tensor).cpu().numpy()

        # GBDT predictions
        xgb_preds = self.xgb_model.predict(X_scaled)
        rf_preds = self.rf_model.predict(X_scaled)
        lgb_preds = self.lgb_model.predict(X_scaled)

        # Weighted ensemble
        final_preds = (
            self.weights[0] * mlp_preds +
            self.weights[1] * xgb_preds +
            self.weights[2] * rf_preds +
            self.weights[3] * lgb_preds
        )

        final_preds = np.clip(final_preds, 0, 1)
        return torch.tensor(final_preds, dtype=torch.double)

In [6]:
# --- QUICK VALIDATION TEST ---
print("Testing MLPGBDTEnsemble...")
X_single, Y_single = load_data("single_solvent")

# Smoke test: leave-one-solvent-out on the first 3 folds only.
errors = []
first_folds = zip(range(3), generate_leave_one_out_splits(X_single, Y_single))
for i, ((train_X, train_Y), (test_X, test_Y)) in first_folds:
    model = MLPGBDTEnsemble(data='single')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # Mean absolute error over every row and target of the held-out solvent.
    mae = np.mean(np.abs(preds - test_Y.values))
    errors.append(mae)
    solvent = test_X['SOLVENT NAME'].iloc[0]
    print(f"Fold {i} ({solvent}): MAE = {mae:.4f}")

print(f"\nQuick test MAE (single): {np.mean(errors):.4f}")

Testing MLPGBDTEnsemble...


Fold 0 (1,1,1,3,3,3-Hexafluoropropan-2-ol): MAE = 0.1714


Fold 1 (2,2,2-Trifluoroethanol): MAE = 0.1275


Fold 2 (2-Methyltetrahydrofuran [2-MeTHF]): MAE = 0.0261

Quick test MAE (single): 0.1084


## Template-Compliant Cross-Validation

The following 3 cells are the FINAL 3 cells - EXACTLY as in the template.

In [7]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = MLPGBDTEnsemble(data='single') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:07,  7.73s/it]

2it [00:15,  7.72s/it]

3it [00:22,  7.59s/it]

4it [00:30,  7.66s/it]

5it [00:38,  7.72s/it]

6it [00:46,  7.73s/it]

7it [00:53,  7.74s/it]

8it [01:01,  7.73s/it]

9it [01:09,  7.73s/it]

10it [01:17,  7.77s/it]

11it [01:24,  7.75s/it]

12it [01:33,  7.86s/it]

13it [01:41,  7.91s/it]

14it [01:48,  7.87s/it]

15it [01:56,  7.86s/it]

16it [02:04,  7.91s/it]

17it [02:13,  8.15s/it]

18it [02:21,  8.04s/it]

19it [02:28,  7.94s/it]

20it [02:36,  7.90s/it]

21it [02:44,  7.85s/it]

22it [02:52,  7.80s/it]

23it [02:59,  7.77s/it]

24it [03:07,  7.75s/it]

24it [03:07,  7.82s/it]




In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = MLPGBDTEnsemble(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:13, 13.14s/it]

2it [00:26, 13.12s/it]

3it [00:39, 13.27s/it]

4it [00:52, 13.21s/it]

5it [01:05, 13.16s/it]

6it [01:19, 13.17s/it]

7it [01:32, 13.17s/it]

8it [01:45, 13.21s/it]

9it [01:58, 13.19s/it]

10it [02:13, 13.57s/it]

11it [02:28, 14.06s/it]

12it [02:43, 14.27s/it]

13it [02:57, 14.37s/it]

13it [02:57, 13.66s/it]




In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################