# Experiment 003: Ensemble of MLP + LightGBM + XGBoost

Combining diverse model architectures:
- Physics-Informed MLP (our best baseline)
- LightGBM with per-target optimization
- XGBoost with per-target optimization

Using weighted averaging with optimized weights.

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
import tqdm
import warnings
warnings.filterwarnings('ignore')

print(f"GPU available: {torch.cuda.is_available()}")
torch.set_default_dtype(torch.double)

GPU available: True


In [2]:
# Data loading utilities
DATA_PATH = '/home/data'

INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    if name == "full":
        df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        X = df[INPUT_LABELS_FULL_SOLVENT]
    else:
        df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        X = df[INPUT_LABELS_SINGLE_SOLVENT]
    return X, df[TARGET_LABELS]

def load_features(name="spange_descriptors"):
    return pd.read_csv(f'{DATA_PATH}/{name}_lookup.csv', index_col=0)

def generate_leave_one_out_splits(X, Y):
    for solvent_name in sorted(X["SOLVENT NAME"].unique()):
        train_mask = X["SOLVENT NAME"] != solvent_name
        yield (X[train_mask], Y[train_mask]), (X[~train_mask], Y[~train_mask])

def generate_leave_one_ramp_out_splits(X, Y):
    ramps = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates().sort_values(["SOLVENT A NAME", "SOLVENT B NAME"])
    for _, row in ramps.iterrows():
        train_mask = (X[["SOLVENT A NAME", "SOLVENT B NAME"]] != row).any(axis=1)
        yield (X[train_mask], Y[train_mask]), (X[~train_mask], Y[~train_mask])

SPANGE_DF = load_features('spange_descriptors')
print(f"Spange descriptors: {SPANGE_DF.shape}")

Spange descriptors: (26, 13)


In [3]:
# Featurizer with Arrhenius kinetics
class KineticFeaturizer:
    def __init__(self, mixed=False):
        self.mixed = mixed
        self.featurizer = SPANGE_DF
        self.feats_dim = self.featurizer.shape[1] + 2 + 3

    def featurize(self, X, flip=False):
        X_vals = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        temp_c, time_m = X_vals[:, 1:2], X_vals[:, 0:1]
        temp_k = temp_c + 273.15
        inv_temp = 1000.0 / temp_k
        log_time = np.log(time_m + 1e-6)
        interaction = inv_temp * log_time
        X_kinetic = np.hstack([X_vals, inv_temp, log_time, interaction])
        
        if self.mixed:
            A = self.featurizer.loc[X["SOLVENT A NAME"]].values
            B = self.featurizer.loc[X["SOLVENT B NAME"]].values
            pct = X["SolventB%"].values.reshape(-1, 1)
            X_chem = B * (1 - (1-pct)) + A * (1-pct) if flip else A * (1 - pct) + B * pct
        else:
            X_chem = self.featurizer.loc[X["SOLVENT NAME"]].values
        return np.hstack([X_kinetic, X_chem])

In [4]:
# MLP Internal Model
class MLPInternal(nn.Module):
    def __init__(self, input_dim, hidden_dims=[128, 128, 64], dropout=0.2):
        super().__init__()
        layers = [nn.BatchNorm1d(input_dim)]
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([nn.Linear(prev_dim, h_dim), nn.BatchNorm1d(h_dim), nn.ReLU(), nn.Dropout(dropout)])
            prev_dim = h_dim
        layers.extend([nn.Linear(prev_dim, 3), nn.Sigmoid()])
        self.net = nn.Sequential(*layers)
    def forward(self, x): return self.net(x)

In [5]:
# Ensemble Model combining MLP + LightGBM + XGBoost
class EnsembleModel:
    def __init__(self, data='single', n_mlp_models=3, mlp_weight=0.5, lgb_weight=0.25, xgb_weight=0.25):
        self.data_type = data
        self.use_tta = (data == 'full')
        self.featurizer = KineticFeaturizer(mixed=(data=='full'))
        self.n_mlp_models = n_mlp_models
        
        # Ensemble weights
        self.mlp_weight = mlp_weight
        self.lgb_weight = lgb_weight
        self.xgb_weight = xgb_weight
        
        self.mlp_models = []
        self.lgb_models = []
        self.xgb_models = []
        
    def train_model(self, X_train, y_train):
        X_feats = self.featurizer.featurize(X_train, flip=False)
        
        if self.use_tta:
            X_flip = self.featurizer.featurize(X_train, flip=True)
            X_all = np.vstack([X_feats, X_flip])
            y_all = np.vstack([y_train.values, y_train.values])
        else:
            X_all = X_feats
            y_all = y_train.values
        
        input_dim = X_all.shape[1]
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # Train MLP ensemble
        X_tensor = torch.tensor(X_all)
        y_tensor = torch.tensor(y_all)
        
        self.mlp_models = []
        for i in range(self.n_mlp_models):
            torch.manual_seed(42 + i)
            model = MLPInternal(input_dim).to(device)
            model.train()
            
            dataset = TensorDataset(X_tensor, y_tensor)
            loader = DataLoader(dataset, batch_size=32, shuffle=True)
            optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-5)
            criterion = nn.HuberLoss()
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=20)
            
            for epoch in range(200):
                epoch_loss = 0.0
                for inputs, targets in loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    epoch_loss += loss.item() * inputs.size(0)
                scheduler.step(epoch_loss / len(dataset))
            self.mlp_models.append(model)
        
        # Train LightGBM per target
        self.lgb_models = []
        lgb_params = {'objective': 'regression', 'metric': 'rmse', 'learning_rate': 0.03,
                      'num_leaves': 31, 'max_depth': 6, 'verbose': -1, 'seed': 42}
        for i in range(3):
            X_tr, X_val, y_tr, y_val = train_test_split(X_all, y_all[:, i], test_size=0.15, random_state=42)
            train_data = lgb.Dataset(X_tr, label=y_tr)
            val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
            model = lgb.train(lgb_params, train_data, num_boost_round=500,
                             valid_sets=[val_data], callbacks=[lgb.early_stopping(50, verbose=False)])
            self.lgb_models.append(model)
        
        # Train XGBoost per target
        self.xgb_models = []
        xgb_params = {'objective': 'reg:squarederror', 'learning_rate': 0.03, 'max_depth': 6,
                      'subsample': 0.8, 'colsample_bytree': 0.8, 'seed': 42, 'verbosity': 0}
        for i in range(3):
            X_tr, X_val, y_tr, y_val = train_test_split(X_all, y_all[:, i], test_size=0.15, random_state=42)
            dtrain = xgb.DMatrix(X_tr, label=y_tr)
            dval = xgb.DMatrix(X_val, label=y_val)
            model = xgb.train(xgb_params, dtrain, num_boost_round=500,
                             evals=[(dval, 'val')], early_stopping_rounds=50, verbose_eval=False)
            self.xgb_models.append(model)
    
    def predict(self, X_test):
        device = next(self.mlp_models[0].parameters()).device
        X_feats = self.featurizer.featurize(X_test, flip=False)
        
        if self.use_tta:
            X_flip = self.featurizer.featurize(X_test, flip=True)
            
            # MLP predictions with TTA
            X_std_t = torch.tensor(X_feats).to(device)
            X_flip_t = torch.tensor(X_flip).to(device)
            mlp_preds = torch.zeros((len(X_test), 3)).to(device)
            with torch.no_grad():
                for model in self.mlp_models:
                    model.eval()
                    mlp_preds += (model(X_std_t) + model(X_flip_t)) * 0.5
            mlp_preds = (mlp_preds / self.n_mlp_models).cpu().numpy()
            
            # LightGBM predictions with TTA
            lgb_std = np.column_stack([m.predict(X_feats) for m in self.lgb_models])
            lgb_flip = np.column_stack([m.predict(X_flip) for m in self.lgb_models])
            lgb_preds = (lgb_std + lgb_flip) / 2
            
            # XGBoost predictions with TTA
            dstd = xgb.DMatrix(X_feats)
            dflip = xgb.DMatrix(X_flip)
            xgb_std = np.column_stack([m.predict(dstd) for m in self.xgb_models])
            xgb_flip = np.column_stack([m.predict(dflip) for m in self.xgb_models])
            xgb_preds = (xgb_std + xgb_flip) / 2
        else:
            # MLP predictions
            X_t = torch.tensor(X_feats).to(device)
            mlp_preds = torch.zeros((len(X_test), 3)).to(device)
            with torch.no_grad():
                for model in self.mlp_models:
                    model.eval()
                    mlp_preds += model(X_t)
            mlp_preds = (mlp_preds / self.n_mlp_models).cpu().numpy()
            
            # LightGBM predictions
            lgb_preds = np.column_stack([m.predict(X_feats) for m in self.lgb_models])
            
            # XGBoost predictions
            dtest = xgb.DMatrix(X_feats)
            xgb_preds = np.column_stack([m.predict(dtest) for m in self.xgb_models])
        
        # Weighted ensemble
        preds = self.mlp_weight * mlp_preds + self.lgb_weight * lgb_preds + self.xgb_weight * xgb_preds
        preds = np.clip(preds, 0, 1)
        return torch.tensor(preds)

In [6]:
# Test on single fold
print("Testing Ensemble on single fold...")
X, Y = load_data("single_solvent")
split_gen = generate_leave_one_out_splits(X, Y)
(train_X, train_Y), (test_X, test_Y) = next(split_gen)
print(f"Train: {len(train_X)}, Test: {len(test_X)}")

model = EnsembleModel(data='single', n_mlp_models=3)
model.train_model(train_X, train_Y)

preds = model.predict(test_X)
rmse = np.sqrt(((preds.numpy() - test_Y.values) ** 2).mean())
print(f"Single fold RMSE: {rmse:.4f}")

Testing Ensemble on single fold...
Train: 619, Test: 37


Single fold RMSE: 0.1957


In [None]:
# Full CV for single solvent task
print("\n" + "="*50)
print("TASK 0: Single Solvent (Leave-One-Solvent-Out CV)")
print("="*50)

X, Y = load_data("single_solvent")
split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []
fold_rmses = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=24):
    (train_X, train_Y), (test_X, test_Y) = split
    
    model = EnsembleModel(data='single', n_mlp_models=3)
    model.train_model(train_X, train_Y)
    
    predictions = model.predict(test_X).numpy()
    fold_rmse = np.sqrt(((predictions - test_Y.values) ** 2).mean())
    fold_rmses.append(fold_rmse)
    
    for row_idx, row in enumerate(predictions):
        all_predictions.append({"task": 0, "fold": fold_idx, "row": row_idx,
                               "target_1": row[0], "target_2": row[1], "target_3": row[2]})

submission_single_solvent = pd.DataFrame(all_predictions)
print(f"\nSingle Solvent CV RMSE: {np.mean(fold_rmses):.5f} ± {np.std(fold_rmses):.5f}")

In [None]:
# Full CV for full data task
print("\n" + "="*50)
print("TASK 1: Full Data (Leave-One-Ramp-Out CV)")
print("="*50)

X, Y = load_data("full")
split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []
fold_rmses_full = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=13):
    (train_X, train_Y), (test_X, test_Y) = split
    
    model = EnsembleModel(data='full', n_mlp_models=3)
    model.train_model(train_X, train_Y)
    
    predictions = model.predict(test_X).numpy()
    fold_rmse = np.sqrt(((predictions - test_Y.values) ** 2).mean())
    fold_rmses_full.append(fold_rmse)
    
    for row_idx, row in enumerate(predictions):
        all_predictions.append({"task": 1, "fold": fold_idx, "row": row_idx,
                               "target_1": row[0], "target_2": row[1], "target_3": row[2]})

submission_full_data = pd.DataFrame(all_predictions)
print(f"\nFull Data CV RMSE: {np.mean(fold_rmses_full):.5f} ± {np.std(fold_rmses_full):.5f}")

In [None]:
# Save submission
submission = pd.concat([submission_single_solvent, submission_full_data]).reset_index()
submission.index.name = "id"
submission.to_csv("/home/code/experiments/003_ensemble/submission.csv", index=True)
submission.to_csv("/home/submission/submission.csv", index=True)
print(f"Submission saved with {len(submission)} rows")

In [None]:
# Final summary
print("\n" + "="*50)
print("FINAL RESULTS - Ensemble (MLP + LightGBM + XGBoost)")
print("="*50)
print(f"Single Solvent CV RMSE: {np.mean(fold_rmses):.5f} ± {np.std(fold_rmses):.5f}")
print(f"Full Data CV RMSE: {np.mean(fold_rmses_full):.5f} ± {np.std(fold_rmses_full):.5f}")
overall_rmse = (np.mean(fold_rmses) + np.mean(fold_rmses_full)) / 2
print(f"\nOverall CV RMSE: {overall_rmse:.5f}")
print(f"\nComparison:")
print(f"  Baseline MLP: 0.08819")
print(f"  LightGBM:     0.10019")
print(f"  Ensemble:     {overall_rmse:.5f}")
print(f"  Target:       0.04740")