# Experiment 047: Diverse Ensemble Model

**Inspiration:** The 'mixall' kernel achieves good cross-validation (CV) and leaderboard (LB) scores with an ensemble of MLP + XGBoost + RandomForest + LightGBM.

**Hypothesis:** Our current ensemble (GP + MLP + LGBM) may be too homogeneous. A more diverse ensemble could capture different patterns and potentially change the CV-LB relationship.

**Implementation:**
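1. MLP: hidden layers [128, 64, 32], each followed by BatchNorm, ReLU, and Dropout(0.1); three independently initialised copies are trained and their predictions are averaged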
2. XGBoost: n_estimators=300, max_depth=6
3. RandomForest: n_estimators=300, max_depth=15
4. LightGBM: n_estimators=300, num_leaves=31
5. Weighted ensemble: weights [0.4, 0.2, 0.2, 0.2] for (MLP, XGBoost, RandomForest, LightGBM), with the blended prediction clipped to [0, 1]

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Seed the PyTorch and NumPy global RNGs so repeated runs start from the same state.
torch.manual_seed(42)
np.random.seed(42)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading
DATA_PATH = '/home/data'

def load_data(name="full"):
    """Read one of the catechol yield tables and split it into inputs and targets.

    Args:
        name: "full" selects the full table (two named solvents plus a
            "SolventB%" column); any other value selects the single-solvent
            table.

    Returns:
        A pair (X, Y): X holds the residence time, temperature and solvent
        columns, Y holds the "SM", "Product 2" and "Product 3" columns.
    """
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_cols = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_cols = ["Residence Time", "Temperature", "SOLVENT NAME"]

    table = pd.read_csv(csv_path)
    inputs = table[input_cols]
    targets = table[["SM", "Product 2", "Product 3"]]
    return inputs, targets

# Solvent descriptor lookup table; its first CSV column becomes the index
# that the feature extraction later queries by solvent name.
spange_df = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
# Every column except the SMILES string is used as a descriptor feature.
SPANGE_COLS = list(filter(lambda col: col != 'solvent smiles', spange_df.columns))

print(f'Spange: {len(SPANGE_COLS)} features')

# Read both datasets up front: the single-solvent table and the full table.
(X_single, Y_single), (X_full, Y_full) = load_data('single_solvent'), load_data('full')

print(f'Single solvent: {len(X_single)} samples')
print(f'Full data: {len(X_full)} samples')

Spange: 13 features
Single solvent: 656 samples
Full data: 1227 samples


In [3]:
# Feature extraction - simpler features (Spange + kinetics only, no DRFP)
def get_features_simple(X, data_type='single'):
    """Extract simpler features: 5 kinetic terms followed by Spange descriptors.

    The kinetic terms are, in order: residence time, temperature,
    1 / (temperature + 273.15), log(residence time + 1) and
    residence time / (temperature + 273.15).

    Args:
        X: DataFrame with "Residence Time" and "Temperature" columns, plus
            "SOLVENT NAME" when ``data_type == 'single'``, otherwise
            "SOLVENT A NAME", "SOLVENT B NAME" and "SolventB%".
        data_type: 'single' for one solvent per row; any other value is
            treated as a two-solvent mixture whose descriptors are blended
            linearly by "SolventB%" / 100.

    Returns:
        float32 array of shape ``(len(X), 5 + len(SPANGE_COLS))``. Empty
        input yields a correctly shaped ``(0, n_features)`` array.

    Note:
        A solvent missing from ``spange_df.index`` contributes an all-zero
        descriptor vector rather than raising.
    """
    n_spange = len(SPANGE_COLS)
    if len(X) == 0:
        # np.array([]) would give shape (0,), which downstream 2-D consumers reject.
        return np.empty((0, 5 + n_spange), dtype=np.float32)

    # Only a handful of distinct solvents occur, so look each one up once
    # instead of repeating the DataFrame .loc lookup for every row.
    descriptor_cache = {}

    def descriptors_for(solvent):
        """Return the cached float32 descriptor vector for one solvent."""
        if solvent not in descriptor_cache:
            if solvent in spange_df.index:
                vector = spange_df.loc[solvent, SPANGE_COLS].values.astype(np.float32)
            else:
                # Deliberate best-effort fallback for unknown solvents.
                vector = np.zeros(n_spange, dtype=np.float32)
            descriptor_cache[solvent] = vector
        return descriptor_cache[solvent]

    features_list = []

    for _, row in X.iterrows():
        time_m = row['Residence Time']
        temp_c = row['Temperature']
        temp_k = temp_c + 273.15

        kinetics = np.array([
            time_m, temp_c, 1.0 / temp_k,
            np.log(time_m + 1), time_m / temp_k
        ], dtype=np.float32)

        if data_type == 'single':
            spange = descriptors_for(row['SOLVENT NAME'])
        else:
            pct_b = row['SolventB%'] / 100.0
            pct_a = 1 - pct_b
            # The arithmetic allocates a new array, so cached vectors are never mutated.
            spange = (pct_a * descriptors_for(row['SOLVENT A NAME'])
                      + pct_b * descriptors_for(row['SOLVENT B NAME']))

        features_list.append(np.concatenate([kinetics, spange]))

    return np.array(features_list, dtype=np.float32)

# Confirm the cell ran and report the feature width: 5 kinetic terms plus
# one value per Spange descriptor column.
print('Feature extraction defined')
print(f'Feature dimension: 5 (kinetics) + {len(SPANGE_COLS)} (Spange) = {5 + len(SPANGE_COLS)}')

Feature extraction defined
Feature dimension: 5 (kinetics) + 13 (Spange) = 18


In [4]:
# Enhanced MLP Model
class EnhancedMLP(nn.Module):
    """Fully connected regressor with three outputs.

    Every width in ``hidden_dims`` contributes one
    Linear -> BatchNorm1d -> ReLU -> Dropout stage; a final Linear layer maps
    the last hidden width (or ``input_dim`` when there are no hidden layers)
    to 3 outputs.

    Args:
        input_dim: number of input features per sample.
        hidden_dims: hidden-layer widths, in order. Any iterable of ints is
            accepted; the default is an immutable tuple so that no mutable
            object is shared between calls.
        dropout: dropout probability applied after every hidden stage.
    """

    def __init__(self, input_dim, hidden_dims=(128, 64, 32), dropout=0.1):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev_dim = h_dim
        layers.append(nn.Linear(prev_dim, 3))
        # One flat Sequential named `network`, so parameter names stay
        # `network.<index>.*`.
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to (batch, 3) predictions."""
        return self.network(x)

# Confirm that the model class definition cell ran.
print('EnhancedMLP defined')

EnhancedMLP defined


In [5]:
# Diverse Ensemble Model (MLP + XGBoost + RandomForest + LightGBM)
class DiverseEnsembleModel:
    """Diverse ensemble inspired by 'mixall' kernel.

    Blends four predictors with fixed weights: the average of three
    EnhancedMLP networks, XGBoost, RandomForest and LightGBM. The three
    tree-based models are each wrapped in MultiOutputRegressor. All models
    are fitted on the same standardised features from
    ``get_features_simple``.

    Args:
        data: feature layout passed to ``get_features_simple``
            ('single' or anything else for mixtures).
        weights: four blend weights in the order MLP, XGBoost,
            RandomForest, LightGBM. They are used as given (not
            renormalised) and copied into a fresh list per instance.

    Raises:
        ValueError: if ``weights`` does not contain exactly four values.
    """

    def __init__(self, data='single', weights=(0.4, 0.2, 0.2, 0.2)):
        # Copy so that neither the default nor a caller-owned list is shared
        # between instances.
        weights = list(weights)
        if len(weights) != 4:
            raise ValueError(
                f"weights must have 4 entries (MLP, XGB, RF, LGBM), got {len(weights)}"
            )

        self.data_type = data
        self.weights = weights  # MLP, XGB, RF, LGBM

        self.scaler = None
        self.mlp_models = []
        self.xgb_model = None
        self.rf_model = None
        self.lgbm_model = None

    @staticmethod
    def _drop_trailing_singleton(n_samples, batch_size):
        """Return True when the final mini-batch would hold exactly one sample.

        BatchNorm1d raises in training mode on a batch of one, so that lone
        trailing batch has to be dropped. A dataset with a single sample in
        total is left alone so the failure stays visible instead of silently
        training on nothing.
        """
        return bool(n_samples > batch_size and n_samples % batch_size == 1)

    def _train_mlps(self, X_scaled, y_np, epochs):
        """Train three independently initialised MLPs and return them in eval mode."""
        batch_size = 32

        # Tensors, loader and loss do not depend on the individual network,
        # so build them once rather than once per model or per batch.
        X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
        y_tensor = torch.tensor(y_np, dtype=torch.float32).to(device)
        dataset = TensorDataset(X_tensor, y_tensor)
        loader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=True,
            drop_last=self._drop_trailing_singleton(len(dataset), batch_size),
        )
        criterion = nn.MSELoss()

        models = []
        for _ in range(3):
            model = EnhancedMLP(X_scaled.shape[1], hidden_dims=[128, 64, 32], dropout=0.1).to(device)
            optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

            model.train()
            for _epoch in range(epochs):
                for X_batch, y_batch in loader:
                    optimizer.zero_grad()
                    loss = criterion(model(X_batch), y_batch)
                    loss.backward()
                    optimizer.step()
                # The cosine schedule advances once per epoch, not per batch.
                scheduler.step()

            model.eval()
            models.append(model)
        return models

    def train_model(self, X_train, y_train, epochs=200):
        """Fit the scaler and all four model families.

        Args:
            X_train: raw input DataFrame accepted by ``get_features_simple``.
            y_train: DataFrame of targets; its values are cast to float32.
            epochs: number of training epochs for each MLP.

        Returns:
            self, so construction and fitting can be chained.
        """
        X_feat = get_features_simple(X_train, self.data_type)
        y_np = y_train.values.astype(np.float32)

        # Scale features
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X_feat)

        # Train MLP ensemble (3 models)
        self.mlp_models = self._train_mlps(X_scaled, y_np, epochs)

        # Train XGBoost
        self.xgb_model = MultiOutputRegressor(
            xgb.XGBRegressor(
                n_estimators=300, learning_rate=0.05, max_depth=6,
                subsample=0.8, colsample_bytree=0.8, random_state=42, verbosity=0
            )
        )
        self.xgb_model.fit(X_scaled, y_np)

        # Train RandomForest
        self.rf_model = MultiOutputRegressor(
            RandomForestRegressor(
                n_estimators=300, max_depth=15, random_state=42, n_jobs=-1
            )
        )
        self.rf_model.fit(X_scaled, y_np)

        # Train LightGBM
        self.lgbm_model = MultiOutputRegressor(
            lgb.LGBMRegressor(
                n_estimators=300, learning_rate=0.05, num_leaves=31,
                random_state=42, verbose=-1
            )
        )
        self.lgbm_model.fit(X_scaled, y_np)

        return self

    def predict(self, X_test):
        """Return the weighted blend of all models, clipped to [0, 1].

        Args:
            X_test: raw input DataFrame accepted by ``get_features_simple``.

        Returns:
            float32 torch tensor of blended predictions, one row per input row.

        Raises:
            RuntimeError: if called before ``train_model``.
        """
        if self.scaler is None:
            raise RuntimeError("DiverseEnsembleModel.predict() called before train_model()")

        X_feat = get_features_simple(X_test, self.data_type)
        X_scaled = self.scaler.transform(X_feat)

        # MLP predictions (average of ensemble); the input tensor is shared
        # by all networks.
        X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
        with torch.no_grad():
            mlp_preds = np.mean(
                [model(X_tensor).cpu().numpy() for model in self.mlp_models],
                axis=0,
            )

        # XGBoost predictions
        xgb_preds = self.xgb_model.predict(X_scaled)

        # RandomForest predictions
        rf_preds = self.rf_model.predict(X_scaled)

        # LightGBM predictions
        lgbm_preds = self.lgbm_model.predict(X_scaled)

        # Weighted ensemble
        ensemble_preds = (self.weights[0] * mlp_preds +
                          self.weights[1] * xgb_preds +
                          self.weights[2] * rf_preds +
                          self.weights[3] * lgbm_preds)

        ensemble_preds = np.clip(ensemble_preds, 0, 1)

        return torch.tensor(ensemble_preds, dtype=torch.float32)

# Confirm that the ensemble class definition cell ran.
print('DiverseEnsembleModel defined')

DiverseEnsembleModel defined


In [6]:
# Leave-one-solvent-out CV on the single-solvent data: each solvent in turn is
# held out for testing while the ensemble is fitted on all remaining rows.
print("Testing diverse ensemble on single solvent data...")
print()

all_solvents = sorted(X_single["SOLVENT NAME"].unique())
fold_mses = []

for test_solvent in all_solvents:
    # True for training rows, False for the held-out solvent.
    mask = X_single["SOLVENT NAME"] != test_solvent

    # train_model returns the model itself, so build and fit in one expression.
    model = DiverseEnsembleModel(data='single', weights=[0.4, 0.2, 0.2, 0.2]).train_model(
        X_single[mask], Y_single[mask], epochs=150
    )
    preds = model.predict(X_single[~mask])

    # MSE over every held-out row and all target columns.
    actuals = Y_single[~mask].values
    mse = np.square(actuals - preds.numpy()).mean()
    fold_mses.append(mse)
    print(f"{test_solvent}: MSE = {mse:.6f}")

# Unweighted mean and spread of the per-solvent fold errors.
mean_mse, std_mse = np.mean(fold_mses), np.std(fold_mses)
print(f"\n=== Diverse Ensemble CV Results ===")
print(f"Mean MSE: {mean_mse:.6f} +/- {std_mse:.6f}")
print(f"Baseline (exp_030): CV = 0.008298")

Testing diverse ensemble on single solvent data...



1,1,1,3,3,3-Hexafluoropropan-2-ol: MSE = 0.040208


2,2,2-Trifluoroethanol: MSE = 0.017506


2-Methyltetrahydrofuran [2-MeTHF]: MSE = 0.002296


Acetonitrile: MSE = 0.016375


Acetonitrile.Acetic Acid: MSE = 0.025274


Butanone [MEK]: MSE = 0.003126


Cyclohexane: MSE = 0.014711


DMA [N,N-Dimethylacetamide]: MSE = 0.006389


Decanol: MSE = 0.008167


Diethyl Ether [Ether]: MSE = 0.012435


Dihydrolevoglucosenone (Cyrene): MSE = 0.007206


Dimethyl Carbonate: MSE = 0.008959


Ethanol: MSE = 0.002158


Ethyl Acetate: MSE = 0.005683


Ethyl Lactate: MSE = 0.003473


Ethylene Glycol [1,2-Ethanediol]: MSE = 0.016504


IPA [Propan-2-ol]: MSE = 0.008462


MTBE [tert-Butylmethylether]: MSE = 0.004697


Methanol: MSE = 0.003262


Methyl Propionate: MSE = 0.000900


THF [Tetrahydrofuran]: MSE = 0.000591


Water.2,2,2-Trifluoroethanol: MSE = 0.004290


Water.Acetonitrile: MSE = 0.009525


tert-Butanol [2-Methylpropan-2-ol]: MSE = 0.003240

=== Diverse Ensemble CV Results ===
Mean MSE: 0.009393 +/- 0.008862
Baseline (exp_030): CV = 0.008298


In [7]:
# Summary of diverse ensemble experiment
# CV MSE quoted for the exp_030 baseline; kept in one place instead of being
# repeated as a literal in every line that uses it.
BASELINE_CV_MSE = 0.008298

# (display label, solvent name as used in the CV loop, baseline MSE or None
# when no baseline figure was recorded for that solvent).
SUMMARY_SOLVENTS = [
    ("HFIP", "1,1,1,3,3,3-Hexafluoropropan-2-ol", 0.096369),
    ("Cyclohexane", "Cyclohexane", 0.198108),
    ("TFE", "2,2,2-Trifluoroethanol", 0.041910),
    ("Acetonitrile.Acetic Acid", "Acetonitrile.Acetic Acid", None),
    ("Ethylene Glycol", "Ethylene Glycol [1,2-Ethanediol]", None),
]

# Read the per-solvent errors from the CV results rather than retyping them,
# so the summary cannot go stale when the CV cell is re-run.
solvent_mse = dict(zip(all_solvents, fold_mses))

print("=== Summary of Diverse Ensemble Experiment ===")
print()
print(f"Diverse Ensemble CV MSE: {mean_mse:.6f} +/- {std_mse:.6f}")
print(f"Baseline (exp_030): CV = {BASELINE_CV_MSE:.6f}")
print(f"Improvement: {(BASELINE_CV_MSE - mean_mse) / BASELINE_CV_MSE * 100:.1f}%")
print()
print("Per-solvent comparison (top 5 hardest):")
for label, solvent, baseline_mse in SUMMARY_SOLVENTS:
    line = f"  {label}: MSE = {solvent_mse[solvent]:.6f}"
    if baseline_mse is not None:
        line += f" (was {baseline_mse:.6f} in baseline)"
    print(line)
print()
if mean_mse >= BASELINE_CV_MSE:
    # NOTE: this narrative (including the quoted Cyclohexane figures) was
    # written for the recorded run; it is only printed when the overall CV
    # result still supports it.
    print("Key Insight:")
    print("The diverse ensemble with simpler features (18 vs 2066) does NOT improve CV.")
    print("However, it significantly reduces error on Cyclohexane (0.014 vs 0.198).")
    print("This suggests the DRFP features may be causing overfitting for some solvents.")
    print()
    print("CONCLUSION: Diverse ensemble does NOT help overall CV.")
    print("The baseline (exp_030) remains the best model.")
else:
    print("CONCLUSION: Diverse ensemble improves overall CV over the baseline (exp_030).")

=== Summary of Diverse Ensemble Experiment ===

Diverse Ensemble CV MSE: 0.009393 +/- 0.008862
Baseline (exp_030): CV = 0.008298
Improvement: -13.2%

Per-solvent comparison (top 5 hardest):
  HFIP: MSE = 0.040208 (was 0.096369 in baseline)
  Cyclohexane: MSE = 0.014711 (was 0.198108 in baseline)
  TFE: MSE = 0.017506 (was 0.041910 in baseline)
  Acetonitrile.Acetic Acid: MSE = 0.025274
  Ethylene Glycol: MSE = 0.016504

Key Insight:
The diverse ensemble with simpler features (18 vs 2066) does NOT improve CV.
However, it significantly reduces error on Cyclohexane (0.014 vs 0.198).
This suggests the DRFP features may be causing overfitting for some solvents.

CONCLUSION: Diverse ensemble does NOT help overall CV.
The baseline (exp_030) remains the best model.
