# Experiment 046: Sophisticated Ensemble

**Inspired by:** lishellliang/mixall kernel

**Key techniques:**
1. Ensemble of 4 models: MLP + XGBoost + RandomForest + LightGBM
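2. Weighted ensemble with fixed, hand-set weights (0.35 MLP / 0.25 XGBoost / 0.15 RandomForest / 0.25 LightGBM); the weights are not fitted in this notebook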
3. Standard leave-one-solvent-out CV (NOT GroupKFold)

**Hypothesis:** Diverse models may capture different aspects of the data, potentially reducing structural error and changing the CV-LB relationship.

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Fix the PyTorch and NumPy global RNG seeds once, up front, for repeatability.
torch.manual_seed(42)
np.random.seed(42)

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading
DATA_PATH = '/home/data'

def load_data(name="full"):
    """Read one of the catechol yield tables and split it into inputs and targets.

    Args:
        name: ``"full"`` reads ``catechol_full_data_yields.csv`` (rows described
            by two solvent names plus a ``SolventB%`` column); any other value
            reads ``catechol_single_solvent_yields.csv`` (one solvent name per row).

    Returns:
        ``(X, Y)`` DataFrames: ``X`` holds residence time, temperature and the
        solvent column(s); ``Y`` holds the ``SM``, ``Product 2`` and
        ``Product 3`` columns.
    """
    target_cols = ["SM", "Product 2", "Product 3"]
    condition_cols = ["Residence Time", "Temperature"]

    if name != "full":
        table = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        return table[condition_cols + ["SOLVENT NAME"]], table[target_cols]

    table = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
    mixture_cols = ["SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
    return table[condition_cols + mixture_cols], table[target_cols]

# Solvent descriptor lookup tables; the first CSV column becomes the row index
# (get_features looks rows up by solvent name).
spange_df = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
drfp_df = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)

# Every Spange column except the SMILES string is used as a feature.
SPANGE_COLS = [col for col in spange_df.columns if col != 'solvent smiles']
# DRFP feature columns are those labelled by an int or by an all-digit string.
DRFP_COLS = [col for col in drfp_df.columns if isinstance(col, int) or str(col).isdigit()]

print(f'Spange: {len(SPANGE_COLS)} features')
print(f'DRFP: {len(DRFP_COLS)} features')

# Both datasets: single-solvent rows and the full table with solvent mixtures.
X_single, Y_single = load_data(name='single_solvent')
X_full, Y_full = load_data(name='full')

print(f'Single solvent: {len(X_single)} samples')
print(f'Full data: {len(X_full)} samples')

Spange: 13 features
DRFP: 2048 features
Single solvent: 656 samples
Full data: 1227 samples


In [3]:
# Feature extraction
def get_features(X, data_type='single'):
    """Build the per-row feature matrix: kinetics terms + Spange descriptors + DRFP vector.

    Each output row is the concatenation of

    * five kinetics terms derived from 'Residence Time' (t) and 'Temperature'
      (T, shifted by 273.15 to an absolute scale): ``t, T, 1/T_abs,
      log(t + 1), t/T_abs``;
    * the Spange descriptor vector (``SPANGE_COLS`` of ``spange_df``);
    * the DRFP vector (``DRFP_COLS`` of ``drfp_df``).

    Args:
        X: DataFrame with 'Residence Time' and 'Temperature' columns, plus
            'SOLVENT NAME' when ``data_type == 'single'``, otherwise
            'SOLVENT A NAME', 'SOLVENT B NAME' and 'SolventB%'.
        data_type: ``'single'`` for one solvent per row; any other value treats
            rows as A/B mixtures whose descriptors are blended linearly by
            ``SolventB% / 100``.

    Returns:
        float32 array of shape ``(len(X), 5 + len(SPANGE_COLS) + len(DRFP_COLS))``.
        An empty ``X`` yields a ``(0, n_features)`` array.

    Solvent names missing from a lookup table contribute an all-zero vector for
    that table rather than raising.
    """
    n_kinetics = 5
    n_spange = len(SPANGE_COLS)
    n_drfp = len(DRFP_COLS)

    # Per-call cache: a solvent's descriptor vectors do not depend on the row,
    # so each distinct solvent is looked up once instead of once per row.
    descriptor_cache = {}

    def lookup(solvent):
        """Return (spange, drfp) float32 vectors for *solvent*; zeros if unknown."""
        if solvent not in descriptor_cache:
            if solvent in spange_df.index:
                spange_vec = spange_df.loc[solvent, SPANGE_COLS].values.astype(np.float32)
            else:
                spange_vec = np.zeros(n_spange, dtype=np.float32)
            if solvent in drfp_df.index:
                drfp_vec = drfp_df.loc[solvent, DRFP_COLS].values.astype(np.float32)
            else:
                drfp_vec = np.zeros(n_drfp, dtype=np.float32)
            descriptor_cache[solvent] = (spange_vec, drfp_vec)
        return descriptor_cache[solvent]

    features_list = []

    for _, row in X.iterrows():
        time_m = row['Residence Time']
        temp_c = row['Temperature']
        temp_k = temp_c + 273.15

        kinetics = np.array([
            time_m, temp_c, 1.0 / temp_k,
            np.log(time_m + 1), time_m / temp_k
        ], dtype=np.float32)

        if data_type == 'single':
            spange, drfp = lookup(row['SOLVENT NAME'])
        else:
            pct_b = row['SolventB%'] / 100.0
            pct_a = 1 - pct_b

            sp_a, dr_a = lookup(row['SOLVENT A NAME'])
            sp_b, dr_b = lookup(row['SOLVENT B NAME'])
            # The products allocate new arrays, so cached vectors are never mutated.
            spange = pct_a * sp_a + pct_b * sp_b
            drfp = pct_a * dr_a + pct_b * dr_b

        features_list.append(np.concatenate([kinetics, spange, drfp]))

    if not features_list:
        # np.array([]) would be 1-D with shape (0,); keep the result 2-D so
        # downstream consumers always see (n_rows, n_features).
        return np.empty((0, n_kinetics + n_spange + n_drfp), dtype=np.float32)

    return np.array(features_list, dtype=np.float32)

print('Feature extraction defined')

Feature extraction defined


In [4]:
# MLP Model
class MLPModel(nn.Module):
    """Feed-forward regressor built from Linear -> BatchNorm1d -> ReLU -> Dropout blocks.

    One such block is created per entry of ``hidden_dims``; a final ``Linear``
    layer maps the last hidden width to ``output_dim`` outputs. All layers live
    in ``self.network`` (an ``nn.Sequential``) in creation order.

    Args:
        input_dim: Number of input features.
        hidden_dims: Widths of the hidden blocks, in order. May be empty, in
            which case the model is a single ``Linear`` layer.
        dropout: Dropout probability used after every hidden block.
        output_dim: Number of regression outputs (defaults to 3).
    """

    def __init__(self, input_dim, hidden_dims=(128, 64, 32), dropout=0.3, output_dim=3):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev_dim = h_dim
        # Plain linear head: no activation, so outputs are unconstrained.
        layers.append(nn.Linear(prev_dim, output_dim))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` (last dimension ``input_dim``) through the stack of layers."""
        return self.network(x)

print('MLPModel defined')

MLPModel defined


In [5]:
# Sophisticated Ensemble Model (MLP + XGBoost + RandomForest + LightGBM)
class SophisticatedEnsemble:
    """Weighted blend of an MLP bag, XGBoost, RandomForest and LightGBM regressors.

    ``train_model`` featurises the inputs with ``get_features``, standardises
    them, then fits three MLPs (all predicting the three targets jointly) and
    one XGBoost / RandomForest / LightGBM regressor per target. ``predict``
    averages the MLPs, combines the four model families with ``weights`` and
    clips the result to ``[0, 1]``.

    Args:
        data: Passed to ``get_features`` as ``data_type`` ('single' or mixture).
        weights: Four blend weights in the order [MLP, XGBoost, RF, LightGBM].
            They are used as given (not normalised).
    """

    _N_TARGETS = 3    # number of target columns; MLPModel's head is 3-wide too
    _N_MLP = 3        # size of the MLP bag
    _BATCH_SIZE = 32  # MLP mini-batch size

    def __init__(self, data='single', weights=(0.35, 0.25, 0.15, 0.25)):
        self.data_type = data
        # Copy into a fresh list so neither the default nor a caller-owned
        # sequence is shared between instances.
        self.weights = list(weights)  # [MLP, XGBoost, RF, LightGBM]

        self.scaler = None
        self.mlp_models = []
        self.xgb_models = []
        self.rf_models = []
        self.lgbm_models = []

    def train_model(self, X_train, y_train, epochs=150):
        """Fit the scaler and every member model; returns ``self``.

        Args:
            X_train: Raw input DataFrame accepted by ``get_features``.
            y_train: DataFrame of the three targets, row-aligned with ``X_train``.
            epochs: Number of passes over the data for each MLP.
        """
        X_feat = get_features(X_train, self.data_type)
        y_np = y_train.values.astype(np.float32)

        # Scale features
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X_feat)

        self.mlp_models = self._fit_mlps(X_scaled, y_np, epochs)

        self.xgb_models = self._fit_per_target(
            lambda: xgb.XGBRegressor(
                n_estimators=200, max_depth=6, learning_rate=0.05,
                subsample=0.8, colsample_bytree=0.8, random_state=42, verbosity=0
            ),
            X_scaled, y_np, self._N_TARGETS,
        )

        self.rf_models = self._fit_per_target(
            lambda: RandomForestRegressor(
                n_estimators=100, max_depth=10, min_samples_leaf=5, random_state=42, n_jobs=-1
            ),
            X_scaled, y_np, self._N_TARGETS,
        )

        # NOTE(review): LightGBM applies `subsample` only when `subsample_freq`
        # is > 0, so with the default frequency this 0.8 is likely a no-op.
        # Left unchanged to keep results comparable; confirm the intent.
        self.lgbm_models = self._fit_per_target(
            lambda: lgb.LGBMRegressor(
                n_estimators=200, max_depth=6, learning_rate=0.05,
                num_leaves=31, subsample=0.8, random_state=42, verbose=-1
            ),
            X_scaled, y_np, self._N_TARGETS,
        )

        return self

    def _fit_mlps(self, X_scaled, y_np, epochs):
        """Train the bag of MLPs on the scaled features and return them in eval mode."""
        X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
        y_tensor = torch.tensor(y_np, dtype=torch.float32).to(device)
        dataset = TensorDataset(X_tensor, y_tensor)

        # BatchNorm1d raises in training mode when a batch holds a single
        # sample, so drop the trailing batch only when it would have size one.
        drop_last = len(dataset) % self._BATCH_SIZE == 1
        loader = DataLoader(dataset, batch_size=self._BATCH_SIZE, shuffle=True, drop_last=drop_last)
        criterion = nn.MSELoss()

        models = []
        for _ in range(self._N_MLP):
            model = MLPModel(X_scaled.shape[1], hidden_dims=[128, 64, 32], dropout=0.3).to(device)
            optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

            model.train()
            for _epoch in range(epochs):
                for X_batch, y_batch in loader:
                    optimizer.zero_grad()
                    loss = criterion(model(X_batch), y_batch)
                    loss.backward()
                    optimizer.step()
                scheduler.step()  # one cosine step per epoch

            # Switch to eval mode so dropout is off and BatchNorm uses its
            # running statistics at prediction time.
            model.eval()
            models.append(model)
        return models

    @staticmethod
    def _fit_per_target(make_model, X_scaled, y_np, n_targets):
        """Fit one fresh estimator from ``make_model()`` on each target column."""
        fitted = []
        for target in range(n_targets):
            estimator = make_model()
            estimator.fit(X_scaled, y_np[:, target])
            fitted.append(estimator)
        return fitted

    @staticmethod
    def _predict_per_target(models, X_scaled):
        """Stack each per-target model's predictions into an (n_rows, n_models) array."""
        preds = np.zeros((X_scaled.shape[0], len(models)))
        for column, estimator in enumerate(models):
            preds[:, column] = estimator.predict(X_scaled)
        return preds

    def predict(self, X_test):
        """Return the blended predictions as a float32 tensor clipped to [0, 1].

        Args:
            X_test: Raw input DataFrame accepted by ``get_features``.

        Raises:
            RuntimeError: If called before ``train_model``.
        """
        if self.scaler is None:
            raise RuntimeError('SophisticatedEnsemble.predict() called before train_model()')

        X_feat = get_features(X_test, self.data_type)
        X_scaled = self.scaler.transform(X_feat)

        # MLP predictions (average of ensemble); the input tensor is built once.
        X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
        with torch.no_grad():
            mlp_preds = np.mean(
                [model(X_tensor).cpu().numpy() for model in self.mlp_models], axis=0
            )

        xgb_preds = self._predict_per_target(self.xgb_models, X_scaled)
        rf_preds = self._predict_per_target(self.rf_models, X_scaled)
        lgbm_preds = self._predict_per_target(self.lgbm_models, X_scaled)

        # Weighted ensemble
        final_preds = (self.weights[0] * mlp_preds +
                       self.weights[1] * xgb_preds +
                       self.weights[2] * rf_preds +
                       self.weights[3] * lgbm_preds)

        final_preds = np.clip(final_preds, 0, 1)
        return torch.tensor(final_preds, dtype=torch.float32)

print('SophisticatedEnsemble defined')

SophisticatedEnsemble defined


In [None]:
# Leave-one-solvent-out evaluation on the single-solvent data: each distinct
# solvent is held out once, the ensemble is fit on all other rows and scored
# (MSE over all three targets) on the held-out rows.
print("Testing sophisticated ensemble on single solvent data...")
print()

all_solvents = sorted(X_single["SOLVENT NAME"].unique())
fold_mses = []

for test_solvent in all_solvents:
    # True for training rows, False for rows of the held-out solvent.
    mask = X_single["SOLVENT NAME"] != test_solvent
    held_out = ~mask

    model = SophisticatedEnsemble(data='single', weights=[0.35, 0.25, 0.15, 0.25])
    model.train_model(X_single[mask], Y_single[mask], epochs=100)
    preds = model.predict(X_single[held_out])

    actuals = Y_single[held_out].values
    mse = np.square(actuals - preds.numpy()).mean()
    fold_mses.append(mse)

# Unweighted mean and spread of the per-solvent fold errors.
mean_mse = np.mean(fold_mses)
std_mse = np.std(fold_mses)
print(f"Sophisticated Ensemble CV MSE: {mean_mse:.6f} +/- {std_mse:.6f}")
print("Baseline (exp_030): CV = 0.008298")

# Relative change against the exp_030 baseline CV score.
baseline_mse = 0.008298
improved = mean_mse < baseline_mse
if improved:
    print(f"Improvement: {(baseline_mse - mean_mse) / baseline_mse * 100:.2f}%")
else:
    print(f"Degradation: {(mean_mse - baseline_mse) / baseline_mse * 100:.2f}%")