# Experiment 091: MixAll Ensemble (MLP + XGBoost + RF + LightGBM)

Implementing the mixall kernel approach with proper Leave-One-Out validation.

Key features:
- Ensemble of 4 models: MLP, XGBoost, RandomForest, LightGBM
- Spange descriptors (13 features)
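- Weighted ensemble with fixed, hand-set weights (equal weights `[0.25, 0.25, 0.25, 0.25]` by default; the weights are not learned from data)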
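- Official leave-one-group-out validation (not GroupKFold): leave-one-solvent-out for the single-solvent data and leave-one-solvent-ramp-out for the full data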

In [1]:
import sys
sys.path.append('/home/data')

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from abc import ABC, abstractmethod
import warnings
warnings.filterwarnings('ignore')

# Column-name constants shared by the data loaders below.

# Input columns selected from the full (two-solvent) dataset by load_data("full").
INPUT_LABELS_FULL_SOLVENT = [
    "Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%",
]
# Input columns selected from the single-solvent dataset by load_data("single_solvent").
INPUT_LABELS_SINGLE_SOLVENT = [
    "Residence Time", "Temperature", "SOLVENT NAME",
]
# Target columns selected for both datasets; their order fixes the column
# order of every prediction array in this notebook.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

# Local data loading functions
def load_data(name="full"):
    """Load one of the catechol yield datasets as an ``(X, Y)`` pair.

    Args:
        name: ``"full"`` reads ``catechol_full_data_yields.csv`` and selects
            the ``INPUT_LABELS_FULL_SOLVENT`` columns; ``"single_solvent"``
            reads ``catechol_single_solvent_yields.csv`` and selects the
            ``INPUT_LABELS_SINGLE_SOLVENT`` columns.

    Returns:
        Tuple ``(X, Y)`` of DataFrames: the selected input columns and the
        ``TARGET_LABELS`` columns of the same CSV.

    Raises:
        ValueError: If ``name`` is not one of the two supported dataset names.
    """
    # Validate explicitly instead of with ``assert``: assertions are removed
    # under ``python -O``, in which case an unknown name would silently fall
    # through to the single-solvent branch.
    if name not in ("full", "single_solvent"):
        raise ValueError(
            f"Unknown dataset name {name!r}; expected 'full' or 'single_solvent'"
        )

    if name == "full":
        df = pd.read_csv('/home/data/catechol_full_data_yields.csv')
        X = df[INPUT_LABELS_FULL_SOLVENT]
    else:
        df = pd.read_csv('/home/data/catechol_single_solvent_yields.csv')
        X = df[INPUT_LABELS_SINGLE_SOLVENT]
    Y = df[TARGET_LABELS]
    return X, Y

def load_features(name="spange_descriptors"):
    """Read the lookup table ``/home/data/<name>_lookup.csv``.

    The first CSV column is used as the DataFrame index; the featurizers in
    this notebook look rows up in it by solvent name.
    """
    lookup_path = f'/home/data/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per distinct value of ``X["SOLVENT NAME"]``.

    Solvents are held out in sorted order. Each item is
    ``((train_X, train_Y), (test_X, test_Y))`` where the test part contains
    exactly the rows of the held-out solvent and the train part all others.
    """
    held_out_order = sorted(X["SOLVENT NAME"].unique())
    for held_out in held_out_order:
        in_train = X["SOLVENT NAME"] != held_out
        in_test = ~in_train
        train_part = (X[in_train], Y[in_train])
        test_part = (X[in_test], Y[in_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited sorted by solvent A name, then solvent B name. Each
    item is ``((train_X, train_Y), (test_X, test_Y))``.

    A row goes to the training part only when BOTH its solvent A and its
    solvent B differ from the held-out pair; every other row (i.e. any row
    sharing solvent A or solvent B with the pair) goes to the test part.
    NOTE(review): this is broader than "hold out exactly this ramp" --
    presumably intentional in the official protocol; confirm before changing,
    because fold membership determines the submission rows.
    """
    ramp_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    ramps = X[ramp_columns].drop_duplicates().sort_values(by=ramp_columns)
    for _, solvent_pair in ramps.iterrows():
        # Column-wise comparison of every row against the held-out pair.
        differs_from_pair = X[ramp_columns] != solvent_pair
        in_train = differs_from_pair.all(axis=1)
        in_test = ~in_train
        train_part = (X[in_train], Y[in_train])
        test_part = (X[in_test], Y[in_test])
        yield train_part, test_part

# Notebook sanity output: confirms the cell ran to the end and reports
# whether PyTorch can see a CUDA device.
print("Imports complete")
print(f"GPU available: {torch.cuda.is_available()}")

Imports complete
GPU available: True


In [2]:
# Base classes
class SmilesFeaturizer(ABC):
    """Interface for objects that turn input rows into model features.

    Both methods are placeholders that raise ``NotImplementedError``, so the
    class cannot be instantiated directly and subclasses must provide their
    own ``__init__`` (without delegating to this one) and ``featurize``.
    """

    def __init__(self):
        """Always raises ``NotImplementedError``."""
        raise NotImplementedError()

    def featurize(self, X):
        """Convert ``X`` into features; must be overridden."""
        raise NotImplementedError()

class BaseModel(ABC):
    """Interface for models used in this notebook.

    Construction is a no-op; ``train_model`` and ``predict`` are placeholders
    that raise ``NotImplementedError`` until a subclass overrides them.
    """

    def __init__(self):
        """Create the model; the base class holds no state."""

    def train_model(self, X_train, y_train):
        """Fit the model on training data; must be overridden."""
        raise NotImplementedError()

    def predict(self, X):
        """Return predictions for ``X``; must be overridden."""
        raise NotImplementedError()

In [3]:
# Featurizers
class PrecomputedFeaturizer(SmilesFeaturizer):
    """Featurizer for single-solvent rows backed by a precomputed lookup table.

    Each input row becomes
    ``[Residence Time, Temperature, <lookup-table row of SOLVENT NAME>]``.
    """

    def __init__(self, features='spange_descriptors'):
        """Load the lookup table called ``features`` via ``load_features``.

        The base-class constructor is deliberately not called: it raises
        ``NotImplementedError``.
        """
        descriptor_table = load_features(features)
        self.features = descriptor_table
        # Two process columns (time, temperature) precede the descriptors.
        self.feats_dim = descriptor_table.shape[1] + 2

    def featurize(self, X):
        """Return a float32 tensor with one feature row per row of ``X``.

        ``X`` must provide the columns ``'Residence Time'``, ``'Temperature'``
        and ``'SOLVENT NAME'``; every solvent name is looked up in the index
        of the descriptor table.
        """
        process_columns = [X[col].values for col in ('Residence Time', 'Temperature')]
        descriptors = self.features.loc[X['SOLVENT NAME']].values

        # column_stack turns the 1-D process columns into single columns and
        # appends the 2-D descriptor block to their right.
        stacked = np.column_stack([*process_columns, descriptors])
        return torch.tensor(stacked, dtype=torch.float32)

class PrecomputedFeaturizerMixed(SmilesFeaturizer):
    """Featurizer for two-solvent rows backed by a precomputed lookup table.

    Each input row becomes
    ``[Residence Time, Temperature, SolventB%, <blended descriptors>]`` where
    the blend is ``(1 - SolventB%) * desc(A) + SolventB% * desc(B)``.
    """

    def __init__(self, features='spange_descriptors'):
        """Load the lookup table called ``features`` via ``load_features``.

        The base-class constructor is deliberately not called: it raises
        ``NotImplementedError``.
        """
        descriptor_table = load_features(features)
        self.features = descriptor_table
        # Three process columns (time, temperature, %B) precede the descriptors.
        self.feats_dim = descriptor_table.shape[1] + 3

    def featurize(self, X):
        """Return a float32 tensor with one feature row per row of ``X``.

        ``X`` must provide ``'Residence Time'``, ``'Temperature'``,
        ``'SolventB%'``, ``'SOLVENT A NAME'`` and ``'SOLVENT B NAME'``.
        """
        process_columns = [X[col].values for col in ('Residence Time', 'Temperature')]
        # Kept as an (n, 1) column so it broadcasts across descriptor columns.
        frac_b = X['SolventB%'].values.reshape(-1, 1)

        descriptors_a = self.features.loc[X['SOLVENT A NAME']].values
        descriptors_b = self.features.loc[X['SOLVENT B NAME']].values

        # NOTE(review): this is a convex combination only if SolventB% is a
        # fraction in [0, 1] rather than a 0-100 percentage -- confirm the
        # column's scale against the dataset.
        blended = (1 - frac_b) * descriptors_a + frac_b * descriptors_b

        stacked = np.column_stack([*process_columns, frac_b, blended])
        return torch.tensor(stacked, dtype=torch.float32)

print("Featurizers defined")

Featurizers defined


In [4]:
# Enhanced MLP
class EnhancedMLP(nn.Module):
    """Feed-forward regressor built from Linear/BatchNorm/ReLU/Dropout stages.

    Args:
        input_dim: Number of input features.
        output_dim: Number of regression outputs.
        hidden_dims: Width of each hidden stage, in order. Any iterable of
            ints is accepted; the default is an immutable tuple so the
            default value cannot be shared and mutated across instances.
        dropout: Dropout probability applied after every hidden stage.
    """

    def __init__(self, input_dim, output_dim=3, hidden_dims=(128, 64, 32), dropout=0.2):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            # One hidden stage: affine map, batch normalisation, ReLU, dropout.
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            prev_dim = h_dim
        # Final projection to the targets; no activation is applied to it.
        layers.append(nn.Linear(prev_dim, output_dim))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stacked layers to ``x`` and return the raw outputs."""
        return self.network(x)

print("MLP defined")

MLP defined


In [5]:
# Ensemble Model (MLP + XGBoost + RF + LightGBM)
class EnsembleModel(BaseModel):
    """Fixed-weight blend of an MLP, XGBoost, a random forest and LightGBM.

    All four regressors are fitted on the same standardised features and
    their predictions are combined as a weighted sum in ``predict``.

    Args:
        data: ``'single'`` selects ``PrecomputedFeaturizer``; any other value
            selects ``PrecomputedFeaturizerMixed``.
        hidden_dims: Hidden-layer widths of the MLP.
        dropout: Dropout probability of the MLP.
        weights: Four blend coefficients in the order ``[mlp, xgb, rf, lgb]``.
            They are used as given (not normalised).

    Raises:
        ValueError: If ``weights`` does not contain exactly four values.
    """

    def __init__(self, data='single', hidden_dims=(128, 64, 32), dropout=0.2,
                 weights=(0.25, 0.25, 0.25, 0.25)):
        super().__init__()

        # Copy the sequences so that neither the (immutable) defaults nor a
        # caller-owned list is aliased by instance state.
        weights = list(weights)
        if len(weights) != 4:
            raise ValueError(
                f"weights must contain 4 values [mlp, xgb, rf, lgb], got {len(weights)}"
            )

        self.data = data
        self.hidden_dims = list(hidden_dims)
        self.dropout = dropout
        self.weights = weights  # [mlp, xgb, rf, lgb]

        # Featurizer
        if data == 'single':
            self.smiles_featurizer = PrecomputedFeaturizer('spange_descriptors')
        else:
            self.smiles_featurizer = PrecomputedFeaturizerMixed('spange_descriptors')

        # MLP
        self.mlp = EnhancedMLP(
            input_dim=self.smiles_featurizer.feats_dim,
            output_dim=3,
            hidden_dims=self.hidden_dims,
            dropout=dropout
        )

        # XGBoost (wrapped so that one regressor is fitted per target column)
        self.xgb_params = {
            'max_depth': 5,
            'learning_rate': 0.1,
            'n_estimators': 100,
            'random_state': 42,
            'verbosity': 0
        }
        self.xgb = MultiOutputRegressor(xgb.XGBRegressor(**self.xgb_params))

        # Random Forest
        self.rf_params = {
            'n_estimators': 100,
            'max_depth': 10,
            'random_state': 42,
            'n_jobs': -1
        }
        self.rf = MultiOutputRegressor(RandomForestRegressor(**self.rf_params))

        # LightGBM
        self.lgb_params = {
            'num_leaves': 31,
            'learning_rate': 0.1,
            'n_estimators': 100,
            'random_state': 42,
            'verbosity': -1
        }
        self.lgbm = MultiOutputRegressor(lgb.LGBMRegressor(**self.lgb_params))

        # Scaler (fitted in train_model, reused in predict)
        self.scaler = StandardScaler()

    @staticmethod
    def _as_named_frame(X_scaled):
        """Wrap a scaled feature matrix in a DataFrame with columns '0', '1', ..."""
        # The tree models get identical string column names at fit and
        # predict time.
        feature_names = [str(i) for i in range(X_scaled.shape[1])]
        return pd.DataFrame(X_scaled, columns=feature_names)

    def train_model(self, train_X, train_Y, num_epochs=100, lr=1e-3, batch_size=32,
                    optimizer=torch.optim.Adam, criterion=nn.MSELoss, device=None, verbose=False):
        """Fit the scaler, the three tree-based models and the MLP.

        Args:
            train_X: DataFrame with the columns the configured featurizer reads.
            train_Y: DataFrame of targets; its ``.values`` are used directly.
            num_epochs: Number of passes over the training data for the MLP.
            lr: Learning rate passed to the optimizer.
            batch_size: Mini-batch size for the MLP.
            optimizer: Optimizer class, instantiated as
                ``optimizer(self.mlp.parameters(), lr=lr)``.
            criterion: Loss class, instantiated with no arguments.
            device: torch device for the MLP; when ``None``, CUDA is used if
                available, otherwise the CPU.
            verbose: Accepted for interface compatibility; currently unused.
        """
        # Featurize
        X_np = self.smiles_featurizer.featurize(train_X).numpy()
        train_Y_np = train_Y.values

        # Scale (statistics come from the training rows only)
        X_scaled = self.scaler.fit_transform(X_np)

        # Train the tree-based models
        X_scaled_df = self._as_named_frame(X_scaled)
        self.xgb.fit(X_scaled_df, train_Y_np)
        self.rf.fit(X_scaled_df, train_Y_np)
        self.lgbm.fit(X_scaled_df, train_Y_np)

        # Train MLP
        X_tensor_scaled = torch.tensor(X_scaled, dtype=torch.float32)
        train_Y_tensor = torch.tensor(train_Y_np, dtype=torch.float32)

        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.mlp.to(device)

        optimizer_inst = optimizer(self.mlp.parameters(), lr=lr)
        # drop_last=True discards the final incomplete batch of every epoch.
        # NOTE(review): presumably this keeps BatchNorm from seeing a very
        # small batch; it also means a training set with fewer than
        # batch_size rows produces no MLP updates at all.
        train_loader = DataLoader(
            TensorDataset(X_tensor_scaled, train_Y_tensor),
            batch_size=batch_size, shuffle=True, drop_last=True
        )

        criterion_inst = criterion()
        for epoch in range(num_epochs):
            self.mlp.train()
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer_inst.zero_grad()
                loss = criterion_inst(self.mlp(inputs), targets)
                loss.backward()
                optimizer_inst.step()

    def predict(self, test_X):
        """Return the weighted blend of the four models' predictions.

        Args:
            test_X: DataFrame with the columns the configured featurizer reads.

        Returns:
            A CPU tensor with one row per input row and one column per target.
        """
        X_np = self.smiles_featurizer.featurize(test_X).numpy()
        X_scaled = self.scaler.transform(X_np)
        X_scaled_df = self._as_named_frame(X_scaled)

        # MLP predictions (eval mode, on whichever device the MLP lives on)
        self.mlp.eval()
        device = next(self.mlp.parameters()).device
        with torch.no_grad():
            X_tensor_scaled = torch.tensor(X_scaled, dtype=torch.float32).to(device)
            mlp_preds = self.mlp(X_tensor_scaled).cpu().numpy()

        # Tree-based model predictions
        xgb_preds = self.xgb.predict(X_scaled_df)
        rf_preds = self.rf.predict(X_scaled_df)
        lgb_preds = self.lgbm.predict(X_scaled_df)

        # Weighted ensemble, in the same [mlp, xgb, rf, lgb] order as self.weights
        final_preds = (
            self.weights[0] * mlp_preds +
            self.weights[1] * xgb_preds +
            self.weights[2] * rf_preds +
            self.weights[3] * lgb_preds
        )

        return torch.tensor(final_preds)

print("EnsembleModel defined")

EnsembleModel defined


In [6]:
# Smoke test only: fit the ensemble on the whole single-solvent dataset for a
# few epochs and predict rows it was trained on, to check that the pipeline
# runs end to end. The printed values are not a validation score.
X_single, Y_single = load_data("single_solvent")
print(f"Single solvent data: {X_single.shape}, {Y_single.shape}")

# Short training run (10 epochs), then predictions for the first five rows.
model = EnsembleModel(data='single')
model.train_model(X_single, Y_single, num_epochs=10)
preds = model.predict(X_single[:5])
print(f"Test predictions shape: {preds.shape}")
print(f"Sample predictions:\n{preds[:3]}")

Single solvent data: (656, 3), (656, 3)


Test predictions shape: torch.Size([5, 3])
Sample predictions:
tensor([[-0.0057, -0.0096,  0.9076],
        [ 0.0031,  0.0036,  0.9100],
        [ 0.0113,  0.0167,  0.8946]], dtype=torch.float64)


In [7]:
# Run proper CV with Leave-One-Out
import tqdm

def compute_cv_score():
    """Cross-validate the default ensemble on both datasets.

    The single-solvent data is split with ``generate_leave_one_out_splits``
    and the full data with ``generate_leave_one_ramp_out_splits``. For every
    fold a fresh ``EnsembleModel`` is trained for 100 epochs and scored by the
    mean squared error over all test rows and targets.

    Returns:
        Tuple ``(single_cv, full_cv, combined_cv)``: the unweighted mean of
        the per-fold MSEs for each dataset, and the average of those two.
    """

    def held_out_mse(data_kind, fold):
        """Train a fresh ensemble on one fold and return its test MSE."""
        (fit_X, fit_Y), (eval_X, eval_Y) = fold
        ensemble = EnsembleModel(data=data_kind)
        ensemble.train_model(fit_X, fit_Y, num_epochs=100)
        predicted = ensemble.predict(eval_X).detach().cpu().numpy()
        return np.mean((predicted - eval_Y.values) ** 2)

    # Single solvent CV
    X_single, Y_single = load_data("single_solvent")
    single_mse_list = []
    for fold_idx, fold in enumerate(generate_leave_one_out_splits(X_single, Y_single)):
        mse = held_out_mse('single', fold)
        single_mse_list.append(mse)
        print(f"Single Fold {fold_idx}: MSE = {mse:.6f}")

    single_cv = np.mean(single_mse_list)
    print(f"\nSingle Solvent CV MSE: {single_cv:.6f}")

    # Full data CV
    X_full, Y_full = load_data("full")
    full_mse_list = []
    for fold_idx, fold in enumerate(generate_leave_one_ramp_out_splits(X_full, Y_full)):
        mse = held_out_mse('full', fold)
        full_mse_list.append(mse)
        print(f"Full Fold {fold_idx}: MSE = {mse:.6f}")

    full_cv = np.mean(full_mse_list)
    print(f"\nFull Data CV MSE: {full_cv:.6f}")

    # Combined CV (average of single and full)
    combined_cv = (single_cv + full_cv) / 2
    print(f"\n=== Combined CV MSE: {combined_cv:.6f} ===")

    return single_cv, full_cv, combined_cv

# Run both CV protocols once; the three scores are kept as module-level names
# and are read again by later cells.
single_cv, full_cv, combined_cv = compute_cv_score()

Single Fold 0: MSE = 0.040886


Single Fold 1: MSE = 0.022716


Single Fold 2: MSE = 0.002339


Single Fold 3: MSE = 0.015379


Single Fold 4: MSE = 0.027351


Single Fold 5: MSE = 0.003044


Single Fold 6: MSE = 0.014569


Single Fold 7: MSE = 0.005596


Single Fold 8: MSE = 0.009414


Single Fold 9: MSE = 0.012993


Single Fold 10: MSE = 0.011321


Single Fold 11: MSE = 0.009387


Single Fold 12: MSE = 0.002976


Single Fold 13: MSE = 0.006573


Single Fold 14: MSE = 0.003421


Single Fold 15: MSE = 0.015438


Single Fold 16: MSE = 0.009197


Single Fold 17: MSE = 0.005921


Single Fold 18: MSE = 0.003940


Single Fold 19: MSE = 0.000881


Single Fold 20: MSE = 0.000956


Single Fold 21: MSE = 0.004825


Single Fold 22: MSE = 0.008349


Single Fold 23: MSE = 0.002347

Single Solvent CV MSE: 0.009993


Full Fold 0: MSE = 0.021567


Full Fold 1: MSE = 0.014993


Full Fold 2: MSE = 0.006433


Full Fold 3: MSE = 0.022342


Full Fold 4: MSE = 0.006306


Full Fold 5: MSE = 0.005801


Full Fold 6: MSE = 0.008554


Full Fold 7: MSE = 0.004453


Full Fold 8: MSE = 0.005815


Full Fold 9: MSE = 0.009499


Full Fold 10: MSE = 0.001912


Full Fold 11: MSE = 0.011103


Full Fold 12: MSE = 0.009706

Full Data CV MSE: 0.009883

=== Combined CV MSE: 0.009938 ===


In [8]:
# Save CV results
import json

# Snapshot of the equal-weight run. The scores come from the variables
# returned by compute_cv_score(); 'weights' is written as a literal that
# matches the EnsembleModel default rather than being read from a model.
results = {
    'single_cv': float(single_cv),
    'full_cv': float(full_cv),
    'combined_cv': float(combined_cv),
    'model': 'EnsembleModel (MLP + XGBoost + RF + LightGBM)',
    'features': 'spange_descriptors',
    'weights': [0.25, 0.25, 0.25, 0.25]
}

# Opening with 'w' replaces any metrics.json already present for this experiment.
with open('/home/code/experiments/091_mixall_ensemble/metrics.json', 'w') as f:
    json.dump(results, f, indent=2)

print("Results saved")
print(f"Combined CV: {combined_cv:.6f}")

Results saved
Combined CV: 0.009938


In [9]:
# Instead of testing many weight configs, let's try individual models
# to see which one performs best

def compute_cv_single_model(model_type, verbose=True):
    """Cross-validate the ensemble under a named weight preset.

    Despite the name, a preset may select one model (``'mlp'``, ``'xgb'``,
    ``'rf'``, ``'lgb'``) or a 50/50 pair (``'mlp_lgb'``, ``'xgb_lgb'``). Any
    other value silently falls back to equal weights for all four models.
    Every fold still trains all four models; the preset only changes how
    their predictions are blended.

    Args:
        model_type: Name of the weight preset; also used as the label of the
            printed summary line.
        verbose: When true, print the single, full and combined CV scores.

    Returns:
        The combined CV score: the average of the mean per-fold MSE on the
        single-solvent data and on the full data.
    """
    # Blend coefficients in [mlp, xgb, rf, lgb] order.
    named_blends = {
        'mlp': [1.0, 0.0, 0.0, 0.0],
        'xgb': [0.0, 1.0, 0.0, 0.0],
        'rf': [0.0, 0.0, 1.0, 0.0],
        'lgb': [0.0, 0.0, 0.0, 1.0],
        'mlp_lgb': [0.5, 0.0, 0.0, 0.5],
        'xgb_lgb': [0.0, 0.5, 0.0, 0.5],
    }
    weights = named_blends.get(model_type, [0.25, 0.25, 0.25, 0.25])

    def held_out_mse(data_kind, fold):
        """Train a fresh ensemble on one fold and return its test MSE."""
        (fit_X, fit_Y), (eval_X, eval_Y) = fold
        ensemble = EnsembleModel(data=data_kind, weights=weights)
        ensemble.train_model(fit_X, fit_Y, num_epochs=100)
        predicted = ensemble.predict(eval_X).detach().cpu().numpy()
        return np.mean((predicted - eval_Y.values) ** 2)

    # Single solvent CV
    X_single, Y_single = load_data("single_solvent")
    single_mse_list = [
        held_out_mse('single', fold)
        for fold in generate_leave_one_out_splits(X_single, Y_single)
    ]
    single_cv = np.mean(single_mse_list)

    # Full data CV
    X_full, Y_full = load_data("full")
    full_mse_list = [
        held_out_mse('full', fold)
        for fold in generate_leave_one_ramp_out_splits(X_full, Y_full)
    ]
    full_cv = np.mean(full_mse_list)

    combined_cv = (single_cv + full_cv) / 2

    if verbose:
        print(f"{model_type}: Single={single_cv:.6f}, Full={full_cv:.6f}, Combined={combined_cv:.6f}")

    return combined_cv

# LightGBM alone: the 'lgb' preset uses weights [0, 0, 0, 1]. Each call below
# re-runs the full CV on both datasets.
print("Testing LightGBM only...")
lgb_cv = compute_cv_single_model('lgb')

# 50/50 blend of the MLP and LightGBM ('mlp_lgb' preset).
print("\nTesting MLP + LightGBM...")
mlp_lgb_cv = compute_cv_single_model('mlp_lgb')

# The equal-weight baseline is the combined_cv computed by the earlier
# compute_cv_score() cell, not a value recomputed here.
print(f"\nBaseline (equal weights): {combined_cv:.6f}")
print(f"LightGBM only: {lgb_cv:.6f}")
print(f"MLP + LightGBM: {mlp_lgb_cv:.6f}")

Testing LightGBM only...


lgb: Single=0.012840, Full=0.013063, Combined=0.012951

Testing MLP + LightGBM...


mlp_lgb: Single=0.009522, Full=0.009132, Combined=0.009327

Baseline (equal weights): 0.009938
LightGBM only: 0.012951
MLP + LightGBM: 0.009327


In [None]:
# Final results summary
#
# All figures are read from the variables produced by the earlier cells
# (single_cv, full_cv, combined_cv, lgb_cv, mlp_lgb_cv) instead of being typed
# in by hand, so the summary and metrics.json cannot drift from the run that
# produced them. Those cells must have been executed before this one.
import json

# CV score of the GP+MLP+LGBM baseline. It is not computed in this notebook.
baseline_cv = 0.008298

# Best score among the weight configurations evaluated above.
best_weights_cv = min(combined_cv, lgb_cv, mlp_lgb_cv)
is_worse = best_weights_cv > baseline_cv

print("=" * 50)
print("EXPERIMENT 091: MixAll Ensemble Results")
print("=" * 50)
print(f"Equal weights [0.25, 0.25, 0.25, 0.25]: CV = {combined_cv:.6f}")
print(f"LightGBM only: CV = {lgb_cv:.6f}")
print(f"MLP + LightGBM [0.5, 0, 0, 0.5]: CV = {mlp_lgb_cv:.6f}")
print()
print(f"Baseline (GP+MLP+LGBM): CV = {baseline_cv:.6f}")
print()
if is_worse:
    conclusion = 'MixAll approach is worse than GP+MLP+LGBM baseline'
    print("Conclusion: MixAll approach is WORSE than baseline.")
    print("The key difference is our baseline uses GP (Gaussian Process)")
    print("which provides better uncertainty estimation.")
else:
    conclusion = 'MixAll approach is not worse than GP+MLP+LGBM baseline'
    print("Conclusion: MixAll approach is NOT worse than baseline.")

# Update metrics (replaces the file written by the earlier save cell)
results = {
    'single_cv': float(single_cv),
    'full_cv': float(full_cv),
    'combined_cv': float(combined_cv),
    'best_weights_cv': float(best_weights_cv),
    'model': 'EnsembleModel (MLP + XGBoost + RF + LightGBM)',
    'features': 'spange_descriptors',
    'baseline_cv': baseline_cv,
    'conclusion': conclusion
}

with open('/home/code/experiments/091_mixall_ensemble/metrics.json', 'w') as f:
    json.dump(results, f, indent=2)

print("\nMetrics saved.")

## Generate Submission

The following cells follow the official template structure.

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

# Also save to standard location
import shutil
shutil.copy("submission.csv", "/home/submission/submission.csv")
print("Submission saved!")
print(f"Shape: {submission.shape}")
print(submission.head())

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################