# Experiment 056: XGBoost + RandomForest Ensemble (Like 'mixall' Kernel)

**Hypothesis**: The 'mixall' kernel uses an MLP + XGBoost + RandomForest + LightGBM ensemble. We have not yet tried XGBoost or RandomForest in our own ensemble. Their different inductive biases could change the CV-LB relationship.

**Key changes from baseline (exp_030):**
1. Add XGBoost and RandomForest to the ensemble
2. Use a 4-model ensemble: MLP + XGB + RF + LGBM (the GP member of the exp_030 ensemble is not included)
3. Keep full features (Spange + DRFP + Arrhenius)

**Why this might change CV-LB relationship**: Different ensemble members have different inductive biases. XGBoost and RandomForest may generalize differently to unseen solvents.

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, Matern
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings('ignore')

print("Libraries loaded successfully")

In [None]:
# Load the yield tables and the solvent lookup tables from fixed paths.
# full_data / single_data hold the measurements; spange and drfp are
# per-solvent descriptor lookups that are joined on the solvent name later.
full_data = pd.read_csv('/home/data/catechol_full_data_yields.csv')
single_data = pd.read_csv('/home/data/catechol_single_solvent_yields.csv')
spange = pd.read_csv('/home/data/spange_descriptors_lookup.csv')
drfp = pd.read_csv('/home/data/drfps_catechol_lookup.csv')
# NOTE(review): `smiles` is not referenced by any of the cells shown here.
smiles = pd.read_csv('/home/data/smiles_lookup.csv')

# Quick sanity check of the table sizes (rows, columns)
print(f"Full data: {full_data.shape}")
print(f"Single solvent data: {single_data.shape}")
print(f"Spange descriptors: {spange.shape}")
print(f"DRFP features: {drfp.shape}")

In [None]:
# Prepare features - same as exp_030
def prepare_features_single(data, spange, drfp):
    """Assemble the feature matrix and targets for the single-solvent data.

    Every row is left-joined against both solvent lookup tables on the
    solvent name, then two extra columns are derived:
    ``inv_temp = 1 / (Temperature + 273.15)`` and
    ``log_time = log1p(Residence Time)``.

    Args:
        data: DataFrame with 'SOLVENT NAME', 'Temperature',
            'Residence Time', 'SM', 'Product 2' and 'Product 3' columns.
        spange: Lookup DataFrame keyed by 'SOLVENT NAME'.
        drfp: Lookup DataFrame keyed by 'SOLVENT NAME'.

    Returns:
        Tuple ``(X, Y, feature_cols, data_merged)``: the feature array,
        the three-column target array, the ordered feature names
        (spange columns, drfp columns, 'inv_temp', 'log_time') and the
        joined DataFrame.  The input ``data`` is not modified.
    """
    frame = data.copy()
    frame['Solvent'] = frame['SOLVENT NAME']

    # Join the lookups one after the other, collecting their descriptor
    # column names in the same order (spange first, then drfp).
    feature_cols = []
    data_merged = frame
    for lookup in (spange, drfp):
        table = lookup.rename(columns={'SOLVENT NAME': 'Solvent'})
        feature_cols.extend(name for name in table.columns if name != 'Solvent')
        data_merged = data_merged.merge(table, on='Solvent', how='left')

    # Derived temperature / residence-time features
    data_merged['inv_temp'] = 1.0 / (data_merged['Temperature'] + 273.15)
    data_merged['log_time'] = np.log1p(data_merged['Residence Time'])
    feature_cols.extend(['inv_temp', 'log_time'])

    target_names = ['SM', 'Product 2', 'Product 3']
    X = data_merged[feature_cols].values
    Y = data_merged[target_names].values
    return X, Y, feature_cols, data_merged

# Build the single-solvent design matrix and report its dimensions.
# (The three print statements had been collapsed into one line with escaped
# quotes and literal "\n" sequences, which does not parse as Python.)
X_single, Y_single, feature_cols, single_merged = prepare_features_single(single_data, spange, drfp)
print(f"Single solvent features: {X_single.shape}")
print(f"Single solvent targets: {Y_single.shape}")
print(f"Number of features: {len(feature_cols)}")

In [None]:
# Prepare mixture data
def prepare_features_mix(data, spange, drfp):
    """Build the feature matrix and targets for the solvent-mixture data.

    Each row's descriptors are a linear blend of the lookup rows of its two
    solvents: ``(1 - r) * A + r * B`` with ``r = SolventB% / 100``.  Two
    extra columns are derived: ``inv_temp = 1 / (Temperature + 273.15)``
    and ``log_time = log1p(Residence Time)``.

    Args:
        data: DataFrame with 'SOLVENT A NAME', 'SOLVENT B NAME',
            'SolventB%', 'Temperature', 'Residence Time', 'SM',
            'Product 2' and 'Product 3' columns.
        spange: Lookup DataFrame keyed by 'SOLVENT NAME'.
        drfp: Lookup DataFrame keyed by 'SOLVENT NAME'.

    Returns:
        Tuple ``(X, Y, feature_list, data_merged)``: the feature array,
        the three-column target array, the ordered feature names
        (spange columns, drfp columns, 'inv_temp', 'log_time') and the
        joined DataFrame, which also keeps the per-solvent ``*_A`` /
        ``*_B`` columns.  The input ``data`` is not modified.
    """
    key = 'SOLVENT NAME'

    data = data.copy()
    # Identifier of the solvent pair; used downstream to group rows into folds
    data['Solvent'] = data['SOLVENT A NAME'] + '.' + data['SOLVENT B NAME']

    spange_cols = [c for c in spange.columns if c != key]
    drfp_cols = [c for c in drfp.columns if c != key]

    # Attach each lookup twice, once per solvent slot.  Columns are renamed
    # by NAME (not by position), so the key column does not have to be the
    # first column of the lookup table.
    data_merged = data
    for lookup, cols in ((spange, spange_cols), (drfp, drfp_cols)):
        for side in ('A', 'B'):
            side_key = f'SOLVENT {side} NAME'
            mapping = {c: f'{c}_{side}' for c in cols}
            mapping[key] = side_key
            data_merged = data_merged.merge(
                lookup.rename(columns=mapping), on=side_key, how='left'
            )

    # NOTE(review): dividing by 100 assumes 'SolventB%' is on a 0-100 scale;
    # confirm against the dataset (a 0-1 fraction would make B's weight ~0).
    ratio_b = data_merged['SolventB%'].values / 100.0
    ratio_a = 1.0 - ratio_b

    # Blend all descriptor columns in one vectorised step instead of
    # inserting them one by one (which fragments the DataFrame when the
    # lookups are wide).
    feature_list = spange_cols + drfp_cols
    values_a = data_merged[[f'{c}_A' for c in feature_list]].to_numpy(dtype=float)
    values_b = data_merged[[f'{c}_B' for c in feature_list]].to_numpy(dtype=float)
    blended = pd.DataFrame(
        ratio_a[:, None] * values_a + ratio_b[:, None] * values_b,
        columns=feature_list,
        index=data_merged.index,
    )
    # A blended column replaces any existing column of the same name
    clashes = [c for c in feature_list if c in data_merged.columns]
    data_merged = pd.concat([data_merged.drop(columns=clashes), blended], axis=1)

    # Derived temperature / residence-time features
    data_merged['inv_temp'] = 1.0 / (data_merged['Temperature'] + 273.15)
    data_merged['log_time'] = np.log1p(data_merged['Residence Time'])
    feature_list.extend(['inv_temp', 'log_time'])

    X = data_merged[feature_list].values
    Y = data_merged[['SM', 'Product 2', 'Product 3']].values

    return X, Y, feature_list, data_merged

# Filter to mixtures only (where SolventB% > 0)
full_data_mix = full_data[full_data['SolventB%'] > 0].copy()
# Y_full holds the targets of the filtered mixture rows (not of full_data).
# The returned feature-name list is discarded; mix_merged is read later as a
# module-level global by generate_leave_one_ramp_out_splits.
X_mix, Y_full, _, mix_merged = prepare_features_mix(full_data_mix, spange, drfp)
print(f"Mixture features: {X_mix.shape}")
print(f"Mixture targets: {Y_full.shape}")

In [None]:
# MLP Model (same as exp_030)
class MLPModel(nn.Module):
    """Feed-forward regressor with a sigmoid output layer.

    Every hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout; the
    head is Linear -> Sigmoid, so each output lies in [0, 1].

    Args:
        input_dim: Number of input features.
        hidden_dims: Width of each hidden stage, in order.
        output_dim: Number of outputs.
        dropout: Dropout probability used after every hidden stage.
    """

    def __init__(self, input_dim, hidden_dims=[64, 32], output_dim=3, dropout=0.3):
        super().__init__()
        # Consecutive pairs of `widths` give (fan_in, fan_out) per hidden stage
        widths = [input_dim, *hidden_dims]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        # Output head; the sigmoid bounds predictions to the 0-1 yield range
        stack += [nn.Linear(widths[-1], output_dim), nn.Sigmoid()]
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the outputs."""
        out = self.network(x)
        return out

def train_mlp(X_train, Y_train, input_dim, epochs=200, lr=0.001, batch_size=32):
    """Train an ``MLPModel`` with Adam on an MSE loss and return it.

    Rows are reshuffled every epoch and consumed in mini-batches.

    Args:
        X_train: Feature array, one row per sample.
        Y_train: Target array with the same number of rows as ``X_train``.
        input_dim: Number of feature columns, passed to ``MLPModel``.
        epochs: Number of passes over the training data.
        lr: Adam learning rate.
        batch_size: Maximum number of rows per mini-batch.

    Returns:
        The trained ``MLPModel``, still in training mode; callers must call
        ``.eval()`` before predicting.
    """
    model = MLPModel(input_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    criterion = nn.MSELoss()

    X_tensor = torch.FloatTensor(X_train)
    Y_tensor = torch.FloatTensor(Y_train)
    n_samples = len(X_tensor)

    model.train()
    for _ in range(epochs):
        # Fresh shuffle each epoch
        indices = torch.randperm(n_samples)
        for start in range(0, n_samples, batch_size):
            batch_idx = indices[start:start + batch_size]
            # BatchNorm1d cannot compute batch statistics from a single row
            # in training mode and raises.  This happens for the trailing
            # batch whenever n_samples % batch_size == 1, so skip it; the
            # reshuffle means a different row is left out each epoch.
            if len(batch_idx) < 2:
                continue

            optimizer.zero_grad()
            loss = criterion(model(X_tensor[batch_idx]), Y_tensor[batch_idx])
            loss.backward()
            optimizer.step()

    return model

# Progress message emitted once the definitions above have been executed
print("MLP model defined")

In [None]:
# 4-Model Ensemble: MLP + XGBoost + RandomForest + LightGBM
class FourModelEnsemble:
    """Weighted blend of an MLP, XGBoost, RandomForest and LightGBM.

    The MLP predicts all targets jointly; each tree-based learner is fitted
    once per target column.  All members are trained on the same
    standardised features, and the blended prediction is clipped to [0, 1].

    Args:
        input_dim: Number of feature columns; forwarded to ``train_mlp``.
        weights: Four blend weights in the order [MLP, XGB, RF, LGBM].
            They are used as given (not normalised).

    Raises:
        ValueError: If ``weights`` does not contain exactly four values.
    """

    def __init__(self, input_dim, weights=(0.30, 0.25, 0.20, 0.25)):
        # Copy into a private list so instances never share (or mutate) the
        # default argument or the caller's sequence.
        weights = list(weights)
        if len(weights) != 4:
            raise ValueError(
                f"weights must have 4 entries [MLP, XGB, RF, LGBM], got {len(weights)}"
            )
        self.input_dim = input_dim
        self.weights = weights
        self.mlp = None
        self.xgb_models = []  # One per target
        self.rf_models = []   # One per target
        self.lgbm_models = [] # One per target
        self.scaler = StandardScaler()

    @staticmethod
    def _fit_per_target(make_model, X, Y):
        """Fit one fresh regressor from ``make_model()`` per column of ``Y``."""
        fitted = []
        for target in range(Y.shape[1]):
            model = make_model()
            model.fit(X, Y[:, target])
            fitted.append(model)
        return fitted

    def fit(self, X_train, Y_train):
        """Fit the scaler and all four ensemble members.

        Args:
            X_train: Feature array, one row per sample.
            Y_train: 2-D target array, one column per target.
        """
        # The scaler is fitted on the training fold only
        X_scaled = self.scaler.fit_transform(X_train)

        # Train MLP (all targets at once)
        self.mlp = train_mlp(X_scaled, Y_train, self.input_dim, epochs=200)

        # Train XGBoost (one per target)
        self.xgb_models = self._fit_per_target(
            lambda: XGBRegressor(
                n_estimators=500,
                max_depth=6,
                learning_rate=0.03,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                verbosity=0,
            ),
            X_scaled,
            Y_train,
        )

        # Train RandomForest (one per target)
        self.rf_models = self._fit_per_target(
            lambda: RandomForestRegressor(
                n_estimators=200,
                max_depth=10,
                min_samples_split=5,
                min_samples_leaf=2,
                random_state=42,
                n_jobs=-1,
            ),
            X_scaled,
            Y_train,
        )

        # Train LightGBM (one per target)
        # NOTE(review): LightGBM applies `subsample` only when
        # `subsample_freq` > 0 (its default is 0), so row subsampling is
        # presumably inactive here.  Left unchanged to keep results
        # comparable with earlier experiments - confirm whether intended.
        self.lgbm_models = self._fit_per_target(
            lambda: LGBMRegressor(
                n_estimators=500,
                max_depth=6,
                learning_rate=0.03,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                verbose=-1,
            ),
            X_scaled,
            Y_train,
        )

    def predict(self, X_test):
        """Return the clipped, weighted ensemble prediction for ``X_test``.

        Args:
            X_test: Feature array with the same columns used in ``fit``.

        Returns:
            Array with one row per sample and one column per target, with
            every value clipped to [0, 1].
        """
        X_scaled = self.scaler.transform(X_test)

        # MLP predictions; eval mode disables dropout and makes BatchNorm
        # use its running statistics
        self.mlp.eval()
        with torch.no_grad():
            mlp_pred = self.mlp(torch.FloatTensor(X_scaled)).numpy()

        # Per-target tree models: stack their 1-D outputs as columns
        def stacked(models):
            return np.column_stack([m.predict(X_scaled) for m in models])

        member_preds = (
            mlp_pred,
            stacked(self.xgb_models),
            stacked(self.rf_models),
            stacked(self.lgbm_models),
        )

        # Weighted ensemble, in the order [MLP, XGB, RF, LGBM]
        pred = (self.weights[0] * member_preds[0] +
                self.weights[1] * member_preds[1] +
                self.weights[2] * member_preds[2] +
                self.weights[3] * member_preds[3])

        return np.clip(pred, 0, 1)

print("FourModelEnsemble defined")
# NOTE: the weights below are literals embedded in the f-string, not values
# read from FourModelEnsemble; update them by hand if the class default changes.
print(f"Ensemble weights: MLP={0.30}, XGB={0.25}, RF={0.20}, LGBM={0.25}")

In [None]:
# CV functions from competition template
def generate_leave_one_solvent_out_splits(X, Y, data):
    """Yield one (train_idx, test_idx) pair per distinct solvent.

    For each unique value in ``data['Solvent']`` (in order of first
    appearance) the rows with that value form the test fold and all other
    rows form the training fold.  ``X`` and ``Y`` are accepted for
    signature compatibility but are not read.

    Yields:
        Tuple of two integer position arrays: training rows, test rows.
    """
    labels = data['Solvent']
    for held_out in labels.unique():
        in_fold = labels == held_out
        test_idx, = np.where(in_fold)
        train_idx, = np.where(~in_fold)
        yield train_idx, test_idx

def generate_leave_one_ramp_out_splits(X, Y, data=None):
    """Yield one (train_idx, test_idx) pair per distinct solvent pair.

    Each unique value of ``data['Solvent']`` (in order of first appearance)
    is treated as one ramp: its rows form the test fold and all other rows
    form the training fold.  ``X`` and ``Y`` are accepted for signature
    compatibility but are not read.

    Args:
        X: Unused.
        Y: Unused.
        data: DataFrame with a 'Solvent' column, row-aligned with the
            arrays that will be indexed by the returned positions.  When
            omitted, the module-level ``mix_merged`` frame is used, which
            keeps existing two-argument calls working.

    Yields:
        Tuple of two integer position arrays: training rows, test rows.
    """
    if data is None:
        # Backward-compatible fallback to the notebook-level global
        data = mix_merged

    ramp_labels = data['Solvent']
    for ramp in ramp_labels.unique():
        test_mask = ramp_labels == ramp
        train_idx = np.where(~test_mask)[0]
        test_idx = np.where(test_mask)[0]
        yield train_idx, test_idx

# Progress message emitted once the split generators above have been defined
print("CV functions defined")

In [None]:
# Run leave-one-solvent-out CV on the single-solvent data.
# A fresh FourModelEnsemble is trained for every fold.
print("="*60)
print("Running Single Solvent CV with 4-Model Ensemble...")
print("="*60)

# Materialise the generator so the number of folds can be reported up front
single_splits = list(generate_leave_one_solvent_out_splits(X_single, Y_single, single_merged))
print(f"Number of folds: {len(single_splits)}")

single_errors = {}  # held-out solvent name -> fold MSE
all_preds = []      # per-fold prediction arrays
all_true = []       # per-fold target arrays

for fold_idx, (train_idx, test_idx) in enumerate(single_splits):
    X_train = X_single[train_idx]
    Y_train = Y_single[train_idx]
    X_test = X_single[test_idx]
    Y_test = Y_single[test_idx]
    
    # Every test row shares the same solvent, so the first row names the fold
    test_solvent = single_merged.iloc[test_idx]['Solvent'].iloc[0]
    
    # Train ensemble
    model = FourModelEnsemble(input_dim=len(feature_cols))
    model.fit(X_train, Y_train)
    
    # Predict
    pred = model.predict(X_test)
    
    # Calculate MSE (averaged over all rows and all targets of this fold)
    mse = np.mean((pred - Y_test) ** 2)
    single_errors[test_solvent] = mse
    
    all_preds.append(pred)
    all_true.append(Y_test)
    
    print(f"Fold {fold_idx+1:2d}: {test_solvent:50s} MSE = {mse:.6f}")

# single_mse pools every held-out row (so larger folds weigh more), while
# single_std is the spread of the per-fold MSEs; the two are aggregated
# differently.
all_preds = np.vstack(all_preds)
all_true = np.vstack(all_true)
single_mse = np.mean((all_preds - all_true) ** 2)
single_std = np.std(list(single_errors.values()))

print(f"\n4-Model Ensemble Single Solvent CV MSE: {single_mse:.6f} +/- {single_std:.6f}")

In [None]:
# Run leave-one-ramp-out CV on the mixture data.
# A fresh FourModelEnsemble is trained for every fold.
print("\n" + "="*60)
print("Running Mixture CV with 4-Model Ensemble...")
print("="*60)

# Materialise the generator so the number of folds can be reported up front
mix_splits = list(generate_leave_one_ramp_out_splits(X_mix, Y_full))
print(f"Number of folds: {len(mix_splits)}")

mix_errors = {}  # held-out solvent pair -> fold MSE
mix_preds = []   # per-fold prediction arrays
mix_true = []    # per-fold target arrays

for fold_idx, (train_idx, test_idx) in enumerate(mix_splits):
    X_train = X_mix[train_idx]
    Y_train = Y_full[train_idx]
    X_test = X_mix[test_idx]
    Y_test = Y_full[test_idx]

    # Every test row shares the same solvent pair, so the first row names the fold
    test_mixture = mix_merged.iloc[test_idx]['Solvent'].iloc[0]

    # Train ensemble.  The input width is taken from the mixture matrix
    # itself rather than from `feature_cols`, which describes the
    # single-solvent features and only matches by construction.
    model = FourModelEnsemble(input_dim=X_mix.shape[1])
    model.fit(X_train, Y_train)

    # Predict
    pred = model.predict(X_test)

    # Calculate MSE (averaged over all rows and all targets of this fold)
    mse = np.mean((pred - Y_test) ** 2)
    mix_errors[test_mixture] = mse

    mix_preds.append(pred)
    mix_true.append(Y_test)

    print(f"Fold {fold_idx+1:2d}: {test_mixture:50s} MSE = {mse:.6f}")

# mix_mse pools every held-out row (so larger folds weigh more), while
# mix_std is the spread of the per-fold MSEs.
mix_preds = np.vstack(mix_preds)
mix_true = np.vstack(mix_true)
mix_mse = np.mean((mix_preds - mix_true) ** 2)
mix_std = np.std(list(mix_errors.values()))

print(f"\n4-Model Ensemble Mixture CV MSE: {mix_mse:.6f} +/- {mix_std:.6f}")

In [None]:
# Calculate overall CV score
print("\n" + "="*60)
print("4-Model Ensemble Overall Results")
print("="*60)

n_single = len(all_true)
n_mix = len(mix_true)
n_total = n_single + n_mix

# Row-weighted mean of the two pooled MSEs
overall_mse = (n_single * single_mse + n_mix * mix_mse) / n_total

# CV score of the exp_030 baseline this experiment is compared against
baseline_mse = 0.008298
# Signed relative change in MSE: negative means lower error than baseline
relative_change = (overall_mse - baseline_mse) / baseline_mse * 100

print(f"\nSingle Solvent CV MSE: {single_mse:.6f} +/- {single_std:.6f} (n={n_single})")
print(f"Mixture CV MSE: {mix_mse:.6f} +/- {mix_std:.6f} (n={n_mix})")
print(f"Overall CV MSE: {overall_mse:.6f}")

print(f"\nBaseline (exp_030): CV = {baseline_mse:.6f}")
# Previously labelled "Improvement", which read backwards: a positive value
# here means the MSE went UP, i.e. the model got worse.
print(f"Change vs baseline (negative = better): {relative_change:+.1f}%")

if overall_mse < baseline_mse:
    print("\n✓ BETTER than baseline!")
else:
    print("\n✗ WORSE than baseline.")

In [None]:
# Final Summary: re-prints the scores computed in the cells above.
print("\n" + "="*60)
print("EXPERIMENT 056 SUMMARY")
print("="*60)

print(f"\n4-Model Ensemble (MLP + XGB + RF + LGBM):")
print(f"  Features: Spange + DRFP + Arrhenius ({len(feature_cols)} features)")
# NOTE: weights are hard-coded text here, not read from the ensemble object
print(f"  Weights: MLP=0.30, XGB=0.25, RF=0.20, LGBM=0.25")
print(f"\n  Single Solvent CV: {single_mse:.6f}")
print(f"  Mixture CV: {mix_mse:.6f}")
print(f"  Overall CV: {overall_mse:.6f}")
# Signed relative change in MSE against 0.008298; negative means lower error
print(f"  vs Baseline (exp_030): {(overall_mse - 0.008298) / 0.008298 * 100:+.1f}%")

# The statements below are fixed text printed regardless of the results
print(f"\nKey insights:")
print(f"1. XGBoost and RandomForest add different inductive biases")
print(f"2. 4-model ensemble may have different CV-LB relationship")
print(f"3. This approach is similar to the 'mixall' kernel")

# Hard-coded bookkeeping values; not derived from anything computed here
print(f"\nRemaining submissions: 5")
print(f"Best model: exp_030 (GP 0.15 + MLP 0.55 + LGBM 0.3) with CV 0.008298, LB 0.0877")