# Hyperparameter Optimization of Best Model

**Hypothesis**: The best model (GP+MLP+LGBM) has fixed hyperparameters. Systematic optimization could improve CV significantly.

**Target**: Reduce CV from 0.008194 to ~0.006 (20-30% improvement)

**Parameters to optimize**:
1. MLP: learning rate, dropout, hidden dims, epochs
2. LGBM: n_estimators, learning_rate, max_depth, num_leaves
3. GP: kernel parameters, alpha
4. Ensemble: weights for GP, MLP, LGBM

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel, RBF
from sklearn.preprocessing import StandardScaler
import optuna
from optuna.samplers import TPESampler
import tqdm
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.set_default_dtype(torch.double)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading functions
DATA_PATH = '/home/data'

INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    if name == "full":
        df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        X = df[INPUT_LABELS_FULL_SOLVENT]
    else:
        df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        X = df[INPUT_LABELS_SINGLE_SOLVENT]
    Y = df[["Product 2", "Product 3", "SM"]]
    return X, Y

def generate_leave_one_out_splits(X, Y):
    for solvent in sorted(X["SOLVENT NAME"].unique()):
        mask = X["SOLVENT NAME"] != solvent
        yield (X[mask], Y[mask]), (X[~mask], Y[~mask])

def generate_leave_one_ramp_out_splits(X, Y):
    ramps = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for _, row in ramps.iterrows():
        mask = ~((X["SOLVENT A NAME"] == row["SOLVENT A NAME"]) & (X["SOLVENT B NAME"] == row["SOLVENT B NAME"]))
        yield (X[mask], Y[mask]), (X[~mask], Y[~mask])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Load feature lookups
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFP_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

# Filter DRFP to high-variance columns
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = drfp_variance[drfp_variance > 0].index.tolist()
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]

print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')

Spange: (26, 13), DRFP filtered: (24, 122), ACS PCA: (24, 5)


In [4]:
# Full Featurizer - 145 features
class FullFeaturizer:
    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.drfp_df = DRFP_FILTERED
        self.acs_pca_df = ACS_PCA_DF
        self.feats_dim = 2 + 3 + self.spange_df.shape[1] + self.drfp_df.shape[1] + self.acs_pca_df.shape[1]

    def featurize(self, X, flip=False):
        X_vals = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        temp_c = X_vals[:, 1:2]
        time_m = X_vals[:, 0:1]
        temp_k = temp_c + 273.15
        inv_temp = 1000.0 / temp_k
        log_time = np.log(time_m + 1e-6)
        interaction = inv_temp * log_time
        X_kinetic = np.hstack([X_vals, inv_temp, log_time, interaction])
        
        if self.mixed:
            A_spange = self.spange_df.loc[X["SOLVENT A NAME"]].values
            B_spange = self.spange_df.loc[X["SOLVENT B NAME"]].values
            A_drfp = self.drfp_df.loc[X["SOLVENT A NAME"]].values
            B_drfp = self.drfp_df.loc[X["SOLVENT B NAME"]].values
            A_acs = self.acs_pca_df.loc[X["SOLVENT A NAME"]].values
            B_acs = self.acs_pca_df.loc[X["SOLVENT B NAME"]].values
            pct = X["SolventB%"].values.reshape(-1, 1)
            if flip:
                X_spange = B_spange * (1 - (1-pct)) + A_spange * (1-pct)
                X_drfp = B_drfp * (1 - (1-pct)) + A_drfp * (1-pct)
                X_acs = B_acs * (1 - (1-pct)) + A_acs * (1-pct)
            else:
                X_spange = A_spange * (1 - pct) + B_spange * pct
                X_drfp = A_drfp * (1 - pct) + B_drfp * pct
                X_acs = A_acs * (1 - pct) + B_acs * pct
        else:
            X_spange = self.spange_df.loc[X["SOLVENT NAME"]].values
            X_drfp = self.drfp_df.loc[X["SOLVENT NAME"]].values
            X_acs = self.acs_pca_df.loc[X["SOLVENT NAME"]].values
        
        return np.hstack([X_kinetic, X_spange, X_drfp, X_acs])

# Simple Featurizer (for GP) - 18 features
class SimpleFeaturizer:
    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.feats_dim = 2 + 3 + self.spange_df.shape[1]

    def featurize(self, X, flip=False):
        X_vals = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        temp_c = X_vals[:, 1:2]
        time_m = X_vals[:, 0:1]
        temp_k = temp_c + 273.15
        inv_temp = 1000.0 / temp_k
        log_time = np.log(time_m + 1e-6)
        interaction = inv_temp * log_time
        X_kinetic = np.hstack([X_vals, inv_temp, log_time, interaction])
        
        if self.mixed:
            A_spange = self.spange_df.loc[X["SOLVENT A NAME"]].values
            B_spange = self.spange_df.loc[X["SOLVENT B NAME"]].values
            pct = X["SolventB%"].values.reshape(-1, 1)
            if flip:
                X_spange = B_spange * (1 - (1-pct)) + A_spange * (1-pct)
            else:
                X_spange = A_spange * (1 - pct) + B_spange * pct
        else:
            X_spange = self.spange_df.loc[X["SOLVENT NAME"]].values
        
        return np.hstack([X_kinetic, X_spange])

print(f'Full feature dimension: {FullFeaturizer().feats_dim}')
print(f'Simple feature dimension: {SimpleFeaturizer().feats_dim}')

Full feature dimension: 145
Simple feature dimension: 18


In [5]:
# MLP Model with configurable hyperparameters
class MLPModel(nn.Module):
    def __init__(self, input_dim=145, hidden_dim=64, output_dim=3, dropout=0.2):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = torch.sigmoid(self.fc3(x))
        return x

print('MLP model defined')

MLP model defined


In [6]:
# Optimized GP + MLP + LGBM Ensemble with configurable hyperparameters
class OptimizedEnsemble:
    def __init__(self, data='single', 
                 gp_weight=0.15, mlp_weight=0.55, lgbm_weight=0.30,
                 mlp_hidden=64, mlp_dropout=0.2, mlp_lr=0.001, mlp_epochs=200,
                 lgbm_n_estimators=100, lgbm_lr=0.05, lgbm_max_depth=5,
                 gp_alpha=0.01, gp_length_scale=1.0):
        self.data = data
        self.gp_weight = gp_weight
        self.mlp_weight = mlp_weight
        self.lgbm_weight = lgbm_weight
        self.mixed = (data == 'full')
        
        # MLP hyperparameters
        self.mlp_hidden = mlp_hidden
        self.mlp_dropout = mlp_dropout
        self.mlp_lr = mlp_lr
        self.mlp_epochs = mlp_epochs
        
        # LGBM hyperparameters
        self.lgbm_n_estimators = lgbm_n_estimators
        self.lgbm_lr = lgbm_lr
        self.lgbm_max_depth = lgbm_max_depth
        
        # GP hyperparameters
        self.gp_alpha = gp_alpha
        self.gp_length_scale = gp_length_scale
        
        self.full_featurizer = FullFeaturizer(mixed=self.mixed)
        self.simple_featurizer = SimpleFeaturizer(mixed=self.mixed)
        
        self.scaler_full = StandardScaler()
        self.scaler_simple = StandardScaler()
        
        self.gp_models = [None, None, None]
        self.mlp_model = None
        self.lgbm_models = [None, None, None]
        
    def train_model(self, X, Y):
        X_full = self.full_featurizer.featurize(X)
        X_simple = self.simple_featurizer.featurize(X)
        Y_np = Y.values
        
        X_full_scaled = self.scaler_full.fit_transform(X_full)
        X_simple_scaled = self.scaler_simple.fit_transform(X_simple)
        
        # Train GP for each target
        for i in range(3):
            kernel = ConstantKernel(1.0) * Matern(length_scale=self.gp_length_scale, nu=2.5) + WhiteKernel(noise_level=0.1)
            self.gp_models[i] = GaussianProcessRegressor(
                kernel=kernel, alpha=self.gp_alpha, n_restarts_optimizer=3, random_state=42
            )
            self.gp_models[i].fit(X_simple_scaled, Y_np[:, i])
        
        # Train MLP
        self.mlp_model = MLPModel(
            input_dim=self.full_featurizer.feats_dim,
            hidden_dim=self.mlp_hidden,
            output_dim=3,
            dropout=self.mlp_dropout
        ).to(device)
        
        X_tensor = torch.tensor(X_full_scaled, dtype=torch.double).to(device)
        Y_tensor = torch.tensor(Y_np, dtype=torch.double).to(device)
        
        optimizer = torch.optim.Adam(self.mlp_model.parameters(), lr=self.mlp_lr, weight_decay=1e-4)
        criterion = nn.MSELoss()
        
        self.mlp_model.train()
        for epoch in range(self.mlp_epochs):
            optimizer.zero_grad()
            pred = self.mlp_model(X_tensor)
            loss = criterion(pred, Y_tensor)
            loss.backward()
            optimizer.step()
        
        # Train LGBM for each target
        for i in range(3):
            self.lgbm_models[i] = lgb.LGBMRegressor(
                n_estimators=self.lgbm_n_estimators, 
                learning_rate=self.lgbm_lr, 
                max_depth=self.lgbm_max_depth,
                num_leaves=31, min_child_samples=5, random_state=42, verbose=-1
            )
            self.lgbm_models[i].fit(X_full_scaled, Y_np[:, i])
        
        return self
    
    def predict(self, X):
        X_full = self.full_featurizer.featurize(X)
        X_simple = self.simple_featurizer.featurize(X)
        
        X_full_scaled = self.scaler_full.transform(X_full)
        X_simple_scaled = self.scaler_simple.transform(X_simple)
        
        # GP predictions
        gp_preds = np.column_stack([self.gp_models[i].predict(X_simple_scaled) for i in range(3)])
        
        # MLP predictions
        self.mlp_model.eval()
        X_tensor = torch.tensor(X_full_scaled, dtype=torch.double).to(device)
        with torch.no_grad():
            mlp_preds = self.mlp_model(X_tensor).cpu().numpy()
        
        # LGBM predictions
        lgbm_preds = np.column_stack([self.lgbm_models[i].predict(X_full_scaled) for i in range(3)])
        
        # Ensemble
        predictions = self.gp_weight * gp_preds + self.mlp_weight * mlp_preds + self.lgbm_weight * lgbm_preds
        
        # TTA for mixtures
        if self.mixed:
            X_full_flip = self.full_featurizer.featurize(X, flip=True)
            X_simple_flip = self.simple_featurizer.featurize(X, flip=True)
            
            X_full_scaled_flip = self.scaler_full.transform(X_full_flip)
            X_simple_scaled_flip = self.scaler_simple.transform(X_simple_flip)
            
            gp_preds_flip = np.column_stack([self.gp_models[i].predict(X_simple_scaled_flip) for i in range(3)])
            X_tensor_flip = torch.tensor(X_full_scaled_flip, dtype=torch.double).to(device)
            with torch.no_grad():
                mlp_preds_flip = self.mlp_model(X_tensor_flip).cpu().numpy()
            lgbm_preds_flip = np.column_stack([self.lgbm_models[i].predict(X_full_scaled_flip) for i in range(3)])
            
            predictions_flip = self.gp_weight * gp_preds_flip + self.mlp_weight * mlp_preds_flip + self.lgbm_weight * lgbm_preds_flip
            predictions = (predictions + predictions_flip) / 2
        
        predictions = np.clip(predictions, 0, 1)
        return torch.tensor(predictions)

print('Optimized Ensemble defined')

Optimized Ensemble defined


In [7]:
# Fast CV evaluation function (only on single solvent data for speed)
def evaluate_hyperparams(params, n_folds=5):
    """Evaluate hyperparameters using a subset of CV folds for speed."""
    X_single, Y_single = load_data("single_solvent")
    
    # Use only first n_folds for speed
    split_generator = generate_leave_one_out_splits(X_single, Y_single)
    all_predictions = []
    all_actuals = []
    
    for fold_idx, split in enumerate(split_generator):
        if fold_idx >= n_folds:
            break
            
        (train_X, train_Y), (test_X, test_Y) = split
        
        model = OptimizedEnsemble(
            data='single',
            gp_weight=params['gp_weight'],
            mlp_weight=params['mlp_weight'],
            lgbm_weight=params['lgbm_weight'],
            mlp_hidden=params['mlp_hidden'],
            mlp_dropout=params['mlp_dropout'],
            mlp_lr=params['mlp_lr'],
            mlp_epochs=params['mlp_epochs'],
            lgbm_n_estimators=params['lgbm_n_estimators'],
            lgbm_lr=params['lgbm_lr'],
            lgbm_max_depth=params['lgbm_max_depth'],
            gp_alpha=params['gp_alpha'],
            gp_length_scale=params['gp_length_scale']
        )
        model.train_model(train_X, train_Y)
        predictions = model.predict(test_X)
        
        all_predictions.append(predictions.numpy())
        all_actuals.append(test_Y.values)
    
    preds = np.vstack(all_predictions)
    actuals = np.vstack(all_actuals)
    mse = np.mean((preds - actuals) ** 2)
    return mse

print('Evaluation function defined')

Evaluation function defined


In [8]:
# Optuna objective function
def objective(trial):
    # Sample hyperparameters
    params = {
        # Ensemble weights (must sum to 1)
        'gp_weight': trial.suggest_float('gp_weight', 0.05, 0.30),
        'mlp_weight': trial.suggest_float('mlp_weight', 0.40, 0.70),
        'lgbm_weight': None,  # Will be computed
        
        # MLP hyperparameters
        'mlp_hidden': trial.suggest_categorical('mlp_hidden', [32, 64, 128]),
        'mlp_dropout': trial.suggest_float('mlp_dropout', 0.1, 0.4),
        'mlp_lr': trial.suggest_float('mlp_lr', 0.0005, 0.005, log=True),
        'mlp_epochs': trial.suggest_int('mlp_epochs', 150, 300),
        
        # LGBM hyperparameters
        'lgbm_n_estimators': trial.suggest_int('lgbm_n_estimators', 50, 200),
        'lgbm_lr': trial.suggest_float('lgbm_lr', 0.01, 0.1, log=True),
        'lgbm_max_depth': trial.suggest_int('lgbm_max_depth', 3, 7),
        
        # GP hyperparameters
        'gp_alpha': trial.suggest_float('gp_alpha', 0.001, 0.1, log=True),
        'gp_length_scale': trial.suggest_float('gp_length_scale', 0.5, 2.0),
    }
    
    # Compute lgbm_weight to ensure weights sum to 1
    params['lgbm_weight'] = 1.0 - params['gp_weight'] - params['mlp_weight']
    if params['lgbm_weight'] < 0.05:
        return float('inf')  # Invalid configuration
    
    # Evaluate
    mse = evaluate_hyperparams(params, n_folds=5)
    return mse

print('Optuna objective function defined')

Optuna objective function defined


In [9]:
# Run hyperparameter optimization
print('Starting hyperparameter optimization...')
print('Using 5 folds for speed (will validate best params on full CV later)')

sampler = TPESampler(seed=42)
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=20, show_progress_bar=True)

print(f'\nBest trial:')
print(f'  Value (MSE): {study.best_trial.value:.6f}')
print(f'  Params: {study.best_trial.params}')

[I 2026-01-15 10:54:10,774] A new study created in memory with name: no-name-4cc94585-c69c-442c-9722-45d7935379cc


Starting hyperparameter optimization...
Using 5 folds for speed (will validate best params on full CV later)


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2026-01-15 10:56:01,605] Trial 0 finished with value: 0.012707327520873038 and parameters: {'gp_weight': 0.1436350297118406, 'mlp_weight': 0.6852142919229748, 'mlp_hidden': 32, 'mlp_dropout': 0.1467983561008608, 'mlp_lr': 0.000571549193815661, 'mlp_epochs': 280, 'lgbm_n_estimators': 140, 'lgbm_lr': 0.051059032093947576, 'lgbm_max_depth': 3, 'gp_alpha': 0.08706020878304858, 'gp_length_scale': 1.7486639612006325}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 10:58:04,035] Trial 1 finished with value: 0.016655073376011516 and parameters: {'gp_weight': 0.10308477766956904, 'mlp_weight': 0.4545474901621302, 'mlp_hidden': 128, 'mlp_dropout': 0.22958350559263474, 'mlp_lr': 0.0009776854331372624, 'mlp_epochs': 242, 'lgbm_n_estimators': 71, 'lgbm_lr': 0.019594972058679168, 'lgbm_max_depth': 4, 'gp_alpha': 0.008168455894760165, 'gp_length_scale': 1.6777639420895203}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:00:15,938] Trial 2 finished with value: 0.018160067301852215 and parameters: {'gp_weight': 0.09991844553958994, 'mlp_weight': 0.5542703315240834, 'mlp_hidden': 128, 'mlp_dropout': 0.15115723710618748, 'mlp_lr': 0.0005807932994623225, 'mlp_epochs': 293, 'lgbm_n_estimators': 195, 'lgbm_lr': 0.06432759992849894, 'lgbm_max_depth': 4, 'gp_alpha': 0.0015679933916723015, 'gp_length_scale': 1.5263495397682354}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:01:47,820] Trial 3 finished with value: 0.015239343661077551 and parameters: {'gp_weight': 0.16003812343490031, 'mlp_weight': 0.4366114704534337, 'mlp_hidden': 128, 'mlp_dropout': 0.1776339944800051, 'mlp_lr': 0.002298752892366083, 'mlp_epochs': 197, 'lgbm_n_estimators': 128, 'lgbm_lr': 0.03521358805467869, 'lgbm_max_depth': 3, 'gp_alpha': 0.08692991511139551, 'gp_length_scale': 1.6626992350416718}. Best is trial 0 with value: 0.012707327520873038.
[I 2026-01-15 11:01:47,824] Trial 4 finished with value: inf and parameters: {'gp_weight': 0.2848747353910473, 'mlp_weight': 0.6684482051282946, 'mlp_hidden': 64, 'mlp_dropout': 0.15879485872574356, 'mlp_lr': 0.0005548777280551552, 'mlp_epochs': 199, 'lgbm_n_estimators': 108, 'lgbm_lr': 0.01867880257107068, 'lgbm_max_depth': 7, 'gp_alpha': 0.005170191786366992, 'gp_length_scale': 0.9214017645310711}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:03:35,130] Trial 5 finished with value: 0.014768118153472929 and parameters: {'gp_weight': 0.18567402078956213, 'mlp_weight': 0.4422772674924288, 'mlp_hidden': 128, 'mlp_dropout': 0.33167343078899725, 'mlp_lr': 0.0007901065932051942, 'mlp_epochs': 150, 'lgbm_n_estimators': 173, 'lgbm_lr': 0.05091635945818555, 'lgbm_max_depth': 6, 'gp_alpha': 0.034877126245459314, 'gp_length_scale': 0.6110669776011355}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:05:48,132] Trial 6 finished with value: 0.013687278951242809 and parameters: {'gp_weight': 0.13961643213606817, 'mlp_weight': 0.43476071785753895, 'mlp_hidden': 32, 'mlp_dropout': 0.1190675050858071, 'mlp_lr': 0.0010231806681740801, 'mlp_epochs': 199, 'lgbm_n_estimators': 160, 'lgbm_lr': 0.043406770118894, 'lgbm_max_depth': 7, 'gp_alpha': 0.008798929749689027, 'gp_length_scale': 0.6793913689074526}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:07:47,235] Trial 7 finished with value: 0.017599744785782852 and parameters: {'gp_weight': 0.22831119680574874, 'mlp_weight': 0.6282355145850692, 'mlp_hidden': 64, 'mlp_dropout': 0.25681984881459824, 'mlp_lr': 0.0013381691783830377, 'mlp_epochs': 153, 'lgbm_n_estimators': 66, 'lgbm_lr': 0.010750512925563078, 'lgbm_max_depth': 6, 'gp_alpha': 0.00425316236379087, 'gp_length_scale': 1.262856036747054}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:09:36,235] Trial 8 finished with value: 0.015838957566300157 and parameters: {'gp_weight': 0.27689161848152327, 'mlp_weight': 0.4747876687446625, 'mlp_hidden': 64, 'mlp_dropout': 0.12309397294863791, 'mlp_lr': 0.0009743645106784236, 'mlp_epochs': 174, 'lgbm_n_estimators': 190, 'lgbm_lr': 0.06428658848831817, 'lgbm_max_depth': 6, 'gp_alpha': 0.05532496914298508, 'gp_length_scale': 1.7055081153486717}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:11:52,839] Trial 9 finished with value: 0.01675695328501775 and parameters: {'gp_weight': 0.09664251472150896, 'mlp_weight': 0.6677676995469932, 'mlp_hidden': 128, 'mlp_dropout': 0.19540104249155918, 'mlp_lr': 0.0006442017924231741, 'mlp_epochs': 184, 'lgbm_n_estimators': 114, 'lgbm_lr': 0.06576801979658928, 'lgbm_max_depth': 7, 'gp_alpha': 0.0010325337616482041, 'gp_length_scale': 1.2661209538663485}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:13:48,318] Trial 10 finished with value: 0.014597557063036323 and parameters: {'gp_weight': 0.06057250231679878, 'mlp_weight': 0.5676552694772402, 'mlp_hidden': 32, 'mlp_dropout': 0.3796871025735363, 'mlp_lr': 0.0038426102582429945, 'mlp_epochs': 287, 'lgbm_n_estimators': 141, 'lgbm_lr': 0.09203870791207931, 'lgbm_max_depth': 3, 'gp_alpha': 0.02533691514684049, 'gp_length_scale': 1.987660095293712}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:15:53,802] Trial 11 finished with value: 0.013330786323687074 and parameters: {'gp_weight': 0.15837618880879983, 'mlp_weight': 0.5166992822502666, 'mlp_hidden': 32, 'mlp_dropout': 0.10598545731786826, 'mlp_lr': 0.0015719693957202945, 'mlp_epochs': 241, 'lgbm_n_estimators': 155, 'lgbm_lr': 0.03382472854820698, 'lgbm_max_depth': 5, 'gp_alpha': 0.01765603430879867, 'gp_length_scale': 0.5194940257656061}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:17:48,825] Trial 12 finished with value: 0.016004449686705603 and parameters: {'gp_weight': 0.20226846585843616, 'mlp_weight': 0.5060188414980882, 'mlp_hidden': 32, 'mlp_dropout': 0.11007576548034162, 'mlp_lr': 0.002334167208904365, 'mlp_epochs': 257, 'lgbm_n_estimators': 148, 'lgbm_lr': 0.02577598268486535, 'lgbm_max_depth': 4, 'gp_alpha': 0.02070699744102167, 'gp_length_scale': 0.9246073616106361}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:19:58,038] Trial 13 finished with value: 0.015549606059562945 and parameters: {'gp_weight': 0.13829138218955295, 'mlp_weight': 0.6064688535850599, 'mlp_hidden': 32, 'mlp_dropout': 0.272822496431662, 'mlp_lr': 0.0046164480255257365, 'mlp_epochs': 263, 'lgbm_n_estimators': 104, 'lgbm_lr': 0.030051039478538218, 'lgbm_max_depth': 5, 'gp_alpha': 0.016604496710763958, 'gp_length_scale': 1.9659427975068455}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:21:30,722] Trial 14 finished with value: 0.01501186974641698 and parameters: {'gp_weight': 0.23101569651236467, 'mlp_weight': 0.5154847305503363, 'mlp_hidden': 32, 'mlp_dropout': 0.10076831144073116, 'mlp_lr': 0.001945323335177902, 'mlp_epochs': 232, 'lgbm_n_estimators': 170, 'lgbm_lr': 0.03971108897784325, 'lgbm_max_depth': 5, 'gp_alpha': 0.09006020499939704, 'gp_length_scale': 0.9798088603953311}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:23:15,529] Trial 15 finished with value: 0.015084681686563198 and parameters: {'gp_weight': 0.1542536837307999, 'mlp_weight': 0.40079005467029827, 'mlp_hidden': 32, 'mlp_dropout': 0.2173586379013956, 'mlp_lr': 0.0031850761055011697, 'mlp_epochs': 273, 'lgbm_n_estimators': 92, 'lgbm_lr': 0.02282392254267503, 'lgbm_max_depth': 3, 'gp_alpha': 0.0551132475128409, 'gp_length_scale': 1.438017956788638}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:25:36,435] Trial 16 finished with value: 0.015597172347583716 and parameters: {'gp_weight': 0.20672022953861277, 'mlp_weight': 0.5910628193857126, 'mlp_hidden': 32, 'mlp_dropout': 0.29611288475324316, 'mlp_lr': 0.0018735249103612151, 'mlp_epochs': 247, 'lgbm_n_estimators': 136, 'lgbm_lr': 0.013943122920045648, 'lgbm_max_depth': 5, 'gp_alpha': 0.002566444181212709, 'gp_length_scale': 0.5171647819942123}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:28:02,330] Trial 17 finished with value: 0.015791076130767624 and parameters: {'gp_weight': 0.06975273113465091, 'mlp_weight': 0.6965189504750028, 'mlp_hidden': 32, 'mlp_dropout': 0.14931902128299568, 'mlp_lr': 0.0013454350674068922, 'mlp_epochs': 214, 'lgbm_n_estimators': 160, 'lgbm_lr': 0.09801153617943263, 'lgbm_max_depth': 4, 'gp_alpha': 0.014476141427033188, 'gp_length_scale': 1.0872528356115083}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:29:56,919] Trial 18 finished with value: 0.012749729976071978 and parameters: {'gp_weight': 0.12712042801934542, 'mlp_weight': 0.5198395796937997, 'mlp_hidden': 32, 'mlp_dropout': 0.20939732083913032, 'mlp_lr': 0.0030915840031187553, 'mlp_epochs': 279, 'lgbm_n_estimators': 177, 'lgbm_lr': 0.05151747744709276, 'lgbm_max_depth': 3, 'gp_alpha': 0.036601666089861117, 'gp_length_scale': 0.7106769058866473}. Best is trial 0 with value: 0.012707327520873038.


[I 2026-01-15 11:31:42,711] Trial 19 finished with value: 0.015128375868137717 and parameters: {'gp_weight': 0.12390589527146749, 'mlp_weight': 0.6356815165329587, 'mlp_hidden': 32, 'mlp_dropout': 0.20417858180646106, 'mlp_lr': 0.0027084980921912817, 'mlp_epochs': 278, 'lgbm_n_estimators': 181, 'lgbm_lr': 0.05081579618860371, 'lgbm_max_depth': 3, 'gp_alpha': 0.04149498598423083, 'gp_length_scale': 0.7644677511433781}. Best is trial 0 with value: 0.012707327520873038.

Best trial:
  Value (MSE): 0.012707
  Params: {'gp_weight': 0.1436350297118406, 'mlp_weight': 0.6852142919229748, 'mlp_hidden': 32, 'mlp_dropout': 0.1467983561008608, 'mlp_lr': 0.000571549193815661, 'mlp_epochs': 280, 'lgbm_n_estimators': 140, 'lgbm_lr': 0.051059032093947576, 'lgbm_max_depth': 3, 'gp_alpha': 0.08706020878304858, 'gp_length_scale': 1.7486639612006325}


In [None]:
# Get best parameters
best_params = study.best_trial.params
best_params['lgbm_weight'] = 1.0 - best_params['gp_weight'] - best_params['mlp_weight']

print('Best hyperparameters:')
for k, v in best_params.items():
    print(f'  {k}: {v}')

In [None]:
# Full CV evaluation with best hyperparameters
print('\n=== Full CV Evaluation with Best Hyperparameters ===')

# Single solvent CV
print('\n--- Single Solvent CV ---')
X_single, Y_single = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X_single, Y_single)
all_predictions_single = []
all_actuals_single = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=24):
    (train_X, train_Y), (test_X, test_Y) = split
    
    model = OptimizedEnsemble(
        data='single',
        gp_weight=best_params['gp_weight'],
        mlp_weight=best_params['mlp_weight'],
        lgbm_weight=best_params['lgbm_weight'],
        mlp_hidden=best_params['mlp_hidden'],
        mlp_dropout=best_params['mlp_dropout'],
        mlp_lr=best_params['mlp_lr'],
        mlp_epochs=best_params['mlp_epochs'],
        lgbm_n_estimators=best_params['lgbm_n_estimators'],
        lgbm_lr=best_params['lgbm_lr'],
        lgbm_max_depth=best_params['lgbm_max_depth'],
        gp_alpha=best_params['gp_alpha'],
        gp_length_scale=best_params['gp_length_scale']
    )
    model.train_model(train_X, train_Y)
    predictions = model.predict(test_X)
    
    all_predictions_single.append(predictions.numpy())
    all_actuals_single.append(test_Y.values)

preds_single = np.vstack(all_predictions_single)
actuals_single = np.vstack(all_actuals_single)
mse_single = np.mean((preds_single - actuals_single) ** 2)
print(f'Single Solvent MSE: {mse_single:.6f} (n={len(preds_single)})')

In [None]:
# Full data CV
print('\n--- Full Data CV ---')
X_full, Y_full = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X_full, Y_full)
all_predictions_full = []
all_actuals_full = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=13):
    (train_X, train_Y), (test_X, test_Y) = split
    
    model = OptimizedEnsemble(
        data='full',
        gp_weight=best_params['gp_weight'],
        mlp_weight=best_params['mlp_weight'],
        lgbm_weight=best_params['lgbm_weight'],
        mlp_hidden=best_params['mlp_hidden'],
        mlp_dropout=best_params['mlp_dropout'],
        mlp_lr=best_params['mlp_lr'],
        mlp_epochs=best_params['mlp_epochs'],
        lgbm_n_estimators=best_params['lgbm_n_estimators'],
        lgbm_lr=best_params['lgbm_lr'],
        lgbm_max_depth=best_params['lgbm_max_depth'],
        gp_alpha=best_params['gp_alpha'],
        gp_length_scale=best_params['gp_length_scale']
    )
    model.train_model(train_X, train_Y)
    predictions = model.predict(test_X)
    
    all_predictions_full.append(predictions.numpy())
    all_actuals_full.append(test_Y.values)

preds_full = np.vstack(all_predictions_full)
actuals_full = np.vstack(all_actuals_full)
mse_full = np.mean((preds_full - actuals_full) ** 2)
print(f'Full Data MSE: {mse_full:.6f} (n={len(preds_full)})')

In [None]:
# Calculate overall MSE
n_single = len(preds_single)
n_full = len(preds_full)
overall_mse = (mse_single * n_single + mse_full * n_full) / (n_single + n_full)

print(f'\n=== CV SCORE SUMMARY (Optimized Hyperparameters) ===')
print(f'Single Solvent MSE: {mse_single:.6f} (n={n_single})')
print(f'Full Data MSE: {mse_full:.6f} (n={n_full})')
print(f'Overall MSE: {overall_mse:.6f}')
print(f'\nBest CV (exp_032): 0.008194')

if overall_mse < 0.008194:
    improvement = (0.008194 - overall_mse) / 0.008194 * 100
    print(f'\n✓ IMPROVEMENT: {improvement:.2f}% better than best CV!')
else:
    degradation = (overall_mse - 0.008194) / 0.008194 * 100
    print(f'\n✗ WORSE: {degradation:.2f}% worse than best CV')

In [None]:
# Create final model class with best hyperparameters
class BestOptimizedEnsemble:
    def __init__(self, data='single'):
        self.model = OptimizedEnsemble(
            data=data,
            gp_weight=best_params['gp_weight'],
            mlp_weight=best_params['mlp_weight'],
            lgbm_weight=best_params['lgbm_weight'],
            mlp_hidden=best_params['mlp_hidden'],
            mlp_dropout=best_params['mlp_dropout'],
            mlp_lr=best_params['mlp_lr'],
            mlp_epochs=best_params['mlp_epochs'],
            lgbm_n_estimators=best_params['lgbm_n_estimators'],
            lgbm_lr=best_params['lgbm_lr'],
            lgbm_max_depth=best_params['lgbm_max_depth'],
            gp_alpha=best_params['gp_alpha'],
            gp_length_scale=best_params['gp_length_scale']
        )
    
    def train_model(self, X, Y):
        self.model.train_model(X, Y)
        return self
    
    def predict(self, X):
        return self.model.predict(X)

print('BestOptimizedEnsemble defined')

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = BestOptimizedEnsemble(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = BestOptimizedEnsemble(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
# Final verification
print(f'\n=== FINAL CV SCORE ===')
print(f'Single Solvent MSE: {mse_single:.6f} (n={n_single})')
print(f'Full Data MSE: {mse_full:.6f} (n={n_full})')
print(f'Overall MSE: {overall_mse:.6f}')
print(f'\nBest CV (exp_032): 0.008194')
print(f'\nBest hyperparameters:')
for k, v in best_params.items():
    print(f'  {k}: {v}')