# Experiment 066: MLP+LGBM with Yield Normalization

**Goal:** Test yield normalization constraint with a faster model (no GP).

**Hypothesis:** Product 2 + Product 3 + SM should sum to ~1 (mass balance). Enforcing this constraint may improve generalization.

**Implementation:**
- MLP + LGBM ensemble (faster than GP+MLP+LGBM)
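- Add yield normalization: after clipping each blended prediction to [0, 1], divide a row by its sum only if that sum exceeds 1 (rows summing to 1 or less are left unchanged)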

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
import tqdm
import warnings
# Suppress all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Seed NumPy and PyTorch with the same value, and make float64 the default
# tensor dtype before any model is built.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.set_default_dtype(torch.double)

# NOTE(review): `device` is only reported here; the model wrappers defined in
# this notebook never move tensors or modules onto it.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Directory that holds every CSV read by this notebook.
DATA_PATH = '/home/data'

# Column names of the model inputs. The two solvent variants extend the shared
# numeric columns; `+` builds new lists, so the three constants do not alias.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = INPUT_LABELS_NUMERIC + ["SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = INPUT_LABELS_NUMERIC + ["SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Read one of the two yield CSVs and split it into inputs and targets.

    Args:
        name: ``"full"`` selects the mixed-solvent file and its input columns.
            Any other value (the notebook passes ``"single_solvent"``) selects
            the single-solvent file.

    Returns:
        ``(X, Y)`` where ``X`` holds the input columns for the chosen dataset
        and ``Y`` holds the ``"Product 2"``, ``"Product 3"`` and ``"SM"`` columns.
    """
    use_full = name == "full"
    if use_full:
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
    input_columns = INPUT_LABELS_FULL_SOLVENT if use_full else INPUT_LABELS_SINGLE_SOLVENT

    frame = pd.read_csv(csv_path)
    return frame[input_columns], frame[["Product 2", "Product 3", "SM"]]

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of the unique ``"SOLVENT NAME"``
    values. Each item is ``((X_train, Y_train), (X_test, Y_test))`` where the
    test part contains exactly the rows of the held-out solvent.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        is_test = solvent_names == held_out
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    exactly the rows whose A and B names both match the held-out pair.
    """
    unique_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    pair_iter = zip(unique_pairs["SOLVENT A NAME"], unique_pairs["SOLVENT B NAME"])
    for solvent_a, solvent_b in pair_iter:
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        outside_ramp = ~in_ramp
        yield (X[outside_ramp], Y[outside_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Load feature lookups
# Descriptor tables, each read with its first CSV column as the row index.
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFP_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

# Keep only the DRFP columns whose variance across rows is strictly positive.
drfp_variance = DRFP_DF.var()
has_variance = (drfp_variance > 0).to_numpy()
nonzero_variance_cols = drfp_variance.index[has_variance].tolist()
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]

print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')

Spange: (26, 13), DRFP filtered: (24, 122), ACS PCA: (24, 5)


In [4]:
# Featurizer
class FullFeaturizer:
    """Turn raw experiment rows into a dense feature matrix.

    Each row becomes five kinetic columns (the two numeric inputs, 1000/T in
    kelvin, log residence time, and their product) followed by the Spange,
    filtered-DRFP and ACS-PCA descriptors of the solvent.

    Args:
        mixed: When true, rows describe a two-solvent mixture and descriptors
            are blended from the A and B solvents using ``"SolventB%"``.
            When false, descriptors are looked up by ``"SOLVENT NAME"``.
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.drfp_df = DRFP_FILTERED
        self.acs_pca_df = ACS_PCA_DF
        # 2 raw numeric inputs + 3 derived kinetic terms + every descriptor column.
        descriptor_width = sum(
            table.shape[1] for table in (self.spange_df, self.drfp_df, self.acs_pca_df)
        )
        self.feats_dim = 2 + 3 + descriptor_width

    def featurize(self, X, flip=False):
        """Return the feature matrix for ``X`` as a float64 NumPy array.

        Args:
            X: DataFrame holding the numeric input columns and the solvent
                column(s) matching ``self.mixed``.
            flip: Only read when ``self.mixed`` is true. It selects the
                alternative spelling of the blend weights below.

        Returns:
            Array whose columns are the kinetic block followed by the Spange,
            DRFP and ACS-PCA blocks, in that order.
        """
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        residence_time = numeric[:, 0:1]
        temperature_c = numeric[:, 1:2]

        inv_temp = 1000.0 / (temperature_c + 273.15)
        # The small offset keeps the log finite when residence time is zero.
        log_time = np.log(residence_time + 1e-6)
        kinetic = np.hstack([numeric, inv_temp, log_time, inv_temp * log_time])

        tables = (self.spange_df, self.drfp_df, self.acs_pca_df)
        if self.mixed:
            names_a = X["SOLVENT A NAME"]
            names_b = X["SOLVENT B NAME"]
            # assumes SolventB% is a fraction in [0, 1] rather than a percentage — TODO confirm
            pct = X["SolventB%"].values.reshape(-1, 1)
            weight_a = 1 - pct
            # NOTE(review): 1 - (1 - pct) is pct up to floating-point rounding,
            # so the flipped blend equals the unflipped one. The linear mix is
            # already symmetric under swapping A/B with pct -> 1 - pct, so
            # flip-based augmentation effectively duplicates rows.
            weight_b = 1 - (1 - pct) if flip else pct
            descriptor_blocks = [
                table.loc[names_a].values * weight_a + table.loc[names_b].values * weight_b
                for table in tables
            ]
        else:
            descriptor_blocks = [table.loc[X["SOLVENT NAME"]].values for table in tables]

        return np.hstack([kinetic, *descriptor_blocks])

    def featurize_torch(self, X, flip=False):
        """Same as :meth:`featurize`, returned as a ``torch.double`` tensor."""
        features = self.featurize(X, flip)
        return torch.tensor(features, dtype=torch.double)

print(f'FullFeaturizer defined with {FullFeaturizer().feats_dim} features')

FullFeaturizer defined with 145 features


In [5]:
# MLP Model
class MLPModelInternal(nn.Module):
    """Feed-forward regressor with batch norm, dropout and a sigmoid head.

    Layout: ``BatchNorm1d(input_dim)``, then for every hidden width a
    ``Linear -> BatchNorm1d -> ReLU -> Dropout`` group, then a final
    ``Linear -> Sigmoid`` so every output lies in (0, 1).

    Args:
        input_dim: Number of input features.
        hidden_dims: Widths of the hidden layers, in order.
        output_dim: Number of outputs.
        dropout: Dropout probability used after every hidden activation.
    """

    def __init__(self, input_dim, hidden_dims=[64, 32], output_dim=3, dropout=0.1):
        super().__init__()
        widths = [input_dim, *hidden_dims]
        stack = [nn.BatchNorm1d(input_dim)]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        stack += [nn.Linear(widths[-1], output_dim), nn.Sigmoid()]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to a ``(batch, input_dim)`` tensor."""
        return self.net(x)

class MLPWrapper:
    """Trains several independently initialised MLPs and averages their outputs.

    Args:
        data: ``'full'`` selects the mixed-solvent featurizer and appends a
            ``flip=True`` featurization of the training rows. Any other value
            uses the single-solvent featurizer.
        n_models: Number of MLPs trained by :meth:`train_model`.
    """

    def __init__(self, data='single', n_models=3):
        self.data_type = data
        self.n_models = n_models
        self.featurizer = FullFeaturizer(mixed=(data=='full'))
        self.models = []
        self.scaler = None

    def train_model(self, X_train, y_train, epochs=150, batch_size=32, lr=5e-4):
        """Fit the feature scaler and ``n_models`` MLPs with Adam and MSE loss.

        Args:
            X_train: Raw input rows accepted by the featurizer.
            y_train: DataFrame of targets, one row per row of ``X_train``.
            epochs: Number of passes over the training data per model.
            batch_size: Mini-batch size.
            lr: Adam learning rate.
        """
        X_std = self.featurizer.featurize(X_train, flip=False)
        y_vals = y_train.values

        if self.data_type == 'full':
            # Append the flip=True featurization with the same targets.
            X_flip = self.featurizer.featurize(X_train, flip=True)
            X_all = np.vstack([X_std, X_flip])
            y_all = np.vstack([y_vals, y_vals])
        else:
            X_all, y_all = X_std, y_vals

        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X_all)

        X_tensor = torch.tensor(X_scaled, dtype=torch.double)
        y_tensor = torch.tensor(y_all, dtype=torch.double)

        n_samples, input_dim = X_tensor.shape[0], X_tensor.shape[1]
        # BatchNorm1d raises in training mode on a batch of one sample. That
        # happens when the data size is k * batch_size + 1, so only then drop
        # the trailing batch. Every other case keeps the default drop_last=False.
        drop_singleton_batch = n_samples > batch_size and n_samples % batch_size == 1

        self.models = []
        for _ in range(self.n_models):
            model = MLPModelInternal(input_dim, [64, 32], 3, dropout=0.1).double()
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            criterion = nn.MSELoss()

            dataset = TensorDataset(X_tensor, y_tensor)
            loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                drop_last=drop_singleton_batch)

            for _epoch in range(epochs):
                model.train()
                for X_batch, y_batch in loader:
                    optimizer.zero_grad()
                    pred = model(X_batch)
                    loss = criterion(pred, y_batch)
                    loss.backward()
                    optimizer.step()

            # Freeze batch-norm statistics and disable dropout for inference.
            model.eval()
            self.models.append(model)

    def predict(self, X_test):
        """Return the mean prediction of all trained MLPs as a tensor.

        ``train_model`` must have been called first so the scaler and models exist.
        """
        X_std = self.featurizer.featurize(X_test, flip=False)
        X_scaled = self.scaler.transform(X_std)
        X_tensor = torch.tensor(X_scaled, dtype=torch.double)

        preds = []
        with torch.no_grad():
            for model in self.models:
                preds.append(model(X_tensor))

        return torch.stack(preds).mean(dim=0)

print('MLPWrapper defined')

MLPWrapper defined


In [6]:
# LightGBM Wrapper
class LGBMWrapper:
    """One LightGBM regressor per target column, sharing a fitted feature scaler.

    Args:
        data: ``'full'`` selects the mixed-solvent featurizer and appends a
            ``flip=True`` featurization of the training rows. Any other value
            uses the single-solvent featurizer.
    """

    def __init__(self, data='single'):
        self.data_type = data
        self.featurizer = FullFeaturizer(mixed=(data=='full'))
        self.models = []
        self.scaler = None

    def train_model(self, X_train, y_train):
        """Fit the scaler and train one booster for each of the three targets."""
        features = self.featurizer.featurize(X_train, flip=False)
        targets = y_train.values

        if self.data_type == 'full':
            # Append the flip=True featurization with the same targets.
            flipped = self.featurizer.featurize(X_train, flip=True)
            features = np.vstack([features, flipped])
            targets = np.vstack([targets, targets])

        self.scaler = StandardScaler()
        scaled = self.scaler.fit_transform(features)

        booster_params = dict(
            objective='regression',
            metric='mse',
            boosting_type='gbdt',
            num_leaves=31,
            learning_rate=0.05,
            feature_fraction=0.9,
            bagging_fraction=0.8,
            bagging_freq=5,
            verbose=-1,
            seed=42,
        )

        self.models = []
        for target_idx in range(3):
            target_dataset = lgb.Dataset(scaled, label=targets[:, target_idx])
            booster = lgb.train(booster_params, target_dataset, num_boost_round=100)
            self.models.append(booster)

    def predict(self, X_test):
        """Return an ``(n_rows, n_models)`` double tensor, one column per booster."""
        scaled = self.scaler.transform(self.featurizer.featurize(X_test, flip=False))
        per_target = [booster.predict(scaled) for booster in self.models]
        return torch.tensor(np.column_stack(per_target), dtype=torch.double)

print('LGBMWrapper defined')

LGBMWrapper defined


In [7]:
# MLP + LGBM Ensemble with Yield Normalization
class MLPLGBMNormalizedEnsemble:
    """Weighted blend of an MLP ensemble and LightGBM with a capped row sum.

    After blending, predictions are clipped to [0, 1]. A row whose three
    values then sum to more than 1 is divided by that sum. Rows summing to 1
    or less are returned unchanged, so the sum is capped at 1 rather than
    forced to equal 1.

    Args:
        data: Passed through to both member wrappers (``'single'`` or ``'full'``).
    """

    def __init__(self, data='single'):
        self.data_type = data
        self.mlp = MLPWrapper(data=data, n_models=3)
        self.lgbm = LGBMWrapper(data=data)
        # Equal weighting of the two members.
        self.weights = {'mlp': 0.5, 'lgbm': 0.5}

    def train_model(self, X_train, y_train):
        """Train the MLP member, then the LightGBM member, on the same data."""
        for member in (self.mlp, self.lgbm):
            member.train_model(X_train, y_train)

    def predict(self, X_test):
        """Return blended, clipped and sum-capped predictions as a tensor."""
        mlp_out = self.mlp.predict(X_test)
        lgbm_out = self.lgbm.predict(X_test)

        blended = self.weights['mlp'] * mlp_out + self.weights['lgbm'] * lgbm_out
        blended = blended.clamp(0, 1)

        # Divide by max(row sum, 1): only rows that overshoot are rescaled, so
        # small predictions are never inflated.
        row_totals = blended.sum(dim=1, keepdim=True)
        return blended / torch.maximum(row_totals, torch.ones_like(row_totals))

print('MLPLGBMNormalizedEnsemble defined with YIELD NORMALIZATION')

MLPLGBMNormalizedEnsemble defined with YIELD NORMALIZATION


In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = MLPLGBMNormalizedEnsemble(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:12, 12.34s/it]

2it [00:24, 11.94s/it]

3it [00:35, 11.56s/it]

4it [00:48, 12.09s/it]

5it [00:59, 11.95s/it]

6it [01:11, 11.84s/it]

7it [01:22, 11.77s/it]

8it [01:34, 11.70s/it]

9it [01:46, 11.69s/it]

10it [01:58, 11.82s/it]

11it [02:10, 11.81s/it]

12it [02:21, 11.80s/it]

13it [02:33, 11.80s/it]

14it [02:45, 11.76s/it]

15it [02:56, 11.73s/it]

16it [03:08, 11.69s/it]

17it [03:20, 11.86s/it]

18it [03:32, 11.82s/it]

19it [03:44, 11.76s/it]

20it [03:55, 11.73s/it]

21it [04:07, 11.71s/it]

22it [04:19, 11.70s/it]

23it [04:30, 11.69s/it]

24it [04:42, 11.69s/it]

24it [04:42, 11.77s/it]




In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = MLPLGBMNormalizedEnsemble(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:39, 39.71s/it]

2it [01:20, 40.21s/it]

3it [02:02, 41.30s/it]

4it [02:44, 41.36s/it]

5it [03:25, 41.24s/it]

6it [04:06, 41.20s/it]

7it [04:47, 41.09s/it]

8it [05:28, 41.23s/it]

9it [06:09, 41.08s/it]

10it [06:53, 42.04s/it]

11it [07:38, 42.75s/it]

12it [08:21, 42.95s/it]

13it [09:05, 43.32s/it]

13it [09:05, 41.98s/it]




In [10]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [11]:
# CV CALCULATION - This cell is AFTER the final submission cell
import os
from sklearn.metrics import mean_squared_error

def _pooled_cv_mse(splits, fold_predictions):
    """MSE over all held-out rows and targets, pooled across folds.

    ``splits`` yields ``((X_train, Y_train), (X_test, Y_test))`` in fold order.
    ``fold_predictions`` is the long-format frame whose ``'fold'`` column
    matches that order and whose rows within a fold follow the test-row order.
    """
    target_cols = ['target_1', 'target_2', 'target_3']
    truth_blocks, pred_blocks = [], []
    for fold_idx, (_, (_, held_out_Y)) in enumerate(splits):
        in_fold = fold_predictions['fold'] == fold_idx
        truth_blocks.append(held_out_Y.values)
        pred_blocks.append(fold_predictions[in_fold][target_cols].values)
    return mean_squared_error(np.vstack(truth_blocks), np.vstack(pred_blocks))


os.makedirs('/home/submission', exist_ok=True)
submission.to_csv('/home/submission/submission.csv', index=True)

# Single solvent CV
X_single, Y_single = load_data("single_solvent")
mse_single = _pooled_cv_mse(
    generate_leave_one_out_splits(X_single, Y_single), submission_single_solvent
)

# Full data CV
X_full, Y_full = load_data("full")
mse_full = _pooled_cv_mse(
    generate_leave_one_ramp_out_splits(X_full, Y_full), submission_full_data
)

print(f'Single Solvent CV MSE: {mse_single:.6f}')
print(f'Full Data CV MSE: {mse_full:.6f}')
print(f'Submission saved with {len(submission)} rows')

Single Solvent CV MSE: 0.021210
Full Data CV MSE: 0.022834
Submission saved with 1883 rows
