# Experiment 019: MLP with Strong Regularization + exp_004 Ensemble

**Key insight from analysis:**
- The 53% CV-LB gap is the REAL problem
- Tree-based models memorize the training solvents and do not generalize to unseen ones
- An MLP with strong regularization may have a smaller CV-LB gap

**Architecture:**
- Combine exp_004's proven architecture (HGB+ETR per-target) with MLP
- MLP: [256, 128, 64] with strong dropout (0.4), weight decay (1e-3)
- Ensemble: 0.7 * exp_004_pred + 0.3 * mlp_pred
- Features: Spange + ACS_PCA + Arrhenius kinetics

**Hypothesis:**
- MLP may capture different patterns than trees
- Ensemble diversity may improve generalization
- Strong regularization helps OOD prediction

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor
from abc import ABC
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Directory that holds the yield CSVs and the descriptor lookup tables.
DATA_PATH = '/home/data'

# Newly created floating-point tensors and parameters default to float64.
torch.set_default_dtype(torch.double)

print(f"CUDA available: {torch.cuda.is_available()}")
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA available: True
GPU: NVIDIA H100 80GB HBM3


In [2]:
# --- UTILITY FUNCTIONS ---
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Load one of the two yield datasets and split it into inputs and targets.

    Args:
        name: ``"full"`` for the solvent-mixture dataset or
            ``"single_solvent"`` for the single-solvent dataset.

    Returns:
        A ``(X, Y)`` pair of DataFrames read from the same CSV. ``X`` holds the
        input columns for the chosen dataset and ``Y`` holds the
        ``TARGET_LABELS`` columns.

    Raises:
        ValueError: If ``name`` is not one of the two supported dataset names.
    """
    # Dataset name -> (CSV file name, input columns).
    sources = {
        "full": (
            "catechol_full_data_yields.csv",
            ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"],
        ),
        "single_solvent": (
            "catechol_single_solvent_yields.csv",
            ["Residence Time", "Temperature", "SOLVENT NAME"],
        ),
    }
    # Validate explicitly: an ``assert`` would be stripped under ``python -O``
    # and an unknown name would then silently load the wrong file.
    if name not in sources:
        raise ValueError(
            f"Unknown dataset name {name!r}; expected one of {sorted(sources)}"
        )

    filename, input_columns = sources[name]
    df = pd.read_csv(f'{DATA_PATH}/{filename}')
    X = df[input_columns]
    Y = df[TARGET_LABELS]
    return X, Y

def load_features(name="spange_descriptors"):
    """Read the lookup table ``{DATA_PATH}/{name}_lookup.csv``.

    The first CSV column becomes the index of the returned DataFrame.
    """
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of their ``"SOLVENT NAME"`` value.
    Each item is ``((X_train, Y_train), (X_test, Y_test))`` where the test part
    contains exactly the rows of the held-out solvent.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        is_test = solvent_names == held_out
        train_part = (X[~is_test], Y[~is_test])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    exactly the rows whose two solvent names match the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    distinct_pairs = X[pair_columns].drop_duplicates()
    for solvent_a, solvent_b in distinct_pairs.itertuples(index=False, name=None):
        is_test = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

# Load feature dataframes
# Both lookup tables are read once at module level and reused by the model
# below; rows are indexed by the first CSV column (see load_features).
SPANGE_DF = load_features('spange_descriptors')
ACS_PCA_DF = load_features('acs_pca_descriptors')
# Echo the table shapes as a quick sanity check.
# NOTE(review): in the recorded run the ACS_PCA table has fewer rows (24) than
# the Spange table (26); presumably a solvent present in only one table would
# break the label-based lookups in the model -- confirm coverage.
print(f"Spange: {SPANGE_DF.shape}, ACS_PCA: {ACS_PCA_DF.shape}")

Spange: (26, 13), ACS_PCA: (24, 5)


In [3]:
# --- BASE CLASSES ---
class SmilesFeaturizer(ABC):
    """Featurizer interface stub: every method raises ``NotImplementedError``."""

    def __init__(self):
        raise NotImplementedError

    def featurize(self, X):
        raise NotImplementedError

class BaseModel(ABC):
    """Model interface stub.

    Construction succeeds and does nothing; ``train_model`` and ``predict``
    raise ``NotImplementedError`` until a subclass overrides them.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        raise NotImplementedError

    def predict(self):
        raise NotImplementedError

In [4]:
# --- REGULARIZED MLP ---
class RegularizedMLP(nn.Module):
    """MLP with strong regularization for better generalization.

    Layer stack, in order: BatchNorm on the raw inputs, then for every entry of
    ``hidden_dims`` a ``Linear -> BatchNorm1d -> ReLU -> Dropout`` group, then a
    final ``Linear`` followed by ``Sigmoid`` so outputs lie in [0, 1].
    Weight decay is not applied here; it is left to the optimizer.

    Args:
        input_dim: Number of input features.
        hidden_dims: Sizes of the hidden layers, in order.
        output_dim: Number of outputs.
        dropout: Dropout probability used after every hidden layer.
    """
    # An immutable tuple default avoids sharing one list across instances.
    def __init__(self, input_dim, hidden_dims=(256, 128, 64), output_dim=3, dropout=0.4):
        super().__init__()
        layers = [nn.BatchNorm1d(input_dim)]

        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout),  # Strong dropout
            ])
            prev_dim = h_dim

        layers.append(nn.Linear(prev_dim, output_dim))
        layers.append(nn.Sigmoid())  # Output in [0, 1]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run the network on a batch ``x`` whose first dimension is the batch.

        BatchNorm cannot compute batch statistics from a single sample in
        training mode, so a batch of size one is evaluated in eval mode
        (running statistics, dropout disabled) and training mode is restored
        afterwards -- even if the forward pass raises.
        """
        if x.size(0) == 1 and self.training:
            self.eval()
            try:
                return self.network(x)
            finally:
                self.train()
        return self.network(x)

print("RegularizedMLP defined")

RegularizedMLP defined


In [5]:
# --- HYBRID MODEL: exp_004 + MLP ---
class HybridMLPModel(BaseModel):
    """Combines exp_004's tree architecture with a regularized MLP.

    Architecture:
    - exp_004: one pair of tree models per target (HistGradientBoosting for
      'SM', ExtraTrees for the other targets), one trained on Spange features
      and one on ACS_PCA features, blended as
      ``acs_weight * acs + spange_weight * spange``.
    - MLP: ``RegularizedMLP`` with dropout=0.4, trained with Adam and
      weight_decay=1e-3 on the concatenated Spange + ACS_PCA features.
    - Ensemble: ``exp004_weight * exp_004_pred + mlp_weight * mlp_pred``,
      clipped to [0, 1].

    Hypothesis: MLP may capture different patterns, ensemble diversity helps generalization.

    Args:
        data: ``'full'`` selects the two-solvent mixture feature layout; any
            other value selects the single-solvent layout.
        exp004_weight: Ensemble weight of the tree-model prediction.
        mlp_weight: Ensemble weight of the MLP prediction.
        acs_weight: Weight of the ACS_PCA tree model inside the tree blend.
        spange_weight: Weight of the Spange tree model inside the tree blend.
        n_epochs: Number of passes over the training data for the MLP.
    """
    def __init__(self, data='single', exp004_weight=0.7, mlp_weight=0.3,
                 acs_weight=0.8, spange_weight=0.2, n_epochs=200):
        super().__init__()
        self.data_type = data
        self.mixed = (data == 'full')
        self.targets = ['Product 2', 'Product 3', 'SM']

        # Feature dataframes (shared module-level lookup tables)
        self.spange = SPANGE_DF
        self.acs_pca = ACS_PCA_DF

        # One scaler per feature view, fitted in train_model
        self.scaler_spange = StandardScaler()
        self.scaler_acs = StandardScaler()
        self.scaler_mlp = StandardScaler()

        # exp_004 models: target name -> {'spange': model, 'acs': model}
        self.models = {}

        # MLP, created in train_model once the input width is known
        self.mlp = None

        # Blend weights and training length
        self.exp004_weight = exp004_weight
        self.mlp_weight = mlp_weight
        self.acs_weight = acs_weight
        self.spange_weight = spange_weight
        self.n_epochs = n_epochs

    def _process_columns(self, X):
        """Return the non-solvent feature columns as a list of (n, 1) arrays.

        Order: residence time, temperature, 1000 / (temperature + 273.15),
        log(residence time + 1e-6), the product of the previous two, and --
        for mixture data only -- the ``SolventB%`` column.
        """
        rt = X['Residence Time'].values.astype(np.float64).reshape(-1, 1)
        temp = X['Temperature'].values.astype(np.float64).reshape(-1, 1)

        # Arrhenius kinetic features
        # NOTE(review): the 273.15 offset presumes temperature is in Celsius -- confirm.
        temp_k = temp + 273.15
        inv_temp = 1000.0 / temp_k
        log_time = np.log(rt + 1e-6)  # small offset guards against log(0)
        interaction = inv_temp * log_time

        columns = [rt, temp, inv_temp, log_time, interaction]
        if self.mixed:
            columns.append(X['SolventB%'].values.reshape(-1, 1))
        return columns

    def _solvent_block(self, X, feature_df):
        """Look up per-row solvent descriptors in ``feature_df``.

        For mixture data the descriptors of solvents A and B are linearly
        interpolated with ``SolventB%`` as the weight of B.
        NOTE(review): this presumes ``SolventB%`` is a fraction in [0, 1] -- confirm.
        """
        if not self.mixed:
            return feature_df.loc[X['SOLVENT NAME']].values
        pct = X['SolventB%'].values.reshape(-1, 1)
        A = feature_df.loc[X['SOLVENT A NAME']].values
        B = feature_df.loc[X['SOLVENT B NAME']].values
        return A * (1 - pct) + B * pct

    def _build_features(self, X, feature_df):
        """Build features with Arrhenius kinetics plus one descriptor table."""
        return np.hstack(self._process_columns(X) + [self._solvent_block(X, feature_df)])

    def _build_mlp_features(self, X):
        """Build combined features for MLP (kinetics + Spange + ACS_PCA)."""
        return np.hstack(
            self._process_columns(X)
            + [self._solvent_block(X, self.spange), self._solvent_block(X, self.acs_pca)]
        )

    @staticmethod
    def _make_tree_model(target):
        """Create an unfitted tree regressor for ``target``."""
        if target == 'SM':
            return HistGradientBoostingRegressor(
                max_depth=7, max_iter=700, learning_rate=0.04, random_state=42
            )
        return ExtraTreesRegressor(
            n_estimators=500, max_depth=10, min_samples_leaf=2,
            random_state=42, n_jobs=-1
        )

    def _fit_tree_models(self, X_spange_sc, X_acs_sc, y):
        """Fit one Spange model and one ACS_PCA model per target column of ``y``."""
        for i, target in enumerate(self.targets):
            y_target = y[:, i]
            model_spange = self._make_tree_model(target)
            model_acs = self._make_tree_model(target)
            model_spange.fit(X_spange_sc, y_target)
            model_acs.fit(X_acs_sc, y_target)
            self.models[target] = {'spange': model_spange, 'acs': model_acs}

    def _fit_mlp(self, X_mlp_sc, y):
        """Create the MLP and train it with MSE loss for ``self.n_epochs`` epochs."""
        input_dim = X_mlp_sc.shape[1]
        self.mlp = RegularizedMLP(input_dim, hidden_dims=[256, 128, 64], output_dim=3, dropout=0.4).to(device)

        X_tensor = torch.tensor(X_mlp_sc, dtype=torch.double).to(device)
        y_tensor = torch.tensor(y, dtype=torch.double).to(device)
        dataset = TensorDataset(X_tensor, y_tensor)
        loader = DataLoader(dataset, batch_size=32, shuffle=True)

        # Strong weight decay for regularization
        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=1e-3, weight_decay=1e-3)
        criterion = nn.MSELoss()

        self.mlp.train()
        # Fixed number of epochs; there is no validation-based early stopping.
        for _ in range(self.n_epochs):
            for batch_X, batch_y in loader:
                optimizer.zero_grad()
                pred = self.mlp(batch_X)
                loss = criterion(pred, batch_y)
                loss.backward()
                optimizer.step()

    def train_model(self, X_train, y_train):
        """Fit the scalers, the per-target tree models and the MLP.

        Args:
            X_train: DataFrame with the input columns for the configured
                data layout.
            y_train: DataFrame whose columns are read positionally in the
                order of ``self.targets``.
        """
        # Build features
        X_spange = self._build_features(X_train, self.spange)
        X_acs = self._build_features(X_train, self.acs_pca)
        X_mlp = self._build_mlp_features(X_train)

        # Scale
        X_spange_sc = self.scaler_spange.fit_transform(X_spange)
        X_acs_sc = self.scaler_acs.fit_transform(X_acs)
        X_mlp_sc = self.scaler_mlp.fit_transform(X_mlp)

        y = y_train.values

        self._fit_tree_models(X_spange_sc, X_acs_sc, y)
        self._fit_mlp(X_mlp_sc, y)

    def predict(self, X):
        """Return ensemble predictions for ``X``.

        Returns:
            A CPU ``torch.double`` tensor with one row per row of ``X`` and one
            column per entry of ``self.targets``, clipped to [0, 1].

        Raises:
            RuntimeError: If called before ``train_model``.
        """
        if self.mlp is None or not self.models:
            raise RuntimeError("HybridMLPModel.predict called before train_model")

        # Build features
        X_spange = self._build_features(X, self.spange)
        X_acs = self._build_features(X, self.acs_pca)
        X_mlp = self._build_mlp_features(X)

        X_spange_sc = self.scaler_spange.transform(X_spange)
        X_acs_sc = self.scaler_acs.transform(X_acs)
        X_mlp_sc = self.scaler_mlp.transform(X_mlp)

        # --- exp_004 predictions ---
        preds_exp004 = []
        for target in self.targets:
            p_spange = self.models[target]['spange'].predict(X_spange_sc)
            p_acs = self.models[target]['acs'].predict(X_acs_sc)
            p_combined = self.acs_weight * p_acs + self.spange_weight * p_spange
            preds_exp004.append(p_combined.reshape(-1, 1))
        preds_exp004 = np.hstack(preds_exp004)

        # --- MLP predictions ---
        self.mlp.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X_mlp_sc, dtype=torch.double).to(device)
            preds_mlp = self.mlp(X_tensor).cpu().numpy()

        # --- Ensemble ---
        final_pred = self.exp004_weight * preds_exp004 + self.mlp_weight * preds_mlp
        final_pred = np.clip(final_pred, 0, 1)

        return torch.tensor(final_pred, dtype=torch.double)

print("HybridMLPModel defined")

HybridMLPModel defined


In [6]:
# --- QUICK VALIDATION TEST ---
print("Testing HybridMLPModel...")
X_test, Y_test = load_data("single_solvent")

# Smoke test: train and score only the first three leave-one-solvent-out folds.
errors = []
for i, ((train_X, train_Y), (test_X, test_Y)) in zip(range(3), generate_leave_one_out_splits(X_test, Y_test)):
    model = HybridMLPModel(data='single')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # Mean absolute error over all rows and all three targets of the fold.
    mae = np.abs(preds - test_Y.values).mean()
    errors.append(mae)
    solvent = test_X['SOLVENT NAME'].iloc[0]
    print(f"Single Fold {i} ({solvent}): MAE = {mae:.4f}")

print(f"\nSingle solvent quick test MAE: {np.mean(errors):.4f}")

# Test full data
# Same smoke test on the first three leave-one-ramp-out folds of the mixture data.
print("\nTesting on full data...")
X_full, Y_full = load_data("full")
errors_full = []
for i, ((train_X, train_Y), (test_X, test_Y)) in zip(range(3), generate_leave_one_ramp_out_splits(X_full, Y_full)):
    model = HybridMLPModel(data='full')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    mae = np.abs(preds - test_Y.values).mean()
    errors_full.append(mae)
    print(f"Full Fold {i}: MAE = {mae:.4f}")

print(f"\nFull data quick test MAE: {np.mean(errors_full):.4f}")

Testing HybridMLPModel...


Single Fold 0 (1,1,1,3,3,3-Hexafluoropropan-2-ol): MAE = 0.1483


Single Fold 1 (2,2,2-Trifluoroethanol): MAE = 0.1010


Single Fold 2 (2-Methyltetrahydrofuran [2-MeTHF]): MAE = 0.0361

Single solvent quick test MAE: 0.0952

Testing on full data...


Full Fold 0: MAE = 0.0526


Full Fold 1: MAE = 0.0992


Full Fold 2: MAE = 0.0638

Full data quick test MAE: 0.0719


In [7]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = HybridMLPModel(data='single') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:08,  8.13s/it]

2it [00:16,  8.01s/it]

3it [00:23,  7.82s/it]

4it [00:31,  7.74s/it]

5it [00:39,  7.82s/it]

6it [00:47,  7.90s/it]

7it [00:55,  7.93s/it]

8it [01:03,  7.93s/it]

9it [01:10,  7.88s/it]

10it [01:18,  7.88s/it]

11it [01:26,  7.93s/it]

12it [01:34,  7.97s/it]

13it [01:42,  7.94s/it]

14it [01:50,  7.92s/it]

15it [01:58,  7.90s/it]

16it [02:06,  7.89s/it]

17it [02:14,  7.99s/it]

18it [02:22,  7.96s/it]

19it [02:30,  7.97s/it]

20it [02:38,  7.94s/it]

21it [02:46,  7.91s/it]

22it [02:54,  7.89s/it]

23it [03:01,  7.88s/it]

24it [03:09,  7.89s/it]

24it [03:09,  7.91s/it]




In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = HybridMLPModel(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:12, 12.41s/it]

2it [00:24, 12.46s/it]

3it [00:37, 12.57s/it]

4it [00:49, 12.49s/it]

5it [01:02, 12.47s/it]

6it [01:14, 12.47s/it]

7it [01:27, 12.45s/it]

8it [01:40, 12.54s/it]

9it [01:52, 12.54s/it]

10it [02:05, 12.79s/it]

11it [02:19, 12.94s/it]

12it [02:32, 13.10s/it]

13it [02:46, 13.22s/it]

13it [02:46, 12.78s/it]




In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################