# Experiment 015: Per-Target + MLP Hybrid with COMBINED Features

**Key improvements over exp_014:**
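1. COMBINED features like exp_004 - NOT Spange-only: the ACS_PCA block scaled by 0.8 and the Spange block scaled by 0.2 are concatenated (not summed). Note that the StandardScaler applied inside the model standardizes every column, so these constant factors do not change the model inputs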
2. DEEPER models (depth=None) - NOT shallow like exp_014
3. Add MLP component for non-linear patterns
4. Optuna for the ensemble WEIGHT (the MLP share; the per-target tree models receive the remainder) - not just hyperparameters

**Architecture:**
- Per-target: HGB for SM, ETR for Products (from exp_004)
- MLP: [128, 64, 32] with BatchNorm, ReLU, Dropout
- Ensemble: Optuna-optimized weights

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import optuna
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor
from sklearn.model_selection import GroupKFold
from abc import ABC
import tqdm
import warnings
# Suppress every warning issued through the `warnings` module.
warnings.filterwarnings('ignore')
# Limit Optuna's own logger to WARNING and above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Directory the CSV inputs are read from.
DATA_PATH = '/home/data'

# Make float64 the default dtype for newly created floating-point tensors.
torch.set_default_dtype(torch.double)

# Pick CUDA when PyTorch reports it as available, otherwise use the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"CUDA available: {device.type == 'cuda'}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA available: True
GPU: NVIDIA H100 80GB HBM3


In [2]:
# --- LOAD FEATURES ---
# Target columns, in the order they appear in every Y frame.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]


def load_data(name="full"):
    """Read one yield table and split it into input and target frames.

    Args:
        name: "full" for the mixed-solvent table or "single_solvent" for the
            single-solvent table.

    Returns:
        Tuple ``(X, Y)`` of DataFrames: ``X`` holds the input columns of the
        chosen table, ``Y`` holds the ``TARGET_LABELS`` columns.

    Raises:
        AssertionError: if ``name`` is not one of the two supported values.
    """
    assert name in ["full", "single_solvent"]
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = ["Residence Time", "Temperature", "SOLVENT NAME"]

    df = pd.read_csv(csv_path)
    return df[input_columns], df[TARGET_LABELS]

# Read the two solvent descriptor tables (Spange and ACS_PCA) used to build
# the combined features.
Spange = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv')
ACS_PCA = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv')

print(f"Spange: {Spange.shape}")
print(f"ACS_PCA: {ACS_PCA.shape}")

# Map each solvent name to its float descriptor vector (every column except
# 'SOLVENT NAME', in table order).
Spange_dict = dict(zip(
    Spange['SOLVENT NAME'],
    Spange.drop(columns='SOLVENT NAME').to_numpy(dtype=float),
))
ACS_PCA_dict = dict(zip(
    ACS_PCA['SOLVENT NAME'],
    ACS_PCA.drop(columns='SOLVENT NAME').to_numpy(dtype=float),
))

# Report the descriptor length taken from the first entry of each lookup.
print(f"\nSpange features: {len(next(iter(Spange_dict.values())))}")
print(f"ACS_PCA features: {len(next(iter(ACS_PCA_dict.values())))}")

Spange: (26, 14)
ACS_PCA: (24, 6)

Spange features: 13
ACS_PCA features: 5


In [3]:
# --- LOO UTILITY FUNCTIONS (REQUIRED FOR SUBMISSION) ---
def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out splits for single-solvent data.

    One fold is produced per distinct value of ``X["SOLVENT NAME"]``, visited
    in sorted order (the original note states 24 folds for the competition
    data).

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
        exactly the rows of the held-out solvent.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        is_test = solvent_names == held_out
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield leave-one-ramp-out splits for mixed-solvent data.

    A "ramp" is one distinct (SOLVENT A NAME, SOLVENT B NAME) pair; folds
    follow the order in which the pairs first appear in ``X`` (the original
    note states 13 folds for the competition data).

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
        exactly the rows of the held-out solvent pair.
    """
    unique_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    pair_iter = zip(unique_pairs["SOLVENT A NAME"], unique_pairs["SOLVENT B NAME"])
    for name_a, name_b in pair_iter:
        in_ramp = (X["SOLVENT A NAME"] == name_a) & (X["SOLVENT B NAME"] == name_b)
        outside = ~in_ramp
        yield (X[outside], Y[outside]), (X[in_ramp], Y[in_ramp])


print("LOO utility functions defined")

LOO utility functions defined


In [4]:
# --- BASE CLASSES ---
class SmilesFeaturizer(ABC):
    """Placeholder featurizer interface; both methods must be overridden."""

    def __init__(self):
        # Not constructible as-is: subclasses supply their own __init__.
        raise NotImplementedError

    def featurize(self, X):
        """Turn ``X`` into features; not implemented in this base class."""
        raise NotImplementedError

class BaseModel(ABC):
    """Minimal model interface: fit with ``train_model``, infer with ``predict``."""

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on the given inputs and targets; must be overridden."""
        raise NotImplementedError

    def predict(self, X_test=None):
        """Return predictions for ``X_test``; must be overridden.

        ``X_test`` is optional so the signature accepts both the old
        zero-argument call and the one-argument form that subclasses use.
        """
        raise NotImplementedError

In [5]:
# --- COMBINED FEATURE EXTRACTION (0.8*ACS_PCA + 0.2*Spange) ---
# Constant factors applied to each descriptor block before concatenation.
# NOTE(review): PerTargetMLPHybrid standardizes every feature column after
# extraction, which cancels a constant per-column factor.
SPANGE_WEIGHT = 0.2
ACS_WEIGHT = 0.8


def get_combined_features_single(X):
    """Build the combined feature matrix for single-solvent rows.

    Column layout per row:
    ``[Residence Time, Temperature, ACS_WEIGHT * acs_pca..., SPANGE_WEIGHT * spange...]``.

    A solvent absent from a lookup table contributes a zero vector of that
    table's own descriptor length, so every row has the same width. The
    previous hard-coded ACS_PCA fallback length did not match the table and
    produced ragged rows.

    Args:
        X: DataFrame with 'Residence Time', 'Temperature' and 'SOLVENT NAME'.

    Returns:
        Float array of shape ``(len(X), 2 + n_acs + n_spange)``; the width is
        preserved for an empty ``X``.
    """
    # Take fallback lengths from the tables themselves (0 if a table is empty).
    spange_missing = np.zeros(len(next(iter(Spange_dict.values()), ())))
    acs_missing = np.zeros(len(next(iter(ACS_PCA_dict.values()), ())))

    rows = []
    for res_time, temperature, solvent in zip(
        X['Residence Time'], X['Temperature'], X['SOLVENT NAME']
    ):
        spange = Spange_dict.get(solvent, spange_missing)
        acs_pca = ACS_PCA_dict.get(solvent, acs_missing)
        rows.append(np.concatenate([
            [res_time, temperature],
            ACS_WEIGHT * acs_pca,
            SPANGE_WEIGHT * spange,
        ]))

    if not rows:
        # Keep the result 2-D so callers can still read shape[1].
        return np.empty((0, 2 + len(acs_missing) + len(spange_missing)))
    return np.array(rows)

def get_combined_features_full(X):
    """Build the combined feature matrix for mixed-solvent rows.

    Descriptors of solvent A and solvent B are linearly interpolated with the
    B fraction ``SolventB% / 100``. Column layout per row:
    ``[Residence Time, Temperature, fraction_B, ACS_WEIGHT * acs_mix..., SPANGE_WEIGHT * spange_mix...]``.

    A solvent absent from a lookup table contributes a zero vector of that
    table's own descriptor length, so every row has the same width. The
    previous hard-coded ACS_PCA fallback length did not match the table and
    broke the interpolation.

    Args:
        X: DataFrame with 'Residence Time', 'Temperature', 'SOLVENT A NAME',
            'SOLVENT B NAME' and 'SolventB%'.

    Returns:
        Float array of shape ``(len(X), 3 + n_acs + n_spange)``; the width is
        preserved for an empty ``X``.
    """
    # Take fallback lengths from the tables themselves (0 if a table is empty).
    spange_missing = np.zeros(len(next(iter(Spange_dict.values()), ())))
    acs_missing = np.zeros(len(next(iter(ACS_PCA_dict.values()), ())))

    rows = []
    for res_time, temperature, solvent_a, solvent_b, percent_b in zip(
        X['Residence Time'], X['Temperature'],
        X['SOLVENT A NAME'], X['SOLVENT B NAME'], X['SolventB%'],
    ):
        pct_b = percent_b / 100.0

        spange_a = Spange_dict.get(solvent_a, spange_missing)
        spange_b = Spange_dict.get(solvent_b, spange_missing)
        acs_a = ACS_PCA_dict.get(solvent_a, acs_missing)
        acs_b = ACS_PCA_dict.get(solvent_b, acs_missing)

        # Linear interpolation between the two solvents' descriptors.
        spange_mix = (1 - pct_b) * spange_a + pct_b * spange_b
        acs_mix = (1 - pct_b) * acs_a + pct_b * acs_b

        rows.append(np.concatenate([
            [res_time, temperature, pct_b],
            ACS_WEIGHT * acs_mix,
            SPANGE_WEIGHT * spange_mix,
        ]))

    if not rows:
        # Keep the result 2-D so callers can still read shape[1].
        return np.empty((0, 3 + len(acs_missing) + len(spange_missing)))
    return np.array(rows)

# Smoke-test feature extraction on the first five rows of each dataset and
# print the resulting feature width (second array dimension).
X_test, _ = load_data("single_solvent")
test_feat = get_combined_features_single(X_test.head(5))
print(f"Combined feature dim (single): {test_feat.shape[1]}")

# Same check for the mixed-solvent table, whose rows carry one extra column
# (the solvent-B fraction).
X_test_full, _ = load_data("full")
test_feat_full = get_combined_features_full(X_test_full.head(5))
print(f"Combined feature dim (full): {test_feat_full.shape[1]}")

Combined feature dim (single): 20
Combined feature dim (full): 21


In [6]:
# --- MLP ARCHITECTURE ---
class SimpleMLP(nn.Module):
    """Feed-forward regressor: BatchNorm on the input, then Linear/BatchNorm/ReLU/Dropout stacks.

    Args:
        input_dim: number of input features.
        hidden_dims: sizes of the hidden layers, in order (any iterable of ints).
        output_dim: number of outputs.
        dropout: dropout probability applied after every hidden activation.

    The final Sigmoid keeps every output inside [0, 1].
    """

    def __init__(self, input_dim, hidden_dims=(128, 64, 32), output_dim=3, dropout=0.2):
        super().__init__()
        # Immutable default for hidden_dims avoids sharing one list across calls.
        layers = [nn.BatchNorm1d(input_dim)]

        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, h_dim))
            layers.append(nn.BatchNorm1d(h_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            prev_dim = h_dim

        layers.append(nn.Linear(prev_dim, output_dim))
        layers.append(nn.Sigmoid())  # Output in [0, 1]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run the network on ``x`` (first dimension is the batch).

        A single-sample batch seen while in training mode is evaluated in eval
        mode, presumably because BatchNorm cannot compute batch statistics
        from one sample. Training mode is restored afterwards, even if the
        forward pass raises.
        """
        if x.size(0) == 1 and self.training:
            self.eval()
            try:
                return self.network(x)
            finally:
                self.train()
        return self.network(x)


print("SimpleMLP defined")

SimpleMLP defined


In [7]:
# --- PER-TARGET + MLP HYBRID MODEL ---
class PerTargetMLPHybrid(BaseModel):
    """Blend of a multi-output MLP with per-target tree regressors.

    Components fitted by ``train_model``:
    - ExtraTreesRegressor on target columns 0 and 1 (the two products)
    - HistGradientBoostingRegressor on target column 2 (SM)
    - SimpleMLP on all three target columns

    ``predict`` mixes the MLP output with the stacked tree output using
    ``weights['mlp']`` and ``1 - weights['mlp']``. The ``'hgb'`` and ``'etr'``
    entries of ``weights`` are stored but never read by this class.
    """

    def __init__(self, data='single', weights=None):
        super().__init__()
        self.data = data
        # Any falsy ``weights`` (None or an empty dict) selects the defaults.
        self.weights = weights if weights else {'mlp': 0.3, 'hgb': 0.35, 'etr': 0.35}
        self.scaler = StandardScaler()
        self.mlp = None
        self.hgb = None
        self.etr = None

    def _get_features(self, X):
        """Dispatch to the single- or mixed-solvent feature extractor."""
        extract = (
            get_combined_features_single
            if self.data == 'single'
            else get_combined_features_full
        )
        return extract(X)

    def train_model(self, X_train, y_train):
        """Fit the scaler and all three component models on the training data."""
        raw_features = self._get_features(X_train)
        targets = y_train.values
        scaled = self.scaler.fit_transform(raw_features)

        self._fit_mlp(scaled, targets)
        self._fit_hgb(scaled, targets[:, 2])
        self._fit_etr(scaled, targets[:, :2])

    def _fit_mlp(self, features, targets):
        """Train the MLP on all targets: 100 epochs of Adam on MSE, batches of 32."""
        self.mlp = SimpleMLP(
            features.shape[1], hidden_dims=[128, 64, 32], output_dim=3, dropout=0.2
        ).to(device)
        inputs = torch.tensor(features, dtype=torch.double).to(device)
        labels = torch.tensor(targets, dtype=torch.double).to(device)
        batches = DataLoader(TensorDataset(inputs, labels), batch_size=32, shuffle=True)
        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=1e-3)
        loss_fn = nn.MSELoss()

        self.mlp.train()
        for _ in range(100):
            for batch_inputs, batch_labels in batches:
                optimizer.zero_grad()
                loss_fn(self.mlp(batch_inputs), batch_labels).backward()
                optimizer.step()

    def _fit_hgb(self, features, sm_target):
        """Fit the gradient-boosting regressor on the SM column."""
        self.hgb = HistGradientBoostingRegressor(
            max_depth=None,  # no explicit depth cap
            learning_rate=0.1,
            max_iter=200,
            random_state=42
        )
        self.hgb.fit(features, sm_target)

    def _fit_etr(self, features, product_targets):
        """Fit the extra-trees regressor on the two product columns."""
        self.etr = ExtraTreesRegressor(
            n_estimators=200,
            max_depth=None,  # no explicit depth cap
            min_samples_split=2,
            random_state=42,
            n_jobs=-1
        )
        self.etr.fit(features, product_targets)

    def predict(self, X_test):
        """Return the blended prediction as a tensor with one column per target.

        Values are clipped to [0, 1] after blending.
        """
        scaled = self.scaler.transform(self._get_features(X_test))

        # MLP covers all three targets.
        self.mlp.eval()
        with torch.no_grad():
            mlp_out = self.mlp(
                torch.tensor(scaled, dtype=torch.double).to(device)
            ).cpu().numpy()

        # Trees: HGB supplies SM, ETR supplies the two products.
        sm_out = self.hgb.predict(scaled).reshape(-1, 1)
        products_out = self.etr.predict(scaled)
        tree_out = np.column_stack([products_out, sm_out])

        # The MLP gets weights['mlp']; the tree stack gets the remainder.
        mlp_share = self.weights['mlp']
        blended = mlp_share * mlp_out + (1 - mlp_share) * tree_out
        return torch.tensor(np.clip(blended, 0, 1))


print("PerTargetMLPHybrid defined")

PerTargetMLPHybrid defined


In [8]:
# --- OPTUNA OPTIMIZATION FOR ENSEMBLE WEIGHTS ---
print("Running Optuna optimization for ensemble weights...")

X_single, Y_single = load_data("single_solvent")
X_feat_single = get_combined_features_single(X_single)
y_single = Y_single.values
groups_single = X_single["SOLVENT NAME"].values


def _collect_fold_predictions(n_splits=5):
    """Fit one hybrid model per GroupKFold fold and cache its component predictions.

    The blend weight is only read in ``PerTargetMLPHybrid.predict``, never in
    ``train_model``, so the models need fitting once per fold rather than once
    per trial. Refitting per trial also mixed training randomness (no seed is
    set for the MLP in this notebook) into the comparison between weights.

    Returns:
        List of ``(mlp_pred, gbdt_pred, y_val)`` arrays, one tuple per fold.
        Both predictions come from ``predict`` and are therefore already
        clipped to [0, 1].
    """
    cached = []
    splitter = GroupKFold(n_splits=n_splits)
    for train_idx, val_idx in splitter.split(X_feat_single, y_single, groups=groups_single):
        model = PerTargetMLPHybrid(data='single')
        model.train_model(X_single.iloc[train_idx], Y_single.iloc[train_idx])

        X_val_df = X_single.iloc[val_idx]
        # predict() reads only weights['mlp']; 1.0 and 0.0 isolate each component.
        model.weights = {'mlp': 1.0}
        mlp_pred = model.predict(X_val_df).numpy()
        model.weights = {'mlp': 0.0}
        gbdt_pred = model.predict(X_val_df).numpy()

        cached.append((mlp_pred, gbdt_pred, y_single[val_idx]))
    return cached


# Train once up front; every Optuna trial reuses these predictions.
_fold_predictions = _collect_fold_predictions()


def objective_weights(trial):
    """Return the mean validation MAE over the cached folds for one MLP weight."""
    # Ensemble weight to optimize
    mlp_weight = trial.suggest_float('mlp_weight', 0.1, 0.6)

    errors = []
    for mlp_pred, gbdt_pred, y_val in _fold_predictions:
        # Same blend and clip that PerTargetMLPHybrid.predict applies.
        blended = np.clip(mlp_weight * mlp_pred + (1 - mlp_weight) * gbdt_pred, 0, 1)
        errors.append(np.mean(np.abs(blended - y_val)))

    return np.mean(errors)

# Run Optuna (each trial is now only a cheap re-blend of cached predictions)
study_weights = optuna.create_study(direction='minimize')
study_weights.optimize(objective_weights, n_trials=20, show_progress_bar=True)

print(f"\nBest MLP weight: {study_weights.best_params['mlp_weight']:.4f}")
print(f"Best GroupKFold CV: {study_weights.best_value:.6f}")

best_mlp_weight = study_weights.best_params['mlp_weight']

Running Optuna optimization for ensemble weights...


  0%|          | 0/20 [00:00<?, ?it/s]


Best MLP weight: 0.5012
Best GroupKFold CV: 0.070156


In [None]:
# --- QUICK VALIDATION TEST ---
# Sanity-check the tuned blend on the first three leave-one-solvent-out folds
# before running the full submission loops.
print("Quick test of PerTargetMLPHybrid with LOO...")

# `best_weights` is reused by the submission cells below. Only the 'mlp' entry
# is read by PerTargetMLPHybrid.predict; 'hgb' and 'etr' are informational.
best_weights = {'mlp': best_mlp_weight, 'hgb': (1-best_mlp_weight)/2, 'etr': (1-best_mlp_weight)/2}
print(f"Using weights: {best_weights}")

errors = []
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_out_splits(X_single, Y_single)):
    # Stop after three folds; this is only a quick check.
    if i >= 3: break
    # Every test row in a fold shares one solvent, so the first row names it.
    solvent = test_X['SOLVENT NAME'].iloc[0]
    model = PerTargetMLPHybrid(data='single', weights=best_weights)
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # MAE averaged over all rows and all three targets of the fold.
    mae = np.mean(np.abs(preds - test_Y.values))
    errors.append(mae)
    print(f"Fold {i} ({solvent}): MAE = {mae:.4f}")

print(f"\nQuick test MAE (3 folds): {np.mean(errors):.4f}")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = PerTargetMLPHybrid(data='single', weights=best_weights) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = PerTargetMLPHybrid(data='full', weights=best_weights) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################