# Experiment 014: Per-Target + Optuna Optimization

**Key approach:**
1. Per-target models: HGB for SM, ExtraTrees for Products (best CV from exp_004)
2. Optuna hyperparameter optimization (key missing piece from top kernel)
3. GroupKFold (5-fold) INTERNALLY for Optuna (faster iteration)
4. LOO for final submission (REQUIRED by evaluation metric)

**Hyperparameters to optimize:**
- HGB: max_depth, learning_rate, max_iter
- ETR: max_depth, n_estimators, min_samples_split

In [1]:
import numpy as np
import pandas as pd
import torch
import optuna
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor
from sklearn.model_selection import GroupKFold
from abc import ABC
import tqdm
import warnings
# Suppress every Python warning and limit Optuna's logger to WARNING and above
# so the notebook output stays short.
# NOTE(review): the blanket 'ignore' filter also hides warnings that may matter
# (e.g. from sklearn) -- consider narrowing it.
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Directory containing the CSV files read by load_data() and the Spange lookup.
DATA_PATH = '/home/data'
# Make double precision the default floating-point dtype for torch.
torch.set_default_dtype(torch.double)
print(f"CUDA available: {torch.cuda.is_available()}")

CUDA available: True


In [2]:
# --- UTILITY FUNCTIONS ---
# Names of the two numeric process-condition columns.
# NOTE(review): not referenced anywhere else in the visible notebook.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Target columns, in the order load_data() returns them as Y.
# Downstream code indexes them positionally: 0 and 1 are the products, 2 is SM.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one of the two yield CSVs and split it into inputs and targets.

    Args:
        name: "full" for the mixed-solvent file, or "single_solvent" for the
            single-solvent file.

    Returns:
        A tuple ``(X, Y)`` of DataFrames: ``X`` holds the input columns of the
        chosen dataset and ``Y`` holds the ``TARGET_LABELS`` columns.

    Raises:
        AssertionError: if ``name`` is not one of the two supported values.
    """
    assert name in ["full", "single_solvent"]

    # Pick the file and its input columns first, then read once.
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = ["Residence Time", "Temperature", "SOLVENT NAME"]

    frame = pd.read_csv(csv_path)
    inputs = frame[input_columns]
    targets = frame[TARGET_LABELS]
    return inputs, targets

# Load Spange descriptors
Spange = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv')
print(f"Spange: {Spange.shape}")
# Map each solvent name to an array of that row's remaining columns (the
# descriptor values). In the recorded run the table has shape (26, 14), i.e.
# 13 descriptors per solvent, which matches the np.zeros(13) fallback used by
# the feature-extraction functions. If a solvent name occurs more than once,
# the last row wins.
Spange_dict = {row['SOLVENT NAME']: row.drop('SOLVENT NAME').values for _, row in Spange.iterrows()}

Spange: (26, 14)


In [3]:
# --- LOO UTILITY FUNCTIONS (REQUIRED FOR SUBMISSION) ---
def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out splits for the single-solvent data.

    One fold is produced per distinct value of ``X["SOLVENT NAME"]``, visited
    in sorted order. Each fold is ``((train_X, train_Y), (test_X, test_Y))``
    where the test part contains exactly the rows of the held-out solvent.
    """
    solvent_column = X["SOLVENT NAME"]
    held_out_order = list(solvent_column.unique())
    held_out_order.sort()

    for held_out in held_out_order:
        is_test = solvent_column == held_out
        is_train = ~is_test
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield leave-one-ramp-out splits for the full (mixed-solvent) data.

    A "ramp" is a distinct (SOLVENT A NAME, SOLVENT B NAME) pair. Ramps are
    visited in order of first appearance in ``X``. Each fold is
    ``((train_X, train_Y), (test_X, test_Y))`` where the test part contains
    exactly the rows belonging to the held-out pair.
    """
    solvent_a = X["SOLVENT A NAME"]
    solvent_b = X["SOLVENT B NAME"]
    unique_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()

    pair_iter = zip(unique_pairs["SOLVENT A NAME"], unique_pairs["SOLVENT B NAME"])
    for name_a, name_b in pair_iter:
        in_ramp = (solvent_a == name_a) & (solvent_b == name_b)
        outside_ramp = ~in_ramp
        yield (X[outside_ramp], Y[outside_ramp]), (X[in_ramp], Y[in_ramp])

# Status message confirming that both split generators are now defined.
print("LOO utility functions defined")

LOO utility functions defined


In [4]:
# --- BASE CLASSES ---
class SmilesFeaturizer(ABC):
    """Interface stub for featurizers; both methods must be overridden."""

    def __init__(self):
        # Instantiating the stub directly is an error.
        raise NotImplementedError

    def featurize(self, X):
        """Turn ``X`` into features; not implemented in this stub."""
        raise NotImplementedError

class BaseModel(ABC):
    """Interface stub for models: constructible, but training and prediction
    must be supplied by a subclass."""

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on the training data; not implemented in this stub."""
        raise NotImplementedError

    def predict(self):
        """Produce predictions; not implemented in this stub."""
        raise NotImplementedError

In [5]:
# --- FEATURE EXTRACTION ---
def get_features_single(X):
    """Build the feature matrix for single-solvent rows.

    Each output row is ``[Residence Time, Temperature, *descriptors]`` where
    the descriptors come from ``Spange_dict`` for the row's ``SOLVENT NAME``.
    A solvent missing from the lookup contributes 13 zeros instead.

    Returns:
        A numpy array with one row per row of ``X``.
    """
    row_values = zip(X['Residence Time'], X['Temperature'], X['SOLVENT NAME'])
    return np.array([
        np.concatenate([[residence_time, temperature],
                        Spange_dict.get(solvent_name, np.zeros(13))])
        for residence_time, temperature, solvent_name in row_values
    ])

def get_features_full(X):
    """Build the feature matrix for mixed-solvent rows.

    ``SolventB%`` is converted to a fraction, and the descriptors of solvents
    A and B (from ``Spange_dict``) are blended linearly with that fraction.
    Each output row is
    ``[Residence Time, Temperature, fraction_b, *blended_descriptors]``.
    A solvent missing from the lookup contributes 13 zeros instead.

    Returns:
        A numpy array with one row per row of ``X``.
    """
    row_values = zip(
        X['Residence Time'],
        X['Temperature'],
        X['SOLVENT A NAME'],
        X['SOLVENT B NAME'],
        X['SolventB%'],
    )

    rows = []
    for residence_time, temperature, name_a, name_b, percent_b in row_values:
        fraction_b = percent_b / 100.0
        descriptors_a = Spange_dict.get(name_a, np.zeros(13))
        descriptors_b = Spange_dict.get(name_b, np.zeros(13))
        # Linear interpolation between the two pure-solvent descriptor vectors.
        blended = (1 - fraction_b) * descriptors_a + fraction_b * descriptors_b
        rows.append(np.concatenate([[residence_time, temperature, fraction_b], blended]))
    return np.array(rows)

# Status message confirming that both feature-extraction helpers are defined.
print("Feature extraction functions defined")

Feature extraction functions defined


In [6]:
# --- OPTUNA OPTIMIZATION FOR SINGLE SOLVENT DATA ---
print("Running Optuna optimization for single solvent data...")

# Module-level arrays read by objective_single(); they are computed once here
# so that each Optuna trial does not repeat loading and featurization.
X_single, Y_single = load_data("single_solvent")
X_feat_single = get_features_single(X_single)
y_single = Y_single.values
# One group label per row (the solvent name), so GroupKFold keeps all rows of a
# solvent in the same fold.
groups_single = X_single["SOLVENT NAME"].values

def objective_single(trial):
    """Optuna objective for the single-solvent data.

    Samples one set of HistGradientBoosting and ExtraTrees hyperparameters
    from ``trial``, then scores it with 5-fold GroupKFold over the module-level
    ``X_feat_single`` / ``y_single`` / ``groups_single`` arrays.

    Per fold: features are standardized on the training rows, target column 2
    is fitted with HistGradientBoostingRegressor, target columns 0-1 with
    ExtraTreesRegressor, predictions are clipped to [0, 1], and the MAE over
    all three targets is recorded.

    Returns:
        The mean of the per-fold MAE values (lower is better).
    """
    # Sampling order is kept fixed: HGB settings first, then ExtraTrees.
    hgb_kwargs = dict(
        max_depth=trial.suggest_int('hgb_depth', 3, 10),
        learning_rate=trial.suggest_float('hgb_lr', 0.01, 0.3, log=True),
        max_iter=trial.suggest_int('hgb_iter', 100, 500),
    )
    etr_kwargs = dict(
        max_depth=trial.suggest_int('etr_depth', 5, 20),
        n_estimators=trial.suggest_int('etr_n_estimators', 100, 500),
        min_samples_split=trial.suggest_int('etr_min_samples', 2, 10),
    )

    fold_maes = []
    splitter = GroupKFold(n_splits=5)
    for fit_rows, held_rows in splitter.split(X_feat_single, y_single, groups=groups_single):
        fit_y, held_y = y_single[fit_rows], y_single[held_rows]

        # Standardize using statistics of the training rows only.
        scaler = StandardScaler()
        fit_X = scaler.fit_transform(X_feat_single[fit_rows])
        held_X = scaler.transform(X_feat_single[held_rows])

        # SM (target column 2) -> gradient boosting.
        sm_model = HistGradientBoostingRegressor(random_state=42, **hgb_kwargs)
        sm_model.fit(fit_X, fit_y[:, 2])

        # Products (target columns 0 and 1) -> extra trees, fitted jointly.
        product_model = ExtraTreesRegressor(random_state=42, n_jobs=-1, **etr_kwargs)
        product_model.fit(fit_X, fit_y[:, :2])

        # Reassemble the columns in target order and clip to the valid range.
        stacked = np.column_stack([product_model.predict(held_X), sm_model.predict(held_X)])
        clipped = np.clip(stacked, 0, 1)
        fold_maes.append(np.mean(np.abs(clipped - held_y)))

    return np.mean(fold_maes)

# Run Optuna.
# The sampler is seeded so the search -- and therefore best_params and the
# final submission -- is reproducible across notebook runs. The estimators
# inside the objective are already seeded with random_state=42; without a
# seeded sampler every run would still explore different hyperparameters.
study_single = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_single.optimize(objective_single, n_trials=50, show_progress_bar=True)

print(f"\nBest single solvent params: {study_single.best_params}")
print(f"Best single solvent CV (GroupKFold): {study_single.best_value:.6f}")

Running Optuna optimization for single solvent data...


  0%|          | 0/50 [00:00<?, ?it/s]


Best single solvent params: {'hgb_depth': 3, 'hgb_lr': 0.09437852301949078, 'hgb_iter': 326, 'etr_depth': 20, 'etr_n_estimators': 494, 'etr_min_samples': 8}
Best single solvent CV (GroupKFold): 0.078348


In [7]:
# --- OPTUNA OPTIMIZATION FOR FULL DATA ---
print("Running Optuna optimization for full data...")

# Module-level arrays read by objective_full(); they are computed once here so
# that each Optuna trial does not repeat loading and featurization.
X_full, Y_full = load_data("full")
X_feat_full = get_features_full(X_full)
y_full = Y_full.values
# One group label per row, "<solvent A>_<solvent B>", so GroupKFold keeps all
# rows of a solvent pair in the same fold.
groups_full = (X_full["SOLVENT A NAME"].astype(str) + "_" + X_full["SOLVENT B NAME"].astype(str)).values

def objective_full(trial):
    """Optuna objective for the full (mixed-solvent) data.

    Samples one set of HistGradientBoosting and ExtraTrees hyperparameters
    from ``trial``, then scores it with 5-fold GroupKFold over the module-level
    ``X_feat_full`` / ``y_full`` / ``groups_full`` arrays.

    Per fold: features are standardized on the training rows, target column 2
    is fitted with HistGradientBoostingRegressor, target columns 0-1 with
    ExtraTreesRegressor, predictions are clipped to [0, 1], and the MAE over
    all three targets is recorded.

    Returns:
        The mean of the per-fold MAE values (lower is better).
    """
    # Sampling order is kept fixed: HGB settings first, then ExtraTrees.
    hgb_kwargs = dict(
        max_depth=trial.suggest_int('hgb_depth', 3, 10),
        learning_rate=trial.suggest_float('hgb_lr', 0.01, 0.3, log=True),
        max_iter=trial.suggest_int('hgb_iter', 100, 500),
    )
    etr_kwargs = dict(
        max_depth=trial.suggest_int('etr_depth', 5, 20),
        n_estimators=trial.suggest_int('etr_n_estimators', 100, 500),
        min_samples_split=trial.suggest_int('etr_min_samples', 2, 10),
    )

    fold_maes = []
    splitter = GroupKFold(n_splits=5)
    for fit_rows, held_rows in splitter.split(X_feat_full, y_full, groups=groups_full):
        fit_y, held_y = y_full[fit_rows], y_full[held_rows]

        # Standardize using statistics of the training rows only.
        scaler = StandardScaler()
        fit_X = scaler.fit_transform(X_feat_full[fit_rows])
        held_X = scaler.transform(X_feat_full[held_rows])

        # SM (target column 2) -> gradient boosting.
        sm_model = HistGradientBoostingRegressor(random_state=42, **hgb_kwargs)
        sm_model.fit(fit_X, fit_y[:, 2])

        # Products (target columns 0 and 1) -> extra trees, fitted jointly.
        product_model = ExtraTreesRegressor(random_state=42, n_jobs=-1, **etr_kwargs)
        product_model.fit(fit_X, fit_y[:, :2])

        # Reassemble the columns in target order and clip to the valid range.
        stacked = np.column_stack([product_model.predict(held_X), sm_model.predict(held_X)])
        clipped = np.clip(stacked, 0, 1)
        fold_maes.append(np.mean(np.abs(clipped - held_y)))

    return np.mean(fold_maes)

# Run Optuna.
# The sampler is seeded so the search -- and therefore best_params and the
# final submission -- is reproducible across notebook runs. The estimators
# inside the objective are already seeded with random_state=42; without a
# seeded sampler every run would still explore different hyperparameters.
study_full = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_full.optimize(objective_full, n_trials=50, show_progress_bar=True)

print(f"\nBest full data params: {study_full.best_params}")
print(f"Best full data CV (GroupKFold): {study_full.best_value:.6f}")

Running Optuna optimization for full data...


  0%|          | 0/50 [00:00<?, ?it/s]


Best full data params: {'hgb_depth': 4, 'hgb_lr': 0.19681752326845484, 'hgb_iter': 208, 'etr_depth': 6, 'etr_n_estimators': 188, 'etr_min_samples': 3}
Best full data CV (GroupKFold): 0.083789


In [8]:
# --- OPTUNA-OPTIMIZED PER-TARGET MODEL ---
class OptunaPerTargetModel(BaseModel):
    """Two-estimator model whose hyperparameters come from an Optuna search.

    Target column 2 (SM) is predicted by a HistGradientBoostingRegressor and
    target columns 0-1 (the products) by a single multi-output
    ExtraTreesRegressor. Features are standardized with a StandardScaler
    fitted on the training data.

    Args:
        data: 'single' selects the single-solvent featurizer; any other value
            selects the full (mixed-solvent) featurizer.
        params: optional dict with keys 'hgb_depth', 'hgb_lr', 'hgb_iter',
            'etr_depth', 'etr_n_estimators', 'etr_min_samples'. Missing keys
            fall back to the defaults used in ``train_model``.
    """

    def __init__(self, data='single', params=None):
        super().__init__()
        self.data = data
        self.params = params or {}
        self.scaler = StandardScaler()
        # Both estimators are created in train_model().
        self.hgb = None
        self.etr = None

    def _get_features(self, X):
        """Featurize ``X`` with the helper matching ``self.data``."""
        featurize = get_features_single if self.data == 'single' else get_features_full
        return featurize(X)

    def train_model(self, X_train, y_train):
        """Fit the scaler and both estimators.

        Args:
            X_train: DataFrame of raw inputs accepted by the featurizer.
            y_train: DataFrame whose columns 0-1 are the product targets and
                whose column 2 is the SM target.
        """
        raw_features = self._get_features(X_train)
        targets = y_train.values
        scaled = self.scaler.fit_transform(raw_features)

        # Look up each hyperparameter, falling back to a default when absent.
        setting = self.params.get
        hgb_kwargs = {
            'max_depth': setting('hgb_depth', 5),
            'learning_rate': setting('hgb_lr', 0.1),
            'max_iter': setting('hgb_iter', 200),
        }
        etr_kwargs = {
            'max_depth': setting('etr_depth', 10),
            'n_estimators': setting('etr_n_estimators', 200),
            'min_samples_split': setting('etr_min_samples', 2),
        }

        # SM (target column 2).
        self.hgb = HistGradientBoostingRegressor(random_state=42, **hgb_kwargs)
        self.hgb.fit(scaled, targets[:, 2])

        # Products (target columns 0 and 1), fitted jointly.
        self.etr = ExtraTreesRegressor(random_state=42, n_jobs=-1, **etr_kwargs)
        self.etr.fit(scaled, targets[:, :2])

    def predict(self, X_test):
        """Predict all three targets for ``X_test``.

        Returns:
            A torch tensor with one row per input row and three columns in
            target order (products from ExtraTrees, then SM from HGB), with
            every value clipped to [0, 1].
        """
        scaled = self.scaler.transform(self._get_features(X_test))
        stacked = np.column_stack([self.etr.predict(scaled), self.hgb.predict(scaled)])
        return torch.tensor(np.clip(stacked, 0, 1))

# Store best params
# Snapshot the winning hyperparameters of each study; these dicts are passed
# as `params` to OptunaPerTargetModel in the validation and submission cells.
best_params_single = study_single.best_params
best_params_full = study_full.best_params

print(f"Best single params: {best_params_single}")
print(f"Best full params: {best_params_full}")

Best single params: {'hgb_depth': 3, 'hgb_lr': 0.09437852301949078, 'hgb_iter': 326, 'etr_depth': 20, 'etr_n_estimators': 494, 'etr_min_samples': 8}
Best full params: {'hgb_depth': 4, 'hgb_lr': 0.19681752326845484, 'hgb_iter': 208, 'etr_depth': 6, 'etr_n_estimators': 188, 'etr_min_samples': 3}


In [9]:
# --- QUICK VALIDATION TEST ---
print("Quick test of OptunaPerTargetModel with LOO...")
X_test, Y_test = load_data("single_solvent")

# Smoke test: evaluate only the first three leave-one-solvent-out folds.
errors = []
quick_splits = generate_leave_one_out_splits(X_test, Y_test)
for i, ((train_X, train_Y), (test_X, test_Y)) in zip(range(3), quick_splits):
    model = OptunaPerTargetModel(data='single', params=best_params_single)
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()

    # MAE over all three targets for the held-out solvent.
    mae = np.mean(np.abs(preds - test_Y.values))
    errors.append(mae)

    solvent = test_X['SOLVENT NAME'].iloc[0]
    print(f"Fold {i} ({solvent}): MAE = {mae:.4f}")

print(f"\nQuick test MAE (3 folds): {np.mean(errors):.4f}")

Quick test of OptunaPerTargetModel with LOO...


Fold 0 (1,1,1,3,3,3-Hexafluoropropan-2-ol): MAE = 0.1687


Fold 1 (2,2,2-Trifluoroethanol): MAE = 0.1166


Fold 2 (2-Methyltetrahydrofuran [2-MeTHF]): MAE = 0.0359

Quick test MAE (3 folds): 0.1071


In [10]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = OptunaPerTargetModel(data='single', params=best_params_single) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:00,  2.91it/s]

2it [00:00,  2.86it/s]

3it [00:01,  2.85it/s]

4it [00:01,  2.78it/s]

5it [00:01,  2.78it/s]

6it [00:02,  2.84it/s]

7it [00:02,  2.81it/s]

8it [00:02,  2.83it/s]

9it [00:03,  2.84it/s]

10it [00:03,  2.86it/s]

11it [00:03,  2.87it/s]

12it [00:04,  2.90it/s]

13it [00:04,  2.86it/s]

14it [00:04,  2.87it/s]

15it [00:05,  2.88it/s]

16it [00:05,  2.88it/s]

17it [00:05,  2.87it/s]

18it [00:06,  2.84it/s]

19it [00:06,  2.84it/s]

20it [00:07,  2.85it/s]

21it [00:07,  2.84it/s]

22it [00:07,  2.84it/s]

23it [00:08,  2.87it/s]

24it [00:08,  2.88it/s]

24it [00:08,  2.85it/s]




In [11]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = OptunaPerTargetModel(data='full', params=best_params_full) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:00,  4.12it/s]

2it [00:00,  4.25it/s]

3it [00:00,  4.17it/s]

4it [00:00,  4.13it/s]

5it [00:01,  4.12it/s]

6it [00:01,  4.23it/s]

7it [00:01,  4.16it/s]

8it [00:01,  4.16it/s]

9it [00:02,  4.19it/s]

10it [00:02,  4.21it/s]

11it [00:02,  4.30it/s]

12it [00:02,  4.30it/s]

13it [00:03,  4.34it/s]

13it [00:03,  4.23it/s]




In [12]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################