# Experiment 017: Replicate exp_004's EXACT Architecture

**CRITICAL FIX**: exp_016 used FEATURE combination, but exp_004 uses PREDICTION combination.

**exp_004's architecture (CV 0.0623):**
1. Train SEPARATE models on spange and acs_pca features for EACH target
2. Use Arrhenius kinetics features (inv_temp, log_time, interaction)
3. Combine PREDICTIONS: 0.8 * acs_pred + 0.2 * spange_pred
4. HistGradientBoostingRegressor (HGB) for SM (max_depth=7, max_iter=700, learning_rate=0.04)
5. ExtraTreesRegressor (ETR) for Products (n_estimators=500, max_depth=10, min_samples_leaf=2)

**Expected result:**
- Full data CV should match exp_004's 0.0603
- Single solvent CV should be ~0.0659
- Combined CV should be ~0.0623

In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor
from abc import ABC
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Directory holding the yield CSVs and descriptor lookup CSVs that
# load_data / load_features read.
DATA_PATH = '/home/data'
# Make float64 the default dtype for floating-point torch tensors.
torch.set_default_dtype(torch.double)
print("Setup complete")

Setup complete


In [2]:
# --- UTILITY FUNCTIONS ---
# Names of the two numeric process-condition columns in the yield CSVs.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Target columns; load_data returns Y with its columns in exactly this order.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Load one of the two yield datasets and split it into inputs/targets.

    Args:
        name: ``"full"`` for the mixed-solvent table (solvent A/B names plus
            ``SolventB%``) or ``"single_solvent"`` for the single-solvent
            table. Any other value fails the assertion below.

    Returns:
        ``(X, Y)`` where ``X`` holds the input columns for the chosen dataset
        and ``Y`` holds the ``TARGET_LABELS`` columns of the same CSV.
    """
    assert name in ["full", "single_solvent"]

    if name == "single_solvent":
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = ["Residence Time", "Temperature", "SOLVENT NAME"]
    else:
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = [
            "Residence Time",
            "Temperature",
            "SOLVENT A NAME",
            "SOLVENT B NAME",
            "SolventB%",
        ]

    frame = pd.read_csv(csv_path)
    return frame[input_columns], frame[TARGET_LABELS]

def load_features(name="spange_descriptors"):
    """Read the ``<name>_lookup.csv`` descriptor table under ``DATA_PATH``.

    Args:
        name: Stem of the lookup file (``_lookup.csv`` is appended).

    Returns:
        A DataFrame whose index is the first CSV column.
    """
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    table = pd.read_csv(lookup_path, index_col=0)
    return table

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out train/test splits.

    Solvents are visited in sorted order of the unique ``"SOLVENT NAME"``
    values. Each item is ``((X_train, Y_train), (X_test, Y_test))`` where the
    test part contains exactly the rows of the held-out solvent.
    """
    solvent_column = X["SOLVENT NAME"]
    for held_out_name in sorted(solvent_column.unique()):
        is_train = solvent_column != held_out_name
        is_test = ~is_train
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield train/test splits that hold out one (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    every row whose A and B solvent names both match the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    unique_pairs = X[pair_columns].drop_duplicates()
    for solvent_a, solvent_b in unique_pairs.itertuples(index=False, name=None):
        matches_a = X["SOLVENT A NAME"] == solvent_a
        matches_b = X["SOLVENT B NAME"] == solvent_b
        held_out = matches_a & matches_b
        kept = ~held_out
        yield (X[kept], Y[kept]), (X[held_out], Y[held_out])

# Load the two solvent descriptor lookup tables once at module level.
# HybridPerTargetModel reads these globals in its constructor and indexes
# them by solvent name, so the index of each table must contain every solvent
# name present in the yield data.
SPANGE_DF = load_features('spange_descriptors')
ACS_PCA_DF = load_features('acs_pca_descriptors')
# Shapes are printed as a quick sanity check of what was loaded.
print(f"Spange: {SPANGE_DF.shape}, ACS_PCA: {ACS_PCA_DF.shape}")

Spange: (26, 13), ACS_PCA: (24, 5)


In [3]:
# --- BASE CLASSES ---
class SmilesFeaturizer(ABC):
    """Interface stub for featurizers.

    Both the constructor and ``featurize`` raise ``NotImplementedError``, so
    a subclass has to override them before it can be used.
    """

    def __init__(self):
        # Instantiating this base class directly always raises.
        raise NotImplementedError

    def featurize(self, X):
        """Convert ``X`` into features; must be overridden by subclasses."""
        raise NotImplementedError

class BaseModel(ABC):
    """Minimal model interface: ``train_model`` fits, ``predict`` infers.

    The class has no abstract methods, so it can be instantiated; both
    interface methods raise ``NotImplementedError`` until a subclass
    overrides them.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on ``X_train`` / ``y_train``; must be overridden."""
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for ``X``; must be overridden.

        ``X`` is part of the base signature so it agrees with how subclasses
        and callers in this file use ``predict(X)``. It defaults to ``None``
        so an argument-less ``predict()`` call still reaches the
        ``NotImplementedError`` below rather than failing with ``TypeError``.
        """
        raise NotImplementedError

In [4]:
# --- EXACT REPLICATION OF exp_004's HybridPerTargetModel ---
class HybridPerTargetModel(BaseModel):
    """Per-target tree models whose predictions from two descriptor sets are blended.

    Intended (per the notebook notes) as a replication of exp_004's architecture:

    - For EACH target, one model is fitted on Spange-descriptor features and a
      SEPARATE model on ACS-PCA-descriptor features.
    - The two PREDICTIONS are blended (default: 0.8 * acs_pred + 0.2 * spange_pred);
      the two feature sets are never concatenated.
    - 'SM' uses HistGradientBoostingRegressor (max_depth=7, max_iter=700,
      learning_rate=0.04).
    - Every other target uses ExtraTreesRegressor (n_estimators=500,
      max_depth=10, min_samples_leaf=2).
    - Arrhenius-style kinetic features (inv_temp, log_time, interaction) are
      added to the raw inputs.
    - No training-data augmentation and no test-time augmentation (TTA).

    Args:
        data: ``'full'`` selects the mixed-solvent feature layout (solvent A/B
            names plus ``SolventB%``); any other value selects the
            single-solvent layout (``SOLVENT NAME``).
        acs_weight: Weight applied to the ACS-PCA model's prediction.
        spange_weight: Weight applied to the Spange model's prediction.
    """

    def __init__(self, data='single', acs_weight=0.8, spange_weight=0.2):
        self.data_type = data
        self.mixed = (data == 'full')
        self.targets = ['Product 2', 'Product 3', 'SM']

        # Descriptor lookup tables (module-level globals), indexed by solvent name.
        self.spange = SPANGE_DF
        self.acs_pca = ACS_PCA_DF

        # One scaler per feature set; each is fitted in train_model.
        self.scaler_spange = StandardScaler()
        self.scaler_acs = StandardScaler()

        # Fitted estimators: {target: {'spange': model, 'acs': model}}.
        self.models = {}

        # Weights for blending the two PREDICTIONS (not the features).
        self.acs_weight = acs_weight
        self.spange_weight = spange_weight

    @staticmethod
    def _make_model(target):
        """Return a fresh, unfitted estimator configured for ``target``."""
        if target == 'SM':
            return HistGradientBoostingRegressor(
                max_depth=7, max_iter=700, learning_rate=0.04, random_state=42
            )
        return ExtraTreesRegressor(
            n_estimators=500, max_depth=10, min_samples_leaf=2,
            random_state=42, n_jobs=-1
        )

    def _build_features(self, X, feature_df):
        """Build the 2-D feature matrix for ``X`` using one descriptor table.

        Columns are ``[rt, temp, inv_temp, log_time, interaction]``, then
        ``SolventB%`` (mixed layout only), then the solvent descriptors. In
        the mixed layout the descriptors of solvents A and B are linearly
        interpolated with ``SolventB%`` as the weight on B. No augmentation
        is applied.

        Args:
            X: DataFrame with ``Residence Time``, ``Temperature`` and the
                solvent columns of the active layout.
            feature_df: Descriptor table indexed by solvent name; a name
                missing from the index makes the ``.loc`` lookup raise.

        Returns:
            ``numpy.ndarray`` with one row per row of ``X``.
        """
        rt = X['Residence Time'].values.astype(np.float64).reshape(-1, 1)
        temp = X['Temperature'].values.astype(np.float64).reshape(-1, 1)

        # Arrhenius-style kinetic features.
        # NOTE(review): the +273.15 offset presumes Temperature is in Celsius - confirm.
        temp_k = temp + 273.15
        inv_temp = 1000.0 / temp_k
        log_time = np.log(rt + 1e-6)  # small offset guards against log(0)
        interaction = inv_temp * log_time

        kinetic = [rt, temp, inv_temp, log_time, interaction]

        if self.mixed:
            # Cast like the other numeric inputs so the blend below is float math.
            # NOTE(review): the interpolation presumes SolventB% is a 0-1 fraction - confirm.
            pct = X['SolventB%'].values.astype(np.float64).reshape(-1, 1)
            desc_a = feature_df.loc[X['SOLVENT A NAME']].values
            desc_b = feature_df.loc[X['SOLVENT B NAME']].values
            solvent_feats = desc_a * (1 - pct) + desc_b * pct
            return np.hstack(kinetic + [pct, solvent_feats])

        solvent_feats = feature_df.loc[X['SOLVENT NAME']].values
        return np.hstack(kinetic + [solvent_feats])

    def train_model(self, X_train, y_train):
        """Fit both scalers and, per target, one model per descriptor set.

        Args:
            X_train: Input DataFrame in the layout selected at construction.
            y_train: DataFrame of targets; its columns are read by POSITION
                and must therefore be ordered like ``self.targets``.
        """
        # Build features - no augmentation.
        X_spange = self._build_features(X_train, self.spange)
        X_acs = self._build_features(X_train, self.acs_pca)

        X_spange_sc = self.scaler_spange.fit_transform(X_spange)
        X_acs_sc = self.scaler_acs.fit_transform(X_acs)

        y = y_train.values

        for i, target in enumerate(self.targets):
            y_target = y[:, i]

            # Two independent estimators with identical hyperparameters,
            # one per descriptor set.
            model_spange = self._make_model(target)
            model_acs = self._make_model(target)

            model_spange.fit(X_spange_sc, y_target)
            model_acs.fit(X_acs_sc, y_target)

            self.models[target] = {'spange': model_spange, 'acs': model_acs}

    def predict(self, X):
        """Predict all targets for ``X`` by blending the two models' outputs.

        Must be called after ``train_model``. No test-time augmentation.

        Returns:
            ``torch.Tensor`` (double) with one row per row of ``X`` and one
            column per entry of ``self.targets``, clipped to ``[0, 1]``.
        """
        X_spange = self._build_features(X, self.spange)
        X_acs = self._build_features(X, self.acs_pca)

        X_spange_sc = self.scaler_spange.transform(X_spange)
        X_acs_sc = self.scaler_acs.transform(X_acs)

        preds_all = []
        for target in self.targets:
            fitted = self.models[target]
            p_spange = fitted['spange'].predict(X_spange_sc)
            p_acs = fitted['acs'].predict(X_acs_sc)
            # PREDICTION combination (not feature combination).
            p_combined = self.acs_weight * p_acs + self.spange_weight * p_spange
            preds_all.append(p_combined.reshape(-1, 1))

        preds = np.clip(np.hstack(preds_all), 0, 1)
        return torch.tensor(preds, dtype=torch.double)

print("HybridPerTargetModel defined (EXACT replication of exp_004)")

HybridPerTargetModel defined (EXACT replication of exp_004)


In [5]:
# --- QUICK VALIDATION TEST ---
# Smoke test only: each loop below scores just the first three folds, so the
# printed means are not comparable to a full cross-validation score.
print("Testing HybridPerTargetModel (EXACT exp_004 replication)...")
X_test, Y_test = load_data("single_solvent")

# Quick test on first 3 solvents
errors = []
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_out_splits(X_test, Y_test)):
    if i >= 3: break
    # A fresh model per fold so no fitted state carries over between folds.
    model = HybridPerTargetModel(data='single')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # MAE averaged over every row and all three targets of the held-out fold.
    mae = np.mean(np.abs(preds - test_Y.values))
    errors.append(mae)
    # Every test row shares the held-out solvent, so the first row names the fold.
    solvent = test_X['SOLVENT NAME'].iloc[0]
    print(f"Single Fold {i} ({solvent}): MAE = {mae:.4f}")

# Unweighted mean of the per-fold MAEs (folds may differ in size).
print(f"\nSingle solvent quick test MAE: {np.mean(errors):.4f}")

# Also test on full data
print("\nTesting on full data...")
X_full, Y_full = load_data("full")
errors_full = []
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_ramp_out_splits(X_full, Y_full)):
    if i >= 3: break
    # data='full' switches the model to the mixed-solvent feature layout.
    model = HybridPerTargetModel(data='full')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    mae = np.mean(np.abs(preds - test_Y.values))
    errors_full.append(mae)
    print(f"Full Fold {i}: MAE = {mae:.4f}")

print(f"\nFull data quick test MAE: {np.mean(errors_full):.4f}")

Testing HybridPerTargetModel (EXACT exp_004 replication)...


Single Fold 0 (1,1,1,3,3,3-Hexafluoropropan-2-ol): MAE = 0.1446


Single Fold 1 (2,2,2-Trifluoroethanol): MAE = 0.0962


Single Fold 2 (2-Methyltetrahydrofuran [2-MeTHF]): MAE = 0.0369

Single solvent quick test MAE: 0.0926

Testing on full data...


Full Fold 0: MAE = 0.0547


Full Fold 1: MAE = 0.0885


Full Fold 2: MAE = 0.0598

Full data quick test MAE: 0.0677


In [6]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = HybridPerTargetModel(data='single') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:01,  1.83s/it]

2it [00:03,  1.92s/it]

3it [00:05,  1.91s/it]

4it [00:07,  1.91s/it]

5it [00:09,  1.91s/it]

6it [00:11,  1.91s/it]

7it [00:13,  1.90s/it]

8it [00:15,  1.89s/it]

9it [00:17,  1.88s/it]

10it [00:18,  1.88s/it]

11it [00:20,  1.87s/it]

12it [00:22,  1.87s/it]

13it [00:24,  1.87s/it]

14it [00:26,  1.87s/it]

15it [00:28,  1.87s/it]

16it [00:30,  1.91s/it]

17it [00:32,  1.92s/it]

18it [00:34,  1.91s/it]

19it [00:36,  1.93s/it]

20it [00:37,  1.92s/it]

21it [00:39,  1.92s/it]

22it [00:41,  1.91s/it]

23it [00:43,  1.93s/it]

24it [00:45,  1.92s/it]

24it [00:45,  1.90s/it]




In [7]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = HybridPerTargetModel(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:02,  2.03s/it]

2it [00:03,  1.97s/it]

3it [00:05,  1.98s/it]

4it [00:07,  1.96s/it]

5it [00:09,  1.97s/it]

6it [00:12,  2.07s/it]

7it [00:14,  2.09s/it]

8it [00:16,  2.07s/it]

9it [00:18,  2.06s/it]

10it [00:20,  2.03s/it]

11it [00:22,  2.04s/it]

12it [00:24,  2.07s/it]

13it [00:26,  2.09s/it]

13it [00:26,  2.05s/it]




In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################