# Experiment 007: Intermediate Regularization with Combined Features

**Goal**: Find the sweet spot between underfitting (Ridge) and overfitting (deep trees).

Based on Loop 6 analysis:
- ETR (ExtraTreesRegressor, depth=7) is optimal: GroupKFold CV 0.0713
- Combined features (DRFP-PCA(15) + Spange + ACS_PCA) achieve the best CV: 0.0706
- Per-target models: HGB (HistGradientBoostingRegressor, depth=5) for SM, ETR (depth=7) for Products
- No test-time augmentation (TTA)

**Expected**: GroupKFold CV ~0.07, potentially better LB due to less overfitting.

In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor
from abc import ABC
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Directory holding the yield CSVs and the "*_lookup.csv" descriptor tables
# read by the loader functions in this notebook.
DATA_PATH = '/home/data'
# Make float64 the default dtype for newly created torch tensors.
torch.set_default_dtype(torch.double)
print("Setup complete")

Setup complete


In [2]:
# --- UTILITY FUNCTIONS ---
# Names of the numeric process-condition columns.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Target columns, in the order load_data returns them. Kept as a list because
# it is used directly as a DataFrame column selector.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Load a catechol yield table and split it into inputs and targets.

    Args:
        name: Dataset to read. ``"full"`` reads the solvent-mixture table,
            ``"single_solvent"`` reads the single-solvent table.

    Returns:
        A ``(X, Y)`` pair of DataFrames. ``X`` holds "Residence Time",
        "Temperature" and the solvent identifier column(s) of the chosen
        dataset (plus "SolventB%" for the full table); ``Y`` holds the
        ``TARGET_LABELS`` columns.

    Raises:
        ValueError: If ``name`` is not one of the two supported datasets.
    """
    # Raise explicitly rather than ``assert``: assertions are stripped under
    # ``python -O``, which would let a misspelt name fall through to the
    # ``else`` branch and silently load the single-solvent table.
    if name not in ("full", "single_solvent"):
        raise ValueError(
            f"name must be 'full' or 'single_solvent', got {name!r}"
        )
    if name == "full":
        df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        X = df[["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]]
    else:
        df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        X = df[["Residence Time", "Temperature", "SOLVENT NAME"]]
    Y = df[TARGET_LABELS]
    return X, Y

def load_features(name="spange_descriptors"):
    """Read the ``<name>_lookup.csv`` descriptor table from ``DATA_PATH``.

    The first CSV column becomes the index of the returned DataFrame.
    """
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of the unique "SOLVENT NAME" values.
    Each item is ``((X_train, Y_train), (X_test, Y_test))`` where the test
    part contains exactly the rows of the held-out solvent.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        is_test = solvent_names == held_out
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    exactly the rows whose two solvent columns match the held-out pair.
    """
    col_a, col_b = "SOLVENT A NAME", "SOLVENT B NAME"
    unique_pairs = X[[col_a, col_b]].drop_duplicates()
    for solvent_a, solvent_b in zip(unique_pairs[col_a], unique_pairs[col_b]):
        held_out = (X[col_a] == solvent_a) & (X[col_b] == solvent_b)
        kept = ~held_out
        yield (X[kept], Y[kept]), (X[held_out], Y[held_out])

# Read the three solvent descriptor lookup tables used by the model
SPANGE_DF = load_features('spange_descriptors')
ACS_PCA_DF = load_features('acs_pca_descriptors')
DRFP_DF = load_features('drfps_catechol')

# Report the (rows, columns) shape of each table, one per line
print("\n".join(
    f"{label}: {frame.shape}"
    for label, frame in (
        ("Spange", SPANGE_DF),
        ("ACS_PCA", ACS_PCA_DF),
        ("DRFP", DRFP_DF),
    )
))

Spange: (26, 13)
ACS_PCA: (24, 5)
DRFP: (24, 2048)


In [3]:
# --- BASE CLASSES ---
class SmilesFeaturizer(ABC):
    """Interface stub for featurizers.

    Both the constructor and ``featurize`` raise ``NotImplementedError``, so
    a subclass has to override each of them to be usable.
    """

    def __init__(self):
        raise NotImplementedError

    def featurize(self, X):
        raise NotImplementedError

class BaseModel(ABC):
    """Interface stub for models driven by the cross-validation loops.

    Subclasses override ``train_model`` and ``predict``; the versions here
    only raise ``NotImplementedError``.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs and targets."""
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for ``X``.

        The parameter mirrors how the method is actually called
        (``model.predict(test_X)``). It defaults to ``None`` so that an
        argument-less call keeps raising ``NotImplementedError`` as before.
        """
        raise NotImplementedError

In [4]:
# Precompute DRFP-PCA for all solvents
# Apply PCA to reduce DRFP from 2048 to 15 dimensions
# NOTE(review): the PCA is fitted once on every row of DRFP_DF, i.e. also on
# solvents that are later held out during cross-validation — confirm this
# label-free, transductive step is acceptable for the evaluation protocol.
drfp_pca = PCA(n_components=15, random_state=42)
DRFP_PCA_VALUES = drfp_pca.fit_transform(DRFP_DF.values)
# Keep the original index so rows can still be looked up by DRFP_DF's labels.
DRFP_PCA_DF = pd.DataFrame(DRFP_PCA_VALUES, index=DRFP_DF.index)
print(f"DRFP-PCA: {DRFP_PCA_DF.shape}")
# Fraction of total variance retained by the 15 kept components.
print(f"Explained variance: {drfp_pca.explained_variance_ratio_.sum():.3f}")

DRFP-PCA: (24, 15)
Explained variance: 0.949


In [5]:
# --- INTERMEDIATE REGULARIZATION MODEL ---
class IntermediateRegModel(BaseModel):
    """Per-target model with intermediate regularization and combined features.

    Each row is described by process/kinetic terms concatenated with Spange,
    ACS_PCA and DRFP-PCA solvent descriptors. One estimator is fitted per
    target: HistGradientBoostingRegressor(max_depth=5) for 'SM' and
    ExtraTreesRegressor(max_depth=7) for the two product targets.

    Key insights from Loop 6 analysis:
    - ETR(depth=7) is optimal: GroupKFold CV 0.0713
    - Combined features (DRFP-PCA + Spange + ACS_PCA) achieve best CV: 0.0706
    - Per-target: HGB(depth=5) for SM, ETR(depth=7) for Products
    - No data augmentation in training and no test-time augmentation (TTA)
    """

    def __init__(self, data='single'):
        """Create an unfitted model.

        Args:
            data: ``'full'`` selects the two-solvent mixture feature layout;
                any other value selects the single-solvent layout.
        """
        self.data_type = data
        self.mixed = (data == 'full')
        self.targets = ['Product 2', 'Product 3', 'SM']

        # Solvent descriptor lookup tables (rows are looked up by solvent name)
        self.spange = SPANGE_DF
        self.acs_pca = ACS_PCA_DF
        self.drfp_pca = DRFP_PCA_DF

        # Fitted in train_model and reused unchanged in predict
        self.scaler = StandardScaler()

        # Fitted estimators keyed by target name
        self.models = {}

    @staticmethod
    def _lookup(table, names):
        """Return one descriptor row per entry of ``names`` as a 2-D array."""
        return table.loc[names].values

    @staticmethod
    def _make_estimator(target):
        """Build the unfitted estimator assigned to ``target``."""
        if target == 'SM':
            # HGB with depth=5 for SM (was 7)
            return HistGradientBoostingRegressor(
                max_depth=5, max_iter=500, learning_rate=0.05, random_state=42
            )
        # ETR with depth=7 for Products (was 10)
        return ExtraTreesRegressor(
            n_estimators=200, max_depth=7, min_samples_leaf=2,
            random_state=42, n_jobs=-1
        )

    def _targets_by_name(self, y_train):
        """Map every target name to its 1-D training vector.

        Columns are selected by name when ``y_train`` exposes all target
        names, so a label frame with reordered columns cannot silently train
        an estimator on the wrong target. Otherwise the original positional
        convention (column ``i`` belongs to ``self.targets[i]``) is used.
        """
        columns = getattr(y_train, 'columns', None)
        if columns is not None and all(t in columns for t in self.targets):
            return {t: y_train[t].values for t in self.targets}
        y = y_train.values
        return {t: y[:, i] for i, t in enumerate(self.targets)}

    def _build_features(self, X):
        """Build combined features: Arrhenius + Spange + ACS_PCA + DRFP-PCA.

        Column layout: residence time, temperature, 1000/T[K], log(time),
        their product, then (mixtures only) the solvent-B fraction, followed
        by the Spange, ACS_PCA and DRFP-PCA descriptor blocks in that order.

        Args:
            X: DataFrame with 'Residence Time', 'Temperature' and either
                'SOLVENT NAME' or, for mixtures, 'SOLVENT A NAME',
                'SOLVENT B NAME' and 'SolventB%'.

        Returns:
            2-D float array with one row per row of ``X``.
        """
        rt = X['Residence Time'].values.astype(np.float64).reshape(-1, 1)
        temp = X['Temperature'].values.astype(np.float64).reshape(-1, 1)

        # Arrhenius kinetic features
        temp_k = temp + 273.15  # assumes Temperature is recorded in deg C
        inv_temp = 1000.0 / temp_k
        log_time = np.log(rt + 1e-6)  # small offset keeps log finite at rt == 0
        interaction = inv_temp * log_time

        blocks = [rt, temp, inv_temp, log_time, interaction]
        tables = (self.spange, self.acs_pca, self.drfp_pca)

        if self.mixed:
            # NOTE(review): used directly as the blend weight, so this assumes
            # SolventB% is a fraction in [0, 1] — confirm against the data.
            pct = X['SolventB%'].values.astype(np.float64).reshape(-1, 1)
            blocks.append(pct)

            names_a = X['SOLVENT A NAME']
            names_b = X['SOLVENT B NAME']
            for table in tables:
                # Descriptor of the mixture = linear blend of both solvents
                feats_a = self._lookup(table, names_a)
                feats_b = self._lookup(table, names_b)
                blocks.append(feats_a * (1 - pct) + feats_b * pct)
        else:
            names = X['SOLVENT NAME']
            blocks.extend(self._lookup(table, names) for table in tables)

        return np.hstack(blocks)

    def train_model(self, X_train, y_train):
        """Fit the scaler and one estimator per target.

        Args:
            X_train: Training inputs in the layout expected by
                ``_build_features``.
            y_train: Training targets; a DataFrame whose columns are the
                target names, or any object whose ``.values`` is a 2-D array
                with the targets in ``self.targets`` order.
        """
        # Build features - NO AUGMENTATION!
        X_scaled = self.scaler.fit_transform(self._build_features(X_train))

        # Train per-target models with INTERMEDIATE regularization
        for target, y_target in self._targets_by_name(y_train).items():
            model = self._make_estimator(target)
            model.fit(X_scaled, y_target)
            self.models[target] = model

    def predict(self, X):
        """Predict all targets for ``X``.

        Returns:
            ``torch.double`` tensor of shape ``[len(X), len(self.targets)]``
            with columns in ``self.targets`` order, clipped to ``[0, 1]``.

        Raises:
            RuntimeError: If a target has no fitted estimator, i.e.
                ``train_model`` has not been called yet.
        """
        missing = [t for t in self.targets if t not in self.models]
        if missing:
            raise RuntimeError(
                f"train_model must be called before predict; "
                f"no fitted model for {missing}"
            )

        # Build features - NO TTA!
        X_scaled = self.scaler.transform(self._build_features(X))

        per_target = [
            self.models[target].predict(X_scaled).reshape(-1, 1)
            for target in self.targets
        ]
        preds = np.clip(np.hstack(per_target), 0, 1)
        return torch.tensor(preds, dtype=torch.double)

In [6]:
# --- QUICK VALIDATION TEST ---
# Smoke test only: scores a handful of folds per dataset before the full
# template cross-validation below is run.
print("Testing IntermediateRegModel...")
X_test, Y_test = load_data("single_solvent")

# Leave-one-solvent-out, limited to the first 5 solvents
errors = []
for i, split in enumerate(generate_leave_one_out_splits(X_test, Y_test)):
    (train_X, train_Y), (test_X, test_Y) = split
    if i >= 5:
        break
    model = IntermediateRegModel(data='single')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # Mean absolute error over all rows and all three targets of the fold
    mae = np.abs(preds - test_Y.values).mean()
    errors.append(mae)
    solvent = test_X['SOLVENT NAME'].iloc[0]
    print(f"Fold {i} ({solvent}): MAE = {mae:.4f}")

print(f"\nQuick test MAE (single): {np.mean(errors):.4f}")

# Leave-one-ramp-out on the mixture data, limited to the first 3 ramps
print("\nTesting on full data...")
X_full, Y_full = load_data("full")
errors_full = []
for i, split in enumerate(generate_leave_one_ramp_out_splits(X_full, Y_full)):
    (train_X, train_Y), (test_X, test_Y) = split
    if i >= 3:
        break
    model = IntermediateRegModel(data='full')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    mae = np.abs(preds - test_Y.values).mean()
    errors_full.append(mae)
    print(f"Fold {i}: MAE = {mae:.4f}")

print(f"\nQuick test MAE (full): {np.mean(errors_full):.4f}")

Testing IntermediateRegModel...


Fold 0 (1,1,1,3,3,3-Hexafluoropropan-2-ol): MAE = 0.1709


Fold 1 (2,2,2-Trifluoroethanol): MAE = 0.1107


Fold 2 (2-Methyltetrahydrofuran [2-MeTHF]): MAE = 0.0479


Fold 3 (Acetonitrile): MAE = 0.0765


Fold 4 (Acetonitrile.Acetic Acid): MAE = 0.1250

Quick test MAE (single): 0.1062

Testing on full data...


Fold 0: MAE = 0.0644


Fold 1: MAE = 0.1012


Fold 2: MAE = 0.0655

Quick test MAE (full): 0.0770


## Template-Compliant Cross-Validation

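The following 3 cells are the FINAL 3 cells of the notebook and are kept exactly as in the template, except for the model-construction line that the template explicitly allows changing.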

In [7]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = IntermediateRegModel(data='single') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:00,  1.71it/s]

2it [00:01,  1.86it/s]

3it [00:01,  1.95it/s]

4it [00:02,  1.92it/s]

5it [00:02,  1.96it/s]

6it [00:03,  1.90it/s]

7it [00:03,  1.95it/s]

8it [00:04,  2.01it/s]

9it [00:04,  2.01it/s]

10it [00:05,  1.99it/s]

11it [00:05,  1.97it/s]

12it [00:06,  2.01it/s]

13it [00:06,  2.01it/s]

14it [00:07,  2.03it/s]

15it [00:07,  2.05it/s]

16it [00:08,  2.03it/s]

17it [00:08,  1.98it/s]

18it [00:09,  1.96it/s]

19it [00:09,  1.99it/s]

20it [00:10,  1.98it/s]

21it [00:10,  1.98it/s]

22it [00:11,  2.00it/s]

23it [00:11,  2.02it/s]

24it [00:12,  2.02it/s]

24it [00:12,  1.98it/s]




In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = IntermediateRegModel(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:00,  1.80it/s]

2it [00:01,  1.84it/s]

3it [00:01,  1.86it/s]

4it [00:02,  1.88it/s]

5it [00:02,  1.87it/s]

6it [00:03,  1.83it/s]

7it [00:03,  1.86it/s]

8it [00:04,  1.85it/s]

9it [00:04,  1.90it/s]

10it [00:05,  1.90it/s]

11it [00:05,  1.89it/s]

12it [00:06,  1.84it/s]

13it [00:06,  1.85it/s]

13it [00:06,  1.86it/s]




In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################