# Experiment 026: Ultra-Simple Physics-Based Model

**Hypothesis**: The 53% CV-LB gap suggests overfitting to training solvents. Simpler models with physics-based features may generalize better.

**Key insight**: We're not trying to minimize CV - we're trying to minimize LB. A model with WORSE CV might have BETTER LB if it generalizes better.

**Approach**:
1. Use ONLY physics-based features (Arrhenius kinetics + basic solvent properties)
2. Use simple Ridge regression (fewer parameters = less overfitting)
3. NO TTA (confirmed to hurt performance)
4. Per-target architecture

**TEMPLATE COMPLIANCE**: Last 3 cells are EXACTLY as template, NO cells after them.

In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import HistGradientBoostingRegressor
from abc import ABC
import tqdm
import warnings
warnings.filterwarnings('ignore')

DATA_PATH = '/home/data'
torch.set_default_dtype(torch.double)
print("Setup complete")

Setup complete


In [2]:
# --- UTILITY FUNCTIONS ---
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    assert name in ["full", "single_solvent"]
    if name == "full":
        df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        X = df[["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]]
    else:
        df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        X = df[["Residence Time", "Temperature", "SOLVENT NAME"]]
    Y = df[TARGET_LABELS]
    return X, Y

def load_features(name="spange_descriptors"):
    return pd.read_csv(f'{DATA_PATH}/{name}_lookup.csv', index_col=0)

def generate_leave_one_out_splits(X, Y):
    for solvent in sorted(X["SOLVENT NAME"].unique()):
        mask = X["SOLVENT NAME"] != solvent
        yield (X[mask], Y[mask]), (X[~mask], Y[~mask])

def generate_leave_one_ramp_out_splits(X, Y):
    ramps = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for _, row in ramps.iterrows():
        mask = ~((X["SOLVENT A NAME"] == row["SOLVENT A NAME"]) & (X["SOLVENT B NAME"] == row["SOLVENT B NAME"]))
        yield (X[mask], Y[mask]), (X[~mask], Y[~mask])

# Load Spange descriptors
SPANGE_DF = load_features('spange_descriptors')
print(f"Spange: {SPANGE_DF.shape}")
print(f"Columns: {SPANGE_DF.columns.tolist()}")

Spange: (26, 13)
Columns: ['dielectric constant', 'ET(30)', 'alpha', 'beta', 'pi*', 'SA', 'SB', 'SP', 'SdP', 'N', 'n', 'f(n)', 'delta']


In [3]:
# --- BASE CLASSES ---
class SmilesFeaturizer(ABC):
    def __init__(self): raise NotImplementedError
    def featurize(self, X): raise NotImplementedError

class BaseModel(ABC):
    def __init__(self): pass
    def train_model(self, X_train, y_train): raise NotImplementedError
    def predict(self): raise NotImplementedError

In [4]:
# --- ULTRA-SIMPLE FEATURIZER ---
# Use ONLY physics-based features that should generalize
class SimplePhysicsFeaturizer:
    """Featurizer using only physics-based features.
    
    Features:
    - Arrhenius kinetics: 1/T (Kelvin), ln(time), 1/T * ln(time)
    - Basic solvent: dielectric constant, polarity (pi*), hydrogen bonding (alpha, beta)
    """
    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange = SPANGE_DF
        # Select only the most fundamental solvent properties
        self.selected_cols = ['dielectric constant', 'pi*', 'alpha', 'beta']

    def _build_process_features(self, X):
        """Build physics-based process features."""
        rt = X['Residence Time'].values.astype(np.float64).reshape(-1, 1)
        temp = X['Temperature'].values.astype(np.float64).reshape(-1, 1)
        
        # Arrhenius kinetic features
        temp_k = temp + 273.15
        inv_temp = 1000.0 / temp_k
        log_time = np.log(rt + 1e-6)
        interaction = inv_temp * log_time
        
        return np.hstack([rt, temp, inv_temp, log_time, interaction])

    def _get_solvent_features(self, X, flip=False):
        """Get selected solvent features."""
        if self.mixed:
            A = self.spange.loc[X["SOLVENT A NAME"]][self.selected_cols].values
            B = self.spange.loc[X["SOLVENT B NAME"]][self.selected_cols].values
            pct = X["SolventB%"].values.reshape(-1, 1)
            if flip:
                return B * (1 - pct) + A * pct
            else:
                return A * (1 - pct) + B * pct
        else:
            return self.spange.loc[X["SOLVENT NAME"]][self.selected_cols].values

    def featurize(self, X, flip=False):
        """Build combined features."""
        process = self._build_process_features(X)
        solvent = self._get_solvent_features(X, flip)
        
        if self.mixed:
            pct = X["SolventB%"].values.reshape(-1, 1)
            return np.hstack([process, pct, solvent])
        return np.hstack([process, solvent])

print("SimplePhysicsFeaturizer defined")
print(f"Using {len(['dielectric constant', 'pi*', 'alpha', 'beta'])} solvent features")

SimplePhysicsFeaturizer defined
Using 4 solvent features


In [5]:
# --- ULTRA-SIMPLE MODEL ---
class UltraSimpleModel(BaseModel):
    """Ultra-simple model for better generalization.
    
    - SM: Shallow HGB (max_depth=3) - needs some non-linearity
    - Products: Ridge regression - simplest possible
    - NO TTA - confirmed to hurt performance
    """
    def __init__(self, data='single'):
        self.data_type = data
        self.featurizer = SimplePhysicsFeaturizer(mixed=(data=='full'))
        self.targets = ['Product 2', 'Product 3', 'SM']
        self.scaler = StandardScaler()
        self.models = {}

    def train_model(self, X_train, y_train):
        X_feat = self.featurizer.featurize(X_train)
        y = y_train.values
        
        if self.data_type == 'full':
            # Data augmentation (but NO TTA at prediction time)
            X_feat_flip = self.featurizer.featurize(X_train, flip=True)
            X_feat = np.vstack([X_feat, X_feat_flip])
            y = np.vstack([y, y])
        
        X_scaled = self.scaler.fit_transform(X_feat)
        
        for i, target in enumerate(self.targets):
            y_target = y[:, i]
            
            if target == 'SM':
                # SM needs some non-linearity - use shallow HGB
                model = HistGradientBoostingRegressor(
                    max_depth=3, max_iter=100, learning_rate=0.1,
                    random_state=42
                )
            else:
                # Products - use simple Ridge
                model = Ridge(alpha=1.0)
            
            model.fit(X_scaled, y_target)
            self.models[target] = model

    def predict(self, X):
        # NO TTA - just direct prediction
        X_feat = self.featurizer.featurize(X)
        X_scaled = self.scaler.transform(X_feat)
        
        preds = []
        for target in self.targets:
            p = self.models[target].predict(X_scaled)
            preds.append(p.reshape(-1, 1))
        
        final_preds = np.hstack(preds)
        final_preds = np.clip(final_preds, 0, 1)
        return torch.tensor(final_preds, dtype=torch.double)

print("UltraSimpleModel defined")

UltraSimpleModel defined


In [6]:
# Quick validation test
print("Testing UltraSimpleModel...")
X_test, Y_test = load_data("single_solvent")

errors = []
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_out_splits(X_test, Y_test)):
    if i >= 5: break
    model = UltraSimpleModel(data='single')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    mae = np.mean(np.abs(preds - test_Y.values))
    errors.append(mae)
    solvent = test_X['SOLVENT NAME'].iloc[0]
    print(f"Single Fold {i} ({solvent}): MAE = {mae:.4f}")

print(f"\nQuick test MAE: {np.mean(errors):.4f}")
print("Note: Higher CV is expected - we're optimizing for generalization, not CV")

Testing UltraSimpleModel...
Single Fold 0 (1,1,1,3,3,3-Hexafluoropropan-2-ol): MAE = 0.1335
Single Fold 1 (2,2,2-Trifluoroethanol): MAE = 0.1453
Single Fold 2 (2-Methyltetrahydrofuran [2-MeTHF]): MAE = 0.0479
Single Fold 3 (Acetonitrile): MAE = 0.0827


Single Fold 4 (Acetonitrile.Acetic Acid): MAE = 0.0968

Quick test MAE: 0.1012
Note: Higher CV is expected - we're optimizing for generalization, not CV


In [7]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = UltraSimpleModel(data='single') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

3it [00:00, 28.93it/s]

6it [00:00, 27.78it/s]

9it [00:00, 26.91it/s]

12it [00:00, 27.40it/s]

15it [00:00, 26.94it/s]

18it [00:00, 27.06it/s]

21it [00:00, 26.58it/s]

24it [00:00, 26.82it/s]

24it [00:00, 27.01it/s]




In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = UltraSimpleModel(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:00,  4.88it/s]

2it [00:00,  4.94it/s]

3it [00:00,  4.98it/s]

4it [00:00,  4.98it/s]

5it [00:00,  5.11it/s]

6it [00:01,  4.89it/s]

7it [00:01,  5.02it/s]

8it [00:01,  4.95it/s]

9it [00:01,  5.02it/s]

10it [00:01,  5.08it/s]

11it [00:02,  5.01it/s]

12it [00:02,  4.97it/s]

13it [00:02,  5.06it/s]

13it [00:02,  5.01it/s]




In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [10]:
# Calculate CV score - THIS CELL MUST BE REMOVED BEFORE SUBMISSION
X_single, Y_single = load_data("single_solvent")
X_full, Y_full = load_data("full")

# Single solvent CV
single_preds = submission_single_solvent[['target_1', 'target_2', 'target_3']].values
single_true = []
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_out_splits(X_single, Y_single)):
    single_true.append(test_Y.values)
single_true = np.vstack(single_true)
single_mae = np.mean(np.abs(single_preds - single_true))
print(f"Single solvent CV MAE: {single_mae:.4f}")

# Full data CV
full_preds = submission_full_data[['target_1', 'target_2', 'target_3']].values
full_true = []
for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_ramp_out_splits(X_full, Y_full)):
    full_true.append(test_Y.values)
full_true = np.vstack(full_true)
full_mae = np.mean(np.abs(full_preds - full_true))
print(f"Full data CV MAE: {full_mae:.4f}")

# Combined
combined_mae = (single_mae + full_mae) / 2
print(f"\nCombined CV MAE: {combined_mae:.4f}")
print(f"exp_004 CV: 0.0623")
print(f"Difference: {combined_mae - 0.0623:.4f}")
print(f"\nNote: Higher CV is expected. The hypothesis is that this simpler model")
print(f"might generalize better to unseen solvents (lower LB despite higher CV).")

Single solvent CV MAE: 0.0739
Full data CV MAE: 0.0804

Combined CV MAE: 0.0772
exp_004 CV: 0.0623
Difference: 0.0149

Note: Higher CV is expected. The hypothesis is that this simpler model
might generalize better to unseen solvents (lower LB despite higher CV).
