# Experiment 051: Simpler Model Based on exp_000's Best Residual

**Hypothesis**: exp_000 has the best residual (-0.002136), meaning it performed BETTER on LB than predicted by its CV. A simpler model may generalize better to LB.

**Based on**: exp_000 used:
- Spange descriptors (13 features)
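- Arrhenius kinetics features (3 derived features: 1000/T with T in Kelvin, ln(t), and their product; the code below also passes raw temperature and time alongside them, giving 5 columns in total)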
- MLP with hidden_dims=[128, 128, 64], dropout=0.2
- 3 models bagged
- 200 epochs

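**This experiment**: Recreate exp_000's approach with stronger regularization to see if we can improve the residual further. (Note: as written, the cells below still use dropout=0.2, the exp_000 value listed above, and weight_decay=1e-5, so the stronger regularization has not yet been applied.)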

In [1]:
import sys
sys.path.insert(0, '/home/code/experiments/049_manual_ood_handling')

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Load data
from utils_local import load_data, load_features, generate_leave_one_out_splits, generate_leave_one_ramp_out_splits

# Read the raw inputs/targets for both tasks. load_data returns an (X, Y) pair
# for each task name; the recorded output shows 656 x 3 inputs for
# "single_solvent" and 1227 x 5 for "full".
print("Loading data...")
X_single_raw, Y_single = load_data("single_solvent")
X_full_raw, Y_full = load_data("full")

print(f"Single solvent: {X_single_raw.shape}, Mixtures: {X_full_raw.shape}")

# Load features
# Descriptor table that is later indexed by solvent name via `.loc`
# (recorded shape: 26 rows x 13 columns).
spange = load_features("spange_descriptors")
print(f"Spange: {spange.shape}")

Loading data...
Single solvent: (656, 3), Mixtures: (1227, 5)
Spange: (26, 13)


In [2]:
# Prepare datasets with Spange features ONLY (no DRFP)
def prepare_single_solvent_dataset(X_raw, spange):
    """Build the single-solvent modelling frame.

    Each row of ``X_raw`` is expanded into the descriptor row that ``spange``
    holds for its 'SOLVENT NAME', followed by 'TEMPERATURE', 'TIME' and
    'SOLVENT NAME' columns. The result carries a fresh 0..n-1 index.
    """
    names = X_raw['SOLVENT NAME'].values

    # Descriptor lookup by solvent name; going through the raw array drops the
    # solvent-name index so the new frame gets a default RangeIndex.
    frame = pd.DataFrame(spange.loc[names].values, columns=list(spange.columns))

    # Raw arrays (not Series) are assigned so nothing is re-aligned on X_raw's index.
    trailing_columns = {
        'TEMPERATURE': X_raw['Temperature'].values,
        'TIME': X_raw['Residence Time'].values,
        'SOLVENT NAME': names,
    }
    for column, values in trailing_columns.items():
        frame[column] = values

    return frame

def prepare_mixture_dataset(X_raw, spange):
    """Build the mixture modelling frame.

    Descriptors of solvent A and solvent B are blended linearly, weighted by
    the fraction of B ('SolventB%' / 100). The blended descriptors are followed
    by 'TEMPERATURE', 'TIME', a combined 'SOLVENT NAME' ("A.B") and the original
    'SOLVENT A NAME', 'SOLVENT B NAME' and 'SolventB%' values.
    """
    name_a = X_raw['SOLVENT A NAME'].values
    name_b = X_raw['SOLVENT B NAME'].values

    # Column vector of B fractions so it broadcasts across descriptor columns.
    frac_b = (X_raw['SolventB%'].values / 100.0)[:, None]

    descr_a = spange.loc[name_a].values
    descr_b = spange.loc[name_b].values
    blended = (1 - frac_b) * descr_a + frac_b * descr_b

    pair_labels = [f"{first}.{second}" for first, second in zip(name_a, name_b)]
    residence = X_raw['Residence Time'].values
    temperature = X_raw['Temperature'].values

    frame = pd.DataFrame(blended, columns=list(spange.columns))
    trailing_columns = {
        'TEMPERATURE': temperature,
        'TIME': residence,
        'SOLVENT NAME': pair_labels,
        'SOLVENT A NAME': name_a,
        'SOLVENT B NAME': name_b,
        'SolventB%': X_raw['SolventB%'].values,
    }
    for column, values in trailing_columns.items():
        frame[column] = values

    return frame

# Materialise both modelling frames from the raw inputs loaded above.
X_single = prepare_single_solvent_dataset(X_single_raw, spange)
X_mix = prepare_mixture_dataset(X_full_raw, spange)

# Shape check: the recorded run shows 16 columns for the single-solvent frame
# and 19 for the mixture frame (which carries three extra mixture columns).
print(f"Single solvent dataset: {X_single.shape}")
print(f"Mixture dataset: {X_mix.shape}")

Single solvent dataset: (656, 16)
Mixture dataset: (1227, 19)


In [3]:
# Feature extraction with Arrhenius kinetics (same as exp_000)
def get_spange_features(X_data):
    """Return the 13 Spange descriptor columns of ``X_data`` as a 2-D array.

    The column order is fixed by the tuple below, independent of the column
    order in ``X_data``.
    """
    descriptor_columns = (
        'dielectric constant', 'ET(30)', 'alpha', 'beta', 'pi*',
        'SA', 'SB', 'SP', 'SdP', 'N', 'n', 'f(n)', 'delta',
    )
    selected = X_data[list(descriptor_columns)]
    return selected.values

def get_arrhenius_features(X_data):
    """Arrhenius-style kinetics features derived from temperature and time.

    Returns an (n_rows, 5) array with columns, in order:
    raw 'TEMPERATURE', raw 'TIME', 1000 / (TEMPERATURE + 273.15),
    log(TIME + 1e-6), and the product of the last two.
    """
    temperature = X_data['TEMPERATURE'].values
    residence = X_data['TIME'].values

    # 273.15 offset: presumably TEMPERATURE is in Celsius - confirm against the data source.
    inverse_kelvin = 1000.0 / (temperature + 273.15)
    # Small epsilon keeps the log finite when TIME is exactly zero.
    log_residence = np.log(residence + 1e-6)

    feature_columns = (
        temperature,
        residence,
        inverse_kelvin,
        log_residence,
        inverse_kelvin * log_residence,
    )
    return np.stack(feature_columns, axis=1)

def prepare_features(X_data):
    """Assemble the model input matrix: Spange descriptors followed by Arrhenius features."""
    feature_groups = (
        get_spange_features(X_data),
        get_arrhenius_features(X_data),
    )
    # Both groups are 2-D with one row per sample, so join them column-wise.
    return np.concatenate(feature_groups, axis=1)

# Cell-completion banner. The dimension line is a fixed string (13 + 5 = 18),
# not computed from the data, so it must be edited by hand if the features change.
print("Feature extraction functions defined")
print(f"Feature dimension: 13 (Spange) + 5 (Arrhenius) = 18")

Feature extraction functions defined
Feature dimension: 13 (Spange) + 5 (Arrhenius) = 18


In [4]:
# Simple MLP Model (same as exp_000)
class SimpleMLP(nn.Module):
    """Small fully connected regressor with three sigmoid-bounded outputs.

    Layout: BatchNorm1d on the raw input, then one
    Linear -> BatchNorm1d -> ReLU -> Dropout group per entry of ``hidden_dims``,
    then a final Linear to 3 units followed by Sigmoid.
    """

    def __init__(self, input_dim, hidden_dims=[128, 128, 64], dropout=0.2):
        super().__init__()

        def hidden_group(n_in, n_out):
            # One hidden stage; modules are created in the order they are applied.
            return [
                nn.Linear(n_in, n_out),
                nn.BatchNorm1d(n_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]

        # Consecutive (in, out) widths: input_dim -> h1 -> h2 -> ...
        widths = [input_dim, *hidden_dims]

        stack = [nn.BatchNorm1d(input_dim)]
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            stack += hidden_group(n_in, n_out)
        stack += [nn.Linear(widths[-1], 3), nn.Sigmoid()]

        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the sigmoid outputs."""
        return self.network(x)

def train_simple_mlp(X_train, Y_train, input_dim, epochs=200, lr=5e-4,
                     weight_decay=1e-5, hidden_dims=[128, 128, 64], dropout=0.2,
                     seed=None):
    """Train one SimpleMLP with full-batch Adam and return it in eval mode.

    Args:
        X_train: 2-D array-like of input features (converted to float32).
        Y_train: 2-D array-like of targets (converted to float32); the network
            emits 3 sigmoid outputs per row, so 3 target columns are expected.
        input_dim: number of input features (width of ``X_train``).
        epochs: number of full-batch gradient steps.
        lr: initial Adam learning rate.
        weight_decay: Adam weight decay (L2 penalty).
        hidden_dims: hidden layer widths forwarded to SimpleMLP.
        dropout: dropout probability forwarded to SimpleMLP.
        seed: optional int. When given, the torch RNG is seeded before the
            model is built so initialisation and dropout masks are repeatable.
            ``None`` (default) keeps the previous unseeded behaviour.

    Returns:
        The trained SimpleMLP, on CUDA when available, switched to eval mode so
        Dropout is disabled and BatchNorm uses its running statistics.
    """
    if seed is not None:
        # Seeds the CPU generator and all CUDA generators.
        torch.manual_seed(seed)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = SimpleMLP(input_dim, hidden_dims=hidden_dims, dropout=dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=20)
    # NOTE(review): the model output is sigmoid-bounded; if the targets also lie
    # in [0, 1], every residual is below the default Huber delta of 1.0 and this
    # loss is exactly 0.5 * MSE - confirm that is intended.
    criterion = nn.HuberLoss()

    # torch.as_tensor replaces the legacy torch.FloatTensor constructor; both
    # yield float32 tensors on the target device.
    X_tensor = torch.as_tensor(X_train, dtype=torch.float32, device=device)
    Y_tensor = torch.as_tensor(Y_train, dtype=torch.float32, device=device)

    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        pred = model(X_tensor)
        loss = criterion(pred, Y_tensor)
        loss.backward()
        optimizer.step()
        # The plateau scheduler monitors the *training* loss (no validation split here).
        scheduler.step(loss.item())

    # Hand back an inference-ready model; callers that want to keep training
    # can call model.train() again.
    model.eval()
    return model

# Cell-completion banner only; nothing is trained in this cell.
print("Simple MLP model defined")

Simple MLP model defined


In [5]:
# Simple Model (recreating exp_000)
class SimpleModel:
    """
    Ensemble of simple MLPs on Spange + Arrhenius features.
    Recreating exp_000 which has the best residual.

    Every ensemble member is trained on the same (standardised) training set;
    the members differ only through random initialisation and dropout, i.e.
    there is no bootstrap resampling. Predictions are the mean over members,
    clipped to [0, 1].

    Args:
        n_models: number of MLPs in the ensemble.
        hidden_dims: hidden layer widths for each MLP (copied on construction).
        dropout: dropout probability for each MLP.
        epochs: training epochs per MLP (previously hard-coded to 200).
        seed: optional int. When given, member ``i`` is trained after
            ``torch.manual_seed(seed + i)`` so a fit is repeatable. ``None``
            (default) leaves the torch RNG untouched.
    """
    def __init__(self, n_models=3, hidden_dims=[128, 128, 64], dropout=0.2,
                 epochs=200, seed=None):
        self.n_models = n_models
        # Copy so the shared default list (or a caller's list) is never aliased.
        self.hidden_dims = list(hidden_dims)
        self.dropout = dropout
        self.epochs = epochs
        self.seed = seed

        self.scaler = StandardScaler()
        self.models = []
        self.input_dim = None

    def fit(self, X_train, Y_train):
        """Fit the scaler and train ``n_models`` MLPs; returns ``self``.

        ``X_train`` is a frame accepted by ``prepare_features``; ``Y_train`` may
        be a DataFrame or any 2-D array-like of targets.
        """
        # Prepare features
        X_features = prepare_features(X_train)
        self.input_dim = X_features.shape[1]
        X_scaled = self.scaler.fit_transform(X_features)

        Y_values = np.asarray(Y_train)

        # Start from an empty ensemble: without this, calling fit() again would
        # keep networks trained under the previous scaler and average them in.
        self.models = []
        for member in range(self.n_models):
            if self.seed is not None:
                # Distinct but repeatable seed per member.
                torch.manual_seed(self.seed + member)
            mlp = train_simple_mlp(X_scaled, Y_values, self.input_dim,
                                   epochs=self.epochs, hidden_dims=self.hidden_dims,
                                   dropout=self.dropout)
            self.models.append(mlp)

        return self

    def predict(self, X_test):
        """Return the ensemble-mean prediction for ``X_test``, clipped to [0, 1]."""
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        X_features = prepare_features(X_test)
        X_scaled = self.scaler.transform(X_features)

        # Get predictions from all models
        preds = []
        X_tensor = torch.as_tensor(X_scaled, dtype=torch.float32, device=device)
        for mlp in self.models:
            # eval() disables dropout and makes BatchNorm use running statistics.
            mlp.eval()
            with torch.no_grad():
                pred = mlp(X_tensor).cpu().numpy()
            preds.append(pred)

        # Average predictions
        final_pred = np.mean(preds, axis=0)
        return np.clip(final_pred, 0, 1)

# Cell-completion banner only; no model is instantiated in this cell.
print("SimpleModel defined")

SimpleModel defined


In [6]:
# Run CV for single solvents
# Cross-validation over the single-solvent data: one fold per split yielded by
# generate_leave_one_out_splits, with a fresh SimpleModel trained per fold.
print("Running Single Solvent CV with Simple Model (exp_000 style)...")
print("="*60)

splits = list(generate_leave_one_out_splits(X_single, Y_single))
print(f"Number of folds: {len(splits)}")

solvent_errors, all_preds, all_true = {}, [], []

for fold_idx, (train_idx, test_idx) in enumerate(splits):
    # Positional slices for this fold
    X_train, Y_train = X_single.iloc[train_idx], Y_single.iloc[train_idx]
    X_test, Y_test = X_single.iloc[test_idx], Y_single.iloc[test_idx]

    # The fold is labelled by the solvent name of its first held-out row
    test_solvent = X_test['SOLVENT NAME'].iloc[0]

    # Fit on the training rows, predict the held-out rows
    model = SimpleModel(n_models=3, hidden_dims=[128, 128, 64], dropout=0.2)
    model.fit(X_train, Y_train)
    preds = model.predict(X_test)

    # Fold MSE over all held-out rows and target columns
    fold_targets = Y_test.values
    mse = np.mean((preds - fold_targets) ** 2)
    solvent_errors[test_solvent] = mse

    all_preds.append(preds)
    all_true.append(fold_targets)

    print(f"Fold {fold_idx+1:2d}: {test_solvent:45s} MSE = {mse:.6f}")

# Pooled MSE over every held-out prediction; the +/- value is the spread of the
# per-fold MSEs stored in solvent_errors.
all_preds = np.vstack(all_preds)
all_true = np.vstack(all_true)
single_mse = np.mean((all_preds - all_true) ** 2)
single_std = np.std(list(solvent_errors.values()))

print(f"\nSimple Model Single Solvent CV MSE: {single_mse:.6f} +/- {single_std:.6f}")

Running Single Solvent CV with Simple Model (exp_000 style)...
Number of folds: 24


Fold  1: 1,1,1,3,3,3-Hexafluoropropan-2-ol             MSE = 0.025301


Fold  2: 2,2,2-Trifluoroethanol                        MSE = 0.025804


Fold  3: 2-Methyltetrahydrofuran [2-MeTHF]             MSE = 0.002894


Fold  4: Acetonitrile                                  MSE = 0.007722


Fold  5: Acetonitrile.Acetic Acid                      MSE = 0.024953


Fold  6: Butanone [MEK]                                MSE = 0.007777


Fold  7: Cyclohexane                                   MSE = 0.034916


Fold  8: DMA [N,N-Dimethylacetamide]                   MSE = 0.022409


Fold  9: Decanol                                       MSE = 0.011892


Fold 10: Diethyl Ether [Ether]                         MSE = 0.011901


Fold 11: Dihydrolevoglucosenone (Cyrene)               MSE = 0.007338


Fold 12: Dimethyl Carbonate                            MSE = 0.008718


Fold 13: Ethanol                                       MSE = 0.004134


Fold 14: Ethyl Acetate                                 MSE = 0.006988


Fold 15: Ethyl Lactate                                 MSE = 0.002889


Fold 16: Ethylene Glycol [1,2-Ethanediol]              MSE = 0.014864


Fold 17: IPA [Propan-2-ol]                             MSE = 0.003815


Fold 18: MTBE [tert-Butylmethylether]                  MSE = 0.003934


Fold 19: Methanol                                      MSE = 0.005559


Fold 20: Methyl Propionate                             MSE = 0.007628


Fold 21: THF [Tetrahydrofuran]                         MSE = 0.003183


Fold 22: Water.2,2,2-Trifluoroethanol                  MSE = 0.003446


Fold 23: Water.Acetonitrile                            MSE = 0.004503


Fold 24: tert-Butanol [2-Methylpropan-2-ol]            MSE = 0.014061

Simple Model Single Solvent CV MSE: 0.011822 +/- 0.008859


In [None]:
# Cross-validation over the mixture data: one fold per split yielded by
# generate_leave_one_ramp_out_splits, with a fresh SimpleModel trained per fold.
print("\n" + "="*60)
print("Running Mixture CV with Simple Model...")
print("="*60)

mix_splits = list(generate_leave_one_ramp_out_splits(X_mix, Y_full))
print(f"Number of folds: {len(mix_splits)}")

mix_errors, mix_preds, mix_true = {}, [], []

for fold_idx, (train_idx, test_idx) in enumerate(mix_splits):
    # Positional slices for this fold
    X_train, Y_train = X_mix.iloc[train_idx], Y_full.iloc[train_idx]
    X_test, Y_test = X_mix.iloc[test_idx], Y_full.iloc[test_idx]

    # The fold is labelled by the "A.B" name of its first held-out row
    test_mixture = X_test['SOLVENT NAME'].iloc[0]

    # Fit on the training rows, predict the held-out rows
    model = SimpleModel(n_models=3, hidden_dims=[128, 128, 64], dropout=0.2)
    model.fit(X_train, Y_train)
    preds = model.predict(X_test)

    # Fold MSE over all held-out rows and target columns
    fold_targets = Y_test.values
    mse = np.mean((preds - fold_targets) ** 2)
    mix_errors[test_mixture] = mse

    mix_preds.append(preds)
    mix_true.append(fold_targets)

    print(f"Fold {fold_idx+1:2d}: {test_mixture:55s} MSE = {mse:.6f}")

# Pooled MSE over every held-out prediction; the +/- value is the spread of the
# per-fold MSEs stored in mix_errors.
mix_preds = np.vstack(mix_preds)
mix_true = np.vstack(mix_true)
mix_mse = np.mean((mix_preds - mix_true) ** 2)
mix_std = np.std(list(mix_errors.values()))

print(f"\nSimple Model Mixture CV MSE: {mix_mse:.6f} +/- {mix_std:.6f}")

In [None]:
# Calculate overall CV score
# Combines the two task-level MSEs from the CV cells into one number and
# compares it with hard-coded reference scores.
print("\n" + "="*60)
print("Simple Model Overall Results")
print("="*60)

# all_true / mix_true are the row-stacked held-out targets, so len() is the
# number of evaluated rows in each task.
n_single = len(all_true)
n_mix = len(mix_true)
n_total = n_single + n_mix

# Row-count-weighted average of the two pooled MSEs.
overall_mse = (n_single * single_mse + n_mix * mix_mse) / n_total

print(f"\nSingle Solvent CV MSE: {single_mse:.6f} +/- {single_std:.6f} (n={n_single})")
print(f"Mixture CV MSE: {mix_mse:.6f} +/- {mix_std:.6f} (n={n_mix})")
print(f"Overall CV MSE: {overall_mse:.6f}")

# 0.008298 and 0.011081 are hard-coded reference CV scores (exp_030, exp_000).
# Here a positive percentage means this run has the LOWER (better) MSE.
print(f"\nBaseline (exp_030): CV = 0.008298")
print(f"exp_000 (original): CV = 0.011081")
print(f"Improvement vs exp_030: {(0.008298 - overall_mse) / 0.008298 * 100:.1f}%")
print(f"Improvement vs exp_000: {(0.011081 - overall_mse) / 0.011081 * 100:.1f}%")

# Calculate predicted LB based on CV-LB relationship
# Linear map with hard-coded slope 4.29 and intercept 0.0528; the fit that
# produced these coefficients is not shown in this notebook.
# NOTE(review): applying this line to exp_000's CV (0.011081) gives ~0.1003,
# i.e. ~0.0147 above its quoted LB of 0.0856, which does not match the quoted
# residual of -0.002136 - the residual presumably comes from a different fit; confirm.
predicted_lb = 4.29 * overall_mse + 0.0528
print(f"\nPredicted LB (based on CV-LB relationship): {predicted_lb:.4f}")
print(f"exp_000 actual LB: 0.0856")
print(f"exp_000 residual: -0.002136 (beat predicted LB)")

In [None]:
# Final Summary
# Prints the three CV scores computed in earlier cells plus fixed commentary.
print("\n" + "="*60)
print("EXPERIMENT 051 SUMMARY")
print("="*60)

print(f"\nSimple Model (exp_000 style):")
print(f"  Single Solvent CV: {single_mse:.6f}")
print(f"  Mixture CV: {mix_mse:.6f}")
print(f"  Overall CV: {overall_mse:.6f}")
# Signed relative change (this run minus reference): a NEGATIVE value means a
# lower MSE than the reference. This is the opposite sign convention to the
# "Improvement vs ..." lines printed by the overall-results cell.
print(f"  vs Baseline (exp_030): {(overall_mse - 0.008298) / 0.008298 * 100:+.1f}%")
print(f"  vs exp_000: {(overall_mse - 0.011081) / 0.011081 * 100:+.1f}%")

# The insights below are fixed text; they are not derived from this run's numbers.
print("\nKey insights:")
print("1. exp_000 has the best residual (-0.002136) despite worse CV")
print("2. This suggests simpler models may generalize better to LB")
print("3. The CV-LB relationship may not hold for simpler models")

# Only this branch depends on the run: compare against the exp_030 reference CV.
if overall_mse < 0.008298:
    print("\nCONCLUSION: Simple model IMPROVES overall CV!")
else:
    print("\nCONCLUSION: Simple model does NOT improve overall CV.")
    print("But exp_000's best residual suggests it may still perform better on LB.")

# Hard-coded bookkeeping values; update by hand when they change.
print(f"\nRemaining submissions: 5")
print(f"Best model: exp_030 (GP 0.15 + MLP 0.55 + LGBM 0.3) with CV 0.008298, LB 0.0877")