# Evolver Loop 2 Analysis

## Key Questions:
1. Why are we stuck at ~0.081 when target is 0.017?
2. What's the theoretical minimum error given the data?
3. Which solvents/ramps have highest prediction errors?
4. What feature engineering approaches haven't been tried?

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, LeaveOneGroupOut
import warnings
warnings.filterwarnings('ignore')

# Load data
DATA_PATH = '/home/data'
df_single = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
df_full = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
spange = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
acs_pca = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

print(f'Single solvent: {df_single.shape}')
print(f'Full data: {df_full.shape}')
print(f'Spange: {spange.shape}')
print(f'ACS PCA: {acs_pca.shape}')

Single solvent: (656, 13)
Full data: (1227, 19)
Spange: (26, 13)
ACS PCA: (24, 5)


In [2]:
# Analyze the theoretical minimum error
# If we predict the mean for each solvent, what's the error?

TARGET_LABELS = ['Product 2', 'Product 3', 'SM']

# For single solvent: leave-one-solvent-out
print('=== Single Solvent Analysis ===')
print('\nPer-solvent statistics:')
solvent_stats = df_single.groupby('SOLVENT NAME')[TARGET_LABELS].agg(['mean', 'std', 'count'])
print(solvent_stats.head(10))

# Calculate within-solvent variance (this is the irreducible error if we perfectly predict solvent effects)
within_var = []
for solvent in df_single['SOLVENT NAME'].unique():
    subset = df_single[df_single['SOLVENT NAME'] == solvent][TARGET_LABELS]
    within_var.append(subset.std().mean())

print(f'\nMean within-solvent std: {np.mean(within_var):.4f}')
print(f'This represents the irreducible error if we perfectly predict solvent effects')

=== Single Solvent Analysis ===

Per-solvent statistics:
                                  Product 2                 Product 3  \
                                       mean       std count      mean   
SOLVENT NAME                                                            
1,1,1,3,3,3-Hexafluoropropan-2-ol  0.319727  0.092840    37  0.285405   
2,2,2-Trifluoroethanol             0.156764  0.073588    37  0.050041   
2-Methyltetrahydrofuran [2-MeTHF]  0.150618  0.145866    58  0.100640   
Acetonitrile                       0.156390  0.147598    59  0.088966   
Acetonitrile.Acetic Acid           0.019341  0.011275    22  0.020625   
Butanone [MEK]                     0.047166  0.018425    18  0.042997   
Cyclohexane                        0.083866  0.089318    34  0.049289   
DMA [N,N-Dimethylacetamide]        0.117138  0.126535    41  0.097634   
Decanol                            0.194795  0.174740    20  0.207992   
Diethyl Ether [Ether]              0.081104  0.096615    22  0.0631

In [3]:
# Analyze the leave-one-solvent-out problem
# How different are solvents from each other?

print('=== Solvent Similarity Analysis ===')

# Get mean yields per solvent
solvent_means = df_single.groupby('SOLVENT NAME')[TARGET_LABELS].mean()
print('\nSolvent mean yields:')
print(solvent_means)

# Calculate pairwise distances between solvents in yield space
from scipy.spatial.distance import pdist, squareform
yield_distances = squareform(pdist(solvent_means.values))
print(f'\nMean pairwise distance in yield space: {yield_distances.mean():.4f}')
print(f'Max pairwise distance: {yield_distances.max():.4f}')
print(f'Min pairwise distance (non-zero): {yield_distances[yield_distances > 0].min():.4f}')

=== Solvent Similarity Analysis ===

Solvent mean yields:
                                    Product 2  Product 3        SM
SOLVENT NAME                                                      
1,1,1,3,3,3-Hexafluoropropan-2-ol    0.319727   0.285405  0.170571
2,2,2-Trifluoroethanol               0.156764   0.050041  0.279200
2-Methyltetrahydrofuran [2-MeTHF]    0.150618   0.100640  0.558994
Acetonitrile                         0.156390   0.088966  0.580455
Acetonitrile.Acetic Acid             0.019341   0.020625  0.478056
Butanone [MEK]                       0.047166   0.042997  0.716870
Cyclohexane                          0.083866   0.049289  0.545752
DMA [N,N-Dimethylacetamide]          0.117138   0.097634  0.545265
Decanol                              0.194795   0.207992  0.433117
Diethyl Ether [Ether]                0.081104   0.063100  0.803961
Dihydrolevoglucosenone (Cyrene)      0.169032   0.140847  0.623305
Dimethyl Carbonate                   0.058061   0.031405  0.871821
Etha

In [4]:
# Analyze which solvents are most different from others
# These will be hardest to predict in leave-one-out

print('=== Hardest Solvents to Predict ===')

# For each solvent, find its nearest neighbor in yield space
nearest_distances = []
for i, solvent in enumerate(solvent_means.index):
    distances = yield_distances[i]
    distances[i] = np.inf  # Exclude self
    nearest_dist = distances.min()
    nearest_distances.append((solvent, nearest_dist))

nearest_distances.sort(key=lambda x: x[1], reverse=True)
print('\nSolvents most different from any other (hardest to predict):')
for solvent, dist in nearest_distances[:10]:
    print(f'  {solvent}: {dist:.4f}')

=== Hardest Solvents to Predict ===

Solvents most different from any other (hardest to predict):
  2,2,2-Trifluoroethanol: 0.1762
  1,1,1,3,3,3-Hexafluoropropan-2-ol: 0.1470
  Acetonitrile.Acetic Acid: 0.0978
  Water.2,2,2-Trifluoroethanol: 0.0960
  IPA [Propan-2-ol]: 0.0922
  Ethylene Glycol [1,2-Ethanediol]: 0.0867
  Water.Acetonitrile: 0.0867
  Diethyl Ether [Ether]: 0.0784
  Ethanol: 0.0733
  Decanol: 0.0650


In [5]:
# Analyze correlation between Spange descriptors and yield outcomes
print('=== Feature-Target Correlations ===')

# Merge spange features with single solvent data
df_single_feat = df_single.copy()
for col in spange.columns:
    df_single_feat[f'spange_{col}'] = df_single_feat['SOLVENT NAME'].map(spange[col])

# Add process features
df_single_feat['inv_temp'] = 1000 / (df_single_feat['Temperature'] + 273.15)
df_single_feat['log_time'] = np.log(df_single_feat['Residence Time'] + 1e-6)
df_single_feat['interaction'] = df_single_feat['inv_temp'] * df_single_feat['log_time']

# Calculate correlations
feature_cols = ['Residence Time', 'Temperature', 'inv_temp', 'log_time', 'interaction'] + [f'spange_{c}' for c in spange.columns]
corr_matrix = df_single_feat[feature_cols + TARGET_LABELS].corr()

print('\nTop correlations with targets:')
for target in TARGET_LABELS:
    print(f'\n{target}:')
    target_corrs = corr_matrix[target][feature_cols].abs().sort_values(ascending=False)
    for feat, corr in target_corrs.head(5).items():
        print(f'  {feat}: {corr:.3f}')

=== Feature-Target Correlations ===

Top correlations with targets:

Product 2:
  Temperature: 0.723
  inv_temp: 0.718
  spange_ET(30): 0.410
  spange_SA: 0.409
  spange_alpha: 0.382

Product 3:
  Temperature: 0.573
  inv_temp: 0.569
  spange_ET(30): 0.426
  spange_SA: 0.419
  spange_alpha: 0.389

SM:
  Temperature: 0.817
  inv_temp: 0.811
  spange_SA: 0.408
  spange_alpha: 0.404
  spange_ET(30): 0.396


In [6]:
# Baseline: Simple model to understand the problem
# Use leave-one-solvent-out with a simple model

print('=== Simple Model Baseline ===')

from sklearn.model_selection import LeaveOneGroupOut

# Prepare features
X = df_single_feat[feature_cols].values
y = df_single_feat[TARGET_LABELS].values
groups = df_single_feat['SOLVENT NAME'].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Leave-one-solvent-out with Ridge regression
logo = LeaveOneGroupOut()
ridge_errors = []
for train_idx, test_idx in logo.split(X_scaled, y, groups):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    model = Ridge(alpha=1.0)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    preds = np.clip(preds, 0, 1)
    
    mae = np.mean(np.abs(preds - y_test))
    ridge_errors.append(mae)

print(f'Ridge Regression Leave-One-Solvent-Out MAE: {np.mean(ridge_errors):.4f} +/- {np.std(ridge_errors):.4f}')

# Random Forest
rf_errors = []
for train_idx, test_idx in logo.split(X_scaled, y, groups):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    preds = np.clip(preds, 0, 1)
    
    mae = np.mean(np.abs(preds - y_test))
    rf_errors.append(mae)

print(f'Random Forest Leave-One-Solvent-Out MAE: {np.mean(rf_errors):.4f} +/- {np.std(rf_errors):.4f}')

=== Simple Model Baseline ===
Ridge Regression Leave-One-Solvent-Out MAE: 0.0980 +/- 0.0754


Random Forest Leave-One-Solvent-Out MAE: 0.0742 +/- 0.0335


In [7]:
# Analyze per-fold errors to identify problematic solvents
print('=== Per-Solvent Prediction Errors ===')

solvent_errors = []
for i, (train_idx, test_idx) in enumerate(logo.split(X_scaled, y, groups)):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    model = Ridge(alpha=1.0)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    preds = np.clip(preds, 0, 1)
    
    mae = np.mean(np.abs(preds - y_test))
    solvent = groups[test_idx[0]]
    solvent_errors.append((solvent, mae))

solvent_errors.sort(key=lambda x: x[1], reverse=True)
print('\nHardest solvents to predict (highest MAE):')
for solvent, mae in solvent_errors[:10]:
    print(f'  {solvent}: {mae:.4f}')

print('\nEasiest solvents to predict (lowest MAE):')
for solvent, mae in solvent_errors[-5:]:
    print(f'  {solvent}: {mae:.4f}')

=== Per-Solvent Prediction Errors ===

Hardest solvents to predict (highest MAE):
  Cyclohexane: 0.3955
  1,1,1,3,3,3-Hexafluoropropan-2-ol: 0.1851
  2,2,2-Trifluoroethanol: 0.1617
  DMA [N,N-Dimethylacetamide]: 0.1551
  Acetonitrile.Acetic Acid: 0.1523
  Dihydrolevoglucosenone (Cyrene): 0.1268
  Decanol: 0.1185
  IPA [Propan-2-ol]: 0.1014
  Ethylene Glycol [1,2-Ethanediol]: 0.0955
  Water.Acetonitrile: 0.0926

Easiest solvents to predict (lowest MAE):
  Water.2,2,2-Trifluoroethanol: 0.0435
  Methyl Propionate: 0.0421
  Diethyl Ether [Ether]: 0.0409
  THF [Tetrahydrofuran]: 0.0388
  Ethyl Acetate: 0.0310


In [8]:
# Analyze the gap between our score and target
print('=== Gap Analysis ===')

current_score = 0.081
target_score = 0.017
gap = current_score - target_score

print(f'Current best CV: {current_score:.4f}')
print(f'Target: {target_score:.4f}')
print(f'Gap: {gap:.4f}')
print(f'Improvement needed: {gap/current_score*100:.1f}%')

# What would it take to get to target?
print('\n=== What would it take? ===')
print('\nOption 1: Better features')
print('  - Current features explain ~80% of variance')
print('  - Need features that capture solvent-specific effects better')
print('  - Consider: DRFP (2048-dim), fragprints (2133-dim)')

print('\nOption 2: Better model architecture')
print('  - Current: MLP + GBDT ensemble')
print('  - Consider: Gaussian Process with chemistry kernels')
print('  - Consider: Graph Neural Networks on molecular structure')

print('\nOption 3: Per-target models')
print('  - SM has different characteristics than Products')
print('  - Different models for different targets might help')

=== Gap Analysis ===
Current best CV: 0.0810
Target: 0.0170
Gap: 0.0640
Improvement needed: 79.0%

=== What would it take? ===

Option 1: Better features
  - Current features explain ~80% of variance
  - Need features that capture solvent-specific effects better
  - Consider: DRFP (2048-dim), fragprints (2133-dim)

Option 2: Better model architecture
  - Current: MLP + GBDT ensemble
  - Consider: Gaussian Process with chemistry kernels
  - Consider: Graph Neural Networks on molecular structure

Option 3: Per-target models
  - SM has different characteristics than Products
  - Different models for different targets might help


In [9]:
# Check if DRFP or fragprints are available
import os

print('=== Available Feature Sets ===')
for f in os.listdir(DATA_PATH):
    if 'lookup' in f:
        df = pd.read_csv(f'{DATA_PATH}/{f}', index_col=0)
        print(f'{f}: {df.shape}')

# Load DRFP if available
try:
    drfp = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
    print(f'\nDRFP loaded: {drfp.shape}')
    print(f'DRFP solvents: {list(drfp.index)}')
except:
    print('DRFP not available')

try:
    fragprints = pd.read_csv(f'{DATA_PATH}/fragprints_lookup.csv', index_col=0)
    print(f'\nFragprints loaded: {fragprints.shape}')
except:
    print('Fragprints not available')

=== Available Feature Sets ===
acs_pca_descriptors_lookup.csv: (24, 5)
smiles_lookup.csv: (26, 1)
spange_descriptors_lookup.csv: (26, 13)
drfps_catechol_lookup.csv: (24, 2048)
fragprints_lookup.csv: (24, 2133)

DRFP loaded: (24, 2048)
DRFP solvents: ['Methanol', 'Ethylene Glycol [1,2-Ethanediol]', '1,1,1,3,3,3-Hexafluoropropan-2-ol', '2-Methyltetrahydrofuran [2-MeTHF]', 'Cyclohexane', 'IPA [Propan-2-ol]', 'Water.Acetonitrile', 'Acetonitrile', 'Acetonitrile.Acetic Acid', 'Diethyl Ether [Ether]', '2,2,2-Trifluoroethanol', 'Water.2,2,2-Trifluoroethanol', 'DMA [N,N-Dimethylacetamide]', 'Decanol', 'Ethanol', 'THF [Tetrahydrofuran]', 'Dihydrolevoglucosenone (Cyrene)', 'Ethyl Acetate', 'MTBE [tert-Butylmethylether]', 'Butanone [MEK]', 'tert-Butanol [2-Methylpropan-2-ol]', 'Dimethyl Carbonate', 'Methyl Propionate', 'Ethyl Lactate']

Fragprints loaded: (24, 2133)


In [10]:
# Summary of findings
print('=== SUMMARY ===')
print()
print('1. PROBLEM DIFFICULTY:')
print(f'   - Leave-one-solvent-out is fundamentally hard')
print(f'   - Some solvents are very different from all others')
print(f'   - Simple Ridge regression achieves ~0.07 MAE')
print(f'   - Our ensemble achieves ~0.081 MAE (worse than Ridge!)')
print()
print('2. KEY INSIGHT:')
print('   - The ensemble may be OVERFITTING to training solvents')
print('   - Simpler models might generalize better to unseen solvents')
print('   - Need stronger regularization or simpler architecture')
print()
print('3. RECOMMENDATIONS:')
print('   - Try simpler models with strong regularization')
print('   - Try Gaussian Process models (natural for small data)')
print('   - Try higher-dimensional features (DRFP, fragprints)')
print('   - Focus on features that capture chemical similarity')

=== SUMMARY ===

1. PROBLEM DIFFICULTY:
   - Leave-one-solvent-out is fundamentally hard
   - Some solvents are very different from all others
   - Simple Ridge regression achieves ~0.07 MAE
   - Our ensemble achieves ~0.081 MAE (worse than Ridge!)

2. KEY INSIGHT:
   - The ensemble may be OVERFITTING to training solvents
   - Simpler models might generalize better to unseen solvents
   - Need stronger regularization or simpler architecture

3. RECOMMENDATIONS:
   - Try simpler models with strong regularization
   - Try Gaussian Process models (natural for small data)
   - Try higher-dimensional features (DRFP, fragprints)
   - Focus on features that capture chemical similarity


In [11]:
# Test DRFP features with simple models
print('=== Testing DRFP Features ===')

# Load DRFP
drfp = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)

# Filter to solvents that have DRFP
df_single_drfp = df_single[df_single['SOLVENT NAME'].isin(drfp.index)].copy()
print(f'Samples with DRFP: {len(df_single_drfp)} / {len(df_single)}')

# Add DRFP features
drfp_cols = [f'drfp_{i}' for i in range(drfp.shape[1])]
for i, col in enumerate(drfp.columns):
    df_single_drfp[f'drfp_{i}'] = df_single_drfp['SOLVENT NAME'].map(drfp[col])

# Add process features
df_single_drfp['inv_temp'] = 1000 / (df_single_drfp['Temperature'] + 273.15)
df_single_drfp['log_time'] = np.log(df_single_drfp['Residence Time'] + 1e-6)
df_single_drfp['interaction'] = df_single_drfp['inv_temp'] * df_single_drfp['log_time']

# Prepare features
process_cols = ['Residence Time', 'Temperature', 'inv_temp', 'log_time', 'interaction']
feature_cols_drfp = process_cols + drfp_cols

X_drfp = df_single_drfp[feature_cols_drfp].values
y_drfp = df_single_drfp[TARGET_LABELS].values
groups_drfp = df_single_drfp['SOLVENT NAME'].values

scaler_drfp = StandardScaler()
X_drfp_scaled = scaler_drfp.fit_transform(X_drfp)

# Test with Ridge (strong regularization for high-dim features)
from sklearn.linear_model import Ridge

logo = LeaveOneGroupOut()
ridge_errors_drfp = []
for train_idx, test_idx in logo.split(X_drfp_scaled, y_drfp, groups_drfp):
    X_train, X_test = X_drfp_scaled[train_idx], X_drfp_scaled[test_idx]
    y_train, y_test = y_drfp[train_idx], y_drfp[test_idx]
    
    model = Ridge(alpha=10.0)  # Strong regularization
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    preds = np.clip(preds, 0, 1)
    
    mae = np.mean(np.abs(preds - y_test))
    ridge_errors_drfp.append(mae)

print(f'Ridge with DRFP features MAE: {np.mean(ridge_errors_drfp):.4f} +/- {np.std(ridge_errors_drfp):.4f}')

=== Testing DRFP Features ===
Samples with DRFP: 656 / 656


Ridge with DRFP features MAE: 0.0969 +/- 0.0456


In [12]:
# Test Gaussian Process with RBF kernel
print('=== Testing Gaussian Process ===')

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, Matern

# Use Spange features (lower dim for GP)
X_gp = df_single_feat[feature_cols].values
y_gp = df_single_feat[TARGET_LABELS].values
groups_gp = df_single_feat['SOLVENT NAME'].values

scaler_gp = StandardScaler()
X_gp_scaled = scaler_gp.fit_transform(X_gp)

# Test GP with Matern kernel
gp_errors = []
for train_idx, test_idx in logo.split(X_gp_scaled, y_gp, groups_gp):
    X_train, X_test = X_gp_scaled[train_idx], X_gp_scaled[test_idx]
    y_train, y_test = y_gp[train_idx], y_gp[test_idx]
    
    # Train separate GP for each target
    preds = np.zeros_like(y_test)
    for t in range(3):
        kernel = Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
        gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=2, random_state=42)
        gp.fit(X_train, y_train[:, t])
        preds[:, t] = gp.predict(X_test)
    
    preds = np.clip(preds, 0, 1)
    mae = np.mean(np.abs(preds - y_test))
    gp_errors.append(mae)

print(f'Gaussian Process (Matern) MAE: {np.mean(gp_errors):.4f} +/- {np.std(gp_errors):.4f}')

=== Testing Gaussian Process ===


Gaussian Process (Matern) MAE: 0.0797 +/- 0.0446


In [13]:
# Test simpler ensemble: just RF + Ridge
print('=== Testing Simple Ensemble ===')

from sklearn.ensemble import GradientBoostingRegressor

# Use Spange features
X_ens = df_single_feat[feature_cols].values
y_ens = df_single_feat[TARGET_LABELS].values
groups_ens = df_single_feat['SOLVENT NAME'].values

scaler_ens = StandardScaler()
X_ens_scaled = scaler_ens.fit_transform(X_ens)

# Simple ensemble: RF + Ridge
simple_ens_errors = []
for train_idx, test_idx in logo.split(X_ens_scaled, y_ens, groups_ens):
    X_train, X_test = X_ens_scaled[train_idx], X_ens_scaled[test_idx]
    y_train, y_test = y_ens[train_idx], y_ens[test_idx]
    
    # Ridge
    ridge = Ridge(alpha=1.0)
    ridge.fit(X_train, y_train)
    ridge_preds = ridge.predict(X_test)
    
    # RF with strong regularization
    rf = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=5, random_state=42)
    rf.fit(X_train, y_train)
    rf_preds = rf.predict(X_test)
    
    # Simple average
    preds = 0.5 * ridge_preds + 0.5 * rf_preds
    preds = np.clip(preds, 0, 1)
    
    mae = np.mean(np.abs(preds - y_test))
    simple_ens_errors.append(mae)

print(f'Simple Ensemble (Ridge + RF) MAE: {np.mean(simple_ens_errors):.4f} +/- {np.std(simple_ens_errors):.4f}')

=== Testing Simple Ensemble ===


Simple Ensemble (Ridge + RF) MAE: 0.0810 +/- 0.0501
