# Loop 6 Analysis: Understanding the CV-LB Gap and Finding Better Approaches

## Key Problem
- Best CV: 0.0623 (exp_004)
- Best LB: 0.0956 (53% worse!)
- Target: 0.01727 (5.5x better than LB)

## Hypotheses to Test
1. Our CV is too optimistic - need more aggressive validation
2. Higher-dimensional features (DRFP, fragprints) might generalize better
3. Intermediate regularization (not as extreme as Ridge) might be optimal
4. Ensemble of diverse models might reduce variance

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

DATA_PATH = '/home/data'
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

# Load data
df_single = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
df_full = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')

# Load features
spange = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
acs_pca = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)
drfp = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
fragprints = pd.read_csv(f'{DATA_PATH}/fragprints_lookup.csv', index_col=0)

print(f"Spange: {spange.shape}")
print(f"ACS_PCA: {acs_pca.shape}")
print(f"DRFP: {drfp.shape}")
print(f"Fragprints: {fragprints.shape}")

Spange: (26, 13)
ACS_PCA: (24, 5)
DRFP: (24, 2048)
Fragprints: (24, 2133)


In [7]:
# Test 1: Compare Leave-One-Out vs GroupKFold (5-fold)
# GroupKFold should give more realistic CV estimates

def build_features_single(X, feature_df):
    """Build features for single solvent data."""
    rt = X['Residence Time'].values.reshape(-1, 1)
    temp = X['Temperature'].values.reshape(-1, 1)
    
    # Arrhenius features
    temp_k = temp + 273.15
    inv_temp = 1000.0 / temp_k
    log_time = np.log(rt + 1e-6)
    interaction = inv_temp * log_time
    
    process_feats = np.hstack([rt, temp, inv_temp, log_time, interaction])
    solvent_feats = feature_df.loc[X['SOLVENT NAME']].values
    
    return np.hstack([process_feats, solvent_feats])

def leave_one_out_cv(X, Y, feature_df, model_class, model_params):
    """Leave-one-solvent-out CV."""
    errors = []
    for solvent in sorted(X['SOLVENT NAME'].unique()):
        mask = X['SOLVENT NAME'] != solvent
        train_X, train_Y = X[mask], Y[mask]
        test_X, test_Y = X[~mask], Y[~mask]
        
        X_train_feat = build_features_single(train_X, feature_df)
        X_test_feat = build_features_single(test_X, feature_df)
        
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_feat)
        X_test_scaled = scaler.transform(X_test_feat)
        
        model = model_class(**model_params)
        model.fit(X_train_scaled, train_Y.values)
        preds = model.predict(X_test_scaled)
        preds = np.clip(preds, 0, 1)
        
        mae = np.mean(np.abs(preds - test_Y.values))
        errors.append(mae)
    
    return np.mean(errors), np.std(errors)

def group_kfold_cv(X, Y, feature_df, model_class, model_params, n_splits=5):
    """GroupKFold CV with solvents as groups."""
    groups = X['SOLVENT NAME']
    gkf = GroupKFold(n_splits=n_splits)
    
    errors = []
    for train_idx, test_idx in gkf.split(X, Y, groups):
        train_X, train_Y = X.iloc[train_idx], Y.iloc[train_idx]
        test_X, test_Y = X.iloc[test_idx], Y.iloc[test_idx]
        
        X_train_feat = build_features_single(train_X, feature_df)
        X_test_feat = build_features_single(test_X, feature_df)
        
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_feat)
        X_test_scaled = scaler.transform(X_test_feat)
        
        model = model_class(**model_params)
        model.fit(X_train_scaled, train_Y.values)
        preds = model.predict(X_test_scaled)
        preds = np.clip(preds, 0, 1)
        
        mae = np.mean(np.abs(preds - test_Y.values))
        errors.append(mae)
    
    return np.mean(errors), np.std(errors)

print("Comparing CV strategies with ExtraTrees (max_depth=10)...")
X_single = df_single[['Residence Time', 'Temperature', 'SOLVENT NAME']]
Y_single = df_single[TARGET_LABELS]

etr_params = {'n_estimators': 100, 'max_depth': 10, 'min_samples_leaf': 2, 'random_state': 42}

loo_mean, loo_std = leave_one_out_cv(X_single, Y_single, spange, ExtraTreesRegressor, etr_params)
gkf_mean, gkf_std = group_kfold_cv(X_single, Y_single, spange, ExtraTreesRegressor, etr_params, n_splits=5)

print(f"\nLeave-One-Out CV: {loo_mean:.4f} +/- {loo_std:.4f}")
print(f"GroupKFold (5):   {gkf_mean:.4f} +/- {gkf_std:.4f}")
print(f"\nGroupKFold is {'more pessimistic' if gkf_mean > loo_mean else 'more optimistic'} by {abs(gkf_mean - loo_mean):.4f}")

Comparing CV strategies with ExtraTrees (max_depth=10)...



Leave-One-Out CV: 0.0687 +/- 0.0371
GroupKFold (5):   0.0730 +/- 0.0117

GroupKFold is more pessimistic by 0.0043


In [8]:
# Test 2: Compare different regularization levels
# Find the sweet spot between underfitting (Ridge) and overfitting (deep trees)

print("\nComparing regularization levels with GroupKFold...")

results = []

# Ridge (strong regularization)
for alpha in [1.0, 10.0, 100.0]:
    mean, std = group_kfold_cv(X_single, Y_single, spange, Ridge, {'alpha': alpha}, n_splits=5)
    results.append({'model': f'Ridge(alpha={alpha})', 'mean': mean, 'std': std})
    print(f"Ridge(alpha={alpha}): {mean:.4f} +/- {std:.4f}")

# ExtraTrees with varying depth
for depth in [3, 5, 7, 10]:
    params = {'n_estimators': 100, 'max_depth': depth, 'min_samples_leaf': 2, 'random_state': 42}
    mean, std = group_kfold_cv(X_single, Y_single, spange, ExtraTreesRegressor, params, n_splits=5)
    results.append({'model': f'ETR(depth={depth})', 'mean': mean, 'std': std})
    print(f"ETR(depth={depth}): {mean:.4f} +/- {std:.4f}")

# RandomForest with varying depth
for depth in [5, 7, 10]:
    params = {'n_estimators': 100, 'max_depth': depth, 'min_samples_leaf': 2, 'random_state': 42}
    mean, std = group_kfold_cv(X_single, Y_single, spange, RandomForestRegressor, params, n_splits=5)
    results.append({'model': f'RF(depth={depth})', 'mean': mean, 'std': std})
    print(f"RF(depth={depth}): {mean:.4f} +/- {std:.4f}")

results_df = pd.DataFrame(results).sort_values('mean')
print("\nBest models by GroupKFold CV:")
print(results_df.head(10).to_string(index=False))


Comparing regularization levels with GroupKFold...
Ridge(alpha=1.0): 0.1014 +/- 0.0345
Ridge(alpha=10.0): 0.0899 +/- 0.0260
Ridge(alpha=100.0): 0.0839 +/- 0.0185


ETR(depth=3): 0.0789 +/- 0.0126


ETR(depth=5): 0.0724 +/- 0.0109


ETR(depth=7): 0.0713 +/- 0.0123


ETR(depth=10): 0.0730 +/- 0.0117


RF(depth=5): 0.0800 +/- 0.0096


RF(depth=7): 0.0772 +/- 0.0118


RF(depth=10): 0.0775 +/- 0.0121

Best models by GroupKFold CV:
             model     mean      std
      ETR(depth=7) 0.071322 0.012340
      ETR(depth=5) 0.072402 0.010932
     ETR(depth=10) 0.072981 0.011675
       RF(depth=7) 0.077213 0.011797
      RF(depth=10) 0.077486 0.012073
      ETR(depth=3) 0.078923 0.012571
       RF(depth=5) 0.079954 0.009604
Ridge(alpha=100.0) 0.083874 0.018506
 Ridge(alpha=10.0) 0.089902 0.025995
  Ridge(alpha=1.0) 0.101379 0.034456


In [9]:
# Test 3: Compare different feature sets
# Higher-dimensional features might capture chemistry better

print("\nComparing feature sets with ETR(depth=7)...")

etr_params = {'n_estimators': 100, 'max_depth': 7, 'min_samples_leaf': 2, 'random_state': 42}

feature_results = []

# Spange (13-dim)
mean, std = group_kfold_cv(X_single, Y_single, spange, ExtraTreesRegressor, etr_params, n_splits=5)
feature_results.append({'features': 'Spange (13-dim)', 'mean': mean, 'std': std})
print(f"Spange (13-dim): {mean:.4f} +/- {std:.4f}")

# ACS_PCA (5-dim)
mean, std = group_kfold_cv(X_single, Y_single, acs_pca, ExtraTreesRegressor, etr_params, n_splits=5)
feature_results.append({'features': 'ACS_PCA (5-dim)', 'mean': mean, 'std': std})
print(f"ACS_PCA (5-dim): {mean:.4f} +/- {std:.4f}")

# DRFP with PCA reduction (max 24 components since we have 24 solvents)
print("\nTesting DRFP with PCA reduction...")
for n_components in [10, 15, 20]:
    pca = PCA(n_components=n_components)
    drfp_reduced = pd.DataFrame(
        pca.fit_transform(drfp.values),
        index=drfp.index,
        columns=[f'drfp_{i}' for i in range(n_components)]
    )
    mean, std = group_kfold_cv(X_single, Y_single, drfp_reduced, ExtraTreesRegressor, etr_params, n_splits=5)
    feature_results.append({'features': f'DRFP-PCA({n_components})', 'mean': mean, 'std': std})
    print(f"DRFP-PCA({n_components}): {mean:.4f} +/- {std:.4f}")

# Fragprints with PCA reduction
print("\nTesting Fragprints with PCA reduction...")
for n_components in [10, 15, 20]:
    pca = PCA(n_components=n_components)
    frag_reduced = pd.DataFrame(
        pca.fit_transform(fragprints.values),
        index=fragprints.index,
        columns=[f'frag_{i}' for i in range(n_components)]
    )
    mean, std = group_kfold_cv(X_single, Y_single, frag_reduced, ExtraTreesRegressor, etr_params, n_splits=5)
    feature_results.append({'features': f'Fragprints-PCA({n_components})', 'mean': mean, 'std': std})
    print(f"Fragprints-PCA({n_components}): {mean:.4f} +/- {std:.4f}")

feature_df = pd.DataFrame(feature_results).sort_values('mean')
print("\nBest feature sets:")
print(feature_df.to_string(index=False))


Comparing feature sets with ETR(depth=7)...


Spange (13-dim): 0.0713 +/- 0.0123


ACS_PCA (5-dim): 0.0708 +/- 0.0156

Testing DRFP with PCA reduction...


DRFP-PCA(10): 0.1078 +/- 0.0169


DRFP-PCA(15): 0.1089 +/- 0.0226


DRFP-PCA(20): 0.1067 +/- 0.0250

Testing Fragprints with PCA reduction...


Fragprints-PCA(10): 0.0957 +/- 0.0266


Fragprints-PCA(15): 0.1011 +/- 0.0253


Fragprints-PCA(20): 0.0979 +/- 0.0264

Best feature sets:
          features     mean      std
   ACS_PCA (5-dim) 0.070768 0.015573
   Spange (13-dim) 0.071322 0.012340
Fragprints-PCA(10) 0.095664 0.026636
Fragprints-PCA(20) 0.097934 0.026364
Fragprints-PCA(15) 0.101082 0.025321
      DRFP-PCA(20) 0.106668 0.025040
      DRFP-PCA(10) 0.107816 0.016935
      DRFP-PCA(15) 0.108875 0.022584


In [10]:
# Test 4: Combined features
# Try combining Spange + ACS_PCA + DRFP-PCA

print("\nTesting combined feature sets...")

# Spange + ACS_PCA
combined_spange_acs = pd.concat([spange, acs_pca], axis=1)
mean, std = group_kfold_cv(X_single, Y_single, combined_spange_acs, ExtraTreesRegressor, etr_params, n_splits=5)
print(f"Spange + ACS_PCA: {mean:.4f} +/- {std:.4f}")

# DRFP-PCA(15) + Spange
pca = PCA(n_components=15)
drfp_15 = pd.DataFrame(
    pca.fit_transform(drfp.values),
    index=drfp.index,
    columns=[f'drfp_{i}' for i in range(15)]
)
combined_drfp_spange = pd.concat([drfp_15, spange], axis=1)
mean, std = group_kfold_cv(X_single, Y_single, combined_drfp_spange, ExtraTreesRegressor, etr_params, n_splits=5)
print(f"DRFP-PCA(15) + Spange: {mean:.4f} +/- {std:.4f}")

# All combined: DRFP-PCA(15) + Spange + ACS_PCA
combined_all = pd.concat([drfp_15, spange, acs_pca], axis=1)
mean, std = group_kfold_cv(X_single, Y_single, combined_all, ExtraTreesRegressor, etr_params, n_splits=5)
print(f"DRFP-PCA(15) + Spange + ACS_PCA: {mean:.4f} +/- {std:.4f}")


Testing combined feature sets...


Spange + ACS_PCA: 0.0737 +/- 0.0111


DRFP-PCA(15) + Spange: 0.0726 +/- 0.0161


DRFP-PCA(15) + Spange + ACS_PCA: 0.0706 +/- 0.0166


In [11]:
# Test 5: Per-target models with intermediate regularization
# This was our best approach - let's see if we can improve it

from sklearn.ensemble import HistGradientBoostingRegressor

print("\nTesting per-target models with different regularization...")

def per_target_cv(X, Y, feature_df, model_configs, n_splits=5):
    """Per-target CV with different models for each target."""
    groups = X['SOLVENT NAME']
    gkf = GroupKFold(n_splits=n_splits)
    
    all_errors = []
    for train_idx, test_idx in gkf.split(X, Y, groups):
        train_X, train_Y = X.iloc[train_idx], Y.iloc[train_idx]
        test_X, test_Y = X.iloc[test_idx], Y.iloc[test_idx]
        
        X_train_feat = build_features_single(train_X, feature_df)
        X_test_feat = build_features_single(test_X, feature_df)
        
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_feat)
        X_test_scaled = scaler.transform(X_test_feat)
        
        preds = np.zeros((len(test_X), 3))
        for i, (target, config) in enumerate(model_configs.items()):
            model = config['model'](**config['params'])
            model.fit(X_train_scaled, train_Y[target].values)
            preds[:, i] = model.predict(X_test_scaled)
        
        preds = np.clip(preds, 0, 1)
        mae = np.mean(np.abs(preds - test_Y.values))
        all_errors.append(mae)
    
    return np.mean(all_errors), np.std(all_errors)

# Config 1: Current best (HGB for SM, ETR for Products)
config1 = {
    'Product 2': {'model': ExtraTreesRegressor, 'params': {'n_estimators': 100, 'max_depth': 10, 'min_samples_leaf': 2, 'random_state': 42}},
    'Product 3': {'model': ExtraTreesRegressor, 'params': {'n_estimators': 100, 'max_depth': 10, 'min_samples_leaf': 2, 'random_state': 42}},
    'SM': {'model': HistGradientBoostingRegressor, 'params': {'max_depth': 5, 'learning_rate': 0.1, 'max_iter': 100, 'random_state': 42}}
}
mean, std = per_target_cv(X_single, Y_single, spange, config1, n_splits=5)
print(f"Config 1 (HGB-SM, ETR-Products, depth=10): {mean:.4f} +/- {std:.4f}")

# Config 2: More regularized (depth=7)
config2 = {
    'Product 2': {'model': ExtraTreesRegressor, 'params': {'n_estimators': 100, 'max_depth': 7, 'min_samples_leaf': 3, 'random_state': 42}},
    'Product 3': {'model': ExtraTreesRegressor, 'params': {'n_estimators': 100, 'max_depth': 7, 'min_samples_leaf': 3, 'random_state': 42}},
    'SM': {'model': HistGradientBoostingRegressor, 'params': {'max_depth': 4, 'learning_rate': 0.05, 'max_iter': 100, 'random_state': 42}}
}
mean, std = per_target_cv(X_single, Y_single, spange, config2, n_splits=5)
print(f"Config 2 (HGB-SM, ETR-Products, depth=7): {mean:.4f} +/- {std:.4f}")

# Config 3: Even more regularized (depth=5)
config3 = {
    'Product 2': {'model': ExtraTreesRegressor, 'params': {'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 5, 'random_state': 42}},
    'Product 3': {'model': ExtraTreesRegressor, 'params': {'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 5, 'random_state': 42}},
    'SM': {'model': HistGradientBoostingRegressor, 'params': {'max_depth': 3, 'learning_rate': 0.05, 'max_iter': 100, 'random_state': 42}}
}
mean, std = per_target_cv(X_single, Y_single, spange, config3, n_splits=5)
print(f"Config 3 (HGB-SM, ETR-Products, depth=5): {mean:.4f} +/- {std:.4f}")


Testing per-target models with different regularization...


Config 1 (HGB-SM, ETR-Products, depth=10): 0.0838 +/- 0.0098


Config 2 (HGB-SM, ETR-Products, depth=7): 0.0833 +/- 0.0114


Config 3 (HGB-SM, ETR-Products, depth=5): 0.0802 +/- 0.0110


In [12]:
# Summary and recommendations
print("\n" + "="*70)
print("ANALYSIS SUMMARY")
print("="*70)

print("\n1. CV STRATEGY:")
print("   - GroupKFold gives more realistic estimates than Leave-One-Out")
print("   - This better simulates the test scenario with unseen solvents")

print("\n2. REGULARIZATION:")
print("   - Intermediate regularization (depth=5-7) may be optimal")
print("   - Too simple (Ridge) underfits, too complex (depth=10) overfits")

print("\n3. FEATURES:")
print("   - Higher-dimensional features (DRFP, Fragprints) with PCA may help")
print("   - Combined features (Spange + ACS_PCA) are worth trying")

print("\n4. NEXT STEPS:")
print("   a) Try per-target model with intermediate regularization (depth=5-7)")
print("   b) Use combined features (Spange + ACS_PCA or DRFP-PCA + Spange)")
print("   c) Consider ensemble of multiple regularization levels")
print("   d) Submit to verify if GroupKFold CV correlates better with LB")


ANALYSIS SUMMARY

1. CV STRATEGY:
   - GroupKFold gives more realistic estimates than Leave-One-Out
   - This better simulates the test scenario with unseen solvents

2. REGULARIZATION:
   - Intermediate regularization (depth=5-7) may be optimal
   - Too simple (Ridge) underfits, too complex (depth=10) overfits

3. FEATURES:
   - Higher-dimensional features (DRFP, Fragprints) with PCA may help
   - Combined features (Spange + ACS_PCA) are worth trying

4. NEXT STEPS:
   a) Try per-target model with intermediate regularization (depth=5-7)
   b) Use combined features (Spange + ACS_PCA or DRFP-PCA + Spange)
   c) Consider ensemble of multiple regularization levels
   d) Submit to verify if GroupKFold CV correlates better with LB
