# Experiment 049: Manual OOD Solvent Handling

**Hypothesis**: Manually identify high-error solvents and use simpler features (Spange only) for them.

**Based on evaluator's suggestion**: The cosine similarity approach in exp_048 failed because all solvents had similarity >0.99. Instead, we manually identify the high-error solvents from exp_048 analysis:
- HFIP: 0.038 (4.3x mean error)
- Water.Ethanol: 0.028 (3.2x mean error)
- Acetonitrile.Acetic Acid: 0.022 (2.5x mean error)
- TFE: 0.015 (1.7x mean error)

**Implementation**:
- Model A: Full features (Spange + DRFP) with GP + MLP + LGBM ensemble for normal solvents
- Model B: Simple features (Spange only) with MLP + LGBM for high-error solvents
- Hard-coded list of high-error solvents

In [1]:
import sys
sys.path.insert(0, '/home/code/experiments/049_manual_ood_handling')

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, Matern
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Load data
from utils_local import load_data, load_features, generate_leave_one_out_splits, generate_leave_one_ramp_out_splits

print("Loading single solvent data...")
X_single_raw, Y_single = load_data("single_solvent")
print(f"Single solvent X shape: {X_single_raw.shape}, Y shape: {Y_single.shape}")

print("\nLoading full (mixture) data...")
X_full_raw, Y_full = load_data("full")
print(f"Full X shape: {X_full_raw.shape}, Y shape: {Y_full.shape}")

print(f"\nTarget columns: {Y_single.columns.tolist()}")

# Load features
spange = load_features("spange_descriptors")
drfp = load_features("drfps_catechol")
print(f"Spange features: {spange.shape}")
print(f"DRFP features: {drfp.shape}")

Loading single solvent data...
Single solvent X shape: (656, 3), Y shape: (656, 3)

Loading full (mixture) data...
Full X shape: (1227, 5), Y shape: (1227, 3)

Target columns: ['Product 2', 'Product 3', 'SM']
Spange features: (26, 13)
DRFP features: (24, 2048)


In [2]:
# Prepare datasets with features
def prepare_single_solvent_dataset(X_raw, spange, drfp):
    """Prepare single solvent dataset with all features"""
    solvent_name = X_raw['SOLVENT NAME'].values
    
    # Get Spange features
    spange_features = spange.loc[solvent_name].values
    
    # Get DRFP features
    drfp_features = drfp.loc[solvent_name].values
    
    # Get time and temperature
    time = X_raw['Residence Time'].values
    temp = X_raw['Temperature'].values
    
    # Create DataFrame with all features
    spange_cols = spange.columns.tolist()
    drfp_cols = [f'DRFP_{i}' for i in range(drfp.shape[1])]
    
    df = pd.DataFrame(spange_features, columns=spange_cols)
    df_drfp = pd.DataFrame(drfp_features, columns=drfp_cols)
    df = pd.concat([df, df_drfp], axis=1)
    
    df['TEMPERATURE'] = temp
    df['TIME'] = time
    df['SOLVENT NAME'] = solvent_name
    
    return df

def prepare_mixture_dataset(X_raw, spange, drfp):
    """Prepare mixture dataset with all features"""
    solvent_a = X_raw['SOLVENT A NAME'].values
    solvent_b = X_raw['SOLVENT B NAME'].values
    solvent_b_pct = X_raw['SolventB%'].values / 100.0  # Convert to fraction
    
    # Get Spange features for both solvents and mix
    spange_a = spange.loc[solvent_a].values
    spange_b = spange.loc[solvent_b].values
    spange_mix = (1 - solvent_b_pct[:, None]) * spange_a + solvent_b_pct[:, None] * spange_b
    
    # Get DRFP features for both solvents and mix
    drfp_a = drfp.loc[solvent_a].values
    drfp_b = drfp.loc[solvent_b].values
    drfp_mix = (1 - solvent_b_pct[:, None]) * drfp_a + solvent_b_pct[:, None] * drfp_b
    
    # Create combined solvent name
    solvent_name = [f"{a}.{b}" for a, b in zip(solvent_a, solvent_b)]
    
    # Get time and temperature
    time = X_raw['Residence Time'].values
    temp = X_raw['Temperature'].values
    
    # Create DataFrame with all features
    spange_cols = spange.columns.tolist()
    drfp_cols = [f'DRFP_{i}' for i in range(drfp.shape[1])]
    
    df = pd.DataFrame(spange_mix, columns=spange_cols)
    df_drfp = pd.DataFrame(drfp_mix, columns=drfp_cols)
    df = pd.concat([df, df_drfp], axis=1)
    
    df['TEMPERATURE'] = temp
    df['TIME'] = time
    df['SOLVENT NAME'] = solvent_name
    df['SOLVENT A NAME'] = solvent_a
    df['SOLVENT B NAME'] = solvent_b
    df['SolventB%'] = X_raw['SolventB%'].values
    
    return df

# Prepare datasets
X_single = prepare_single_solvent_dataset(X_single_raw, spange, drfp)
X_mix = prepare_mixture_dataset(X_full_raw, spange, drfp)

print(f"Single solvent dataset shape: {X_single.shape}")
print(f"Mixture dataset shape: {X_mix.shape}")
print(f"\\nUnique single solvents: {X_single['SOLVENT NAME'].nunique()}")
print(f"Unique mixtures: {X_mix['SOLVENT NAME'].nunique()}")

Single solvent dataset shape: (656, 2064)
Mixture dataset shape: (1227, 2067)
\nUnique single solvents: 24
Unique mixtures: 13


In [3]:
# Feature extraction functions
def get_spange_features(X_data):
    """Extract Spange descriptors (13 features)"""
    # Actual column names from spange_descriptors_lookup.csv
    spange_cols = ['dielectric constant', 'ET(30)', 'alpha', 'beta', 'pi*', 
                   'SA', 'SB', 'SP', 'SdP', 'N', 'n', 'f(n)', 'delta']
    return X_data[spange_cols].values

def get_drfp_features(X_data):
    """Extract DRFP features (high-variance only)"""
    drfp_cols = [col for col in X_data.columns if col.startswith('DRFP_')]
    drfp_data = X_data[drfp_cols].values
    # Filter by variance > 0
    var_mask = drfp_data.var(axis=0) > 0
    return drfp_data[:, var_mask], var_mask

def get_arrhenius_features(X_data):
    """Extract Arrhenius kinetics features"""
    T = X_data['TEMPERATURE'].values
    t = X_data['TIME'].values
    T_kelvin = T + 273.15
    inv_T = 1.0 / T_kelvin
    ln_t = np.log(t + 1e-6)
    interaction = inv_T * ln_t
    return np.column_stack([inv_T, ln_t, interaction, T, t])

def prepare_features(X_data, drfp_mask=None, include_drfp=True):
    """Prepare all features"""
    spange = get_spange_features(X_data)
    arrhenius = get_arrhenius_features(X_data)
    
    if include_drfp:
        drfp_cols = [col for col in X_data.columns if col.startswith('DRFP_')]
        drfp_data = X_data[drfp_cols].values
        if drfp_mask is not None:
            drfp_data = drfp_data[:, drfp_mask]
        features = np.hstack([spange, drfp_data, arrhenius])
    else:
        features = np.hstack([spange, arrhenius])
    
    return features

print("Feature extraction functions defined")
print(f"Spange columns: {['dielectric constant', 'ET(30)', 'alpha', 'beta', 'pi*', 'SA', 'SB', 'SP', 'SdP', 'N', 'n', 'f(n)', 'delta']}")

Feature extraction functions defined
Spange columns: ['dielectric constant', 'ET(30)', 'alpha', 'beta', 'pi*', 'SA', 'SB', 'SP', 'SdP', 'N', 'n', 'f(n)', 'delta']


In [4]:
# MLP Model
class MLPModel(nn.Module):
    def __init__(self, input_dim, hidden_dims=[128, 64], dropout=0.2):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev_dim = h_dim
        layers.append(nn.Linear(prev_dim, 3))  # 3 outputs
        layers.append(nn.Sigmoid())
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

def train_mlp(X_train, Y_train, input_dim, epochs=200, lr=5e-4, weight_decay=1e-4, hidden_dims=[128, 64]):
    """Train MLP model"""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = MLPModel(input_dim, hidden_dims=hidden_dims).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=20)
    criterion = nn.HuberLoss()
    
    X_tensor = torch.FloatTensor(X_train).to(device)
    Y_tensor = torch.FloatTensor(Y_train).to(device)
    
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        pred = model(X_tensor)
        loss = criterion(pred, Y_tensor)
        loss.backward()
        optimizer.step()
        scheduler.step(loss)
    
    return model

print("MLP model defined")

MLP model defined


In [5]:
# Manual OOD Handling Model
class ManualOODModel:
    """
    Model that uses different feature sets for high-error solvents.
    - Full features (Spange + DRFP) for normal solvents
    - Simple features (Spange only) for high-error solvents
    """
    def __init__(self, high_error_solvents, gp_weight=0.15, mlp_weight=0.55, lgbm_weight=0.3):
        self.high_error_solvents = high_error_solvents
        self.gp_weight = gp_weight
        self.mlp_weight = mlp_weight
        self.lgbm_weight = lgbm_weight
        
        # Full feature models
        self.scaler_full = StandardScaler()
        self.gp_models_full = []
        self.mlp_models_full = []
        self.lgbm_models_full = []
        
        # Simple feature models (for high-error solvents)
        self.scaler_simple = StandardScaler()
        self.mlp_models_simple = []
        self.lgbm_models_simple = []
        
        self.drfp_mask = None
        self.input_dim_full = None
        self.input_dim_simple = None
    
    def fit(self, X_train, Y_train):
        """Train both full and simple feature models"""
        # Get DRFP mask from training data
        drfp_cols = [col for col in X_train.columns if col.startswith('DRFP_')]
        drfp_data = X_train[drfp_cols].values
        self.drfp_mask = drfp_data.var(axis=0) > 0
        
        # Prepare full features
        X_full = prepare_features(X_train, self.drfp_mask, include_drfp=True)
        self.input_dim_full = X_full.shape[1]
        X_full_scaled = self.scaler_full.fit_transform(X_full)
        
        # Prepare simple features
        X_simple = prepare_features(X_train, None, include_drfp=False)
        self.input_dim_simple = X_simple.shape[1]
        X_simple_scaled = self.scaler_simple.fit_transform(X_simple)
        
        Y_values = Y_train.values
        
        # Train full feature models
        # GP (only on subset for speed)
        n_gp = min(200, len(X_full_scaled))
        idx_gp = np.random.choice(len(X_full_scaled), n_gp, replace=False)
        for i in range(3):
            kernel = Matern(nu=2.5) + WhiteKernel(noise_level=0.1)
            gp = GaussianProcessRegressor(kernel=kernel, alpha=0.1, n_restarts_optimizer=2)
            gp.fit(X_full_scaled[idx_gp], Y_values[idx_gp, i])
            self.gp_models_full.append(gp)
        
        # MLP (3 models for bagging)
        for _ in range(3):
            mlp = train_mlp(X_full_scaled, Y_values, self.input_dim_full, epochs=200, hidden_dims=[128, 64])
            self.mlp_models_full.append(mlp)
        
        # LightGBM
        lgbm_params = {
            'objective': 'regression',
            'metric': 'mse',
            'learning_rate': 0.03,
            'max_depth': 6,
            'num_leaves': 31,
            'reg_alpha': 0.1,
            'reg_lambda': 0.1,
            'verbose': -1
        }
        for i in range(3):
            model = lgb.LGBMRegressor(**lgbm_params, n_estimators=500)
            model.fit(X_full_scaled, Y_values[:, i])
            self.lgbm_models_full.append(model)
        
        # Train simple feature models
        # MLP (3 models for bagging)
        for _ in range(3):
            mlp = train_mlp(X_simple_scaled, Y_values, self.input_dim_simple, epochs=200, hidden_dims=[64, 32])
            self.mlp_models_simple.append(mlp)
        
        # LightGBM
        for i in range(3):
            model = lgb.LGBMRegressor(**lgbm_params, n_estimators=500)
            model.fit(X_simple_scaled, Y_values[:, i])
            self.lgbm_models_simple.append(model)
        
        return self
    
    def predict(self, X_test):
        """Predict using appropriate model based on solvent"""
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # Prepare features
        X_full = prepare_features(X_test, self.drfp_mask, include_drfp=True)
        X_full_scaled = self.scaler_full.transform(X_full)
        
        X_simple = prepare_features(X_test, None, include_drfp=False)
        X_simple_scaled = self.scaler_simple.transform(X_simple)
        
        # Get predictions from full feature models
        # GP predictions
        gp_preds = np.zeros((len(X_test), 3))
        for i, gp in enumerate(self.gp_models_full):
            gp_preds[:, i] = gp.predict(X_full_scaled)
        gp_preds = np.clip(gp_preds, 0, 1)
        
        # MLP predictions
        mlp_preds_full = []
        X_tensor = torch.FloatTensor(X_full_scaled).to(device)
        for mlp in self.mlp_models_full:
            mlp.eval()
            with torch.no_grad():
                pred = mlp(X_tensor).cpu().numpy()
            mlp_preds_full.append(pred)
        mlp_pred_full = np.mean(mlp_preds_full, axis=0)
        
        # LightGBM predictions
        lgbm_preds_full = np.zeros((len(X_test), 3))
        for i, model in enumerate(self.lgbm_models_full):
            lgbm_preds_full[:, i] = model.predict(X_full_scaled)
        lgbm_preds_full = np.clip(lgbm_preds_full, 0, 1)
        
        # Ensemble full feature predictions
        pred_full = self.gp_weight * gp_preds + self.mlp_weight * mlp_pred_full + self.lgbm_weight * lgbm_preds_full
        
        # Get predictions from simple feature models
        # MLP predictions
        mlp_preds_simple = []
        X_tensor_simple = torch.FloatTensor(X_simple_scaled).to(device)
        for mlp in self.mlp_models_simple:
            mlp.eval()
            with torch.no_grad():
                pred = mlp(X_tensor_simple).cpu().numpy()
            mlp_preds_simple.append(pred)
        mlp_pred_simple = np.mean(mlp_preds_simple, axis=0)
        
        # LightGBM predictions
        lgbm_preds_simple = np.zeros((len(X_test), 3))
        for i, model in enumerate(self.lgbm_models_simple):
            lgbm_preds_simple[:, i] = model.predict(X_simple_scaled)
        lgbm_preds_simple = np.clip(lgbm_preds_simple, 0, 1)
        
        # Ensemble simple feature predictions (no GP, just MLP + LGBM)
        pred_simple = 0.6 * mlp_pred_simple + 0.4 * lgbm_preds_simple
        
        # Select predictions based on solvent
        final_preds = np.zeros((len(X_test), 3))
        solvents = X_test['SOLVENT NAME'].values
        
        for idx in range(len(X_test)):
            solvent = solvents[idx]
            if solvent in self.high_error_solvents:
                final_preds[idx] = pred_simple[idx]
            else:
                final_preds[idx] = pred_full[idx]
        
        return np.clip(final_preds, 0, 1)

print("ManualOODModel defined")

ManualOODModel defined


In [6]:
# Check unique solvents in the data
print("Unique single solvents:")
unique_single = X_single['SOLVENT NAME'].unique()
for s in sorted(unique_single):
    count = (X_single['SOLVENT NAME'] == s).sum()
    print(f"  {s}: {count} samples")

print("\nUnique mixtures:")
unique_mix = X_mix['SOLVENT NAME'].unique()
for s in sorted(unique_mix):
    count = (X_mix['SOLVENT NAME'] == s).sum()
    print(f"  {s}: {count} samples")

Unique single solvents:
  1,1,1,3,3,3-Hexafluoropropan-2-ol: 37 samples
  2,2,2-Trifluoroethanol: 37 samples
  2-Methyltetrahydrofuran [2-MeTHF]: 58 samples
  Acetonitrile: 59 samples
  Acetonitrile.Acetic Acid: 22 samples
  Butanone [MEK]: 18 samples
  Cyclohexane: 34 samples
  DMA [N,N-Dimethylacetamide]: 41 samples
  Decanol: 20 samples
  Diethyl Ether [Ether]: 22 samples
  Dihydrolevoglucosenone (Cyrene): 18 samples
  Dimethyl Carbonate: 18 samples
  Ethanol: 42 samples
  Ethyl Acetate: 18 samples
  Ethyl Lactate: 17 samples
  Ethylene Glycol [1,2-Ethanediol]: 22 samples
  IPA [Propan-2-ol]: 5 samples
  MTBE [tert-Butylmethylether]: 16 samples
  Methanol: 36 samples
  Methyl Propionate: 18 samples
  THF [Tetrahydrofuran]: 21 samples
  Water.2,2,2-Trifluoroethanol: 22 samples
  Water.Acetonitrile: 37 samples
  tert-Butanol [2-Methylpropan-2-ol]: 18 samples

Unique mixtures:
  1,1,1,3,3,3-Hexafluoropropan-2-ol.2-Methyltetrahydrofuran [2-MeTHF]: 124 samples
  2,2,2-Trifluoroethanol.Wa

In [7]:
# Run CV for single solvents
print("Running Single Solvent CV (Leave-One-Out, 24 folds)...")
print("="*60)

print(f"Single solvent samples: {len(X_single)}")
print(f"Unique single solvents: {X_single['SOLVENT NAME'].nunique()}")

# Define high-error solvents for single solvents
high_error_single = [
    '1,1,1,3,3,3-Hexafluoropropan-2-ol',  # HFIP
    '2,2,2-Trifluoroethanol',  # TFE
    'Cyclohexane',
]

# Generate splits
splits = list(generate_leave_one_out_splits(X_single, Y_single))
print(f"Number of folds: {len(splits)}")

# Track per-solvent errors
solvent_errors = {}
all_preds = []
all_true = []

for fold_idx, (train_idx, test_idx) in enumerate(splits):
    X_train = X_single.iloc[train_idx]
    Y_train = Y_single.iloc[train_idx]
    X_test = X_single.iloc[test_idx]
    Y_test = Y_single.iloc[test_idx]
    
    test_solvent = X_test['SOLVENT NAME'].iloc[0]
    
    # Train model
    model = ManualOODModel(high_error_single)
    model.fit(X_train, Y_train)
    
    # Predict
    preds = model.predict(X_test)
    
    # Calculate MSE
    mse = np.mean((preds - Y_test.values) ** 2)
    solvent_errors[test_solvent] = mse
    
    all_preds.append(preds)
    all_true.append(Y_test.values)
    
    is_high_error = test_solvent in high_error_single
    marker = " [HIGH-ERROR]" if is_high_error else ""
    print(f"Fold {fold_idx+1:2d}: {test_solvent:45s} MSE = {mse:.6f}{marker}")

# Calculate overall MSE
all_preds = np.vstack(all_preds)
all_true = np.vstack(all_true)
single_mse = np.mean((all_preds - all_true) ** 2)
single_std = np.std([solvent_errors[s] for s in solvent_errors])

print(f"\nSingle Solvent CV MSE: {single_mse:.6f} +/- {single_std:.6f}")

Running Single Solvent CV (Leave-One-Out, 24 folds)...
Single solvent samples: 656
Unique single solvents: 24
Number of folds: 24


Fold  1: 1,1,1,3,3,3-Hexafluoropropan-2-ol             MSE = 0.024988 [HIGH-ERROR]


Fold  2: 2,2,2-Trifluoroethanol                        MSE = 0.035799 [HIGH-ERROR]


Fold  3: 2-Methyltetrahydrofuran [2-MeTHF]             MSE = 0.006002


Fold  4: Acetonitrile                                  MSE = 0.013579


Fold  5: Acetonitrile.Acetic Acid                      MSE = 0.036111


Fold  6: Butanone [MEK]                                MSE = 0.013852


Fold  7: Cyclohexane                                   MSE = 0.026071 [HIGH-ERROR]


Fold  8: DMA [N,N-Dimethylacetamide]                   MSE = 0.012694


Fold  9: Decanol                                       MSE = 0.011909


Fold 10: Diethyl Ether [Ether]                         MSE = 0.018893


Fold 11: Dihydrolevoglucosenone (Cyrene)               MSE = 0.001500


Fold 12: Dimethyl Carbonate                            MSE = 0.032256


Fold 13: Ethanol                                       MSE = 0.007453


Fold 14: Ethyl Acetate                                 MSE = 0.006220


Fold 15: Ethyl Lactate                                 MSE = 0.003403


Fold 16: Ethylene Glycol [1,2-Ethanediol]              MSE = 0.015261


Fold 17: IPA [Propan-2-ol]                             MSE = 0.013211


Fold 18: MTBE [tert-Butylmethylether]                  MSE = 0.019392


Fold 19: Methanol                                      MSE = 0.009849


Fold 20: Methyl Propionate                             MSE = 0.014964


Fold 21: THF [Tetrahydrofuran]                         MSE = 0.006307


Fold 22: Water.2,2,2-Trifluoroethanol                  MSE = 0.004766


Fold 23: Water.Acetonitrile                            MSE = 0.014114


Fold 24: tert-Butanol [2-Methylpropan-2-ol]            MSE = 0.008274

Single Solvent CV MSE: 0.014997 +/- 0.009658


In [8]:
# Analyze per-solvent errors
print("\n" + "="*60)
print("Per-Solvent Error Analysis")
print("="*60)

# Sort by error
sorted_errors = sorted(solvent_errors.items(), key=lambda x: x[1], reverse=True)

print("\nTop 10 highest error solvents:")
for i, (solvent, mse) in enumerate(sorted_errors[:10]):
    is_high_error = solvent in high_error_single
    marker = " [TARGETED]" if is_high_error else ""
    print(f"  {i+1:2d}. {solvent:40s}: {mse:.6f}{marker}")

print("\nTop 10 lowest error solvents:")
for i, (solvent, mse) in enumerate(sorted_errors[-10:]):
    print(f"  {i+1:2d}. {solvent:40s}: {mse:.6f}")

# Compare high-error solvents to baseline
print("\nHigh-error solvent comparison:")
print(f"{'Solvent':<45} {'This Exp':>12} {'Baseline':>12} {'Change':>12}")
print("-"*85)

# Baseline errors from exp_030 (approximate from previous experiments)
baseline_errors = {
    '1,1,1,3,3,3-Hexafluoropropan-2-ol': 0.096,  # HFIP
    '2,2,2-Trifluoroethanol': 0.042,  # TFE
    'Cyclohexane': 0.198,
}

for solvent in high_error_single:
    if solvent in solvent_errors:
        this_exp = solvent_errors[solvent]
        baseline = baseline_errors.get(solvent, 'N/A')
        if isinstance(baseline, float):
            change = (this_exp - baseline) / baseline * 100
            print(f"{solvent:<45} {this_exp:>12.6f} {baseline:>12.6f} {change:>11.1f}%")
        else:
            print(f"{solvent:<45} {this_exp:>12.6f} {baseline:>12}")

print(f"\nMean error for high-error solvents: {np.mean([solvent_errors.get(s, 0) for s in high_error_single if s in solvent_errors]):.6f}")
print(f"Mean error for other solvents: {np.mean([v for k, v in solvent_errors.items() if k not in high_error_single]):.6f}")


Per-Solvent Error Analysis

Top 10 highest error solvents:
   1. Acetonitrile.Acetic Acid                : 0.036111
   2. 2,2,2-Trifluoroethanol                  : 0.035799 [TARGETED]
   3. Dimethyl Carbonate                      : 0.032256
   4. Cyclohexane                             : 0.026071 [TARGETED]
   5. 1,1,1,3,3,3-Hexafluoropropan-2-ol       : 0.024988 [TARGETED]
   6. MTBE [tert-Butylmethylether]            : 0.019392
   7. Diethyl Ether [Ether]                   : 0.018893
   8. Ethylene Glycol [1,2-Ethanediol]        : 0.015261
   9. Methyl Propionate                       : 0.014964
  10. Water.Acetonitrile                      : 0.014114

Top 10 lowest error solvents:
   1. Decanol                                 : 0.011909
   2. Methanol                                : 0.009849
   3. tert-Butanol [2-Methylpropan-2-ol]      : 0.008274
   4. Ethanol                                 : 0.007453
   5. THF [Tetrahydrofuran]                   : 0.006307
   6. Ethyl Acetate  

In [9]:
# Run CV for mixtures
print("\n" + "="*60)
print("Running Mixture CV (Leave-One-Ramp-Out, 13 folds)...")
print("="*60)

print(f"Mixture samples: {len(X_mix)}")
print(f"Unique mixtures: {X_mix['SOLVENT NAME'].nunique()}")

# Define high-error solvents for mixtures (based on containing high-error single solvents)
high_error_mix = [
    '1,1,1,3,3,3-Hexafluoropropan-2-ol.2-Methyltetrahydrofuran [2-MeTHF]',  # Contains HFIP
    '2,2,2-Trifluoroethanol.Water.2,2,2-Trifluoroethanol',  # Contains TFE
    'Cyclohexane.IPA [Propan-2-ol]',  # Contains Cyclohexane
]

# Generate splits
mix_splits = list(generate_leave_one_ramp_out_splits(X_mix, Y_full))
print(f"Number of folds: {len(mix_splits)}")

# Track per-mixture errors
mix_errors = {}
mix_preds = []
mix_true = []

for fold_idx, (train_idx, test_idx) in enumerate(mix_splits):
    X_train = X_mix.iloc[train_idx]
    Y_train = Y_full.iloc[train_idx]
    X_test = X_mix.iloc[test_idx]
    Y_test = Y_full.iloc[test_idx]
    
    test_mixture = X_test['SOLVENT NAME'].iloc[0]
    
    # Train model
    model = ManualOODModel(high_error_mix)
    model.fit(X_train, Y_train)
    
    # Predict
    preds = model.predict(X_test)
    
    # Calculate MSE
    mse = np.mean((preds - Y_test.values) ** 2)
    mix_errors[test_mixture] = mse
    
    mix_preds.append(preds)
    mix_true.append(Y_test.values)
    
    is_high_error = test_mixture in high_error_mix
    marker = " [HIGH-ERROR]" if is_high_error else ""
    print(f"Fold {fold_idx+1:2d}: {test_mixture:55s} MSE = {mse:.6f}{marker}")

# Calculate overall MSE
mix_preds = np.vstack(mix_preds)
mix_true = np.vstack(mix_true)
mix_mse = np.mean((mix_preds - mix_true) ** 2)
mix_std = np.std([mix_errors[s] for s in mix_errors])

print(f"\nMixture CV MSE: {mix_mse:.6f} +/- {mix_std:.6f}")


Running Mixture CV (Leave-One-Ramp-Out, 13 folds)...
Mixture samples: 1227
Unique mixtures: 13
Number of folds: 13


Fold  1: 1,1,1,3,3,3-Hexafluoropropan-2-ol.2-Methyltetrahydrofuran [2-MeTHF] MSE = 0.021581 [HIGH-ERROR]


Fold  2: 2,2,2-Trifluoroethanol.Water.2,2,2-Trifluoroethanol     MSE = 0.020026 [HIGH-ERROR]


Fold  3: 2-Methyltetrahydrofuran [2-MeTHF].Diethyl Ether [Ether] MSE = 0.028090


Fold  4: Acetonitrile.Acetonitrile.Acetic Acid                   MSE = 0.030224


Fold  5: Cyclohexane.IPA [Propan-2-ol]                           MSE = 0.023929 [HIGH-ERROR]


Fold  6: DMA [N,N-Dimethylacetamide].Decanol                     MSE = 0.012839


Fold  7: Dihydrolevoglucosenone (Cyrene).Ethyl Acetate           MSE = 0.007300


Fold  8: Ethanol.THF [Tetrahydrofuran]                           MSE = 0.046145


Fold  9: MTBE [tert-Butylmethylether].Butanone [MEK]             MSE = 0.014158


Fold 10: Methanol.Ethylene Glycol [1,2-Ethanediol]               MSE = 0.012227


Fold 11: Methyl Propionate.Ethyl Lactate                         MSE = 0.079240


Fold 12: Water.Acetonitrile.Acetonitrile                         MSE = 0.011940


Fold 13: tert-Butanol [2-Methylpropan-2-ol].Dimethyl Carbonate   MSE = 0.014348

Mixture CV MSE: 0.023814 +/- 0.018582


In [10]:
# Calculate overall CV score
print("\n" + "="*60)
print("Overall Results")
print("="*60)

# Weighted average (same as competition)
n_single = len(all_true)
n_mix = len(mix_true)
n_total = n_single + n_mix

overall_mse = (n_single * single_mse + n_mix * mix_mse) / n_total

print(f"\nSingle Solvent CV MSE: {single_mse:.6f} +/- {single_std:.6f} (n={n_single})")
print(f"Mixture CV MSE: {mix_mse:.6f} +/- {mix_std:.6f} (n={n_mix})")
print(f"Overall CV MSE: {overall_mse:.6f}")

print(f"\nBaseline (exp_030): CV = 0.008298")
print(f"Improvement: {(0.008298 - overall_mse) / 0.008298 * 100:.1f}%")

# Check if this is better than baseline
if overall_mse < 0.008298:
    print("\n✓ BETTER than baseline! Consider submitting.")
else:
    print("\n✗ WORSE than baseline. Need to adjust approach.")


Overall Results

Single Solvent CV MSE: 0.014997 +/- 0.009658 (n=656)
Mixture CV MSE: 0.023814 +/- 0.018582 (n=1227)
Overall CV MSE: 0.020742

Baseline (exp_030): CV = 0.008298
Improvement: -150.0%

✗ WORSE than baseline. Need to adjust approach.


In [None]:
# Summary and next steps
print("\n" + "="*60)
print("Summary of Manual OOD Handling Experiment")
print("="*60)

print(f"\nManual OOD Handling CV MSE: {overall_mse:.6f}")
print(f"Baseline (exp_030): CV = 0.008298")
print(f"Improvement: {(0.008298 - overall_mse) / 0.008298 * 100:.1f}%")

print("\nPer-solvent comparison for high-error solvents:")
for solvent in high_error_single:
    if solvent in solvent_errors:
        this_exp = solvent_errors[solvent]
        baseline = baseline_errors.get(solvent, 'N/A')
        if isinstance(baseline, float):
            change = (this_exp - baseline) / baseline * 100
            print(f"  {solvent}: {this_exp:.6f} (was {baseline:.6f}, {change:+.1f}%)")

print("\nKey Insights:")
print("1. Manual OOD handling targets specific high-error solvents")
print("2. Uses simpler features (Spange only) for these solvents")
print("3. Uses full features (Spange + DRFP) for other solvents")

if overall_mse < 0.008298:
    print("\nCONCLUSION: Manual OOD handling IMPROVES overall CV.")
    print("Consider submitting to test if this changes the CV-LB relationship.")
else:
    print("\nCONCLUSION: Manual OOD handling does NOT improve overall CV.")
    print("The improvement on high-error solvents doesn't compensate for other solvents.")
    print("\nNext steps:")
    print("1. Try mixall-style ensemble (MLP + XGBoost + RF + LightGBM)")
    print("2. Try ensemble disagreement for OOD detection")
    print("3. Consider different high-error solvent list")

# Experiment 049b: Mixall-Style Ensemble\n\n**Hypothesis**: The mixall kernel's success is due to its ensemble structure (MLP + XGBoost + RF + LightGBM) with Spange-only features.\n\n**Implementation**:\n- Train MLP + XGBoost + RandomForest + LightGBM ensemble\n- Use Spange descriptors only (like mixall)\n- Use learned weights for ensemble combination"}

In [11]:
# Mixall-Style Ensemble Model
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

class MixallStyleModel:
    """
    Mixall-style ensemble: MLP + XGBoost + RandomForest + LightGBM
    Uses Spange descriptors only (no DRFP)
    """
    def __init__(self, mlp_weight=0.25, xgb_weight=0.25, rf_weight=0.25, lgbm_weight=0.25):
        self.mlp_weight = mlp_weight
        self.xgb_weight = xgb_weight
        self.rf_weight = rf_weight
        self.lgbm_weight = lgbm_weight
        
        self.scaler = StandardScaler()
        self.mlp_models = []
        self.xgb_models = []
        self.rf_models = []
        self.lgbm_models = []
        
        self.input_dim = None
    
    def fit(self, X_train, Y_train):
        """Train all models"""
        # Prepare features (Spange + Arrhenius only, no DRFP)
        X_features = prepare_features(X_train, None, include_drfp=False)
        self.input_dim = X_features.shape[1]
        X_scaled = self.scaler.fit_transform(X_features)
        
        Y_values = Y_train.values
        
        # Train MLP (3 models for bagging)
        for _ in range(3):
            mlp = train_mlp(X_scaled, Y_values, self.input_dim, epochs=200, hidden_dims=[64, 32])
            self.mlp_models.append(mlp)
        
        # Train XGBoost (per-target)
        xgb_params = {
            'objective': 'reg:squarederror',
            'learning_rate': 0.03,
            'max_depth': 6,
            'n_estimators': 500,
            'reg_alpha': 0.1,
            'reg_lambda': 0.1,
            'verbosity': 0
        }
        for i in range(3):
            model = xgb.XGBRegressor(**xgb_params)
            model.fit(X_scaled, Y_values[:, i])
            self.xgb_models.append(model)
        
        # Train RandomForest (per-target)
        rf_params = {
            'n_estimators': 200,
            'max_depth': 10,
            'min_samples_split': 5,
            'min_samples_leaf': 2,
            'n_jobs': -1
        }
        for i in range(3):
            model = RandomForestRegressor(**rf_params)
            model.fit(X_scaled, Y_values[:, i])
            self.rf_models.append(model)
        
        # Train LightGBM (per-target)
        lgbm_params = {
            'objective': 'regression',
            'metric': 'mse',
            'learning_rate': 0.03,
            'max_depth': 6,
            'num_leaves': 31,
            'reg_alpha': 0.1,
            'reg_lambda': 0.1,
            'verbose': -1,
            'n_estimators': 500
        }
        for i in range(3):
            model = lgb.LGBMRegressor(**lgbm_params)
            model.fit(X_scaled, Y_values[:, i])
            self.lgbm_models.append(model)
        
        return self
    
    def predict(self, X_test):
        """Predict using ensemble"""
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # Prepare features
        X_features = prepare_features(X_test, None, include_drfp=False)
        X_scaled = self.scaler.transform(X_features)
        
        # MLP predictions
        mlp_preds = []
        X_tensor = torch.FloatTensor(X_scaled).to(device)
        for mlp in self.mlp_models:
            mlp.eval()
            with torch.no_grad():
                pred = mlp(X_tensor).cpu().numpy()
            mlp_preds.append(pred)
        mlp_pred = np.mean(mlp_preds, axis=0)
        
        # XGBoost predictions
        xgb_pred = np.zeros((len(X_test), 3))
        for i, model in enumerate(self.xgb_models):
            xgb_pred[:, i] = model.predict(X_scaled)
        
        # RandomForest predictions
        rf_pred = np.zeros((len(X_test), 3))
        for i, model in enumerate(self.rf_models):
            rf_pred[:, i] = model.predict(X_scaled)
        
        # LightGBM predictions
        lgbm_pred = np.zeros((len(X_test), 3))
        for i, model in enumerate(self.lgbm_models):
            lgbm_pred[:, i] = model.predict(X_scaled)
        
        # Ensemble
        final_pred = (self.mlp_weight * mlp_pred + 
                      self.xgb_weight * xgb_pred + 
                      self.rf_weight * rf_pred + 
                      self.lgbm_weight * lgbm_pred)
        
        return np.clip(final_pred, 0, 1)

print("MixallStyleModel defined")

MixallStyleModel defined


In [12]:
# Run CV for single solvents with Mixall-style model
print("Running Single Solvent CV with Mixall-Style Model...")
print("="*60)

# Generate splits
splits = list(generate_leave_one_out_splits(X_single, Y_single))
print(f"Number of folds: {len(splits)}")

# Track per-solvent errors
solvent_errors_mixall = {}
all_preds_mixall = []
all_true_mixall = []

for fold_idx, (train_idx, test_idx) in enumerate(splits):
    X_train = X_single.iloc[train_idx]
    Y_train = Y_single.iloc[train_idx]
    X_test = X_single.iloc[test_idx]
    Y_test = Y_single.iloc[test_idx]
    
    test_solvent = X_test['SOLVENT NAME'].iloc[0]
    
    # Train model
    model = MixallStyleModel()
    model.fit(X_train, Y_train)
    
    # Predict
    preds = model.predict(X_test)
    
    # Calculate MSE
    mse = np.mean((preds - Y_test.values) ** 2)
    solvent_errors_mixall[test_solvent] = mse
    
    all_preds_mixall.append(preds)
    all_true_mixall.append(Y_test.values)
    
    print(f"Fold {fold_idx+1:2d}: {test_solvent:45s} MSE = {mse:.6f}")

# Calculate overall MSE
all_preds_mixall = np.vstack(all_preds_mixall)
all_true_mixall = np.vstack(all_true_mixall)
single_mse_mixall = np.mean((all_preds_mixall - all_true_mixall) ** 2)
single_std_mixall = np.std([solvent_errors_mixall[s] for s in solvent_errors_mixall])

print(f"\nMixall-Style Single Solvent CV MSE: {single_mse_mixall:.6f} +/- {single_std_mixall:.6f}")

Running Single Solvent CV with Mixall-Style Model...
Number of folds: 24


Fold  1: 1,1,1,3,3,3-Hexafluoropropan-2-ol             MSE = 0.034599


Fold  2: 2,2,2-Trifluoroethanol                        MSE = 0.027233


Fold  3: 2-Methyltetrahydrofuran [2-MeTHF]             MSE = 0.001749


Fold  4: Acetonitrile                                  MSE = 0.013828


Fold  5: Acetonitrile.Acetic Acid                      MSE = 0.036788


Fold  6: Butanone [MEK]                                MSE = 0.005288


Fold  7: Cyclohexane                                   MSE = 0.017445


Fold  8: DMA [N,N-Dimethylacetamide]                   MSE = 0.007404


Fold  9: Decanol                                       MSE = 0.009547


Fold 10: Diethyl Ether [Ether]                         MSE = 0.018687


Fold 11: Dihydrolevoglucosenone (Cyrene)               MSE = 0.006091


Fold 12: Dimethyl Carbonate                            MSE = 0.008467


Fold 13: Ethanol                                       MSE = 0.003670


Fold 14: Ethyl Acetate                                 MSE = 0.008380


Fold 15: Ethyl Lactate                                 MSE = 0.001896


Fold 16: Ethylene Glycol [1,2-Ethanediol]              MSE = 0.012994


Fold 17: IPA [Propan-2-ol]                             MSE = 0.008910


Fold 18: MTBE [tert-Butylmethylether]                  MSE = 0.007564


Fold 19: Methanol                                      MSE = 0.007013


Fold 20: Methyl Propionate                             MSE = 0.002998


Fold 21: THF [Tetrahydrofuran]                         MSE = 0.002521


Fold 22: Water.2,2,2-Trifluoroethanol                  MSE = 0.005419


Fold 23: Water.Acetonitrile                            MSE = 0.008579


Fold 24: tert-Butanol [2-Methylpropan-2-ol]            MSE = 0.006095

Mixall-Style Single Solvent CV MSE: 0.011532 +/- 0.009430


In [13]:
# Run CV for mixtures with Mixall-style model
print("\n" + "="*60)
print("Running Mixture CV with Mixall-Style Model...")
print("="*60)

# Generate splits
mix_splits = list(generate_leave_one_ramp_out_splits(X_mix, Y_full))
print(f"Number of folds: {len(mix_splits)}")

# Track per-mixture errors
mix_errors_mixall = {}
mix_preds_mixall = []
mix_true_mixall = []

for fold_idx, (train_idx, test_idx) in enumerate(mix_splits):
    X_train = X_mix.iloc[train_idx]
    Y_train = Y_full.iloc[train_idx]
    X_test = X_mix.iloc[test_idx]
    Y_test = Y_full.iloc[test_idx]
    
    test_mixture = X_test['SOLVENT NAME'].iloc[0]
    
    # Train model
    model = MixallStyleModel()
    model.fit(X_train, Y_train)
    
    # Predict
    preds = model.predict(X_test)
    
    # Calculate MSE
    mse = np.mean((preds - Y_test.values) ** 2)
    mix_errors_mixall[test_mixture] = mse
    
    mix_preds_mixall.append(preds)
    mix_true_mixall.append(Y_test.values)
    
    print(f"Fold {fold_idx+1:2d}: {test_mixture:55s} MSE = {mse:.6f}")

# Calculate overall MSE
mix_preds_mixall = np.vstack(mix_preds_mixall)
mix_true_mixall = np.vstack(mix_true_mixall)
mix_mse_mixall = np.mean((mix_preds_mixall - mix_true_mixall) ** 2)
mix_std_mixall = np.std([mix_errors_mixall[s] for s in mix_errors_mixall])

print(f"\nMixall-Style Mixture CV MSE: {mix_mse_mixall:.6f} +/- {mix_std_mixall:.6f}")


Running Mixture CV with Mixall-Style Model...
Number of folds: 13


Fold  1: 1,1,1,3,3,3-Hexafluoropropan-2-ol.2-Methyltetrahydrofuran [2-MeTHF] MSE = 0.021649


Fold  2: 2,2,2-Trifluoroethanol.Water.2,2,2-Trifluoroethanol     MSE = 0.017734


Fold  3: 2-Methyltetrahydrofuran [2-MeTHF].Diethyl Ether [Ether] MSE = 0.008472


Fold  4: Acetonitrile.Acetonitrile.Acetic Acid                   MSE = 0.021580


Fold  5: Cyclohexane.IPA [Propan-2-ol]                           MSE = 0.026328


Fold  6: DMA [N,N-Dimethylacetamide].Decanol                     MSE = 0.012825


Fold  7: Dihydrolevoglucosenone (Cyrene).Ethyl Acetate           MSE = 0.005199


Fold  8: Ethanol.THF [Tetrahydrofuran]                           MSE = 0.005679


Fold  9: MTBE [tert-Butylmethylether].Butanone [MEK]             MSE = 0.006432


Fold 10: Methanol.Ethylene Glycol [1,2-Ethanediol]               MSE = 0.015570


Fold 11: Methyl Propionate.Ethyl Lactate                         MSE = 0.003296


Fold 12: Water.Acetonitrile.Acetonitrile                         MSE = 0.019407


Fold 13: tert-Butanol [2-Methylpropan-2-ol].Dimethyl Carbonate   MSE = 0.022211

Mixall-Style Mixture CV MSE: 0.015620 +/- 0.007494


In [14]:
# Calculate overall CV score for Mixall-style model
print("\n" + "="*60)
print("Mixall-Style Model Overall Results")
print("="*60)

# Weighted average (same as competition)
n_single_mixall = len(all_true_mixall)
n_mix_mixall = len(mix_true_mixall)
n_total_mixall = n_single_mixall + n_mix_mixall

overall_mse_mixall = (n_single_mixall * single_mse_mixall + n_mix_mixall * mix_mse_mixall) / n_total_mixall

print(f"\nSingle Solvent CV MSE: {single_mse_mixall:.6f} +/- {single_std_mixall:.6f} (n={n_single_mixall})")
print(f"Mixture CV MSE: {mix_mse_mixall:.6f} +/- {mix_std_mixall:.6f} (n={n_mix_mixall})")
print(f"Overall CV MSE: {overall_mse_mixall:.6f}")

print(f"\nBaseline (exp_030): CV = 0.008298")
print(f"Improvement: {(0.008298 - overall_mse_mixall) / 0.008298 * 100:.1f}%")

# Check if this is better than baseline
if overall_mse_mixall < 0.008298:
    print("\n✓ BETTER than baseline! Consider submitting.")
else:
    print("\n✗ WORSE than baseline.")


Mixall-Style Model Overall Results

Single Solvent CV MSE: 0.011532 +/- 0.009430 (n=656)
Mixture CV MSE: 0.015620 +/- 0.007494 (n=1227)
Overall CV MSE: 0.014196

Baseline (exp_030): CV = 0.008298
Improvement: -71.1%

✗ WORSE than baseline.


# Experiment 049c: Ensemble Disagreement for OOD Detection\n\n**Hypothesis**: Use variance of ensemble predictions as OOD indicator. For high-uncertainty samples, use a more conservative prediction.\n\n**Implementation**:\n- Train GP + MLP + LGBM ensemble (same as baseline)\n- Compute prediction variance across ensemble members\n- For high-variance samples, shrink predictions toward the mean"}

In [15]:
# Ensemble Disagreement Model
class EnsembleDisagreementModel:
    """
    GP + MLP + LGBM ensemble with uncertainty-based prediction adjustment.
    For high-uncertainty samples, shrink predictions toward the training mean.
    """
    def __init__(self, gp_weight=0.15, mlp_weight=0.55, lgbm_weight=0.3, shrink_threshold=0.1):
        self.gp_weight = gp_weight
        self.mlp_weight = mlp_weight
        self.lgbm_weight = lgbm_weight
        self.shrink_threshold = shrink_threshold
        
        self.scaler = StandardScaler()
        self.gp_models = []
        self.mlp_models = []
        self.lgbm_models = []
        
        self.drfp_mask = None
        self.input_dim = None
        self.train_mean = None
    
    def fit(self, X_train, Y_train):
        """Train all models"""
        # Get DRFP mask from training data
        drfp_cols = [col for col in X_train.columns if col.startswith('DRFP_')]
        drfp_data = X_train[drfp_cols].values
        self.drfp_mask = drfp_data.var(axis=0) > 0
        
        # Prepare features
        X_features = prepare_features(X_train, self.drfp_mask, include_drfp=True)
        self.input_dim = X_features.shape[1]
        X_scaled = self.scaler.fit_transform(X_features)
        
        Y_values = Y_train.values
        self.train_mean = Y_values.mean(axis=0)
        
        # Train GP (only on subset for speed)
        n_gp = min(200, len(X_scaled))
        idx_gp = np.random.choice(len(X_scaled), n_gp, replace=False)
        for i in range(3):
            kernel = Matern(nu=2.5) + WhiteKernel(noise_level=0.1)
            gp = GaussianProcessRegressor(kernel=kernel, alpha=0.1, n_restarts_optimizer=2)
            gp.fit(X_scaled[idx_gp], Y_values[idx_gp, i])
            self.gp_models.append(gp)
        
        # Train MLP (3 models for bagging)
        for _ in range(3):
            mlp = train_mlp(X_scaled, Y_values, self.input_dim, epochs=200, hidden_dims=[128, 64])
            self.mlp_models.append(mlp)
        
        # Train LightGBM (per-target)
        lgbm_params = {
            'objective': 'regression',
            'metric': 'mse',
            'learning_rate': 0.03,
            'max_depth': 6,
            'num_leaves': 31,
            'reg_alpha': 0.1,
            'reg_lambda': 0.1,
            'verbose': -1,
            'n_estimators': 500
        }
        for i in range(3):
            model = lgb.LGBMRegressor(**lgbm_params)
            model.fit(X_scaled, Y_values[:, i])
            self.lgbm_models.append(model)
        
        return self
    
    def predict(self, X_test):
        """Predict with uncertainty-based adjustment"""
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # Prepare features
        X_features = prepare_features(X_test, self.drfp_mask, include_drfp=True)
        X_scaled = self.scaler.transform(X_features)
        
        # GP predictions
        gp_preds = np.zeros((len(X_test), 3))
        for i, gp in enumerate(self.gp_models):
            gp_preds[:, i] = gp.predict(X_scaled)
        gp_preds = np.clip(gp_preds, 0, 1)
        
        # MLP predictions (collect all individual predictions for variance)
        mlp_preds_list = []
        X_tensor = torch.FloatTensor(X_scaled).to(device)
        for mlp in self.mlp_models:
            mlp.eval()
            with torch.no_grad():
                pred = mlp(X_tensor).cpu().numpy()
            mlp_preds_list.append(pred)
        mlp_pred = np.mean(mlp_preds_list, axis=0)
        
        # LightGBM predictions
        lgbm_pred = np.zeros((len(X_test), 3))
        for i, model in enumerate(self.lgbm_models):
            lgbm_pred[:, i] = model.predict(X_scaled)
        lgbm_pred = np.clip(lgbm_pred, 0, 1)
        
        # Ensemble prediction
        ensemble_pred = self.gp_weight * gp_preds + self.mlp_weight * mlp_pred + self.lgbm_weight * lgbm_pred
        
        # Compute uncertainty (variance across ensemble members)
        all_preds = np.stack([gp_preds] + mlp_preds_list + [lgbm_pred], axis=0)  # (n_models, n_samples, 3)
        pred_std = np.std(all_preds, axis=0)  # (n_samples, 3)
        pred_uncertainty = pred_std.mean(axis=1)  # (n_samples,)
        
        # For high-uncertainty samples, shrink toward training mean
        final_pred = ensemble_pred.copy()
        for idx in range(len(X_test)):
            if pred_uncertainty[idx] > self.shrink_threshold:
                # Shrink factor based on uncertainty
                shrink_factor = min(1.0, (pred_uncertainty[idx] - self.shrink_threshold) / self.shrink_threshold)
                final_pred[idx] = (1 - shrink_factor) * ensemble_pred[idx] + shrink_factor * self.train_mean
        
        return np.clip(final_pred, 0, 1)

print("EnsembleDisagreementModel defined")

EnsembleDisagreementModel defined


In [16]:
# Run CV for single solvents with Ensemble Disagreement model
print("Running Single Solvent CV with Ensemble Disagreement Model...")
print("="*60)

# Generate splits
splits = list(generate_leave_one_out_splits(X_single, Y_single))
print(f"Number of folds: {len(splits)}")

# Track per-solvent errors
solvent_errors_ed = {}
all_preds_ed = []
all_true_ed = []

for fold_idx, (train_idx, test_idx) in enumerate(splits):
    X_train = X_single.iloc[train_idx]
    Y_train = Y_single.iloc[train_idx]
    X_test = X_single.iloc[test_idx]
    Y_test = Y_single.iloc[test_idx]
    
    test_solvent = X_test['SOLVENT NAME'].iloc[0]
    
    # Train model
    model = EnsembleDisagreementModel(shrink_threshold=0.1)
    model.fit(X_train, Y_train)
    
    # Predict
    preds = model.predict(X_test)
    
    # Calculate MSE
    mse = np.mean((preds - Y_test.values) ** 2)
    solvent_errors_ed[test_solvent] = mse
    
    all_preds_ed.append(preds)
    all_true_ed.append(Y_test.values)
    
    print(f"Fold {fold_idx+1:2d}: {test_solvent:45s} MSE = {mse:.6f}")

# Calculate overall MSE
all_preds_ed = np.vstack(all_preds_ed)
all_true_ed = np.vstack(all_true_ed)
single_mse_ed = np.mean((all_preds_ed - all_true_ed) ** 2)
single_std_ed = np.std([solvent_errors_ed[s] for s in solvent_errors_ed])

print(f"\nEnsemble Disagreement Single Solvent CV MSE: {single_mse_ed:.6f} +/- {single_std_ed:.6f}")

Running Single Solvent CV with Ensemble Disagreement Model...
Number of folds: 24


Fold  1: 1,1,1,3,3,3-Hexafluoropropan-2-ol             MSE = 0.030778


Fold  2: 2,2,2-Trifluoroethanol                        MSE = 0.021733


Fold  3: 2-Methyltetrahydrofuran [2-MeTHF]             MSE = 0.006043


Fold  4: Acetonitrile                                  MSE = 0.012389


Fold  5: Acetonitrile.Acetic Acid                      MSE = 0.034267


Fold  6: Butanone [MEK]                                MSE = 0.011477


Fold  7: Cyclohexane                                   MSE = 0.013214


Fold  8: DMA [N,N-Dimethylacetamide]                   MSE = 0.011573


Fold  9: Decanol                                       MSE = 0.012028


Fold 10: Diethyl Ether [Ether]                         MSE = 0.014244


Fold 11: Dihydrolevoglucosenone (Cyrene)               MSE = 0.003462


Fold 12: Dimethyl Carbonate                            MSE = 0.022822


Fold 13: Ethanol                                       MSE = 0.006270


Fold 14: Ethyl Acetate                                 MSE = 0.009070


Fold 15: Ethyl Lactate                                 MSE = 0.002501


Fold 16: Ethylene Glycol [1,2-Ethanediol]              MSE = 0.017318


Fold 17: IPA [Propan-2-ol]                             MSE = 0.015294


Fold 18: MTBE [tert-Butylmethylether]                  MSE = 0.010976


Fold 19: Methanol                                      MSE = 0.009079


Fold 20: Methyl Propionate                             MSE = 0.011165


Fold 21: THF [Tetrahydrofuran]                         MSE = 0.006371


Fold 22: Water.2,2,2-Trifluoroethanol                  MSE = 0.011496


Fold 23: Water.Acetonitrile                            MSE = 0.011375


Fold 24: tert-Butanol [2-Methylpropan-2-ol]            MSE = 0.010053

Ensemble Disagreement Single Solvent CV MSE: 0.013040 +/- 0.007535


In [17]:
# Run CV for mixtures with Ensemble Disagreement model
print("\n" + "="*60)
print("Running Mixture CV with Ensemble Disagreement Model...")
print("="*60)

# Generate splits
mix_splits = list(generate_leave_one_ramp_out_splits(X_mix, Y_full))
print(f"Number of folds: {len(mix_splits)}")

# Track per-mixture errors
mix_errors_ed = {}
mix_preds_ed = []
mix_true_ed = []

for fold_idx, (train_idx, test_idx) in enumerate(mix_splits):
    X_train = X_mix.iloc[train_idx]
    Y_train = Y_full.iloc[train_idx]
    X_test = X_mix.iloc[test_idx]
    Y_test = Y_full.iloc[test_idx]
    
    test_mixture = X_test['SOLVENT NAME'].iloc[0]
    
    # Train model
    model = EnsembleDisagreementModel(shrink_threshold=0.1)
    model.fit(X_train, Y_train)
    
    # Predict
    preds = model.predict(X_test)
    
    # Calculate MSE
    mse = np.mean((preds - Y_test.values) ** 2)
    mix_errors_ed[test_mixture] = mse
    
    mix_preds_ed.append(preds)
    mix_true_ed.append(Y_test.values)
    
    print(f"Fold {fold_idx+1:2d}: {test_mixture:55s} MSE = {mse:.6f}")

# Calculate overall MSE
mix_preds_ed = np.vstack(mix_preds_ed)
mix_true_ed = np.vstack(mix_true_ed)
mix_mse_ed = np.mean((mix_preds_ed - mix_true_ed) ** 2)
mix_std_ed = np.std([mix_errors_ed[s] for s in mix_errors_ed])

print(f"\nEnsemble Disagreement Mixture CV MSE: {mix_mse_ed:.6f} +/- {mix_std_ed:.6f}")


Running Mixture CV with Ensemble Disagreement Model...
Number of folds: 13


Fold  1: 1,1,1,3,3,3-Hexafluoropropan-2-ol.2-Methyltetrahydrofuran [2-MeTHF] MSE = 0.038437


Fold  2: 2,2,2-Trifluoroethanol.Water.2,2,2-Trifluoroethanol     MSE = 0.014005


Fold  3: 2-Methyltetrahydrofuran [2-MeTHF].Diethyl Ether [Ether] MSE = 0.047329


Fold  4: Acetonitrile.Acetonitrile.Acetic Acid                   MSE = 0.029822


Fold  5: Cyclohexane.IPA [Propan-2-ol]                           MSE = 0.060848


Fold  6: DMA [N,N-Dimethylacetamide].Decanol                     MSE = 0.012710


Fold  7: Dihydrolevoglucosenone (Cyrene).Ethyl Acetate           MSE = 0.009623


Fold  8: Ethanol.THF [Tetrahydrofuran]                           MSE = 0.046156


Fold  9: MTBE [tert-Butylmethylether].Butanone [MEK]             MSE = 0.009979


Fold 10: Methanol.Ethylene Glycol [1,2-Ethanediol]               MSE = 0.010140


Fold 11: Methyl Propionate.Ethyl Lactate                         MSE = 0.023332


Fold 12: Water.Acetonitrile.Acetonitrile                         MSE = 0.012784


Fold 13: tert-Butanol [2-Methylpropan-2-ol].Dimethyl Carbonate   MSE = 0.023170

Ensemble Disagreement Mixture CV MSE: 0.028421 +/- 0.016482


In [18]:
# Calculate overall CV score for Ensemble Disagreement model
print("\n" + "="*60)
print("Ensemble Disagreement Model Overall Results")
print("="*60)

# Weighted average (same as competition)
n_single_ed = len(all_true_ed)
n_mix_ed = len(mix_true_ed)
n_total_ed = n_single_ed + n_mix_ed

overall_mse_ed = (n_single_ed * single_mse_ed + n_mix_ed * mix_mse_ed) / n_total_ed

print(f"\nSingle Solvent CV MSE: {single_mse_ed:.6f} +/- {single_std_ed:.6f} (n={n_single_ed})")
print(f"Mixture CV MSE: {mix_mse_ed:.6f} +/- {mix_std_ed:.6f} (n={n_mix_ed})")
print(f"Overall CV MSE: {overall_mse_ed:.6f}")

print(f"\nBaseline (exp_030): CV = 0.008298")
print(f"Improvement: {(0.008298 - overall_mse_ed) / 0.008298 * 100:.1f}%")

# Check if this is better than baseline
if overall_mse_ed < 0.008298:
    print("\n✓ BETTER than baseline! Consider submitting.")
else:
    print("\n✗ WORSE than baseline.")


Ensemble Disagreement Model Overall Results

Single Solvent CV MSE: 0.013040 +/- 0.007535 (n=656)
Mixture CV MSE: 0.028421 +/- 0.016482 (n=1227)
Overall CV MSE: 0.023063

Baseline (exp_030): CV = 0.008298
Improvement: -177.9%

✗ WORSE than baseline.
