# Experiment 048: Hybrid Feature Ensemble

**Key Insight from exp_047:** Simpler features (Spange only) dramatically reduced error on outlier solvents:
- Cyclohexane: 0.014 vs 0.198 (93% better!)
- HFIP: 0.040 vs 0.096 (58% better!)

But simpler features hurt overall CV (13.2% worse).

**Hypothesis:** Combine strengths of both approaches:
- Model A: Full features (Spange + DRFP) - good for in-distribution solvents
- Model B: Simple features (Spange only) - good for OOD solvents
- Adaptive weighting based on solvent similarity to training set

**Implementation:**
1. Compute similarity of test solvent to training solvents (using Spange descriptors)
2. If similar (in-distribution): weight Model A higher
3. If dissimilar (out-of-distribution): weight Model B higher

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading
DATA_PATH = '/home/data'

def load_data(name="full"):
    if name == "full":
        df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        X = df[["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]]
    else:
        df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        X = df[["Residence Time", "Temperature", "SOLVENT NAME"]]
    Y = df[["SM", "Product 2", "Product 3"]]
    return X, Y

# Load feature lookup tables
spange_df = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
drfp_df = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)

SPANGE_COLS = [c for c in spange_df.columns if c != 'solvent smiles']
DRFP_COLS = [c for c in drfp_df.columns if str(c).isdigit() or isinstance(c, int)]

print(f'Spange: {len(SPANGE_COLS)} features')
print(f'DRFP: {len(DRFP_COLS)} features')

# Load data
X_single, Y_single = load_data('single_solvent')
X_full, Y_full = load_data('full')

print(f'Single solvent: {len(X_single)} samples')
print(f'Full data: {len(X_full)} samples')

Spange: 13 features
DRFP: 2048 features
Single solvent: 656 samples
Full data: 1227 samples


In [3]:
# Feature extraction functions
def get_features_full(X, data_type='single'):
    """Full features: Spange + DRFP + kinetics."""
    features_list = []
    
    for idx, row in X.iterrows():
        time_m = row['Residence Time']
        temp_c = row['Temperature']
        temp_k = temp_c + 273.15
        
        kinetics = np.array([time_m, temp_c, 1.0/temp_k, np.log(time_m+1), time_m/temp_k], dtype=np.float32)
        
        if data_type == 'single':
            solvent = row['SOLVENT NAME']
            spange = spange_df.loc[solvent, SPANGE_COLS].values.astype(np.float32) if solvent in spange_df.index else np.zeros(len(SPANGE_COLS), dtype=np.float32)
            drfp = drfp_df.loc[solvent, DRFP_COLS].values.astype(np.float32) if solvent in drfp_df.index else np.zeros(len(DRFP_COLS), dtype=np.float32)
        else:
            solvent_a, solvent_b = row['SOLVENT A NAME'], row['SOLVENT B NAME']
            pct_b = row['SolventB%'] / 100.0
            pct_a = 1 - pct_b
            
            sp_a = spange_df.loc[solvent_a, SPANGE_COLS].values.astype(np.float32) if solvent_a in spange_df.index else np.zeros(len(SPANGE_COLS), dtype=np.float32)
            sp_b = spange_df.loc[solvent_b, SPANGE_COLS].values.astype(np.float32) if solvent_b in spange_df.index else np.zeros(len(SPANGE_COLS), dtype=np.float32)
            spange = pct_a * sp_a + pct_b * sp_b
            
            dr_a = drfp_df.loc[solvent_a, DRFP_COLS].values.astype(np.float32) if solvent_a in drfp_df.index else np.zeros(len(DRFP_COLS), dtype=np.float32)
            dr_b = drfp_df.loc[solvent_b, DRFP_COLS].values.astype(np.float32) if solvent_b in drfp_df.index else np.zeros(len(DRFP_COLS), dtype=np.float32)
            drfp = pct_a * dr_a + pct_b * dr_b
        
        features = np.concatenate([kinetics, spange, drfp])
        features_list.append(features)
    
    return np.array(features_list, dtype=np.float32)

def get_features_simple(X, data_type='single'):
    """Simple features: Spange + kinetics only (no DRFP)."""
    features_list = []
    
    for idx, row in X.iterrows():
        time_m = row['Residence Time']
        temp_c = row['Temperature']
        temp_k = temp_c + 273.15
        
        kinetics = np.array([time_m, temp_c, 1.0/temp_k, np.log(time_m+1), time_m/temp_k], dtype=np.float32)
        
        if data_type == 'single':
            solvent = row['SOLVENT NAME']
            spange = spange_df.loc[solvent, SPANGE_COLS].values.astype(np.float32) if solvent in spange_df.index else np.zeros(len(SPANGE_COLS), dtype=np.float32)
        else:
            solvent_a, solvent_b = row['SOLVENT A NAME'], row['SOLVENT B NAME']
            pct_b = row['SolventB%'] / 100.0
            pct_a = 1 - pct_b
            
            sp_a = spange_df.loc[solvent_a, SPANGE_COLS].values.astype(np.float32) if solvent_a in spange_df.index else np.zeros(len(SPANGE_COLS), dtype=np.float32)
            sp_b = spange_df.loc[solvent_b, SPANGE_COLS].values.astype(np.float32) if solvent_b in spange_df.index else np.zeros(len(SPANGE_COLS), dtype=np.float32)
            spange = pct_a * sp_a + pct_b * sp_b
        
        features = np.concatenate([kinetics, spange])
        features_list.append(features)
    
    return np.array(features_list, dtype=np.float32)

print('Feature extraction functions defined')

Feature extraction functions defined


In [4]:
# MLP Model
class MLPModel(nn.Module):
    def __init__(self, input_dim, hidden_dims=[32, 16]):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([nn.Linear(prev_dim, h_dim), nn.BatchNorm1d(h_dim), nn.ReLU(), nn.Dropout(0.3)])
            prev_dim = h_dim
        layers.append(nn.Linear(prev_dim, 3))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

print('MLPModel defined')

MLPModel defined


In [5]:
# Compute solvent similarity using Spange descriptors
def compute_solvent_similarity(test_solvent, train_solvents):
    """Compute similarity of test solvent to training solvents.
    
    Returns a value between 0 (very different) and 1 (very similar).
    """
    # Get Spange descriptors for test solvent
    if test_solvent not in spange_df.index:
        return 0.5  # Default to middle if unknown
    
    test_desc = spange_df.loc[test_solvent, SPANGE_COLS].values.reshape(1, -1)
    
    # Get Spange descriptors for training solvents
    train_descs = []
    for s in train_solvents:
        if s in spange_df.index:
            train_descs.append(spange_df.loc[s, SPANGE_COLS].values)
    
    if len(train_descs) == 0:
        return 0.5
    
    train_descs = np.array(train_descs)
    
    # Compute cosine similarity to each training solvent
    similarities = cosine_similarity(test_desc, train_descs)[0]
    
    # Return max similarity (how similar is test to the most similar training solvent)
    return np.max(similarities)

# Test similarity computation
test_solvent = 'Cyclohexane'
train_solvents = [s for s in X_single['SOLVENT NAME'].unique() if s != test_solvent]
sim = compute_solvent_similarity(test_solvent, train_solvents)
print(f"Similarity of {test_solvent} to training set: {sim:.4f}")

test_solvent = 'Ethanol'
sim = compute_solvent_similarity(test_solvent, train_solvents)
print(f"Similarity of {test_solvent} to training set: {sim:.4f}")

Similarity of Cyclohexane to training set: 0.9995
Similarity of Ethanol to training set: 1.0000


In [6]:
# Hybrid Feature Ensemble Model
class HybridFeatureEnsemble:
    """Ensemble that adaptively weights full vs simple features based on solvent similarity.
    
    - For in-distribution solvents (high similarity): use full features (Spange + DRFP)
    - For OOD solvents (low similarity): use simple features (Spange only)
    """
    
    def __init__(self, data='single', similarity_threshold=0.9, alpha_id=0.8, alpha_ood=0.2):
        self.data_type = data
        self.similarity_threshold = similarity_threshold
        self.alpha_id = alpha_id  # Weight for full features when in-distribution
        self.alpha_ood = alpha_ood  # Weight for full features when OOD
        
        # Model A: Full features (GP + MLP + LGBM)
        self.scaler_full = None
        self.gp_models_full = []
        self.mlp_models_full = []
        self.lgbm_models_full = []
        
        # Model B: Simple features (MLP + LGBM)
        self.scaler_simple = None
        self.mlp_models_simple = []
        self.lgbm_models_simple = []
        
        self.train_solvents = None
    
    def train_model(self, X_train, y_train, epochs=200):
        y_np = y_train.values.astype(np.float32)
        
        # Store training solvents for similarity computation
        if self.data_type == 'single':
            self.train_solvents = X_train['SOLVENT NAME'].unique().tolist()
        else:
            self.train_solvents = X_train['SOLVENT A NAME'].unique().tolist()
        
        # === Train Model A: Full features ===
        X_full = get_features_full(X_train, self.data_type)
        self.scaler_full = StandardScaler()
        X_full_scaled = self.scaler_full.fit_transform(X_full)
        
        # GP models (use only first 18 features)
        self.gp_models_full = []
        for i in range(3):
            kernel = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
            gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=2, random_state=42)
            gp.fit(X_full_scaled[:, :18], y_np[:, i])
            self.gp_models_full.append(gp)
        
        # MLP models
        self.mlp_models_full = []
        for _ in range(3):
            model = MLPModel(X_full_scaled.shape[1]).to(device)
            optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
            
            X_tensor = torch.tensor(X_full_scaled).to(device)
            y_tensor = torch.tensor(y_np).to(device)
            dataset = TensorDataset(X_tensor, y_tensor)
            loader = DataLoader(dataset, batch_size=32, shuffle=True)
            
            model.train()
            for epoch in range(epochs):
                for X_batch, y_batch in loader:
                    optimizer.zero_grad()
                    pred = model(X_batch)
                    loss = nn.MSELoss()(pred, y_batch)
                    loss.backward()
                    optimizer.step()
                scheduler.step()
            model.eval()
            self.mlp_models_full.append(model)
        
        # LGBM models
        self.lgbm_models_full = []
        for i in range(3):
            lgbm = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.05, max_depth=5, verbose=-1)
            lgbm.fit(X_full_scaled, y_np[:, i])
            self.lgbm_models_full.append(lgbm)
        
        # === Train Model B: Simple features ===
        X_simple = get_features_simple(X_train, self.data_type)
        self.scaler_simple = StandardScaler()
        X_simple_scaled = self.scaler_simple.fit_transform(X_simple)
        
        # MLP models
        self.mlp_models_simple = []
        for _ in range(3):
            model = MLPModel(X_simple_scaled.shape[1]).to(device)
            optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
            
            X_tensor = torch.tensor(X_simple_scaled).to(device)
            y_tensor = torch.tensor(y_np).to(device)
            dataset = TensorDataset(X_tensor, y_tensor)
            loader = DataLoader(dataset, batch_size=32, shuffle=True)
            
            model.train()
            for epoch in range(epochs):
                for X_batch, y_batch in loader:
                    optimizer.zero_grad()
                    pred = model(X_batch)
                    loss = nn.MSELoss()(pred, y_batch)
                    loss.backward()
                    optimizer.step()
                scheduler.step()
            model.eval()
            self.mlp_models_simple.append(model)
        
        # LGBM models
        self.lgbm_models_simple = []
        for i in range(3):
            lgbm = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.05, max_depth=5, verbose=-1)
            lgbm.fit(X_simple_scaled, y_np[:, i])
            self.lgbm_models_simple.append(lgbm)
        
        return self
    
    def predict(self, X_test):
        # Get features
        X_full = get_features_full(X_test, self.data_type)
        X_full_scaled = self.scaler_full.transform(X_full)
        
        X_simple = get_features_simple(X_test, self.data_type)
        X_simple_scaled = self.scaler_simple.transform(X_simple)
        
        # Model A predictions (full features)
        gp_preds = np.zeros((len(X_test), 3))
        for i, gp in enumerate(self.gp_models_full):
            gp_preds[:, i] = gp.predict(X_full_scaled[:, :18])
        
        mlp_preds_full = []
        for model in self.mlp_models_full:
            with torch.no_grad():
                pred = model(torch.tensor(X_full_scaled).to(device)).cpu().numpy()
            mlp_preds_full.append(pred)
        mlp_preds_full = np.mean(mlp_preds_full, axis=0)
        
        lgbm_preds_full = np.zeros((len(X_test), 3))
        for i, lgbm in enumerate(self.lgbm_models_full):
            lgbm_preds_full[:, i] = lgbm.predict(X_full_scaled)
        
        pred_full = 0.15 * gp_preds + 0.55 * mlp_preds_full + 0.3 * lgbm_preds_full
        
        # Model B predictions (simple features)
        mlp_preds_simple = []
        for model in self.mlp_models_simple:
            with torch.no_grad():
                pred = model(torch.tensor(X_simple_scaled).to(device)).cpu().numpy()
            mlp_preds_simple.append(pred)
        mlp_preds_simple = np.mean(mlp_preds_simple, axis=0)
        
        lgbm_preds_simple = np.zeros((len(X_test), 3))
        for i, lgbm in enumerate(self.lgbm_models_simple):
            lgbm_preds_simple[:, i] = lgbm.predict(X_simple_scaled)
        
        pred_simple = 0.6 * mlp_preds_simple + 0.4 * lgbm_preds_simple
        
        # Adaptive weighting based on solvent similarity
        final_preds = []
        for idx, row in X_test.iterrows():
            if self.data_type == 'single':
                test_solvent = row['SOLVENT NAME']
            else:
                test_solvent = row['SOLVENT A NAME']
            
            similarity = compute_solvent_similarity(test_solvent, self.train_solvents)
            
            # Adaptive alpha: higher similarity -> more weight on full features
            if similarity >= self.similarity_threshold:
                alpha = self.alpha_id  # In-distribution: use mostly full features
            else:
                alpha = self.alpha_ood  # OOD: use mostly simple features
            
            i = X_test.index.get_loc(idx)
            final_pred = alpha * pred_full[i] + (1 - alpha) * pred_simple[i]
            final_preds.append(final_pred)
        
        final_preds = np.array(final_preds)
        final_preds = np.clip(final_preds, 0, 1)
        
        return torch.tensor(final_preds, dtype=torch.float32)

print('HybridFeatureEnsemble defined')

HybridFeatureEnsemble defined


In [7]:
# Test hybrid feature ensemble on single solvent data
print("Testing hybrid feature ensemble on single solvent data...")
print()

all_solvents = sorted(X_single["SOLVENT NAME"].unique())
fold_mses = []
fold_details = []

for test_solvent in all_solvents:
    mask = X_single["SOLVENT NAME"] != test_solvent
    
    model = HybridFeatureEnsemble(data='single', similarity_threshold=0.9, alpha_id=0.8, alpha_ood=0.2)
    model.train_model(X_single[mask], Y_single[mask], epochs=150)
    preds = model.predict(X_single[~mask])
    
    actuals = Y_single[~mask].values
    mse = np.mean((actuals - preds.numpy())**2)
    fold_mses.append(mse)
    
    # Compute similarity for this fold
    train_solvents = [s for s in all_solvents if s != test_solvent]
    sim = compute_solvent_similarity(test_solvent, train_solvents)
    fold_details.append((test_solvent, mse, sim))
    print(f"{test_solvent}: MSE = {mse:.6f}, Similarity = {sim:.4f}")

mean_mse = np.mean(fold_mses)
std_mse = np.std(fold_mses)
print(f"\n=== Hybrid Feature Ensemble CV Results ===")
print(f"Mean MSE: {mean_mse:.6f} +/- {std_mse:.6f}")
print(f"Baseline (exp_030): CV = 0.008298")

Testing hybrid feature ensemble on single solvent data...



1,1,1,3,3,3-Hexafluoropropan-2-ol: MSE = 0.038187, Similarity = 0.9952


2,2,2-Trifluoroethanol: MSE = 0.015347, Similarity = 0.9984


2-Methyltetrahydrofuran [2-MeTHF]: MSE = 0.002187, Similarity = 0.9998


Acetonitrile: MSE = 0.008555, Similarity = 0.9994


Acetonitrile.Acetic Acid: MSE = 0.021528, Similarity = 0.9994


Butanone [MEK]: MSE = 0.004194, Similarity = 0.9994


Cyclohexane: MSE = 0.004116, Similarity = 0.9995


DMA [N,N-Dimethylacetamide]: MSE = 0.007208, Similarity = 0.9994


Decanol: MSE = 0.012753, Similarity = 0.9977


Diethyl Ether [Ether]: MSE = 0.012611, Similarity = 0.9992


Dihydrolevoglucosenone (Cyrene): MSE = 0.007900, Similarity = 0.9999


Dimethyl Carbonate: MSE = 0.012755, Similarity = 0.9999


Ethanol: MSE = 0.002654, Similarity = 0.9990


Ethyl Acetate: MSE = 0.001168, Similarity = 0.9992


Ethyl Lactate: MSE = 0.002163, Similarity = 0.9992


Ethylene Glycol [1,2-Ethanediol]: MSE = 0.014847, Similarity = 0.9990


IPA [Propan-2-ol]: MSE = 0.011289, Similarity = 0.9992


MTBE [tert-Butylmethylether]: MSE = 0.007583, Similarity = 0.9989


Methanol: MSE = 0.004234, Similarity = 0.9990


Methyl Propionate: MSE = 0.001243, Similarity = 0.9992


THF [Tetrahydrofuran]: MSE = 0.001263, Similarity = 0.9998


Water.2,2,2-Trifluoroethanol: MSE = 0.004976, Similarity = 0.9983


Water.Acetonitrile: MSE = 0.011855, Similarity = 0.9923


tert-Butanol [2-Methylpropan-2-ol]: MSE = 0.002610, Similarity = 0.9991

=== Hybrid Feature Ensemble CV Results ===
Mean MSE: 0.008884 +/- 0.008128
Baseline (exp_030): CV = 0.008298
