# Experiment 043: Non-Linear Mixture Features

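**Problem:** The fitted CV-LB relationship has an intercept (0.0525) that is larger than the target score (0.0347). Since the intercept is the LB score the fit predicts at a CV score of zero, the current approach CANNOT reach the target however much CV improves; the relationship itself has to change.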

**Hypothesis:** Linear mixing of solvent descriptors may not capture non-ideal mixture behavior. Adding interaction terms could:
1. Improve mixture predictions
2. Change the CV-LB relationship
3. Potentially reduce the intercept

**Approaches:**
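1. Interaction features: `spange_a * spange_b` (element-wise product; the implementation below additionally scales it by `4 * pct_a * pct_b`, which is largest for a 50/50 mixture)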
2. Difference features: `|spange_a - spange_b|` (absolute difference)
3. Polynomial mixing: `a*A + b*B + c*A*B`
4. Ratio features: `spange_a / (spange_a + spange_b)`

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Seed the NumPy and PyTorch global RNGs with the same fixed value
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading
DATA_PATH = '/home/data'

def load_data(name="full"):
    """Read one of the catechol yield CSVs and split it into inputs and targets.

    ``name == "full"`` selects the mixture file, whose inputs include both
    solvent names and ``SolventB%``. Any other value selects the
    single-solvent file.

    Returns:
        ``(X, Y)`` where ``X`` holds the input columns and ``Y`` holds the
        ``SM``, ``Product 2`` and ``Product 3`` columns.
    """
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_cols = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_cols = ["Residence Time", "Temperature", "SOLVENT NAME"]

    frame = pd.read_csv(csv_path)
    return frame[input_cols], frame[["SM", "Product 2", "Product 3"]]

# Descriptor lookup tables; the first CSV column becomes the row index,
# which the feature code later queries by solvent name.
spange_df = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
drfp_df = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)

# Keep every Spange column except the SMILES string
SPANGE_COLS = [col for col in spange_df.columns if col != 'solvent smiles']
# Keep DRFP columns whose label is an integer or an all-digit string
DRFP_COLS = [col for col in drfp_df.columns if isinstance(col, int) or str(col).isdigit()]

print(f'Spange: {len(SPANGE_COLS)} features')
print(f'DRFP: {len(DRFP_COLS)} features')

# Reaction datasets: single-solvent runs and two-solvent mixture runs
X_single, Y_single = load_data('single_solvent')
X_full, Y_full = load_data('full')

print(f'Single solvent: {len(X_single)} samples')
print(f'Full data: {len(X_full)} samples')

Spange: 13 features
DRFP: 2048 features
Single solvent: 656 samples
Full data: 1227 samples


In [3]:
# Feature extraction with non-linear mixture features
def get_features_nonlinear(X, data_type='single', use_interaction=True, use_difference=True, use_ratio=False):
    """Build the per-row feature matrix, optionally with non-linear mixture terms.

    Every row starts with five kinetics features derived from
    ``Residence Time`` and ``Temperature``, followed by solvent descriptors:

    * ``data_type == 'single'``: the Spange and DRFP rows of ``SOLVENT NAME``.
      The ``use_*`` flags are ignored in this mode.
    * any other ``data_type``: the row is treated as an A/B mixture. Spange and
      DRFP rows are blended linearly by ``SolventB%``; the enabled extra Spange
      terms are placed between the blended Spange part and the blended DRFP
      part, in the order interaction, difference, ratio.

    A solvent that is absent from a lookup table contributes an all-zero
    descriptor vector.

    Returns:
        A float32 array with one row of features per row of ``X``.
    """

    def descriptor_row(table, columns, solvent):
        # Unknown solvents fall back to zeros rather than raising
        if solvent not in table.index:
            return np.zeros(len(columns), dtype=np.float32)
        return table.loc[solvent, columns].values.astype(np.float32)

    is_single = data_type == 'single'
    rows = []

    for _, row in X.iterrows():
        residence = row['Residence Time']
        celsius = row['Temperature']
        kelvin = celsius + 273.15

        # time, temperature, inverse absolute temperature, log-time, time/temperature
        kinetics = np.array(
            [residence, celsius, 1.0 / kelvin, np.log(residence + 1), residence / kelvin],
            dtype=np.float32,
        )

        if is_single:
            name = row['SOLVENT NAME']
            spange = descriptor_row(spange_df, SPANGE_COLS, name)
            drfp = descriptor_row(drfp_df, DRFP_COLS, name)
            rows.append(np.concatenate([kinetics, spange, drfp]))
            continue

        # Mixture row: fractions of solvent A and B sum to one
        name_a = row['SOLVENT A NAME']
        name_b = row['SOLVENT B NAME']
        frac_b = row['SolventB%'] / 100.0
        frac_a = 1 - frac_b

        sp_a = descriptor_row(spange_df, SPANGE_COLS, name_a)
        sp_b = descriptor_row(spange_df, SPANGE_COLS, name_b)

        # Baseline: fraction-weighted average of the two Spange vectors
        parts = [kinetics, frac_a * sp_a + frac_b * sp_b]

        if use_interaction:
            # 4 * frac_a * frac_b equals 1 at a 50/50 mix and 0 for a pure solvent
            parts.append(sp_a * sp_b * frac_a * frac_b * 4)

        if use_difference:
            parts.append(np.abs(sp_a - sp_b))

        if use_ratio:
            # Small epsilon keeps the denominator away from exactly zero
            eps = 1e-6
            parts.append(sp_a / (sp_a + sp_b + eps))

        dr_a = descriptor_row(drfp_df, DRFP_COLS, name_a)
        dr_b = descriptor_row(drfp_df, DRFP_COLS, name_b)
        parts.append(frac_a * dr_a + frac_b * dr_b)

        rows.append(np.concatenate(parts))

    return np.array(rows, dtype=np.float32)

print('Feature extraction with non-linear mixture features defined')

Feature extraction with non-linear mixture features defined


In [4]:
# MLP Model
class MLPModel(nn.Module):
    """Small fully connected network that maps a feature vector to 3 outputs.

    Each hidden layer is ``Linear -> BatchNorm1d -> ReLU -> Dropout(0.3)``;
    a final ``Linear`` layer produces the 3 output values.

    Args:
        input_dim: Number of input features.
        hidden_dims: Width of each hidden layer, in order. Any iterable of
            ints works; the default is an immutable tuple so it cannot be
            shared and mutated across instances.
    """

    def __init__(self, input_dim, hidden_dims=(32, 16)):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(0.3)
            ])
            prev_dim = h_dim
        layers.append(nn.Linear(prev_dim, 3))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the 3-column output."""
        return self.network(x)

print('MLPModel defined')

MLPModel defined


In [5]:
# Ensemble model with non-linear mixture features
class NonLinearMixtureModel:
    """Weighted ensemble of GP, MLP and LightGBM regressors on non-linear mixture features.

    Features come from ``get_features_nonlinear`` and are standardised with a
    ``StandardScaler`` fitted on the training rows. Three targets are
    predicted. The Gaussian processes only see the leading kinetics/Spange
    columns; the MLPs and the LightGBM models see every column.

    Args:
        data: ``'single'`` for single-solvent rows; any other value is handled
            as mixture data by the feature extractor.
        gp_weight, mlp_weight, lgbm_weight: Blend weights applied to the
            predictions of the three model families.
        use_interaction, use_difference, use_ratio: Forwarded to
            ``get_features_nonlinear`` to switch the extra mixture terms.
    """

    # Leading-column layout produced by get_features_nonlinear: 5 kinetics
    # features, then 13 Spange descriptors, then (mixture data only) one
    # 13-wide block per enabled non-linear term.
    _N_KINETICS = 5
    _N_SPANGE = 13

    def __init__(self, data='single', gp_weight=0.15, mlp_weight=0.55, lgbm_weight=0.3,
                 use_interaction=True, use_difference=True, use_ratio=False):
        self.data_type = data
        self.gp_weight = gp_weight
        self.mlp_weight = mlp_weight
        self.lgbm_weight = lgbm_weight
        self.use_interaction = use_interaction
        self.use_difference = use_difference
        self.use_ratio = use_ratio

        self.scaler = None
        # Number of leading columns the GPs were fitted on; set by train_model
        self.gp_feat_count = None
        self.gp_models = []
        self.mlp_models = []
        self.lgbm_models = []

    def _gp_feature_count(self, n_total):
        """Return how many leading feature columns the GPs use.

        Single-solvent features carry no extra Spange blocks, so only the
        kinetics + Spange columns count. The result is capped at ``n_total``,
        the number of columns actually available.
        """
        count = self._N_KINETICS + self._N_SPANGE
        if self.data_type != 'single':
            count += self._N_SPANGE * (self.use_interaction + self.use_difference + self.use_ratio)
        return min(count, n_total)

    def train_model(self, X_train, y_train, epochs=200):
        """Fit the scaler and all ensemble members.

        Args:
            X_train: DataFrame of raw inputs accepted by ``get_features_nonlinear``.
            y_train: DataFrame whose first three columns are the targets.
            epochs: Number of training epochs for each MLP.

        Returns:
            ``self``, so calls can be chained.
        """
        X_feat = get_features_nonlinear(X_train, self.data_type,
                                        self.use_interaction, self.use_difference, self.use_ratio)
        y_np = y_train.values.astype(np.float32)

        # Scale features
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X_feat)

        # Remember the GP column count so predict() slices exactly the same
        # columns the GPs were fitted on.
        self.gp_feat_count = self._gp_feature_count(X_scaled.shape[1])
        gp_inputs = X_scaled[:, :self.gp_feat_count]

        # Train GP models (one per target)
        self.gp_models = []
        for i in range(3):
            kernel = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
            gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=2, random_state=42)
            gp.fit(gp_inputs, y_np[:, i])
            self.gp_models.append(gp)

        # Tensors, dataset, loader and loss weights are identical for every
        # MLP, so build them once.
        X_tensor = torch.tensor(X_scaled).to(device)
        y_tensor = torch.tensor(y_np).to(device)
        dataset = TensorDataset(X_tensor, y_tensor)
        batch_size = 32
        # BatchNorm1d raises in training mode on a batch holding one sample,
        # so drop the trailing batch only when it would have size one.
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                            drop_last=(len(dataset) % batch_size == 1))
        # Squared error on the third target counts double
        loss_weights = torch.tensor([1.0, 1.0, 2.0]).to(device)

        # Train MLP models (ensemble of 3)
        self.mlp_models = []
        for _ in range(3):
            model = MLPModel(X_scaled.shape[1], hidden_dims=[32, 16]).to(device)
            optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

            model.train()
            for _epoch in range(epochs):
                for X_batch, y_batch in loader:
                    optimizer.zero_grad()
                    pred = model(X_batch)
                    loss = (loss_weights * (pred - y_batch)**2).mean()
                    loss.backward()
                    optimizer.step()
                scheduler.step()

            # Leave the model in eval mode so predict() uses running
            # BatchNorm statistics and disables dropout.
            model.eval()
            self.mlp_models.append(model)

        # Train LGBM models (one per target)
        self.lgbm_models = []
        for i in range(3):
            lgbm_model = lgb.LGBMRegressor(
                n_estimators=100, learning_rate=0.05, max_depth=5,
                num_leaves=31, random_state=42, verbose=-1
            )
            lgbm_model.fit(X_scaled, y_np[:, i])
            self.lgbm_models.append(lgbm_model)

        return self

    def predict(self, X_test):
        """Predict the three targets for ``X_test``.

        Args:
            X_test: DataFrame of raw inputs accepted by ``get_features_nonlinear``.

        Returns:
            A float32 ``torch.Tensor`` with one row per input row and three
            columns, clipped to the range [0, 1].

        Raises:
            RuntimeError: If ``train_model`` has not been called yet.
        """
        if self.scaler is None or self.gp_feat_count is None:
            raise RuntimeError('train_model must be called before predict')

        X_feat = get_features_nonlinear(X_test, self.data_type,
                                        self.use_interaction, self.use_difference, self.use_ratio)
        X_scaled = self.scaler.transform(X_feat)

        # GP predictions, on the same leading columns used during fitting
        gp_inputs = X_scaled[:, :self.gp_feat_count]
        gp_preds = np.zeros((len(X_test), 3))
        for i, gp in enumerate(self.gp_models):
            gp_preds[:, i] = gp.predict(gp_inputs)

        # MLP predictions, averaged over the MLP ensemble
        X_tensor = torch.tensor(X_scaled).to(device)
        with torch.no_grad():
            mlp_preds = np.mean(
                [model(X_tensor).cpu().numpy() for model in self.mlp_models],
                axis=0,
            )

        # LGBM predictions
        lgbm_preds = np.zeros((len(X_test), 3))
        for i, lgbm_model in enumerate(self.lgbm_models):
            lgbm_preds[:, i] = lgbm_model.predict(X_scaled)

        # Weighted blend, then clip to [0, 1]
        ensemble_preds = self.gp_weight * gp_preds + self.mlp_weight * mlp_preds + self.lgbm_weight * lgbm_preds
        ensemble_preds = np.clip(ensemble_preds, 0, 1)

        return torch.tensor(ensemble_preds, dtype=torch.float32)

print('NonLinearMixtureModel defined')

NonLinearMixtureModel defined


In [6]:
# Smoke test on the mixture data: hold out a single ramp before running full CV
print("Testing non-linear mixture features on full data (single fold)...")

# A "ramp" is one (solvent A, solvent B) pair; count the distinct pairs
ramps = X_full.groupby(['SOLVENT A NAME', 'SOLVENT B NAME']).ngroups
print(f"Number of unique ramps: {ramps}")

# Work on a copy that carries a string ramp label per row
X_full_copy = X_full.assign(ramp=X_full['SOLVENT A NAME'] + '_' + X_full['SOLVENT B NAME'])
unique_ramps = X_full_copy['ramp'].unique()
print(f"Unique ramps: {len(unique_ramps)}")

# Hold out the first ramp; `mask` is True for TRAINING rows
test_ramp = unique_ramps[0]
mask = X_full_copy['ramp'].ne(test_ramp)

print(f"\nTest ramp: {test_ramp}")
print(f"Training samples: {mask.sum()}, Test samples: {(~mask).sum()}")

Testing non-linear mixture features on full data (single fold)...
Number of unique ramps: 13
Unique ramps: 13

Test ramp: Methanol_Ethylene Glycol [1,2-Ethanediol]
Training samples: 1105, Test samples: 122


In [7]:
# Single-fold comparison of the linear baseline against each non-linear feature set
print("\nComparing baseline vs non-linear features...")

# Held-out targets are the same for every configuration
actuals = Y_full[~mask].values


def _fit_and_score(train_mask, targets, use_interaction, use_difference):
    """Train on rows where ``train_mask`` is True and return (model, preds, MSE) on the rest."""
    fitted = NonLinearMixtureModel(data='full', use_interaction=use_interaction,
                                   use_difference=use_difference, use_ratio=False)
    fitted.train_model(X_full[train_mask], Y_full[train_mask], epochs=150)
    held_out_preds = fitted.predict(X_full[~train_mask])
    score = np.mean((targets - held_out_preds.numpy())**2)
    return fitted, held_out_preds, score


# Baseline (no non-linear features)
model_baseline, preds_baseline, mse_baseline = _fit_and_score(mask, actuals, False, False)
print(f"Baseline (linear mixing): MSE = {mse_baseline:.6f}")

# Interaction term only
model_interaction, preds_interaction, mse_interaction = _fit_and_score(mask, actuals, True, False)
print(f"With interaction: MSE = {mse_interaction:.6f}")

# Absolute-difference term only
model_difference, preds_difference, mse_difference = _fit_and_score(mask, actuals, False, True)
print(f"With difference: MSE = {mse_difference:.6f}")

# Interaction and difference together
model_both, preds_both, mse_both = _fit_and_score(mask, actuals, True, True)
print(f"With interaction + difference: MSE = {mse_both:.6f}")


Comparing baseline vs non-linear features...


Baseline (linear mixing): MSE = 0.013364


With interaction: MSE = 0.008460


With difference: MSE = 0.016322


With interaction + difference: MSE = 0.012944


In [9]:
# Leave-one-ramp-out cross-validation over every mixture ramp
print("Running full leave-one-ramp-out CV on mixture data...")
print()

# (label, use_interaction, use_difference) for each feature set under test
configs = [
    ('Baseline (linear)', False, False),
    ('Interaction only', True, False),
    ('Difference only', False, True),
    ('Both', True, True)
]

for config_name, use_int, use_diff in configs:
    fold_mses = []

    # One fold per ramp: fit on all other ramps, score on the held-out ramp
    for test_ramp in unique_ramps:
        mask = X_full_copy['ramp'].ne(test_ramp)  # True for training rows

        model = NonLinearMixtureModel(data='full', use_interaction=use_int, use_difference=use_diff)
        model.train_model(X_full[mask], Y_full[mask], epochs=150)
        preds = model.predict(X_full[~mask])

        actuals = Y_full[~mask].values
        mse = np.square(actuals - preds.numpy()).mean()
        fold_mses.append(mse)

    # Unweighted mean and spread of the per-fold MSEs
    mean_mse, std_mse = np.mean(fold_mses), np.std(fold_mses)
    print(f"{config_name}: CV MSE = {mean_mse:.6f} +/- {std_mse:.6f}")

Running full leave-one-ramp-out CV on mixture data...



Baseline (linear): CV MSE = 0.078499 +/- 0.151869


Interaction only: CV MSE = 0.076290 +/- 0.128964


Difference only: CV MSE = 0.080358 +/- 0.127286


Both: CV MSE = 0.073776 +/- 0.128533
