# Experiment 044: Hybrid Model

**Strategy:** Use baseline features for single solvents (CV 0.008194) and non-linear features for mixtures (CV 0.073776).

**Why:** 
- Non-linear features improve mixture CV MSE by 12.5% (0.084 → 0.074; lower is better)
- Non-linear features hurt single solvent CV MSE by 9.8% (0.008194 → 0.008994)
- Hybrid approach captures the best of both worlds

**Goal:** Submit to verify whether the mixture CV improvement translates into a leaderboard (LB) improvement.

In [None]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy and PyTorch with the same value so runs are repeatable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

In [None]:
# Data loading
DATA_PATH = '/home/data'

def load_data(name="full", data_path=None):
    """Load one of the catechol yield datasets and split it into inputs/targets.

    Args:
        name: ``"full"`` loads ``catechol_full_data_yields.csv`` (solvent
            mixtures). Any other value loads
            ``catechol_single_solvent_yields.csv``.
        data_path: Directory containing the CSV files. Defaults to the
            module-level ``DATA_PATH`` when omitted, so existing callers are
            unaffected.

    Returns:
        Tuple ``(X, Y)`` of DataFrames. ``X`` holds residence time,
        temperature and the solvent column(s); ``Y`` holds the
        ``SM``, ``Product 2`` and ``Product 3`` columns.
    """
    if data_path is None:
        # Resolved at call time so later changes to DATA_PATH are honoured.
        data_path = DATA_PATH

    if name == "full":
        df = pd.read_csv(f'{data_path}/catechol_full_data_yields.csv')
        X = df[["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]]
    else:
        df = pd.read_csv(f'{data_path}/catechol_single_solvent_yields.csv')
        X = df[["Residence Time", "Temperature", "SOLVENT NAME"]]
    Y = df[["SM", "Product 2", "Product 3"]]
    return X, Y

# Descriptor lookup tables; the first CSV column becomes the index that
# feature extraction later queries by solvent name.
spange_df = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
drfp_df = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)

# Every Spange column except the SMILES string is used as a feature.
SPANGE_COLS = [col for col in spange_df.columns if col != 'solvent smiles']
# DRFP feature columns are the ones labelled with an integer (or a digit string).
DRFP_COLS = [col for col in drfp_df.columns if isinstance(col, int) or str(col).isdigit()]

print(f'Spange: {len(SPANGE_COLS)} features')
print(f'DRFP: {len(DRFP_COLS)} features')

# Both datasets: single-solvent runs and the full (mixture) runs.
X_single, Y_single = load_data('single_solvent')
X_full, Y_full = load_data('full')

print(f'Single solvent: {len(X_single)} samples')
print(f'Full data: {len(X_full)} samples')

In [None]:
# MLP Model
class MLPModel(nn.Module):
    """Feed-forward regressor with three outputs.

    Each hidden layer is ``Linear -> BatchNorm1d -> ReLU -> Dropout(0.3)``,
    followed by a final ``Linear`` layer that maps to 3 outputs.

    Args:
        input_dim: Number of input features.
        hidden_dims: Sizes of the hidden layers, in order. Any iterable of
            ints is accepted; the default is an immutable tuple so it cannot
            be shared and mutated across instances.
    """

    def __init__(self, input_dim, hidden_dims=(32, 16)):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(0.3)
            ])
            prev_dim = h_dim
        # Output head: one value per target.
        layers.append(nn.Linear(prev_dim, 3))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for a batch ``x`` of shape (batch, input_dim)."""
        return self.network(x)

print('MLPModel defined')

In [None]:
# Hybrid Model: baseline for single solvents, non-linear for mixtures
class HybridModel:
    """GP + MLP + LightGBM ensemble with data-type-specific feature extraction.

    For single solvents: kinetics + Spange + DRFP features (no interaction terms).
    For mixtures: kinetics + linearly mixed Spange + Spange interaction and
    absolute-difference terms + linearly mixed DRFP.

    Relies on the module-level ``spange_df``, ``drfp_df``, ``SPANGE_COLS``,
    ``DRFP_COLS``, ``MLPModel`` and ``device``.

    Args:
        data: ``'single'`` selects the single-solvent feature set; any other
            value selects the mixture feature set.
        gp_weight: Ensemble weight of the Gaussian-process predictions.
        mlp_weight: Ensemble weight of the averaged MLP predictions.
        lgbm_weight: Ensemble weight of the LightGBM predictions.
    """

    N_KINETICS = 5          # length of the kinetics vector built in _get_features
    N_TARGETS = 3           # one GP / LightGBM model is fitted per target column
    MLP_ENSEMBLE_SIZE = 3   # number of independently initialised MLPs
    MLP_BATCH_SIZE = 32

    def __init__(self, data='single', gp_weight=0.15, mlp_weight=0.55, lgbm_weight=0.3):
        self.data_type = data
        self.gp_weight = gp_weight
        self.mlp_weight = mlp_weight
        self.lgbm_weight = lgbm_weight

        self.scaler = None
        self.gp_models = []
        self.mlp_models = []
        self.lgbm_models = []

    def _get_features(self, X):
        """Build the float32 feature matrix for the rows of ``X``.

        Solvents missing from a lookup table contribute an all-zero descriptor
        vector (best-effort behaviour kept from the original implementation).
        """
        # Each distinct solvent is looked up once per call instead of once per
        # row; the cached vectors are only read, never modified in place.
        lookup_cache = {}

        def descriptors(tag, table, columns, solvent):
            key = (tag, solvent)
            if key not in lookup_cache:
                if solvent in table.index:
                    vec = table.loc[solvent, columns].values.astype(np.float32)
                else:
                    vec = np.zeros(len(columns), dtype=np.float32)
                lookup_cache[key] = vec
            return lookup_cache[key]

        is_single = self.data_type == 'single'
        features_list = []

        for _, row in X.iterrows():
            # Kinetics features (5 features)
            time_m = row['Residence Time']
            temp_c = row['Temperature']
            temp_k = temp_c + 273.15  # presumably Celsius -> Kelvin; confirm units

            kinetics = np.array([
                time_m,
                temp_c,
                1.0 / temp_k,
                np.log(time_m + 1),
                time_m / temp_k
            ], dtype=np.float32)

            if is_single:
                # Single solvent: baseline features (no interaction terms)
                solvent = row['SOLVENT NAME']
                spange = descriptors('spange', spange_df, SPANGE_COLS, solvent)
                drfp = descriptors('drfp', drfp_df, DRFP_COLS, solvent)

                features = np.concatenate([kinetics, spange, drfp])
            else:
                # Mixture: add non-linear features
                solvent_a = row['SOLVENT A NAME']
                solvent_b = row['SOLVENT B NAME']
                pct_b = row['SolventB%'] / 100.0
                pct_a = 1 - pct_b

                sp_a = descriptors('spange', spange_df, SPANGE_COLS, solvent_a)
                sp_b = descriptors('spange', spange_df, SPANGE_COLS, solvent_b)

                # Linear mixing (baseline)
                spange_linear = pct_a * sp_a + pct_b * sp_b

                # Non-linear features. The factor 4 scales pct_a * pct_b so
                # that it equals 1 at a 50/50 mixture.
                interaction = sp_a * sp_b * pct_a * pct_b * 4
                difference = np.abs(sp_a - sp_b)

                # DRFP features (linear mixing)
                dr_a = descriptors('drfp', drfp_df, DRFP_COLS, solvent_a)
                dr_b = descriptors('drfp', drfp_df, DRFP_COLS, solvent_b)
                drfp = pct_a * dr_a + pct_b * dr_b

                features = np.concatenate([kinetics, spange_linear, interaction, difference, drfp])

            features_list.append(features)

        return np.array(features_list, dtype=np.float32)

    def _gp_feature_count(self, n_features):
        """Return how many leading feature columns are fed to the GPs.

        The GPs see kinetics + Spange columns (plus the interaction and
        difference columns for mixtures) but not the DRFP columns. The count
        is derived from ``SPANGE_COLS`` instead of a hard-coded 13 and capped
        at the number of available columns.
        """
        n_spange = len(SPANGE_COLS)
        count = self.N_KINETICS + n_spange
        if self.data_type != 'single':
            count += 2 * n_spange  # interaction + difference blocks
        return min(count, n_features)

    def _fit_gp_models(self, X_gp, y_np):
        """Fit one Gaussian-process regressor per target on the GP feature subset."""
        models = []
        for i in range(self.N_TARGETS):
            kernel = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
            gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=2, random_state=42)
            gp.fit(X_gp, y_np[:, i])
            models.append(gp)
        return models

    def _fit_mlp_models(self, X_scaled, y_np, epochs):
        """Train the MLP ensemble with a per-target weighted MSE loss."""
        # Explicit float32 so the inputs always match the model parameters.
        X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
        y_tensor = torch.tensor(y_np, dtype=torch.float32).to(device)
        dataset = TensorDataset(X_tensor, y_tensor)

        # BatchNorm1d raises in training mode on a batch containing a single
        # sample. Drop the trailing batch only when it would be such a batch.
        n_samples = len(dataset)
        batch_size = self.MLP_BATCH_SIZE
        drop_last = n_samples > batch_size and n_samples % batch_size == 1

        # The third target is weighted twice as heavily as the other two.
        weights = torch.tensor([1.0, 1.0, 2.0]).to(device)

        models = []
        for _ in range(self.MLP_ENSEMBLE_SIZE):
            model = MLPModel(X_scaled.shape[1], hidden_dims=[32, 16]).to(device)
            optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
            loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last)

            model.train()
            for _epoch in range(epochs):
                for X_batch, y_batch in loader:
                    optimizer.zero_grad()
                    pred = model(X_batch)
                    loss = (weights * (pred - y_batch)**2).mean()
                    loss.backward()
                    optimizer.step()
                scheduler.step()

            # Leave the model in eval mode so predict() uses the BatchNorm
            # running statistics and disables dropout.
            model.eval()
            models.append(model)
        return models

    def _fit_lgbm_models(self, X_scaled, y_np):
        """Fit one LightGBM regressor per target on all scaled features."""
        models = []
        for i in range(self.N_TARGETS):
            lgbm_model = lgb.LGBMRegressor(
                n_estimators=100, learning_rate=0.05, max_depth=5,
                num_leaves=31, random_state=42, verbose=-1
            )
            lgbm_model.fit(X_scaled, y_np[:, i])
            models.append(lgbm_model)
        return models

    def train_model(self, X_train, y_train, epochs=200):
        """Fit the scaler and all three model families.

        Args:
            X_train: DataFrame with the columns required by the configured
                data type.
            y_train: DataFrame with three target columns.
            epochs: Number of training epochs for each MLP.

        Returns:
            ``self``, to allow chaining.
        """
        X_feat = self._get_features(X_train)
        y_np = y_train.values.astype(np.float32)

        # Scale features
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X_feat)

        gp_feat_count = self._gp_feature_count(X_scaled.shape[1])

        self.gp_models = self._fit_gp_models(X_scaled[:, :gp_feat_count], y_np)
        self.mlp_models = self._fit_mlp_models(X_scaled, y_np, epochs)
        self.lgbm_models = self._fit_lgbm_models(X_scaled, y_np)

        return self

    def predict(self, X_test):
        """Return the weighted ensemble prediction, clipped to [0, 1].

        Args:
            X_test: DataFrame with the same columns used for training.

        Returns:
            ``torch.float32`` tensor of shape (len(X_test), 3).

        Raises:
            RuntimeError: If called before ``train_model``.
        """
        if self.scaler is None:
            raise RuntimeError("HybridModel.predict() called before train_model()")

        X_feat = self._get_features(X_test)
        X_scaled = self.scaler.transform(X_feat)

        gp_feat_count = self._gp_feature_count(X_scaled.shape[1])

        # GP predictions
        gp_preds = np.zeros((len(X_test), self.N_TARGETS))
        for i, gp in enumerate(self.gp_models):
            gp_preds[:, i] = gp.predict(X_scaled[:, :gp_feat_count])

        # MLP predictions, averaged over the ensemble members
        X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
        with torch.no_grad():
            mlp_preds = np.mean(
                [model(X_tensor).cpu().numpy() for model in self.mlp_models],
                axis=0,
            )

        # LGBM predictions
        lgbm_preds = np.zeros((len(X_test), self.N_TARGETS))
        for i, lgbm_model in enumerate(self.lgbm_models):
            lgbm_preds[:, i] = lgbm_model.predict(X_scaled)

        # Ensemble
        ensemble_preds = self.gp_weight * gp_preds + self.mlp_weight * mlp_preds + self.lgbm_weight * lgbm_preds
        ensemble_preds = np.clip(ensemble_preds, 0, 1)

        return torch.tensor(ensemble_preds, dtype=torch.float32)

print('HybridModel defined')

In [None]:
# Leave-one-solvent-out cross-validation of the hybrid model on the
# single-solvent data (should match baseline).
print("Testing hybrid model on single solvent data...")
print()

all_solvents = sorted(X_single["SOLVENT NAME"].unique())
fold_mses_single = []

for test_solvent in all_solvents:
    # True for training rows, False for the held-out solvent.
    mask = X_single["SOLVENT NAME"] != test_solvent
    X_fit, Y_fit = X_single[mask], Y_single[mask]
    X_held, Y_held = X_single[~mask], Y_single[~mask]

    model = HybridModel(data='single')
    model.train_model(X_fit, Y_fit, epochs=150)
    preds = model.predict(X_held)

    # MSE over every target of every held-out row.
    actuals = Y_held.values
    mse = np.mean(np.square(actuals - preds.numpy()))
    fold_mses_single.append(mse)

# Unweighted mean / spread across folds.
mean_mse_single, std_mse_single = np.mean(fold_mses_single), np.std(fold_mses_single)
print(f"Single solvent CV MSE: {mean_mse_single:.6f} +/- {std_mse_single:.6f}")
print(f"Baseline (exp_035): CV = 0.008194")

In [None]:
# Leave-one-ramp-out cross-validation of the hybrid model on the mixture
# data (should be better than baseline).
print("Testing hybrid model on mixture data...")
print()

# A "ramp" is identified by its (solvent A, solvent B) pair.
X_full_copy = X_full.assign(
    ramp=X_full['SOLVENT A NAME'] + '_' + X_full['SOLVENT B NAME']
)
unique_ramps = X_full_copy['ramp'].unique()

fold_mses_mixture = []

for test_ramp in unique_ramps:
    # True for training rows, False for the held-out ramp.
    mask = X_full_copy['ramp'] != test_ramp
    X_fit, Y_fit = X_full[mask], Y_full[mask]
    X_held, Y_held = X_full[~mask], Y_full[~mask]

    model = HybridModel(data='full')
    model.train_model(X_fit, Y_fit, epochs=150)
    preds = model.predict(X_held)

    # MSE over every target of every held-out row.
    actuals = Y_held.values
    mse = np.mean(np.square(actuals - preds.numpy()))
    fold_mses_mixture.append(mse)
    print(f"{test_ramp}: MSE = {mse:.6f}")

# Unweighted mean / spread across folds.
mean_mse_mixture, std_mse_mixture = np.mean(fold_mses_mixture), np.std(fold_mses_mixture)
print(f"\nMixture CV MSE: {mean_mse_mixture:.6f} +/- {std_mse_mixture:.6f}")
print(f"Baseline mixture (exp_043): CV = 0.084319")
print(f"Non-linear mixture (exp_043): CV = 0.073776")