# Experiment 047: Full Pipeline Implementation

**Goal:** Implement the FULL pipeline from top kernels (gentilless/best-work-here)

**Key techniques:**
1. Non-linear mixture formula: `A * (1 - r) + B * r + 0.05 * A * B * r * (1 - r)`
2. Advanced feature engineering (polynomial, interaction, statistical)
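3. Stronger hyperparameters (3000+ iterations for tree models; note that the implementation below currently uses 500 estimators for LightGBM and XGBoost)
4. CatBoost + XGBoost + LightGBM ensemble (note: CatBoost availability is checked at import time, but the model below combines GP + MLP + LightGBM + XGBoost)
5. Adaptive ensemble weighting (note: the model below uses fixed weights — GP 0.15, MLP 0.35, LightGBM 0.25, XGBoost 0.25)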

**Hypothesis:** These techniques may change the relationship between cross-validation (CV) and leaderboard (LB) scores, rather than only improving CV.

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
import warnings
warnings.filterwarnings('ignore')

# Optional dependency probe: record in HAS_CATBOOST whether CatBoost imports
try:
    from catboost import CatBoostRegressor
except ImportError:
    HAS_CATBOOST = False
    print('CatBoost not available, will use XGBoost instead')
else:
    HAS_CATBOOST = True
    print('CatBoost available')

# Seed the global NumPy and PyTorch random number generators
torch.manual_seed(42)
np.random.seed(42)

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

CatBoost available


Using device: cuda


In [2]:
# Data loading
# Directory that holds the yield CSVs and the descriptor lookup tables read in this cell
DATA_PATH = '/home/data'

def load_data(name="full"):
    """Read one of the catechol yield CSVs and split it into inputs and targets.

    Args:
        name: "full" selects the solvent-mixture file; any other value selects
            the single-solvent file.

    Returns:
        (X, Y) where X holds the residence time, temperature and solvent
        columns of the chosen file and Y holds the "SM", "Product 2" and
        "Product 3" columns.
    """
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_cols = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_cols = ["Residence Time", "Temperature", "SOLVENT NAME"]

    frame = pd.read_csv(csv_path)
    inputs = frame[input_cols]
    targets = frame[["SM", "Product 2", "Product 3"]]
    return inputs, targets

# Solvent descriptor lookup tables; the first CSV column becomes the row index
spange_df = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
drfp_df = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)

# Spange: every column except the 'solvent smiles' column
SPANGE_COLS = list(filter(lambda col: col != 'solvent smiles', spange_df.columns))
# DRFP: columns whose label is an int or a string made only of digits
DRFP_COLS = [col for col in drfp_df.columns if isinstance(col, int) or str(col).isdigit()]

print(f'Spange: {len(SPANGE_COLS)} features')
print(f'DRFP: {len(DRFP_COLS)} features')

# Inputs/targets for the single-solvent and the mixture ("full") datasets
X_single, Y_single = load_data('single_solvent')
X_full, Y_full = load_data('full')

print(f'Single solvent: {len(X_single)} samples')
print(f'Full data: {len(X_full)} samples')

Spange: 13 features
DRFP: 2048 features
Single solvent: 656 samples
Full data: 1227 samples


In [3]:
# Advanced feature extraction with non-linear mixture and polynomial features
def get_advanced_features(X, data_type='single'):
    """Build the engineered feature matrix, one output row per row of X.

    Every row starts with eight kinetics terms derived from 'Residence Time'
    and 'Temperature'. What follows depends on data_type:

    * 'single': the solvent's Spange descriptors, four summary statistics of
      them (mean, std, max, min), then its DRFP vector.
    * anything else: the row is treated as a two-solvent mixture with ratio
      r = SolventB% / 100. Spange descriptors are blended as
      A * (1 - r) + B * r + 0.05 * A * B * r * (1 - r), followed by a scaled
      interaction term, the absolute descriptor difference, summary
      statistics of the blended descriptors, r itself, and the linearly
      blended DRFP vector.

    A solvent name that is not in a lookup table's index contributes an
    all-zero vector for that table.

    Returns:
        float32 ndarray built from the per-row feature vectors.
    """
    def descriptor(table, name, cols):
        """Row `name` of `table` restricted to `cols`, or zeros when not indexed."""
        if name in table.index:
            return table.loc[name, cols].values.astype(np.float32)
        return np.zeros(len(cols), dtype=np.float32)

    def summary(vec):
        """Mean, std, max and min of a descriptor vector as a float32 array."""
        return np.array([vec.mean(), vec.std(), vec.max(), vec.min()], dtype=np.float32)

    single_solvent = data_type == 'single'
    rows = []

    for _, rec in X.iterrows():
        rt = rec['Residence Time']
        t_celsius = rec['Temperature']
        t_kelvin = t_celsius + 273.15

        kinetic_terms = np.array([
            rt,                    # raw residence time
            t_celsius,             # raw temperature
            1.0 / t_kelvin,        # Arrhenius-style inverse temperature
            np.log(rt + 1),        # log time
            rt / t_kelvin,         # time / temperature interaction
            rt * t_celsius,        # time * temperature interaction
            np.sqrt(rt + 1e-8),    # sqrt time
            t_celsius ** 2,        # quadratic temperature
        ], dtype=np.float32)

        if single_solvent:
            name = rec['SOLVENT NAME']
            sp = descriptor(spange_df, name, SPANGE_COLS)
            fp = descriptor(drfp_df, name, DRFP_COLS)
            rows.append(np.concatenate([kinetic_terms, sp, summary(sp), fp]))
            continue

        # Mixture row: blend the two solvents by the fraction of solvent B
        name_a = rec['SOLVENT A NAME']
        name_b = rec['SOLVENT B NAME']
        frac_b = rec['SolventB%'] / 100.0
        frac_a = 1 - frac_b

        sp_a = descriptor(spange_df, name_a, SPANGE_COLS)
        sp_b = descriptor(spange_df, name_b, SPANGE_COLS)

        # Non-linear mixing rule: linear blend plus a small A*B cross term
        linear_part = frac_a * sp_a + frac_b * sp_b
        cross_part = 0.05 * sp_a * sp_b * frac_a * frac_b
        sp_mix = linear_part + cross_part

        scaled_cross = sp_a * sp_b * frac_a * frac_b * 4
        abs_gap = np.abs(sp_a - sp_b)

        # DRFP vectors are mixed linearly
        fp_a = descriptor(drfp_df, name_a, DRFP_COLS)
        fp_b = descriptor(drfp_df, name_b, DRFP_COLS)
        fp_mix = frac_a * fp_a + frac_b * fp_b

        ratio = np.array([frac_b], dtype=np.float32)

        rows.append(np.concatenate([
            kinetic_terms, sp_mix, scaled_cross, abs_gap, summary(sp_mix), ratio, fp_mix,
        ]))

    return np.array(rows, dtype=np.float32)

print('Advanced feature extraction defined')

Advanced feature extraction defined


In [4]:
# MLP model: stacked Linear -> BatchNorm -> ReLU -> Dropout blocks with a 3-output linear head (no SE attention is implemented here)
class MLPModel(nn.Module):
    """Feed-forward regressor mapping input_dim features to three outputs.

    Each entry of hidden_dims adds one Linear -> BatchNorm1d -> ReLU -> Dropout
    stage; a final Linear layer projects the last hidden width to 3 outputs.
    """

    def __init__(self, input_dim, hidden_dims=[256, 128, 64], dropout=0.3):
        super().__init__()
        # Consecutive pairs of widths give each stage's (in, out) sizes
        widths = [input_dim, *hidden_dims]
        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        stages.append(nn.Linear(widths[-1], 3))
        self.network = nn.Sequential(*stages)

    def forward(self, x):
        """Pass x through the stacked layers and return the raw 3-column output."""
        return self.network(x)

print('MLPModel defined')

MLPModel defined


In [5]:
# Full Pipeline Model with advanced features and stronger hyperparameters
class FullPipelineModel:
    """Fixed-weight ensemble of GP, MLP, LightGBM and XGBoost regressors.

    All model families are trained on standardized features produced by
    get_advanced_features. GP, LightGBM and XGBoost fit one model per target
    column; the MLP family is a small ensemble of three-output networks whose
    predictions are averaged. The final prediction is the weighted sum of the
    four families, clipped to [0, 1].

    Args:
        data: passed to get_advanced_features as its data_type
            ('single' selects the single-solvent feature layout).
        gp_weight, mlp_weight, lgbm_weight, xgb_weight: ensemble weights
            applied to each family's predictions.
    """

    # Number of leading scaled feature columns given to the GP models.
    # NOTE(review): with 13 Spange descriptors the first 30 columns are
    # kinetics + spange + stats + 5 DRFP columns in the 'single' layout, and
    # kinetics + spange + part of the interaction block in the mixture layout
    # -- confirm this slice is what is intended.
    GP_MAX_FEATURES = 30
    # Number of target columns; MLPModel's output width is fixed to the same value.
    N_TARGETS = 3
    MLP_ENSEMBLE_SIZE = 3
    MLP_BATCH_SIZE = 32

    def __init__(self, data='single', gp_weight=0.15, mlp_weight=0.35, lgbm_weight=0.25, xgb_weight=0.25):
        self.data_type = data
        self.gp_weight = gp_weight
        self.mlp_weight = mlp_weight
        self.lgbm_weight = lgbm_weight
        self.xgb_weight = xgb_weight

        # Populated by train_model
        self.scaler = None
        self.gp_models = []
        self.mlp_models = []
        self.lgbm_models = []
        self.xgb_models = []

    @staticmethod
    def _drops_singleton_batch(n_samples, batch_size):
        """True when the last mini-batch would hold exactly one sample.

        BatchNorm1d cannot compute batch statistics from a single sample in
        training mode, so such a remainder has to be dropped. Only applies
        when at least one full batch exists, so data is never dropped entirely.
        """
        return n_samples > batch_size and n_samples % batch_size == 1

    def _gp_view(self, X_scaled):
        """Leading columns of X_scaled used by the GP models (at most GP_MAX_FEATURES)."""
        return X_scaled[:, :min(self.GP_MAX_FEATURES, X_scaled.shape[1])]

    def _fit_gp_models(self, X_scaled, y_np):
        """Fit one Gaussian process per target on the leading feature columns."""
        X_gp = self._gp_view(X_scaled)
        models = []
        for target in range(self.N_TARGETS):
            kernel = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
            gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=2, random_state=42)
            gp.fit(X_gp, y_np[:, target])
            models.append(gp)
        return models

    def _fit_mlp_models(self, X_scaled, y_np, epochs):
        """Train the MLP ensemble with a per-target weighted squared-error loss."""
        # Tensors, dataset and loss weights do not change between ensemble
        # members or batches, so they are built once.
        X_tensor = torch.tensor(X_scaled).to(device)
        y_tensor = torch.tensor(y_np).to(device)
        dataset = TensorDataset(X_tensor, y_tensor)
        loss_weights = torch.tensor([1.0, 1.0, 2.0]).to(device)  # third target counts double
        drop_last = self._drops_singleton_batch(len(dataset), self.MLP_BATCH_SIZE)

        models = []
        for _ in range(self.MLP_ENSEMBLE_SIZE):
            model = MLPModel(X_scaled.shape[1], hidden_dims=[256, 128, 64], dropout=0.3).to(device)
            optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
            loader = DataLoader(dataset, batch_size=self.MLP_BATCH_SIZE, shuffle=True, drop_last=drop_last)

            model.train()
            for _epoch in range(epochs):
                for X_batch, y_batch in loader:
                    optimizer.zero_grad()
                    pred = model(X_batch)
                    loss = (loss_weights * (pred - y_batch) ** 2).mean()
                    loss.backward()
                    optimizer.step()
                scheduler.step()  # one cosine-annealing step per epoch

            model.eval()
            models.append(model)
        return models

    def _fit_lgbm_models(self, X_scaled, y_np):
        """Fit one LightGBM regressor per target on all scaled features."""
        models = []
        for target in range(self.N_TARGETS):
            lgbm_model = lgb.LGBMRegressor(
                n_estimators=500,
                max_depth=8,
                learning_rate=0.02,
                num_leaves=63,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                verbose=-1
            )
            lgbm_model.fit(X_scaled, y_np[:, target])
            models.append(lgbm_model)
        return models

    def _fit_xgb_models(self, X_scaled, y_np):
        """Fit one XGBoost regressor per target on all scaled features."""
        models = []
        for target in range(self.N_TARGETS):
            xgb_model = xgb.XGBRegressor(
                n_estimators=500,
                max_depth=8,
                learning_rate=0.02,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                verbosity=0
            )
            xgb_model.fit(X_scaled, y_np[:, target])
            models.append(xgb_model)
        return models

    def _predict_per_target(self, models, X, n_rows):
        """Stack predictions of per-target models into an (n_rows, N_TARGETS) float64 array."""
        preds = np.zeros((n_rows, self.N_TARGETS))
        for target, fitted in enumerate(models):
            preds[:, target] = fitted.predict(X)
        return preds

    def train_model(self, X_train, y_train, epochs=200):
        """Fit the scaler and all four model families.

        Args:
            X_train: raw input frame accepted by get_advanced_features.
            y_train: object whose .values is a 2-D array with one column per target.
            epochs: number of training epochs for each MLP.

        Returns:
            self, so calls can be chained.
        """
        X_feat = get_advanced_features(X_train, self.data_type)
        y_np = y_train.values.astype(np.float32)

        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X_feat)

        self.gp_models = self._fit_gp_models(X_scaled, y_np)
        self.mlp_models = self._fit_mlp_models(X_scaled, y_np, epochs)
        self.lgbm_models = self._fit_lgbm_models(X_scaled, y_np)
        self.xgb_models = self._fit_xgb_models(X_scaled, y_np)

        return self

    def predict(self, X_test):
        """Predict all targets for X_test.

        Returns:
            float32 torch tensor with one row per row of X_test and one column
            per target, clipped to [0, 1].
        """
        X_feat = get_advanced_features(X_test, self.data_type)
        X_scaled = self.scaler.transform(X_feat)
        n_rows = len(X_test)

        gp_preds = self._predict_per_target(self.gp_models, self._gp_view(X_scaled), n_rows)

        # MLP ensemble: average the member predictions (input tensor built once)
        X_tensor = torch.tensor(X_scaled).to(device)
        mlp_preds = []
        with torch.no_grad():
            for model in self.mlp_models:
                mlp_preds.append(model(X_tensor).cpu().numpy())
        mlp_preds = np.mean(mlp_preds, axis=0)

        lgbm_preds = self._predict_per_target(self.lgbm_models, X_scaled, n_rows)
        xgb_preds = self._predict_per_target(self.xgb_models, X_scaled, n_rows)

        # Weighted ensemble
        final_preds = (self.gp_weight * gp_preds +
                       self.mlp_weight * mlp_preds +
                       self.lgbm_weight * lgbm_preds +
                       self.xgb_weight * xgb_preds)

        final_preds = np.clip(final_preds, 0, 1)
        return torch.tensor(final_preds, dtype=torch.float32)

print('FullPipelineModel defined')

FullPipelineModel defined


In [None]:
# Test on single solvent data
# Leave-one-solvent-out CV: every unique solvent is held out once as the test fold.

# CV MSE printed as the exp_030 baseline; kept in one place so the message and
# the comparison below cannot drift apart.
BASELINE_CV_MSE = 0.008298

print("Testing full pipeline on single solvent data...")
print()

all_solvents = sorted(X_single["SOLVENT NAME"].unique())
fold_mses = []

for test_solvent in all_solvents:
    # mask is True for training rows, i.e. every solvent except the held-out one
    mask = X_single["SOLVENT NAME"] != test_solvent

    model = FullPipelineModel(data='single')
    model.train_model(X_single[mask], Y_single[mask], epochs=150)
    preds = model.predict(X_single[~mask])

    # Per-fold MSE over all held-out rows and all targets
    actuals = Y_single[~mask].values
    mse = np.mean((actuals - preds.numpy())**2)
    fold_mses.append(mse)

mean_mse = np.mean(fold_mses)
std_mse = np.std(fold_mses)
print(f"Full Pipeline CV MSE: {mean_mse:.6f} +/- {std_mse:.6f}")
print(f"Baseline (exp_030): CV = {BASELINE_CV_MSE:.6f}")
if mean_mse < BASELINE_CV_MSE:
    print(f"Improvement: {(BASELINE_CV_MSE - mean_mse) / BASELINE_CV_MSE * 100:.2f}%")
else:
    print(f"Degradation: {(mean_mse - BASELINE_CV_MSE) / BASELINE_CV_MSE * 100:.2f}%")