# Experiment 045: Mean Reversion

**Problem:** The linear fit of LB score against CV score has an intercept of 0.0528, which is 72% of the target (0.073). Even a perfect CV score (CV = 0) would therefore give LB = 0.0528.

**Hypothesis:** The large intercept suggests predictions are systematically biased away from the mean. Blending predictions toward the training mean could reduce this bias.

**Implementation:**
1. Start with exp_030's GP+MLP+LGBM ensemble (best LB model)
2. Compute training mean for each target
3. Blend predictions: `final_pred = alpha * model_pred + (1-alpha) * train_mean`
4. Test alpha values: 1.0 (no mean reversion, baseline), 0.95, 0.9, 0.85, 0.8, 0.7
5. Select best alpha based on CV

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's and PyTorch's global generators so reruns draw the same numbers.
torch.manual_seed(42)
np.random.seed(42)

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading
DATA_PATH = '/home/data'

def load_data(name="full"):
    """Read one of the catechol yield CSVs and split it into inputs and targets.

    Parameters
    ----------
    name : str
        "full" reads the solvent-mixture file; any other value reads the
        single-solvent file.

    Returns
    -------
    (X, Y) : tuple of pandas.DataFrame
        X holds the process conditions and solvent columns for the chosen
        file; Y holds the "SM", "Product 2" and "Product 3" columns.
    """
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_cols = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_cols = ["Residence Time", "Temperature", "SOLVENT NAME"]

    frame = pd.read_csv(csv_path)
    inputs = frame[input_cols]
    targets = frame[["SM", "Product 2", "Product 3"]]
    return inputs, targets

# Solvent descriptor lookup tables; the first CSV column becomes the index
# that get_features() uses to look solvents up by name.
spange_df = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
drfp_df = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)

# Keep every Spange column except the SMILES string.
SPANGE_COLS = [col for col in spange_df.columns if col != 'solvent smiles']
# DRFP bit columns are the ones whose label is an integer (or a string of digits).
DRFP_COLS = [col for col in drfp_df.columns if isinstance(col, int) or str(col).isdigit()]

# Both datasets: single-solvent runs and solvent-mixture ("full") runs.
X_single, Y_single = load_data('single_solvent')
X_full, Y_full = load_data('full')

# Report feature and sample counts as a quick sanity check.
print(f'Spange: {len(SPANGE_COLS)} features')
print(f'DRFP: {len(DRFP_COLS)} features')
print(f'Single solvent: {len(X_single)} samples')
print(f'Full data: {len(X_full)} samples')

Spange: 13 features
DRFP: 2048 features
Single solvent: 656 samples
Full data: 1227 samples


In [3]:
# Feature extraction (baseline from exp_030)
def get_features(X, data_type='single'):
    """Build the per-row feature matrix: kinetic terms + Spange + DRFP descriptors.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain 'Residence Time' and 'Temperature'. For
        ``data_type='single'`` it must also contain 'SOLVENT NAME'; otherwise
        'SOLVENT A NAME', 'SOLVENT B NAME' and 'SolventB%' (a percentage).
    data_type : str
        'single' uses one solvent per row; any other value treats the row as
        a two-solvent mixture and linearly blends the descriptors of A and B
        by the SolventB% fraction.

    Returns
    -------
    numpy.ndarray of float32, shape (len(X), 5 + len(SPANGE_COLS) + len(DRFP_COLS))
        Columns are ordered [kinetics(5), spange, drfp]. Solvents missing
        from a lookup table contribute an all-zero descriptor block.
    """
    n_spange = len(SPANGE_COLS)
    n_drfp = len(DRFP_COLS)

    # Descriptors depend only on the solvent name, so each solvent is looked
    # up once per call instead of once per row (the DRFP row is wide, and the
    # per-row .loc lookups dominated the runtime).
    descriptor_cache = {}

    def solvent_descriptors(solvent):
        """Return (spange, drfp) float32 vectors for one solvent, zero-filled if unknown."""
        if solvent not in descriptor_cache:
            if solvent in spange_df.index:
                spange_vec = spange_df.loc[solvent, SPANGE_COLS].values.astype(np.float32)
            else:
                spange_vec = np.zeros(n_spange, dtype=np.float32)
            if solvent in drfp_df.index:
                drfp_vec = drfp_df.loc[solvent, DRFP_COLS].values.astype(np.float32)
            else:
                drfp_vec = np.zeros(n_drfp, dtype=np.float32)
            descriptor_cache[solvent] = (spange_vec, drfp_vec)
        return descriptor_cache[solvent]

    features_list = []

    for idx, row in X.iterrows():
        time_m = row['Residence Time']
        temp_c = row['Temperature']
        temp_k = temp_c + 273.15

        # Raw conditions plus simple transforms of them: inverse absolute
        # temperature, log(1 + time) and time / absolute temperature.
        kinetics = np.array([
            time_m, temp_c, 1.0 / temp_k,
            np.log(time_m + 1), time_m / temp_k
        ], dtype=np.float32)

        if data_type == 'single':
            spange, drfp = solvent_descriptors(row['SOLVENT NAME'])
        else:
            pct_b = row['SolventB%'] / 100.0
            pct_a = 1 - pct_b

            sp_a, dr_a = solvent_descriptors(row['SOLVENT A NAME'])
            sp_b, dr_b = solvent_descriptors(row['SOLVENT B NAME'])

            # Composition-weighted average of the two solvents' descriptors.
            spange = pct_a * sp_a + pct_b * sp_b
            drfp = pct_a * dr_a + pct_b * dr_b

        features_list.append(np.concatenate([kinetics, spange, drfp]))

    if not features_list:
        # np.array([]) would be 1-D; keep the 2-D (rows, features) contract.
        return np.zeros((0, 5 + n_spange + n_drfp), dtype=np.float32)

    return np.array(features_list, dtype=np.float32)

print('Feature extraction defined')

Feature extraction defined


In [4]:
# MLP Model
class MLPModel(nn.Module):
    """Fully connected regressor: [Linear -> BatchNorm1d -> ReLU -> Dropout] per hidden layer, then a Linear head.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_dims : sequence of int
        Width of each hidden layer, in order. Defaults to (32, 16).
    output_dim : int
        Number of regression outputs. Defaults to 3.
    dropout : float
        Dropout probability applied after every hidden activation.
        Defaults to 0.3.
    """

    def __init__(self, input_dim, hidden_dims=(32, 16), output_dim=3, dropout=0.3):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev_dim = h_dim
        # Plain linear head: no activation, so outputs are unbounded.
        layers.append(nn.Linear(prev_dim, output_dim))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to (batch, output_dim) predictions."""
        return self.network(x)

print('MLPModel defined')

MLPModel defined


In [5]:
# Mean Reversion Model
class MeanReversionModel:
    """GP + MLP + LightGBM ensemble whose output is shrunk toward the training-target mean.

    The final prediction is ``alpha * ensemble + (1 - alpha) * train_mean``,
    clipped to [0, 1]. ``alpha=1.0`` disables the mean reversion. ``alpha`` is
    read at predict time, so it can be changed on a trained model.

    Parameters
    ----------
    data : str
        Passed to ``get_features`` as ``data_type``.
    alpha : float
        Weight of the ensemble prediction in the final blend.
    gp_weight, mlp_weight, lgbm_weight : float
        Ensemble weights of the three model families. They are used as given
        (not re-normalised); the defaults sum to 1.
    gp_feature_dim : int
        Number of leading scaled feature columns the GPs are fitted on. With
        the column order produced by ``get_features`` the default 18 covers
        the 5 kinetic terms plus 13 Spange descriptors and excludes the DRFP
        bits.
    """

    N_TARGETS = 3      # one GP / LightGBM model per target column
    N_MLP_MODELS = 3   # size of the MLP bag whose predictions are averaged
    BATCH_SIZE = 32    # MLP mini-batch size

    def __init__(self, data='single', alpha=0.85, gp_weight=0.15, mlp_weight=0.55, lgbm_weight=0.3,
                 gp_feature_dim=18):
        self.data_type = data
        self.alpha = alpha
        self.gp_weight = gp_weight
        self.mlp_weight = mlp_weight
        self.lgbm_weight = lgbm_weight
        self.gp_feature_dim = gp_feature_dim

        # Fitted state, populated by train_model().
        self.scaler = None
        self.gp_models = []
        self.mlp_models = []
        self.lgbm_models = []
        self.train_mean = None

    def train_model(self, X_train, y_train, epochs=200):
        """Fit the scaler and all three model families on the training data.

        Parameters
        ----------
        X_train : pandas.DataFrame
            Raw inputs accepted by ``get_features`` for ``self.data_type``.
        y_train : pandas.DataFrame
            Target columns, one per output (three are expected).
        epochs : int
            Number of passes over the data for each MLP.

        Returns
        -------
        self
        """
        # Per-target mean of the training labels: the mean-reversion anchor.
        self.train_mean = y_train.mean().values.astype(np.float32)

        X_feat = get_features(X_train, self.data_type)
        y_np = y_train.values.astype(np.float32)

        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X_feat)

        self.gp_models = self._fit_gp_models(X_scaled, y_np)
        self.mlp_models = self._fit_mlp_models(X_scaled, y_np, epochs)
        self.lgbm_models = self._fit_lgbm_models(X_scaled, y_np)

        return self

    def _fit_gp_models(self, X_scaled, y_np):
        """Fit one Gaussian process per target on the leading ``gp_feature_dim`` columns."""
        X_gp = X_scaled[:, :self.gp_feature_dim]
        models = []
        for i in range(self.N_TARGETS):
            kernel = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
            gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=2, random_state=42)
            gp.fit(X_gp, y_np[:, i])
            models.append(gp)
        return models

    def _fit_mlp_models(self, X_scaled, y_np, epochs):
        """Train ``N_MLP_MODELS`` independently initialised MLPs with a weighted MSE loss."""
        # Built once: identical for every model in the bag.
        X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
        y_tensor = torch.tensor(y_np, dtype=torch.float32).to(device)
        dataset = TensorDataset(X_tensor, y_tensor)
        # The third target counts double in the loss.
        loss_weights = torch.tensor([1.0, 1.0, 2.0]).to(device)

        # BatchNorm1d cannot normalise a 1-sample batch in training mode and
        # raises. When the data size leaves exactly one sample over, drop that
        # final batch (shuffling makes it a different sample each epoch).
        n_samples = len(dataset)
        drop_last = n_samples > self.BATCH_SIZE and n_samples % self.BATCH_SIZE == 1

        models = []
        for _ in range(self.N_MLP_MODELS):
            model = MLPModel(X_scaled.shape[1], hidden_dims=[32, 16]).to(device)
            optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
            loader = DataLoader(dataset, batch_size=self.BATCH_SIZE, shuffle=True, drop_last=drop_last)

            model.train()
            for _epoch in range(epochs):
                for X_batch, y_batch in loader:
                    optimizer.zero_grad()
                    pred = model(X_batch)
                    loss = (loss_weights * (pred - y_batch) ** 2).mean()
                    loss.backward()
                    optimizer.step()
                scheduler.step()

            # Leave the model in eval mode so predict() uses running BatchNorm
            # statistics and disables dropout.
            model.eval()
            models.append(model)
        return models

    def _fit_lgbm_models(self, X_scaled, y_np):
        """Fit one LightGBM regressor per target on all scaled features."""
        models = []
        for i in range(self.N_TARGETS):
            lgbm_model = lgb.LGBMRegressor(
                n_estimators=100, learning_rate=0.05, max_depth=5,
                num_leaves=31, random_state=42, verbose=-1
            )
            lgbm_model.fit(X_scaled, y_np[:, i])
            models.append(lgbm_model)
        return models

    def predict(self, X_test):
        """Predict the targets for ``X_test`` and blend them toward the training mean.

        Returns
        -------
        torch.Tensor of float32, shape (len(X_test), 3), values clipped to [0, 1].

        Raises
        ------
        RuntimeError
            If called before ``train_model``.
        """
        if self.scaler is None or self.train_mean is None:
            raise RuntimeError('train_model() must be called before predict()')

        X_feat = get_features(X_test, self.data_type)
        X_scaled = self.scaler.transform(X_feat)
        n_samples = len(X_test)

        # GP predictions (same leading columns the GPs were fitted on)
        X_gp = X_scaled[:, :self.gp_feature_dim]
        gp_preds = np.zeros((n_samples, self.N_TARGETS))
        for i, gp in enumerate(self.gp_models):
            gp_preds[:, i] = gp.predict(X_gp)

        # MLP predictions: average over the bag
        X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
        with torch.no_grad():
            mlp_preds = np.mean(
                [model(X_tensor).cpu().numpy() for model in self.mlp_models], axis=0
            )

        # LGBM predictions
        lgbm_preds = np.zeros((n_samples, self.N_TARGETS))
        for i, lgbm_model in enumerate(self.lgbm_models):
            lgbm_preds[:, i] = lgbm_model.predict(X_scaled)

        # Ensemble
        ensemble_preds = (self.gp_weight * gp_preds
                          + self.mlp_weight * mlp_preds
                          + self.lgbm_weight * lgbm_preds)

        # Mean reversion: blend toward training mean (broadcast over rows),
        # then clip to the [0, 1] range.
        final_preds = self.alpha * ensemble_preds + (1 - self.alpha) * self.train_mean
        final_preds = np.clip(final_preds, 0, 1)

        return torch.tensor(final_preds, dtype=torch.float32)

print('MeanReversionModel defined')

MeanReversionModel defined


In [None]:
# Test different alpha values on single solvent data
#
# alpha only enters in the final blend inside predict(), so the ensemble is
# trained ONCE per leave-one-solvent-out fold and every alpha is scored on that
# same fitted model. Retraining per alpha would cost len(alpha_values) times
# more and would mix training noise (the MLPs are randomly initialised) into
# the comparison between alphas.
print("Testing mean reversion with different alpha values...")
print()

all_solvents = sorted(X_single["SOLVENT NAME"].unique())

alpha_values = [1.0, 0.95, 0.9, 0.85, 0.8, 0.7]
fold_mses_by_alpha = {alpha: [] for alpha in alpha_values}

for test_solvent in all_solvents:
    # Hold out every row of one solvent; train on all the others.
    mask = X_single["SOLVENT NAME"] != test_solvent

    model = MeanReversionModel(data='single')
    model.train_model(X_single[mask], Y_single[mask], epochs=150)

    X_holdout = X_single[~mask]
    actuals = Y_single[~mask].values

    for alpha in alpha_values:
        # predict() reads model.alpha at call time, so no retraining is needed.
        model.alpha = alpha
        preds = model.predict(X_holdout)
        mse = np.mean((actuals - preds.numpy())**2)
        fold_mses_by_alpha[alpha].append(mse)

# alpha -> (mean fold MSE, std of fold MSE)
results = {}
for alpha in alpha_values:
    mean_mse = np.mean(fold_mses_by_alpha[alpha])
    std_mse = np.std(fold_mses_by_alpha[alpha])
    results[alpha] = (mean_mse, std_mse)
    print(f"Alpha={alpha:.2f}: CV MSE = {mean_mse:.6f} +/- {std_mse:.6f}")

print()
print("Baseline (exp_030, alpha=1.0): CV = 0.008298")

In [None]:
# Find best alpha
# CV MSE reported for exp_030 (the un-blended ensemble); the number to beat.
BASELINE_CV_MSE = 0.008298

if not results:
    raise RuntimeError("No CV results available - run the alpha sweep cell first.")

# results maps alpha -> (mean fold MSE, std of fold MSE); pick the lowest mean.
best_alpha = min(results, key=lambda a: results[a][0])
best_mse = results[best_alpha][0]

print(f"Best alpha: {best_alpha}")
print(f"Best CV MSE: {best_mse:.6f}")
print(f"Baseline (exp_030): CV = {BASELINE_CV_MSE:.6f}")

relative_change = (best_mse - BASELINE_CV_MSE) / BASELINE_CV_MSE * 100
if best_mse < BASELINE_CV_MSE:
    print(f"Improvement: {-relative_change:.2f}%")
else:
    print(f"Degradation: {relative_change:.2f}%")

# The hard-coded baseline comes from a different run. alpha=1.0 from this sweep
# is the no-reversion reference under this notebook's own settings.
if 1.0 in results and best_alpha != 1.0:
    no_reversion_mse = results[1.0][0]
    gain = (no_reversion_mse - best_mse) / no_reversion_mse * 100
    print(f"vs. alpha=1.00 in this run: {gain:.2f}% lower CV MSE")