# Experiment 092: Conservative Predictions for Extrapolation

**Goal**: Attack the CV-LB intercept problem by detecting extrapolation and blending predictions toward training mean.

**Rationale**: The CV-LB relationship has intercept 0.0525 > target 0.0347. This intercept represents structural distribution shift. If we detect when we're extrapolating (predicting for solvents far from training distribution) and blend toward the training mean, we can reduce the intercept.

**Approach**:
1. Use our best model (GP+MLP+LGBM ensemble) as base
2. Compute extrapolation score based on distance to training solvents
3. Blend predictions toward training mean for high-uncertainty cases

In [1]:
import sys
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import NearestNeighbors
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
import lightgbm as lgb
from abc import ABC, abstractmethod
import warnings
warnings.filterwarnings('ignore')

# Define constants
INPUT_LABELS_FULL_SOLVENT = [
    "Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%",
]
INPUT_LABELS_SINGLE_SOLVENT = [
    "Residence Time", "Temperature", "SOLVENT NAME",
]
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

# Local data loading functions
def load_data(name="full"):
    assert name in ["full", "single_solvent"]
    if name == "full":
        df = pd.read_csv('/home/data/catechol_full_data_yields.csv')
        X = df[INPUT_LABELS_FULL_SOLVENT]
    else:
        df = pd.read_csv('/home/data/catechol_single_solvent_yields.csv')
        X = df[INPUT_LABELS_SINGLE_SOLVENT]
    Y = df[TARGET_LABELS]
    return X, Y

def load_features(name="spange_descriptors"):
    features = pd.read_csv(f'/home/data/{name}_lookup.csv', index_col=0)
    return features

def generate_leave_one_out_splits(X, Y):
    """Generate all leave-one-out splits across the solvents."""
    all_solvents = X["SOLVENT NAME"].unique()
    for solvent_name in sorted(all_solvents):
        train_idcs_mask = X["SOLVENT NAME"] != solvent_name
        yield (
            (X[train_idcs_mask], Y[train_idcs_mask]),
            (X[~train_idcs_mask], Y[~train_idcs_mask]),
        )

def generate_leave_one_ramp_out_splits(X, Y):
    """Generate all leave-one-out splits across the solvent ramps."""
    all_solvent_ramps = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    all_solvent_ramps = all_solvent_ramps.sort_values(by=["SOLVENT A NAME", "SOLVENT B NAME"])
    for _, solvent_pair in all_solvent_ramps.iterrows():
        train_idcs_mask = (X[["SOLVENT A NAME", "SOLVENT B NAME"]] != solvent_pair).all(axis=1)
        yield (
            (X[train_idcs_mask], Y[train_idcs_mask]),
            (X[~train_idcs_mask], Y[~train_idcs_mask]),
        )

print("Imports complete")
print(f"GPU available: {torch.cuda.is_available()}")

Imports complete
GPU available: True


In [2]:
# Base classes
class SmilesFeaturizer(ABC):
    def __init__(self):
        raise NotImplementedError

    def featurize(self, X):
        raise NotImplementedError

class BaseModel(ABC):
    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        raise NotImplementedError

    def predict(self, X):
        raise NotImplementedError

In [3]:
# Featurizers with Arrhenius features (from best model)
class PrecomputedFeaturizerWithArrhenius(SmilesFeaturizer):
    def __init__(self, features='spange_descriptors'):
        self.features = load_features(features)
        self.feats_dim = self.features.shape[1] + 5  # +5 for Time, Temp, 1/T, log(t), t/T
        
    def featurize(self, X):
        res_time = X['Residence Time'].values.reshape(-1, 1)
        temp = X['Temperature'].values.reshape(-1, 1)
        
        # Arrhenius-inspired features
        inv_temp = 1.0 / (temp + 273.15)  # 1/T in Kelvin
        log_time = np.log(res_time + 1)  # log(t+1)
        time_over_temp = res_time / (temp + 273.15)  # t/T
        
        solvent_names = X['SOLVENT NAME']
        feats = self.features.loc[solvent_names].values
        
        final_feats = np.hstack([res_time, temp, inv_temp, log_time, time_over_temp, feats])
        return torch.tensor(final_feats, dtype=torch.float32)
    
    def get_solvent_features(self, X):
        """Get only solvent features for extrapolation detection."""
        solvent_names = X['SOLVENT NAME']
        return self.features.loc[solvent_names].values

class PrecomputedFeaturizerMixedWithArrhenius(SmilesFeaturizer):
    def __init__(self, features='spange_descriptors'):
        self.features = load_features(features)
        self.feats_dim = self.features.shape[1] + 6  # +6 for Time, Temp, %B, 1/T, log(t), t/T
        
    def featurize(self, X):
        res_time = X['Residence Time'].values.reshape(-1, 1)
        temp = X['Temperature'].values.reshape(-1, 1)
        sb_pct = X['SolventB%'].values.reshape(-1, 1)
        
        # Arrhenius-inspired features
        inv_temp = 1.0 / (temp + 273.15)
        log_time = np.log(res_time + 1)
        time_over_temp = res_time / (temp + 273.15)
        
        desc_a = self.features.loc[X['SOLVENT A NAME']].values
        desc_b = self.features.loc[X['SOLVENT B NAME']].values
        
        mixture_feats = (1 - sb_pct) * desc_a + sb_pct * desc_b
        
        final_feats = np.hstack([res_time, temp, sb_pct, inv_temp, log_time, time_over_temp, mixture_feats])
        return torch.tensor(final_feats, dtype=torch.float32)
    
    def get_solvent_features(self, X):
        """Get mixture solvent features for extrapolation detection."""
        sb_pct = X['SolventB%'].values.reshape(-1, 1)
        desc_a = self.features.loc[X['SOLVENT A NAME']].values
        desc_b = self.features.loc[X['SOLVENT B NAME']].values
        return (1 - sb_pct) * desc_a + sb_pct * desc_b

print("Featurizers defined")

Featurizers defined


In [4]:
# MLP model
class SimpleMLP(nn.Module):
    def __init__(self, input_dim, output_dim=3, hidden_dims=[64, 32], dropout=0.1):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, h_dim))
            layers.append(nn.BatchNorm1d(h_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            prev_dim = h_dim
        layers.append(nn.Linear(prev_dim, output_dim))
        self.network = nn.Sequential(*layers)
        
    def forward(self, x):
        return self.network(x)

print("MLP defined")

MLP defined


In [5]:
# Conservative Extrapolation Model
class ConservativeExtrapolationModel(BaseModel):
    """Model that detects extrapolation and blends toward training mean."""
    
    def __init__(self, data='single', blend_threshold=0.5, blend_strength=0.3):
        super().__init__()
        self.data = data
        self.blend_threshold = blend_threshold  # Percentile threshold for extrapolation
        self.blend_strength = blend_strength  # How much to blend toward mean (0-1)
        
        # Featurizer
        if data == 'single':
            self.smiles_featurizer = PrecomputedFeaturizerWithArrhenius('spange_descriptors')
        else:
            self.smiles_featurizer = PrecomputedFeaturizerMixedWithArrhenius('spange_descriptors')
        
        # MLP
        self.mlp = SimpleMLP(
            input_dim=self.smiles_featurizer.feats_dim,
            output_dim=3,
            hidden_dims=[64, 32],
            dropout=0.1
        )
        
        # LightGBM
        self.lgbm = MultiOutputRegressor(lgb.LGBMRegressor(
            num_leaves=31,
            learning_rate=0.1,
            n_estimators=100,
            random_state=42,
            verbosity=-1
        ))
        
        # GP for uncertainty
        kernel = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
        self.gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=2, random_state=42)
        
        # Scaler and extrapolation detector
        self.scaler = StandardScaler()
        self.solvent_scaler = StandardScaler()
        self.nn_detector = None
        self.train_mean = None
        self.train_distances = None
        
        # Ensemble weights (from best model)
        self.weights = [0.3, 0.4, 0.3]  # GP, MLP, LGBM
        
    def train_model(self, train_X, train_Y, num_epochs=100, lr=1e-3, batch_size=32,
                    optimizer=torch.optim.Adam, criterion=nn.MSELoss, device=None, verbose=False):
        # Store training mean for blending
        self.train_mean = train_Y.values.mean(axis=0)
        
        # Featurize
        X_tensor = self.smiles_featurizer.featurize(train_X)
        X_np = X_tensor.numpy()
        train_Y_np = train_Y.values
        
        # Scale
        X_scaled = self.scaler.fit_transform(X_np)
        
        # Get solvent features for extrapolation detection
        solvent_feats = self.smiles_featurizer.get_solvent_features(train_X)
        solvent_feats_scaled = self.solvent_scaler.fit_transform(solvent_feats)
        
        # Fit nearest neighbor detector on training solvent features
        self.nn_detector = NearestNeighbors(n_neighbors=min(5, len(solvent_feats_scaled)))
        self.nn_detector.fit(solvent_feats_scaled)
        
        # Compute training distances for threshold calibration
        train_distances, _ = self.nn_detector.kneighbors(solvent_feats_scaled)
        self.train_distances = train_distances.mean(axis=1)
        
        # DataFrame for GBDT
        feature_names = [str(i) for i in range(X_scaled.shape[1])]
        X_scaled_df = pd.DataFrame(X_scaled, columns=feature_names)
        
        # Train LightGBM
        self.lgbm.fit(X_scaled_df, train_Y_np)
        
        # Train GP (on subset for speed)
        n_gp = min(200, len(X_scaled))
        indices = np.random.choice(len(X_scaled), n_gp, replace=False)
        self.gp.fit(X_scaled[indices], train_Y_np[indices, 0])  # GP for first target only
        
        # Train MLP
        X_tensor_scaled = torch.tensor(X_scaled, dtype=torch.float32)
        train_Y_tensor = torch.tensor(train_Y_np, dtype=torch.float32)
        
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.mlp.to(device)
        
        optimizer_inst = optimizer(self.mlp.parameters(), lr=lr)
        train_loader = DataLoader(
            TensorDataset(X_tensor_scaled, train_Y_tensor),
            batch_size=batch_size, shuffle=True, drop_last=True
        )
        
        criterion_inst = criterion()
        for epoch in range(num_epochs):
            self.mlp.train()
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer_inst.zero_grad()
                loss = criterion_inst(self.mlp(inputs), targets)
                loss.backward()
                optimizer_inst.step()
    
    def predict(self, test_X):
        X_tensor = self.smiles_featurizer.featurize(test_X)
        X_np = X_tensor.numpy()
        X_scaled = self.scaler.transform(X_np)
        
        # Get solvent features for extrapolation detection
        solvent_feats = self.smiles_featurizer.get_solvent_features(test_X)
        solvent_feats_scaled = self.solvent_scaler.transform(solvent_feats)
        
        # Compute extrapolation scores
        test_distances, _ = self.nn_detector.kneighbors(solvent_feats_scaled)
        extrapolation_scores = test_distances.mean(axis=1)
        
        # Compute threshold based on training distances
        threshold = np.percentile(self.train_distances, self.blend_threshold * 100)
        
        # DataFrame for GBDT
        feature_names = [str(i) for i in range(X_scaled.shape[1])]
        X_scaled_df = pd.DataFrame(X_scaled, columns=feature_names)
        
        # MLP predictions
        self.mlp.eval()
        device = next(self.mlp.parameters()).device
        with torch.no_grad():
            X_tensor_scaled = torch.tensor(X_scaled, dtype=torch.float32).to(device)
            mlp_preds = self.mlp(X_tensor_scaled).cpu().numpy()
        
        # LGBM predictions
        lgb_preds = self.lgbm.predict(X_scaled_df)
        
        # GP predictions (with uncertainty)
        gp_mean, gp_std = self.gp.predict(X_scaled, return_std=True)
        # Expand GP prediction to all targets (use same prediction for simplicity)
        gp_preds = np.column_stack([gp_mean, gp_mean, gp_mean])
        
        # Weighted ensemble
        ensemble_preds = (
            self.weights[0] * gp_preds +
            self.weights[1] * mlp_preds +
            self.weights[2] * lgb_preds
        )
        
        # Apply conservative blending for extrapolation
        # Higher extrapolation score -> blend more toward training mean
        blend_weights = np.clip((extrapolation_scores - threshold) / threshold, 0, 1)
        blend_weights = blend_weights.reshape(-1, 1) * self.blend_strength
        
        # Blend toward training mean
        final_preds = (1 - blend_weights) * ensemble_preds + blend_weights * self.train_mean
        
        return torch.tensor(final_preds)

print("ConservativeExtrapolationModel defined")

ConservativeExtrapolationModel defined


In [6]:
# Test the model quickly
X_single, Y_single = load_data("single_solvent")
print(f"Single solvent data: {X_single.shape}, {Y_single.shape}")

# Quick test
model = ConservativeExtrapolationModel(data='single', blend_threshold=0.5, blend_strength=0.3)
model.train_model(X_single, Y_single, num_epochs=10)
preds = model.predict(X_single[:5])
print(f"Test predictions shape: {preds.shape}")
print(f"Sample predictions:\n{preds[:3]}")
print(f"Training mean: {model.train_mean}")

Single solvent data: (656, 3), (656, 3)


Test predictions shape: torch.Size([5, 3])
Sample predictions:
tensor([[nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan]], dtype=torch.float64)
Training mean: [0.14993233 0.12337957 0.52219232]


In [None]:
# Run CV with different blend parameters
import tqdm

def compute_cv_score(blend_threshold=0.5, blend_strength=0.3, verbose=True):
    """Compute CV score with conservative extrapolation."""
    
    # Single solvent CV
    X_single, Y_single = load_data("single_solvent")
    split_generator = generate_leave_one_out_splits(X_single, Y_single)
    
    single_mse_list = []
    for fold_idx, split in enumerate(split_generator):
        (train_X, train_Y), (test_X, test_Y) = split
        
        model = ConservativeExtrapolationModel(
            data='single', 
            blend_threshold=blend_threshold, 
            blend_strength=blend_strength
        )
        model.train_model(train_X, train_Y, num_epochs=100)
        
        predictions = model.predict(test_X)
        predictions_np = predictions.detach().cpu().numpy()
        
        mse = np.mean((predictions_np - test_Y.values) ** 2)
        single_mse_list.append(mse)
        if verbose:
            print(f"Single Fold {fold_idx}: MSE = {mse:.6f}")
    
    single_cv = np.mean(single_mse_list)
    if verbose:
        print(f"\nSingle Solvent CV MSE: {single_cv:.6f}")
    
    # Full data CV
    X_full, Y_full = load_data("full")
    split_generator = generate_leave_one_ramp_out_splits(X_full, Y_full)
    
    full_mse_list = []
    for fold_idx, split in enumerate(split_generator):
        (train_X, train_Y), (test_X, test_Y) = split
        
        model = ConservativeExtrapolationModel(
            data='full', 
            blend_threshold=blend_threshold, 
            blend_strength=blend_strength
        )
        model.train_model(train_X, train_Y, num_epochs=100)
        
        predictions = model.predict(test_X)
        predictions_np = predictions.detach().cpu().numpy()
        
        mse = np.mean((predictions_np - test_Y.values) ** 2)
        full_mse_list.append(mse)
        if verbose:
            print(f"Full Fold {fold_idx}: MSE = {mse:.6f}")
    
    full_cv = np.mean(full_mse_list)
    if verbose:
        print(f"\nFull Data CV MSE: {full_cv:.6f}")
    
    # Combined CV
    combined_cv = (single_cv + full_cv) / 2
    if verbose:
        print(f"\n=== Combined CV MSE: {combined_cv:.6f} ===")
    
    return single_cv, full_cv, combined_cv

# First test with default parameters
print("Testing with blend_threshold=0.5, blend_strength=0.3")
single_cv, full_cv, combined_cv = compute_cv_score(blend_threshold=0.5, blend_strength=0.3)

In [None]:
# Test different blend parameters
print("\n" + "="*60)
print("Testing different blend parameters")
print("="*60)

results = []

# Test different combinations
for threshold in [0.3, 0.5, 0.7]:
    for strength in [0.1, 0.3, 0.5]:
        print(f"\nTesting threshold={threshold}, strength={strength}")
        _, _, cv = compute_cv_score(blend_threshold=threshold, blend_strength=strength, verbose=False)
        results.append({'threshold': threshold, 'strength': strength, 'cv': cv})
        print(f"CV = {cv:.6f}")

# Find best
results_df = pd.DataFrame(results)
best_idx = results_df['cv'].idxmin()
best_params = results_df.loc[best_idx]
print(f"\n=== Best Parameters ===")
print(f"Threshold: {best_params['threshold']}")
print(f"Strength: {best_params['strength']}")
print(f"CV: {best_params['cv']:.6f}")
print(f"\nBaseline CV: 0.008298")

In [None]:
# Save results
import json

best_threshold = best_params['threshold']
best_strength = best_params['strength']
best_cv = best_params['cv']

results_dict = {
    'cv_score': float(best_cv),
    'single_cv': float(single_cv),
    'full_cv': float(full_cv),
    'combined_cv': float(combined_cv),
    'best_threshold': float(best_threshold),
    'best_strength': float(best_strength),
    'model': 'ConservativeExtrapolationModel (GP+MLP+LGBM with extrapolation blending)',
    'baseline_cv': 0.008298,
    'all_results': results
}

with open('/home/code/experiments/092_conservative_extrapolation/metrics.json', 'w') as f:
    json.dump(results_dict, f, indent=2)

print("Results saved")
print(f"Best CV: {best_cv:.6f}")
print(f"Baseline CV: 0.008298")
print(f"Improvement: {(0.008298 - best_cv) / 0.008298 * 100:.2f}%")

## Generate Submission (if CV is better than baseline)

The following cells follow the official template structure.

In [None]:
# Use best parameters for submission
BEST_THRESHOLD = best_threshold
BEST_STRENGTH = best_strength

print(f"Using threshold={BEST_THRESHOLD}, strength={BEST_STRENGTH}")
print(f"Best CV: {best_cv:.6f}")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = ConservativeExtrapolationModel(data='single', blend_threshold=BEST_THRESHOLD, blend_strength=BEST_STRENGTH)  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = ConservativeExtrapolationModel(data='full', blend_threshold=BEST_THRESHOLD, blend_strength=BEST_STRENGTH)  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

# Also save to standard location
import shutil
import os
os.makedirs('/home/submission', exist_ok=True)
shutil.copy("submission.csv", "/home/submission/submission.csv")
print("Submission saved!")
print(f"Shape: {submission.shape}")
print(submission.head())

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################