# Experiment 058: Extrapolation Detection + Conservative Predictions

**Goal:** Reduce the CV-LB intercept by detecting when we're extrapolating and making conservative predictions.

**Key insight:** The CV-LB relationship is LB = 4.31*CV + 0.0525. The intercept (0.0525) represents extrapolation error. If we can detect when we're extrapolating and make more conservative predictions, we might reduce this intercept.

**Approach:**
1. Compute solvent similarity using fingerprints (Tanimoto similarity)
2. For each test solvent, find its similarity to the nearest (most similar) training solvent
3. When that similarity is low (i.e. we are extrapolating), blend the predictions toward the population mean
4. This should reduce error on "hard" test solvents

**Hypothesis:** Conservative predictions for outlier solvents will reduce the intercept, not just improve CV.

In [1]:
# Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from abc import ABC, abstractmethod
import tqdm
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist

torch.set_default_dtype(torch.double)

# Data path for local execution
DATA_PATH = "/home/data"

print("Imports complete.")

Imports complete.


In [2]:
# Constants from official template
# Input columns selected by load_data("full") (mixed-solvent dataset).
INPUT_LABELS_FULL_SOLVENT = [
    "Residence Time",
    "Temperature",
    "SOLVENT A NAME",
    "SOLVENT B NAME",
    "SolventB%",
]

# Input columns selected by load_data("single_solvent").
INPUT_LABELS_SINGLE_SOLVENT = [
    "Residence Time",
    "Temperature",
    "SOLVENT NAME",
]

# Numeric process-condition columns shared by both input schemas above.
# Order matters: feature code indexes column 0 as time and column 1 as temperature.
INPUT_LABELS_NUMERIC = [
    "Residence Time",
    "Temperature",
]

# Target columns returned as Y by load_data; models predict one value per label.
TARGET_LABELS = [
    "Product 2",
    "Product 3",
    "SM",
]

print("Constants defined.")

Constants defined.


In [3]:
# Data loading functions
def load_data(name="full"):
    """Load the input and target frames of one yield dataset.

    Args:
        name: ``"full"`` for the mixed-solvent CSV or ``"single_solvent"``
            for the single-solvent CSV, both read from ``DATA_PATH``.

    Returns:
        Tuple ``(X, Y)``: the input columns for the chosen dataset and the
        ``TARGET_LABELS`` columns, both taken from the same CSV.
    """
    assert name in ["full", "single_solvent"]
    wants_full = name == "full"
    if wants_full:
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
    input_columns = INPUT_LABELS_FULL_SOLVENT if wants_full else INPUT_LABELS_SINGLE_SOLVENT
    frame = pd.read_csv(csv_path)
    return frame[input_columns], frame[TARGET_LABELS]

def load_features(name="spange_descriptors"):
    """Read the ``{name}_lookup.csv`` table from ``DATA_PATH``.

    Args:
        name: One of the supported lookup names listed below.

    Returns:
        DataFrame indexed by the first CSV column.
    """
    supported = ["spange_descriptors", "acs_pca_descriptors", "drfps_catechol", "fragprints", "smiles"]
    assert name in supported
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

# CV functions from official template
def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per solvent, holding that solvent out.

    Solvents are visited in sorted order of their ``"SOLVENT NAME"`` value.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
        exactly the rows of the held-out solvent.
    """
    solvent_column = X["SOLVENT NAME"]
    for held_out in sorted(solvent_column.unique()):
        is_test = solvent_column == held_out
        is_train = ~is_test
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per (solvent A, solvent B) ramp.

    Ramps are the distinct ``("SOLVENT A NAME", "SOLVENT B NAME")`` pairs in
    ``X``, visited in order sorted by A and then B.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
        exactly the rows of the held-out pair.
    """
    ramp_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    ramps = X[ramp_columns].drop_duplicates().sort_values(by=ramp_columns)
    for solvent_a, solvent_b in ramps.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        outside_ramp = ~in_ramp
        yield (
            (X[outside_ramp], Y[outside_ramp]),
            (X[in_ramp], Y[in_ramp]),
        )

print("Data loading and CV functions defined.")

Data loading and CV functions defined.


In [4]:
# Base classes from official template
class SmilesFeaturizer(ABC):
    """Template interface for featurizers; subclasses override both methods."""

    def __init__(self):
        # The base class is not meant to be instantiated directly.
        raise NotImplementedError()

    def featurize(self, X):
        """Turn ``X`` into features; must be provided by subclasses."""
        raise NotImplementedError()

class BaseModel(ABC):
    """Template interface for models used by the submission cells."""

    def __init__(self):
        """Create the model; the base class holds no state."""

    def train_model(self, X_train, y_train):
        """Fit the model on the given data; must be provided by subclasses."""
        raise NotImplementedError()

    def predict(self):
        """Return predictions; must be provided by subclasses."""
        raise NotImplementedError()

print("Base classes defined.")

Base classes defined.


In [5]:
# Load feature lookups (each table is indexed by its first CSV column)
SPANGE_DF = load_features("spange_descriptors")
DRFP_DF = load_features("drfps_catechol")
ACS_PCA_DF = load_features("acs_pca_descriptors")
FRAGPRINTS_DF = load_features("fragprints")

# Report table shapes as a quick sanity check on what was loaded.
print(f'Spange: {SPANGE_DF.shape}')
print(f'DRFP: {DRFP_DF.shape}')
print(f'ACS PCA: {ACS_PCA_DF.shape}')
print(f'Fragprints: {FRAGPRINTS_DF.shape}')

Spange: (26, 13)
DRFP: (24, 2048)
ACS PCA: (24, 5)
Fragprints: (24, 2133)


In [6]:
# Compute Tanimoto similarity between solvents using fingerprints
def compute_tanimoto_similarity(fp1, fp2):
    """Return the Tanimoto similarity of two fingerprint vectors.

    The score is ``sum(min(fp1, fp2)) / sum(max(fp1, fp2))``, which for 0/1
    vectors is intersection over union.  Two all-zero vectors score 1.0.
    """
    shared = np.minimum(fp1, fp2).sum()
    combined = np.maximum(fp1, fp2).sum()
    # An empty union means both fingerprints are all zeros: treat as identical.
    return 1.0 if combined == 0 else shared / combined

def compute_solvent_similarity_matrix(fingerprints_df):
    """Compute the pairwise Tanimoto similarity matrix for all solvents.

    Each entry is ``sum(min(a, b)) / sum(max(a, b))`` over the two rows'
    fingerprint vectors; a pair whose union is empty (both rows all zero)
    scores 1.0, the same convention as ``compute_tanimoto_similarity``.

    Args:
        fingerprints_df: DataFrame with one numeric fingerprint row per
            solvent; the index supplies the solvent labels.

    Returns:
        Square DataFrame of similarities whose index and columns are the
        labels of ``fingerprints_df``, in their original order.
    """
    solvents = fingerprints_df.index.tolist()
    n = len(solvents)

    # Pull the fingerprints out of pandas once; per-cell ``iloc`` row
    # extraction inside a nested loop dominated the original runtime.
    fingerprints = fingerprints_df.to_numpy(dtype=np.float64)

    # Start from 1.0 so pairs with an empty union keep the "identical" score.
    similarity_matrix = np.ones((n, n))
    for fingerprint, out_row in zip(fingerprints, similarity_matrix):
        # Compare one solvent against all solvents at once via broadcasting.
        intersection = np.minimum(fingerprint, fingerprints).sum(axis=1)
        union = np.maximum(fingerprint, fingerprints).sum(axis=1)
        # ``out_row`` is a view into ``similarity_matrix``; entries where the
        # union is zero are skipped and therefore stay at 1.0.
        np.divide(intersection, union, out=out_row, where=union != 0)

    return pd.DataFrame(similarity_matrix, index=solvents, columns=solvents)

# Compute similarity matrix using fragprints
# SIMILARITY_MATRIX is a module-level lookup (solvent label x solvent label)
# that ExtrapolationAwareModel reads to score how far a test solvent is from
# the training solvents.
# NOTE(review): in a recorded run the lookup also held mixture-style labels
# (e.g. 'Water.Acetonitrile'), and that entry scored 1.0 against
# 'Acetonitrile'. If such a label is among the training solvents when
# 'Acetonitrile' is held out, the fold would not be flagged as extrapolation
# — confirm this is intended.
SIMILARITY_MATRIX = compute_solvent_similarity_matrix(FRAGPRINTS_DF)
print(f"Similarity matrix shape: {SIMILARITY_MATRIX.shape}")
# Spot check: the five most similar entries for one reference solvent.
print(f"\nSample similarities for 'Acetonitrile':")
print(SIMILARITY_MATRIX.loc['Acetonitrile'].sort_values(ascending=False).head(5))

Similarity matrix shape: (24, 24)

Sample similarities for 'Acetonitrile':
Water.Acetonitrile             1.000000
Acetonitrile                   1.000000
Acetonitrile.Acetic Acid       0.444444
DMA [N,N-Dimethylacetamide]    0.100000
Methanol                       0.083333
Name: Acetonitrile, dtype: float64


In [7]:
# Extrapolation-aware model
class ExtrapolationAwareModel(BaseModel):
    """CatBoost + XGBoost ensemble that shrinks predictions when extrapolating.

    Every test row gets an extrapolation score ``1 - s``, where ``s`` is the
    highest similarity (looked up in ``SIMILARITY_MATRIX``) between the row's
    solvent and any solvent seen during training.  Rows whose score exceeds
    ``blend_threshold`` are blended toward the per-target training mean.

    Args:
        data: ``'single'`` reads the ``"SOLVENT NAME"`` column; any other
            value selects the mixed-solvent columns (``"SOLVENT A NAME"``,
            ``"SOLVENT B NAME"``, ``"SolventB%"``).
        blend_threshold: Extrapolation score (``1 - similarity``) at which
            blending starts.  The blend weight ramps linearly from 0 at this
            score up to ``blend_strength`` at score 1.  A value >= 1
            disables blending.
        blend_strength: Weight of the training mean at extrapolation
            score 1 (0 = never blend, 1 = fully replace by the mean).
    """

    def __init__(self, data='single', blend_threshold=0.5, blend_strength=0.3):
        self.data = data
        self.blend_threshold = blend_threshold
        self.blend_strength = blend_strength
        self.spange_df = SPANGE_DF
        self.similarity_matrix = SIMILARITY_MATRIX
        # Retained for backward compatibility; the fitted regressors live in
        # ``cat_models`` / ``xgb_models`` (one per target column).
        self.models = None
        self.cat_models = None
        self.xgb_models = None
        self.scaler = None
        self.train_mean = None
        self.train_solvents = None

    def _prepare_features(self, X):
        """Build the feature matrix: kinetic terms plus Spange descriptors.

        The kinetic columns are residence time, temperature, 1000 / T[K],
        log(time) and their product.  In mixed-solvent mode the descriptors
        of A and B are averaged with weights ``1 - pct`` and ``pct`` and the
        B fraction itself is added as a feature.
        """
        # Numeric features with Arrhenius engineering
        X_vals = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        time_m = X_vals[:, 0:1]
        temp_c = X_vals[:, 1:2]
        inv_temp = 1000.0 / (temp_c + 273.15)
        # The small offset keeps log() finite for a zero residence time.
        log_time = np.log(time_m + 1e-6)
        interaction = inv_temp * log_time
        X_kinetic = np.hstack([X_vals, inv_temp, log_time, interaction])

        if self.data == 'single':
            X_spange = self.spange_df.loc[X["SOLVENT NAME"]].values
            return np.hstack([X_kinetic, X_spange])

        # Mixed solvent features (weighted average of the two components)
        A_spange = self.spange_df.loc[X["SOLVENT A NAME"]].values
        B_spange = self.spange_df.loc[X["SOLVENT B NAME"]].values
        pct = X["SolventB%"].values.reshape(-1, 1) / 100.0
        X_spange = A_spange * (1 - pct) + B_spange * pct
        return np.hstack([X_kinetic, pct, X_spange])

    def _get_max_similarity_to_training(self, test_solvent, train_solvents):
        """Return the highest similarity of ``test_solvent`` to any training solvent.

        Returns 0.0 (maximum extrapolation) when the test solvent, or every
        training solvent, is missing from the similarity matrix.
        """
        matrix = self.similarity_matrix
        if test_solvent not in matrix.index:
            return 0.0  # Unknown solvent - maximum extrapolation

        known = [name for name in train_solvents if name in matrix.columns]
        if not known:
            return 0.0
        return float(matrix.loc[test_solvent, known].max())

    def train_model(self, train_X, train_Y, device=None, verbose=False):
        """Fit one CatBoost and one XGBoost regressor per target column.

        Also records the per-target training mean and the set of training
        solvents, both needed by ``predict``.  ``device`` and ``verbose`` are
        accepted for interface compatibility and are not used.
        """
        X_features = self._prepare_features(train_X)
        Y_np = train_Y.values
        n_targets = Y_np.shape[1]

        # Store training mean for blending
        self.train_mean = Y_np.mean(axis=0)

        # Store training solvents for similarity computation
        if self.data == 'single':
            self.train_solvents = train_X["SOLVENT NAME"].unique().tolist()
        else:
            self.train_solvents = list(
                set(train_X["SOLVENT A NAME"]) | set(train_X["SOLVENT B NAME"])
            )

        # Scale features
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X_features)

        # Train CatBoost models
        self.cat_models = []
        for i in range(n_targets):
            model = CatBoostRegressor(
                iterations=500,
                depth=6,
                learning_rate=0.05,
                l2_leaf_reg=3.0,
                random_seed=42,
                verbose=False,
            )
            model.fit(X_scaled, Y_np[:, i])
            self.cat_models.append(model)

        # Train XGBoost models
        self.xgb_models = []
        for i in range(n_targets):
            model = XGBRegressor(
                n_estimators=400,
                max_depth=5,
                learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0.1,
                reg_lambda=1.0,
                random_state=42,
                verbosity=0,
            )
            model.fit(X_scaled, Y_np[:, i])
            self.xgb_models.append(model)

    def _extrapolation_scores(self, test_X):
        """Return an (n_rows, 1) array of scores: 0 = seen solvent, 1 = very different."""
        cache = {}

        def max_similarity(solvent):
            # A fold contains only a handful of distinct solvents, so look
            # each one up once instead of once per row.
            if solvent not in cache:
                cache[solvent] = self._get_max_similarity_to_training(
                    solvent, self.train_solvents
                )
            return cache[solvent]

        if self.data == 'single':
            similarities = [max_similarity(s) for s in test_X["SOLVENT NAME"].values]
        else:
            # For mixed solvents, use the plain average of both components
            # (not weighted by SolventB%).
            similarities = [
                (max_similarity(solv_a) + max_similarity(solv_b)) / 2
                for solv_a, solv_b in zip(
                    test_X["SOLVENT A NAME"].values, test_X["SOLVENT B NAME"].values
                )
            ]
        return (1.0 - np.asarray(similarities, dtype=np.float64)).reshape(-1, 1)

    def _blend_weights(self, extrapolation_scores):
        """Map extrapolation scores to the weight given to the training mean."""
        span = 1.0 - self.blend_threshold
        if span <= 0.0:
            # Scores never exceed 1, so a threshold >= 1 can never trigger.
            # Guarding here also avoids the division by zero (NaN output) and
            # the inverted ramp the raw formula produces in that case.
            return np.zeros_like(extrapolation_scores)
        ramp = np.clip((extrapolation_scores - self.blend_threshold) / span, 0.0, 1.0)
        return ramp * self.blend_strength

    def predict(self, test_X):
        """Predict with extrapolation-aware blending.

        Returns:
            ``torch.double`` tensor of shape (n_rows, n_targets), clipped to
            [0, 1].

        Raises:
            RuntimeError: If called before ``train_model``.
        """
        if self.scaler is None or not self.cat_models or not self.xgb_models:
            raise RuntimeError("train_model() must be called before predict()")

        X_features = self._prepare_features(test_X)
        X_scaled = self.scaler.transform(X_features)

        # Get base predictions from ensemble
        cat_preds = np.column_stack([m.predict(X_scaled) for m in self.cat_models])
        xgb_preds = np.column_stack([m.predict(X_scaled) for m in self.xgb_models])

        # Ensemble: 60% CatBoost + 40% XGBoost
        base_preds = 0.6 * cat_preds + 0.4 * xgb_preds

        # Blend toward the training mean in proportion to how far each row's
        # solvent lies from the training solvents.
        blend_weights = self._blend_weights(self._extrapolation_scores(test_X))
        final_preds = (1 - blend_weights) * base_preds + blend_weights * self.train_mean

        # Clip to [0, 1]
        final_preds = np.clip(final_preds, 0.0, 1.0)

        return torch.tensor(final_preds, dtype=torch.double)

print("ExtrapolationAwareModel defined.")

ExtrapolationAwareModel defined.


In [8]:
# Quick test
# Smoke test only: runs a single leave-one-solvent-out fold end to end and
# prints the results. Nothing is asserted, so "Test passed!" just means no
# exception was raised.
print("Testing ExtrapolationAwareModel...")
X, Y = load_data("single_solvent")
print(f"Single solvent data: X={X.shape}, Y={Y.shape}")

# Test one fold
# The generator yields folds in sorted solvent order, so this is the first one.
split_gen = generate_leave_one_out_splits(X, Y)
(train_X, train_Y), (test_X, test_Y) = next(split_gen)

print(f"Test solvent: {test_X['SOLVENT NAME'].iloc[0]}")
print(f"Training solvents: {train_X['SOLVENT NAME'].unique()[:5]}...")

# data defaults to 'single', matching the single-solvent frame loaded above.
model = ExtrapolationAwareModel(blend_threshold=0.5, blend_strength=0.3)
model.train_model(train_X, train_Y)
preds = model.predict(test_X)

# Show the first prediction next to its ground truth and the training mean
# that predictions are blended toward.
print(f"Predictions shape: {preds.shape}")
print(f"Predictions sample: {preds[0]}")
print(f"Actual sample: {test_Y.iloc[0].values}")
print(f"Training mean: {model.train_mean}")
print("Test passed!")

Testing ExtrapolationAwareModel...
Single solvent data: X=(656, 3), Y=(656, 3)
Test solvent: 1,1,1,3,3,3-Hexafluoropropan-2-ol
Training solvents: ['Methanol' 'Ethylene Glycol [1,2-Ethanediol]'
 '2-Methyltetrahydrofuran [2-MeTHF]' 'Cyclohexane' 'IPA [Propan-2-ol]']...


Predictions shape: torch.Size([37, 3])
Predictions sample: tensor([0.0258, 0.0194, 0.9150])
Actual sample: [0.01692854 0.02519112 0.95783321]
Training mean: [0.13978304 0.11369471 0.54321009]
Test passed!


In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = ExtrapolationAwareModel(blend_threshold=0.5, blend_strength=0.3) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

print(f"Single solvent predictions: {len(submission_single_solvent)}")
print(f"Unique folds: {submission_single_solvent['fold'].nunique()}")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = ExtrapolationAwareModel(data='full', blend_threshold=0.5, blend_strength=0.3) # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

print(f"Full data predictions: {len(submission_full_data)}")
print(f"Unique folds: {submission_full_data['fold'].nunique()}")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

print(f"Submission saved to /home/submission/submission.csv")
print(f"Total rows: {len(submission)}")

In [None]:
# Calculate CV for logging
# Re-runs both cross-validation schemes (retraining every fold) to report a
# local MSE; this is independent of the submission frames built earlier.
print("\n" + "="*60)
print("CV CALCULATION")
print("="*60)

# Single solvent CV
X, Y = load_data("single_solvent")
fold_mses = []

for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_out_splits(X, Y)):
    model = ExtrapolationAwareModel(blend_threshold=0.5, blend_strength=0.3)
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # Fold MSE is averaged over all rows and all target columns of the fold.
    mse = np.mean((preds - test_Y.values) ** 2)
    fold_mses.append(mse)
    # Only every fifth fold is printed to keep the log short.
    if fold_idx % 5 == 0:
        print(f"  Fold {fold_idx}: MSE = {mse:.6f}")

# Unweighted mean over folds: each fold counts equally regardless of its size.
single_cv = np.mean(fold_mses)
single_std = np.std(fold_mses)
print(f"\nSingle solvent CV MSE: {single_cv:.6f} ± {single_std:.6f}")

# Full data CV
X, Y = load_data("full")
full_fold_mses = []

for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_ramp_out_splits(X, Y)):
    model = ExtrapolationAwareModel(data='full', blend_threshold=0.5, blend_strength=0.3)
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    mse = np.mean((preds - test_Y.values) ** 2)
    full_fold_mses.append(mse)
    print(f"  Fold {fold_idx}: MSE = {mse:.6f}")

full_cv = np.mean(full_fold_mses)
full_std = np.std(full_fold_mses)
print(f"\nFull data CV MSE: {full_cv:.6f} ± {full_std:.6f}")

# NOTE(review): only the single-solvent score is logged as the final CV; the
# full-data score is printed above but not combined in — confirm intended.
print(f"\nFINAL CV FOR LOGGING: {single_cv:.6f}")

In [None]:
# Summary
# Prints a recap of the experiment; relies on single_cv / single_std /
# full_cv / full_std left in scope by the CV calculation cell.
print("\n" + "="*60)
print("EXPERIMENT 058: EXTRAPOLATION DETECTION SUMMARY")
print("="*60)

print("\nAPPROACH:")
print("  - Compute Tanimoto similarity between solvents using fragprints")
print("  - When test solvent is dissimilar to training solvents (extrapolating),")
print("    blend predictions toward population mean")
# NOTE(review): the hyperparameters below are typed into the message by hand;
# keep them in sync with the values passed to ExtrapolationAwareModel.
print("  - blend_threshold=0.5, blend_strength=0.3")
print("  - Base model: CatBoost + XGBoost ensemble (60:40)")

print(f"\nCV SCORES:")
print(f"  Single solvent: {single_cv:.6f} ± {single_std:.6f}")
print(f"  Full data: {full_cv:.6f} ± {full_std:.6f}")

# Predicted LB using the CV-LB relationship
# Slope and intercept are the LB = 4.31*CV + 0.0525 fit quoted in the notebook
# header. Only the single-solvent CV is plugged in.
predicted_lb = 4.31 * single_cv + 0.0525
print(f"\nPREDICTED LB (using CV-LB relationship): {predicted_lb:.4f}")
# The two reference scores below are hard-coded, not computed in this notebook.
print(f"  Best LB so far: 0.0877")
print(f"  Target: 0.0347")

print("\nHYPOTHESIS:")
print("  Conservative predictions for outlier solvents should reduce the intercept,")
print("  not just improve CV. This could change the CV-LB relationship.")