# Experiment 104: Post-hoc Bias Correction

**Goal**: Reduce the CV-LB intercept by applying post-hoc bias correction.

**Rationale**:
- CV-LB relationship: LB = 4.29 × CV + 0.0528 (R² = 0.95)
- Intercept (0.0528) > Target (0.0347): under this linear fit even a perfect CV of 0 would give LB ≈ 0.0528, so the target is unreachable unless the intercept itself is reduced
- Post-hoc bias correction can reduce systematic offset

**Implementation**:
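1. After fitting each fold's model, compute the per-target bias on that fold's training data: `bias = preds.mean() - y_train.mean()` (an in-sample proxy for the validation bias `preds.mean() - y_val.mean()`)
2. Store that bias vector on the fold's model (it is not averaged across folds)
3. Apply bias correction to predictions: `corrected = preds - bias`, then clip to [0, 1] and rescale rows whose sum exceeds 1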

**Key insight from web research**:
> "Apply a post-hoc intercept-bias correction – after fitting, compute the average residual on a small validation set of known compounds and subtract that mean bias from all future predictions."

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from catboost import CatBoostRegressor
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Report CUDA availability, and the name of device 0 when one is present.
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Data loading functions
# Directory that holds the yield CSVs and the "*_lookup.csv" descriptor tables.
DATA_PATH = '/home/data'

# Numeric process-condition columns shared by both datasets.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Input columns selected from the single-solvent yields CSV.
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
# Input columns selected from the full (two-solvent) yields CSV.
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Load the inputs and targets of one of the two yield datasets.

    Args:
        name: ``"full"`` for the two-solvent dataset or ``"single_solvent"``
            for the single-solvent dataset.

    Returns:
        A tuple ``(X, Y)`` of DataFrames. ``X`` holds the input columns for
        the chosen dataset and ``Y`` holds the "Product 2", "Product 3" and
        "SM" columns.

    Raises:
        ValueError: If ``name`` is not one of the two supported dataset names.
    """
    # Reject typos up front: previously any unknown name silently fell
    # through to the single-solvent file.
    if name not in ("full", "single_solvent"):
        raise ValueError(
            f"Unknown dataset name {name!r}; expected 'full' or 'single_solvent'"
        )

    if name == "full":
        df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        X = df[INPUT_LABELS_FULL_SOLVENT]
    else:
        df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        X = df[INPUT_LABELS_SINGLE_SOLVENT]
    Y = df[["Product 2", "Product 3", "SM"]]
    return X, Y

def load_features(name):
    """Read ``<DATA_PATH>/<name>_lookup.csv`` using its first column as the index."""
    lookup_csv = f'{DATA_PATH}/{name}_lookup.csv'
    table = pd.read_csv(lookup_csv, index_col=0)
    return table

# Leave-One-Out validation (same as exp_030)
def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per distinct value of "SOLVENT NAME".

    Solvents are held out in sorted order. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    exactly the rows of the held-out solvent.
    """
    for held_out in sorted(X["SOLVENT NAME"].unique()):
        is_test = X["SOLVENT NAME"] == held_out
        is_train = ~is_test
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    exactly the rows whose two solvent names match the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    unique_pairs = X[pair_columns].drop_duplicates()
    for name_a, name_b in unique_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == name_a) & (X["SOLVENT B NAME"] == name_b)
        outside_ramp = ~in_ramp
        yield (X[outside_ramp], Y[outside_ramp]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

In [None]:
# Load feature lookups
# Each table uses its first CSV column as the index; FullFeaturizer looks rows
# up by solvent name via .loc, so the index is expected to hold solvent names.
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFP_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

# Keep only DRFP columns whose variance is strictly positive, i.e. drop
# columns that are constant across all rows of the lookup table.
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = drfp_variance[drfp_variance > 0].index.tolist()
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]

print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')

In [None]:
# Base classes
from abc import ABC, abstractmethod

class SmilesFeaturizer(ABC):
    """Interface placeholder for featurizers.

    Both methods raise ``NotImplementedError``; a subclass must override
    ``__init__`` (the base constructor cannot be called) and ``featurize``.
    """

    def __init__(self):
        raise NotImplementedError

    def featurize(self, X, Y):
        # `self` was missing from the signature, so calling this through a
        # subclass instance failed with TypeError instead of signalling
        # "not implemented".
        raise NotImplementedError

class BaseModel(ABC):
    """Interface placeholder for models with a train/predict API.

    Subclasses override ``train_model`` and ``predict``; the base versions
    raise ``NotImplementedError``.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs and targets."""
        raise NotImplementedError

    def predict(self, test_X=None):
        """Return predictions for ``test_X``.

        The optional parameter keeps the old zero-argument call working
        while matching subclasses, whose ``predict`` takes the inputs to
        predict on.
        """
        raise NotImplementedError

print('Base classes defined')

In [None]:
# Featurizer with all features (like ens-model)
class FullFeaturizer:
    """Build one feature matrix from process conditions and solvent descriptors.

    The output columns are, in order: the two numeric inputs, three derived
    kinetic terms, then the Spange, filtered-DRFP and ACS-PCA descriptor
    blocks. With ``mixed=True`` each descriptor block is a linear blend of
    the solvent A and solvent B rows, weighted by the "SolventB%" column.
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.drfp_df = DRFP_FILTERED
        self.acs_pca_df = ACS_PCA_DF
        # 2 raw numeric inputs + 3 derived kinetic terms, then every
        # descriptor column of the three lookup tables.
        descriptor_width = sum(
            table.shape[1]
            for table in (self.spange_df, self.drfp_df, self.acs_pca_df)
        )
        self.feats_dim = 2 + 3 + descriptor_width

    def featurize(self, X):
        """Return the feature array for the rows of ``X``."""
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        residence_time = numeric[:, 0:1]
        temperature = numeric[:, 1:2]

        # Temperature is shifted by 273.15 (treated as Celsius -> Kelvin)
        # before taking the scaled reciprocal.
        inv_temp = 1000.0 / (temperature + 273.15)
        # The small offset keeps the logarithm finite at zero residence time.
        log_time = np.log(residence_time + 1e-6)
        kinetic = np.hstack([numeric, inv_temp, log_time, inv_temp * log_time])

        tables = (self.spange_df, self.drfp_df, self.acs_pca_df)
        if self.mixed:
            # Used directly as the blending weight of solvent B.
            frac_b = X["SolventB%"].values.reshape(-1, 1)
            descriptor_blocks = []
            for table in tables:
                rows_a = table.loc[X["SOLVENT A NAME"]].values
                rows_b = table.loc[X["SOLVENT B NAME"]].values
                descriptor_blocks.append(rows_a * (1 - frac_b) + rows_b * frac_b)
        else:
            descriptor_blocks = [
                table.loc[X["SOLVENT NAME"]].values for table in tables
            ]

        return np.hstack([kinetic, *descriptor_blocks])

print(f'Full feature dimension: {FullFeaturizer().feats_dim}')

In [None]:
# Bias-Corrected CatBoost + XGBoost Ensemble
class BiasCorrectedEnsemble(BaseModel):
    """CatBoost + XGBoost ensemble with post-hoc bias correction.

    One CatBoost and one XGBoost regressor are fitted per target column on
    standardized features. Their predictions are blended with fixed weights
    chosen by dataset, and a per-target offset estimated at the end of
    training is subtracted at prediction time.

    Args:
        data: ``'single'`` for the single-solvent dataset or ``'full'`` for
            the two-solvent dataset. This selects both the featurization
            mode and the blend weights.

    Raises:
        ValueError: If ``data`` is neither ``'single'`` nor ``'full'``.
    """

    def __init__(self, data='single'):
        # Any other value (e.g. 'single_solvent', the name load_data uses)
        # used to be accepted silently and ended up with single-solvent
        # features combined with the full-data blend weights.
        if data not in ('single', 'full'):
            raise ValueError(
                f"Unknown data mode {data!r}; expected 'single' or 'full'"
            )
        super().__init__()

        self.data = data
        self.mixed = (data == 'full')
        self.featurizer = FullFeaturizer(mixed=self.mixed)
        self.scaler = StandardScaler()

        # Weights from ens-model kernel
        if data == 'single':
            self.cat_weight = 7.0 / 13.0
            self.xgb_weight = 6.0 / 13.0
        else:
            self.cat_weight = 1.0 / 3.0
            self.xgb_weight = 2.0 / 3.0

        # CatBoost params
        self.cat_params = dict(
            random_state=42,
            iterations=500,
            learning_rate=0.05,
            depth=6,
            l2_leaf_reg=3,
            verbose=0,
        )

        # XGBoost params
        self.xgb_params = dict(
            random_state=42,
            n_estimators=300,
            learning_rate=0.05,
            max_depth=5,
            subsample=0.8,
            colsample_bytree=0.8,
            verbosity=0,
        )

        self.cat_models = None
        self.xgb_models = None
        self.bias = None  # Per-target bias correction
        self.train_mean = None

    def _ensemble_predict(self, X_scaled):
        """Return the weighted CatBoost/XGBoost blend, one column per target."""
        cat_preds = np.column_stack([m.predict(X_scaled) for m in self.cat_models])
        xgb_preds = np.column_stack([m.predict(X_scaled) for m in self.xgb_models])
        return self.cat_weight * cat_preds + self.xgb_weight * xgb_preds

    def train_model(self, train_X, train_Y):
        """Fit the scaler and per-target models, then estimate the bias.

        Args:
            train_X: Input DataFrame accepted by the featurizer.
            train_Y: Target DataFrame with one column per target.
        """
        X_np = self.featurizer.featurize(train_X)
        X_scaled = self.scaler.fit_transform(X_np)
        Y_np = train_Y.values
        # Take the target count from the data instead of hard-coding 3.
        n_targets = Y_np.shape[1]

        # Store training mean for reference
        self.train_mean = Y_np.mean(axis=0)

        # Train CatBoost (one per target)
        self.cat_models = []
        for t in range(n_targets):
            m = CatBoostRegressor(**self.cat_params)
            m.fit(X_scaled, Y_np[:, t])
            self.cat_models.append(m)

        # Train XGBoost (one per target)
        self.xgb_models = []
        for t in range(n_targets):
            m = xgb.XGBRegressor(**self.xgb_params)
            m.fit(X_scaled, Y_np[:, t])
            self.xgb_models.append(m)

        # Per-target bias, estimated on the training data itself as a proxy
        # for the validation bias.
        # NOTE(review): this is an in-sample estimate. The mean residual of
        # the fitted models on their own training rows is probably close to
        # zero, which would make the correction nearly a no-op. Confirm by
        # logging self.bias; an out-of-fold estimate may be what is wanted.
        ensemble_preds = self._ensemble_predict(X_scaled)
        self.bias = ensemble_preds.mean(axis=0) - Y_np.mean(axis=0)

    def predict(self, test_X):
        """Predict bias-corrected targets for ``test_X``.

        Returns:
            A ``torch`` tensor with one row per input row and one column per
            target. Values are clipped to [0, 1], and rows whose sum exceeds
            1 are rescaled to sum to 1.

        Raises:
            RuntimeError: If called before ``train_model``.
        """
        if self.cat_models is None or self.xgb_models is None or self.bias is None:
            raise RuntimeError("train_model() must be called before predict()")

        X_np = self.featurizer.featurize(test_X)
        X_scaled = self.scaler.transform(X_np)

        # Blend, then remove the stored per-target offset.
        corrected_preds = self._ensemble_predict(X_scaled) - self.bias

        # Clip to valid range [0, 1]
        corrected_preds = np.clip(corrected_preds, 0.0, 1.0)

        # Renormalize if sum > 1; rows summing to <= 1 are divided by 1 and
        # so stay unchanged.
        totals = corrected_preds.sum(axis=1, keepdims=True)
        divisor = np.maximum(totals, 1.0)
        corrected_preds = corrected_preds / divisor

        return torch.tensor(corrected_preds)

print('BiasCorrectedEnsemble defined')
print('Bias correction: preds - (preds.mean() - y_train.mean())')

In [None]:
# Run CV to compute local score
import tqdm

def _cv_fold_mses(split_generator, data, label, verbose):
    """Train and score a fresh BiasCorrectedEnsemble on every split.

    Returns the list of per-fold MSE values, in split order.
    """
    fold_mses = []
    for fold_idx, split in enumerate(split_generator):
        (train_X, train_Y), (test_X, test_Y) = split

        model = BiasCorrectedEnsemble(data=data)
        model.train_model(train_X, train_Y)

        predictions_np = model.predict(test_X).detach().cpu().numpy()

        # Mean over all rows and all target columns of the held-out fold.
        mse = np.mean((predictions_np - test_Y.values) ** 2)
        fold_mses.append(mse)
        if verbose:
            print(f"{label} Fold {fold_idx}: MSE = {mse:.6f}")
    return fold_mses


def compute_cv_score(verbose=True):
    """Compute CV score with bias correction.

    Runs leave-one-solvent-out CV on the single-solvent data and
    leave-one-ramp-out CV on the full data. Each score is the unweighted
    mean of the per-fold MSEs.

    Args:
        verbose: When true, print per-fold and summary scores.

    Returns:
        A tuple ``(single_cv, full_cv, combined_cv)`` where ``combined_cv``
        is the plain average of the other two.
    """
    # Single solvent CV (Leave-One-Out)
    X_single, Y_single = load_data("single_solvent")
    single_mse_list = _cv_fold_mses(
        generate_leave_one_out_splits(X_single, Y_single),
        'single', "Single", verbose,
    )
    single_cv = np.mean(single_mse_list)
    if verbose:
        print(f"\nSingle Solvent CV MSE (24-fold LOO): {single_cv:.6f}")

    # Full data CV (Leave-One-Ramp-Out)
    X_full, Y_full = load_data("full")
    full_mse_list = _cv_fold_mses(
        generate_leave_one_ramp_out_splits(X_full, Y_full),
        'full', "Full", verbose,
    )
    full_cv = np.mean(full_mse_list)
    if verbose:
        print(f"\nFull Data CV MSE (13-fold LORO): {full_cv:.6f}")

    combined_cv = (single_cv + full_cv) / 2
    if verbose:
        print(f"\n=== Combined CV MSE: {combined_cv:.6f} ===")

    return single_cv, full_cv, combined_cv

print("Running CV with bias correction...")
single_cv, full_cv, combined_cv = compute_cv_score()

In [None]:
# Save results
import json
import os

# Combined CV of exp_030, the comparison point for this experiment.
BASELINE_CV = 0.0081
METRICS_PATH = '/home/code/experiments/104_bias_correction/metrics.json'

# Relative improvement over the baseline, computed once and reused for both
# the JSON file and the console summary.
improvement = f"{(BASELINE_CV - combined_cv) / BASELINE_CV * 100:.2f}%"

results = {
    'cv_score': float(combined_cv),
    'single_cv': float(single_cv),
    'full_cv': float(full_cv),
    'model': 'BiasCorrectedEnsemble (CatBoost + XGBoost with bias correction)',
    'validation': 'Leave-One-Out (24 folds single, 13 folds full)',
    'baseline_cv': BASELINE_CV,
    'improvement': improvement,
}

# Create the experiment directory if needed so the write cannot fail with
# FileNotFoundError after the CV run has already completed.
os.makedirs(os.path.dirname(METRICS_PATH), exist_ok=True)
with open(METRICS_PATH, 'w') as f:
    json.dump(results, f, indent=2)

print("Results saved")
print(f"Combined CV: {combined_cv:.6f}")
print(f"Baseline CV (exp_030): {BASELINE_CV}")
print(f"Improvement: {improvement}")

## Generate Submission

The following cells follow the official template structure.

**CRITICAL**: The model class in submission cells MUST match the CV computation class (`BiasCorrectedEnsemble`).

In [None]:
# Generate submission
print(f"CV: {combined_cv:.6f}")
print("Generating submission...")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = BiasCorrectedEnsemble(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = BiasCorrectedEnsemble(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

# Also save to standard location
import shutil
import os
os.makedirs('/home/submission', exist_ok=True)
shutil.copy("submission.csv", "/home/submission/submission.csv")
print("Submission saved!")
print(f"Shape: {submission.shape}")
print(submission.head())

# Verify predictions are valid
print(f"\nMin target_1: {submission['target_1'].min():.6f}")
print(f"Min target_2: {submission['target_2'].min():.6f}")
print(f"Min target_3: {submission['target_3'].min():.6f}")
print(f"Max sum: {(submission['target_1'] + submission['target_2'] + submission['target_3']).max():.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################