# Gaussian Process Regression

**Problem**: The CV-LB gap is ~10x (LB = 4.22*CV + 0.0533). The intercept (0.0533) is 3x higher than target (0.01727).

**Hypothesis**: GPs have fundamentally different inductive biases than neural networks and may generalize better. They are explicitly mentioned in the competition description.

**Approach**:
- Use GaussianProcessRegressor with Matern kernel + WhiteKernel
- Use a subset of features (Spange + Arrhenius = 18 features) to avoid scaling issues
- Train separate GPs for each target (multi-output)

**Baseline**: exp_026 CV 0.008465, LB 0.0887

In [1]:
# Standard imports
import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, Matern, ConstantKernel
from sklearn.preprocessing import StandardScaler
import tqdm
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

# Data loading functions
DATA_PATH = '/home/data'

INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    if name == "full":
        df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        X = df[INPUT_LABELS_FULL_SOLVENT]
    else:
        df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        X = df[INPUT_LABELS_SINGLE_SOLVENT]
    Y = df[["Product 2", "Product 3", "SM"]]
    return X, Y

def generate_leave_one_out_splits(X, Y):
    for solvent in sorted(X["SOLVENT NAME"].unique()):
        mask = X["SOLVENT NAME"] != solvent
        yield (X[mask], Y[mask]), (X[~mask], Y[~mask])

def generate_leave_one_ramp_out_splits(X, Y):
    ramps = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for _, row in ramps.iterrows():
        mask = ~((X["SOLVENT A NAME"] == row["SOLVENT A NAME"]) & (X["SOLVENT B NAME"] == row["SOLVENT B NAME"]))
        yield (X[mask], Y[mask]), (X[~mask], Y[~mask])

print('Data loading functions defined')

Data loading functions defined


In [2]:
# Load feature lookups - ONLY Spange (NO DRFP, NO ACS PCA to keep dimensionality low for GP)
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)

print(f'Spange: {SPANGE_DF.shape}')
print(f'Total features: 5 (kinetic) + {SPANGE_DF.shape[1]} (Spange) = {5 + SPANGE_DF.shape[1]}')
print('\nUsing minimal feature set for GP to avoid curse of dimensionality')

Spange: (26, 13)
Total features: 5 (kinetic) + 13 (Spange) = 18

Using minimal feature set for GP to avoid curse of dimensionality


In [3]:
# Simple Featurizer - Spange + Arrhenius kinetics ONLY (18 features)
class GPFeaturizer:
    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.feats_dim = 2 + 3 + self.spange_df.shape[1]  # 18 features

    def featurize(self, X, flip=False):
        X_vals = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        temp_c = X_vals[:, 1:2]
        time_m = X_vals[:, 0:1]
        temp_k = temp_c + 273.15
        inv_temp = 1000.0 / temp_k
        log_time = np.log(time_m + 1e-6)
        interaction = inv_temp * log_time
        X_kinetic = np.hstack([X_vals, inv_temp, log_time, interaction])
        
        if self.mixed:
            A_spange = self.spange_df.loc[X["SOLVENT A NAME"]].values
            B_spange = self.spange_df.loc[X["SOLVENT B NAME"]].values
            pct = X["SolventB%"].values.reshape(-1, 1)
            if flip:
                X_spange = B_spange * (1 - (1-pct)) + A_spange * (1-pct)
            else:
                X_spange = A_spange * (1 - pct) + B_spange * pct
        else:
            X_spange = self.spange_df.loc[X["SOLVENT NAME"]].values
        
        return np.hstack([X_kinetic, X_spange])

print(f'GPFeaturizer defined with {GPFeaturizer().feats_dim} features')

GPFeaturizer defined with 18 features


In [4]:
# Gaussian Process Wrapper
class GPWrapper:
    def __init__(self, data='single'):
        self.data_type = data
        self.featurizer = GPFeaturizer(mixed=(data=='full'))
        self.models = []
        self.scalers_X = []
        self.scalers_y = []

    def train_model(self, X_train, y_train):
        X_std = self.featurizer.featurize(X_train, flip=False)
        y_vals = y_train.values
        
        if self.data_type == 'full':
            X_flip = self.featurizer.featurize(X_train, flip=True)
            X_all = np.vstack([X_std, X_flip])
            y_all = np.vstack([y_vals, y_vals])
        else:
            X_all, y_all = X_std, y_vals
            
        # Scale data for GP
        scaler_X = StandardScaler()
        X_scaled = scaler_X.fit_transform(X_all)
        
        # Train separate GP for each target
        self.models = []
        self.scalers_X = []
        
        # Kernel: Matern + WhiteKernel (for noise)
        # Matern is generally better than RBF for physical processes
        kernel = 1.0 * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
        
        for i in range(3):
            # Normalize y for each target
            y_target = y_all[:, i]
            
            # GP with restarts to avoid local optima
            gp = GaussianProcessRegressor(
                kernel=kernel,
                n_restarts_optimizer=5,
                normalize_y=True,  # Important!
                random_state=42
            )
            
            gp.fit(X_scaled, y_target)
            self.models.append(gp)
        
        self.scaler_X = scaler_X

    def predict(self, X_test):
        X_feat = self.featurizer.featurize(X_test, flip=False)
        
        if self.data_type == 'full':
            X_flip = self.featurizer.featurize(X_test, flip=True)
            X_feat_scaled = self.scaler_X.transform(X_feat)
            X_flip_scaled = self.scaler_X.transform(X_flip)
        else:
            X_feat_scaled = self.scaler_X.transform(X_feat)
        
        preds = []
        for i, model in enumerate(self.models):
            pred_mean = model.predict(X_feat_scaled)
            
            if self.data_type == 'full':
                pred_flip = model.predict(X_flip_scaled)
                pred_mean = (pred_mean + pred_flip) / 2
                
            preds.append(pred_mean)
        
        return np.column_stack(preds)

print('GPWrapper defined')

GPWrapper defined


In [5]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = GPWrapper(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:55, 55.31s/it]

2it [01:51, 56.01s/it]

3it [02:40, 52.58s/it]

4it [03:33, 52.71s/it]

5it [04:31, 54.57s/it]

6it [05:24, 54.08s/it]

7it [06:14, 52.78s/it]

8it [07:12, 54.50s/it]

9it [08:12, 56.07s/it]

10it [09:09, 56.57s/it]

11it [10:05, 56.19s/it]

12it [10:59, 55.76s/it]

13it [11:55, 55.74s/it]

14it [12:49, 55.27s/it]

15it [13:44, 55.25s/it]

16it [14:45, 56.89s/it]

17it [15:40, 56.42s/it]

18it [16:35, 55.78s/it]

19it [17:30, 55.57s/it]

20it [18:23, 54.87s/it]

21it [19:22, 55.98s/it]

22it [20:15, 55.36s/it]

23it [21:15, 56.63s/it]

24it [22:09, 55.84s/it]

24it [22:09, 55.40s/it]




In [6]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = GPWrapper(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [05:44, 344.44s/it]

2it [11:33, 347.25s/it]

3it [17:17, 345.72s/it]

4it [22:58, 343.69s/it]

5it [28:59, 350.04s/it]

6it [34:16, 338.88s/it]

7it [39:21, 327.77s/it]

8it [45:22, 338.47s/it]

9it [50:27, 327.86s/it]

10it [57:01, 348.34s/it]

11it [1:03:04, 352.86s/it]

12it [1:09:26, 361.50s/it]

13it [1:15:38, 364.78s/it]

13it [1:15:38, 349.10s/it]




In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################