# Experiment 008: Gaussian Process Model

**Goal**: Try a fundamentally different model family that may extrapolate better to unseen solvents.

Based on Loop 7 analysis:
- The GP (0.0988) scored comparably to ETR (0.1053) on initial tests
- GPs are designed for small datasets and may extrapolate better
- Thanks to better extrapolation, a GP may show a smaller gap between cross-validation (CV) and leaderboard (LB) scores

**Implementation**:
- Matern kernel (nu=2.5) with WhiteKernel
- Arrhenius kinetics features + Spange descriptors
- Per-target GP models (one for each target)
- alpha=0.1 added to the kernel diagonal as extra regularization (on top of the noise level learned by the WhiteKernel)
- NO TTA

In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel, RBF
from abc import ABC
import tqdm
import warnings
# Root directory that holds the yield CSVs and the descriptor lookup tables.
DATA_PATH = "/home/data"

# Silence every warning for the rest of the session. Note that this also hides
# any warnings emitted while the Gaussian processes are being fitted.
warnings.filterwarnings(action="ignore")

# Newly created floating-point tensors default to 64-bit precision.
torch.set_default_dtype(torch.float64)

print("Setup complete")

Setup complete


In [2]:
# --- UTILITY FUNCTIONS ---
# Numeric process-condition columns that load_data() selects for both datasets.
INPUT_LABELS_NUMERIC = [
    "Residence Time",
    "Temperature",
]

# Yield columns used as regression targets, in the order load_data() returns them.
TARGET_LABELS = [
    "Product 2",
    "Product 3",
    "SM",
]

def load_data(name="full"):
    """Load one of the catechol yield datasets and split it into inputs and targets.

    Args:
        name: ``"full"`` reads ``catechol_full_data_yields.csv`` (two named
            solvents plus a ``SolventB%`` column); ``"single_solvent"`` reads
            ``catechol_single_solvent_yields.csv`` (one named solvent).

    Returns:
        A tuple ``(X, Y)`` of DataFrames. ``X`` holds the residence time,
        temperature and solvent columns of the chosen dataset; ``Y`` holds the
        ``TARGET_LABELS`` columns.

    Raises:
        ValueError: If ``name`` is not one of the two supported dataset names.
    """
    # Explicit check rather than `assert`: assertions are removed under
    # `python -O`, which would let an unknown name silently fall through to
    # the single-solvent branch below.
    if name not in ("full", "single_solvent"):
        raise ValueError(
            f"Unknown dataset name {name!r}; expected 'full' or 'single_solvent'"
        )
    if name == "full":
        df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        X = df[["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]]
    else:
        df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        X = df[["Residence Time", "Temperature", "SOLVENT NAME"]]
    Y = df[TARGET_LABELS]
    return X, Y

def load_features(name="spange_descriptors"):
    """Read the lookup table ``<DATA_PATH>/<name>_lookup.csv``.

    The first CSV column becomes the index of the returned DataFrame.
    """
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    lookup_table = pd.read_csv(lookup_path, index_col=0)
    return lookup_table

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out splits, one per distinct ``SOLVENT NAME``.

    Solvents are visited in sorted order. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    exactly the rows of the held-out solvent and the train part all others.
    """
    solvent_names = sorted(X["SOLVENT NAME"].unique())
    for held_out in solvent_names:
        is_held_out = X["SOLVENT NAME"] == held_out
        training = (X[~is_held_out], Y[~is_held_out])
        testing = (X[is_held_out], Y[is_held_out])
        yield training, testing

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield leave-one-ramp-out splits for the mixed-solvent data.

    A "ramp" is one distinct (``SOLVENT A NAME``, ``SOLVENT B NAME``) pair.
    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part holds every
    row of the held-out pair and the train part holds the remaining rows.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    distinct_pairs = X[pair_columns].drop_duplicates()
    for solvent_a, solvent_b in distinct_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        outside_ramp = ~in_ramp
        yield (X[outside_ramp], Y[outside_ramp]), (X[in_ramp], Y[in_ramp])

# Load feature sets
# SPANGE_DF is the module-level descriptor table read from
# `spange_descriptors_lookup.csv`; GPModel looks solvents up in it by index label.
SPANGE_DF = load_features('spange_descriptors')
# Report the table shape as a sanity check (the recorded run printed (26, 13)).
print(f"Spange: {SPANGE_DF.shape}")

Spange: (26, 13)


In [3]:
# --- BASE CLASSES ---
class SmilesFeaturizer(ABC):
    """Placeholder interface for featurizers.

    Both methods raise ``NotImplementedError``; a concrete subclass has to
    override ``__init__`` as well as ``featurize``.
    """

    def __init__(self):
        # Instantiating the base class (or a subclass without its own
        # __init__) is rejected here.
        raise NotImplementedError

    def featurize(self, X):
        """Convert ``X`` into features; must be supplied by the subclass."""
        raise NotImplementedError

class BaseModel(ABC):
    """Minimal interface shared by the models in this notebook.

    Subclasses override ``train_model`` and ``predict``; the base versions
    only raise ``NotImplementedError``.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on ``X_train`` / ``y_train``; must be overridden."""
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for ``X``; must be overridden.

        ``X`` is accepted here so the base signature matches how models are
        actually invoked (``model.predict(test_X)``). It defaults to ``None``
        so an argument-less call keeps raising ``NotImplementedError`` as
        before instead of a ``TypeError``.
        """
        raise NotImplementedError

In [4]:
# --- GAUSSIAN PROCESS MODEL ---
class GPModel(BaseModel):
    """Gaussian Process model for reaction yield prediction.

    One independent ``GaussianProcessRegressor`` is fitted per target
    (Product 2, Product 3, SM) on standardized features made of Arrhenius-style
    process terms plus Spange solvent descriptors.

    Kernel: ``ConstantKernel * Matern(nu) + WhiteKernel``. The constant factor
    is the amplitude, the Matern term (nu=2.5 by default, i.e. twice
    differentiable) models the smooth response, and the WhiteKernel learns a
    noise level.

    Note: GPs can provide predictive uncertainty, but ``predict`` here returns
    only the posterior mean.

    Args:
        data: ``'full'`` selects the mixed-solvent feature layout; any other
            value (default ``'single'``) selects the single-solvent layout.
        alpha: Value passed to ``GaussianProcessRegressor(alpha=...)``; acts as
            extra regularization in addition to the learned WhiteKernel noise.
        nu: Smoothness parameter of the Matern kernel.
        n_restarts_optimizer: Number of optimizer restarts used when fitting
            the kernel hyperparameters.
        random_state: Seed forwarded to every regressor for reproducibility.
    """

    def __init__(self, data='single', *, alpha=0.1, nu=2.5,
                 n_restarts_optimizer=2, random_state=42):
        super().__init__()
        self.data_type = data
        self.mixed = (data == 'full')
        self.targets = ['Product 2', 'Product 3', 'SM']
        self.spange = SPANGE_DF
        self.scaler = StandardScaler()
        self.models = {}
        # Hyperparameters, kept on the instance so train_model can be re-run.
        self.alpha = alpha
        self.nu = nu
        self.n_restarts_optimizer = n_restarts_optimizer
        self.random_state = random_state

    def _make_kernel(self):
        """Build a fresh, unfitted kernel: amplitude * Matern + white noise."""
        amplitude = ConstantKernel(1.0, (1e-3, 1e3))
        smooth = Matern(length_scale=1.0, length_scale_bounds=(1e-2, 1e2), nu=self.nu)
        noise = WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-5, 1e1))
        return amplitude * smooth + noise

    def _build_features(self, X):
        """Build features: Arrhenius kinetics + Spange descriptors.

        Returns a 2-D array with, per row: residence time, temperature,
        1000 / T[K], log(residence time), their product, then (mixed data only)
        the solvent-B share, and finally the Spange descriptor columns.
        """
        rt = X['Residence Time'].values.astype(np.float64).reshape(-1, 1)
        temp = X['Temperature'].values.astype(np.float64).reshape(-1, 1)

        # Arrhenius kinetic features
        # NOTE(review): the +273.15 offset assumes Temperature is in Celsius - confirm.
        temp_k = temp + 273.15
        inv_temp = 1000.0 / temp_k
        # Small offset keeps the logarithm finite for a zero residence time.
        log_time = np.log(rt + 1e-6)
        interaction = inv_temp * log_time

        process_feats = np.hstack([rt, temp, inv_temp, log_time, interaction])

        if self.mixed:
            # Cast like the other numeric inputs so the blend below is float math.
            # NOTE(review): the blend assumes SolventB% is a fraction in [0, 1];
            # a 0-100 percentage would make (1 - pct) negative - confirm.
            pct = X['SolventB%'].values.astype(np.float64).reshape(-1, 1)
            A_spange = self.spange.loc[X['SOLVENT A NAME']].values
            B_spange = self.spange.loc[X['SOLVENT B NAME']].values
            # Linear interpolation of the two solvents' descriptors.
            spange_feats = A_spange * (1 - pct) + B_spange * pct
            return np.hstack([process_feats, pct, spange_feats])

        spange_feats = self.spange.loc[X['SOLVENT NAME']].values
        return np.hstack([process_feats, spange_feats])

    def train_model(self, X_train, y_train):
        """Fit the feature scaler and one GP per target.

        ``y_train`` columns are consumed by position, so they must be ordered
        like ``self.targets``.
        """
        X_feat = self._build_features(X_train)
        X_scaled = self.scaler.fit_transform(X_feat)
        y = y_train.values

        # Start from a clean slate so a failed refit cannot leave models from
        # an earlier fit mixed with new ones.
        self.models = {}
        for i, target in enumerate(self.targets):
            gp = GaussianProcessRegressor(
                kernel=self._make_kernel(),
                alpha=self.alpha,  # Regularization
                normalize_y=True,  # Better convergence
                n_restarts_optimizer=self.n_restarts_optimizer,  # Find better hyperparameters
                random_state=self.random_state,
            )
            gp.fit(X_scaled, y[:, i])
            self.models[target] = gp

    def predict(self, X):
        """Predict all targets for ``X``.

        Returns:
            A ``torch.double`` tensor with one column per entry of
            ``self.targets``, clipped to the interval [0, 1].

        Raises:
            RuntimeError: If called before ``train_model`` has fitted a model
                for every target.
        """
        missing = [target for target in self.targets if target not in self.models]
        if missing:
            raise RuntimeError(
                f"GPModel.predict called before train_model; no fitted model for {missing}"
            )

        X_feat = self._build_features(X)
        X_scaled = self.scaler.transform(X_feat)

        preds = np.hstack([
            self.models[target].predict(X_scaled).reshape(-1, 1)
            for target in self.targets
        ])
        # Clamp the predictions into the [0, 1] range before returning them.
        preds = np.clip(preds, 0, 1)
        return torch.tensor(preds, dtype=torch.double)

In [5]:
# --- QUICK VALIDATION TEST ---
# Smoke test of GPModel on a handful of folds before the full, slower
# cross-validation cells are run. Prints a per-fold and a mean MAE.
print("Testing GPModel...")
# Despite the names, X_test / Y_test hold the complete single-solvent dataset.
X_test, Y_test = load_data("single_solvent")

# Quick leave-one-out test on first 5 solvents
# (generate_leave_one_out_splits visits solvents in sorted order)
errors = []
split_gen = generate_leave_one_out_splits(X_test, Y_test)
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(split_gen):
    if i >= 5: break  # stop after folds 0-4
    # A fresh model per fold, so nothing fitted carries over between folds.
    model = GPModel(data='single')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # Mean absolute error over all held-out rows and all three targets.
    mae = np.mean(np.abs(preds - test_Y.values))
    errors.append(mae)
    # Every row of the test fold shares one solvent, so the first row names it.
    solvent = test_X['SOLVENT NAME'].iloc[0]
    print(f"Fold {i} ({solvent}): MAE = {mae:.4f}")

# Unweighted mean of the per-fold MAEs.
print(f"\nQuick test MAE (single): {np.mean(errors):.4f}")

# Also test on full data
# Same procedure on the mixed-solvent dataset, limited to the first 3
# solvent-pair folds produced by generate_leave_one_ramp_out_splits.
print("\nTesting on full data...")
X_full, Y_full = load_data("full")
errors_full = []
split_gen = generate_leave_one_ramp_out_splits(X_full, Y_full)
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(split_gen):
    if i >= 3: break  # stop after folds 0-2
    model = GPModel(data='full')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    mae = np.mean(np.abs(preds - test_Y.values))
    errors_full.append(mae)
    print(f"Fold {i}: MAE = {mae:.4f}")

print(f"\nQuick test MAE (full): {np.mean(errors_full):.4f}")

Testing GPModel...


Fold 0 (1,1,1,3,3,3-Hexafluoropropan-2-ol): MAE = 0.1738


Fold 1 (2,2,2-Trifluoroethanol): MAE = 0.0681


Fold 2 (2-Methyltetrahydrofuran [2-MeTHF]): MAE = 0.0505


Fold 3 (Acetonitrile): MAE = 0.0938


Fold 4 (Acetonitrile.Acetic Acid): MAE = 0.1170

Quick test MAE (single): 0.1006

Testing on full data...


Fold 0: MAE = 0.0606


Fold 1: MAE = 0.1249


Fold 2: MAE = 0.1011

Quick test MAE (full): 0.0955


## Template-Compliant Cross-Validation

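The following 3 cells are the FINAL 3 cells of the notebook. They are kept EXACTLY as in the template, except for the single model-definition line that the template allows to be changed.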

In [6]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = GPModel(data='single') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:19, 19.96s/it]

2it [00:40, 20.11s/it]

3it [00:57, 19.00s/it]

4it [01:14, 18.21s/it]

5it [01:36, 19.43s/it]

6it [01:54, 19.11s/it]

7it [02:11, 18.20s/it]

8it [02:26, 17.37s/it]

9it [02:43, 17.19s/it]

10it [03:00, 17.16s/it]

11it [03:17, 17.11s/it]

12it [03:34, 17.08s/it]

13it [03:50, 16.66s/it]

14it [04:07, 16.92s/it]

15it [04:25, 17.12s/it]

16it [04:41, 16.76s/it]

17it [04:59, 17.10s/it]

18it [05:17, 17.49s/it]

19it [05:32, 16.80s/it]

20it [05:50, 16.92s/it]

21it [06:07, 17.13s/it]

22it [06:24, 17.03s/it]

23it [06:41, 16.90s/it]

24it [06:58, 17.14s/it]

24it [06:58, 17.45s/it]




In [7]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = GPModel(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:44, 44.30s/it]

2it [01:29, 45.00s/it]

3it [02:14, 44.86s/it]

4it [03:01, 45.90s/it]

5it [03:49, 46.41s/it]

6it [04:33, 45.55s/it]

7it [05:17, 45.21s/it]

8it [06:03, 45.27s/it]

9it [06:47, 44.93s/it]

10it [07:38, 46.78s/it]

11it [08:30, 48.56s/it]

12it [09:21, 49.30s/it]

13it [10:07, 48.21s/it]

13it [10:07, 46.73s/it]




In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################