# Gaussian Process Model for Catechol Prediction

GPs excel at extrapolation to unseen data points - the core challenge here.
With ~600-1200 samples, GPs are in their sweet spot.

This notebook implements:
1. Multi-output GP using GPyTorch
2. Arrhenius kinetics features
3. Spange + ACS PCA descriptors
4. Symmetry TTA for mixed solvents

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from abc import ABC, abstractmethod
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set seeds for reproducibility
import random
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
    torch.backends.cudnn.deterministic = True

torch.set_default_dtype(torch.double)

# Check GPU
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA available: True
GPU: NVIDIA H100 80GB HBM3


In [2]:
# GPyTorch imports
import gpytorch
from gpytorch.models import ExactGP
from gpytorch.means import ConstantMean
from gpytorch.kernels import ScaleKernel, RBFKernel, MaternKernel
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.mlls import ExactMarginalLogLikelihood
from gpytorch.distributions import MultivariateNormal

print(f"GPyTorch version: {gpytorch.__version__}")

GPyTorch version: 1.14.3


In [3]:
# Data loading utilities
DATA_PATH = '/home/data'

INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    assert name in ["full", "single_solvent"]
    if name == "full":
        df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        X = df[INPUT_LABELS_FULL_SOLVENT]
    else:
        df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        X = df[INPUT_LABELS_SINGLE_SOLVENT]
    Y = df[TARGET_LABELS]
    return X, Y

def load_features(name="spange_descriptors"):
    return pd.read_csv(f'{DATA_PATH}/{name}_lookup.csv', index_col=0)

def generate_leave_one_out_splits(X, Y):
    for solvent in sorted(X["SOLVENT NAME"].unique()):
        mask = X["SOLVENT NAME"] != solvent
        yield (X[mask], Y[mask]), (X[~mask], Y[~mask])

def generate_leave_one_ramp_out_splits(X, Y):
    ramps = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for _, row in ramps.iterrows():
        mask = ~((X["SOLVENT A NAME"] == row["SOLVENT A NAME"]) & (X["SOLVENT B NAME"] == row["SOLVENT B NAME"]))
        yield (X[mask], Y[mask]), (X[~mask], Y[~mask])

# Load feature lookup tables
SPANGE_DF = load_features('spange_descriptors')
ACS_PCA_DF = load_features('acs_pca_descriptors')
print(f"Spange: {SPANGE_DF.shape}, ACS PCA: {ACS_PCA_DF.shape}")

Spange: (26, 13), ACS PCA: (24, 5)


In [4]:
# Base classes
class BaseModel(ABC):
    def __init__(self):
        pass
    def train_model(self, X_train, y_train):
        raise NotImplementedError
    def predict(self):
        raise NotImplementedError

In [5]:
# Single-output GP model
class SingleOutputGP(ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(SingleOutputGP, self).__init__(train_x, train_y, likelihood)
        self.mean_module = ConstantMean()
        self.covar_module = ScaleKernel(MaternKernel(nu=2.5, ard_num_dims=train_x.shape[1]))
    
    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return MultivariateNormal(mean_x, covar_x)

In [6]:
# GP-based model with per-target GPs
from sklearn.preprocessing import StandardScaler

class GPModel(BaseModel):
    """Gaussian Process model with Arrhenius features and symmetry TTA."""
    
    def __init__(self, data='single', n_epochs=100):
        super().__init__()
        self.data_type = data
        self.spange_df = SPANGE_DF
        self.acs_pca_df = ACS_PCA_DF
        self.scaler = StandardScaler()
        self.n_epochs = n_epochs
        self.models = {}  # Per-target GP models
        self.likelihoods = {}  # Per-target likelihoods
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
    def _create_features(self, X, flip=False):
        """Create feature matrix with Arrhenius kinetics and solvent descriptors."""
        time_m = X["Residence Time"].values.reshape(-1, 1)
        temp_c = X["Temperature"].values.reshape(-1, 1)
        
        # Arrhenius kinetics features
        temp_k = temp_c + 273.15
        inv_temp = 1000.0 / temp_k
        log_time = np.log(time_m + 1e-6)
        interaction = inv_temp * log_time
        
        # Solvent features
        if self.data_type == 'full':
            pct = X["SolventB%"].values.reshape(-1, 1)
            if flip:
                spange_A = self.spange_df.loc[X["SOLVENT B NAME"]].values
                spange_B = self.spange_df.loc[X["SOLVENT A NAME"]].values
                acs_A = self.acs_pca_df.loc[X["SOLVENT B NAME"]].values
                acs_B = self.acs_pca_df.loc[X["SOLVENT A NAME"]].values
                pct_use = 1 - pct
            else:
                spange_A = self.spange_df.loc[X["SOLVENT A NAME"]].values
                spange_B = self.spange_df.loc[X["SOLVENT B NAME"]].values
                acs_A = self.acs_pca_df.loc[X["SOLVENT A NAME"]].values
                acs_B = self.acs_pca_df.loc[X["SOLVENT B NAME"]].values
                pct_use = pct
            spange_feat = spange_A * (1 - pct_use) + spange_B * pct_use
            acs_feat = acs_A * (1 - pct_use) + acs_B * pct_use
        else:
            spange_feat = self.spange_df.loc[X["SOLVENT NAME"]].values
            acs_feat = self.acs_pca_df.loc[X["SOLVENT NAME"]].values
        
        features = np.hstack([
            time_m, temp_c, inv_temp, log_time, interaction,
            spange_feat, acs_feat
        ])
        return features
    
    def train_model(self, X_train, y_train):
        """Train per-target GP models."""
        # Create features
        X_feat = self._create_features(X_train, flip=False)
        
        # Data augmentation for mixed solvents
        if self.data_type == 'full':
            X_flip = self._create_features(X_train, flip=True)
            X_feat = np.vstack([X_feat, X_flip])
            y_train_aug = pd.concat([y_train, y_train], ignore_index=True)
        else:
            y_train_aug = y_train
        
        # Scale features
        X_scaled = self.scaler.fit_transform(X_feat)
        X_tensor = torch.tensor(X_scaled, dtype=torch.double).to(self.device)
        
        # Train per-target GP models
        for target in TARGET_LABELS:
            y_tensor = torch.tensor(y_train_aug[target].values, dtype=torch.double).to(self.device)
            
            # Initialize likelihood and model
            likelihood = GaussianLikelihood().to(self.device)
            model = SingleOutputGP(X_tensor, y_tensor, likelihood).to(self.device)
            
            # Training mode
            model.train()
            likelihood.train()
            
            # Optimizer
            optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
            mll = ExactMarginalLogLikelihood(likelihood, model)
            
            # Training loop
            for i in range(self.n_epochs):
                optimizer.zero_grad()
                output = model(X_tensor)
                loss = -mll(output, y_tensor)
                loss.backward()
                optimizer.step()
            
            # Store trained model
            self.models[target] = model
            self.likelihoods[target] = likelihood
    
    def predict(self, X_test):
        """Predict with symmetry TTA for mixed solvents."""
        # Standard prediction
        X_feat = self._create_features(X_test, flip=False)
        X_scaled = self.scaler.transform(X_feat)
        X_tensor = torch.tensor(X_scaled, dtype=torch.double).to(self.device)
        
        predictions = {}
        for target in TARGET_LABELS:
            self.models[target].eval()
            self.likelihoods[target].eval()
            with torch.no_grad(), gpytorch.settings.fast_pred_var():
                pred = self.likelihoods[target](self.models[target](X_tensor))
                predictions[target] = pred.mean.cpu().numpy()
        
        if self.data_type == 'full':
            # TTA: Also predict with flipped inputs
            X_flip = self._create_features(X_test, flip=True)
            X_flip_scaled = self.scaler.transform(X_flip)
            X_flip_tensor = torch.tensor(X_flip_scaled, dtype=torch.double).to(self.device)
            
            for target in TARGET_LABELS:
                with torch.no_grad(), gpytorch.settings.fast_pred_var():
                    pred_flip = self.likelihoods[target](self.models[target](X_flip_tensor))
                    predictions[target] = (predictions[target] + pred_flip.mean.cpu().numpy()) / 2
        
        # Clip to [0, 1]
        for target in TARGET_LABELS:
            predictions[target] = np.clip(predictions[target], 0, 1)
        
        # Stack predictions: [Product 2, Product 3, SM]
        result = np.column_stack([predictions['Product 2'], predictions['Product 3'], predictions['SM']])
        return torch.tensor(result)

print("GPModel defined")

GPModel defined


In [7]:
# Quick test
print("Testing GP model...")
X, Y = load_data("single_solvent")
print(f"Single solvent: {X.shape}, {Y.shape}")

# Test on first fold
split_gen = generate_leave_one_out_splits(X, Y)
(train_X, train_Y), (test_X, test_Y) = next(split_gen)

model = GPModel(data='single', n_epochs=50)
model.train_model(train_X, train_Y)
preds = model.predict(test_X)
print(f"Predictions shape: {preds.shape}")
print(f"Sample predictions: {preds[:3]}")

Testing GP model...
Single solvent: (656, 3), (656, 3)


Predictions shape: torch.Size([37, 3])
Sample predictions: tensor([[0.0701, 0.0557, 0.7990],
        [0.0760, 0.0589, 0.7752],
        [0.0928, 0.0733, 0.7086]])


In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []
all_actuals = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=24):
    (train_X, train_Y), (test_X, test_Y) = split

    model = GPModel(data='single', n_epochs=100)  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()
    all_actuals.append(test_Y.values)

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

# Calculate CV score
all_actuals_np = np.vstack(all_actuals)
all_preds_np = np.array([[p['target_1'], p['target_2'], p['target_3']] for p in all_predictions])
single_mse = np.mean((all_actuals_np - all_preds_np) ** 2)
print(f"\nSingle Solvent CV MSE: {single_mse:.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [00:01<00:24,  1.05s/it]

  8%|▊         | 2/24 [00:02<00:22,  1.05s/it]

 12%|█▎        | 3/24 [00:03<00:22,  1.05s/it]

 17%|█▋        | 4/24 [00:04<00:20,  1.05s/it]

 21%|██        | 5/24 [00:05<00:19,  1.05s/it]

 25%|██▌       | 6/24 [00:06<00:18,  1.05s/it]

 29%|██▉       | 7/24 [00:07<00:17,  1.06s/it]

 33%|███▎      | 8/24 [00:08<00:16,  1.05s/it]

 38%|███▊      | 9/24 [00:09<00:15,  1.05s/it]

 42%|████▏     | 10/24 [00:10<00:14,  1.05s/it]

 46%|████▌     | 11/24 [00:11<00:13,  1.05s/it]

 50%|█████     | 12/24 [00:12<00:12,  1.05s/it]

 54%|█████▍    | 13/24 [00:13<00:11,  1.05s/it]

 58%|█████▊    | 14/24 [00:14<00:10,  1.05s/it]

 62%|██████▎   | 15/24 [00:15<00:09,  1.05s/it]

 67%|██████▋   | 16/24 [00:16<00:08,  1.05s/it]

 71%|███████   | 17/24 [00:17<00:07,  1.05s/it]

 75%|███████▌  | 18/24 [00:18<00:06,  1.05s/it]

 79%|███████▉  | 19/24 [00:19<00:05,  1.05s/it]

 83%|████████▎ | 20/24 [00:21<00:04,  1.05s/it]

 88%|████████▊ | 21/24 [00:22<00:03,  1.05s/it]

 92%|█████████▏| 22/24 [00:23<00:02,  1.05s/it]

 96%|█████████▌| 23/24 [00:24<00:01,  1.05s/it]

100%|██████████| 24/24 [00:25<00:00,  1.06s/it]

100%|██████████| 24/24 [00:25<00:00,  1.05s/it]


Single Solvent CV MSE: 0.014880





In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions_full = []
all_actuals_full = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=13):
    (train_X, train_Y), (test_X, test_Y) = split

    model = GPModel(data='full', n_epochs=100)  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()
    all_actuals_full.append(test_Y.values)

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions_full.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions_full)

# Calculate CV score
all_actuals_full_np = np.vstack(all_actuals_full)
all_preds_full_np = np.array([[p['target_1'], p['target_2'], p['target_3']] for p in all_predictions_full])
full_mse = np.mean((all_actuals_full_np - all_preds_full_np) ** 2)
print(f"\nFull Data CV MSE: {full_mse:.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

  0%|          | 0/13 [00:00<?, ?it/s]

  8%|▊         | 1/13 [00:06<01:23,  6.92s/it]

 15%|█▌        | 2/13 [00:13<01:13,  6.71s/it]

 23%|██▎       | 3/13 [00:20<01:06,  6.67s/it]

 31%|███       | 4/13 [00:26<01:00,  6.69s/it]

 38%|███▊      | 5/13 [00:33<00:53,  6.66s/it]

 46%|████▌     | 6/13 [00:40<00:46,  6.65s/it]

 54%|█████▍    | 7/13 [00:46<00:40,  6.67s/it]

 62%|██████▏   | 8/13 [00:53<00:33,  6.68s/it]

 69%|██████▉   | 9/13 [01:00<00:26,  6.69s/it]

 77%|███████▋  | 10/13 [01:06<00:20,  6.70s/it]

 85%|████████▍ | 11/13 [01:13<00:13,  6.71s/it]

 92%|█████████▏| 12/13 [01:20<00:06,  6.71s/it]

100%|██████████| 13/13 [01:27<00:00,  6.72s/it]

100%|██████████| 13/13 [01:27<00:00,  6.70s/it]


Full Data CV MSE: 0.019547





In [10]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

# Calculate overall CV score
total_samples = len(all_actuals_np) + len(all_actuals_full_np)
overall_mse = (single_mse * len(all_actuals_np) + full_mse * len(all_actuals_full_np)) / total_samples

print(f"\n=== FINAL RESULTS ===")
print(f"Single Solvent MSE: {single_mse:.6f}")
print(f"Full Data MSE: {full_mse:.6f}")
print(f"Overall CV MSE: {overall_mse:.6f}")
print(f"\nSubmission saved to /home/submission/submission.csv")
print(f"Submission shape: {submission.shape}")

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################


=== FINAL RESULTS ===
Single Solvent MSE: 0.014880
Full Data MSE: 0.019547
Overall CV MSE: 0.017921

Submission saved to /home/submission/submission.csv
Submission shape: (1883, 7)
