# Experiment 065: Multi-Task Gaussian Process

**Goal:** Implement a multi-task Gaussian process (MTGP) to potentially change the relationship between the cross-validation (CV) score and the leaderboard (LB) score.

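**Key Insight:** The benchmark achieved an MSE of 0.0039 using a multi-task GP. An MTGP can "borrow statistical strength" from related tasks (solvents) to improve predictions on unseen solvents. Note that this notebook defines the tasks differently: each of the three targets is a task (see step 2 below), so the strength borrowed here is across targets rather than across solvents.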

**Approach:**
1. Use GPyTorch for efficient MTGP implementation
2. Treat each target (Product 2, Product 3, SM) as a task
3. Learn shared covariance across tasks
4. Ensemble with MLP+LGBM for robustness

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed both RNG sources the notebook draws from so a fresh-kernel run repeats.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# All tensors in the notebook are created as float64.
torch.set_default_dtype(torch.double)

# Prefer reproducible cuDNN kernels over auto-tuned ones.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# NOTE(review): `device` is only reported here; none of the visible model
# wrappers pass it to a tensor or module, so they appear to run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading functions
# Directory holding the competition CSV files read by the cells below.
DATA_PATH = '/home/data'

# Process-condition columns shared by both datasets.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Single-solvent data: process conditions plus one solvent name.
INPUT_LABELS_SINGLE_SOLVENT = [*INPUT_LABELS_NUMERIC, "SOLVENT NAME"]
# Mixture data: process conditions plus both solvent names and the "SolventB%" column.
INPUT_LABELS_FULL_SOLVENT = [*INPUT_LABELS_NUMERIC, "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Read one of the two yield CSVs and split it into inputs and targets.

    Args:
        name: "full" selects the mixture dataset; every other value
            (the notebook passes "single_solvent") selects the
            single-solvent dataset.

    Returns:
        (X, Y): X holds the input columns for the chosen dataset, Y holds
        the "Product 2", "Product 3" and "SM" columns of the same rows.
    """
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_cols = INPUT_LABELS_FULL_SOLVENT
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_cols = INPUT_LABELS_SINGLE_SOLVENT

    frame = pd.read_csv(csv_path)
    inputs = frame[input_cols]
    targets = frame[["Product 2", "Product 3", "SM"]]
    return inputs, targets

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out folds in alphabetical solvent order.

    Each item is ``((X_train, Y_train), (X_test, Y_test))`` where the test
    part contains exactly the rows whose "SOLVENT NAME" equals the held-out
    solvent and the train part contains all remaining rows.
    """
    solvent_col = X["SOLVENT NAME"]
    for held_out in sorted(solvent_col.unique()):
        is_test = solvent_col == held_out
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield folds that hold out one (solvent A, solvent B) pair at a time.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    the rows matching the held-out pair on both name columns.
    """
    solvent_a = X["SOLVENT A NAME"]
    solvent_b = X["SOLVENT B NAME"]
    ramps = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for name_a, name_b in ramps.itertuples(index=False, name=None):
        is_test = (solvent_a == name_a) & (solvent_b == name_b)
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Load feature lookups
# Descriptor tables; the first CSV column becomes the index used for lookups.
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFP_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

# Keep only DRFP columns whose variance is strictly positive, i.e. drop
# columns that are constant across all rows (this is not a high-variance cut).
# NOTE(review): the visible featurizer only reads SPANGE_DF; DRFP_FILTERED and
# ACS_PCA_DF are loaded and reported here but not used by the models below.
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = [col for col, variance in drfp_variance.items() if variance > 0]
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]

print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')

Spange: (26, 13), DRFP filtered: (24, 122), ACS PCA: (24, 5)


In [4]:
# Featurizer class
class SimpleFeaturizer:
    """Build a dense feature matrix from process conditions and Spange descriptors.

    Each row is ``[residence time, temperature, 1000/(T + 273.15),
    log(time + 1e-6), product of the previous two, Spange descriptors...]``.

    Args:
        mixed: when True, rows describe a two-solvent mixture and the Spange
            descriptors of solvents A and B are blended linearly by
            "SolventB%"; when False, the descriptors of "SOLVENT NAME" are
            used directly.
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        # 2 raw inputs + 3 derived kinetic terms + one column per Spange
        # descriptor (18 in total with the 13-column table loaded above).
        self.feats_dim = 2 + 3 + self.spange_df.shape[1]

    def featurize(self, X, flip=False):
        """Return the feature matrix for the rows of ``X`` as a float64 array.

        Args:
            X: DataFrame with the numeric input columns plus the solvent
                columns required by the configured mode.
            flip: only read in mixed mode; selects the "swapped" form of the
                blend. NOTE(review): ``1 - (1 - pct)`` is ``pct`` up to
                floating-point rounding, so both branches give the same
                blend and stacking flipped features duplicates the rows
                rather than augmenting them.
        """
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        residence_time = numeric[:, 0:1]
        temperature = numeric[:, 1:2]

        # presumably Celsius -> Kelvin; the 1000/T scaling keeps the term O(1)
        inv_temp = 1000.0 / (temperature + 273.15)
        # small offset guards log(0)
        log_time = np.log(residence_time + 1e-6)
        kinetic = np.hstack([numeric, inv_temp, log_time, inv_temp * log_time])

        if not self.mixed:
            solvent = self.spange_df.loc[X["SOLVENT NAME"]].values
            return np.hstack([kinetic, solvent])

        spange_a = self.spange_df.loc[X["SOLVENT A NAME"]].values
        spange_b = self.spange_df.loc[X["SOLVENT B NAME"]].values
        # assumes "SolventB%" is a 0-1 fraction, not 0-100 — TODO confirm
        frac_b = X["SolventB%"].values.reshape(-1, 1)
        if flip:
            solvent = spange_b * (1 - (1 - frac_b)) + spange_a * (1 - frac_b)
        else:
            solvent = spange_a * (1 - frac_b) + spange_b * frac_b
        return np.hstack([kinetic, solvent])

print(f'SimpleFeaturizer defined with {SimpleFeaturizer().feats_dim} features')

SimpleFeaturizer defined with 18 features


In [5]:
# Multi-Task GP using GPyTorch
import gpytorch
from gpytorch.models import ExactGP
from gpytorch.likelihoods import MultitaskGaussianLikelihood
from gpytorch.mlls import ExactMarginalLogLikelihood
from gpytorch.distributions import MultitaskMultivariateNormal

class MultitaskGPModel(ExactGP):
    """Exact multi-task GP with one constant mean per task and a rank-1 task covariance.

    Args:
        train_x: training inputs forwarded to ``ExactGP``.
        train_y: training targets forwarded to ``ExactGP``.
        likelihood: likelihood forwarded to ``ExactGP``.
        num_tasks: number of output tasks (the notebook uses one per target).
    """

    def __init__(self, train_x, train_y, likelihood, num_tasks=3):
        super().__init__(train_x, train_y, likelihood)

        per_task_mean = gpytorch.means.ConstantMean()
        self.mean_module = gpytorch.means.MultitaskMean(per_task_mean, num_tasks=num_tasks)

        # Scaled RBF kernel over the inputs; rank=1 keeps the inter-task
        # covariance factor as small as possible.
        data_kernel = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
        self.covar_module = gpytorch.kernels.MultitaskKernel(
            data_kernel, num_tasks=num_tasks, rank=1
        )

    def forward(self, x):
        """Return the multi-task prior at ``x`` built from the mean and kernel modules."""
        return MultitaskMultivariateNormal(self.mean_module(x), self.covar_module(x))

print('MultitaskGPModel defined')

MultitaskGPModel defined


In [6]:
# Multi-Task GP Wrapper
class MTGPWrapper:
    """Featurize, scale and fit a ``MultitaskGPModel``; predict clamped means.

    Args:
        data: 'full' enables mixture featurization and stacks the
            ``flip=True`` features under the originals; any other value uses
            single-solvent featurization.
        n_epochs: number of Adam steps on the negative marginal log likelihood.
    """

    def __init__(self, data='single', n_epochs=100):
        self.data_type = data
        self.n_epochs = n_epochs
        self.featurizer = SimpleFeaturizer(mixed=(data=='full'))
        # Populated by train_model().
        self.model = None
        self.likelihood = None
        self.scaler = None

    def train_model(self, X_train, y_train):
        """Fit the scaler, build the GP and optimise its hyperparameters.

        Args:
            X_train: DataFrame accepted by the featurizer.
            y_train: DataFrame with the three target columns.
        """
        X_all = self.featurizer.featurize(X_train, flip=False)
        y_all = y_train.values

        if self.data_type == 'full':
            # NOTE(review): the featurizer's flipped blend equals the
            # unflipped one, so this doubles the exact GP's training set with
            # duplicated rows — confirm whether that is intended.
            X_all = np.vstack([X_all, self.featurizer.featurize(X_train, flip=True)])
            y_all = np.vstack([y_all, y_all])

        self.scaler = StandardScaler()
        train_x = torch.tensor(self.scaler.fit_transform(X_all), dtype=torch.double)
        train_y = torch.tensor(y_all, dtype=torch.double)

        self.likelihood = MultitaskGaussianLikelihood(num_tasks=3)
        self.model = MultitaskGPModel(train_x, train_y, self.likelihood, num_tasks=3)

        self.model.train()
        self.likelihood.train()

        # NOTE(review): only model.parameters() is handed to Adam; presumably
        # this includes the likelihood's parameters because the model holds
        # the likelihood — confirm against the GPyTorch ExactGP docs.
        mll = ExactMarginalLogLikelihood(self.likelihood, self.model)
        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.1)

        for _ in range(self.n_epochs):
            optimizer.zero_grad()
            neg_mll = -mll(self.model(train_x), train_y)
            neg_mll.backward()
            optimizer.step()

    def predict(self, X_test):
        """Return the predictive mean for ``X_test`` clamped to [0, 1].

        Must be called after ``train_model``; uses the scaler fitted there.
        """
        features = self.featurizer.featurize(X_test, flip=False)
        test_x = torch.tensor(self.scaler.transform(features), dtype=torch.double)

        self.model.eval()
        self.likelihood.eval()

        with torch.no_grad(), gpytorch.settings.fast_pred_var():
            posterior = self.likelihood(self.model(test_x))
            mean = posterior.mean

        return torch.clamp(mean, 0, 1)

print('MTGPWrapper defined')

MTGPWrapper defined


In [7]:
# MLP Model (same as exp_030)
class MLPModelInternal(nn.Module):
    """Feed-forward network: input BatchNorm, hidden blocks, sigmoid output.

    Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout. The final
    Linear is followed by a Sigmoid, so outputs lie in (0, 1).

    Args:
        input_dim: number of input features.
        hidden_dims: width of each hidden block, in order (only iterated,
            never mutated).
        output_dim: number of outputs.
        dropout: dropout probability used in every hidden block.
    """

    def __init__(self, input_dim, hidden_dims=[32, 16], output_dim=3, dropout=0.05):
        super().__init__()
        widths = [input_dim, *hidden_dims]

        modules = [nn.BatchNorm1d(input_dim)]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        modules += [nn.Linear(widths[-1], output_dim), nn.Sigmoid()]

        self.net = nn.Sequential(*modules)

    def forward(self, x):
        """Apply the stacked layers to ``x``."""
        return self.net(x)

class WeightedMLPEnsemble:
    """Train several independently initialised MLPs and average their outputs.

    Despite the name, ``predict`` takes a plain (unweighted) mean over the
    member models.

    Args:
        hidden_dims: hidden layer widths forwarded to ``MLPModelInternal``
            (only forwarded, never mutated).
        n_models: number of ensemble members to train.
        data: 'full' enables mixture featurization and stacks the
            ``flip=True`` features under the originals; any other value uses
            single-solvent featurization.
    """

    def __init__(self, hidden_dims=[32, 16], n_models=3, data='single'):
        self.hidden_dims = hidden_dims
        self.n_models = n_models
        self.data_type = data
        self.featurizer = SimpleFeaturizer(mixed=(data=='full'))
        # Populated by train_model().
        self.models = []
        self.scaler = None

    def _make_loader(self, X_tensor, y_tensor, batch_size):
        """Build the shuffled training loader, never yielding a lone-sample tail batch.

        The MLP contains BatchNorm1d layers, which raise in training mode when
        a batch holds a single sample. That happens whenever the number of
        rows leaves a remainder of exactly one, so only in that case the tail
        batch is dropped. Every other configuration gets exactly the loader
        the original code built. A dataset with a single row is left alone so
        the error still surfaces instead of silently training on nothing.
        """
        dataset = TensorDataset(X_tensor, y_tensor)
        n_samples = len(dataset)
        lone_tail = n_samples > batch_size and n_samples % batch_size == 1
        return DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=lone_tail)

    def train_model(self, X_train, y_train, epochs=150, batch_size=32, lr=5e-4):
        """Fit the scaler and train ``n_models`` MLPs with Adam on MSE loss.

        Args:
            X_train: DataFrame accepted by the featurizer.
            y_train: DataFrame with the three target columns.
            epochs: passes over the training data per model.
            batch_size: mini-batch size.
            lr: Adam learning rate.
        """
        X_std = self.featurizer.featurize(X_train, flip=False)
        y_vals = y_train.values

        if self.data_type == 'full':
            X_flip = self.featurizer.featurize(X_train, flip=True)
            X_all = np.vstack([X_std, X_flip])
            y_all = np.vstack([y_vals, y_vals])
        else:
            X_all, y_all = X_std, y_vals

        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X_all)

        X_tensor = torch.tensor(X_scaled, dtype=torch.double)
        y_tensor = torch.tensor(y_all, dtype=torch.double)

        input_dim = X_tensor.shape[1]
        self.models = []

        for _ in range(self.n_models):
            # Members differ only through the global RNG state at
            # initialisation and during shuffling.
            model = MLPModelInternal(input_dim, self.hidden_dims, 3, dropout=0.05).double()
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            criterion = nn.MSELoss()
            loader = self._make_loader(X_tensor, y_tensor, batch_size)

            for _epoch in range(epochs):
                model.train()
                for X_batch, y_batch in loader:
                    optimizer.zero_grad()
                    loss = criterion(model(X_batch), y_batch)
                    loss.backward()
                    optimizer.step()

            # Switch BatchNorm/Dropout to inference behaviour before storing.
            model.eval()
            self.models.append(model)

    def predict(self, X_test):
        """Return the mean member prediction for ``X_test`` clamped to [0, 1].

        Must be called after ``train_model``; uses the scaler fitted there.
        """
        X_std = self.featurizer.featurize(X_test, flip=False)
        X_scaled = self.scaler.transform(X_std)
        X_tensor = torch.tensor(X_scaled, dtype=torch.double)

        with torch.no_grad():
            preds = [model(X_tensor) for model in self.models]

        return torch.clamp(torch.stack(preds).mean(dim=0), 0, 1)

print('WeightedMLPEnsemble defined')

WeightedMLPEnsemble defined


In [8]:
# LightGBM Wrapper
class LGBMWrapper:
    """Fit one LightGBM regressor per target column on scaled features.

    Args:
        data: 'full' enables mixture featurization and stacks the
            ``flip=True`` features under the originals; any other value uses
            single-solvent featurization.
    """

    def __init__(self, data='single'):
        self.data_type = data
        self.featurizer = SimpleFeaturizer(mixed=(data=='full'))
        # Populated by train_model().
        self.models = []
        self.scaler = None

    def train_model(self, X_train, y_train):
        """Fit the scaler and train three boosters, one per target column.

        Args:
            X_train: DataFrame accepted by the featurizer.
            y_train: DataFrame with the three target columns.
        """
        X_all = self.featurizer.featurize(X_train, flip=False)
        y_all = y_train.values

        if self.data_type == 'full':
            X_all = np.vstack([X_all, self.featurizer.featurize(X_train, flip=True)])
            y_all = np.vstack([y_all, y_all])

        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X_all)

        params = {
            'objective': 'regression',
            'metric': 'mse',
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': -1,
            'seed': 42,
        }

        self.models = []
        for target_idx in range(3):
            target_data = lgb.Dataset(X_scaled, label=y_all[:, target_idx])
            self.models.append(lgb.train(params, target_data, num_boost_round=100))

    def predict(self, X_test):
        """Return per-target predictions for ``X_test`` as a float64 tensor clamped to [0, 1].

        Columns follow the order in which the boosters were trained.
        """
        features = self.featurizer.featurize(X_test, flip=False)
        X_scaled = self.scaler.transform(features)

        per_target = [booster.predict(X_scaled) for booster in self.models]
        stacked = torch.tensor(np.column_stack(per_target), dtype=torch.double)
        return torch.clamp(stacked, 0, 1)

print('LGBMWrapper defined')

LGBMWrapper defined


In [9]:
# MTGP + MLP + LGBM Ensemble
class MTGPMLPLGBMEnsemble:
    """Fixed-weight blend of the multi-task GP, the MLP ensemble and LightGBM.

    Args:
        data: dataset mode forwarded unchanged to all three member wrappers.
    """

    def __init__(self, data='single'):
        self.data_type = data
        # The GP is trained for 50 steps here instead of the wrapper's default 100.
        self.mtgp = MTGPWrapper(data=data, n_epochs=50)
        self.mlp = WeightedMLPEnsemble(hidden_dims=[32, 16], n_models=3, data=data)
        self.lgbm = LGBMWrapper(data=data)
        # Blend weights per member; they sum to 1.
        self.weights = {'mtgp': 0.3, 'mlp': 0.4, 'lgbm': 0.3}

    def train_model(self, X_train, y_train):
        """Train the three members on the same data, GP first, then MLP, then LightGBM."""
        for member in (self.mtgp, self.mlp, self.lgbm):
            member.train_model(X_train, y_train)

    def predict(self, X_test):
        """Return the weighted sum of the member predictions clamped to [0, 1]."""
        gp_pred = self.mtgp.predict(X_test)
        mlp_pred = self.mlp.predict(X_test)
        gbm_pred = self.lgbm.predict(X_test)

        w = self.weights
        blended = w['mtgp'] * gp_pred + w['mlp'] * mlp_pred + w['lgbm'] * gbm_pred
        return torch.clamp(blended, 0, 1)

print('MTGPMLPLGBMEnsemble defined: MTGP(0.3) + MLP(0.4) + LGBM(0.3)')

MTGPMLPLGBMEnsemble defined: MTGP(0.3) + MLP(0.4) + LGBM(0.3)


In [10]:
# Quick test on one fold
# Smoke test: train the full ensemble on the first leave-one-solvent-out fold
# and report the MSE on its held-out solvent.
print('Testing on one fold...')
X, Y = load_data("single_solvent")
split_gen = generate_leave_one_out_splits(X, Y)
(train_X, train_Y), (test_X, test_Y) = next(split_gen)

model = MTGPMLPLGBMEnsemble(data='single')
model.train_model(train_X, train_Y)
preds = model.predict(test_X)

# Mean over every row and every target of the squared error.
squared_error = (preds.numpy() - test_Y.values) ** 2
mse = squared_error.mean()
print(f'Test fold MSE: {mse:.6f}')
print(f'Predictions shape: {preds.shape}')

Testing on one fold...


Test fold MSE: 0.041903
Predictions shape: torch.Size([37, 3])


In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = MTGPMLPLGBMEnsemble(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = MTGPMLPLGBMEnsemble(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
# CV CALCULATION - This cell is AFTER the final submission cell
import os
from sklearn.metrics import mean_squared_error

os.makedirs('/home/submission', exist_ok=True)
submission.to_csv('/home/submission/submission.csv', index=True)


def _cv_mse(splits, fold_predictions):
    """Score stored per-fold predictions against the held-out targets.

    Args:
        splits: iterable of ``((X_train, Y_train), (X_test, Y_test))`` folds,
            in the same order used when the predictions were generated.
        fold_predictions: DataFrame with a 'fold' column and the columns
            'target_1', 'target_2', 'target_3'.

    Returns:
        ``mean_squared_error`` over all folds stacked together. Rows are
        matched purely by position within each fold.
    """
    target_cols = ['target_1', 'target_2', 'target_3']
    y_true, y_pred = [], []
    for fold_idx, (_, (_, fold_truth)) in enumerate(splits):
        in_fold = fold_predictions['fold'] == fold_idx
        y_true.append(fold_truth.values)
        y_pred.append(fold_predictions[in_fold][target_cols].values)
    return mean_squared_error(np.vstack(y_true), np.vstack(y_pred))


# Single solvent CV
X_single, Y_single = load_data("single_solvent")
mse_single = _cv_mse(
    list(generate_leave_one_out_splits(X_single, Y_single)),
    submission_single_solvent,
)

# Full data CV
X_full, Y_full = load_data("full")
mse_full = _cv_mse(
    list(generate_leave_one_ramp_out_splits(X_full, Y_full)),
    submission_full_data,
)

print(f'Single Solvent CV MSE: {mse_single:.6f}')
print(f'Full Data CV MSE: {mse_full:.6f}')
print(f'Submission saved with {len(submission)} rows')