# Gaussian Process + MLP + LGBM Ensemble

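**Problem**: The CV-LB gap is ~10x (baseline CV 0.008465 vs. LB 0.0887). Our CV (0.008465) is already 2x BETTER than the target LB (0.01727), so the gap itself, not the CV score, is the bottleneck.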

**Hypothesis**: GPs have fundamentally different inductive biases than NNs. They may have a different CV-LB relationship.

**Why GP?**
1. Competition explicitly mentions GPs ("imputing any missing values using a multi-task GP")
2. GPs work well with small datasets
3. GPs provide uncertainty estimates
4. Different mathematical framework may break the CV-LB pattern

**Implementation**:
- GP with Matern kernel on simpler features (Spange + Arrhenius = 18 features)
- Ensemble: GP (0.2) + MLP (0.5) + LGBM (0.3)

**Baseline**: exp_026 CV 0.008465, LB 0.0887

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
from sklearn.preprocessing import StandardScaler
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed the NumPy and PyTorch global RNGs to make runs repeatable.
np.random.seed(42)
torch.manual_seed(42)
# Request deterministic cuDNN kernels and turn off cuDNN's benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Make float64 the default dtype for newly created floating-point tensors.
torch.set_default_dtype(torch.double)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading functions
# Directory that holds every CSV read by this notebook.
DATA_PATH = '/home/data'

# Column groups used to slice the raw yield tables:
#   - the two numeric process inputs shared by both datasets,
#   - the model inputs of the single-solvent dataset,
#   - the model inputs of the full (two-solvent mixture) dataset.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Read one of the two yield tables and split it into inputs and targets.

    Args:
        name: "full" selects the mixture dataset; any other value selects the
            single-solvent dataset.

    Returns:
        Tuple ``(X, Y)``: ``X`` holds the input columns of the chosen dataset,
        ``Y`` holds the "Product 2", "Product 3" and "SM" columns.
    """
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = INPUT_LABELS_FULL_SOLVENT
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = INPUT_LABELS_SINGLE_SOLVENT

    table = pd.read_csv(csv_path)
    inputs = table[input_columns]
    targets = table[["Product 2", "Product 3", "SM"]]
    return inputs, targets

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test fold per solvent, in sorted solvent-name order.

    Each item is ``((X_train, Y_train), (X_test, Y_test))`` where the test part
    contains exactly the rows whose "SOLVENT NAME" equals the held-out solvent.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        keep_for_training = solvent_names != held_out
        train_part = (X[keep_for_training], Y[keep_for_training])
        test_part = (X[~keep_for_training], Y[~keep_for_training])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test fold per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    exactly the rows of the held-out pair.
    """
    solvent_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for solvent_a, solvent_b in solvent_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        keep_for_training = ~in_ramp
        yield (
            (X[keep_for_training], Y[keep_for_training]),
            (X[~keep_for_training], Y[~keep_for_training]),
        )

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Load the per-solvent descriptor tables. The first CSV column becomes the
# index, which the featurizers later use to look rows up by solvent name.
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFP_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

# Keep only the DRFP columns whose variance is strictly positive, i.e. drop
# columns that carry no information because they are constant across solvents.
# (This is a non-zero-variance filter, not a "high variance" threshold.)
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = drfp_variance[drfp_variance > 0].index.tolist()
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]

print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')

Spange: (26, 13), DRFP filtered: (24, 122), ACS PCA: (24, 5)


In [4]:
# Full Featurizer (for MLP and LGBM) - mixture uses pair-interaction features
class FullFeaturizer:
    """Feature builder combining kinetic terms with Spange, DRFP and ACS PCA descriptors.

    With ``mixed=False`` each row is ``[kinetics, solvent descriptor]``.
    With ``mixed=True`` each row is ``[kinetics, pct, blend, diff, prod, pct*(1-pct)]``
    built from the descriptors of solvents A and B.
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.drfp_df = DRFP_FILTERED
        self.acs_pca_df = ACS_PCA_DF

        # Residence Time, Temperature, inv_temp, log_time, interaction
        self.kin_dim = 5
        # Width of a single solvent's descriptor (Spange + DRFP + ACS PCA columns).
        self.base_desc_dim = sum(
            table.shape[1]
            for table in (self.spange_df, self.drfp_df, self.acs_pca_df)
        )

        if not self.mixed:
            self.feats_dim = self.kin_dim + self.base_desc_dim
            return

        # Pair descriptor = [blend, diff, prod] (3 * base) + pct*(1-pct) (1 column).
        self.pair_desc_dim = 3 * self.base_desc_dim + 1
        # Kinetics + pct itself + pair descriptor.
        self.feats_dim = self.kin_dim + 1 + self.pair_desc_dim

    def _solvent_descriptors(self, names):
        """Stack the Spange, DRFP and ACS PCA rows of every solvent in ``names``."""
        tables = (self.spange_df, self.drfp_df, self.acs_pca_df)
        return np.hstack([table.loc[names].values for table in tables])

    def featurize(self, X, flip=False):
        """Return the feature matrix for ``X`` as a NumPy array.

        ``flip`` is only consulted in mixed mode, where it swaps solvents A and B
        and replaces pct by ``1 - pct``.
        """
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        residence_time = numeric[:, 0:1]
        temperature_k = numeric[:, 1:2] + 273.15
        inv_temp = 1000.0 / temperature_k
        # Small offset keeps the logarithm finite at zero residence time.
        log_time = np.log(residence_time + 1e-6)
        kinetic = np.hstack([numeric, inv_temp, log_time, inv_temp * log_time])

        if not self.mixed:
            return np.hstack([kinetic, self._solvent_descriptors(X["SOLVENT NAME"])])

        first = self._solvent_descriptors(X["SOLVENT A NAME"])
        second = self._solvent_descriptors(X["SOLVENT B NAME"])
        fraction_b = X["SolventB%"].values.reshape(-1, 1)

        if flip:
            # Same mixture described from the other side.
            first, second = second, first
            fraction_b = 1.0 - fraction_b

        blended = (1 - fraction_b) * first + fraction_b * second
        difference = first - second
        product = first * second
        fraction_cross = fraction_b * (1 - fraction_b)

        return np.hstack(
            [kinetic, fraction_b, blended, difference, product, fraction_cross]
        )

    def featurize_torch(self, X, flip=False):
        """Return ``featurize(X, flip)`` as a double tensor on ``device``."""
        features = self.featurize(X, flip=flip)
        return torch.tensor(features, dtype=torch.double).to(device)

print(f'Full feature dimension: {FullFeaturizer(mixed=False).feats_dim} (single)')
print(f'Full feature dimension: {FullFeaturizer(mixed=True).feats_dim} (full/pair-interaction)')

Full feature dimension: 145


In [5]:
# Simple Featurizer (for GP) - 18 features (Spange + Arrhenius kinetics)
class SimpleFeaturizer:
    """Compact feature builder: kinetic terms plus Spange descriptors only.

    In mixed mode the Spange descriptor is the pct-weighted blend of solvents
    A and B.
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        # 2 raw inputs + 3 derived kinetic terms + Spange columns
        self.feats_dim = 2 + 3 + self.spange_df.shape[1]

    def featurize(self, X, flip=False):
        """Return ``[kinetics, Spange descriptor]`` for every row of ``X``."""
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        residence_time = numeric[:, 0:1]
        temperature_k = numeric[:, 1:2] + 273.15
        inv_temp = 1000.0 / temperature_k
        log_time = np.log(residence_time + 1e-6)
        kinetic = np.hstack([numeric, inv_temp, log_time, inv_temp * log_time])

        if not self.mixed:
            descriptor = self.spange_df.loc[X["SOLVENT NAME"]].values
            return np.hstack([kinetic, descriptor])

        spange_a = self.spange_df.loc[X["SOLVENT A NAME"]].values
        spange_b = self.spange_df.loc[X["SOLVENT B NAME"]].values
        pct = X["SolventB%"].values.reshape(-1, 1)

        if flip:
            # Swapping A/B and replacing pct by (1 - pct) describes the same
            # mixture, so this is algebraically the same blend as the branch
            # below (it can differ only by floating-point rounding).
            flipped_pct = 1 - pct
            descriptor = spange_b * (1 - flipped_pct) + spange_a * flipped_pct
        else:
            descriptor = spange_a * (1 - pct) + spange_b * pct

        return np.hstack([kinetic, descriptor])

print(f'Simple feature dimension (for GP): {SimpleFeaturizer().feats_dim}')

Simple feature dimension (for GP): 18


In [6]:
# Gaussian Process Wrapper
class GPWrapper:
    """One Gaussian-process regressor per target, fitted on SimpleFeaturizer features.

    Args:
        data: 'full' selects mixture featurization plus flip augmentation /
            flip-averaged prediction; any other value selects single-solvent mode.
    """

    def __init__(self, data='single'):
        self.data_type = data
        self.featurizer = SimpleFeaturizer(mixed=(data=='full'))  # Use simpler features for GP
        self.models = []
        # Never populated; kept only so existing attribute access keeps working.
        self.scalers = []
        # Feature scaler fitted by train_model; None until the first fit.
        self.scaler = None

    def train_model(self, X_train, y_train):
        """Fit the feature scaler and one GP per target column of ``y_train``."""
        X_std = self.featurizer.featurize(X_train, flip=False)
        y_vals = y_train.values

        if self.data_type == 'full':
            # NOTE(review): SimpleFeaturizer's blended features look invariant
            # under flip (1 - (1 - pct) equals pct up to rounding), so this
            # appears to stack a near-exact copy of every training row. That
            # doubles the GP training set without adding information; confirm
            # whether the augmentation is really wanted for the GP.
            X_flip = self.featurizer.featurize(X_train, flip=True)
            X_all = np.vstack([X_std, X_flip])
            y_all = np.vstack([y_vals, y_vals])
        else:
            X_all, y_all = X_std, y_vals

        # Standardize features before they reach the kernel.
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X_all)

        self.models = []
        # Matern kernel with a learned amplitude plus a white-noise term.
        kernel = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)

        for target_idx in range(3):  # 3 targets
            gp = GaussianProcessRegressor(
                kernel=kernel,
                n_restarts_optimizer=3,
                normalize_y=True,
                random_state=42
            )
            gp.fit(X_scaled, y_all[:, target_idx])
            self.models.append(gp)

    def predict(self, X_test):
        """Return an ``[N, 3]`` tensor of GP mean predictions for ``X_test``.

        Raises:
            RuntimeError: if called before ``train_model``.
        """
        if self.scaler is None or not self.models:
            raise RuntimeError('GPWrapper.predict() called before train_model()')

        X_feat = self.featurizer.featurize(X_test, flip=False)
        X_scaled = self.scaler.transform(X_feat)

        if self.data_type == 'full':
            X_flip = self.featurizer.featurize(X_test, flip=True)
            X_flip_scaled = self.scaler.transform(X_flip)

        preds = []
        for model in self.models:
            pred = model.predict(X_scaled)
            if self.data_type == 'full':
                # Average the two equivalent descriptions of the mixture.
                pred_flip = model.predict(X_flip_scaled)
                pred = (pred + pred_flip) / 2
            preds.append(pred)

        return torch.tensor(np.column_stack(preds))

print('GPWrapper defined with Matern kernel')

GPWrapper defined with Matern kernel


In [7]:
# Weighted Huber Loss
class WeightedHuberLoss(nn.Module):
    """Huber loss whose per-element values are scaled by one weight per output column.

    Args:
        weights: sequence of per-column multipliers, broadcast against the
            last dimension of the element-wise loss. The default is an
            immutable tuple so it cannot be shared and mutated across instances.
    """

    def __init__(self, weights=(1.0, 1.0, 2.0)):
        super().__init__()
        self.weights = torch.tensor(weights, dtype=torch.double)
        # reduction='none' keeps the element-wise loss so it can be weighted.
        self.huber = nn.HuberLoss(reduction='none')

    def forward(self, pred, target):
        """Return the mean of the column-weighted element-wise Huber loss."""
        elementwise = self.huber(pred, target)
        # Move the weights to wherever the predictions live before broadcasting.
        weighted = elementwise * self.weights.to(pred.device)
        return weighted.mean()

# MLP Model
class MLPModelInternal(nn.Module):
    """Feed-forward network: input BatchNorm, hidden blocks, sigmoid output.

    Every hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout. The final
    Linear layer is followed by a Sigmoid, so outputs lie in (0, 1).

    Args:
        input_dim: number of input features.
        hidden_dims: sizes of the hidden layers, in order (immutable default).
        output_dim: number of outputs.
        dropout: dropout probability applied after every hidden ReLU.
    """

    def __init__(self, input_dim, hidden_dims=(32, 16), output_dim=3, dropout=0.05):
        super().__init__()
        layers = [nn.BatchNorm1d(input_dim)]
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([nn.Linear(prev_dim, h_dim), nn.BatchNorm1d(h_dim), nn.ReLU(), nn.Dropout(dropout)])
            prev_dim = h_dim
        layers.extend([nn.Linear(prev_dim, output_dim), nn.Sigmoid()])
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack."""
        return self.net(x)

print('MLP components defined')

MLP components defined


In [8]:
# MLP Ensemble with Weighted Loss
class WeightedMLPEnsemble:
    """Ensemble of identically shaped MLPs trained with a weighted Huber loss.

    Args:
        hidden_dims: hidden layer sizes passed to every ``MLPModelInternal``.
        n_models: number of ensemble members (each trained from its own seed).
        data: 'full' enables mixture featurization, flip augmentation during
            training and flip-averaging at prediction time.
        loss_weights: per-target weights passed to ``WeightedHuberLoss``.

    The sequence defaults are immutable tuples so they cannot be shared and
    mutated across instances.
    """

    def __init__(self, hidden_dims=(32, 16), n_models=5, data='single', loss_weights=(1.0, 1.0, 2.0)):
        self.hidden_dims = hidden_dims
        self.n_models = n_models
        self.data_type = data
        self.loss_weights = loss_weights
        self.featurizer = FullFeaturizer(mixed=(data=='full'))
        self.models = []

    def train_model(self, X_train, y_train, epochs=200, batch_size=32, lr=5e-4):
        """Train ``n_models`` networks from scratch and store them in ``self.models``.

        Args:
            X_train: input DataFrame accepted by the featurizer.
            y_train: DataFrame of targets (one column per network output).
            epochs: number of passes over the training set per member.
            batch_size: mini-batch size.
            lr: Adam learning rate.
        """
        X_std = self.featurizer.featurize_torch(X_train, flip=False)
        y_vals = torch.tensor(y_train.values)

        if self.data_type == 'full':
            # Augment with the flipped description of every mixture.
            X_flip = self.featurizer.featurize_torch(X_train, flip=True)
            X_all = torch.cat([X_std, X_flip], dim=0)
            y_all = torch.cat([y_vals, y_vals], dim=0)
        else:
            X_all, y_all = X_std, y_vals

        input_dim = X_all.shape[1]

        # The dataset is the same for every member, so build it once.
        dataset = TensorDataset(X_all.to(device), y_all.to(device))
        n_rows = len(dataset)
        # BatchNorm1d cannot run in training mode on a batch of one sample, so
        # drop the trailing batch only when it would contain exactly one row.
        drop_last = n_rows > batch_size and n_rows % batch_size == 1

        self.models = []

        for i in range(self.n_models):
            # Seed first: weight initialization and shuffling both depend on it.
            torch.manual_seed(42 + i * 13)
            np.random.seed(42 + i * 13)

            model = MLPModelInternal(input_dim, self.hidden_dims).to(device).double()
            optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=20)
            criterion = WeightedHuberLoss(weights=self.loss_weights)

            loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last)

            model.train()
            for epoch in range(epochs):
                epoch_loss = 0.0
                for batch_X, batch_y in loader:
                    optimizer.zero_grad()
                    pred = model(batch_X)
                    loss = criterion(pred, batch_y)
                    loss.backward()
                    optimizer.step()
                    epoch_loss += loss.item()
                # The plateau scheduler watches the mean training loss of the epoch.
                scheduler.step(epoch_loss / len(loader))

            model.eval()
            self.models.append(model)

    def predict(self, X_test):
        """Return the ensemble-mean prediction for ``X_test`` as a CPU tensor."""
        # featurize_torch already places the tensors on ``device``.
        X_feat = self.featurizer.featurize_torch(X_test, flip=False)
        if self.data_type == 'full':
            X_flip = self.featurizer.featurize_torch(X_test, flip=True)

        all_preds = []
        with torch.no_grad():
            for model in self.models:
                pred = model(X_feat)
                if self.data_type == 'full':
                    # Average the two equivalent descriptions of the mixture.
                    pred_flip = model(X_flip)
                    pred = (pred + pred_flip) / 2
                all_preds.append(pred)

        return torch.stack(all_preds).mean(dim=0).cpu()

print('WeightedMLPEnsemble defined')

WeightedMLPEnsemble defined


In [9]:
# LightGBM Wrapper
class LGBMWrapper:
    """One LightGBM booster per target, fitted on FullFeaturizer features.

    In 'full' mode training rows are augmented with their flipped counterpart
    and predictions are averaged over both orientations.
    """

    def __init__(self, data='single'):
        self.data_type = data
        self.featurizer = FullFeaturizer(mixed=(data=='full'))
        self.models = []

    def train_model(self, X_train, y_train):
        """Fit three boosters (one per target column) for 100 rounds each."""
        features = self.featurizer.featurize(X_train, flip=False)
        targets = y_train.values

        if self.data_type == 'full':
            mirrored = self.featurizer.featurize(X_train, flip=True)
            features = np.vstack([features, mirrored])
            targets = np.vstack([targets, targets])

        booster_params = {
            'objective': 'regression',
            'metric': 'mse',
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': -1,
            'seed': 42,
        }

        self.models = []
        self.models.extend(
            lgb.train(
                booster_params,
                lgb.Dataset(features, label=targets[:, column]),
                num_boost_round=100,
            )
            for column in range(3)
        )

    def predict(self, X_test):
        """Return an ``[N, 3]`` tensor with one column per fitted booster."""
        is_full = self.data_type == 'full'
        straight = self.featurizer.featurize(X_test, flip=False)
        if is_full:
            mirrored = self.featurizer.featurize(X_test, flip=True)

        columns = []
        for booster in self.models:
            column = booster.predict(straight)
            if is_full:
                column = (column + booster.predict(mirrored)) / 2
            columns.append(column)

        return torch.tensor(np.column_stack(columns))

print('LGBMWrapper defined')

LGBMWrapper defined


In [10]:
# GP + MLP + LGBM Ensemble
class GPMLPLGBMEnsemble:
    """Fixed-weight blend of the GP, MLP-ensemble and LightGBM wrappers."""

    def __init__(self, data='single'):
        self.data_type = data
        self.gp = GPWrapper(data=data)
        self.mlp = WeightedMLPEnsemble(hidden_dims=[32, 16], n_models=5, data=data, loss_weights=[1.0, 1.0, 2.0])
        self.lgbm = LGBMWrapper(data=data)
        # Blend coefficients: GP 0.2, MLP 0.5, LGBM 0.3
        self.weights = {'gp': 0.2, 'mlp': 0.5, 'lgbm': 0.3}

    def train_model(self, X_train, y_train):
        """Fit the three members on the same data, GP first, LightGBM last."""
        for member in (self.gp, self.mlp, self.lgbm):
            member.train_model(X_train, y_train)

    def predict(self, X_test):
        """Return the weighted sum of member predictions, clamped to [0, 1]."""
        from_gp = self.gp.predict(X_test)
        from_mlp = self.mlp.predict(X_test)
        from_lgbm = self.lgbm.predict(X_test)

        blended = self.weights['gp'] * from_gp + self.weights['mlp'] * from_mlp
        blended = blended + self.weights['lgbm'] * from_lgbm
        return torch.clamp(blended, 0, 1)


# ============== Applicability Domain (AD) shrinkage wrapper ==============
from sklearn.model_selection import KFold
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor
from sklearn.isotonic import IsotonicRegression


def _get_solvent_desc(names: pd.Series | np.ndarray) -> np.ndarray:
    """Return Spange + ACS PCA descriptors (no DRFP) for each solvent name.

    Names that are absent from a lookup table produce a row of zeros for that
    table instead of raising.
    """
    lookup_keys = pd.Series(names)
    per_table = [
        table.reindex(lookup_keys).fillna(0.0).values
        for table in (SPANGE_DF, ACS_PCA_DF)
    ]
    return np.hstack(per_table)


def _get_distance_space(X: pd.DataFrame, data_type: str) -> np.ndarray:
    """Build the descriptor matrix that kNN distances are measured in.

    For ``data_type == 'single'`` this is the solvent descriptor itself. For
    any other value it is ``[blend, diff, prod, pct*(1-pct)]`` computed from
    the descriptors of solvents A and B.
    """
    if data_type == 'single':
        return _get_solvent_desc(X['SOLVENT NAME'])

    desc_a = _get_solvent_desc(X['SOLVENT A NAME'])
    desc_b = _get_solvent_desc(X['SOLVENT B NAME'])
    fraction_b = X['SolventB%'].values.reshape(-1, 1)

    pieces = [
        (1 - fraction_b) * desc_a + fraction_b * desc_b,  # composition-weighted blend
        desc_a - desc_b,                                  # signed difference
        desc_a * desc_b,                                  # element-wise product
        fraction_b * (1 - fraction_b),                    # mixing cross term
    ]
    return np.hstack(pieces)


def _get_baseline_features(X: pd.DataFrame, data_type: str, dist_space: np.ndarray) -> np.ndarray:
    if data_type == 'single':
        num = X[['Residence Time', 'Temperature']].values.astype(np.float64)
        return np.hstack([num, dist_space])
    else:
        num = X[['Residence Time', 'Temperature', 'SolventB%']].values.astype(np.float64)
        return np.hstack([num, dist_space])


class GPMLPLGBMEnsembleAD:
    """Base ensemble + distance-based shrinkage to kNN baseline.

    Per outer fold:
      1) fit base ensemble
      2) fit kNN baseline on train
      3) cross-fit distance->alpha mapping within train (up to 5 inner folds)
      4) inference: blend base with baseline by alpha(distance)

    Args:
        data: 'single' or 'full'; forwarded to the base ensemble and to the
            distance / baseline feature helpers.
        group_inner_cv: when True (default) the inner folds hold out whole
            solvents ('single') or whole (solvent A, solvent B) pairs ('full').
            A row-shuffled split leaves rows of the validation solvent in the
            inner training set; in 'single' mode the distance space contains
            only solvent descriptors, so those rows sit at distance 0 and the
            distance->alpha mapping collapses. Pass False to get the legacy
            row-shuffled ``KFold(5, shuffle=True, random_state=42)``.
    """

    def __init__(self, data='single', group_inner_cv=True):
        self.data_type = data
        self.base_model = GPMLPLGBMEnsemble(data=data)
        self.group_inner_cv = group_inner_cv

        # fitted per outer fold
        self.dist_scaler = None
        self.nn = None
        self.baseline_scaler = None
        self.knn = None
        self.iso = None

    # ------------------------------------------------------------------ helpers

    def _solvent_groups(self, X: pd.DataFrame) -> np.ndarray:
        """Return one group label per row: the solvent, or the ordered A/B pair."""
        if self.data_type == 'single':
            return X['SOLVENT NAME'].astype(str).values
        pair = X['SOLVENT A NAME'].astype(str) + ' || ' + X['SOLVENT B NAME'].astype(str)
        return pair.values

    def _inner_splits(self, X: pd.DataFrame) -> list:
        """Return the inner (train_idx, val_idx) pairs used to calibrate alpha."""
        if not self.group_inner_cv:
            return list(KFold(n_splits=5, shuffle=True, random_state=42).split(X))

        # Local import: the notebook's import cell only brings in KFold.
        from sklearn.model_selection import GroupKFold

        groups = self._solvent_groups(X)
        n_groups = len(set(groups))
        if n_groups < 2:
            # Nothing can be held out; the caller falls back to alpha == 0.
            return []
        splitter = GroupKFold(n_splits=min(5, n_groups))
        return list(splitter.split(X, groups=groups))

    def _build_distance_models(self, X: pd.DataFrame, y_values: np.ndarray):
        """Fit and return (dist_scaler, nn_index, baseline_scaler, knn) on ``X``."""
        dist_space = _get_distance_space(X, self.data_type)
        dist_scaler = StandardScaler()
        nn_index = NearestNeighbors(n_neighbors=5, metric='euclidean')
        nn_index.fit(dist_scaler.fit_transform(dist_space))

        # The baseline regressor sees the numeric inputs plus the unscaled
        # distance-space descriptors, standardized by its own scaler.
        base_feats = _get_baseline_features(X, self.data_type, dist_space)
        baseline_scaler = StandardScaler()
        knn = KNeighborsRegressor(n_neighbors=10, weights='distance')
        knn.fit(baseline_scaler.fit_transform(base_feats), y_values)
        return dist_scaler, nn_index, baseline_scaler, knn

    def _distance_and_baseline(self, X: pd.DataFrame, dist_scaler, nn_index, baseline_scaler, knn):
        """Return (mean distance to the 5 nearest fitted rows, kNN baseline prediction)."""
        dist_space = _get_distance_space(X, self.data_type)
        knn_dists, _ = nn_index.kneighbors(dist_scaler.transform(dist_space), n_neighbors=5)
        base_feats = _get_baseline_features(X, self.data_type, dist_space)
        b_pred = knn.predict(baseline_scaler.transform(base_feats))
        return knn_dists.mean(axis=1), b_pred

    def _fit_distance_models(self, X_train: pd.DataFrame, y_train: pd.DataFrame):
        """Fit the outer-fold distance index and kNN baseline and store them on self."""
        (self.dist_scaler,
         self.nn,
         self.baseline_scaler,
         self.knn) = self._build_distance_models(X_train, y_train.values)

    @staticmethod
    def _zero_alpha() -> IsotonicRegression:
        """Return a mapping that predicts alpha == 0 for every distance."""
        iso = IsotonicRegression(increasing=True, y_min=0.0, y_max=1.0, out_of_bounds='clip')
        iso.fit([0.0, 1.0], [0.0, 0.0])
        return iso

    @classmethod
    def _fit_alpha(cls, d_all, y_all, m_all, b_all) -> IsotonicRegression:
        """Fit the monotone distance->alpha mapping from out-of-fold predictions.

        Args:
            d_all: distances of the out-of-fold rows.
            y_all: true targets of those rows.
            m_all: base-ensemble predictions for those rows.
            b_all: kNN-baseline predictions for those rows.
        """
        # Decile edges of the distances; duplicates are removed so that bin
        # edges are strictly increasing.
        qs = np.unique(np.quantile(d_all, np.linspace(0, 1, 11)))
        if len(qs) < 3:
            # degenerate case: (almost) all distances are identical
            return cls._zero_alpha()

        # Bin b covers (qs[b], qs[b + 1]].
        bin_ids = np.digitize(d_all, qs[1:-1], right=True)
        bin_centers = []
        alpha_bins = []

        for b in range(bin_ids.min(), bin_ids.max() + 1):
            mask = bin_ids == b
            if mask.sum() < 5:
                # too few rows for a stable estimate
                continue
            yb = y_all[mask]
            mb = m_all[mask]
            bb = b_all[mask]

            # Least-squares alpha for y ~ m + alpha * (b - m), all targets
            # pooled, then clipped to the valid blending range.
            num = np.sum((yb - mb) * (bb - mb))
            den = np.sum((bb - mb) ** 2) + 1e-12
            alpha = float(np.clip(num / den, 0.0, 1.0))

            lo = qs[b]
            hi = qs[b + 1] if (b + 1) < len(qs) else qs[-1]
            bin_centers.append((lo + hi) / 2)
            alpha_bins.append(alpha)

        if len(bin_centers) < 2:
            return cls._zero_alpha()

        # isotonic regression enforces that alpha never decreases with distance
        order = np.argsort(bin_centers)
        iso = IsotonicRegression(increasing=True, y_min=0.0, y_max=1.0, out_of_bounds='clip')
        iso.fit(np.array(bin_centers)[order], np.array(alpha_bins)[order])
        return iso

    # --------------------------------------------------------------- public API

    def train_model(self, X_train: pd.DataFrame, y_train: pd.DataFrame):
        """Fit the base ensemble, the kNN baseline and the distance->alpha mapping."""
        # 1) train base ensemble on outer-train
        self.base_model.train_model(X_train, y_train)

        # 2) fit distance space scaler + NN and baseline KNN on outer-train
        self._fit_distance_models(X_train, y_train)

        # 3) cross-fitted alpha(d) within outer-train
        X_train_reset = X_train.reset_index(drop=True)
        y_train_reset = y_train.reset_index(drop=True)

        d_parts, y_parts, m_parts, b_parts = [], [], [], []
        for tr_idx, va_idx in self._inner_splits(X_train_reset):
            X_tr, y_tr = X_train_reset.iloc[tr_idx], y_train_reset.iloc[tr_idx]
            X_va, y_va = X_train_reset.iloc[va_idx], y_train_reset.iloc[va_idx]

            # base model (inner)
            base_inner = GPMLPLGBMEnsemble(data=self.data_type)
            base_inner.train_model(X_tr, y_tr)
            m_pred = base_inner.predict(X_va).detach().cpu().numpy()

            # distance index + baseline regressor (inner), fitted on inner-train only
            inner_models = self._build_distance_models(X_tr, y_tr.values)
            d_val, b_pred = self._distance_and_baseline(X_va, *inner_models)

            d_parts.append(d_val)
            y_parts.append(y_va.values)
            m_parts.append(m_pred)
            b_parts.append(b_pred)

        if not d_parts:
            self.iso = self._zero_alpha()
            return

        self.iso = self._fit_alpha(
            np.concatenate(d_parts),
            np.vstack(y_parts),
            np.vstack(m_parts),
            np.vstack(b_parts),
        )

    def predict(self, X_test: pd.DataFrame):
        """Return ``(1 - alpha) * base + alpha * baseline``, clipped to [0, 1]."""
        # base prediction
        m_pred = self.base_model.predict(X_test).detach().cpu().numpy()

        # baseline prediction and distance to the training set
        d, b_pred = self._distance_and_baseline(
            X_test, self.dist_scaler, self.nn, self.baseline_scaler, self.knn
        )

        alpha = self.iso.predict(d) if self.iso is not None else np.zeros_like(d)
        alpha = np.clip(alpha, 0.0, 1.0).reshape(-1, 1)

        out = (1.0 - alpha) * m_pred + alpha * b_pred
        out = np.clip(out, 0.0, 1.0)
        return torch.tensor(out, dtype=torch.double)


print('Defined GPMLPLGBMEnsemble (base) and GPMLPLGBMEnsembleAD (AD shrinkage)')

GPMLPLGBMEnsemble defined: GP(0.2) + MLP(0.5) + LGBM(0.3)


In [11]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = GPMLPLGBMEnsembleAD(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:48, 48.50s/it]

2it [01:33, 46.47s/it]

3it [02:13, 43.29s/it]

4it [02:53, 42.04s/it]

5it [03:38, 43.40s/it]

6it [04:25, 44.56s/it]

7it [05:12, 45.11s/it]

8it [05:56, 44.92s/it]

9it [06:45, 46.04s/it]

10it [07:30, 45.85s/it]

11it [08:18, 46.39s/it]

12it [09:04, 46.29s/it]

13it [09:48, 45.76s/it]

14it [10:35, 45.95s/it]

15it [11:22, 46.44s/it]

16it [12:15, 48.42s/it]

17it [13:06, 49.05s/it]

18it [13:56, 49.44s/it]

19it [14:52, 51.44s/it]

20it [15:45, 51.90s/it]

21it [16:41, 52.99s/it]

22it [17:32, 52.60s/it]

23it [18:27, 53.24s/it]

24it [19:13, 51.18s/it]

24it [19:13, 48.08s/it]




In [12]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = GPMLPLGBMEnsembleAD(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [04:21, 261.76s/it]

2it [09:15, 280.67s/it]

3it [14:08, 286.16s/it]

4it [18:31, 277.14s/it]

5it [23:00, 273.99s/it]

6it [27:36, 274.97s/it]

7it [32:24, 279.14s/it]

8it [37:05, 279.80s/it]

9it [41:27, 274.26s/it]

10it [47:08, 294.71s/it]

11it [52:31, 303.24s/it]

12it [57:49, 307.82s/it]

13it [1:03:24, 316.13s/it]

13it [1:03:24, 292.66s/it]




In [13]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################
