# Experiment 053: Exact Template Submission

**Goal:** Use EXACTLY the template code structure to ensure submission format is correct.

**Approach:** Copy the template code exactly, only changing the model definition.

In [1]:
# Cell 1: Imports and setup (from template)
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from abc import ABC, abstractmethod
import warnings
warnings.filterwarnings('ignore')

torch.set_default_dtype(torch.double)

# Data path for local execution
DATA_PATH = "/home/data"

print("Imports complete.")
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
from sklearn.preprocessing import StandardScaler



Imports complete.


In [2]:
# Cell 2: Data loading functions (adapted for local paths)

INPUT_LABELS_FULL_SOLVENT = [
    "Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"
]
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_FEATURES = ["SOLVENT NAME"]
INPUT_LABELS_FULL_FEATURES = ["SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    assert name in ["full", "single_solvent"]
    if name == "full":
        df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        X = df[INPUT_LABELS_FULL_SOLVENT]
    else:
        df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        X = df[INPUT_LABELS_SINGLE_SOLVENT]
    Y = df[TARGET_LABELS]
    return X, Y

def load_features(name="spange_descriptors"):
    features = pd.read_csv(f'{DATA_PATH}/{name}_lookup.csv', index_col=0)
    return features

def generate_leave_one_out_splits(X, Y):
    """Generate all leave-one-out splits across the solvents."""
    all_solvents = X["SOLVENT NAME"].unique()
    for solvent_name in sorted(all_solvents):
        train_idcs_mask = X["SOLVENT NAME"] != solvent_name
        yield (
            (X[train_idcs_mask], Y[train_idcs_mask]),
            (X[~train_idcs_mask], Y[~train_idcs_mask]),
        )

def generate_leave_one_ramp_out_splits(X, Y):
    """Generate all leave-one-out splits across the solvent ramps."""
    all_solvent_ramps = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    all_solvent_ramps = all_solvent_ramps.sort_values(by=["SOLVENT A NAME", "SOLVENT B NAME"])
    for _, solvent_pair in all_solvent_ramps.iterrows():
        train_idcs_mask = (X[["SOLVENT A NAME", "SOLVENT B NAME"]] != solvent_pair).any(axis=1)
        yield (
            (X[train_idcs_mask], Y[train_idcs_mask]),
            (X[~train_idcs_mask], Y[~train_idcs_mask]),
        )

print("Data loading functions defined.")

Data loading functions defined.


In [3]:
# Cell 3: Base classes (from template)

class SmilesFeaturizer(ABC):
    def __init__(self):
        raise NotImplementedError

    def featurize(self, X):
        raise NotImplementedError

class BaseModel(ABC):
    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        raise NotImplementedError

    def predict(self):
        raise NotImplementedError

print("Base classes defined.")

Base classes defined.


In [4]:
# Cell 4: Featurizer (from template)

class PrecomputedFeaturizer(SmilesFeaturizer):
    def __init__(self, features='spange_descriptors'):
        assert features in ['drfps_catechol', 'fragprints', 'smiles', 'acs_pca_descriptors', 'spange_descriptors']
        self.features = features
        self.featurizer = load_features(self.features)
        self.feats_dim = self.featurizer.shape[1] + 2

    def featurize(self, X):
        X_numeric = X[INPUT_LABELS_NUMERIC]
        X_smiles_feat = self.featurizer.loc[X["SOLVENT NAME"]]
        X_numeric_tensor = torch.tensor(X_numeric.values)
        X_smiles_feat_tensor = torch.tensor(X_smiles_feat.values)
        X_out = torch.cat((X_numeric_tensor, X_smiles_feat_tensor), dim=1)
        return X_out

class PrecomputedFeaturizerMixed(SmilesFeaturizer):
    def __init__(self, features='spange_descriptors'):
        assert features in ['drfps_catechol', 'fragprints', 'smiles', 'acs_pca_descriptors', 'spange_descriptors']
        self.features = features
        self.featurizer = load_features(self.features)
        self.feats_dim = self.featurizer.shape[1] * 2 + 3

    def featurize(self, X):
        X_numeric = X[INPUT_LABELS_NUMERIC]
        X_smiles_A_feat = self.featurizer.loc[X["SOLVENT A NAME"]]
        X_smiles_B_feat = self.featurizer.loc[X["SOLVENT B NAME"]]
        X_solventB_pct = X[["SolventB%"]]
        X_numeric_tensor = torch.tensor(X_numeric.values)
        X_smiles_A_feat_tensor = torch.tensor(X_smiles_A_feat.values)
        X_smiles_B_feat_tensor = torch.tensor(X_smiles_B_feat.values)
        X_solventB_pct_tensor = torch.tensor(X_solventB_pct.values)
        X_out = torch.cat((X_numeric_tensor, X_smiles_A_feat_tensor, X_smiles_B_feat_tensor, X_solventB_pct_tensor), dim=1)
        return X_out

print("Featurizers defined.")

Featurizers defined.


In [5]:
# Cell 5: GP+MLP+LGBM ensemble (template-safe exp_030 reproduction) + AD shrinkage v2

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import GroupShuffleSplit
from sklearn.isotonic import IsotonicRegression

# ---- Load lookup tables (same as exp_030) ----
SPANGE_DF = load_features('spange_descriptors')
DRFP_DF = load_features('drfps_catechol')
ACS_PCA_DF = load_features('acs_pca_descriptors')

# Filter DRFP to nonzero-variance columns (unsupervised; solvent table only)
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = drfp_variance[drfp_variance > 0].index.tolist()
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]


def _solvent_embed(names):
    """Embedding for a solvent name using Spange + DRFP(nzvar) + ACS PCA."""
    sp = SPANGE_DF.loc[names].values
    dr = DRFP_FILTERED.loc[names].values
    ac = ACS_PCA_DF.loc[names].values
    return np.hstack([sp, dr, ac])


def _kinetic_vec(X: pd.DataFrame) -> np.ndarray:
    X_vals = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
    temp_c = X_vals[:, 1:2]
    time_m = X_vals[:, 0:1]
    temp_k = temp_c + 273.15
    inv_temp = 1000.0 / temp_k
    log_time = np.log(time_m + 1e-6)
    interaction = inv_temp * log_time
    return np.hstack([X_vals, inv_temp, log_time, interaction])


def _dist_features_single(X: pd.DataFrame) -> np.ndarray:
    # Loop73: pure solvent embed distance didn't correlate positively with error; add kinetics.
    return np.hstack([_kinetic_vec(X), _solvent_embed(X['SOLVENT NAME'])])


def _dist_features_full(X: pd.DataFrame, flip: bool = False) -> np.ndarray:
    A = X['SOLVENT A NAME']
    B = X['SOLVENT B NAME']
    pct = X['SolventB%'].values.reshape(-1, 1)
    if flip:
        A, B = B, A
        pct = 1.0 - pct
    A_e = _solvent_embed(A)
    B_e = _solvent_embed(B)
    blend = (1 - pct) * A_e + pct * B_e
    diff_norm = np.linalg.norm(A_e - B_e, axis=1, keepdims=True)
    pct2 = pct * (1 - pct)
    # distance features focus on solvent similarity + mixture geometry (exclude kinetics per rec)
    return np.hstack([blend, pct, pct2, diff_norm])


# ---- Base featurizers with correct flip ----
class FullFeaturizer030:
    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.drfp_df = DRFP_FILTERED
        self.acs_pca_df = ACS_PCA_DF
        self.feats_dim = 5 + self.spange_df.shape[1] + self.drfp_df.shape[1] + self.acs_pca_df.shape[1]
        if self.mixed:
            self.feats_dim = 5 + 1 + self.spange_df.shape[1] + self.drfp_df.shape[1] + self.acs_pca_df.shape[1]

    def _kinetic(self, X):
        return _kinetic_vec(X)

    def featurize(self, X, flip=False):
        X_kin = self._kinetic(X)
        if self.mixed:
            A = X['SOLVENT A NAME']
            B = X['SOLVENT B NAME']
            pct = X['SolventB%'].values.reshape(-1, 1)
            if flip:
                A, B = B, A
                pct = 1.0 - pct
            A_sp = self.spange_df.loc[A].values
            B_sp = self.spange_df.loc[B].values
            A_dr = self.drfp_df.loc[A].values
            B_dr = self.drfp_df.loc[B].values
            A_ac = self.acs_pca_df.loc[A].values
            B_ac = self.acs_pca_df.loc[B].values
            X_sp = (1 - pct) * A_sp + pct * B_sp
            X_dr = (1 - pct) * A_dr + pct * B_dr
            X_ac = (1 - pct) * A_ac + pct * B_ac
            return np.hstack([X_kin, pct, X_sp, X_dr, X_ac])
        else:
            S = X['SOLVENT NAME']
            X_sp = self.spange_df.loc[S].values
            X_dr = self.drfp_df.loc[S].values
            X_ac = self.acs_pca_df.loc[S].values
            return np.hstack([X_kin, X_sp, X_dr, X_ac])

    def featurize_torch(self, X, flip=False):
        return torch.tensor(self.featurize(X, flip=flip), dtype=torch.double)


class SimpleFeaturizer030:
    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.feats_dim = 5 + self.spange_df.shape[1]
        if self.mixed:
            self.feats_dim = 5 + 1 + self.spange_df.shape[1]

    def _kinetic(self, X):
        return _kinetic_vec(X)

    def featurize(self, X, flip=False):
        X_kin = self._kinetic(X)
        if self.mixed:
            A = X['SOLVENT A NAME']
            B = X['SOLVENT B NAME']
            pct = X['SolventB%'].values.reshape(-1, 1)
            if flip:
                A, B = B, A
                pct = 1.0 - pct
            A_sp = self.spange_df.loc[A].values
            B_sp = self.spange_df.loc[B].values
            X_sp = (1 - pct) * A_sp + pct * B_sp
            return np.hstack([X_kin, pct, X_sp])
        else:
            S = X['SOLVENT NAME']
            X_sp = self.spange_df.loc[S].values
            return np.hstack([X_kin, X_sp])


# ---- Models (GP, MLP ensemble, LGBM) ----
class GPWrapper030:
    def __init__(self, data='single'):
        self.data_type = data
        self.featurizer = SimpleFeaturizer030(mixed=(data=='full'))
        self.models = []
        self.scaler = None

    def train_model(self, X_train, y_train):
        X_std = self.featurizer.featurize(X_train, flip=False)
        y_vals = y_train.values
        if self.data_type == 'full':
            X_flip = self.featurizer.featurize(X_train, flip=True)
            X_all = np.vstack([X_std, X_flip])
            y_all = np.vstack([y_vals, y_vals])
        else:
            X_all, y_all = X_std, y_vals
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X_all)
        kernel = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
        self.models = []
        for i in range(3):
            gp = GaussianProcessRegressor(kernel=kernel, alpha=1e-6, normalize_y=True, random_state=42)
            gp.fit(X_scaled, y_all[:, i])
            self.models.append(gp)

    def predict(self, X_test):
        X_std = self.featurizer.featurize(X_test, flip=False)
        if self.data_type == 'full':
            X_flip = self.featurizer.featurize(X_test, flip=True)
            X_all = np.vstack([X_std, X_flip])
        else:
            X_all = X_std
        X_scaled = self.scaler.transform(X_all)
        preds = []
        for gp in self.models:
            preds.append(gp.predict(X_scaled))
        out = np.vstack(preds).T
        if self.data_type == 'full':
            n = len(X_std)
            out = 0.5 * (out[:n] + out[n:])
        return torch.tensor(np.clip(out, 0, 1), dtype=torch.double)


class WeightedHuberLoss(nn.Module):
    def __init__(self, weights=[1.0, 1.0, 2.0]):
        super().__init__()
        self.weights = torch.tensor(weights, dtype=torch.double)
        self.huber = nn.HuberLoss(reduction='none')

    def forward(self, pred, target):
        huber_loss = self.huber(pred, target)
        weighted_loss = huber_loss * self.weights.to(pred.device)
        return weighted_loss.mean()


class MLPModelInternal(nn.Module):
    def __init__(self, input_dim, hidden_dims=[32, 16], output_dim=3, dropout=0.05):
        super().__init__()
        layers = [nn.BatchNorm1d(input_dim)]
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([nn.Linear(prev_dim, h_dim), nn.BatchNorm1d(h_dim), nn.ReLU(), nn.Dropout(dropout)])
            prev_dim = h_dim
        layers.extend([nn.Linear(prev_dim, output_dim), nn.Sigmoid()])
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)


class WeightedMLPEnsemble030:
    def __init__(self, hidden_dims=[32, 16], n_models=5, data='single', loss_weights=[1.0, 1.0, 2.0]):
        self.hidden_dims = hidden_dims
        self.n_models = n_models
        self.data_type = data
        self.loss_weights = loss_weights
        self.featurizer = FullFeaturizer030(mixed=(data=='full'))
        self.models = []
        self.scaler = None

    def train_model(self, X_train, y_train, epochs=200, batch_size=32, lr=5e-4):
        X_std = self.featurizer.featurize_torch(X_train, flip=False)
        y_vals = torch.tensor(y_train.values, dtype=torch.double)
        if self.data_type == 'full':
            X_flip = self.featurizer.featurize_torch(X_train, flip=True)
            X_all = torch.cat([X_std, X_flip], dim=0)
            y_all = torch.cat([y_vals, y_vals], dim=0)
        else:
            X_all, y_all = X_std, y_vals

        self.scaler = StandardScaler()
        X_all_np = X_all.cpu().numpy()
        X_all_scaled = self.scaler.fit_transform(X_all_np)
        X_all = torch.tensor(X_all_scaled, dtype=torch.double)

        dataset = TensorDataset(X_all, y_all)
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        self.models = []
        loss_fn = WeightedHuberLoss(self.loss_weights)

        for seed in range(self.n_models):
            torch.manual_seed(42 + seed)
            model = MLPModelInternal(X_all.shape[1], hidden_dims=self.hidden_dims).double()
            optim = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
            model.train()
            for _ in range(epochs):
                for xb, yb in loader:
                    pred = model(xb)
                    loss = loss_fn(pred, yb)
                    optim.zero_grad()
                    loss.backward()
                    optim.step()
            self.models.append(model.eval())

    def predict(self, X_test):
        X_std = self.featurizer.featurize_torch(X_test, flip=False)
        if self.data_type == 'full':
            X_flip = self.featurizer.featurize_torch(X_test, flip=True)
            X_all = torch.cat([X_std, X_flip], dim=0)
        else:
            X_all = X_std
        X_all_np = X_all.cpu().numpy()
        X_scaled = self.scaler.transform(X_all_np)
        X_t = torch.tensor(X_scaled, dtype=torch.double)
        preds = []
        for model in self.models:
            with torch.no_grad():
                preds.append(model(X_t).cpu().numpy())
        out = np.mean(preds, axis=0)
        if self.data_type == 'full':
            n = len(X_std)
            out = 0.5 * (out[:n] + out[n:])
        return torch.tensor(np.clip(out, 0, 1), dtype=torch.double)


class LGBMWrapper030:
    def __init__(self, data='single'):
        self.data_type = data
        self.featurizer = FullFeaturizer030(mixed=(data=='full'))
        self.models = []

    def train_model(self, X_train, y_train):
        X_std = self.featurizer.featurize(X_train, flip=False)
        y_vals = y_train.values
        if self.data_type == 'full':
            X_flip = self.featurizer.featurize(X_train, flip=True)
            X_all = np.vstack([X_std, X_flip])
            y_all = np.vstack([y_vals, y_vals])
        else:
            X_all, y_all = X_std, y_vals
        params = {
            'objective': 'regression',
            'metric': 'mse',
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': -1,
            'seed': 42
        }
        self.models = []
        for i in range(3):
            dtrain = lgb.Dataset(X_all, label=y_all[:, i])
            m = lgb.train(params, dtrain, num_boost_round=200)
            self.models.append(m)

    def predict(self, X_test):
        X_std = self.featurizer.featurize(X_test, flip=False)
        if self.data_type == 'full':
            X_flip = self.featurizer.featurize(X_test, flip=True)
            X_all = np.vstack([X_std, X_flip])
        else:
            X_all = X_std
        preds = []
        for i in range(3):
            preds.append(self.models[i].predict(X_all))
        out = np.vstack(preds).T
        if self.data_type == 'full':
            n = len(X_std)
            out = 0.5 * (out[:n] + out[n:])
        return torch.tensor(np.clip(out, 0, 1), dtype=torch.double)


class GPMLPLGBMEnsemble_TemplateSafe(BaseModel):
    def __init__(self, data='single'):
        self.data_type = data
        self.gp = GPWrapper030(data=data)
        self.mlp = WeightedMLPEnsemble030(hidden_dims=[32, 16], n_models=5, data=data, loss_weights=[1.0, 1.0, 2.0])
        self.lgbm = LGBMWrapper030(data=data)
        self.weights = {'gp': 0.2, 'mlp': 0.5, 'lgbm': 0.3}

    def train_model(self, X_train, y_train, device=None, verbose=False):
        self.gp.train_model(X_train, y_train)
        self.mlp.train_model(X_train, y_train)
        self.lgbm.train_model(X_train, y_train)

    def predict(self, X):
        gp_pred = self.gp.predict(X)
        mlp_pred = self.mlp.predict(X)
        lgbm_pred = self.lgbm.predict(X)
        out = (self.weights['gp'] * gp_pred + self.weights['mlp'] * mlp_pred + self.weights['lgbm'] * lgbm_pred)
        return torch.clamp(out, 0, 1)


class GPMLPLGBM_ADShrinkageV2(BaseModel):
    """AD shrinkage v2:
    - calibrate alpha(d) on train_inner/cal_inner
    - then refit base on full fold
    - refit scaler+NN on full fold (unsupervised)
    - use HINGE so alpha=0 for d<=tau (tau=quantile of train distances)
    - fallback is kNN-mean-y (per query) instead of global mean
    - optionally disable shrinkage on single (alpha=0) if not useful
    """

    def __init__(self, data='single', n_neighbors=10, tau_q=0.70, alpha_max=0.6, random_state=42, apply_on_single=False):
        self.data_type = data
        self.n_neighbors = n_neighbors
        self.tau_q = tau_q
        self.alpha_max = alpha_max
        self.random_state = random_state
        self.apply_on_single = apply_on_single

        self.base = GPMLPLGBMEnsemble_TemplateSafe(data=data)
        # for predicting fallback knn mean y
        self.y_train_full = None
        self.scaler = None
        self.nn = None
        self.tau = None
        self.iso_models = None

    def _dist_features(self, X: pd.DataFrame):
        if self.data_type == 'single':
            return _dist_features_single(X)
        else:
            return _dist_features_full(X, flip=False)

    def _groups(self, X: pd.DataFrame):
        if self.data_type == 'single':
            return X['SOLVENT NAME'].values
        else:
            # canonicalize pair
            a = X['SOLVENT A NAME'].astype(str).values
            b = X['SOLVENT B NAME'].astype(str).values
            ab = np.where(a < b, a + '||' + b, b + '||' + a)
            return ab

    def train_model(self, X_train, y_train, device=None, verbose=False):
        # Split into train_inner/cal_inner by groups
        groups = self._groups(X_train)
        gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=self.random_state)
        tr_idx, cal_idx = next(gss.split(X_train, y_train, groups=groups))
        X_tr = X_train.iloc[tr_idx]
        y_tr = y_train.iloc[tr_idx]
        X_cal = X_train.iloc[cal_idx]
        y_cal = y_train.iloc[cal_idx]

        # Train base on train_inner for calibration
        base_inner = GPMLPLGBMEnsemble_TemplateSafe(data=self.data_type)
        base_inner.train_model(X_tr, y_tr)
        pred_cal = base_inner.predict(X_cal).detach().cpu().numpy()

        # Distance model fit on train_inner for calibration distances
        D_tr = self._dist_features(X_tr)
        D_cal = self._dist_features(X_cal)
        scaler_tr = StandardScaler()
        D_tr_s = scaler_tr.fit_transform(D_tr)
        nn_tr = NearestNeighbors(n_neighbors=self.n_neighbors, metric='euclidean')
        nn_tr.fit(D_tr_s)
        d_cal = nn_tr.kneighbors(scaler_tr.transform(D_cal), return_distance=True)[0].mean(axis=1)

        # Also compute train distances for hinge tau
        d_tr = nn_tr.kneighbors(D_tr_s, return_distance=True)[0].mean(axis=1)
        tau = float(np.quantile(d_tr, self.tau_q))

        # hinge distances
        d_cal_h = np.maximum(0.0, d_cal - tau)

        # kNN fallback on cal: knn-mean-y from train_inner
        # neighbors indices already available via nn_tr on D_cal
        dist_mat, idx_mat = nn_tr.kneighbors(scaler_tr.transform(D_cal), return_distance=True)
        w = 1.0 / (dist_mat + 1e-6)
        w = w / w.sum(axis=1, keepdims=True)
        y_tr_vals = y_tr.values
        knn_cal = np.einsum('ij,ijk->ik', w, y_tr_vals[idx_mat])

        # learn alpha(d) per target in bins, then isotonic on hinge distance
        # If no shrink on single and single mode, force alpha=0.
        if (self.data_type == 'single') and (not self.apply_on_single):
            self.iso_models = [IsotonicRegression(increasing=True, y_min=0.0, y_max=0.0, out_of_bounds='clip').fit([0.0, 1.0], [0.0, 0.0]) for _ in range(3)]
            tau = 1e9
        else:
            qs = np.quantile(d_cal_h, np.linspace(0, 1, 11))
            qs = np.unique(qs)
            if len(qs) < 3:
                self.iso_models = [IsotonicRegression(increasing=True, y_min=0.0, y_max=self.alpha_max, out_of_bounds='clip').fit([0.0, 1.0], [0.0, 0.0]) for _ in range(3)]
            else:
                bin_ids = np.digitize(d_cal_h, qs[1:-1], right=True)
                centers = []
                alpha_t = [[], [], []]
                for b in range(bin_ids.min(), bin_ids.max() + 1):
                    m = bin_ids == b
                    if m.sum() < 10:
                        continue
                    lo = qs[b]
                    hi = qs[b + 1] if (b + 1) < len(qs) else qs[-1]
                    centers.append((lo + hi) / 2)
                    for t in range(3):
                        yb = y_cal.values[m, t]
                        pb = pred_cal[m, t]
                        kb = knn_cal[m, t]
                        num = np.sum((yb - pb) * (kb - pb))
                        den = np.sum((kb - pb) ** 2) + 1e-12
                        a = float(np.clip(num / den, 0.0, self.alpha_max))
                        alpha_t[t].append(a)

                if len(centers) < 2:
                    self.iso_models = [IsotonicRegression(increasing=True, y_min=0.0, y_max=self.alpha_max, out_of_bounds='clip').fit([0.0, 1.0], [0.0, 0.0]) for _ in range(3)]
                else:
                    order = np.argsort(centers)
                    x = np.array(centers)[order]
                    self.iso_models = []
                    for t in range(3):
                        y = np.array(alpha_t[t])[order]
                        iso = IsotonicRegression(increasing=True, y_min=0.0, y_max=self.alpha_max, out_of_bounds='clip')
                        iso.fit(x, y)
                        self.iso_models.append(iso)

        # Refit base on full fold training
        self.base.train_model(X_train, y_train)

        # Refit distance scaler+NN on full fold (unsupervised)
        D_full = self._dist_features(X_train)
        self.scaler = StandardScaler()
        D_full_s = self.scaler.fit_transform(D_full)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors, metric='euclidean')
        self.nn.fit(D_full_s)
        # Store full-fold y for kNN fallback
        self.y_train_full = y_train.reset_index(drop=True).values
        self.tau = tau

    def predict(self, X):
        base_pred = self.base.predict(X).detach().cpu().numpy()
        if (self.data_type == 'single') and (not self.apply_on_single):
            return torch.tensor(np.clip(base_pred, 0.0, 1.0), dtype=torch.double)

        D = self._dist_features(X)
        D_s = self.scaler.transform(D)
        dist_mat, idx_mat = self.nn.kneighbors(D_s, return_distance=True)
        d = dist_mat.mean(axis=1)
        d_h = np.maximum(0.0, d - self.tau)

        # kNN mean y fallback
        w = 1.0 / (dist_mat + 1e-6)
        w = w / w.sum(axis=1, keepdims=True)
        knn_pred = np.einsum('ij,ijk->ik', w, self.y_train_full[idx_mat])

        out = base_pred.copy()
        for t in range(3):
            alpha = self.iso_models[t].predict(d_h)
            alpha = np.clip(alpha, 0.0, self.alpha_max)
            out[:, t] = (1 - alpha) * out[:, t] + alpha * knn_pred[:, t]

        out = np.clip(out, 0.0, 1.0)
        return torch.tensor(out, dtype=torch.double)


print('GPMLPLGBMEnsemble_TemplateSafe defined. (Flip bug fixed)')
print('GPMLPLGBM_ADShrinkageV2 defined.')


GPMLPLGBMEnsemble_TemplateSafe defined. (Flip bug fixed)
GPMLPLGBM_ADShrinkage defined.


In [6]:
# Quick test to verify model works
print("Testing model...")
X, Y = load_data("single_solvent")
print(f"Single solvent data: X={X.shape}, Y={Y.shape}")

# Test one fold
split_gen = generate_leave_one_out_splits(X, Y)
(train_X, train_Y), (test_X, test_Y) = next(split_gen)

model = GPMLPLGBM_ADShrinkageV2(data='single')
model.train_model(train_X, train_Y)
preds = model.predict(test_X)

print(f"Predictions shape: {preds.shape}")
print(f"Predictions range: [{preds.min():.4f}, {preds.max():.4f}]")
print("Model test passed!")

Testing model...
Single solvent data: X=(656, 3), Y=(656, 3)


Predictions shape: torch.Size([37, 3])
Predictions range: [0.0408, 0.8741]
Model test passed!


In [7]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = GPMLPLGBM_ADShrinkageV2() # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [01:01, 61.95s/it]

2it [02:03, 61.51s/it]

3it [03:00, 59.72s/it]

4it [03:58, 58.94s/it]

5it [05:01, 60.40s/it]

6it [06:05, 61.48s/it]

7it [07:06, 61.61s/it]

8it [08:07, 61.35s/it]

9it [09:11, 61.97s/it]

10it [10:13, 62.12s/it]

11it [11:16, 62.49s/it]

12it [12:20, 62.85s/it]

13it [13:21, 62.26s/it]

14it [14:24, 62.36s/it]

15it [15:26, 62.50s/it]

16it [16:28, 62.30s/it]

17it [17:32, 62.78s/it]

18it [18:33, 62.32s/it]

19it [19:34, 61.84s/it]

20it [20:37, 62.07s/it]

21it [21:40, 62.44s/it]

22it [22:43, 62.54s/it]

23it [23:44, 62.09s/it]

24it [24:47, 62.47s/it]

24it [24:47, 61.99s/it]




In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = GPMLPLGBM_ADShrinkage(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [04:08, 248.28s/it]

2it [08:18, 249.59s/it]

3it [12:28, 249.59s/it]

4it [16:40, 250.42s/it]

5it [20:53, 251.64s/it]

6it [25:05, 251.56s/it]

7it [29:40, 259.16s/it]

8it [33:46, 255.06s/it]

9it [38:25, 262.64s/it]

10it [42:33, 257.97s/it]

11it [47:00, 260.79s/it]

12it [51:07, 256.66s/it]

13it [55:43, 262.55s/it]

13it [55:43, 257.21s/it]




In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

print(f"Submission saved to /home/submission/submission.csv")
print(f"Total rows: {len(submission)}")

Submission saved to /home/submission/submission.csv
Total rows: 1883
