In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found,
# so the available input files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sys
sys.path.append('/kaggle/input/catechol-benchmark-hackathon/')

from utils import INPUT_LABELS_FULL_SOLVENT, INPUT_LABELS_SINGLE_SOLVENT, INPUT_LABELS_NUMERIC, INPUT_LABELS_SINGLE_FEATURES, INPUT_LABELS_FULL_FEATURES, load_data, load_features, generate_leave_one_out_splits, generate_leave_one_ramp_out_splits

In [None]:
from abc import ABC, abstractmethod

class SmilesFeaturizer(ABC):
    """Base class for featurizers that turn input rows into model-ready features.

    Concrete featurizers must override both ``__init__`` and ``featurize``;
    the versions here only raise ``NotImplementedError``.
    """

    def __init__(self):
        # The base class holds no state; subclasses supply their own constructor.
        raise NotImplementedError

    def featurize(self, X):
        """Return the feature representation of ``X``.

        Args:
            X: The inputs to featurize. The featurizers in this notebook
                pass a pandas data frame.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        # The signature is (self, X) so that it matches how every subclass in
        # this notebook implements and calls featurize.
        raise NotImplementedError

class BaseModel(ABC):
    """Base class describing the interface every competition model exposes.

    A model is trained with ``train_model(X_train, y_train)`` and queried with
    ``predict(X)``. Both methods raise ``NotImplementedError`` here and are
    meant to be overridden.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs ``X_train`` and targets ``y_train``.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for the test inputs ``X``.

        ``X`` defaults to ``None`` only so that existing zero-argument calls
        keep working; implementations in this notebook all take the test
        inputs as their single argument.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError

In the cells below we create the base classes of the two main objects you must write for the competition. 

The first thing to write is a SmilesFeaturizer, which will take the solvent molecules and create a machine-learning ready featurization of the molecule. Finding better ways of featurizing solvents is one of the goals of the hackathon, however, you can also skip this step and use the pre-computed featurizations given in the utils file. Further down, you can see a SmilesFeaturizer that loads all the precomputed representations. A **featurizer** object simply consists of:
- An initialization function
- A featurize function that takes a data frame of inputs and returns the corresponding feature tensor

The second is a **model**, which has:
- An initialization function, where the model internally defines which featurizer to use
- A "train_model" which lets the model train on data given by X_train, y_train as pandas data-frames. 
- A "predict" which takes a data frame of test inputs and makes a prediction

In the next cell we define two featurizers, which allow you to use the pre-computed featurizations from the original benchmark paper. These are:

- drfps
- fragprints
- acs_pca_descriptors
- spange_descriptors

You can refer to the paper for more details on them. We also include the simple SMILES string featurization which can be chained into more complicated representations.

The first featurizer simply uses the features directly. The second one is expanded to featurize *mixed* solvents too, which is done by taking a weighted average of the two single-solvent features.

We also show how to write code for a simple multi-layer perceptron on the data.

In [None]:
class PerTargetEnsembleModel:
    """Per-target weighted blend of two ``BetterCatecholModel`` regressors.

    For each target column a pair of models is built, one on the
    ``acs_pca_descriptors`` table and one on ``spange_descriptors``. The "SM"
    target uses the "hgb" base type, the other targets use "etr".

    NOTE(review): a class with this same name is defined again in a later cell
    of this notebook; when the notebook is run top to bottom the later
    definition is the one bound to the name.

    Args:
        weights: Blend weights applied, in order, to the two models of each
            target. Defaults to ``(0.65, 0.35)``.

    Raises:
        ValueError: If ``weights`` does not contain exactly two values.
    """

    def __init__(self, weights=(0.65, 0.35)):
        self.weights = tuple(weights)
        # Validate before building the models, which read files from disk.
        if len(self.weights) != 2:
            raise ValueError("weights must contain exactly two values, one per model")

        self.targets = ["Product 2", "Product 3", "SM"]
        self.models = {}

        for t in self.targets:
            base_type = "hgb" if t == "SM" else "etr"
            self.models[t] = [
                BetterCatecholModel(feature_table="acs_pca_descriptors", base_type=base_type),
                BetterCatecholModel(feature_table="spange_descriptors", base_type=base_type),
            ]

    def train_model(self, X, Y):
        """Fit every model on ``X`` against its own single target column of ``Y``."""
        for t in self.targets:
            # Double brackets keep a one-column data frame rather than a series.
            y_single = Y[[t]]
            for m in self.models[t]:
                m.train_model(X, y_single)

    def predict(self, X):
        """Return the blended predictions for ``X``.

        Returns:
            A ``torch.double`` tensor with one column per entry of
            ``self.targets``, clipped to the range [0, 1].

        Raises:
            RuntimeError: If any underlying model has not been trained.
        """
        preds = []

        for t in self.targets:
            # Weighted (not plain average) blend of the per-target models.
            pred_t = 0.0
            for w, m in zip(self.weights, self.models[t]):
                if m.model is None:
                    raise RuntimeError(
                        f"Model for target {t!r} has not been trained; call train_model first."
                    )
                # The fitted pipeline is used directly so that clipping happens
                # once, after blending, rather than per model.
                p = m.model.predict(m._build_X(X))
                pred_t = pred_t + w * np.asarray(p).reshape(-1, 1)
            preds.append(pred_t)

        pred = np.hstack(preds)
        pred = np.clip(pred, 0, 1)

        return torch.tensor(pred, dtype=torch.double)


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

torch.set_default_dtype(torch.double)

# ============================================================
# Featurizers (TEMPLATE – DO NOT CHANGE LOGIC)
# ============================================================

class PrecomputedFeaturizer(SmilesFeaturizer):
    """Featurizer for single-solvent rows backed by a precomputed feature table.

    The table named by ``features`` is loaded once with ``load_features`` and
    is then indexed by solvent name.
    """

    def __init__(self, features="spange_descriptors"):
        # presumably a data frame indexed by solvent name (it is used via
        # .shape and .loc below) -- confirm against utils.load_features
        self.featurizer = load_features(features)
        # NOTE(review): the "+ 2" presumably accounts for the columns listed in
        # INPUT_LABELS_NUMERIC -- confirm that list has exactly two entries.
        self.feats_dim = self.featurizer.shape[1] + 2

    def featurize(self, X):
        """Return numeric inputs concatenated with the row's solvent features.

        Args:
            X: Data frame holding the ``INPUT_LABELS_NUMERIC`` columns and a
                "SOLVENT NAME" column.

        Returns:
            A tensor with the numeric columns first and the looked-up solvent
            features after them, joined along dimension 1.
        """
        X_num = torch.tensor(X[INPUT_LABELS_NUMERIC].values)
        # Label-based lookup: one feature row per entry of "SOLVENT NAME".
        X_sol = torch.tensor(self.featurizer.loc[X["SOLVENT NAME"]].values)
        return torch.cat([X_num, X_sol], dim=1)


class PrecomputedFeaturizerMixed(SmilesFeaturizer):
    """Featurizer for two-solvent mixtures backed by a precomputed feature table.

    The mixture is represented by a weighted combination of the feature rows
    of solvent A and solvent B.
    """

    def __init__(self, features="spange_descriptors"):
        # presumably a data frame indexed by solvent name (it is used via
        # .shape and .loc below) -- confirm against utils.load_features
        self.featurizer = load_features(features)
        # NOTE(review): the "+ 2" presumably accounts for the columns listed in
        # INPUT_LABELS_NUMERIC -- confirm that list has exactly two entries.
        self.feats_dim = self.featurizer.shape[1] + 2

    def featurize(self, X):
        """Return numeric inputs concatenated with the blended solvent features.

        Args:
            X: Data frame holding the ``INPUT_LABELS_NUMERIC`` columns plus
                "SOLVENT A NAME", "SOLVENT B NAME" and "SolventB%".

        Returns:
            A tensor with the numeric columns first and the blended solvent
            features after them, joined along dimension 1.
        """
        X_num = torch.tensor(X[INPUT_LABELS_NUMERIC].values)

        A = self.featurizer.loc[X["SOLVENT A NAME"]].values
        B = self.featurizer.loc[X["SOLVENT B NAME"]].values
        # Column vector so it broadcasts across the feature columns of A and B.
        # NOTE(review): used unscaled as the weight of B, so this assumes
        # "SolventB%" is a fraction in [0, 1] -- confirm against the data.
        frac_b = X["SolventB%"].values.reshape(-1, 1)

        mix = A * (1 - frac_b) + B * frac_b
        X_mix = torch.tensor(mix)

        return torch.cat([X_num, X_mix], dim=1)


# ============================================================
# TEMPLATE BASELINE MODEL (KEEP UNCHANGED)
# ============================================================

class MLPModel(nn.Module, BaseModel):
    """Baseline multi-layer perceptron on precomputed solvent features.

    Args:
        features: Name of the precomputed feature table handed to the
            featurizer.
        hidden_dims: Width of each hidden layer; every hidden ``nn.Linear`` is
            followed by a ``nn.ReLU``.
        output_dim: Number of outputs of the final linear layer.
        data: "single" selects ``PrecomputedFeaturizer``; any other value
            selects ``PrecomputedFeaturizerMixed``.
    """

    def __init__(self, features="spange_descriptors", hidden_dims=[64, 64], output_dim=3, data="single"):
        super().__init__()

        self.featurizer = (
            PrecomputedFeaturizer(features)
            if data == "single"
            else PrecomputedFeaturizerMixed(features)
        )

        # Build Linear -> ReLU pairs, tracking the running input width in `dim`.
        layers = []
        dim = self.featurizer.feats_dim
        for h in hidden_dims:
            layers += [nn.Linear(dim, h), nn.ReLU()]
            dim = h

        # Final projection has no activation.
        layers.append(nn.Linear(dim, output_dim))
        self.net = nn.Sequential(*layers)

    def train_model(self, X, Y, **kwargs):
        """Fit the network on ``X``/``Y`` with 100 full-batch Adam steps on MSE loss.

        Args:
            X: Data frame of inputs, passed through ``self.featurizer``.
            Y: Data frame of targets; its ``.values`` are used directly.
            **kwargs: Accepted but not used by this implementation.
        """
        X = self.featurizer.featurize(X)
        Y = torch.tensor(Y.values)

        opt = torch.optim.Adam(self.parameters(), lr=1e-3)
        loss_fn = nn.MSELoss()

        # Each iteration is one gradient step over the whole training set.
        for _ in range(100):
            opt.zero_grad()
            loss = loss_fn(self.net(X), Y)
            loss.backward()
            opt.step()

    def predict(self, X):
        """Return the network output for the featurized ``X``.

        The forward pass is not wrapped in ``torch.no_grad()``.
        """
        return self.net(self.featurizer.featurize(X))


# ============================================================
# CUSTOM SKLEARN MODEL (FIXED)
# ============================================================

import numpy as np
import pandas as pd
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


class BetterCatecholModel:
    """Tree-ensemble regressor on numeric inputs plus looked-up solvent descriptors.

    Solvent descriptors are read from
    ``/kaggle/input/catechol-benchmark-hackathon/{feature_table}_lookup.csv``.
    Non-numeric or missing descriptor values are replaced by 0.

    Args:
        feature_table: Name of the descriptor table (prefix of the lookup CSV).
        base_type: "hgb" selects ``HistGradientBoostingRegressor``; any other
            value selects ``ExtraTreesRegressor``.
        solvent_b_scale: Divisor applied to the "SolventB%" column to obtain
            the mixing fraction of solvent B. The default of 100.0 treats the
            column as a percentage.
            NOTE(review): ``PrecomputedFeaturizerMixed`` in this notebook uses
            "SolventB%" unscaled as the mixing weight, so the two disagree on
            the column's units. Confirm the column's range and pass
            ``solvent_b_scale=1.0`` if it is already a fraction in [0, 1].

    Raises:
        ValueError: If ``solvent_b_scale`` is zero.
    """

    def __init__(self, feature_table="spange_descriptors", base_type="hgb", solvent_b_scale=100.0):
        # Validate before touching the file system.
        if not solvent_b_scale:
            raise ValueError("solvent_b_scale must be non-zero")

        self.base_type = base_type
        self.solvent_b_scale = float(solvent_b_scale)
        self.lookup = (
            pd.read_csv(
                f"/kaggle/input/catechol-benchmark-hackathon/{feature_table}_lookup.csv",
                index_col=0,
            )
            .apply(pd.to_numeric, errors="coerce")
            .fillna(0)
        )
        # Set by train_model; None means "not trained yet".
        self.model = None

    def _vec(self, s):
        """Return the descriptor vector for solvent ``s``.

        Unknown solvents fall back to an all-zero vector so that prediction
        can proceed, but a warning is emitted because the fallback usually
        signals a misspelled or missing solvent name.
        """
        if s in self.lookup.index:
            return self.lookup.loc[s].values

        import warnings

        warnings.warn(
            f"Solvent {s!r} not found in the descriptor lookup; using an all-zero feature vector.",
            RuntimeWarning,
            stacklevel=2,
        )
        return np.zeros(self.lookup.shape[1])

    def _build_X(self, X):
        """Assemble the numeric design matrix for the rows of ``X``.

        When ``X`` has a "SOLVENT NAME" column the columns are
        [residence time, temperature, solvent descriptors]. Otherwise ``X`` is
        treated as mixture data and the columns are
        [residence time, temperature, fraction of B, blended descriptors].
        """
        rt = X["Residence Time"].values.reshape(-1, 1)
        temp = X["Temperature"].values.reshape(-1, 1)

        if "SOLVENT NAME" in X.columns:
            S = np.vstack([self._vec(s) for s in X["SOLVENT NAME"]])
            return np.hstack([rt, temp, S])

        # Column vector so it broadcasts across the descriptor columns.
        frac_b = X["SolventB%"].values.reshape(-1, 1) / self.solvent_b_scale
        A = np.vstack([self._vec(s) for s in X["SOLVENT A NAME"]])
        B = np.vstack([self._vec(s) for s in X["SOLVENT B NAME"]])
        mix = (1 - frac_b) * A + frac_b * B

        return np.hstack([rt, temp, frac_b, mix])

    def train_model(self, X, Y):
        """Fit a fresh scaler + multi-output regressor pipeline on ``X``/``Y``.

        Any previously fitted pipeline is discarded, so the same instance can
        be retrained for every cross-validation fold.
        """
        Xf = self._build_X(X)
        y = Y.values

        if self.base_type == "hgb":
            base = HistGradientBoostingRegressor(
                max_depth=7,
                max_iter=700,
                learning_rate=0.04,
                # Fixed seed, matching the ExtraTrees branch, so repeated runs agree.
                random_state=42,
            )
        else:
            base = ExtraTreesRegressor(
                n_estimators=900,
                min_samples_leaf=2,
                random_state=42,
                n_jobs=-1,
            )

        self.model = Pipeline(
            [("scaler", StandardScaler()), ("reg", MultiOutputRegressor(base))]
        )
        self.model.fit(Xf, y)

    def predict(self, X):
        """Return predictions for ``X`` as a ``torch.double`` tensor clipped to [0, 1].

        Raises:
            RuntimeError: If ``train_model`` has not been called yet.
        """
        if self.model is None:
            raise RuntimeError("Model has not been trained; call train_model first.")

        pred = np.clip(self.model.predict(self._build_X(X)), 0, 1)
        return torch.tensor(pred, dtype=torch.double)


# ============================================================
# PER-TARGET + HETEROGENEOUS ENSEMBLE
# ============================================================

class PerTargetEnsembleModel:
    """Heterogeneous ensemble that fits a separate pair of models per target.

    Each of the targets "Product 2", "Product 3" and "SM" gets two
    ``BetterCatecholModel`` instances: the first on ``acs_pca_descriptors``,
    the second on ``spange_descriptors``. "SM" uses the "hgb" base type and
    the other targets use "etr". Predictions of a pair are combined with a
    fixed weighted sum.

    Args:
        weights: Blend weights for the (first, second) model of every target.
            Defaults to ``(0.65, 0.35)``.

    Raises:
        ValueError: If ``weights`` does not contain exactly two values.
    """

    def __init__(self, weights=(0.65, 0.35)):
        self.weights = tuple(weights)
        # Checked up front: constructing the models below reads CSV files.
        if len(self.weights) != 2:
            raise ValueError("weights must contain exactly two values, one per model")

        self.targets = ["Product 2", "Product 3", "SM"]
        self.models = {}

        for t in self.targets:
            base_type = "hgb" if t == "SM" else "etr"
            self.models[t] = [
                BetterCatecholModel("acs_pca_descriptors", base_type),
                BetterCatecholModel("spange_descriptors", base_type),
            ]

    def train_model(self, X, Y):
        """Train each target's models on ``X`` and that target's column of ``Y``."""
        for t in self.targets:
            # Y[[t]] stays a one-column data frame, which the models expect
            # because they read ``Y.values``.
            y_single = Y[[t]]
            for m in self.models[t]:
                m.train_model(X, y_single)

    def predict(self, X):
        """Blend the per-target model outputs into one prediction matrix.

        Returns:
            A ``torch.double`` tensor with one column per target, in the order
            of ``self.targets``, clipped to [0, 1].

        Raises:
            RuntimeError: If any underlying model has not been trained.
        """
        preds = []

        for t in self.targets:
            pred_t = 0.0
            for weight, member in zip(self.weights, self.models[t]):
                if member.model is None:
                    raise RuntimeError(
                        f"Model for target {t!r} has not been trained; call train_model first."
                    )
                # Use the fitted pipeline directly: clipping is applied once to
                # the blended result instead of to each member's output.
                raw = member.model.predict(member._build_X(X))
                pred_t = pred_t + weight * np.asarray(raw)
            preds.append(pred_t.reshape(-1, 1))

        pred = np.clip(np.hstack(preds), 0, 1)
        return torch.tensor(pred, dtype=torch.double)


# ============================================================
# TWO MODELS FOR CV
# ============================================================

# Separate ensemble instances for the two cross-validation loops below:
# one is used on the single-solvent data, the other on the full data.
model_single = PerTargetEnsembleModel()
model_full = PerTargetEnsembleModel()


From this point onward the cross-validation procedure is carried out. **For a submission to be valid the next three cells must be the final three of your submission, and you can only modify the lines where the models are defined.**

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = model_single # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = model_full # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################