In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the read-only Kaggle input directory and
# print its full path, one per line, in os.walk order.
kaggle_input_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in kaggle_input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sys
sys.path.append('/kaggle/input/catechol-benchmark-hackathon/')

from utils import INPUT_LABELS_FULL_SOLVENT, INPUT_LABELS_SINGLE_SOLVENT, INPUT_LABELS_NUMERIC, INPUT_LABELS_SINGLE_FEATURES, INPUT_LABELS_FULL_FEATURES, load_data, load_features, generate_leave_one_out_splits, generate_leave_one_ramp_out_splits

In [None]:
from sklearn.model_selection import GroupKFold
from typing import Any, Generator

# Overwrite utility functions to use GroupKFold (5 splits) instead of Leave-One-Out
# This ensures we respect the compute budget while maintaining unseen solvent validation
# Since we cannot modify the utils.py file on Kaggle platform directly, we redefine them here.

def generate_leave_one_out_splits(
    X: pd.DataFrame, Y: pd.DataFrame, n_splits: int = 5
) -> Generator[
    tuple[tuple[pd.DataFrame, pd.DataFrame], tuple[pd.DataFrame, pd.DataFrame]],
    Any,
    None,
]:
    """Yield grouped K-fold train/test splits keyed on the solvent name.

    The name is kept so that this definition shadows the helper of the same
    name imported from ``utils``, but the behaviour is a ``GroupKFold`` split,
    not leave-one-out: every row sharing a ``"SOLVENT NAME"`` value lands in
    the same fold, so each test fold only contains solvents absent from its
    training fold.

    NOTE(review): the number of folds produced here is at most ``n_splits``
    rather than one per solvent -- confirm that the downstream scoring accepts
    this fold layout.

    Args:
        X: Input frame; must contain a ``"SOLVENT NAME"`` column.
        Y: Target frame, row-aligned with ``X``.
        n_splits: Upper bound on the number of folds. The effective number is
            ``min(n_splits, number of distinct solvents)``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each fold.

    Raises:
        ValueError: If ``n_splits`` is below 2 or ``X`` holds fewer than two
            distinct solvents. As this is a generator, the error surfaces when
            the first split is requested, not when the function is called.
    """
    if n_splits < 2:
        raise ValueError(f"n_splits must be at least 2, got {n_splits}.")

    groups = X["SOLVENT NAME"]
    n_groups = len(groups.unique())
    if n_groups < 2:
        raise ValueError(
            "Grouped K-fold needs at least 2 distinct solvents in "
            f"'SOLVENT NAME', got {n_groups}."
        )

    # GroupKFold cannot create more folds than there are groups.
    gkf = GroupKFold(n_splits=min(n_splits, n_groups))

    for train_idx, test_idx in gkf.split(X, Y, groups):
        yield (
            (X.iloc[train_idx], Y.iloc[train_idx]),
            (X.iloc[test_idx], Y.iloc[test_idx]),
        )

def generate_leave_one_ramp_out_splits(
    X: pd.DataFrame, Y: pd.DataFrame, n_splits: int = 5
) -> Generator[
    tuple[tuple[pd.DataFrame, pd.DataFrame], tuple[pd.DataFrame, pd.DataFrame]],
    Any,
    None,
]:
    """Yield grouped K-fold train/test splits keyed on the solvent pair.

    The name is kept so that this definition shadows the helper of the same
    name imported from ``utils``, but the behaviour is a ``GroupKFold`` split:
    rows are grouped by the string ``"<SOLVENT A NAME>_<SOLVENT B NAME>"`` and
    a group is never divided between the training and test side of a fold.
    The pair is ordered, so (A, B) and (B, A) form two different groups.

    NOTE(review): the number of folds produced here is at most ``n_splits``
    rather than one per ramp -- confirm that the downstream scoring accepts
    this fold layout.

    Args:
        X: Input frame; must contain ``"SOLVENT A NAME"`` and
            ``"SOLVENT B NAME"`` columns.
        Y: Target frame, row-aligned with ``X``.
        n_splits: Upper bound on the number of folds. The effective number is
            ``min(n_splits, number of distinct solvent pairs)``.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` for each fold.

    Raises:
        ValueError: If ``n_splits`` is below 2 or ``X`` holds fewer than two
            distinct solvent pairs. As this is a generator, the error surfaces
            when the first split is requested, not when the function is called.
    """
    if n_splits < 2:
        raise ValueError(f"n_splits must be at least 2, got {n_splits}.")

    # One group label per row, built from the ordered (A, B) solvent pair.
    groups = X["SOLVENT A NAME"].astype(str) + "_" + X["SOLVENT B NAME"].astype(str)

    n_groups = len(groups.unique())
    if n_groups < 2:
        raise ValueError(
            "Grouped K-fold needs at least 2 distinct solvent pairs, "
            f"got {n_groups}."
        )

    # GroupKFold cannot create more folds than there are groups.
    gkf = GroupKFold(n_splits=min(n_splits, n_groups))

    for train_idx, test_idx in gkf.split(X, Y, groups):
        yield (
            (X.iloc[train_idx], Y.iloc[train_idx]),
            (X.iloc[test_idx], Y.iloc[test_idx]),
        )


In the cells below we create the base classes of the two main objects you must write for the competition. 

The first thing to write is a SmilesFeaturizer, which will take the solvent molecules and create a machine-learning ready featurization of the molecule. Finding better ways of featurizing solvents is one of the goals of the hackathon, however, you can also skip this step and use the pre-computed featurizations given in the utils file. Further down, you can see a SmilesFeaturizer that loads all the precomputed representations. A **featurizer** object simply consists of:
- An initialization function
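- A featurize function that takes a data frame of inputs and returns a numeric feature array for the model (a `torch.Tensor` in the featurizers below)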

The second object is a **model**, which has:
- An initialization function, where the model internally defines which featurizer to use
- A "train_model" which lets the model train on data given by X_train, y_train as pandas data-frames. 
- A "predict" which takes a data frame of test inputs and makes a prediction

In [None]:
from abc import ABC, abstractmethod

class SmilesFeaturizer(ABC):
    """Base interface for objects that turn solvent inputs into model features.

    Concrete featurizers are expected to override both ``__init__`` and
    ``featurize``; the versions defined here only raise ``NotImplementedError``.
    """

    def __init__(self):
        # Deliberately not constructible: subclasses provide their own __init__.
        raise NotImplementedError

    def featurize(self, X):
        """Convert the input frame ``X`` into features.

        Declared as an instance method taking only the inputs, which is the
        signature the concrete featurizers in this notebook implement.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError

class BaseModel(ABC):
    """Base interface for models: train on data frames, then predict.

    Subclasses override ``train_model`` and ``predict``; the versions defined
    here only raise ``NotImplementedError``.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs ``X_train`` and targets ``y_train``.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError

    def predict(self, X_test=None):
        """Return predictions for the test inputs ``X_test``.

        ``X_test`` is optional only so that the previous zero-argument call
        form keeps working; concrete models take the test inputs here.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError

In the next cell we define two featurizers, which allow you to use the pre-computed featurizations from the original benchmark paper. These are:

- drfps
- fragprints
- acs_pca_descriptors
- spange_descriptors

You can refer to the paper for more details on them. We also include the simple SMILES string featurization which can be chained into more complicated representations.

The first featurizer simply uses the features directly. The second one is expanded to featurize *mixed* solvents too, which is done by taking a weighted average of the two single-solvent features.

We also show how to write a simple multi-layer perceptron for the data, and how to blend it with XGBoost, random-forest and LightGBM regressors in a weighted ensemble.

In [None]:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
import numpy as np
import pandas as pd

# --- Featurizers ---
class PrecomputedFeaturizer(SmilesFeaturizer):
    """Featurizer for single-solvent rows backed by a pre-computed descriptor table.

    The table is obtained from ``load_features(features)`` and is looked up by
    solvent name with ``.loc``.
    """

    def __init__(self, features='spange_descriptors'):
        descriptor_table = load_features(features)
        self.features = descriptor_table
        # featurize() prepends two process columns (residence time, temperature).
        self.feats_dim = descriptor_table.shape[1] + 2

    def featurize(self, X):
        """Build the feature tensor for every row of ``X``.

        ``X`` must provide the columns 'Residence Time', 'Temperature' and
        'SOLVENT NAME'. The result is a float32 tensor whose columns are
        [residence time, temperature, solvent descriptors...].
        """
        process_columns = [
            X[column].values.reshape(-1, 1)
            for column in ('Residence Time', 'Temperature')
        ]
        # One descriptor row per input row, selected by solvent name.
        solvent_descriptors = self.features.loc[X['SOLVENT NAME']].values

        stacked = np.concatenate([*process_columns, solvent_descriptors], axis=1)
        return torch.tensor(stacked, dtype=torch.float32)

class PrecomputedFeaturizerMixed(SmilesFeaturizer):
    """Featurizer for two-solvent mixtures backed by a pre-computed descriptor table.

    The mixture descriptor is a linear blend of the descriptors of solvent A
    and solvent B, weighted by the 'SolventB%' column.
    """

    def __init__(self, features='spange_descriptors'):
        descriptor_table = load_features(features)
        self.features = descriptor_table
        # featurize() prepends three columns: residence time, temperature, %B.
        self.feats_dim = descriptor_table.shape[1] + 3

    def featurize(self, X):
        """Build the feature tensor for every row of ``X``.

        ``X`` must provide 'Residence Time', 'Temperature', 'SolventB%',
        'SOLVENT A NAME' and 'SOLVENT B NAME'. The result is a float32 tensor
        whose columns are [residence time, temperature, SolventB%, blended
        descriptors...].
        """
        residence = X['Residence Time'].values.reshape(-1, 1)
        temperature = X['Temperature'].values.reshape(-1, 1)
        # assumes 'SolventB%' is a fraction in [0, 1] rather than 0-100 -- TODO confirm
        frac_b = X['SolventB%'].values.reshape(-1, 1)

        descriptors_a = self.features.loc[X['SOLVENT A NAME']].values
        descriptors_b = self.features.loc[X['SOLVENT B NAME']].values

        # Column vector frac_b broadcasts across the descriptor columns.
        blended = (1 - frac_b) * descriptors_a + frac_b * descriptors_b

        stacked = np.concatenate([residence, temperature, frac_b, blended], axis=1)
        return torch.tensor(stacked, dtype=torch.float32)

# --- MLP ---
class EnhancedMLP(nn.Module):
    """Feed-forward regressor built from Linear -> BatchNorm1d -> ReLU -> Dropout stages.

    One stage is created per entry of ``hidden_dims``; a final ``Linear`` layer
    maps the last hidden width to ``output_dim``. All layers live in
    ``self.network`` (an ``nn.Sequential``).
    """

    def __init__(self, input_dim, output_dim=3, hidden_dims=[128, 64, 32], dropout=0.1):
        super().__init__()
        # Consecutive pairs of widths give the (in, out) size of each stage.
        widths = [input_dim, *hidden_dims]
        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        # Output head; with no hidden layers this maps input_dim directly.
        stages.append(nn.Linear(widths[-1], output_dim))
        self.network = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the raw outputs."""
        return self.network(x)

# --- Ensemble ---
class EnsembleModel(nn.Module, BaseModel):
    """Weighted blend of an MLP and three tree ensembles.

    The members are an ``EnhancedMLP``, XGBoost, a random forest and LightGBM
    (each tree model wrapped in ``MultiOutputRegressor``). All members are
    trained on the same standardized features, and ``predict`` returns the
    weighted sum of their predictions.

    Args:
        features: Name of the pre-computed descriptor set passed to the featurizer.
        hidden_dims: Hidden layer widths of the MLP.
        output_dim: Number of MLP outputs.
        dropout: Dropout probability used in the MLP.
        data: ``'single'`` selects ``PrecomputedFeaturizer``; any other value
            selects ``PrecomputedFeaturizerMixed``.
        use_tta: Stored on the instance; not read anywhere in this class.
        weights: Four blend weights in the order [MLP, XGB, RF, LGBM].
            Defaults to ``[0.4, 0.2, 0.2, 0.2]``.

    Raises:
        ValueError: If ``weights`` does not contain exactly four entries.
    """

    def __init__(self, features='spange_descriptors', hidden_dims=[128, 64, 32], output_dim=3, dropout=0.1, data='single', use_tta=True, weights=None):
        super(EnsembleModel, self).__init__()

        if data == 'single':
            self.smiles_featurizer = PrecomputedFeaturizer(features=features)
        else:
            self.smiles_featurizer = PrecomputedFeaturizerMixed(features=features)

        self.input_dim = self.smiles_featurizer.feats_dim
        self.use_tta = use_tta
        self.weights = weights if weights is not None else [0.4, 0.2, 0.2, 0.2] # MLP, XGB, RF, LGBM
        # predict() indexes weights[0..3]; fail early instead of raising an
        # IndexError at prediction time or silently ignoring extra entries.
        if len(self.weights) != 4:
            raise ValueError(
                f"weights must have exactly 4 entries (MLP, XGB, RF, LGBM), got {len(self.weights)}."
            )

        # MLP
        self.mlp = EnhancedMLP(self.input_dim, output_dim, hidden_dims, dropout)

        # XGBoost
        self.xgb_params = {'n_estimators': 300, 'learning_rate': 0.05, 'max_depth': 6, 'subsample': 0.8, 'n_jobs': -1, 'random_state': 42}
        self.xgb = None

        # RandomForest
        self.rf_params = {'n_estimators': 300, 'max_depth': 15, 'n_jobs': -1, 'random_state': 42}
        self.rf = None

        # LightGBM
        self.lgb_params = {'n_estimators': 300, 'learning_rate': 0.05, 'num_leaves': 31, 'n_jobs': -1, 'random_state': 42, 'verbose': -1}
        self.lgbm = None

        self.scaler = StandardScaler()

    def train_model(self, train_X, train_Y, criterion=nn.MSELoss, optimizer=torch.optim.Adam, num_epochs=100, batch_size=32, device="cpu", verbose=True, lr=1e-3):
        """Fit the scaler, the three tree ensembles and the MLP.

        Args:
            train_X: Input frame accepted by the configured featurizer.
            train_Y: Target frame; its ``.values`` are used as regression targets.
            criterion: Loss class, instantiated without arguments.
            optimizer: Optimizer class, called as ``optimizer(params, lr=lr)``.
            num_epochs: Number of passes over the training data for the MLP.
            batch_size: Mini-batch size for the MLP.
            device: Device the MLP is trained on; ``None`` picks CUDA when available.
            verbose: Accepted for interface compatibility; not used.
            lr: Learning rate for the MLP optimizer.
        """
        # Data Prep
        X_tensor = self.smiles_featurizer.featurize(train_X)
        X_np = X_tensor.numpy()
        X_scaled = self.scaler.fit_transform(X_np)

        # Ensure strict DataFrame format with string column names for LightGBM compatibility
        feature_names = [str(i) for i in range(X_scaled.shape[1])]
        X_scaled_df = pd.DataFrame(X_scaled, columns=feature_names)

        train_Y_np = train_Y.values

        # Train GBDTs with DataFrame input
        self.xgb = MultiOutputRegressor(xgb.XGBRegressor(**self.xgb_params))
        self.xgb.fit(X_scaled_df, train_Y_np)

        self.rf = MultiOutputRegressor(RandomForestRegressor(**self.rf_params))
        self.rf.fit(X_scaled_df, train_Y_np)

        self.lgbm = MultiOutputRegressor(lgb.LGBMRegressor(**self.lgb_params))
        self.lgbm.fit(X_scaled_df, train_Y_np)

        # Train MLP (still uses tensors)
        X_tensor_scaled = torch.tensor(X_scaled, dtype=torch.float32)
        train_Y_tensor = torch.tensor(train_Y_np, dtype=torch.float32)

        if device is None: device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.mlp.to(device)

        n_samples = X_tensor_scaled.shape[0]
        if n_samples < 2:
            # BatchNorm1d cannot compute batch statistics from a single row in
            # training mode, so the MLP keeps its initial weights (which is what
            # the unconditional drop_last=True used to result in here).
            return

        # Dropping the ragged last batch avoids a trailing 1-row batch that
        # BatchNorm1d rejects, but when the whole set fits into one batch it
        # would leave no batches at all and the MLP would never be trained.
        drop_last = n_samples > batch_size

        optimizer_inst = optimizer(self.mlp.parameters(), lr=lr)
        train_loader = DataLoader(TensorDataset(X_tensor_scaled, train_Y_tensor), batch_size=batch_size, shuffle=True, drop_last=drop_last)

        criterion_inst = criterion()
        for epoch in range(num_epochs):
            self.mlp.train()
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer_inst.zero_grad()
                loss = criterion_inst(self.mlp(inputs), targets)
                loss.backward()
                optimizer_inst.step()

    def predict(self, test_X):
        """Return the weighted ensemble prediction for ``test_X``.

        The inputs are featurized and scaled with the scaler fitted in
        ``train_model``; the four member predictions are then combined using
        ``self.weights``.

        Returns:
            A CPU ``torch.Tensor`` with one row of predictions per input row.
        """
        X_tensor = self.smiles_featurizer.featurize(test_X)
        X_np = X_tensor.numpy()
        X_scaled = self.scaler.transform(X_np)

        # Ensure strict DataFrame format with string column names for LightGBM compatibility
        feature_names = [str(i) for i in range(X_scaled.shape[1])]
        X_scaled_df = pd.DataFrame(X_scaled, columns=feature_names)

        # MLP Preds
        self.mlp.eval()
        # train_model() may have moved the MLP off the CPU; the inputs must be
        # created on the same device or the forward pass fails.
        mlp_device = next(self.mlp.parameters()).device
        with torch.no_grad():
            X_tensor_scaled = torch.tensor(X_scaled, dtype=torch.float32, device=mlp_device)
            mlp_preds = self.mlp(X_tensor_scaled).cpu().numpy()

        # GBDT Preds
        xgb_preds = self.xgb.predict(X_scaled_df)
        rf_preds = self.rf.predict(X_scaled_df)
        lgb_preds = self.lgbm.predict(X_scaled_df)

        # Weighted Ensemble
        final_preds = (self.weights[0] * mlp_preds +
                       self.weights[1] * xgb_preds +
                       self.weights[2] * rf_preds +
                       self.weights[3] * lgb_preds)

        return torch.tensor(final_preds)


In [None]:
def run_optuna_optimization(X_train, y_train, X_val, y_val, n_trials=20, data=None):
    """Tune ``EnsembleModel`` hyperparameters and blend weights with Optuna.

    Each trial builds an ``EnsembleModel``, trains it on ``(X_train, y_train)``
    for 50 MLP epochs and is scored by the mean absolute error of its
    predictions on ``(X_val, y_val)``.

    Args:
        X_train: Training inputs.
        y_train: Training targets.
        X_val: Validation inputs.
        y_val: Validation targets; ``y_val.values`` is compared to the predictions.
        n_trials: Number of Optuna trials.
        data: Featurizer mode forwarded to ``EnsembleModel`` (``'single'`` or
            ``'full'``). When ``None``, ``'single'`` is used if ``X_train`` has
            a 'SOLVENT NAME' column and ``'full'`` otherwise.

    Returns:
        ``study.best_params``. The ``w_*`` entries are the raw sampled values;
        they are normalized to sum to 1 inside each trial before being used,
        so normalize them the same way when re-using them.
    """
    # Imported here because no cell of this notebook imports optuna at module level.
    import optuna

    if data is None:
        # The single-solvent featurizer looks rows up by 'SOLVENT NAME'; frames
        # without that column need the mixed-solvent featurizer.
        data = 'single' if 'SOLVENT NAME' in X_train.columns else 'full'

    optuna.logging.set_verbosity(optuna.logging.WARNING)

    def objective(trial):
        # Params
        lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
        dropout = trial.suggest_float('dropout', 0.1, 0.5)
        hidden_dim_1 = trial.suggest_int('hidden_dim_1', 64, 256)

        xgb_depth = trial.suggest_int('xgb_depth', 3, 8)
        rf_depth = trial.suggest_int('rf_depth', 5, 15)
        lgb_leaves = trial.suggest_int('lgb_leaves', 15, 63)

        # Weights (Dirichlet-like via normalization)
        w_mlp = trial.suggest_float('w_mlp', 0.1, 1.0)
        w_xgb = trial.suggest_float('w_xgb', 0.1, 1.0)
        w_rf = trial.suggest_float('w_rf', 0.1, 1.0)
        w_lgb = trial.suggest_float('w_lgb', 0.1, 1.0)
        total_w = w_mlp + w_xgb + w_rf + w_lgb
        weights = [w_mlp/total_w, w_xgb/total_w, w_rf/total_w, w_lgb/total_w]

        # Hidden widths halve at each layer, starting from hidden_dim_1.
        model = EnsembleModel(hidden_dims=[hidden_dim_1, hidden_dim_1//2, hidden_dim_1//4], dropout=dropout, data=data, use_tta=False, weights=weights)
        # The tree models are only instantiated inside train_model(), so the
        # parameter dicts can still be adjusted here.
        model.xgb_params['max_depth'] = xgb_depth
        model.rf_params['max_depth'] = rf_depth
        model.lgb_params['num_leaves'] = lgb_leaves

        model.train_model(X_train, y_train, num_epochs=50, lr=lr, verbose=False)
        preds = model.predict(X_val).numpy()
        return np.mean(np.abs(preds - y_val.values))

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    print("Best params:", study.best_params)
    return study.best_params

In [None]:
# To run hyperparameter optimization, uncomment the following lines:
# print("Starting Optuna optimization...")
# X_full, Y_full = load_data("full")
# 
# # Split data for optimization
# from sklearn.model_selection import train_test_split
# X_train_opt, X_val_opt, y_train_opt, y_val_opt = train_test_split(X_full, Y_full, test_size=0.2, random_state=42)
# 
# # Run optimization
# best_params = run_optuna_optimization(X_train_opt, y_train_opt, X_val_opt, y_val_opt, n_trials=50)
# print("Optimization complete. Best params:", best_params)


From this point onward the cross-validation procedure is calculated. **For a submission to be valid the next three cells must be the final three of your submission, and you can only modify the lines where the models are defined.**

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='single') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################