In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports & Configuration

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from tqdm import tqdm

In [None]:
import sys
# Make the competition's helper module (imported as `utils` further down) importable.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
UTILS_DIR = '/kaggle/input/catechol-benchmark-hackathon'
if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)

# Load Utilities

In [None]:
from utils import (
    load_data,
    load_features,
    generate_leave_one_out_splits,
    generate_leave_one_ramp_out_splits,
    INPUT_LABELS_NUMERIC,
    INPUT_LABELS_SINGLE_FEATURES,
    INPUT_LABELS_FULL_FEATURES,
    TARGET_LABELS,
)

# Feature Engineering Helpers

In [None]:
def build_solvent_features(solvent_names, lookup):
    """Stack one descriptor row per solvent name into a 2-D array.

    Parameters
    ----------
    solvent_names : iterable of hashable
        Solvent identifiers, looked up in ``lookup.index``.
    lookup : pandas.DataFrame
        Descriptor table indexed by solvent name; one column per descriptor.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(solvent_names), lookup.shape[1])``. Names that
        are absent from ``lookup.index`` are represented by an all-zero row.
        An empty ``solvent_names`` yields an array of shape
        ``(0, lookup.shape[1])`` instead of raising.
    """
    n_descriptors = lookup.shape[1]
    known_names = lookup.index

    rows = [
        lookup.loc[name].values if name in known_names else np.zeros(n_descriptors)
        for name in solvent_names
    ]

    # np.vstack raises ValueError on an empty sequence, so handle that case
    # explicitly and keep the column count consistent with the lookup table.
    if not rows:
        return np.zeros((0, n_descriptors))
    return np.vstack(rows)


def make_single_solvent_features(X, descriptor_lookup):
    """Build the feature matrix for single-solvent rows.

    Concatenates, column-wise, the ``INPUT_LABELS_NUMERIC`` columns of ``X``
    with the descriptor row that ``build_solvent_features`` returns for each
    entry of ``X["SOLVENT NAME"]``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the ``INPUT_LABELS_NUMERIC`` columns and "SOLVENT NAME".
    descriptor_lookup : pandas.DataFrame
        Descriptor table passed through to ``build_solvent_features``.

    Returns
    -------
    numpy.ndarray
        Numeric columns followed by descriptor columns, one row per row of ``X``.
    """
    numeric_block = X[INPUT_LABELS_NUMERIC].values
    descriptor_block = build_solvent_features(X["SOLVENT NAME"], descriptor_lookup)
    return np.hstack((numeric_block, descriptor_block))


def make_mixture_features(X, descriptor_lookup):
    """Build the feature matrix for two-solvent mixture rows.

    The descriptors of solvent A and solvent B are blended linearly using the
    "SolventB%" column as the weight of solvent B, and the blend is appended
    to the ``INPUT_LABELS_NUMERIC`` columns of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the ``INPUT_LABELS_NUMERIC`` columns, "SOLVENT A NAME",
        "SOLVENT B NAME" and "SolventB%".
    descriptor_lookup : pandas.DataFrame
        Descriptor table passed through to ``build_solvent_features``.

    Returns
    -------
    numpy.ndarray
        Numeric columns followed by the blended descriptor columns.
    """
    numeric_block = X[INPUT_LABELS_NUMERIC].values
    descriptors_a = build_solvent_features(X["SOLVENT A NAME"], descriptor_lookup)
    descriptors_b = build_solvent_features(X["SOLVENT B NAME"], descriptor_lookup)

    # Column vector so the weight broadcasts across every descriptor column.
    # NOTE(review): the blend assumes "SolventB%" is a fraction in [0, 1], not
    # a percentage in [0, 100] -- confirm against the dataset.
    weight_b = X["SolventB%"].values.reshape(-1, 1)
    blended = (1 - weight_b) * descriptors_a + weight_b * descriptors_b

    return np.hstack((numeric_block, blended))

# Load Data & Descriptors

In [None]:
# Load both datasets and the solvent descriptor table via the competition helpers.
X_full, Y_full = load_data("full")
X_single, Y_single = load_data("single_solvent")
descriptor_lookup = load_features("spange_descriptors")

# Report the shape of each loaded table as a quick sanity check.
for label, table in (
    ("Full data shape:", X_full),
    ("Single-solvent data shape:", X_single),
    ("Descriptor dimension:", descriptor_lookup),
):
    print(label, table.shape)

# Feature Matrices

In [None]:
# Feature matrices for the complete datasets.
X_full_feat = make_mixture_features(X_full, descriptor_lookup)
X_single_feat = make_single_solvent_features(X_single, descriptor_lookup)

# Convert the targets to plain arrays. np.asarray accepts both the original
# DataFrame and an already-converted ndarray, so re-running this cell is safe;
# the previous `.values` access raised AttributeError on the second run because
# the names had been rebound to ndarrays.
Y_full = np.asarray(Y_full)
Y_single = np.asarray(Y_single)

# Model Definition

In [None]:
class CatecholModel:
    """Multi-target gradient-boosting regressor with clipped predictions.

    Wraps a scikit-learn ``Pipeline`` consisting of a ``StandardScaler``
    followed by a ``MultiOutputRegressor`` around a
    ``GradientBoostingRegressor``.

    Parameters
    ----------
    n_estimators : int, default 300
    learning_rate : float, default 0.05
    max_depth : int, default 4
    random_state : int, default 42
        All four are forwarded unchanged to ``GradientBoostingRegressor``.
        The defaults reproduce the previously hard-coded configuration, so
        ``CatecholModel()`` behaves exactly as before.
    """

    def __init__(self, n_estimators=300, learning_rate=0.05, max_depth=4,
                 random_state=42):
        base = GradientBoostingRegressor(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            random_state=random_state,
        )
        self.model = Pipeline([
            ("scaler", StandardScaler()),
            ("regressor", MultiOutputRegressor(base)),
        ])

    def fit(self, X, y):
        """Fit the pipeline on ``X`` / ``y`` and return ``self``.

        Returning ``self`` follows the scikit-learn estimator convention and
        allows ``CatecholModel().fit(X, y).predict(X_new)``; callers that
        ignore the return value are unaffected.
        """
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Predict all targets for ``X``, clipped element-wise to [0.0, 1.0]."""
        # NOTE(review): clipping presumes every target is bounded in [0, 1]
        # (e.g. a yield or fraction) -- confirm against the dataset.
        raw = self.model.predict(X)
        return np.clip(raw, 0.0, 1.0)

# Cross-Validation (Single-Solvent)

In [None]:
# Cross-validate on the single-solvent data using the folds produced by
# generate_leave_one_out_splits, collecting one prediction record per test row.
single_results = []

splits = list(generate_leave_one_out_splits(X_single, pd.DataFrame(Y_single)))

for fold_idx, (train_part, test_part) in enumerate(tqdm(splits)):
    X_tr, Y_tr = train_part
    X_te, Y_te = test_part

    # Featurise each fold's train and test rows separately.
    Xtr = make_single_solvent_features(X_tr, descriptor_lookup)
    Xte = make_single_solvent_features(X_te, descriptor_lookup)

    # A fresh model per fold so nothing carries over between folds.
    model = CatecholModel()
    model.fit(Xtr, Y_tr.values)
    preds = model.predict(Xte)

    # "task": 0 tags these records as belonging to the single-solvent run.
    single_results.extend(
        {
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2],
        }
        for row_idx, row in enumerate(preds)
    )

submission_single_solvent = pd.DataFrame(single_results)

# Cross-Validation (Mixture Solvents)

In [None]:
# Cross-validate on the full (mixture) data using the folds produced by
# generate_leave_one_ramp_out_splits, collecting one prediction record per test row.
mixture_results = []

splits = list(generate_leave_one_ramp_out_splits(X_full, pd.DataFrame(Y_full)))

for fold_idx, (train_part, test_part) in enumerate(tqdm(splits)):
    X_tr, Y_tr = train_part
    X_te, Y_te = test_part

    # Featurise each fold's train and test rows separately.
    Xtr = make_mixture_features(X_tr, descriptor_lookup)
    Xte = make_mixture_features(X_te, descriptor_lookup)

    # A fresh model per fold so nothing carries over between folds.
    model = CatecholModel()
    model.fit(Xtr, Y_tr.values)
    preds = model.predict(Xte)

    # "task": 1 tags these records as belonging to the mixture run.
    mixture_results.extend(
        {
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2],
        }
        for row_idx, row in enumerate(preds)
    )

submission_full_data = pd.DataFrame(mixture_results)

# Create Submission

In [None]:
# Assemble the prediction records of both tasks into one submission table.
submission_single_solvent = pd.DataFrame(single_results)
submission_full_data = pd.DataFrame(mixture_results)

# reset_index() is called without drop=True, so each frame's original row
# position is kept as an "index" column; the fresh running index is named "id"
# and written out as the first CSV column.
submission = (
    pd.concat([submission_single_solvent, submission_full_data])
    .reset_index()
    .rename_axis("id")
)

submission.to_csv("submission.csv", index=True)

submission.head()

In [None]:
# Confirm the export and report how many prediction rows it contains.
n_rows = submission.shape[0]
print("Submission file written: submission.csv")
print("Rows:", n_rows)