In [None]:
import numpy as np
import pandas as pd
import os

# Print the full path of every file found anywhere below the input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        full_path = os.path.join(folder, name)
        print(full_path)

In [None]:
import sys
sys.path.append('/kaggle/input/catechol-benchmark-hackathon/')

from utils import (
    INPUT_LABELS_NUMERIC,
    INPUT_LABELS_SINGLE_FEATURES,
    INPUT_LABELS_FULL_FEATURES,
    load_data,
    load_features,
    generate_leave_one_out_splits,
    generate_leave_one_ramp_out_splits,
)

In [None]:
from abc import ABC, abstractmethod

class SmilesFeaturizer(ABC):
    """Interface for objects that turn an input table into model features.

    Both methods are placeholders that raise ``NotImplementedError``;
    subclasses are expected to override them (and must not delegate to
    this ``__init__``, since it always raises).
    """

    def __init__(self):
        """Placeholder constructor; always raises ``NotImplementedError``."""
        raise NotImplementedError()

    def featurize(self, X, flip=False):
        """Placeholder for the feature computation.

        Args:
            X: input data to featurize.
            flip: flag passed through by callers; its meaning is defined
                by the overriding subclass.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()


class BaseModel(ABC):
    """Interface for models exposing ``train_model`` and ``predict``.

    The constructor does nothing; the two methods are placeholders that
    raise ``NotImplementedError`` until a subclass overrides them.
    """

    def __init__(self):
        """No-op constructor."""

    def train_model(self, X_train, y_train):
        """Placeholder for fitting on ``X_train`` / ``y_train``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def predict(self, X):
        """Placeholder for producing predictions for ``X``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# Use float64 as the default floating-point dtype for tensors and module
# parameters created from here on, so network weights match the float64
# arrays produced when features are built from NumPy (`astype(np.float64)`).
torch.set_default_dtype(torch.double)

In [None]:
class KineticMixingFeaturizer(SmilesFeaturizer):
    """Build a feature tensor from process conditions and solvent descriptors.

    Each output row is the concatenation of:

    * the raw ``INPUT_LABELS_NUMERIC`` columns,
    * three derived terms: ``1000 / (T + 273.15)``, ``log(t + 1e-6)`` and
      their product, where ``t`` is numeric column 0 and ``T`` is numeric
      column 1,
    * the solvent descriptor vector (a linear blend of two solvents when
      ``mixed`` is true).

    Args:
        features: name handed to ``load_features``; the returned table is
            looked up by solvent name with ``.loc``.
        mixed: when true, blend the descriptors of ``"SOLVENT A NAME"`` and
            ``"SOLVENT B NAME"`` using ``"SolventB%"`` as the weight of B;
            otherwise use the descriptors of ``"SOLVENT NAME"`` directly.
        noise_std: standard deviation of the Gaussian noise added to the
            numeric/derived columns (descriptor columns are never
            perturbed). ``0`` disables the noise.
    """

    # Number of engineered columns appended to the raw numeric inputs.
    N_DERIVED = 3

    def __init__(self, features="spange_descriptors", mixed=False, noise_std=0.01):
        self.mixed = mixed
        self.noise_std = noise_std
        self.featurizer = load_features(features)
        # Width of the tensor returned by `featurize`: raw numeric columns,
        # derived kinetic columns, then descriptor columns.
        self.feats_dim = (
            self.featurizer.shape[1] + len(INPUT_LABELS_NUMERIC) + self.N_DERIVED
        )

    def featurize(self, X, flip=False, add_noise=True):
        """Return the feature tensor for the rows of ``X``.

        Args:
            X: DataFrame holding the ``INPUT_LABELS_NUMERIC`` columns and the
                solvent columns required by the configured mode.
            flip: only used when ``mixed`` is true. Evaluates the mixture
                with the roles of solvents A and B swapped (B weighted by
                ``1 - (1 - pct)``, A by ``1 - pct``). For this linear mixing
                rule that is the same mixture, so the result matches
                ``flip=False`` up to floating-point rounding.
            add_noise: when true (default, original behaviour) Gaussian
                noise of scale ``noise_std`` is added to the numeric/derived
                columns on every call. Pass ``False`` to obtain
                deterministic features, e.g. at inference time.

        Returns:
            A 2-D ``torch`` tensor with one row per row of ``X``.
        """
        X_vals = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)

        # --- KINETIC FEATURES ---
        # assumes INPUT_LABELS_NUMERIC is ordered (time, temperature in deg C)
        # -- TODO confirm against utils
        time_m = X_vals[:, 0:1]
        temp_c = X_vals[:, 1:2]

        temp_k = temp_c + 273.15
        inv_temp = 1000.0 / temp_k
        # Small offset keeps the logarithm finite for a zero time value.
        log_time = np.log(time_m + 1e-6)
        interaction = inv_temp * log_time

        X_kinetic = torch.tensor(
            np.hstack([X_vals, inv_temp, log_time, interaction])
        )

        # The same absolute noise scale is applied to every column,
        # regardless of the column's magnitude.
        if add_noise and self.noise_std:
            X_kinetic = X_kinetic + self.noise_std * torch.randn_like(X_kinetic)

        # --- CHEMICAL FEATURES ---
        if self.mixed:
            A = torch.tensor(self.featurizer.loc[X["SOLVENT A NAME"]].values)
            B = torch.tensor(self.featurizer.loc[X["SOLVENT B NAME"]].values)
            # assumes "SolventB%" is a fraction in [0, 1] -- TODO confirm;
            # a 0-100 scale would make the weights below wrong.
            pct = torch.tensor(X["SolventB%"].values.reshape(-1, 1))

            if flip:
                X_chem = B * (1 - (1 - pct)) + A * (1 - pct)
            else:
                X_chem = A * (1 - pct) + B * pct
        else:
            X_chem = torch.tensor(self.featurizer.loc[X["SOLVENT NAME"]].values)

        return torch.cat([X_kinetic, X_chem], dim=1)

In [None]:
class MLPInternal(nn.Module):
    """Feed-forward network mapping ``input_dim`` features to 3 outputs.

    Layout: batch norm on the raw input, then three
    Linear -> BatchNorm1d -> ReLU -> Dropout(0.2) stages of width 128, 128
    and 64, followed by a 3-unit linear head passed through a sigmoid.
    """

    def __init__(self, input_dim):
        super().__init__()

        stages = [nn.BatchNorm1d(input_dim)]
        fan_in = input_dim
        for width in (128, 128, 64):
            stages.extend(
                [
                    nn.Linear(fan_in, width),
                    nn.BatchNorm1d(width),
                    nn.ReLU(),
                    nn.Dropout(0.2),
                ]
            )
            fan_in = width

        # Output head: three values squashed by the sigmoid.
        stages.extend([nn.Linear(fan_in, 3), nn.Sigmoid()])

        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        out = self.net(x)
        return out

In [None]:
class SymmetricBaggedModel(nn.Module, BaseModel):
    """Average of several independently initialised ``MLPInternal`` networks.

    Every ensemble member is trained on the same data (no bootstrap
    resampling); members differ only through weight initialisation and
    mini-batch shuffling. In ``"full"`` mode the training set is extended
    with the ``flip=True`` featurization of each row, and predictions
    average the ``flip=False`` and ``flip=True`` featurizations.

    NOTE(review): ``KineticMixingFeaturizer.featurize`` is called here with
    its default settings, which add Gaussian noise on every call. Training
    inputs are therefore perturbed once (not per epoch) and ``predict`` is
    not deterministic for a fixed model.

    Args:
        data: ``"full"`` enables the two-solvent (mixed) featurization and
            the flip handling; any other value uses single-solvent features.
        n_models: number of ensemble members trained by ``train_model``.
        n_epochs: passes over the training data per member.
        batch_size: mini-batch size used during training.
    """

    def __init__(self, data="single", n_models=9, n_epochs=220, batch_size=32):
        super().__init__()
        self.data_type = data
        self.featurizer = KineticMixingFeaturizer(
            mixed=(data == "full")
        )

        self.n_models = n_models
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.models = nn.ModuleList()

    def train_model(self, X_train, y_train):
        """Fit a fresh ensemble on ``X_train`` / ``y_train``.

        Any previously trained members are discarded, so calling this method
        again retrains from scratch.

        Args:
            X_train: DataFrame accepted by the featurizer.
            y_train: DataFrame of targets; ``y_train.values`` is used as-is.
        """
        X_std = self.featurizer.featurize(X_train, flip=False)
        y_vals = torch.tensor(y_train.values)

        if self.data_type == "full":
            X_flip = self.featurizer.featurize(X_train, flip=True)
            X_all = torch.cat([X_std, X_flip], dim=0)
            y_all = torch.cat([y_vals, y_vals], dim=0)
        else:
            X_all = X_std
            y_all = y_vals

        input_dim = X_all.shape[1]
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        dataset = TensorDataset(X_all, y_all)
        n_samples = len(dataset)
        # BatchNorm1d raises in training mode when a batch holds a single
        # sample, which happens whenever one row is left over after the full
        # batches. Drop that lone trailing row (a different one each epoch,
        # because of shuffling) only in that case.
        drop_last = n_samples > self.batch_size and n_samples % self.batch_size == 1
        loader = DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=drop_last,
        )
        criterion = nn.SmoothL1Loss(beta=0.3)

        # Start from an empty ensemble so repeated calls do not accumulate
        # members from earlier fits.
        self.models = nn.ModuleList()

        for _member in range(self.n_models):
            model = MLPInternal(input_dim).to(device)
            self.models.append(model)

            optimizer = torch.optim.Adam(
                model.parameters(), lr=5e-4, weight_decay=1e-5
            )

            model.train()
            for _epoch in range(self.n_epochs):
                for xb, yb in loader:
                    xb, yb = xb.to(device), yb.to(device)
                    optimizer.zero_grad()
                    loss = criterion(model(xb), yb)
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()

    def predict(self, X):
        """Return the ensemble-averaged prediction for the rows of ``X``.

        Args:
            X: DataFrame accepted by the featurizer.

        Returns:
            A CPU tensor of shape ``(len(X), 3)``.

        Raises:
            RuntimeError: if ``train_model`` has not been called yet.
        """
        if len(self.models) == 0:
            raise RuntimeError("train_model() must be called before predict()")

        device = next(self.models[0].parameters()).device

        # One featurization in single mode; standard + flipped in full mode.
        views = [self.featurizer.featurize(X, flip=False).to(device)]
        if self.data_type == "full":
            views.append(self.featurizer.featurize(X, flip=True).to(device))

        pred_sum = torch.zeros((len(X), 3)).to(device)
        with torch.no_grad():
            for model in self.models:
                model.eval()
                member_pred = model(views[0])
                for extra_view in views[1:]:
                    member_pred = member_pred + model(extra_view)
                pred_sum += member_pred / len(views)

        # Divide by the number of members actually trained.
        return (pred_sum / len(self.models)).cpu()

In [None]:
import tqdm

# Task 0: cross-validate on the "single_solvent" dataset and collect the
# held-out predictions of every fold into one table.
# NOTE(review): the "# ONLY CHANGE" marker below suggests the rest of this
# cell mirrors a fixed external template -- confirm before restructuring it.
X, Y = load_data("single_solvent")
split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

# tqdm wraps an `enumerate` object here, so it has no length to report a total from.
for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    # Each split unpacks into a (features, targets) pair for train and test.
    # test_Y is unpacked but not used in this cell.
    (train_X, train_Y), (test_X, test_Y) = split

    # A fresh model is constructed and trained from scratch for every fold.
    model = SymmetricBaggedModel(data="single")  # ONLY CHANGE
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X).numpy()

    # One record per test row; "row" is the position within this fold's
    # prediction array, and the three targets are its columns 0..2.
    for row_idx, row in enumerate(predictions):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2],
        })

submission_single_solvent = pd.DataFrame(all_predictions)

In [None]:
# Task 1: cross-validate on the "full" dataset and collect the held-out
# predictions of every fold into one table.
# NOTE(review): the "# ONLY CHANGE" marker below suggests the rest of this
# cell mirrors a fixed external template -- confirm before restructuring it.
# This cell rebinds X, Y, split_generator and all_predictions.
X, Y = load_data("full")
split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

# tqdm wraps an `enumerate` object here, so it has no length to report a total from.
for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    # Each split unpacks into a (features, targets) pair for train and test.
    # test_Y is unpacked but not used in this cell.
    (train_X, train_Y), (test_X, test_Y) = split

    # A fresh model is constructed and trained from scratch for every fold;
    # data="full" selects the mixed-solvent featurization.
    model = SymmetricBaggedModel(data="full")  # ONLY CHANGE
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X).numpy()

    # One record per test row; "row" is the position within this fold's
    # prediction array, and the three targets are its columns 0..2.
    for row_idx, row in enumerate(predictions):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2],
        })

submission_full_data = pd.DataFrame(all_predictions)

In [None]:
# Stack the two per-task tables. `reset_index()` (without drop) keeps each
# table's old row label as an "index" column and installs a fresh 0..n-1
# index, which is then named "id" and written as the first CSV column.
submission = (
    pd.concat([submission_single_solvent, submission_full_data])
    .reset_index()
    .rename_axis("id")
)
submission.to_csv("submission.csv", index=True)