# In [None]:
import numpy as np, pandas as pd, torch, torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import os, sys, tqdm
from abc import ABC
from typing import Generator

# Add the competition dataset directory to the module search path.
sys.path.append('/kaggle/input/catechol-benchmark-hackathon/')

# Column names used to slice the yield tables. The composite input lists are
# the numeric process columns followed by the solvent-describing columns.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_FEATURES = ["SOLVENT NAME"]
INPUT_LABELS_FULL_FEATURES = ["SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
INPUT_LABELS_SINGLE_SOLVENT = INPUT_LABELS_NUMERIC + INPUT_LABELS_SINGLE_FEATURES
INPUT_LABELS_FULL_SOLVENT = INPUT_LABELS_NUMERIC + INPUT_LABELS_FULL_FEATURES

# Regression targets read from the yield tables.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one of the catechol yield tables and split it into inputs and targets.

    Args:
        name: "full" selects the full (mixed-solvent) table; any other value
            selects the single-solvent table.

    Returns:
        Tuple ``(X, Y)`` of DataFrames: the input columns for the chosen table
        and the ``TARGET_LABELS`` columns.
    """
    if name == "full":
        stem, input_columns = "full_data_yields", INPUT_LABELS_FULL_SOLVENT
    else:
        stem, input_columns = "single_solvent_yields", INPUT_LABELS_SINGLE_SOLVENT
    frame = pd.read_csv(f'/kaggle/input/catechol-benchmark-hackathon/catechol_{stem}.csv')
    return frame[input_columns], frame[TARGET_LABELS]

def load_features(name="spange_descriptors"):
    """Read the ``<name>_lookup.csv`` table from the dataset directory.

    Args:
        name: Stem of the lookup file (without the ``_lookup.csv`` suffix).

    Returns:
        DataFrame indexed by the first CSV column.
    """
    lookup_path = f'/kaggle/input/catechol-benchmark-hackathon/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

def generate_leave_one_out_splits(X, Y) -> Generator:
    """Yield one train/test split per distinct value of ``X["SOLVENT NAME"]``.

    Solvents are visited in sorted order. For each one, every row with that
    solvent forms the test set and all remaining rows form the training set.

    Yields:
        ``((X_train, Y_train), (X_test, Y_test))``.
    """
    for held_out in sorted(X["SOLVENT NAME"].unique()):
        is_test = X["SOLVENT NAME"] == held_out
        train_part = (X[~is_test], Y[~is_test])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y) -> Generator:
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``. Rows matching the
    pair on both columns form the test set; all other rows form the training set.

    Yields:
        ``((X_train, Y_train), (X_test, Y_test))``.
    """
    unique_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    pair_iter = zip(unique_pairs["SOLVENT A NAME"], unique_pairs["SOLVENT B NAME"])
    for solvent_a, solvent_b in pair_iter:
        same_a = X["SOLVENT A NAME"] == solvent_a
        same_b = X["SOLVENT B NAME"] == solvent_b
        in_ramp = same_a & same_b
        yield (X[~in_ramp], Y[~in_ramp]), (X[in_ramp], Y[in_ramp])

# Make float64 the default floating-point dtype for tensors and module
# parameters created from here on.
# NOTE(review): presumably chosen so the network parameters match the float64
# arrays the featurizers pass to torch.tensor — confirm against the CSV dtypes.
torch.set_default_dtype(torch.double)

class SmilesFeaturizer(ABC):
    """Base class for featurizers; subclasses are expected to override ``featurize``."""

    def featurize(self, X, Y):
        """Convert raw inputs ``X`` and targets ``Y`` into model-ready form.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

class PrecomputedFeaturizer(SmilesFeaturizer):
    """Featurizer for single-solvent rows backed by a precomputed descriptor table.

    Attributes are set in ``__init__``:
      * ``featurizer`` - lookup table returned by ``load_features``.
      * ``feats_dim``  - width of the feature vector produced by ``featurize``.
    """

    def __init__(self, features='spange_descriptors'):
        self.featurizer = load_features(features)
        # Two numeric process columns are prepended to the descriptor columns.
        descriptor_count = self.featurizer.shape[1]
        self.feats_dim = descriptor_count + 2

    def featurize(self, X, Y):
        """Build input and target tensors.

        Args:
            X: DataFrame holding the ``INPUT_LABELS_NUMERIC`` columns and
                "SOLVENT NAME".
            Y: DataFrame of targets.

        Returns:
            Tuple ``(inputs, targets)``; ``inputs`` is the numeric columns
            followed by the descriptors looked up by solvent name.
        """
        numeric_part = torch.tensor(X[INPUT_LABELS_NUMERIC].values)
        descriptor_rows = self.featurizer.loc[X["SOLVENT NAME"]]
        descriptor_part = torch.tensor(descriptor_rows.values)
        inputs = torch.cat([numeric_part, descriptor_part], dim=1)
        targets = torch.tensor(Y.values)
        return inputs, targets

class PrecomputedFeaturizerMixed(SmilesFeaturizer):
    """Featurizer for two-solvent mixtures backed by a precomputed descriptor table.

    The descriptors of solvents A and B are blended linearly using the
    "SolventB%" column as the weight of solvent B.
    """

    def __init__(self, features='spange_descriptors'):
        self.featurizer = load_features(features)
        # Two numeric process columns are prepended to the descriptor columns.
        descriptor_count = self.featurizer.shape[1]
        self.feats_dim = descriptor_count + 2

    def featurize(self, X, Y):
        """Build input and target tensors.

        Args:
            X: DataFrame holding the ``INPUT_LABELS_NUMERIC`` columns plus
                "SOLVENT A NAME", "SOLVENT B NAME" and "SolventB%".
            Y: DataFrame of targets.

        Returns:
            Tuple ``(inputs, targets)``; ``inputs`` is the numeric columns
            followed by the blended descriptors.
        """
        numeric_part = torch.tensor(X[INPUT_LABELS_NUMERIC].values)
        descriptors_a = self.featurizer.loc[X["SOLVENT A NAME"]].values
        descriptors_b = self.featurizer.loc[X["SOLVENT B NAME"]].values
        # Column vector so it broadcasts across descriptor columns.
        # NOTE(review): the blend assumes "SolventB%" is a fraction in [0, 1] — confirm.
        weight_b = X["SolventB%"].values.reshape(-1,1)
        blended = descriptors_a * (1-weight_b) + descriptors_b * weight_b
        inputs = torch.cat([numeric_part, torch.tensor(blended)], dim=1)
        targets = torch.tensor(Y.values)
        return inputs, targets

class MLPModel(nn.Module):
    """Feed-forward regressor over featurized reaction conditions.

    Architecture: input ``BatchNorm1d``; three hidden blocks of
    ``Linear -> BatchNorm1d -> ReLU -> Dropout(0.2)`` with widths 128, 128 and
    64; then ``Linear(64, 3)`` followed by ``Sigmoid``.

    Args:
        data: 'single' selects ``PrecomputedFeaturizer``; any other value
            selects ``PrecomputedFeaturizerMixed``.
        features: Name of the descriptor lookup table given to the featurizer.
    """

    def __init__(self, data='single', features='spange_descriptors'):
        super().__init__()
        self.featurizer = PrecomputedFeaturizer(features) if data=='single' else PrecomputedFeaturizerMixed(features)
        layers = [nn.BatchNorm1d(self.featurizer.feats_dim)]
        dims = [128, 128, 64]
        prev = self.featurizer.feats_dim
        for h in dims:
            layers += [nn.Linear(prev, h), nn.BatchNorm1d(h), nn.ReLU(), nn.Dropout(0.2)]
            prev = h
        layers += [nn.Linear(prev, 3), nn.Sigmoid()]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the network to an already featurized batch ``x``."""
        return self.net(x)

    def predict(self, X_tensor):
        """Return predictions for ``X_tensor`` without tracking gradients.

        Switches the module to evaluation mode and leaves it there.
        """
        self.eval()
        with torch.no_grad():
            return self(X_tensor)

    def train_model(self, X_train, Y_train, epochs=300, bs=32, lr=5e-4):
        """Fit the network with Adam on an MSE loss.

        Args:
            X_train: Raw inputs accepted by ``self.featurizer.featurize``.
            Y_train: Raw targets accepted by ``self.featurizer.featurize``.
            epochs: Number of passes over the training data.
            bs: Mini-batch size.
            lr: Initial Adam learning rate; halved by ``ReduceLROnPlateau``
                after 20 epochs without improvement in the mean epoch loss.
        """
        X_t, Y_t = self.featurizer.featurize(X_train, Y_train)
        n_samples = len(X_t)
        # BatchNorm1d cannot compute batch statistics from a single row in
        # training mode and raises ValueError. That happens whenever the
        # shuffled data leaves exactly one sample in the final batch, so drop
        # that lone batch. Shuffling changes which sample is left out each
        # epoch, so every sample is still used over the course of training.
        drop_last = n_samples > bs and n_samples % bs == 1
        loader = DataLoader(TensorDataset(X_t, Y_t), batch_size=bs, shuffle=True, drop_last=drop_last)
        opt = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=1e-5)
        sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, 'min', factor=0.5, patience=20)
        crit = nn.MSELoss()
        for _ in range(epochs):
            self.train()
            loss_sum, n_seen = 0.0, 0
            for x, y in loader:
                opt.zero_grad()
                loss = crit(self(x), y)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.parameters(), 1.0)
                opt.step()
                loss_sum += loss.item() * x.size(0)
                n_seen += x.size(0)
            # Normalise by the samples actually processed (differs from
            # n_samples only when the trailing batch was dropped).
            sched.step(loss_sum / n_seen)

def _run_cross_validation(splits, data, task):
    """Train a fresh ``MLPModel`` on every split and collect held-out results.

    Args:
        splits: Iterable of ``((train_X, train_Y), (test_X, test_Y))`` pairs.
        data: Forwarded to ``MLPModel(data=...)`` to choose the featurizer.
        task: Integer stored in the "task" column of both returned frames.

    Returns:
        Tuple ``(predictions, truths, cv_mse)``: two DataFrames with one row per
        held-out sample (keyed by "task", "fold", "row") and the unweighted
        mean of the per-fold MSEs.
    """
    preds_list, true_list, fold_mses = [], [], []
    # Materialise the splits so tqdm knows the total number of folds.
    for fold, ((trX, trY), (teX, teY)) in enumerate(tqdm.tqdm(list(splits))):
        model = MLPModel(data=data)
        model.train_model(trX, trY)
        X_te, _ = model.featurizer.featurize(teX, teY)
        pred = model.predict(X_te).cpu().numpy()
        true = teY.values
        fold_mses.append(np.mean((pred - true)**2))
        for i, (p, t) in enumerate(zip(pred, true)):
            preds_list.append({"task":task, "fold":fold, "row":i, "target_1":p[0], "target_2":p[1], "target_3":p[2]})
            true_list.append({"task":task, "fold":fold, "row":i, "true_1":t[0], "true_2":t[1], "true_3":t[2]})
    return pd.DataFrame(preds_list), pd.DataFrame(true_list), np.mean(fold_mses)


# Task 0: leave-one-solvent-out on the single-solvent table.
X, Y = load_data("single_solvent")
sub_single, true_single, single_cv = _run_cross_validation(
    generate_leave_one_out_splits(X, Y), data='single', task=0)

# Task 1: leave-one-ramp-out on the full (mixed-solvent) table.
X, Y = load_data("full")
sub_full, true_full, full_cv = _run_cross_validation(
    generate_leave_one_ramp_out_splits(X, Y), data='full', task=1)

# Stack both tasks' predictions; the fresh 0..n-1 index becomes the "id"
# column of the CSV, and the previous per-task index is kept as a column.
submission = pd.concat([sub_single, sub_full]).reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

# Align predictions with ground truth by (task, fold, row) and pool the
# squared errors of all three targets into a single mean.
ground_truth = pd.concat([true_single, true_full])
merged = submission.merge(ground_truth, on=['task','fold','row'])
squared_errors = [(merged[f'target_{k}'] - merged[f'true_{k}'])**2 for k in (1, 2, 3)]
overall_mse = np.mean(squared_errors)

banner = "="*70
print("\n" + banner)
# NOTE: the header text below is a fixed string; it is not derived from the
# scores computed above.
print("FINAL RESULTS - CV SCORE < 0.1")
print(banner)
print(f"Overall CV MSE : {overall_mse:.6f}")
print(f"Overall CV RMSE: {overall_mse**0.5:.6f}")
print(f"Single Solvent : {single_cv:.6f}")
print(f"Full Data : {full_cv:.6f}")
print(banner)
print("done!")