# Experiment 056: Per-Target Model Selection

**Goal:** Implement the approach from public kernel "catechol-strategy-to-get-0-11161" (LB 0.11161)

**Approach:**
- Different model types for different targets:
  - SM (hardest): HistGradientBoostingRegressor
  - Product 2, Product 3 (easier): ExtraTreesRegressor
- Weighted ensemble of two feature sets: 0.65 × (model trained on ACS PCA descriptors) + 0.35 × (model trained on Spange descriptors)

**Hypothesis:** This approach might change the CV-LB relationship by using target-specific models.

In [None]:
# Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from abc import ABC, abstractmethod
import tqdm
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Make float64 the default floating-point dtype for tensors created by torch.
torch.set_default_dtype(torch.double)

# Data path for local execution; the loaders below build their CSV paths from it.
DATA_PATH = "/home/data"

print("Imports complete.")

In [None]:
# Constants from official template

# Input columns selected by load_data("full"): the two numeric process
# conditions, the names of both solvents, and the SolventB% column.
INPUT_LABELS_FULL_SOLVENT = [
    "Residence Time",
    "Temperature",
    "SOLVENT A NAME",
    "SOLVENT B NAME",
    "SolventB%",
]

# Input columns selected by load_data("single_solvent").
INPUT_LABELS_SINGLE_SOLVENT = [
    "Residence Time",
    "Temperature",
    "SOLVENT NAME",
]

# The numeric input columns shared by both tables.
# NOTE(review): not referenced elsewhere in the visible part of this notebook.
INPUT_LABELS_NUMERIC = [
    "Residence Time",
    "Temperature",
]

# Target columns returned as Y by load_data, in this order.
TARGET_LABELS = [
    "Product 2",
    "Product 3",
    "SM",
]

print("Constants defined.")

In [None]:
# Data loading functions
def load_data(name="full"):
    """Read one of the two yield tables and split it into inputs and targets.

    Args:
        name: ``"full"`` selects ``catechol_full_data_yields.csv`` with the
            ``INPUT_LABELS_FULL_SOLVENT`` columns; ``"single_solvent"`` selects
            ``catechol_single_solvent_yields.csv`` with the
            ``INPUT_LABELS_SINGLE_SOLVENT`` columns. Any other value fails the
            assertion.

    Returns:
        Tuple ``(X, Y)``: the selected input columns and the ``TARGET_LABELS``
        columns of the same table.
    """
    assert name in ["full", "single_solvent"]
    if name == "full":
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_columns = INPUT_LABELS_FULL_SOLVENT
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_columns = INPUT_LABELS_SINGLE_SOLVENT
    table = pd.read_csv(csv_path)
    return table[input_columns], table[TARGET_LABELS]

def load_features(name="spange_descriptors"):
    """Read a ``<name>_lookup.csv`` table from ``DATA_PATH``.

    Args:
        name: One of the table names accepted by the assertion below.

    Returns:
        DataFrame whose index is the first column of the CSV.
    """
    known_tables = ["spange_descriptors", "acs_pca_descriptors", "drfps_catechol", "fragprints", "smiles"]
    assert name in known_tables
    return pd.read_csv(f'{DATA_PATH}/{name}_lookup.csv', index_col=0)

# CV functions from official template
def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per distinct value of ``X["SOLVENT NAME"]``.

    Solvents are visited in sorted order. For each one, the rows carrying that
    solvent form the test part and every other row forms the training part.

    Yields:
        ``((X_train, Y_train), (X_test, Y_test))``, all selected with the same
        boolean row mask.
    """
    solvent_column = X["SOLVENT NAME"]
    for held_out in sorted(solvent_column.unique()):
        is_test = solvent_column == held_out
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are de-duplicated and visited sorted by A name, then B name. Rows
    matching both names of the current pair form the test part; all remaining
    rows form the training part.

    Yields:
        ``((X_train, Y_train), (X_test, Y_test))``.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    ramps = X[pair_columns].drop_duplicates().sort_values(by=pair_columns)
    for solvent_a, solvent_b in ramps.itertuples(index=False, name=None):
        is_test = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

print("Data loading and CV functions defined.")

In [None]:
# Base classes from official template
class SmilesFeaturizer(ABC):
    """Interface placeholder for featurizers; both methods are unimplemented here.

    Subclasses are expected to override ``__init__`` and ``featurize``; calling
    either one on this class raises ``NotImplementedError``.
    """

    def __init__(self):
        # Deliberately unusable as-is: constructing the base class raises.
        raise NotImplementedError

    def featurize(self, X):
        """Turn ``X`` into features; must be provided by a subclass."""
        raise NotImplementedError

class BaseModel(ABC):
    """Interface placeholder for models: construction works, methods do not.

    ``train_model`` and ``predict`` raise ``NotImplementedError`` until a
    subclass overrides them.
    """

    def __init__(self):
        # Nothing to initialise in the base class.
        pass

    def train_model(self, X_train, y_train):
        """Fit on ``(X_train, y_train)``; must be provided by a subclass."""
        raise NotImplementedError

    def predict(self):
        """Produce predictions; must be provided by a subclass."""
        raise NotImplementedError

print("Base classes defined.")

In [None]:
# BetterCatecholModel from public kernel
class BetterCatecholModel:
    """Scikit-learn regressor on process conditions plus solvent descriptors.

    The feature matrix is residence time, temperature and a descriptor vector
    looked up per solvent name (for two-solvent rows: the mixing fraction and
    the fraction-weighted blend of both descriptor vectors). The regressor is
    a ``StandardScaler`` followed by a ``MultiOutputRegressor`` wrapping
    either a ``HistGradientBoostingRegressor`` or an ``ExtraTreesRegressor``.
    """

    def __init__(self, feature_table="spange_descriptors", base_type="hgb"):
        """Load the descriptor lookup table; no model is fitted yet.

        Args:
            feature_table: Stem of ``<feature_table>_lookup.csv`` under
                ``DATA_PATH``; its first column becomes the index used for
                solvent-name lookups.
            base_type: ``"hgb"`` selects HistGradientBoostingRegressor; any
                other value selects ExtraTreesRegressor.
        """
        self.base_type = base_type
        # Non-numeric cells become NaN and are then zero-filled, so the table
        # is fully numeric afterwards.
        self.lookup = (
            pd.read_csv(
                f"{DATA_PATH}/{feature_table}_lookup.csv",
                index_col=0,
            )
            .apply(pd.to_numeric, errors="coerce")
            .fillna(0)
        )
        self.model = None

    def _vec(self, s):
        """Return the descriptor row for solvent ``s``, or zeros if unknown."""
        if s in self.lookup.index:
            return self.lookup.loc[s].values
        return np.zeros(self.lookup.shape[1])

    def _lookup_rows(self, names):
        """Stack the descriptor vectors of ``names`` into an (n, d) array.

        An empty ``names`` yields a ``(0, d)`` array rather than letting
        ``np.vstack`` raise on an empty list.
        """
        rows = [self._vec(s) for s in names]
        if not rows:
            return np.zeros((0, self.lookup.shape[1]))
        return np.vstack(rows)

    def _build_X(self, X):
        """Assemble the numeric feature matrix for the rows of ``X``.

        Args:
            X: DataFrame with ``"Residence Time"`` and ``"Temperature"`` plus
                either ``"SOLVENT NAME"`` or the trio ``"SOLVENT A NAME"``,
                ``"SOLVENT B NAME"``, ``"SolventB%"``.

        Returns:
            2-D array: ``[rt, temp, descriptors]`` for single-solvent frames,
            ``[rt, temp, frac_b, blended descriptors]`` otherwise.
        """
        rt = X["Residence Time"].values.reshape(-1, 1)
        temp = X["Temperature"].values.reshape(-1, 1)

        if "SOLVENT NAME" in X.columns:
            return np.hstack([rt, temp, self._lookup_rows(X["SOLVENT NAME"])])

        # NOTE(review): dividing by 100 assumes "SolventB%" is stored as a
        # percentage (0-100). If the CSV already holds fractions, the blend
        # would be almost entirely solvent A - confirm against the data.
        frac_b = X["SolventB%"].values.reshape(-1, 1) / 100.0
        desc_a = self._lookup_rows(X["SOLVENT A NAME"])
        desc_b = self._lookup_rows(X["SOLVENT B NAME"])
        mix = (1 - frac_b) * desc_a + frac_b * desc_b

        return np.hstack([rt, temp, frac_b, mix])

    def train_model(self, X, Y):
        """Fit a fresh scaler + multi-output regressor pipeline on ``(X, Y)``.

        Args:
            X: Input frame accepted by ``_build_X``.
            Y: DataFrame of targets; ``Y.values`` is passed to the pipeline.
        """
        features = self._build_X(X)
        targets = Y.values

        if self.base_type == "hgb":
            # random_state is pinned like the ExtraTrees branch so that any
            # randomised step inside the estimator is reproducible.
            base = HistGradientBoostingRegressor(
                max_depth=7,
                max_iter=700,
                learning_rate=0.04,
                random_state=42,
            )
        else:
            base = ExtraTreesRegressor(
                n_estimators=900,
                min_samples_leaf=2,
                random_state=42,
                n_jobs=-1,
            )

        self.model = Pipeline(
            [("scaler", StandardScaler()), ("reg", MultiOutputRegressor(base))]
        )
        self.model.fit(features, targets)

    def predict(self, X):
        """Predict for ``X``, clip to [0, 1] and return a double tensor.

        ``train_model`` must have been called first; ``self.model`` is
        ``None`` until then.
        """
        pred = np.clip(self.model.predict(self._build_X(X)), 0, 1)
        return torch.tensor(pred, dtype=torch.double)

print("BetterCatecholModel defined.")

In [None]:
# PerTargetEnsembleModel from public kernel
class PerTargetEnsembleModel:
    """Per-target model selection with a weighted two-feature-set ensemble.

    - SM (hardest): HistGradientBoostingRegressor
    - Product 2, Product 3 (easier): ExtraTreesRegressor
    - Each target gets two models, one on ACS PCA descriptors and one on
      Spange descriptors, blended as
      ``weights[0] * ACS + weights[1] * Spange`` (default 0.65 / 0.35).
    """

    def __init__(self, data='single', weights=(0.65, 0.35)):
        """Create the (still unfitted) pair of models for every target.

        Args:
            data: Stored on the instance but not otherwise used by this class;
                the underlying models pick the single- or mixed-solvent
                feature layout from the columns of the frame they receive.
            weights: ``(acs_weight, spange_weight)`` applied to the two
                predictions of each target. Must contain exactly two numbers.

        Raises:
            ValueError, TypeError: If ``weights`` cannot be unpacked into two
                values convertible to float.
        """
        acs_weight, spange_weight = weights
        self.data = data
        self.weights = (float(acs_weight), float(spange_weight))
        self.targets = ["Product 2", "Product 3", "SM"]
        self.models = {}

        for t in self.targets:
            # SM is hardest - use HistGradientBoostingRegressor;
            # products are easier - use ExtraTreesRegressor.
            base_type = "hgb" if t == "SM" else "etr"
            # Order matters: index 0 is the ACS model, index 1 the Spange one.
            self.models[t] = [
                BetterCatecholModel("acs_pca_descriptors", base_type),
                BetterCatecholModel("spange_descriptors", base_type),
            ]

    def train_model(self, X, Y, device=None, verbose=False):
        """Fit both models of every target on that target's column of ``Y``.

        Args:
            X: Input frame forwarded unchanged to each underlying model.
            Y: DataFrame containing a column for every name in
                ``self.targets``.
            device: Unused.
            verbose: Unused.
        """
        for t in self.targets:
            # Double brackets keep a one-column DataFrame (2-D targets).
            y_single = Y[[t]]
            for m in self.models[t]:
                m.train_model(X, y_single)

    def predict(self, X):
        """Blend the two predictions per target and clip the result to [0, 1].

        The raw pipelines are called directly so that clipping happens once,
        after blending, rather than on each member's output.

        Returns:
            Double tensor with one column per entry of ``self.targets``, in
            that order.
        """
        acs_weight, spange_weight = self.weights
        columns = []

        for t in self.targets:
            acs_model, spange_model = self.models[t]
            acs_pred = acs_model.model.predict(acs_model._build_X(X))
            spange_pred = spange_model.model.predict(spange_model._build_X(X))

            blended = acs_weight * acs_pred + spange_weight * spange_pred
            columns.append(np.asarray(blended).reshape(-1, 1))

        pred = np.clip(np.hstack(columns), 0, 1)
        return torch.tensor(pred, dtype=torch.double)

print("PerTargetEnsembleModel defined.")

In [None]:
# Quick test
# Smoke test: fit the ensemble on a single leave-one-solvent-out fold and
# print one prediction next to the corresponding ground-truth row.
print("Testing PerTargetEnsembleModel...")
X, Y = load_data("single_solvent")
print(f"Single solvent data: X={X.shape}, Y={Y.shape}")

# Test one fold
# The generator visits solvents in sorted order, so this is the fold that
# holds out the first solvent name alphabetically.
split_gen = generate_leave_one_out_splits(X, Y)
(train_X, train_Y), (test_X, test_Y) = next(split_gen)

model = PerTargetEnsembleModel()
model.train_model(train_X, train_Y)
preds = model.predict(test_X)

print(f"Predictions shape: {preds.shape}")
print(f"Predictions sample: {preds[0]}")
print(f"Actual sample: {test_Y.iloc[0].values}")
print("Test passed!")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = PerTargetEnsembleModel() # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

print(f"Single solvent predictions: {len(submission_single_solvent)}")
print(f"Unique folds: {submission_single_solvent['fold'].nunique()}")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = PerTargetEnsembleModel(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

print(f"Full data predictions: {len(submission_full_data)}")
print(f"Unique folds: {submission_full_data['fold'].nunique()}")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

print(f"Submission saved to /home/submission/submission.csv")
print(f"Total rows: {len(submission)}")

In [None]:
# Calculate CV for logging
# Re-runs both cross-validation schemes from scratch (a fresh model is trained
# per fold) and reports the per-fold mean squared error. The variables
# single_cv, single_std, full_cv and full_std are read again by the summary cell.
print("\n" + "="*60)
print("CV CALCULATION")
print("="*60)

# Single solvent CV
X, Y = load_data("single_solvent")
fold_mses = []

for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_out_splits(X, Y)):
    model = PerTargetEnsembleModel()
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # Unweighted mean over every row and every target column of the fold.
    mse = np.mean((preds - test_Y.values) ** 2)
    fold_mses.append(mse)
    # Only every fifth fold is printed to keep the log short.
    if fold_idx % 5 == 0:
        print(f"  Fold {fold_idx}: MSE = {mse:.6f}")

# Folds are averaged without weighting by their number of rows.
single_cv = np.mean(fold_mses)
single_std = np.std(fold_mses)
print(f"\nSingle solvent CV MSE: {single_cv:.6f} ± {single_std:.6f}")

# Full data CV
X, Y = load_data("full")
full_fold_mses = []

for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(generate_leave_one_ramp_out_splits(X, Y)):
    model = PerTargetEnsembleModel(data='full')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    mse = np.mean((preds - test_Y.values) ** 2)
    full_fold_mses.append(mse)
    print(f"  Fold {fold_idx}: MSE = {mse:.6f}")

full_cv = np.mean(full_fold_mses)
full_std = np.std(full_fold_mses)
print(f"\nFull data CV MSE: {full_cv:.6f} ± {full_std:.6f}")

# NOTE(review): the logged "final" figure is the single-solvent score only;
# the full-data score is not combined into it - confirm this is intended.
print(f"\nFINAL CV FOR LOGGING: {single_cv:.6f}")

In [None]:
# Verification
# Reads back the CSV written by the final submission cell and prints row and
# fold counts per task plus range / NaN checks for each target column.
# df, task0 and task1 are read again by the summary cell.
print("\n" + "="*60)
print("SUBMISSION VERIFICATION")
print("="*60)

df = pd.read_csv('/home/submission/submission.csv')

print(f"\nColumns: {df.columns.tolist()}")
print(f"Total rows: {len(df)}")

# Rows written by the single-solvent submission loop carry task == 0.
print(f"\nTask 0 (single solvent):")
task0 = df[df['task'] == 0]
print(f"  Rows: {len(task0)}")
print(f"  Folds: {task0['fold'].nunique()}")
print(f"  Fold values: {sorted(task0['fold'].unique())}")

# Rows written by the full-data submission loop carry task == 1.
print(f"\nTask 1 (full data):")
task1 = df[df['task'] == 1]
print(f"  Rows: {len(task1)}")
print(f"  Folds: {task1['fold'].nunique()}")
print(f"  Fold values: {sorted(task1['fold'].unique())}")

# The model clips predictions to [0, 1], so the out-of-range counts and the
# NaN count printed here should all be zero.
print(f"\nTarget statistics:")
for col in ['target_1', 'target_2', 'target_3']:
    print(f"  {col}: min={df[col].min():.6f}, max={df[col].max():.6f}, mean={df[col].mean():.6f}")
    print(f"    Values > 1: {(df[col] > 1).sum()}, Values < 0: {(df[col] < 0).sum()}, NaN: {df[col].isna().sum()}")

In [None]:
# Summary
# Prints a human-readable recap of the experiment. Depends on variables set by
# earlier cells: single_cv / single_std / full_cv / full_std from the CV cell
# and df / task0 / task1 from the verification cell.
print("\n" + "="*60)
print("EXPERIMENT 056: PER-TARGET MODEL SELECTION SUMMARY")
print("="*60)

# The hyperparameters below are typed out by hand; they are not read from the
# model objects, so they must be kept in sync with the model definition.
print("\nAPPROACH (from public kernel 'catechol-strategy-to-get-0-11161'):")
print("  - Per-target model selection:")
print("    - SM (hardest): HistGradientBoostingRegressor (max_depth=7, max_iter=700, lr=0.04)")
print("    - Product 2, Product 3: ExtraTreesRegressor (n_estimators=900)")
print("  - Weighted ensemble: 0.65 * ACS + 0.35 * Spange features")

print(f"\nCV SCORES:")
print(f"  Single solvent: {single_cv:.6f} ± {single_std:.6f}")
print(f"  Full data: {full_cv:.6f} ± {full_std:.6f}")

print(f"\nSUBMISSION FORMAT:")
print(f"  Total rows: {len(df)}")
print(f"  Task 0: {len(task0)} rows, {task0['fold'].nunique()} folds")
print(f"  Task 1: {len(task1)} rows, {task1['fold'].nunique()} folds")
# between(0, 1) is inclusive on both ends; NaN values would make this False.
print(f"  All targets in [0, 1]: {(df['target_1'].between(0, 1)).all() and (df['target_2'].between(0, 1)).all() and (df['target_3'].between(0, 1)).all()}")

print("\nHYPOTHESIS:")
print("  This approach might change the CV-LB relationship by using target-specific models.")
print("  The public kernel achieved LB 0.11161 with this approach.")