# Tree-Based Per-Target Ensemble Model

This notebook follows the EXACT template structure required by the competition.
Only the model definition line is changed in the last 3 cells.

Features:
1. Arrhenius kinetics features (inv_temp, log_time, interaction)
2. Spange descriptors + ACS PCA descriptors combined
3. Per-target models: HistGradientBoosting for SM, ExtraTrees for Products
4. Symmetry TTA for mixed solvents
5. Reproducibility seeds

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from abc import ABC, abstractmethod
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set seeds for reproducibility
import random
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
    torch.backends.cudnn.deterministic = True

torch.set_default_dtype(torch.double)
print("Imports complete")

Imports complete


In [2]:
# Data loading utilities - adapted for local paths
DATA_PATH = '/home/data'

INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_FEATURES = ["SOLVENT NAME"]
INPUT_LABELS_FULL_FEATURES = ["SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    assert name in ["full", "single_solvent"]
    if name == "full":
        df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        X = df[INPUT_LABELS_FULL_SOLVENT]
    else:
        df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        X = df[INPUT_LABELS_SINGLE_SOLVENT]
    Y = df[TARGET_LABELS]
    return X, Y

def load_features(name="spange_descriptors"):
    return pd.read_csv(f'{DATA_PATH}/{name}_lookup.csv', index_col=0)

def generate_leave_one_out_splits(X, Y):
    for solvent in sorted(X["SOLVENT NAME"].unique()):
        mask = X["SOLVENT NAME"] != solvent
        yield (X[mask], Y[mask]), (X[~mask], Y[~mask])

def generate_leave_one_ramp_out_splits(X, Y):
    ramps = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for _, row in ramps.iterrows():
        mask = ~((X["SOLVENT A NAME"] == row["SOLVENT A NAME"]) & (X["SOLVENT B NAME"] == row["SOLVENT B NAME"]))
        yield (X[mask], Y[mask]), (X[~mask], Y[~mask])

print("Data utilities loaded")

Data utilities loaded


In [3]:
# Base classes (from template)
class SmilesFeaturizer(ABC):
    def __init__(self):
        raise NotImplementedError

    def featurize(X, Y):
        raise NotImplementedError

class BaseModel(ABC):
    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        raise NotImplementedError

    def predict(self):
        raise NotImplementedError

print("Base classes defined")

Base classes defined


In [4]:
# Load feature lookup tables
SPANGE_DF = load_features('spange_descriptors')
ACS_PCA_DF = load_features('acs_pca_descriptors')
print(f"Spange descriptors: {SPANGE_DF.shape}")
print(f"ACS PCA descriptors: {ACS_PCA_DF.shape}")

Spange descriptors: (26, 13)
ACS PCA descriptors: (24, 5)


In [5]:
# Tree-based Per-Target Ensemble Model
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler

class TreeEnsembleModel(BaseModel):
    """Tree-based per-target ensemble with Arrhenius features and symmetry TTA."""
    
    def __init__(self, data='single'):
        super().__init__()
        self.data_type = data
        self.spange_df = SPANGE_DF
        self.acs_pca_df = ACS_PCA_DF
        self.scaler = StandardScaler()
        self.models = {}  # Per-target models
        
    def _create_features(self, X, flip=False):
        """Create feature matrix with Arrhenius kinetics and solvent descriptors."""
        # Numeric features
        time_m = X["Residence Time"].values.reshape(-1, 1)
        temp_c = X["Temperature"].values.reshape(-1, 1)
        
        # Arrhenius kinetics features
        temp_k = temp_c + 273.15
        inv_temp = 1000.0 / temp_k
        log_time = np.log(time_m + 1e-6)
        interaction = inv_temp * log_time
        
        # Solvent features
        if self.data_type == 'full':
            # Mixed solvent: weighted average
            pct = X["SolventB%"].values.reshape(-1, 1)
            
            if flip:
                # Symmetry flip: swap A and B
                spange_A = self.spange_df.loc[X["SOLVENT B NAME"]].values
                spange_B = self.spange_df.loc[X["SOLVENT A NAME"]].values
                acs_A = self.acs_pca_df.loc[X["SOLVENT B NAME"]].values
                acs_B = self.acs_pca_df.loc[X["SOLVENT A NAME"]].values
                pct_use = 1 - pct
            else:
                spange_A = self.spange_df.loc[X["SOLVENT A NAME"]].values
                spange_B = self.spange_df.loc[X["SOLVENT B NAME"]].values
                acs_A = self.acs_pca_df.loc[X["SOLVENT A NAME"]].values
                acs_B = self.acs_pca_df.loc[X["SOLVENT B NAME"]].values
                pct_use = pct
            
            spange_feat = spange_A * (1 - pct_use) + spange_B * pct_use
            acs_feat = acs_A * (1 - pct_use) + acs_B * pct_use
        else:
            # Single solvent
            spange_feat = self.spange_df.loc[X["SOLVENT NAME"]].values
            acs_feat = self.acs_pca_df.loc[X["SOLVENT NAME"]].values
        
        # Combine all features
        features = np.hstack([
            time_m, temp_c,           # Original numeric (2)
            inv_temp, log_time, interaction,  # Arrhenius (3)
            spange_feat,              # Spange descriptors (13)
            acs_feat                  # ACS PCA descriptors (5)
        ])
        
        return features
    
    def train_model(self, X_train, y_train):
        """Train per-target models."""
        # Create features
        X_feat = self._create_features(X_train, flip=False)
        
        # Data augmentation for mixed solvents
        if self.data_type == 'full':
            X_flip = self._create_features(X_train, flip=True)
            X_feat = np.vstack([X_feat, X_flip])
            y_train_aug = pd.concat([y_train, y_train], ignore_index=True)
        else:
            y_train_aug = y_train
        
        # Scale features
        X_scaled = self.scaler.fit_transform(X_feat)
        
        # Train per-target models
        # SM: HistGradientBoosting (better for smooth targets)
        self.models['SM'] = HistGradientBoostingRegressor(
            max_depth=7, max_iter=700, learning_rate=0.04,
            random_state=42, early_stopping=False
        )
        self.models['SM'].fit(X_scaled, y_train_aug['SM'].values)
        
        # Product 2 & 3: ExtraTrees (better for noisy targets)
        self.models['Product 2'] = ExtraTreesRegressor(
            n_estimators=500, min_samples_leaf=2,
            random_state=42, n_jobs=-1
        )
        self.models['Product 2'].fit(X_scaled, y_train_aug['Product 2'].values)
        
        self.models['Product 3'] = ExtraTreesRegressor(
            n_estimators=500, min_samples_leaf=2,
            random_state=42, n_jobs=-1
        )
        self.models['Product 3'].fit(X_scaled, y_train_aug['Product 3'].values)
    
    def predict(self, X_test):
        """Predict with symmetry TTA for mixed solvents."""
        # Standard prediction
        X_feat = self._create_features(X_test, flip=False)
        X_scaled = self.scaler.transform(X_feat)
        
        pred_p2 = self.models['Product 2'].predict(X_scaled)
        pred_p3 = self.models['Product 3'].predict(X_scaled)
        pred_sm = self.models['SM'].predict(X_scaled)
        
        if self.data_type == 'full':
            # TTA: Also predict with flipped inputs
            X_flip = self._create_features(X_test, flip=True)
            X_flip_scaled = self.scaler.transform(X_flip)
            
            pred_p2_flip = self.models['Product 2'].predict(X_flip_scaled)
            pred_p3_flip = self.models['Product 3'].predict(X_flip_scaled)
            pred_sm_flip = self.models['SM'].predict(X_flip_scaled)
            
            # Average predictions
            pred_p2 = (pred_p2 + pred_p2_flip) / 2
            pred_p3 = (pred_p3 + pred_p3_flip) / 2
            pred_sm = (pred_sm + pred_sm_flip) / 2
        
        # Clip to [0, 1]
        pred_p2 = np.clip(pred_p2, 0, 1)
        pred_p3 = np.clip(pred_p3, 0, 1)
        pred_sm = np.clip(pred_sm, 0, 1)
        
        # Stack predictions: [Product 2, Product 3, SM]
        predictions = np.column_stack([pred_p2, pred_p3, pred_sm])
        return torch.tensor(predictions)

print("TreeEnsembleModel defined")

TreeEnsembleModel defined


In [6]:
# Quick test
print("Testing model...")
X, Y = load_data("single_solvent")
print(f"Single solvent: {X.shape}, {Y.shape}")

# Test on first fold
split_gen = generate_leave_one_out_splits(X, Y)
(train_X, train_Y), (test_X, test_Y) = next(split_gen)

model = TreeEnsembleModel(data='single')
model.train_model(train_X, train_Y)
preds = model.predict(test_X)
print(f"Predictions shape: {preds.shape}")
print(f"Sample predictions: {preds[:3]}")

Testing model...
Single solvent: (656, 3), (656, 3)


Predictions shape: torch.Size([37, 3])
Sample predictions: tensor([[0.0035, 0.0042, 0.8157],
        [0.0055, 0.0064, 0.8633],
        [0.0276, 0.0314, 0.7959]])


In [7]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []
all_actuals = []  # For CV calculation

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=24):
    (train_X, train_Y), (test_X, test_Y) = split

    model = TreeEnsembleModel(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()
    all_actuals.append(test_Y.values)  # For CV calculation

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

# Calculate CV score
all_actuals_np = np.vstack(all_actuals)
all_preds_np = np.array([[p['target_1'], p['target_2'], p['target_3']] for p in all_predictions])
single_mse = np.mean((all_actuals_np - all_preds_np) ** 2)
print(f"\nSingle Solvent CV MSE: {single_mse:.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [00:01<00:25,  1.11s/it]

  8%|▊         | 2/24 [00:02<00:24,  1.12s/it]

 12%|█▎        | 3/24 [00:03<00:23,  1.13s/it]

 17%|█▋        | 4/24 [00:04<00:23,  1.16s/it]

 21%|██        | 5/24 [00:05<00:21,  1.15s/it]

 25%|██▌       | 6/24 [00:06<00:20,  1.15s/it]

 29%|██▉       | 7/24 [00:08<00:19,  1.16s/it]

 33%|███▎      | 8/24 [00:09<00:18,  1.15s/it]

 38%|███▊      | 9/24 [00:10<00:16,  1.13s/it]

 42%|████▏     | 10/24 [00:11<00:15,  1.13s/it]

 46%|████▌     | 11/24 [00:12<00:14,  1.12s/it]

 50%|█████     | 12/24 [00:13<00:13,  1.10s/it]

 54%|█████▍    | 13/24 [00:14<00:12,  1.12s/it]

 58%|█████▊    | 14/24 [00:15<00:11,  1.12s/it]

 62%|██████▎   | 15/24 [00:16<00:10,  1.13s/it]

 67%|██████▋   | 16/24 [00:18<00:08,  1.12s/it]

 71%|███████   | 17/24 [00:19<00:07,  1.12s/it]

 75%|███████▌  | 18/24 [00:20<00:06,  1.12s/it]

 79%|███████▉  | 19/24 [00:21<00:05,  1.13s/it]

 83%|████████▎ | 20/24 [00:22<00:04,  1.12s/it]

 88%|████████▊ | 21/24 [00:23<00:03,  1.12s/it]

 92%|█████████▏| 22/24 [00:24<00:02,  1.13s/it]

 96%|█████████▌| 23/24 [00:25<00:01,  1.13s/it]

100%|██████████| 24/24 [00:27<00:00,  1.12s/it]

100%|██████████| 24/24 [00:27<00:00,  1.13s/it]


Single Solvent CV MSE: 0.011227





In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions_full = []
all_actuals_full = []  # For CV calculation

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=13):
    (train_X, train_Y), (test_X, test_Y) = split

    model = TreeEnsembleModel(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()
    all_actuals_full.append(test_Y.values)  # For CV calculation

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions_full.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions_full)

# Calculate CV score
all_actuals_full_np = np.vstack(all_actuals_full)
all_preds_full_np = np.array([[p['target_1'], p['target_2'], p['target_3']] for p in all_predictions_full])
full_mse = np.mean((all_actuals_full_np - all_preds_full_np) ** 2)
print(f"\nFull Data CV MSE: {full_mse:.6f}")

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

  0%|          | 0/13 [00:00<?, ?it/s]

  8%|▊         | 1/13 [00:01<00:21,  1.76s/it]

 15%|█▌        | 2/13 [00:03<00:18,  1.72s/it]

 23%|██▎       | 3/13 [00:05<00:17,  1.75s/it]

 31%|███       | 4/13 [00:06<00:15,  1.73s/it]

 38%|███▊      | 5/13 [00:08<00:13,  1.74s/it]

 46%|████▌     | 6/13 [00:10<00:12,  1.74s/it]

 54%|█████▍    | 7/13 [00:12<00:10,  1.75s/it]

 62%|██████▏   | 8/13 [00:13<00:08,  1.75s/it]

 69%|██████▉   | 9/13 [00:15<00:06,  1.75s/it]

 77%|███████▋  | 10/13 [00:17<00:05,  1.76s/it]

 85%|████████▍ | 11/13 [00:19<00:03,  1.76s/it]

 92%|█████████▏| 12/13 [00:21<00:01,  1.76s/it]

100%|██████████| 13/13 [00:22<00:00,  1.77s/it]

100%|██████████| 13/13 [00:22<00:00,  1.75s/it]


Full Data CV MSE: 0.010857





In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

# Calculate overall CV score
total_samples = len(all_actuals_np) + len(all_actuals_full_np)
overall_mse = (single_mse * len(all_actuals_np) + full_mse * len(all_actuals_full_np)) / total_samples

print(f"\n=== FINAL RESULTS ===")
print(f"Single Solvent MSE: {single_mse:.6f}")
print(f"Full Data MSE: {full_mse:.6f}")
print(f"Overall CV MSE: {overall_mse:.6f}")
print(f"\nSubmission saved to /home/submission/submission.csv")
print(f"Submission shape: {submission.shape}")

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################


=== FINAL RESULTS ===
Single Solvent MSE: 0.011227
Full Data MSE: 0.010857
Overall CV MSE: 0.010986

Submission saved to /home/submission/submission.csv
Submission shape: (1883, 7)
