# Experiment 099: Per-Target Models with Different Architectures

**Approach from dabansherwani kernel**:
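- SM (hardest target): two HistGradientBoosting models, one trained on ACS_PCA features and one on Spange features
- Product 2/3 (easier targets): two ExtraTreesRegressor models, one trained on ACS_PCA features and one on Spange features
- Per-target blend of the two models: 0.65 * ACS_PCA model + 0.35 * Spange model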

**Hypothesis**: Different targets may benefit from different model types.
SM is consistently the hardest target and may need more regularization.

In [None]:
# Standard imports
import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import tqdm
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Seed NumPy's global RNG and make float64 the default torch dtype.
SEED = 42
np.random.seed(SEED)
torch.set_default_dtype(torch.float64)

print('Imports complete')

In [None]:
# Data loading functions
# Directory that contains the input CSV files.
DATA_PATH = '/home/data'

# Input column groups. The solvent-aware variants extend the two numeric
# process columns with the solvent identifier column(s).
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_SOLVENT = [*INPUT_LABELS_NUMERIC, "SOLVENT NAME"]
INPUT_LABELS_FULL_SOLVENT = [
    *INPUT_LABELS_NUMERIC,
    "SOLVENT A NAME",
    "SOLVENT B NAME",
    "SolventB%",
]

def load_data(name="full"):
    """Read one of the yield datasets and split it into inputs and targets.

    Args:
        name: ``"full"`` selects the full-data file (two solvents plus a
            ``SolventB%`` column); any other value selects the
            single-solvent file.

    Returns:
        Tuple ``(X, Y)`` of DataFrames: the input columns for the chosen
        dataset and the ``"Product 2"``, ``"Product 3"``, ``"SM"`` targets,
        in that column order.
    """
    use_full = name == "full"
    csv_path = (
        f'{DATA_PATH}/catechol_full_data_yields.csv'
        if use_full
        else f'{DATA_PATH}/catechol_single_solvent_yields.csv'
    )
    input_columns = (
        INPUT_LABELS_FULL_SOLVENT if use_full else INPUT_LABELS_SINGLE_SOLVENT
    )

    frame = pd.read_csv(csv_path)
    return frame[input_columns], frame[["Product 2", "Product 3", "SM"]]

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out train/test splits.

    Solvents are visited in sorted order of the distinct values found in
    ``X["SOLVENT NAME"]``; each one is held out exactly once.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
        the rows of the current solvent and the train part all other rows.
    """
    solvent_col = X["SOLVENT NAME"]
    for held_out in sorted(solvent_col.unique()):
        is_test = solvent_col == held_out
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield leave-one-ramp-out splits over (solvent A, solvent B) pairs.

    Distinct ``("SOLVENT A NAME", "SOLVENT B NAME")`` pairs are visited in
    order of first appearance in ``X``; each pair is held out exactly once.

    Yields:
        ``((train_X, train_Y), (test_X, test_Y))`` where the test part holds
        the rows of the current pair and the train part all other rows.
    """
    unique_pairs = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    pair_iter = zip(unique_pairs["SOLVENT A NAME"], unique_pairs["SOLVENT B NAME"])
    for solvent_a, solvent_b in pair_iter:
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        out_of_ramp = ~in_ramp
        yield (X[out_of_ramp], Y[out_of_ramp]), (X[in_ramp], Y[in_ramp])

# Status message marking the end of the data-loading cell.
print('Data loading functions defined')

In [None]:
# Load the two solvent descriptor lookup tables.
# index_col=0 turns the first CSV column into the row index; PerTargetModel
# later looks solvents up by name in that index.
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

# Report the (rows, columns) shape of each table as a quick sanity check.
print(f'Spange: {SPANGE_DF.shape}, ACS PCA: {ACS_PCA_DF.shape}')

In [None]:
# Per-Target Model (from dabansherwani kernel)
class PerTargetModel:
    """Single-target regressor built on one solvent descriptor table.

    The feature matrix is residence time, temperature and the solvent's
    descriptor vector. For mixtures it is residence time, temperature, the
    solvent-B fraction and the fraction-weighted blend of both solvents'
    descriptor vectors. The regressor is a scikit-learn ``Pipeline`` of a
    ``StandardScaler`` followed by a tree ensemble.

    Args:
        feature_table: ``'spange'`` or ``'acs_pca'``; selects the lookup
            table (``SPANGE_DF`` or ``ACS_PCA_DF``).
        base_type: ``'hgb'`` for ``HistGradientBoostingRegressor`` or
            ``'etr'`` for ``ExtraTreesRegressor``.
        mixed: ``True`` to build mixture features from the
            ``"SOLVENT A NAME"`` / ``"SOLVENT B NAME"`` / ``"SolventB%"``
            columns, ``False`` to use ``"SOLVENT NAME"``.

    Raises:
        ValueError: if ``feature_table`` or ``base_type`` is not one of the
            supported values. Previously any unknown string silently fell
            through to ACS_PCA / ExtraTrees, which hid typos.
    """

    _FEATURE_TABLES = ('spange', 'acs_pca')
    _BASE_TYPES = ('hgb', 'etr')

    def __init__(self, feature_table='spange', base_type='hgb', mixed=False):
        # Validate before touching the lookup tables so a typo fails loudly
        # instead of quietly training a different model.
        if feature_table not in self._FEATURE_TABLES:
            raise ValueError(
                f"feature_table must be one of {self._FEATURE_TABLES}, "
                f"got {feature_table!r}"
            )
        if base_type not in self._BASE_TYPES:
            raise ValueError(
                f"base_type must be one of {self._BASE_TYPES}, got {base_type!r}"
            )

        self.feature_table = feature_table
        self.base_type = base_type
        self.mixed = mixed
        self.lookup = SPANGE_DF if feature_table == 'spange' else ACS_PCA_DF
        self.model = None

    def _vec(self, s):
        """Return the descriptor vector for solvent ``s``.

        Solvents missing from the lookup index get an all-zero vector of the
        same width (kept from the original behaviour).
        """
        if s in self.lookup.index:
            return self.lookup.loc[s].values
        return np.zeros(self.lookup.shape[1])

    def _stack(self, names):
        """Stack descriptor vectors for ``names`` into an (n, d) array.

        ``np.vstack`` raises on an empty sequence, so an empty input is
        mapped to a correctly shaped ``(0, d)`` array instead.
        """
        rows = [self._vec(s) for s in names]
        if not rows:
            return np.empty((0, self.lookup.shape[1]))
        return np.vstack(rows)

    def _build_X(self, X):
        """Build the numeric feature matrix for the rows of ``X``.

        Returns:
            2-D array with columns ``[rt, temp, descriptors...]`` for single
            solvents, or ``[rt, temp, frac_b, blended descriptors...]`` for
            mixtures. A zero-row ``X`` yields a zero-row matrix.
        """
        rt = X["Residence Time"].values.reshape(-1, 1)
        temp = X["Temperature"].values.reshape(-1, 1)

        if not self.mixed:
            # Single solvent
            S = self._stack(X["SOLVENT NAME"])
            return np.hstack([rt, temp, S])

        # Mixed solvents. NOTE (from the original author): SolventB% is
        # already a fraction in [0, 1], so it is used without rescaling.
        frac_b = X["SolventB%"].values.reshape(-1, 1)
        A = self._stack(X["SOLVENT A NAME"])
        B = self._stack(X["SOLVENT B NAME"])
        # Linear interpolation between the two solvents' descriptors.
        mix = (1 - frac_b) * A + frac_b * B
        return np.hstack([rt, temp, frac_b, mix])

    def train_model(self, X, y):
        """Fit a fresh scaler + regressor pipeline on ``X`` and target ``y``.

        Args:
            X: DataFrame with the input columns required by ``_build_X``.
            y: single-column DataFrame or Series; it is flattened to 1-D.
        """
        Xf = self._build_X(X)

        if self.base_type == 'hgb':
            base = HistGradientBoostingRegressor(
                max_depth=7, max_iter=700, learning_rate=0.04, random_state=42
            )
        else:  # etr
            base = ExtraTreesRegressor(
                n_estimators=900, min_samples_leaf=2, random_state=42, n_jobs=-1
            )

        self.model = Pipeline([
            ('scaler', StandardScaler()),
            ('reg', base)
        ])
        self.model.fit(Xf, y.values.ravel())

    def predict(self, X):
        """Return a 1-D array of predictions for the rows of ``X``.

        A zero-row ``X`` returns an empty array without calling the fitted
        pipeline, because scikit-learn estimators reject zero-sample input.
        """
        Xf = self._build_X(X)
        if Xf.shape[0] == 0:
            return np.empty(0)
        return self.model.predict(Xf)

# Status message marking the end of the PerTargetModel cell.
print('PerTargetModel defined')

In [None]:
# Per-Target Ensemble Model
class PerTargetEnsembleModel:
    """Ensemble with a different base regressor per target.

    - SM (hardest): HistGradientBoosting
    - Product 2/3 (easier): ExtraTreesRegressor
    - Each target blends two models, one on ACS_PCA features and one on
      Spange features, with ``weights`` (default 0.65 / 0.35).

    Args:
        data: ``'full'`` builds mixture features; any other value builds
            single-solvent features.
        weights: ``(acs_pca_weight, spange_weight)`` used to blend the two
            per-target predictions. Defaults to ``(0.65, 0.35)``, the values
            that were previously hard-coded in ``predict``.

    Raises:
        ValueError: if ``weights`` does not contain exactly two values.
    """

    def __init__(self, data='single', weights=(0.65, 0.35)):
        weights = tuple(weights)
        if len(weights) != 2:
            raise ValueError(
                f"weights must hold exactly two values (acs_pca, spange), "
                f"got {len(weights)}"
            )
        self.weights = weights
        self.mixed = (data == 'full')
        self.targets = ["Product 2", "Product 3", "SM"]
        self.models = {}

        for t in self.targets:
            # SM is the hardest target and gets HistGradientBoosting;
            # Product 2/3 use ExtraTrees.
            base_type = 'hgb' if t == "SM" else 'etr'
            # Order matters: predict() expects [ACS_PCA model, Spange model].
            self.models[t] = [
                PerTargetModel('acs_pca', base_type, self.mixed),
                PerTargetModel('spange', base_type, self.mixed),
            ]

    def train_model(self, X, Y):
        """Train both models of every target on its own column of ``Y``."""
        for t in self.targets:
            y_single = Y[[t]]
            for m in self.models[t]:
                m.train_model(X, y_single)

    def predict(self, X):
        """Predict all targets with the weighted two-model blend.

        Returns:
            ``torch.double`` tensor of shape ``[N, 3]`` with columns in
            ``self.targets`` order, clipped to ``[0, 1]``.
        """
        w_acs, w_spange = self.weights
        columns = []

        for t in self.targets:
            acs_model, spange_model = self.models[t]
            p_acs = acs_model.predict(X)
            p_spange = spange_model.predict(X)
            blended = w_acs * p_acs + w_spange * p_spange
            columns.append(blended.reshape(-1, 1))

        pred = np.clip(np.hstack(columns), 0, 1)
        return torch.tensor(pred, dtype=torch.double)

# Static summary of the per-target configuration; the text is fixed and is
# not derived from a model instance.
print('PerTargetEnsembleModel defined')
print('  SM: HistGradientBoosting (ACS_PCA 0.65 + Spange 0.35)')
print('  P2/P3: ExtraTreesRegressor (ACS_PCA 0.65 + Spange 0.35)')

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

# Task 0: leave-one-solvent-out cross-validation on the single-solvent data.
X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    # A fresh model is built and trained for every fold.
    model = PerTargetEnsembleModel(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    # target_1..target_3 follow the model's output column order
    # (Product 2, Product 3, SM); "row" is the position within the fold.
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Task 1: leave-one-ramp-out cross-validation on the full (mixture) data,
# where a ramp is one (solvent A, solvent B) pair.
X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    # A fresh model is built and trained for every fold.
    model = PerTargetEnsembleModel(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    # target_1..target_3 follow the model's output column order
    # (Product 2, Product 3, SM); "row" is the position within the fold.
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Stack the task-0 and task-1 frames; each still carries its own row labels.
submission = pd.concat([submission_single_solvent, submission_full_data])
# reset_index() without drop=True renumbers the rows and keeps the previous
# labels as an "index" column.
submission = submission.reset_index()
submission.index.name = "id"
# index=True writes the "id" index as the first CSV column.
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
# Calculate CV score (for verification only - NOT part of submission)
# Reference scores and the linear CV->LB fit used in the report below.
BEST_PREVIOUS_CV = 0.008092
EXP_030_CV = 0.008298
LB_SLOPE = 4.36
LB_INTERCEPT = 0.052

# Reload the ground truth for both tasks.
X_single, Y_single = load_data("single_solvent")
X_full, Y_full = load_data("full")

# Stack held-out targets fold by fold, visiting folds in the same order as
# the split generators so the rows line up with the stored predictions.
solvent_names = X_single["SOLVENT NAME"]
actuals_single = np.vstack([
    Y_single[solvent_names == solvent].values
    for solvent in sorted(solvent_names.unique())
])

ramps = X_full[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
actuals_full = np.vstack([
    Y_full[(X_full["SOLVENT A NAME"] == a) & (X_full["SOLVENT B NAME"] == b)].values
    for a, b in zip(ramps["SOLVENT A NAME"], ramps["SOLVENT B NAME"])
])

# Predictions, in the same target column order as the actuals.
target_cols = ['target_1', 'target_2', 'target_3']
preds_single = submission_single_solvent[target_cols].values
preds_full = submission_full_data[target_cols].values

# Per-task MSE over all rows and targets, then a row-count-weighted average.
n_single, n_full = len(actuals_single), len(actuals_full)
mse_single = np.mean((actuals_single - preds_single) ** 2)
mse_full = np.mean((actuals_full - preds_full) ** 2)
overall_mse = (mse_single * n_single + mse_full * n_full) / (n_single + n_full)

print(f'\n=== CV SCORE VERIFICATION ===')
print(f'Single Solvent MSE: {mse_single:.6f} (n={n_single})')
print(f'Full Data MSE: {mse_full:.6f} (n={n_full})')
print(f'Overall MSE: {overall_mse:.6f}')
print(f'\nBest previous CV: 0.008092 (CatBoost+XGBoost)')
print(f'Best previous LB: 0.0877 (GP+MLP+LGBM)')
print(f'exp_030 baseline (GP+MLP+LGBM): CV 0.008298')
print(f'\nThis (Per-Target Ensemble): CV {overall_mse:.6f}')

# Compare against the best CV first, then against the exp_030 baseline.
if overall_mse < BEST_PREVIOUS_CV:
    improvement = (BEST_PREVIOUS_CV - overall_mse) / BEST_PREVIOUS_CV * 100
    print(f'\n✓ IMPROVEMENT: {improvement:.2f}% better than best CV!')
elif overall_mse < EXP_030_CV:
    improvement = (EXP_030_CV - overall_mse) / EXP_030_CV * 100
    print(f'\n✓ IMPROVEMENT: {improvement:.2f}% better than exp_030!')
else:
    degradation = (overall_mse - EXP_030_CV) / EXP_030_CV * 100
    print(f'\n✗ WORSE: {degradation:.2f}% worse than exp_030')

# Predicted LB based on CV-LB relationship
predicted_lb = LB_SLOPE * overall_mse + LB_INTERCEPT
print(f'\nPredicted LB (based on CV-LB relationship): {predicted_lb:.4f}')
print(f'Best LB so far: 0.0877')