# Per-Target Ensemble Model

**Hypothesis**: Training separate models for SM vs Products may improve performance because:
- SM has a different distribution (mean 0.52, std 0.36) than the Products (mean ~0.13, std ~0.14)
- Product 2 and Product 3 are highly correlated (0.923)
- Competition explicitly allows "different hyper-parameters for different objectives"

**Approach**:
- SM model: MLP[64,32] + LightGBM (larger architecture for higher-variance target)
- Product model: MLP[32,16] + LightGBM (shared model for correlated P2/P3)

**Baseline to beat**: exp_024 CV 0.008689, LB 0.0893

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
import tqdm
import warnings
import sys
# Suppress every Python warning so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Fix the NumPy and PyTorch seeds and request deterministic cuDNN behaviour
# (autotuner disabled) so repeated runs are comparable.
np.random.seed(42)
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# From here on, newly created floating-point tensors and module parameters
# default to float64.
torch.set_default_dtype(torch.double)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Data loading functions
# Directory that holds the yield tables and descriptor lookup CSVs.
DATA_PATH = '/home/data'

# Numeric columns present in both tables; the featurizer derives its kinetic
# features from these two.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Input columns selected by load_data() for the single-solvent table.
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
# Input columns selected by load_data() for the full (solvent A / solvent B) table.
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

def load_data(name="full"):
    """Read one yield table and split it into input and target frames.

    Args:
        name: ``"full"`` selects the solvent-A/solvent-B table; any other
            value selects the single-solvent table.

    Returns:
        ``(X, Y)`` where ``X`` holds the input columns for the chosen table
        and ``Y`` holds the ``"Product 2"``, ``"Product 3"`` and ``"SM"``
        columns, in that order.
    """
    if name == "full":
        table = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        input_columns = INPUT_LABELS_FULL_SOLVENT
    else:
        table = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        input_columns = INPUT_LABELS_SINGLE_SOLVENT
    inputs = table[input_columns]
    targets = table[["Product 2", "Product 3", "SM"]]
    return inputs, targets

def generate_leave_one_out_splits(X, Y):
    """Yield one leave-one-solvent-out fold per distinct ``"SOLVENT NAME"``.

    Solvents are visited in sorted order.  Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    exactly the rows of the held-out solvent.
    """
    held_out_order = sorted(X["SOLVENT NAME"].unique())
    for held_out in held_out_order:
        is_train = X["SOLVENT NAME"] != held_out
        is_test = ~is_train
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one fold per distinct (solvent A, solvent B) pair.

    Pairs are visited in order of first appearance in ``X``.  Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    exactly the rows whose A and B names both match the held-out pair.
    """
    ramps = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for name_a, name_b in zip(ramps["SOLVENT A NAME"], ramps["SOLVENT B NAME"]):
        in_ramp = (X["SOLVENT A NAME"] == name_a) & (X["SOLVENT B NAME"] == name_b)
        outside = ~in_ramp
        yield (X[outside], Y[outside]), (X[in_ramp], Y[in_ramp])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Load feature lookups INCLUDING ACS PCA
# Each lookup table uses its first CSV column as the index; the featurizer
# later selects rows from these tables by solvent name via `.loc`.
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFP_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

# Keep only DRFP columns whose variance is strictly positive, i.e. drop
# columns that are constant across all rows (no variance threshold beyond 0).
drfp_variance = DRFP_DF.var()
nonzero_variance_cols = drfp_variance[drfp_variance > 0].index.tolist()
DRFP_FILTERED = DRFP_DF[nonzero_variance_cols]

# Report table shapes and the resulting feature count; the literal 5 is the
# number of kinetic columns (2 raw + 3 derived) built by the featurizer.
print(f'Spange: {SPANGE_DF.shape}, DRFP filtered: {DRFP_FILTERED.shape}, ACS PCA: {ACS_PCA_DF.shape}')
print(f'Total features: 5 (kinetic) + {SPANGE_DF.shape[1]} (Spange) + {DRFP_FILTERED.shape[1]} (DRFP) + {ACS_PCA_DF.shape[1]} (ACS PCA) = {5 + SPANGE_DF.shape[1] + DRFP_FILTERED.shape[1] + ACS_PCA_DF.shape[1]}')

Spange: (26, 13), DRFP filtered: (24, 122), ACS PCA: (24, 5)
Total features: 5 (kinetic) + 13 (Spange) + 122 (DRFP) + 5 (ACS PCA) = 145


In [4]:
# Combined Featurizer with Arrhenius kinetics AND ACS PCA
class ACSPCAFeaturizer:
    """Encode reaction rows as kinetic terms plus solvent descriptors.

    Every row becomes five kinetic columns (the two raw numeric inputs,
    ``1000 / (T + 273.15)``, ``ln(time + 1e-6)`` and the product of those
    two) followed by the Spange, filtered DRFP and ACS-PCA descriptors.

    With ``mixed=True`` the descriptors of solvent A and solvent B are
    blended linearly using the ``"SolventB%"`` column; otherwise the
    descriptors of ``"SOLVENT NAME"`` are used directly.
    """

    def __init__(self, mixed=False):
        self.mixed = mixed
        self.spange_df = SPANGE_DF
        self.drfp_df = DRFP_FILTERED
        self.acs_pca_df = ACS_PCA_DF
        descriptor_tables = (self.spange_df, self.drfp_df, self.acs_pca_df)
        # 2 raw numeric inputs + 3 derived kinetic terms + all descriptor columns.
        self.feats_dim = 2 + 3 + sum(table.shape[1] for table in descriptor_tables)

    def featurize(self, X, flip=False):
        """Return the feature matrix for ``X`` as a NumPy array.

        ``flip`` is only consulted when ``mixed`` is true.

        NOTE(review): the ``flip=True`` blend ``B * (1 - (1 - pct)) + A * (1 - pct)``
        is algebraically the same mixture as the ``flip=False`` blend
        ``A * (1 - pct) + B * pct``; the two can differ only by floating-point
        rounding. Confirm whether a different augmentation was intended.
        """
        numeric = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        residence_time = numeric[:, 0:1]
        temperature_c = numeric[:, 1:2]
        # The 273.15 offset treats the temperature column as degrees Celsius.
        inv_temp = 1000.0 / (temperature_c + 273.15)
        log_time = np.log(residence_time + 1e-6)
        kinetic = np.hstack([numeric, inv_temp, log_time, inv_temp * log_time])

        tables = (self.spange_df, self.drfp_df, self.acs_pca_df)

        if not self.mixed:
            descriptors = [table.loc[X["SOLVENT NAME"]].values for table in tables]
            return np.hstack([kinetic, *descriptors])

        # Look up the A and B descriptor rows for every table first, then blend.
        solvent_pairs = [
            (table.loc[X["SOLVENT A NAME"]].values, table.loc[X["SOLVENT B NAME"]].values)
            for table in tables
        ]
        pct = X["SolventB%"].values.reshape(-1, 1)
        if flip:
            descriptors = [
                desc_b * (1 - (1-pct)) + desc_a * (1-pct)
                for desc_a, desc_b in solvent_pairs
            ]
        else:
            descriptors = [
                desc_a * (1 - pct) + desc_b * pct
                for desc_a, desc_b in solvent_pairs
            ]
        return np.hstack([kinetic, *descriptors])

    def featurize_torch(self, X, flip=False):
        """Return ``featurize(X, flip)`` wrapped in a ``torch`` tensor."""
        features = self.featurize(X, flip)
        return torch.tensor(features)

print(f'Feature dimension: {ACSPCAFeaturizer().feats_dim}')

Feature dimension: 145


In [5]:
# MLP Model with configurable output dimension
class MLPModelInternal(nn.Module):
    """Feed-forward regressor whose outputs lie in the open interval (0, 1).

    Layout: ``BatchNorm1d(input_dim)``, then for every hidden width a
    ``Linear -> BatchNorm1d -> ReLU -> Dropout`` group, then a final
    ``Linear`` to ``output_dim`` followed by ``Sigmoid``.

    Args:
        input_dim: Number of input features per row.
        hidden_dims: Iterable of hidden-layer widths, applied in order.
        output_dim: Number of regression targets.
        dropout: Dropout probability used after every hidden activation.
    """

    # The default is an immutable tuple so it cannot be shared and mutated
    # across instances; any iterable of ints is still accepted.
    def __init__(self, input_dim, hidden_dims=(32, 16), output_dim=3, dropout=0.05):
        super().__init__()
        layers = [nn.BatchNorm1d(input_dim)]
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev_dim = h_dim
        layers.append(nn.Linear(prev_dim, output_dim))
        # Sigmoid bounds every prediction to (0, 1).
        layers.append(nn.Sigmoid())
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the network to a batch ``x`` and return the predictions."""
        return self.net(x)

print('MLPModelInternal defined')

MLPModelInternal defined


In [6]:
# Single-Target MLP Ensemble (for SM or Products separately)
class SingleTargetMLPEnsemble:
    """Seed-varied ensemble of ``MLPModelInternal`` networks for one target group.

    Args:
        hidden_dims: Iterable of hidden-layer widths passed to every member.
        n_models: Number of independently seeded members.
        data: ``'full'`` enables the mixed-solvent featurizer and the flip
            augmentation; any other value uses the single-solvent featurizer.
        output_dim: Number of target columns predicted jointly.
    """

    def __init__(self, hidden_dims=(32, 16), n_models=5, data='single', output_dim=1):
        # Copy into a fresh list so neither the default nor the caller's
        # sequence is shared with this instance.
        self.hidden_dims = list(hidden_dims)
        self.n_models = n_models
        self.data_type = data
        self.output_dim = output_dim
        self.featurizer = ACSPCAFeaturizer(mixed=(data=='full'))
        self.models = []

    def train_model(self, X_train, y_train, epochs=200, batch_size=32, lr=5e-4):
        """Fit ``n_models`` networks from scratch, replacing any previous ones.

        Args:
            X_train: Input frame understood by the featurizer.
            y_train: Targets as a DataFrame, or any array-like that can be
                reshaped to ``(-1, output_dim)``.
            epochs: Passes over the training set per member.
            batch_size: Mini-batch size.
            lr: Initial Adam learning rate.
        """
        X_std = self.featurizer.featurize_torch(X_train, flip=False)
        if isinstance(y_train, pd.DataFrame):
            y_array = y_train.values
        else:
            # np.asarray lets Series and lists through as well as ndarrays.
            y_array = np.asarray(y_train).reshape(-1, self.output_dim)
        # The networks are cast to float64 below; keep the targets in step.
        y_vals = torch.tensor(y_array, dtype=torch.double)

        if self.data_type == 'full':
            # Augment with the flipped featurization of the same rows.
            X_flip = self.featurizer.featurize_torch(X_train, flip=True)
            X_all = torch.cat([X_std, X_flip], dim=0)
            y_all = torch.cat([y_vals, y_vals], dim=0)
        else:
            X_all, y_all = X_std, y_vals

        input_dim = X_all.shape[1]

        # The dataset and loss do not depend on the member, so build them once.
        dataset = TensorDataset(X_all.to(device), y_all.to(device))
        criterion = nn.HuberLoss()

        # BatchNorm1d raises in training mode on a batch of one row.  Drop the
        # trailing batch only when it would contain exactly one sample; in
        # every other case batching is unchanged.
        n_samples = len(dataset)
        drop_last = n_samples > batch_size and n_samples % batch_size == 1

        self.models = []
        for i in range(self.n_models):
            # Distinct, reproducible seed per member (affects weight
            # initialisation and shuffle order).
            torch.manual_seed(42 + i * 13)
            np.random.seed(42 + i * 13)

            model = MLPModelInternal(input_dim, self.hidden_dims, output_dim=self.output_dim).to(device).double()
            optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=20)
            loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last)

            model.train()
            for _ in range(epochs):
                epoch_loss = 0.0
                for batch_X, batch_y in loader:
                    optimizer.zero_grad()
                    pred = model(batch_X)
                    loss = criterion(pred, batch_y)
                    loss.backward()
                    optimizer.step()
                    epoch_loss += loss.item()
                # The scheduler is driven by the mean training loss of the epoch.
                scheduler.step(epoch_loss / len(loader))

            model.eval()
            self.models.append(model)

    def predict(self, X_test):
        """Return the mean prediction of all members as a CPU tensor.

        For ``data='full'`` each member's prediction is first averaged over
        the standard and flipped featurizations of ``X_test``.

        Raises:
            RuntimeError: If no trained members are available.
        """
        if not self.models:
            raise RuntimeError('SingleTargetMLPEnsemble.predict called before train_model')

        X_feat = self.featurizer.featurize_torch(X_test, flip=False).to(device)

        if self.data_type == 'full':
            X_flip = self.featurizer.featurize_torch(X_test, flip=True).to(device)

        all_preds = []
        with torch.no_grad():
            for model in self.models:
                pred = model(X_feat)
                if self.data_type == 'full':
                    pred_flip = model(X_flip)
                    pred = (pred + pred_flip) / 2
                all_preds.append(pred)

        return torch.stack(all_preds).mean(dim=0).cpu()

print('SingleTargetMLPEnsemble defined')

SingleTargetMLPEnsemble defined


In [7]:
# Single-Target LightGBM Wrapper
class SingleTargetLGBMWrapper:
    """One LightGBM regressor per target column behind a train/predict API.

    ``data='full'`` selects the mixed-solvent featurizer and additionally
    uses the flipped featurization for training rows and for prediction
    averaging; any other value uses the single-solvent featurizer.
    """

    def __init__(self, data='single', output_dim=1):
        self.models = []
        self.output_dim = output_dim
        self.data_type = data
        self.featurizer = ACSPCAFeaturizer(mixed=(data=='full'))

    def train_model(self, X_train, y_train):
        """Fit a fresh booster for each of the ``output_dim`` target columns.

        ``y_train`` may be a DataFrame or an array that can be reshaped to
        ``(-1, output_dim)``.
        """
        features = self.featurizer.featurize(X_train, flip=False)
        targets = (
            y_train.values
            if isinstance(y_train, pd.DataFrame)
            else y_train.reshape(-1, self.output_dim)
        )

        if self.data_type == 'full':
            # Stack the flipped featurization under the standard one.
            flipped = self.featurizer.featurize(X_train, flip=True)
            features = np.vstack([features, flipped])
            targets = np.vstack([targets, targets])

        booster_params = dict(
            objective='regression',
            metric='mse',
            boosting_type='gbdt',
            num_leaves=31,
            learning_rate=0.05,
            feature_fraction=0.9,
            bagging_fraction=0.8,
            bagging_freq=5,
            verbose=-1,
            seed=42,
        )

        self.models = []
        for target_idx in range(self.output_dim):
            column_data = lgb.Dataset(features, label=targets[:, target_idx])
            booster = lgb.train(booster_params, column_data, num_boost_round=100)
            self.models.append(booster)

    def predict(self, X_test):
        """Return a tensor with one prediction column per trained booster."""
        views = [self.featurizer.featurize(X_test, flip=False)]
        if self.data_type == 'full':
            views.append(self.featurizer.featurize(X_test, flip=True))

        columns = []
        for booster in self.models:
            outputs = [booster.predict(view) for view in views]
            if len(outputs) == 2:
                # Average the standard and flipped predictions.
                columns.append((outputs[0] + outputs[1]) / 2)
            else:
                columns.append(outputs[0])

        return torch.tensor(np.column_stack(columns))

print('SingleTargetLGBMWrapper defined')

SingleTargetLGBMWrapper defined


In [8]:
# Per-Target Ensemble: Separate models for SM vs Products
class PerTargetEnsemble:
    """Blend of separate MLP + LightGBM models for SM and for the two products.

    The SM branch uses an MLP with hidden widths [64, 32] and one output; the
    product branch uses hidden widths [32, 16] and predicts Product 2 and
    Product 3 jointly.  Within each branch the final prediction is
    ``mlp_weight * MLP + lgbm_weight * LightGBM``.
    """

    def __init__(self, data='single'):
        self.data_type = data

        # Starting-material branch: single output, wider MLP.
        self.sm_mlp = SingleTargetMLPEnsemble(
            hidden_dims=[64, 32], n_models=5, data=data, output_dim=1
        )
        self.sm_lgbm = SingleTargetLGBMWrapper(data=data, output_dim=1)

        # Product branch: both products share one two-output model.
        self.product_mlp = SingleTargetMLPEnsemble(
            hidden_dims=[32, 16], n_models=5, data=data, output_dim=2
        )
        self.product_lgbm = SingleTargetLGBMWrapper(data=data, output_dim=2)

        # Fixed blend weights applied in predict().
        self.mlp_weight, self.lgbm_weight = 0.6, 0.4

    def train_model(self, X_train, y_train):
        """Train the SM branch, then the product branch, on their own columns."""
        branches = (
            (['SM'], self.sm_mlp, self.sm_lgbm),
            (['Product 2', 'Product 3'], self.product_mlp, self.product_lgbm),
        )
        for target_columns, mlp, gbm in branches:
            branch_targets = y_train[target_columns]
            mlp.train_model(X_train, branch_targets)
            gbm.train_model(X_train, branch_targets)

    def predict(self, X_test):
        """Return predictions with columns ordered [Product 2, Product 3, SM]."""

        def blend(mlp, gbm):
            # Weighted sum of the two model families for one branch.
            return self.mlp_weight * mlp.predict(X_test) + self.lgbm_weight * gbm.predict(X_test)

        sm_block = blend(self.sm_mlp, self.sm_lgbm)
        product_block = blend(self.product_mlp, self.product_lgbm)

        # Products first, SM last, matching the target order used elsewhere.
        return torch.cat([product_block, sm_block], dim=1)

print('PerTargetEnsemble defined')

PerTargetEnsemble defined


In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = PerTargetEnsemble(data='single')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [01:34, 94.26s/it]

2it [03:07, 93.47s/it]

3it [04:35, 91.32s/it]

4it [06:03, 90.00s/it]

5it [07:36, 90.79s/it]

6it [09:08, 91.27s/it]

7it [10:40, 91.63s/it]

8it [12:13, 91.87s/it]

9it [13:45, 92.05s/it]

10it [15:19, 92.52s/it]

11it [16:52, 92.74s/it]

12it [18:25, 92.94s/it]

13it [19:59, 93.13s/it]

14it [21:32, 93.11s/it]

15it [23:06, 93.37s/it]

16it [24:40, 93.48s/it]

17it [26:18, 94.82s/it]

18it [27:51, 94.47s/it]

19it [29:25, 94.14s/it]

20it [30:59, 94.11s/it]

21it [32:32, 93.87s/it]

22it [34:05, 93.77s/it]

23it [35:39, 93.76s/it]

24it [37:13, 93.67s/it]

24it [37:13, 93.05s/it]




In [10]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = PerTargetEnsemble(data='full')  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [05:25, 325.29s/it]

2it [10:47, 323.31s/it]

3it [16:18, 327.06s/it]

4it [21:41, 325.46s/it]

5it [27:06, 325.21s/it]

6it [32:31, 325.03s/it]

7it [37:53, 324.02s/it]

8it [43:17, 324.24s/it]

9it [48:38, 323.07s/it]

10it [54:25, 330.37s/it]

11it [1:00:32, 341.83s/it]

12it [1:07:03, 356.59s/it]

13it [1:13:32, 366.35s/it]

13it [1:13:32, 339.39s/it]




In [11]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("/home/submission/submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [12]:
# Calculate CV score (for verification only - not part of template)
# Reload both tables so the ground truth can be compared with the stored
# submission frames.
X_single, Y_single = load_data("single_solvent")
X_full, Y_full = load_data("full")

# Get actuals in same order as predictions
# Folds were produced per solvent in sorted order, so rebuilding the actuals
# with the same iteration lines them up row-for-row with the predictions.
actuals_single = []
for solvent in sorted(X_single["SOLVENT NAME"].unique()):
    mask = X_single["SOLVENT NAME"] == solvent
    actuals_single.append(Y_single[mask].values)
actuals_single = np.vstack(actuals_single)

# Same idea for the full task: one fold per distinct (solvent A, solvent B)
# pair, visited in order of first appearance.
actuals_full = []
ramps = X_full[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
for _, row in ramps.iterrows():
    mask = (X_full["SOLVENT A NAME"] == row["SOLVENT A NAME"]) & (X_full["SOLVENT B NAME"] == row["SOLVENT B NAME"])
    actuals_full.append(Y_full[mask].values)
actuals_full = np.vstack(actuals_full)

# Get predictions
# target_1..target_3 hold the model output columns in the same order as the
# target columns returned by load_data().
preds_single = submission_single_solvent[['target_1', 'target_2', 'target_3']].values
preds_full = submission_full_data[['target_1', 'target_2', 'target_3']].values

# Calculate MSE
# Each task MSE averages over all rows and all three targets; the overall
# score weights the two task MSEs by their row counts.
mse_single = np.mean((actuals_single - preds_single) ** 2)
mse_full = np.mean((actuals_full - preds_full) ** 2)
n_single = len(actuals_single)
n_full = len(actuals_full)
overall_mse = (mse_single * n_single + mse_full * n_full) / (n_single + n_full)

# Per-target MSE breakdown
# Computed on the single-solvent task only; the full-data task is not
# broken down by target here.
mse_p2 = np.mean((actuals_single[:, 0] - preds_single[:, 0]) ** 2)
mse_p3 = np.mean((actuals_single[:, 1] - preds_single[:, 1]) ** 2)
mse_sm = np.mean((actuals_single[:, 2] - preds_single[:, 2]) ** 2)

print(f'\n=== CV SCORE VERIFICATION ===')
print(f'Single Solvent MSE: {mse_single:.6f} (n={n_single})')
print(f'Full Data MSE: {mse_full:.6f} (n={n_full})')
print(f'Overall MSE: {overall_mse:.6f}')
print(f'\n=== PER-TARGET MSE (Single Solvent) ===')
print(f'Product 2 MSE: {mse_p2:.6f}')
print(f'Product 3 MSE: {mse_p3:.6f}')
print(f'SM MSE: {mse_sm:.6f}')
print(f'\nexp_024 baseline: CV 0.008689, LB 0.0893')
print(f'Submission shape: {submission.shape}')

# Compare against the hard-coded exp_024 CV score (0.008689) and report the
# relative change in percent.
if overall_mse < 0.008689:
    improvement = (0.008689 - overall_mse) / 0.008689 * 100
    print(f'\n✓ IMPROVEMENT: {improvement:.2f}% better than exp_024!')
else:
    degradation = (overall_mse - 0.008689) / 0.008689 * 100
    print(f'\n✗ WORSE: {degradation:.2f}% worse than exp_024')


=== CV SCORE VERIFICATION ===
Single Solvent MSE: 0.009249 (n=656)
Full Data MSE: 0.008971 (n=1227)
Overall MSE: 0.009068

=== PER-TARGET MSE (Single Solvent) ===
Product 2 MSE: 0.005917
Product 3 MSE: 0.007797
SM MSE: 0.014034

exp_024 baseline: CV 0.008689, LB 0.0893
Submission shape: (1883, 7)

✗ WORSE: 4.36% worse than exp_024
