# Experiment 002: Template-Compliant Ensemble with Enhanced Features

This experiment:
1. **FIXES TEMPLATE COMPLIANCE** - Last 3 cells match template exactly
2. Uses Arrhenius kinetics + polynomial features
3. Ensemble of MLP + XGBoost + LightGBM + RF with TTA
4. Seed ensemble of 5 MLP models (each trained on the same data with a different random seed; no bootstrap resampling)

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from abc import ABC
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set paths for local execution
# DATA_PATH is the directory that load_data() and load_features() read CSVs from.
DATA_PATH = '/home/data'
# Make float64 the default so newly created tensors and module parameters match
# the explicit dtype=torch.double tensors built throughout this notebook.
torch.set_default_dtype(torch.double)

# Log the runtime environment for reproducibility.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

PyTorch version: 2.2.0+cu118
CUDA available: True


In [2]:
# --- UTILITY FUNCTIONS (matching template) ---
# Column-name constants for the two yield tables.
# NOTE(review): the three INPUT_LABELS_* lists are not referenced by any other
# visible cell (load_data spells the column names out again); only
# TARGET_LABELS is used.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
INPUT_LABELS_SINGLE_FEATURES = ["SOLVENT NAME"]
INPUT_LABELS_FULL_FEATURES = ["SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
# Target columns, in the order they are emitted as target_1..target_3.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one of the catechol yield tables and split it into inputs/targets.

    Args:
        name: ``"full"`` for the solvent-mixture table, ``"single_solvent"``
            for the single-solvent table.

    Returns:
        A pair ``(X, Y)`` of DataFrames: ``X`` holds the input columns of the
        selected table, ``Y`` holds the ``TARGET_LABELS`` columns.

    Raises:
        AssertionError: if ``name`` is not one of the two supported values.
    """
    assert name in ["full", "single_solvent"]

    # Both tables share the two numeric process conditions.
    shared_cols = ["Residence Time", "Temperature"]
    if name == "single_solvent":
        csv_name = 'catechol_single_solvent_yields.csv'
        input_cols = shared_cols + ["SOLVENT NAME"]
    else:
        csv_name = 'catechol_full_data_yields.csv'
        input_cols = shared_cols + ["SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]

    df = pd.read_csv(f'{DATA_PATH}/{csv_name}')
    return df[input_cols], df[TARGET_LABELS]

def load_features(name="spange_descriptors"):
    """Load a solvent descriptor lookup table.

    Reads ``<DATA_PATH>/<name>_lookup.csv`` and uses its first column as the
    DataFrame index.
    """
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out folds.

    Solvents are visited in sorted order of the ``"SOLVENT NAME"`` column. Each
    item is ``((X_train, Y_train), (X_test, Y_test))`` where the test part
    holds every row of the held-out solvent and the train part holds the rest.
    """
    solvent_names = X["SOLVENT NAME"]
    for held_out in sorted(solvent_names.unique()):
        is_test = solvent_names == held_out
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield leave-one-ramp-out folds for the mixture data.

    A "ramp" is one distinct ``(SOLVENT A NAME, SOLVENT B NAME)`` pair; pairs
    are visited in order of first appearance in ``X``. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` with the held-out pair's rows
    in the test part.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    distinct_pairs = X[pair_columns].drop_duplicates()
    for solvent_a, solvent_b in distinct_pairs.itertuples(index=False, name=None):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        outside_ramp = ~in_ramp
        yield (X[outside_ramp], Y[outside_ramp]), (X[in_ramp], Y[in_ramp])

# Load feature lookups
# Both tables are read once at module level; EnhancedFeaturizer selects rows
# from them with .loc[<solvent name column>], so they are indexed by the first
# CSV column (see load_features).
SPANGE_DF = load_features('spange_descriptors')
ACS_PCA_DF = load_features('acs_pca_descriptors')
# Print (rows, columns) of each lookup as a quick sanity check.
print(f"Spange: {SPANGE_DF.shape}, ACS_PCA: {ACS_PCA_DF.shape}")

Spange: (26, 13), ACS_PCA: (24, 5)


In [3]:
# --- BASE CLASSES (matching template) ---
class SmilesFeaturizer(ABC):
    """Featurizer interface: subclasses must override both methods.

    The base ``__init__`` raises, so subclasses must define their own
    constructor and must not delegate to this one.
    """

    def __init__(self):
        raise NotImplementedError()

    def featurize(self, X):
        """Turn the input table ``X`` into model features (subclass hook)."""
        raise NotImplementedError()

class BaseModel(ABC):
    """Model interface used by the cross-validation cells.

    Subclasses override ``train_model`` and ``predict``; the base versions
    raise ``NotImplementedError``.
    """

    def __init__(self):
        """No state to initialise in the base class."""

    def train_model(self, X_train, y_train):
        """Fit the model on the given training inputs and targets."""
        raise NotImplementedError()

    def predict(self):
        """Return predictions (subclass hook)."""
        raise NotImplementedError()

In [4]:
# --- ENHANCED FEATURIZER WITH ARRHENIUS + POLYNOMIAL FEATURES ---
class EnhancedFeaturizer(SmilesFeaturizer):
    """Build model inputs from process conditions and solvent descriptors.

    Each row becomes: residence time, temperature, five engineered terms
    (inverse temperature, log time, their product, a scaled time*temperature
    product, a scaled squared temperature), then, for mixtures only, the
    ``SolventB%`` column, and finally the solvent descriptor columns.

    Args:
        mixed: ``True`` for the two-solvent (mixture) table, ``False`` for the
            single-solvent table.
        feature_set: ``'spange'`` selects ``SPANGE_DF``; any other value
            selects ``ACS_PCA_DF``.
    """

    def __init__(self, mixed=False, feature_set='spange'):
        self.mixed = mixed
        self.feature_set = feature_set
        self.featurizer = SPANGE_DF if feature_set == 'spange' else ACS_PCA_DF

        # Output width: 2 raw numerics + 5 engineered terms + one column per
        # descriptor, plus the SolventB% column when featurizing mixtures.
        descriptor_count = self.featurizer.shape[1]
        self.feats_dim = 2 + 5 + descriptor_count
        if mixed:
            self.feats_dim += 1

    def featurize(self, X, flip=False):
        """Return the feature matrix for ``X`` as a ``torch.double`` tensor.

        Args:
            X: DataFrame with ``'Residence Time'`` and ``'Temperature'`` plus
                either ``'SOLVENT NAME'`` (single) or ``'SOLVENT A NAME'``,
                ``'SOLVENT B NAME'`` and ``'SolventB%'`` (mixed).
            flip: Mixed mode only. When true the descriptors are blended as
                ``B * (1 - pct) + A * pct`` instead of
                ``A * (1 - pct) + B * pct``; the ``SolventB%`` column itself is
                left unchanged.

        NOTE(review): relabelling a mixture as (B, A, 1 - pct) would give the
        same blend as ``flip=False``, so ``flip=True`` encodes a different
        blend rather than an equivalent relabelling — confirm this is the
        intended augmentation. The blend also presumes ``SolventB%`` is a
        fraction in [0, 1]; verify against the data.
        """
        residence = X['Residence Time'].values.astype(np.float64).reshape(-1, 1)
        temp_c = X['Temperature'].values.astype(np.float64).reshape(-1, 1)

        # Arrhenius-style terms; the 273.15 offset treats the input as Celsius.
        temp_kelvin = temp_c + 273.15
        inv_temp = 1000.0 / temp_kelvin
        log_time = np.log(residence + 1e-6)  # epsilon guards against log(0)

        columns = [
            residence,
            temp_c,
            inv_temp,
            log_time,
            inv_temp * log_time,        # Arrhenius interaction
            residence * temp_c / 1000.0,  # scaled time*temperature product
            (temp_c / 100.0) ** 2,      # scaled squared temperature
        ]

        if not self.mixed:
            columns.append(self.featurizer.loc[X["SOLVENT NAME"]].values)
            return torch.tensor(np.hstack(columns), dtype=torch.double)

        desc_a = self.featurizer.loc[X["SOLVENT A NAME"]].values
        desc_b = self.featurizer.loc[X["SOLVENT B NAME"]].values
        pct = X["SolventB%"].values.reshape(-1, 1)

        # `base` is weighted by (1 - pct), `other` by pct; flip swaps the roles.
        base, other = (desc_b, desc_a) if flip else (desc_a, desc_b)
        blended = base * (1 - pct) + other * pct

        columns.extend([pct, blended])
        return torch.tensor(np.hstack(columns), dtype=torch.double)

# Test
# Smoke check: featurize the first three single-solvent rows with the default
# (Spange) descriptors and print the resulting tensor shape.
X_test, Y_test = load_data("single_solvent")
feat = EnhancedFeaturizer(mixed=False)
print(f"Single solvent features: {feat.featurize(X_test.head(3)).shape}")

Single solvent features: torch.Size([3, 20])


In [5]:
# --- MLP ARCHITECTURE ---
class MLPInternal(nn.Module):
    """Feed-forward regressor with three sigmoid-bounded outputs.

    Layout: input BatchNorm, then three hidden stages of widths 128, 128 and
    64 (each Linear -> BatchNorm -> ReLU -> Dropout(0.2)), then a 3-unit
    Linear head followed by a Sigmoid.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        stack = [nn.BatchNorm1d(input_dim)]
        fan_in = input_dim
        for width in (128, 128, 64):
            stack.extend([
                nn.Linear(fan_in, width),
                nn.BatchNorm1d(width),
                nn.ReLU(),
                nn.Dropout(0.2),
            ])
            fan_in = width
        stack.extend([nn.Linear(64, 3), nn.Sigmoid()])
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run the network on a 2-D batch ``x`` and return its output."""
        return self.net(x)

In [6]:
# --- ENSEMBLE MODEL CLASS ---
class EnsembleModel(nn.Module, BaseModel):
    """Template-compliant weighted ensemble of MLPs, XGBoost, LightGBM and RF.

    For mixture data (``data='full'``) the training set is augmented with the
    featurizer's ``flip=True`` view, and predictions are averaged over the
    normal and flipped views (test-time augmentation, TTA).

    Args:
        data: ``'full'`` enables mixture featurization and TTA; any other
            value is treated as single-solvent data.

    Attributes:
        weights: blend weights in the order [MLP, XGBoost, LightGBM, RF].
        n_mlp: number of MLPs trained; they see identical data and differ
            only in random seed (a seed ensemble, not bootstrap bagging).
    """

    def __init__(self, data='single'):
        super().__init__()
        self.data_type = data
        self.featurizer = EnhancedFeaturizer(mixed=(data == 'full'), feature_set='spange')

        # Seed ensemble: 5 MLP models
        self.n_mlp = 5
        self.mlp_models = nn.ModuleList()

        # Tree models, created in train_model
        self.xgb_model = None
        self.lgb_model = None
        self.rf_model = None

        # Feature scaler, fitted in train_model
        self.scaler = StandardScaler()

        # Ensemble weights: [MLP, XGBoost, LightGBM, RandomForest]
        self.weights = [0.35, 0.25, 0.25, 0.15]

    def train_model(self, X_train, y_train):
        """Fit the scaler, the MLP seed ensemble and the three tree models.

        Calling this again retrains from scratch: previously trained MLPs are
        discarded instead of accumulating alongside the new ones.

        Args:
            X_train: input DataFrame accepted by the featurizer.
            y_train: DataFrame of targets (one column per output).
        """
        X_np, y_np = self._training_arrays(X_train, y_train)
        X_scaled = self.scaler.fit_transform(X_np)
        self._fit_mlps(X_scaled, y_np)
        self._fit_tree_models(X_scaled, y_np)

    def _training_arrays(self, X_train, y_train):
        """Featurize the training data, adding the flipped view for mixtures."""
        features = self.featurizer.featurize(X_train, flip=False)
        targets = torch.tensor(y_train.values, dtype=torch.double)

        if self.data_type == 'full':
            flipped = self.featurizer.featurize(X_train, flip=True)
            features = torch.cat([features, flipped], dim=0)
            targets = torch.cat([targets, targets], dim=0)

        return features.numpy(), targets.numpy()

    def _fit_mlps(self, X_scaled, y_np):
        """Train ``n_mlp`` MLPs that differ only in their random seed."""
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        input_dim = X_scaled.shape[1]
        batch_size = 32

        # Start from an empty list so a repeated train_model() call does not
        # leave stale networks in the ensemble.
        self.mlp_models = nn.ModuleList()

        # The data tensors are identical for every model, so build them once.
        X_t = torch.tensor(X_scaled, dtype=torch.double).to(device)
        y_t = torch.tensor(y_np, dtype=torch.double).to(device)
        dataset = TensorDataset(X_t, y_t)

        # BatchNorm1d raises in training mode on a batch of one sample, which
        # happens when the last mini-batch is a singleton; drop that batch.
        n_samples = len(dataset)
        drop_last = n_samples > batch_size and n_samples % batch_size == 1

        for i in range(self.n_mlp):
            torch.manual_seed(42 + i)
            np.random.seed(42 + i)

            # Explicit dtype keeps the network in float64 even if the global
            # default dtype was not switched to double.
            model = MLPInternal(input_dim).to(device=device, dtype=torch.double)
            model.train()
            self.mlp_models.append(model)

            loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                drop_last=drop_last)
            optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-5)
            criterion = nn.HuberLoss()
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=20)

            for epoch in range(200):
                epoch_loss = 0.0
                n_seen = 0
                for inputs, targets in loader:
                    optimizer.zero_grad()
                    loss = criterion(model(inputs), targets)
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    epoch_loss += loss.item() * inputs.size(0)
                    n_seen += inputs.size(0)
                # Mean per-sample loss over the samples actually visited.
                scheduler.step(epoch_loss / max(n_seen, 1))

    def _fit_tree_models(self, X_scaled, y_np):
        """Fit one XGBoost, LightGBM and RandomForest regressor per target."""
        self.xgb_model = MultiOutputRegressor(
            xgb.XGBRegressor(n_estimators=500, learning_rate=0.02, max_depth=6,
                             subsample=0.8, colsample_bytree=0.8, random_state=42, verbosity=0)
        )
        self.xgb_model.fit(X_scaled, y_np)

        self.lgb_model = MultiOutputRegressor(
            lgb.LGBMRegressor(n_estimators=500, learning_rate=0.02, num_leaves=31,
                              max_depth=6, subsample=0.8, colsample_bytree=0.8,
                              random_state=42, verbosity=-1)
        )
        self.lgb_model.fit(X_scaled, y_np)

        self.rf_model = MultiOutputRegressor(
            RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1)
        )
        self.rf_model.fit(X_scaled, y_np)

    def _mlp_predict(self, views):
        """Average MLP outputs over the feature views, then over the models."""
        device = next(self.mlp_models[0].parameters()).device
        view_tensors = [torch.tensor(v, dtype=torch.double).to(device) for v in views]

        total = None
        with torch.no_grad():
            for model in self.mlp_models:
                model.eval()
                view_mean = torch.stack([model(t) for t in view_tensors]).mean(dim=0)
                total = view_mean if total is None else total + view_mean
        # Divide by the number of models actually held, not the configured count.
        return (total / len(self.mlp_models)).cpu().numpy()

    @staticmethod
    def _view_average(model, views):
        """Average a fitted regressor's predictions over the feature views."""
        per_view = [model.predict(v) for v in views]
        return sum(per_view) / len(per_view)

    def predict(self, X):
        """Return clipped ensemble predictions as a ``torch.double`` tensor.

        Args:
            X: input DataFrame accepted by the featurizer.

        Returns:
            Tensor with one row per row of ``X`` and one column per target,
            clipped to [0, 1].

        Raises:
            RuntimeError: if called before ``train_model``.
        """
        if len(self.mlp_models) == 0:
            raise RuntimeError("train_model() must be called before predict()")

        # Mixtures are scored on the normal and flipped views (TTA); single
        # solvent data has only the normal view.
        flips = (False, True) if self.data_type == 'full' else (False,)
        views = [
            self.scaler.transform(self.featurizer.featurize(X, flip=flip).numpy())
            for flip in flips
        ]

        mlp_preds = self._mlp_predict(views)
        xgb_preds = self._view_average(self.xgb_model, views)
        lgb_preds = self._view_average(self.lgb_model, views)
        rf_preds = self._view_average(self.rf_model, views)

        # Weighted ensemble
        final = (self.weights[0] * mlp_preds + self.weights[1] * xgb_preds +
                 self.weights[2] * lgb_preds + self.weights[3] * rf_preds)
        final = np.clip(final, 0, 1)

        return torch.tensor(final, dtype=torch.double)

In [7]:
# Quick test of the model
# Fit on the first 500 single-solvent rows and score on the remaining rows.
print("Testing model...")
X_test, Y_test = load_data("single_solvent")
X_train, X_val = X_test.iloc[:500], X_test.iloc[500:]
Y_train, Y_val = Y_test.iloc[:500], Y_test.iloc[500:]

test_model = EnsembleModel(data='single')
test_model.train_model(X_train, Y_train)
preds = test_model.predict(X_val)
print(f"Predictions shape: {preds.shape}")
print(f"Sample predictions: {preds[:3]}")
# Mean absolute error over every (row, target) entry of the hold-out slice.
print(f"Test MAE: {np.abs(preds.numpy() - Y_val.values).mean():.6f}")

Testing model...


Predictions shape: torch.Size([156, 3])
Sample predictions: tensor([[0.2813, 0.1892, 0.2006],
        [0.2813, 0.1892, 0.2006],
        [0.2813, 0.1892, 0.2006]])
Test MAE: 0.076420


## Template-Compliant Cross-Validation

The following 3 cells are EXACTLY as in the template, with only the model definition line changed.

In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data='single') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:35, 35.79s/it]

2it [01:12, 36.12s/it]

3it [01:46, 35.48s/it]

4it [02:21, 34.99s/it]

5it [02:56, 35.26s/it]

6it [03:32, 35.48s/it]

7it [04:08, 35.45s/it]

8it [04:43, 35.56s/it]

9it [05:19, 35.67s/it]

10it [05:55, 35.77s/it]

11it [06:32, 35.93s/it]

12it [07:07, 35.88s/it]

13it [07:43, 35.83s/it]

14it [08:18, 35.70s/it]

15it [08:54, 35.67s/it]

16it [09:30, 35.73s/it]

17it [10:08, 36.40s/it]

18it [10:44, 36.29s/it]

19it [11:20, 36.13s/it]

20it [11:56, 36.11s/it]

21it [12:32, 36.16s/it]

22it [13:09, 36.36s/it]

23it [13:47, 36.94s/it]

24it [14:23, 36.75s/it]

24it [14:23, 36.00s/it]




In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = EnsembleModel(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [02:03, 123.76s/it]

2it [04:04, 121.98s/it]

3it [06:10, 123.71s/it]

4it [08:11, 122.67s/it]

5it [10:14, 122.80s/it]

6it [12:17, 122.79s/it]

7it [14:20, 123.03s/it]

8it [16:26, 124.02s/it]

9it [18:29, 123.51s/it]

10it [20:43, 126.81s/it]

11it [22:55, 128.51s/it]

12it [25:07, 129.58s/it]

13it [27:20, 130.42s/it]

13it [27:20, 126.16s/it]




In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
# Calculate CV scores (for logging only - not part of submission)
# NOTE(review): the cell before this one is marked as having to be the FINAL
# cell for a valid submission; confirm this logging cell is removed (or not
# counted) when the notebook is submitted.
import os

# Create each output directory before writing, so a missing experiment folder
# does not abort the run after the cross-validation has finished.
for out_dir in ('/home/submission', '/home/code/experiments/002_template_compliant'):
    os.makedirs(out_dir, exist_ok=True)
    submission.to_csv(os.path.join(out_dir, 'submission.csv'), index=True)

# Calculate MAE from predictions
X_single, Y_single = load_data("single_solvent")
X_full, Y_full = load_data("full")

# Predictions were appended fold by fold, so the ground truth must be stacked
# in the same fold order. Comparing against the raw CSV row order is only
# correct if the file happens to be grouped exactly like the fold generators.
single_truth = np.vstack([
    test_Y.values
    for _train, (_test_X, test_Y) in generate_leave_one_out_splits(X_single, Y_single)
])
full_truth = np.vstack([
    test_Y.values
    for _train, (_test_X, test_Y) in generate_leave_one_ramp_out_splits(X_full, Y_full)
])

# Single solvent MAE
single_preds = submission_single_solvent[['target_1', 'target_2', 'target_3']].values
single_mae = np.mean(np.abs(single_preds - single_truth))

# Full data MAE
full_preds = submission_full_data[['target_1', 'target_2', 'target_3']].values
full_mae = np.mean(np.abs(full_preds - full_truth))

# Combined: row-count-weighted average of the two task MAEs
total = len(single_preds) + len(full_preds)
combined_mae = (single_mae * len(single_preds) + full_mae * len(full_preds)) / total

print("\n=== FINAL CV RESULTS ===")
print(f"Single Solvent MAE: {single_mae:.6f}")
print(f"Full Data MAE: {full_mae:.6f}")
print(f"Combined MAE: {combined_mae:.6f}")
print("\nSubmission saved to /home/submission/submission.csv")