# Experiment 001: Baseline with Arrhenius Kinetics + Ensemble + TTA

This baseline combines:
1. Arrhenius kinetics features (1/T, ln(t), interaction)
2. Chemical symmetry TTA for mixed solvents
3. Ensemble of MLP + XGBoost + LightGBM + RF
4. Bagging with multiple seeds (5 models)
5. Spange descriptors for solvent features

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from abc import ABC
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set paths for local execution
DATA_PATH = '/home/data'
torch.set_default_dtype(torch.double)

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.2.0+cu118
CUDA available: True
GPU: NVIDIA H100 80GB HBM3


In [2]:
# --- UTILITY FUNCTIONS ---
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    assert name in ["full", "single_solvent"]
    if name == "full":
        df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        X = df[["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]]
    else:
        df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        X = df[["Residence Time", "Temperature", "SOLVENT NAME"]]
    Y = df[TARGET_LABELS]
    return X, Y

def load_features(name="spange_descriptors"):
    return pd.read_csv(f'{DATA_PATH}/{name}_lookup.csv', index_col=0)

def generate_leave_one_out_splits(X, Y):
    for solvent in sorted(X["SOLVENT NAME"].unique()):
        mask = X["SOLVENT NAME"] != solvent
        yield (X[mask], Y[mask]), (X[~mask], Y[~mask])

def generate_leave_one_ramp_out_splits(X, Y):
    ramps = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    for _, row in ramps.iterrows():
        mask = ~((X["SOLVENT A NAME"] == row["SOLVENT A NAME"]) & (X["SOLVENT B NAME"] == row["SOLVENT B NAME"]))
        yield (X[mask], Y[mask]), (X[~mask], Y[~mask])

# Load spange descriptors
SPANGE_DF = load_features('spange_descriptors')
print(f"Spange descriptors shape: {SPANGE_DF.shape}")
print(f"Solvents: {list(SPANGE_DF.index)}")

Spange descriptors shape: (26, 13)
Solvents: ['Cyclohexane', 'Ethyl Acetate', 'Acetic Acid', '2-Methyltetrahydrofuran [2-MeTHF]', '1,1,1,3,3,3-Hexafluoropropan-2-ol', 'IPA [Propan-2-ol]', 'Ethanol', 'Methanol', 'Ethylene Glycol [1,2-Ethanediol]', 'Acetonitrile', 'Water', 'Diethyl Ether [Ether]', 'MTBE [tert-Butylmethylether]', 'Dimethyl Carbonate', 'tert-Butanol [2-Methylpropan-2-ol]', 'DMA [N,N-Dimethylacetamide]', '2,2,2-Trifluoroethanol', 'Dihydrolevoglucosenone (Cyrene)', 'Decanol', 'Butanone [MEK]', 'Ethyl Lactate', 'Methyl Propionate', 'THF [Tetrahydrofuran]', 'Water.Acetonitrile', 'Acetonitrile.Acetic Acid', 'Water.2,2,2-Trifluoroethanol']


In [3]:
# --- KINETIC FEATURIZER WITH ARRHENIUS FEATURES ---
class SmilesFeaturizer(ABC):
    def featurize(self, X, flip=False): raise NotImplementedError

class KineticMixingFeaturizer(SmilesFeaturizer):
    """Featurizer with Arrhenius kinetics features and chemical symmetry support."""
    def __init__(self, mixed=False):
        self.mixed = mixed
        self.featurizer = SPANGE_DF
        # Base features: 2 numeric + 3 kinetic + 13 spange = 18
        # For mixed: add SolventB% = 19
        self.feats_dim = self.featurizer.shape[1] + 2 + 3 + (1 if mixed else 0)

    def featurize(self, X, flip=False):
        X_vals = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)
        
        # --- ARRHENIUS KINETIC FEATURES ---
        temp_c = X_vals[:, 1:2]  # Temperature in Celsius
        time_m = X_vals[:, 0:1]  # Residence time in minutes
        
        temp_k = temp_c + 273.15  # Convert to Kelvin
        inv_temp = 1000.0 / temp_k  # Inverse temperature (Arrhenius)
        log_time = np.log(time_m + 1e-6)  # Log of time
        interaction = inv_temp * log_time  # Kinetic interaction term
        
        kinetic_features = np.hstack([X_vals, inv_temp, log_time, interaction])
        X_kinetic = torch.tensor(kinetic_features)
        
        # --- CHEMICAL FEATURES ---
        if self.mixed:
            A = torch.tensor(self.featurizer.loc[X["SOLVENT A NAME"]].values)
            B = torch.tensor(self.featurizer.loc[X["SOLVENT B NAME"]].values)
            pct = torch.tensor(X["SolventB%"].values.reshape(-1, 1))
            
            if flip:
                # SYMMETRY FLIP: Swap A and B
                X_chem = B * (1 - pct) + A * pct
            else:
                X_chem = A * (1 - pct) + B * pct
            
            # Add SolventB% as feature
            X_out = torch.cat([X_kinetic, pct, X_chem], dim=1)
        else:
            X_chem = torch.tensor(self.featurizer.loc[X["SOLVENT NAME"]].values)
            X_out = torch.cat([X_kinetic, X_chem], dim=1)
            
        return X_out

# Test featurizer
X_test, Y_test = load_data("single_solvent")
feat = KineticMixingFeaturizer(mixed=False)
X_feat = feat.featurize(X_test.head(5))
print(f"Single solvent feature shape: {X_feat.shape}")
print(f"Feature dim: {feat.feats_dim}")

Single solvent feature shape: torch.Size([5, 18])
Feature dim: 18


In [4]:
# --- MLP ARCHITECTURE ---
class MLPInternal(nn.Module):
    """MLP with BatchNorm, ReLU, Dropout, and Sigmoid output."""
    def __init__(self, input_dim):
        super(MLPInternal, self).__init__()
        self.net = nn.Sequential(
            nn.BatchNorm1d(input_dim),
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(128, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(64, 3),
            nn.Sigmoid()  # Bounded output [0, 1]
        )

    def forward(self, x):
        return self.net(x)

In [5]:
# --- ENSEMBLE MODEL WITH BAGGING AND TTA ---
class EnsembleModel(nn.Module):
    """Ensemble of MLP + XGBoost + LightGBM + RandomForest with TTA for mixed solvents."""
    def __init__(self, data='single'):
        super().__init__()
        self.data_type = data
        self.featurizer = KineticMixingFeaturizer(mixed=(data=='full'))
        
        # Bagging: 5 MLP models
        self.n_mlp_models = 5
        self.mlp_models = nn.ModuleList()
        
        # Gradient boosting models
        self.xgb_model = None
        self.lgb_model = None
        self.rf_model = None
        
        # Scaler for GBDT models
        self.scaler = StandardScaler()
        
        # Ensemble weights: [MLP, XGB, LGB, RF]
        self.weights = [0.35, 0.25, 0.25, 0.15]

    def train_model(self, X_train, y_train):
        # 1. Prepare features
        X_std = self.featurizer.featurize(X_train, flip=False)
        y_vals = torch.tensor(y_train.values)
        
        if self.data_type == 'full':
            # Data augmentation with flipped features
            X_flip = self.featurizer.featurize(X_train, flip=True)
            X_all = torch.cat([X_std, X_flip], dim=0)
            y_all = torch.cat([y_vals, y_vals], dim=0)
        else:
            X_all = X_std
            y_all = y_vals
        
        X_np = X_all.numpy()
        y_np = y_all.numpy()
        
        # Fit scaler
        X_scaled = self.scaler.fit_transform(X_np)
        
        input_dim = X_scaled.shape[1]
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # --- Train MLP models (bagging) ---
        for i in range(self.n_mlp_models):
            torch.manual_seed(42 + i)
            np.random.seed(42 + i)
            
            model = MLPInternal(input_dim).to(device)
            model.train()
            self.mlp_models.append(model)
            
            X_tensor = torch.tensor(X_scaled, dtype=torch.double).to(device)
            y_tensor = torch.tensor(y_np, dtype=torch.double).to(device)
            
            dataset = TensorDataset(X_tensor, y_tensor)
            loader = DataLoader(dataset, batch_size=32, shuffle=True)
            
            optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-5)
            criterion = nn.HuberLoss()
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='min', factor=0.5, patience=20
            )
            
            for epoch in range(200):
                epoch_loss = 0.0
                for inputs, targets in loader:
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    epoch_loss += loss.item() * inputs.size(0)
                
                scheduler.step(epoch_loss / len(dataset))
        
        # --- Train XGBoost ---
        self.xgb_model = MultiOutputRegressor(
            xgb.XGBRegressor(
                n_estimators=500,
                learning_rate=0.02,
                max_depth=6,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                verbosity=0
            )
        )
        self.xgb_model.fit(X_scaled, y_np)
        
        # --- Train LightGBM ---
        self.lgb_model = MultiOutputRegressor(
            lgb.LGBMRegressor(
                n_estimators=500,
                learning_rate=0.02,
                num_leaves=31,
                max_depth=6,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                verbosity=-1
            )
        )
        self.lgb_model.fit(X_scaled, y_np)
        
        # --- Train RandomForest ---
        self.rf_model = MultiOutputRegressor(
            RandomForestRegressor(
                n_estimators=200,
                max_depth=10,
                random_state=42,
                n_jobs=-1
            )
        )
        self.rf_model.fit(X_scaled, y_np)

    def predict(self, X):
        device = next(self.mlp_models[0].parameters()).device
        
        if self.data_type == 'full':
            # --- TEST TIME AUGMENTATION (TTA) ---
            X_std = self.featurizer.featurize(X, flip=False)
            X_flip = self.featurizer.featurize(X, flip=True)
            
            X_std_scaled = self.scaler.transform(X_std.numpy())
            X_flip_scaled = self.scaler.transform(X_flip.numpy())
            
            # MLP predictions with TTA
            mlp_pred_sum = torch.zeros((len(X), 3)).to(device)
            with torch.no_grad():
                for model in self.mlp_models:
                    model.eval()
                    X_std_t = torch.tensor(X_std_scaled, dtype=torch.double).to(device)
                    X_flip_t = torch.tensor(X_flip_scaled, dtype=torch.double).to(device)
                    p1 = model(X_std_t)
                    p2 = model(X_flip_t)
                    mlp_pred_sum += (p1 + p2) * 0.5
            mlp_preds = (mlp_pred_sum / self.n_mlp_models).cpu().numpy()
            
            # GBDT predictions with TTA
            xgb_preds = (self.xgb_model.predict(X_std_scaled) + self.xgb_model.predict(X_flip_scaled)) / 2
            lgb_preds = (self.lgb_model.predict(X_std_scaled) + self.lgb_model.predict(X_flip_scaled)) / 2
            rf_preds = (self.rf_model.predict(X_std_scaled) + self.rf_model.predict(X_flip_scaled)) / 2
        else:
            X_std = self.featurizer.featurize(X)
            X_std_scaled = self.scaler.transform(X_std.numpy())
            
            # MLP predictions
            mlp_pred_sum = torch.zeros((len(X), 3)).to(device)
            with torch.no_grad():
                for model in self.mlp_models:
                    model.eval()
                    X_t = torch.tensor(X_std_scaled, dtype=torch.double).to(device)
                    mlp_pred_sum += model(X_t)
            mlp_preds = (mlp_pred_sum / self.n_mlp_models).cpu().numpy()
            
            # GBDT predictions
            xgb_preds = self.xgb_model.predict(X_std_scaled)
            lgb_preds = self.lgb_model.predict(X_std_scaled)
            rf_preds = self.rf_model.predict(X_std_scaled)
        
        # Weighted ensemble
        final_preds = (
            self.weights[0] * mlp_preds +
            self.weights[1] * xgb_preds +
            self.weights[2] * lgb_preds +
            self.weights[3] * rf_preds
        )
        
        # Clip to [0, 1]
        final_preds = np.clip(final_preds, 0, 1)
        
        return torch.tensor(final_preds)

In [6]:
# --- CROSS-VALIDATION FOR SINGLE SOLVENT TASK ---
print("\n=== TASK 0: Single Solvent (Leave-One-Out) ===")

X, Y = load_data("single_solvent")
print(f"Data shape: X={X.shape}, Y={Y.shape}")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []
all_errors = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=24):
    (train_X, train_Y), (test_X, test_Y) = split
    
    model = EnsembleModel(data='single')
    model.train_model(train_X, train_Y)
    
    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()
    
    # Calculate fold error
    fold_error = np.mean(np.abs(predictions_np - test_Y.values))
    all_errors.append(fold_error)
    
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_single_solvent = pd.DataFrame(all_predictions)
print(f"\nSingle Solvent CV MAE: {np.mean(all_errors):.6f} +/- {np.std(all_errors):.6f}")


=== TASK 0: Single Solvent (Leave-One-Out) ===
Data shape: X=(656, 3), Y=(656, 3)


  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [00:36<13:57, 36.40s/it]

  8%|▊         | 2/24 [01:12<13:17, 36.25s/it]

 12%|█▎        | 3/24 [01:47<12:31, 35.79s/it]

 17%|█▋        | 4/24 [02:22<11:43, 35.19s/it]

 21%|██        | 5/24 [02:57<11:13, 35.45s/it]

 25%|██▌       | 6/24 [03:34<10:46, 35.91s/it]

 29%|██▉       | 7/24 [04:11<10:14, 36.17s/it]

 33%|███▎      | 8/24 [04:47<09:37, 36.08s/it]

 38%|███▊      | 9/24 [05:22<08:58, 35.90s/it]

 42%|████▏     | 10/24 [05:58<08:22, 35.93s/it]

 46%|████▌     | 11/24 [06:34<07:44, 35.77s/it]

 50%|█████     | 12/24 [07:10<07:09, 35.77s/it]

 54%|█████▍    | 13/24 [07:46<06:34, 35.85s/it]

 58%|█████▊    | 14/24 [08:22<05:58, 35.88s/it]

 62%|██████▎   | 15/24 [08:59<05:26, 36.32s/it]

 67%|██████▋   | 16/24 [09:36<04:53, 36.69s/it]

 71%|███████   | 17/24 [10:18<04:27, 38.20s/it]

 75%|███████▌  | 18/24 [11:00<03:55, 39.28s/it]

 79%|███████▉  | 19/24 [11:38<03:14, 38.95s/it]

 83%|████████▎ | 20/24 [12:16<02:33, 38.50s/it]

 88%|████████▊ | 21/24 [12:52<01:53, 37.76s/it]

 92%|█████████▏| 22/24 [13:27<01:14, 37.15s/it]

 96%|█████████▌| 23/24 [14:03<00:36, 36.80s/it]

100%|██████████| 24/24 [14:39<00:00, 36.52s/it]

100%|██████████| 24/24 [14:39<00:00, 36.65s/it]


Single Solvent CV MAE: 0.068386 +/- 0.036173





In [7]:
# --- CROSS-VALIDATION FOR FULL DATA TASK ---
print("\n=== TASK 1: Full Data (Leave-One-Ramp-Out) ===")

X, Y = load_data("full")
print(f"Data shape: X={X.shape}, Y={Y.shape}")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []
all_errors_full = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=13):
    (train_X, train_Y), (test_X, test_Y) = split
    
    model = EnsembleModel(data='full')
    model.train_model(train_X, train_Y)
    
    predictions = model.predict(test_X)
    predictions_np = predictions.detach().cpu().numpy()
    
    # Calculate fold error
    fold_error = np.mean(np.abs(predictions_np - test_Y.values))
    all_errors_full.append(fold_error)
    
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

submission_full_data = pd.DataFrame(all_predictions)
print(f"\nFull Data CV MAE: {np.mean(all_errors_full):.6f} +/- {np.std(all_errors_full):.6f}")


=== TASK 1: Full Data (Leave-One-Ramp-Out) ===
Data shape: X=(1227, 5), Y=(1227, 3)


  0%|          | 0/13 [00:00<?, ?it/s]

  8%|▊         | 1/13 [02:03<24:45, 123.83s/it]

 15%|█▌        | 2/13 [04:07<22:37, 123.44s/it]

 23%|██▎       | 3/13 [06:09<20:30, 123.08s/it]

 31%|███       | 4/13 [08:12<18:25, 122.80s/it]

 38%|███▊      | 5/13 [10:18<16:32, 124.02s/it]

 46%|████▌     | 6/13 [12:22<14:29, 124.15s/it]

 54%|█████▍    | 7/13 [14:24<12:19, 123.31s/it]

 62%|██████▏   | 8/13 [16:28<10:18, 123.62s/it]

 69%|██████▉   | 9/13 [18:31<08:13, 123.35s/it]

 77%|███████▋  | 10/13 [20:44<06:18, 126.30s/it]

 85%|████████▍ | 11/13 [22:56<04:16, 128.23s/it]

 92%|█████████▏| 12/13 [25:08<02:09, 129.29s/it]

100%|██████████| 13/13 [27:27<00:00, 132.21s/it]

100%|██████████| 13/13 [27:27<00:00, 126.72s/it]


Full Data CV MAE: 0.088347 +/- 0.030664





In [8]:
# --- SAVE SUBMISSION ---
submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"

# Save to submission folder
import os
os.makedirs('/home/submission', exist_ok=True)
submission.to_csv('/home/submission/submission.csv', index=True)

# Also save locally
submission.to_csv('/home/code/experiments/001_baseline/submission.csv', index=True)

print(f"\n=== FINAL RESULTS ===")
print(f"Single Solvent CV MAE: {np.mean(all_errors):.6f}")
print(f"Full Data CV MAE: {np.mean(all_errors_full):.6f}")

# Combined score (weighted average based on data sizes)
total_single = len(submission_single_solvent)
total_full = len(submission_full_data)
total = total_single + total_full
combined_mae = (np.mean(all_errors) * total_single + np.mean(all_errors_full) * total_full) / total
print(f"Combined CV MAE (weighted): {combined_mae:.6f}")
print(f"\nSubmission saved to /home/submission/submission.csv")
print(f"Submission shape: {submission.shape}")


=== FINAL RESULTS ===
Single Solvent CV MAE: 0.068386
Full Data CV MAE: 0.088347
Combined CV MAE (weighted): 0.081393

Submission saved to /home/submission/submission.csv
Submission shape: (1883, 7)
