# Experiment 001: Physics-Informed MLP Baseline

Implementing:
- Arrhenius kinetics features: 1000/T (T in kelvin), ln(t), and their product (1000/T) * ln(t)
- Spange descriptors for solvent features
- Robust MLP with BatchNorm, Dropout, and a Sigmoid output layer
- HuberLoss for robustness to outliers
- Test-time augmentation (TTA) for mixed-solvent symmetry
- Ensemble of 5 models

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from abc import ABC
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Check GPU: report whether CUDA is usable and, when it is, the name and the
# total memory (in decimal GB) of device 0.
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Use float64 as the default floating-point dtype so that newly created
# modules match the float64 feature arrays built by the featurizer below.
torch.set_default_dtype(torch.double)

GPU available: True
GPU: NVIDIA H100 80GB HBM3
Memory: 85.0 GB


In [2]:
# Data loading utilities - adapted for local paths

# Directory holding the yield CSVs and the descriptor lookup tables.
DATA_PATH = '/home/data'

# Numeric process-condition columns present in both datasets.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]

# Input columns selected from the single-solvent dataset.
INPUT_LABELS_SINGLE_SOLVENT = [*INPUT_LABELS_NUMERIC, "SOLVENT NAME"]

# Input columns selected from the full (two-solvent) dataset.
INPUT_LABELS_FULL_SOLVENT = [
    *INPUT_LABELS_NUMERIC,
    "SOLVENT A NAME",
    "SOLVENT B NAME",
    "SolventB%",
]

# Columns the models are trained to predict.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Load the input and target columns of one yield dataset.

    Args:
        name: ``"full"`` reads ``catechol_full_data_yields.csv`` and selects
            ``INPUT_LABELS_FULL_SOLVENT``; ``"single_solvent"`` reads
            ``catechol_single_solvent_yields.csv`` and selects
            ``INPUT_LABELS_SINGLE_SOLVENT``. Both files live in ``DATA_PATH``.

    Returns:
        A tuple ``(X, Y)`` of DataFrames taken from the same CSV: ``X`` holds
        the input columns and ``Y`` the ``TARGET_LABELS`` columns.

    Raises:
        ValueError: If ``name`` is not one of the two supported names.
    """
    # Raise explicitly instead of using ``assert``: assertions are stripped
    # under ``python -O``, and an unknown name would then silently fall
    # through to the single-solvent branch.
    if name not in ("full", "single_solvent"):
        raise ValueError(
            f"Unknown dataset {name!r}; expected 'full' or 'single_solvent'"
        )

    if name == "full":
        df = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        X = df[INPUT_LABELS_FULL_SOLVENT]
    else:
        df = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        X = df[INPUT_LABELS_SINGLE_SOLVENT]
    Y = df[TARGET_LABELS]
    return X, Y

def load_features(name="spange_descriptors"):
    """Read the ``<name>_lookup.csv`` descriptor table from ``DATA_PATH``.

    The first CSV column is used as the DataFrame index; the remaining
    columns are returned unchanged.
    """
    lookup_csv = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_csv, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out folds for the single-solvent data.

    Solvent names are visited in sorted order. Each fold is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    exactly the rows whose "SOLVENT NAME" equals the held-out solvent.
    """
    solvent_column = X["SOLVENT NAME"]
    for held_out_solvent in sorted(solvent_column.unique()):
        in_test = solvent_column == held_out_solvent
        in_train = ~in_test
        train_part = (X[in_train], Y[in_train])
        test_part = (X[in_test], Y[in_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield leave-one-ramp-out folds for the full data.

    A "ramp" is one distinct ("SOLVENT A NAME", "SOLVENT B NAME") pair. Pairs
    are visited sorted by solvent A, then solvent B. Each fold is
    ``((X_train, Y_train), (X_test, Y_test))`` with the test part holding
    every row of the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    ramps = X[pair_columns].drop_duplicates().sort_values(by=pair_columns)
    for _, held_out_pair in ramps.iterrows():
        # A row stays in training as soon as either solvent name differs
        # from the held-out pair.
        keep_for_training = (X[pair_columns] != held_out_pair).any(axis=1)
        held_out = ~keep_for_training
        yield (
            (X[keep_for_training], Y[keep_for_training]),
            (X[held_out], Y[held_out]),
        )

In [3]:
# Load Spange descriptors into a module-level lookup table (one row per index
# entry of spange_descriptors_lookup.csv); KineticFeaturizer reads this table.
SPANGE_DF = load_features('spange_descriptors')
# Sanity-check the table size and list the descriptor column names.
print(f"Spange descriptors shape: {SPANGE_DF.shape}")
print(f"Features: {list(SPANGE_DF.columns)}")

Spange descriptors shape: (26, 13)
Features: ['dielectric constant', 'ET(30)', 'alpha', 'beta', 'pi*', 'SA', 'SB', 'SP', 'SdP', 'N', 'n', 'f(n)', 'delta']


In [4]:
# Featurizer with Arrhenius kinetics
class SmilesFeaturizer(ABC):
    """Base interface for featurizers that turn an input table into features."""

    def featurize(self, X, flip=False):
        """Convert ``X`` into model features; subclasses must override this.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

class KineticFeaturizer(SmilesFeaturizer):
    """Featurizer with Arrhenius kinetics features.

    Every input row is mapped to the concatenation of:

    * the two raw numeric inputs (residence time, temperature),
    * three kinetic terms: ``1000 / T_K``, ``ln(time)`` and their product,
    * a solvent descriptor vector looked up in ``SPANGE_DF`` (for mixtures,
      the "SolventB%"-weighted linear blend of the two solvents' vectors).
    """

    def __init__(self, mixed=False):
        """Create a featurizer.

        Args:
            mixed: ``True`` for two-solvent inputs ("SOLVENT A NAME",
                "SOLVENT B NAME", "SolventB%"); ``False`` for single-solvent
                inputs ("SOLVENT NAME").
        """
        self.mixed = mixed
        self.featurizer = SPANGE_DF
        # Width of one feature row: descriptor columns + 2 numeric + 3 kinetic.
        self.feats_dim = self.featurizer.shape[1] + 2 + 3

    def featurize(self, X, flip=False):
        """Build the feature tensor for the rows of ``X``.

        Args:
            X: DataFrame holding the ``INPUT_LABELS_NUMERIC`` columns plus the
                solvent columns that match ``self.mixed``.
            flip: Request the "solvents swapped" view of a mixture (A <-> B
                together with pct -> 1 - pct). That swap describes the same
                mixture and the linear blend below is invariant under it, so
                the flipped view equals the standard view; the argument is
                kept for interface compatibility. Ignored when
                ``self.mixed`` is ``False``.

        Returns:
            A float64 tensor with one row per row of ``X``: numeric and
            kinetic features first, solvent descriptors after.
        """
        X_vals = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)

        # Column order follows INPUT_LABELS_NUMERIC: time first, then
        # temperature.
        time_m = X_vals[:, 0:1]  # Residence time (minutes per the original notes)
        temp_c = X_vals[:, 1:2]  # Temperature, treated as degrees Celsius

        temp_k = temp_c + 273.15  # Convert to Kelvin
        inv_temp = 1000.0 / temp_k  # Scaled inverse temperature (Arrhenius)
        log_time = np.log(time_m + 1e-6)  # Small offset keeps log finite at t == 0
        interaction = inv_temp * log_time  # Kinetic interaction term

        X_kinetic = torch.tensor(np.hstack([X_vals, inv_temp, log_time, interaction]))

        # Solvent features
        if self.mixed:
            A = torch.tensor(self.featurizer.loc[X["SOLVENT A NAME"]].values)
            B = torch.tensor(self.featurizer.loc[X["SOLVENT B NAME"]].values)
            pct = torch.tensor(X["SolventB%"].values.reshape(-1, 1))

            # The former flip branch, B * (1 - (1 - pct)) + A * (1 - pct), is
            # algebraically this same expression and differed only by
            # floating-point rounding. One expression serves both views and
            # makes the symmetry exact.
            X_chem = A * (1 - pct) + B * pct
        else:
            X_chem = torch.tensor(self.featurizer.loc[X["SOLVENT NAME"]].values)

        return torch.cat([X_kinetic, X_chem], dim=1)

In [5]:
# Robust MLP architecture
class MLPInternal(nn.Module):
    """Feed-forward network with a sigmoid output.

    Layer layout: an input ``BatchNorm1d``, then for every entry of
    ``hidden_dims`` a ``Linear -> BatchNorm1d -> ReLU -> Dropout`` group, and
    finally ``Linear -> Sigmoid``.
    """

    def __init__(self, input_dim, hidden_dims=(128, 128, 64), dropout=0.2,
                 output_dim=3):
        """Build the layer stack.

        Args:
            input_dim: Number of input features per sample.
            hidden_dims: Widths of the hidden layers, in order. Any iterable
                of ints works; the default is a tuple so that no mutable
                object is shared between calls.
            dropout: Dropout probability applied after every hidden ReLU.
            output_dim: Number of outputs (defaults to the three targets used
                in this notebook).
        """
        super().__init__()

        layers = [nn.BatchNorm1d(input_dim)]
        prev_dim = input_dim

        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            prev_dim = h_dim

        layers.extend([
            nn.Linear(prev_dim, output_dim),
            nn.Sigmoid(),  # Constrain output to [0, 1]
        ])

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for a batch ``x`` of shape (batch, input_dim)."""
        return self.net(x)

In [6]:
# Physics-Informed Model with TTA and Ensemble
class PhysicsInformedModel(nn.Module):
    """Seeded ensemble of ``MLPInternal`` networks on ``KineticFeaturizer`` features.

    ``train_model`` fits ``n_models`` independently seeded networks;
    ``predict`` returns the mean of their outputs. For ``data='full'`` the
    flipped featurizer view is added to the training set and averaged in at
    prediction time.
    """

    def __init__(self, data='single', n_models=5, hidden_dims=(128, 128, 64),
                 dropout=0.2, lr=5e-4, weight_decay=1e-5, epochs=300,
                 batch_size=32):
        """Store hyper-parameters and create the featurizer.

        Args:
            data: ``'full'`` selects mixed-solvent featurization and the
                flip view; any other value selects single-solvent handling.
            n_models: Number of ensemble members to train.
            hidden_dims: Hidden layer widths passed to ``MLPInternal``.
            dropout: Dropout probability passed to ``MLPInternal``.
            lr: AdamW learning rate.
            weight_decay: AdamW weight decay.
            epochs: Training epochs per ensemble member.
            batch_size: Mini-batch size used during training.
        """
        super().__init__()
        self.data_type = data
        self.n_models = n_models
        self.hidden_dims = hidden_dims
        self.dropout = dropout
        self.lr = lr
        self.weight_decay = weight_decay
        self.epochs = epochs
        self.batch_size = batch_size

        self.featurizer = KineticFeaturizer(mixed=(data=='full'))
        self.models = nn.ModuleList()

    def train_model(self, X_train, y_train):
        """Fit a fresh ensemble on ``(X_train, y_train)``.

        Any previously trained members are discarded first, so calling this
        method again retrains from scratch instead of growing the ensemble.

        Args:
            X_train: Input DataFrame accepted by the featurizer.
            y_train: Target DataFrame with one column per model output.
        """
        # Standard data
        X_std = self.featurizer.featurize(X_train, flip=False)
        y_vals = torch.tensor(y_train.values)

        if self.data_type == 'full':
            # Data augmentation with symmetric solvent swapping.
            # NOTE(review): KineticFeaturizer blends the two solvents'
            # descriptors linearly, so the flipped view appears to encode the
            # same mixture; the augmentation then mainly doubles the rows
            # seen per epoch - confirm this is intended.
            X_flip = self.featurizer.featurize(X_train, flip=True)
            X_all = torch.cat([X_std, X_flip], dim=0)
            y_all = torch.cat([y_vals, y_vals], dim=0)
        else:
            X_all = X_std
            y_all = y_vals

        input_dim = X_all.shape[1]
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Dataset, loader and loss do not depend on the ensemble member, so
        # they are built once.
        dataset = TensorDataset(X_all, y_all)
        n_samples = len(dataset)
        # BatchNorm1d cannot compute batch statistics from a single sample in
        # training mode; drop the trailing batch only when it would hold
        # exactly one row.
        drop_last = n_samples > self.batch_size and n_samples % self.batch_size == 1
        loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True,
                            drop_last=drop_last)
        criterion = nn.HuberLoss()  # Robust to outliers

        # Start from an empty ensemble so repeated calls do not accumulate
        # stale members.
        self.models = nn.ModuleList()

        for i in range(self.n_models):
            torch.manual_seed(42 + i)  # Different seed for each model
            model = MLPInternal(input_dim, self.hidden_dims, self.dropout).to(device)
            model.train()
            self.models.append(model)

            optimizer = torch.optim.AdamW(model.parameters(), lr=self.lr, weight_decay=self.weight_decay)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='min', factor=0.5, patience=20
            )

            for epoch in range(self.epochs):
                epoch_loss = 0.0
                n_seen = 0
                for inputs, targets in loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    epoch_loss += loss.item() * inputs.size(0)
                    n_seen += inputs.size(0)

                # Mean per-sample training loss drives the plateau scheduler.
                scheduler.step(epoch_loss / max(n_seen, 1))

    def predict(self, X):
        """Return the ensemble-mean prediction for ``X`` as a CPU tensor.

        For ``data='full'`` every member's standard and flipped predictions
        are averaged (test-time augmentation) before the ensemble mean.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        if len(self.models) == 0:
            raise RuntimeError(
                "predict() called before train_model(); no trained models available"
            )

        device = next(self.models[0].parameters()).device

        views = [self.featurizer.featurize(X, flip=False).to(device)]
        if self.data_type == 'full':
            # Test Time Augmentation (TTA) for mixed solvents
            views.append(self.featurizer.featurize(X, flip=True).to(device))

        total = None
        with torch.no_grad():
            for model in self.models:
                model.eval()
                view_mean = sum(model(view) for view in views) / len(views)
                total = view_mean if total is None else total + view_mean

        # Average over the members actually present, not the configured
        # n_models, so the result stays a true mean.
        avg_pred = total / len(self.models)

        return avg_pred.cpu()

In [7]:
# Test the model on a single fold first: a cheap end-to-end check (load,
# split, train, predict, score) before running the full cross-validation.
print("Testing on single fold...")
X, Y = load_data("single_solvent")
print(f"Single solvent data: {X.shape[0]} samples, {len(X['SOLVENT NAME'].unique())} solvents")

# Get first fold (the generator holds out solvents in sorted-name order)
split_gen = generate_leave_one_out_splits(X, Y)
(train_X, train_Y), (test_X, test_Y) = next(split_gen)
print(f"Train: {len(train_X)}, Test: {len(test_X)}")

# Train model
model = PhysicsInformedModel(data='single', n_models=2, epochs=50)  # Quick test: smaller ensemble, fewer epochs
model.train_model(train_X, train_Y)

# Predict; the sigmoid output layer keeps every value inside [0, 1]
preds = model.predict(test_X)
print(f"Predictions shape: {preds.shape}")
print(f"Predictions range: [{preds.min():.4f}, {preds.max():.4f}]")

# Calculate RMSE for this fold, pooled over all held-out rows and targets
rmse = np.sqrt(((preds.numpy() - test_Y.values) ** 2).mean())
print(f"Single fold RMSE: {rmse:.4f}")

Testing on single fold...
Single solvent data: 656 samples, 24 solvents
Train: 619, Test: 37


Predictions shape: torch.Size([37, 3])
Predictions range: [0.0252, 0.8897]
Single fold RMSE: 0.1850


In [8]:
# Full cross-validation for single solvent task
print("\n" + "="*50)
print("TASK 0: Single Solvent (Leave-One-Solvent-Out CV)")
print("="*50)

X, Y = load_data("single_solvent")
split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []
fold_rmses = []

# One fold per held-out solvent: fit a fresh ensemble on the remaining
# solvents, then score and record its predictions for the held-out rows.
for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=24):
    (train_X, train_Y), (test_X, test_Y) = split

    model = PhysicsInformedModel(data='single', n_models=5, epochs=300)
    model.train_model(train_X, train_Y)
    predictions = model.predict(test_X).numpy()

    # Fold RMSE pooled over every held-out row and all targets
    fold_rmse = np.sqrt(np.mean(np.square(predictions - test_Y.values)))
    fold_rmses.append(fold_rmse)

    # One submission record per held-out row, numbered within the fold
    all_predictions.extend(
        {
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2],
        }
        for row_idx, row in enumerate(predictions)
    )

submission_single_solvent = pd.DataFrame(all_predictions)
print(f"\nSingle Solvent CV RMSE: {np.mean(fold_rmses):.5f} ± {np.std(fold_rmses):.5f}")


TASK 0: Single Solvent (Leave-One-Solvent-Out CV)


  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [00:51<19:49, 51.72s/it]

  8%|▊         | 2/24 [01:43<18:56, 51.66s/it]

 12%|█▎        | 3/24 [02:32<17:43, 50.64s/it]

 17%|█▋        | 4/24 [03:22<16:44, 50.24s/it]

 21%|██        | 5/24 [04:14<16:04, 50.78s/it]

 25%|██▌       | 6/24 [05:06<15:21, 51.17s/it]

 29%|██▉       | 7/24 [05:57<14:32, 51.31s/it]

 33%|███▎      | 8/24 [06:49<13:41, 51.35s/it]

 38%|███▊      | 9/24 [07:40<12:51, 51.45s/it]

 42%|████▏     | 10/24 [08:32<12:02, 51.64s/it]

 46%|████▌     | 11/24 [09:25<11:14, 51.90s/it]

 50%|█████     | 12/24 [10:17<10:24, 52.01s/it]

 54%|█████▍    | 13/24 [11:09<09:33, 52.10s/it]

 58%|█████▊    | 14/24 [12:02<08:42, 52.20s/it]

 62%|██████▎   | 15/24 [12:54<07:49, 52.22s/it]

 67%|██████▋   | 16/24 [13:46<06:57, 52.21s/it]

 71%|███████   | 17/24 [14:41<06:10, 52.97s/it]

 75%|███████▌  | 18/24 [15:33<05:16, 52.69s/it]

 79%|███████▉  | 19/24 [16:25<04:22, 52.40s/it]

 83%|████████▎ | 20/24 [17:17<03:29, 52.30s/it]

 88%|████████▊ | 21/24 [18:09<02:36, 52.19s/it]

 92%|█████████▏| 22/24 [19:01<01:44, 52.19s/it]

 96%|█████████▌| 23/24 [19:53<00:52, 52.28s/it]

100%|██████████| 24/24 [20:47<00:00, 52.77s/it]

100%|██████████| 24/24 [20:47<00:00, 51.99s/it]


Single Solvent CV RMSE: 0.08719 ± 0.04350





In [None]:
# Full cross-validation for full data task
print("\n" + "="*50)
print("TASK 1: Full Data (Leave-One-Ramp-Out CV)")
print("="*50)

X, Y = load_data("full")
split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []
fold_rmses_full = []

# One fold per held-out solvent pair: fit a fresh ensemble on the remaining
# pairs, then score and record its predictions for the held-out rows.
for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=13):
    (train_X, train_Y), (test_X, test_Y) = split

    model = PhysicsInformedModel(data='full', n_models=5, epochs=300)
    model.train_model(train_X, train_Y)
    predictions = model.predict(test_X).numpy()

    # Fold RMSE pooled over every held-out row and all targets
    fold_rmse = np.sqrt(np.mean(np.square(predictions - test_Y.values)))
    fold_rmses_full.append(fold_rmse)

    # One submission record per held-out row, numbered within the fold
    all_predictions.extend(
        {
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2],
        }
        for row_idx, row in enumerate(predictions)
    )

submission_full_data = pd.DataFrame(all_predictions)
print(f"\nFull Data CV RMSE: {np.mean(fold_rmses_full):.5f} ± {np.std(fold_rmses_full):.5f}")

In [None]:
# Combine and save submission: stack the task-0 and task-1 prediction tables
# (task 0 first) and renumber the rows.
submission = pd.concat([submission_single_solvent, submission_full_data])
# NOTE(review): reset_index() without drop=True keeps each table's old row
# number as an extra "index" column in the written CSV - confirm that the
# expected submission format includes it.
submission = submission.reset_index()
submission.index.name = "id"

# Save submission: the same file is written to the experiment folder and to
# the submission folder. Both directories must already exist.
submission.to_csv("/home/code/experiments/001_baseline/submission.csv", index=True)
submission.to_csv("/home/submission/submission.csv", index=True)

print(f"\nSubmission saved with {len(submission)} rows")
print(submission.head())

In [None]:
# Final summary: repeat the per-task CV scores (mean ± std of the per-fold
# RMSEs) collected by the two cross-validation cells.
print("\n" + "="*50)
print("FINAL RESULTS")
print("="*50)
print(f"Single Solvent CV RMSE: {np.mean(fold_rmses):.5f} ± {np.std(fold_rmses):.5f}")
print(f"Full Data CV RMSE: {np.mean(fold_rmses_full):.5f} ± {np.std(fold_rmses_full):.5f}")

# Overall score (average of both tasks): an unweighted mean of the two task
# means, so each task counts equally regardless of its number of folds.
overall_rmse = (np.mean(fold_rmses) + np.mean(fold_rmses_full)) / 2
print(f"\nOverall CV RMSE: {overall_rmse:.5f}")