# Baseline Experiment
Implementing the seed strategy:
- Merging full and single solvent datasets.
- Feature engineering with Spange and PCA descriptors.
- Deep Sets architecture for mixture handling.
- Leave-One-Ramp-Out validation.

In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import sys
import os

# Adjust paths for this environment
DATA_DIR = '/home/data'

# Load utils
sys.path.append(DATA_DIR)
# We will redefine load_data to use the correct path
def load_data(name="full"):
    assert name in ["full", "single_solvent"]
    if name == "full":
        df = pd.read_csv(os.path.join(DATA_DIR, 'catechol_full_data_yields.csv'))
        X = df[INPUT_LABELS_FULL_SOLVENT]
    else:
        df = pd.read_csv(os.path.join(DATA_DIR, 'catechol_single_solvent_yields.csv'))
        X = df[INPUT_LABELS_SINGLE_SOLVENT]
    Y = df[TARGET_LABELS]
    return X, Y

def load_features(name="spange_descriptors"):
    # assert name in ["spange_descriptors", "acs_pca_descriptors", "drfps_catechol", "fragprints", "smiles"]
    features = pd.read_csv(os.path.join(DATA_DIR, f'{name}_lookup.csv'), index_col=0)
    return features

# Define constants from utils.py (since we can't easily import if it has bad paths)
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

# Load Data
X_full, Y_full = load_data("full")
X_single, Y_single = load_data("single_solvent")

print("Full data shape:", X_full.shape)
print("Single data shape:", X_single.shape)

# Preprocessing: Unify formats
# Convert single to mixture format
X_single_formatted = X_single.copy()
X_single_formatted["SOLVENT A NAME"] = X_single["SOLVENT NAME"]
X_single_formatted["SOLVENT B NAME"] = X_single["SOLVENT NAME"]
X_single_formatted["SolventB%"] = 0.0
X_single_formatted = X_single_formatted[INPUT_LABELS_FULL_SOLVENT]

# Merge
X_all = pd.concat([X_full, X_single_formatted], axis=0).reset_index(drop=True)
Y_all = pd.concat([Y_full, Y_single], axis=0).reset_index(drop=True)

print("Merged data shape:", X_all.shape)

# Load Features
spange = load_features("spange_descriptors")
pca = load_features("acs_pca_descriptors")

# Merge features
solvent_features = pd.concat([spange, pca], axis=1)
# Fill NaNs if any (though lookups should be complete for used solvents)
solvent_features = solvent_features.fillna(0)

print("Solvent features shape:", solvent_features.shape)

# Check if all solvents in data are in features
unique_solvents = set(X_all["SOLVENT A NAME"].unique()) | set(X_all["SOLVENT B NAME"].unique())
missing_solvents = unique_solvents - set(solvent_features.index)
print("Missing solvents:", missing_solvents)


Full data shape: (1227, 5)
Single data shape: (656, 3)
Merged data shape: (1883, 5)
Solvent features shape: (26, 18)
Missing solvents: set()


In [6]:
# Feature Engineering Function
def get_mixture_features(X_df, solvent_features_df):
    # Normalize Process Conditions
    # We will do this inside the CV loop to avoid leakage, but for the structure we define it here
    # Actually, for Deep Sets, we pass raw features and let the network handle it, 
    # but we need to look up the solvent descriptors.
    
    # Map solvent names to descriptors
    # We'll create arrays for Solvent A and Solvent B features
    
    # Get feature dimension
    feat_dim = solvent_features_df.shape[1]
    
    # Initialize arrays
    n_samples = len(X_df)
    feat_A = np.zeros((n_samples, feat_dim))
    feat_B = np.zeros((n_samples, feat_dim))
    
    # We can use pandas mapping for speed
    # Reindex solvent_features to match X_df order
    
    # Solvent A
    feat_A = solvent_features_df.loc[X_df["SOLVENT A NAME"]].values
    
    # Solvent B
    feat_B = solvent_features_df.loc[X_df["SOLVENT B NAME"]].values
    
    # Ratios
    # SolventB% is in percentage (0-100 presumably, let's check)
    ratio_B = X_df["SolventB%"].values.reshape(-1, 1) / 100.0
    ratio_A = 1.0 - ratio_B
    
    # Process conditions
    temp = X_df["Temperature"].values.reshape(-1, 1)
    time = X_df["Residence Time"].values.reshape(-1, 1)
    
    return feat_A, feat_B, ratio_A, ratio_B, temp, time

# Test feature generation
fA, fB, rA, rB, T, t = get_mixture_features(X_all.iloc[:5], solvent_features)
print("Feature shapes:", fA.shape, fB.shape, rA.shape, rB.shape, T.shape, t.shape)


Feature shapes: (5, 18) (5, 18) (5, 1) (5, 1) (5, 1) (5, 1)


In [7]:
# Model Definition
class DeepSetsMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=3):
        super(DeepSetsMLP, self).__init__()
        
        # Solvent Encoder (Shared)
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )
        
        # Predictor
        # Input: hidden_dim (from mixture) + 2 (Temp, Time)
        self.predictor = nn.Sequential(
            nn.Linear(hidden_dim + 2, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, output_dim)
        )
        
    def forward(self, feat_A, feat_B, ratio_A, ratio_B, temp, time):
        # Encode solvents
        h_A = self.encoder(feat_A)
        h_B = self.encoder(feat_B)
        
        # Weighted Aggregation (Deep Sets)
        h_mix = ratio_A * h_A + ratio_B * h_B
        
        # Concatenate with process conditions
        # Ensure shapes match
        combined = torch.cat([h_mix, temp, time], dim=1)
        
        # Predict
        out = self.predictor(combined)
        return out

# Wrapper class to satisfy the template requirement (conceptually)
# The template asks for `model = MLPModel()`. 
# We will implement a class that handles training and prediction to keep the notebook clean.

class MLPModel:
    def __init__(self, input_dim=None):
        self.model = None
        self.input_dim = input_dim
        self.scaler_temp = StandardScaler()
        self.scaler_time = StandardScaler()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
    def preprocess(self, X_df, solvent_features, fit_scalers=False):
        fA, fB, rA, rB, temp, time = get_mixture_features(X_df, solvent_features)
        
        if fit_scalers:
            temp = self.scaler_temp.fit_transform(temp)
            time = self.scaler_time.fit_transform(time)
        else:
            temp = self.scaler_temp.transform(temp)
            time = self.scaler_time.transform(time)
            
        # Convert to tensors
        fA = torch.FloatTensor(fA).to(self.device)
        fB = torch.FloatTensor(fB).to(self.device)
        rA = torch.FloatTensor(rA).to(self.device)
        rB = torch.FloatTensor(rB).to(self.device)
        temp = torch.FloatTensor(temp).to(self.device)
        time = torch.FloatTensor(time).to(self.device)
        
        return fA, fB, rA, rB, temp, time

    def fit(self, X_df, Y_df, solvent_features, epochs=100, batch_size=32, lr=0.001):
        if self.input_dim is None:
            self.input_dim = solvent_features.shape[1]
            
        self.model = DeepSetsMLP(self.input_dim).to(self.device)
        optimizer = optim.Adam(self.model.parameters(), lr=lr, weight_decay=1e-4)
        criterion = nn.MSELoss()
        
        # Preprocess
        fA, fB, rA, rB, temp, time = self.preprocess(X_df, solvent_features, fit_scalers=True)
        targets = torch.FloatTensor(Y_df.values).to(self.device)
        
        dataset = TensorDataset(fA, fB, rA, rB, temp, time, targets)
        # Use drop_last=True to avoid BatchNorm error with batch size 1
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
        
        self.model.train()
        for epoch in range(epochs):
            total_loss = 0
            for batch in loader:
                optimizer.zero_grad()
                b_fA, b_fB, b_rA, b_rB, b_temp, b_time, b_y = batch
                preds = self.model(b_fA, b_fB, b_rA, b_rB, b_temp, b_time)
                loss = criterion(preds, b_y)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            # if epoch % 10 == 0:
            #     print(f"Epoch {epoch}, Loss: {total_loss / len(loader):.4f}")
                
    def predict(self, X_df, solvent_features):
        self.model.eval()
        with torch.no_grad():
            fA, fB, rA, rB, temp, time = self.preprocess(X_df, solvent_features, fit_scalers=False)
            preds = self.model(fA, fB, rA, rB, temp, time)
        return preds.cpu().numpy()

In [8]:
# Validation Loop
# We need to implement `generate_leave_one_ramp_out_splits` logic here or import it.
# Since we can't import easily due to path issues in utils.py, I'll copy the logic.

def generate_leave_one_ramp_out_splits(X, Y):
    all_solvent_ramps = X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
    # Sort to ensure deterministic order
    all_solvent_ramps = all_solvent_ramps.sort_values(by=["SOLVENT A NAME", "SOLVENT B NAME"])
    
    for _, solvent_pair in all_solvent_ramps.iterrows():
        # Create mask for this pair
        # We need to match both A and B. 
        # Note: In the merged dataset, single solvents have A=B.
        
        mask = (X["SOLVENT A NAME"] == solvent_pair["SOLVENT A NAME"]) & \
               (X["SOLVENT B NAME"] == solvent_pair["SOLVENT B NAME"])
        
        train_mask = ~mask
        test_mask = mask
        
        yield (X[train_mask], Y[train_mask]), (X[test_mask], Y[test_mask])

# Run CV
scores = []
all_preds = []
all_targets = []

print("Starting Leave-One-Ramp-Out CV...")

# For speed in this baseline, we might limit folds or run all. 
# There are ~13 ramps + 24 single solvents? Let's check number of splits.
splits = list(generate_leave_one_ramp_out_splits(X_all, Y_all))
print(f"Total splits: {len(splits)}")

# Limit to first 5 splits for quick testing if needed, but better to run all for baseline.
# We will run all.

fold = 0
for (X_train, Y_train), (X_test, Y_test) in splits:
    fold += 1
    # print(f"Fold {fold}/{len(splits)}")
    
    model = MLPModel()
    model.fit(X_train, Y_train, solvent_features, epochs=50, lr=0.001) # Reduced epochs for speed
    
    preds = model.predict(X_test, solvent_features)
    
    # Calculate metrics
    mse = mean_squared_error(Y_test, preds)
    mae = mean_absolute_error(Y_test, preds)
    scores.append({"fold": fold, "mse": mse, "mae": mae})
    
    # Store preds for analysis
    # We need to align preds with the original index or just store them
    # Let's store them with the test dataframe
    test_res = X_test.copy()
    test_res[TARGET_LABELS] = Y_test
    test_res[[f"{t}_pred" for t in TARGET_LABELS]] = preds
    all_preds.append(test_res)

# Aggregate results
results_df = pd.DataFrame(scores)
print("\nCV Results:")
print(results_df.describe())

mean_mse = results_df["mse"].mean()
mean_mae = results_df["mae"].mean()
print(f"\nMean MSE: {mean_mse:.5f}")
print(f"Mean MAE: {mean_mae:.5f}")

# Save predictions
all_preds_df = pd.concat(all_preds)
all_preds_df.to_csv("cv_predictions.csv", index=False)
print("Saved cv_predictions.csv")


Starting Leave-One-Ramp-Out CV...
Total splits: 37



CV Results:
            fold        mse        mae
count  37.000000  37.000000  37.000000
mean   19.000000   0.009073   0.063882
std    10.824355   0.009026   0.025883
min     1.000000   0.001173   0.027722
25%    10.000000   0.003890   0.048155
50%    19.000000   0.006586   0.059722
75%    28.000000   0.011075   0.076156
max    37.000000   0.051368   0.146515

Mean MSE: 0.00907
Mean MAE: 0.06388
Saved cv_predictions.csv
