# Experiment 004: Feature Expansion + Deeper MLP

Combining:
1. Spange descriptors (13 features)
2. DRFP reduced with PCA to 20 components (the lookup table has only 24 solvents, so 30 is not reachable)
3. Fragprints reduced with PCA to 20 components (same 24-solvent limit)
4. Arrhenius kinetics features (3 features)
5. Deeper MLP [256, 256, 128, 64]
6. Test-time augmentation (TTA) for mixed solvents: average the predictions for the original and the A/B-swapped input
7. Ensemble of 5 models

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Report the accelerator once, then switch every new tensor/module to float64.
gpu_available = torch.cuda.is_available()
print(f"GPU available: {gpu_available}")
if gpu_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
torch.set_default_dtype(torch.float64)

GPU available: True
GPU: NVIDIA H100 80GB HBM3


In [2]:
# Data loading utilities
# Directory holding the yield CSVs and the `*_lookup.csv` feature tables.
DATA_PATH = '/home/data'

# Numeric process variables shared by both datasets (column 0 = time, column 1 = temperature).
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Input columns of the single-solvent dataset.
INPUT_LABELS_SINGLE_SOLVENT = INPUT_LABELS_NUMERIC + ["SOLVENT NAME"]
# Input columns of the mixed-solvent ("full") dataset.
INPUT_LABELS_FULL_SOLVENT = INPUT_LABELS_NUMERIC + ["SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
# Regression targets, in the order the model outputs them.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one of the two yield tables and split it into inputs and targets.

    Args:
        name: ``"full"`` selects the mixed-solvent CSV; any other value selects
            the single-solvent CSV.

    Returns:
        ``(X, Y)`` where ``X`` holds the input columns for the chosen dataset
        and ``Y`` holds the ``TARGET_LABELS`` columns of the same rows.
    """
    use_full = name == "full"
    if use_full:
        csv_path = f'{DATA_PATH}/catechol_full_data_yields.csv'
    else:
        csv_path = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
    input_columns = INPUT_LABELS_FULL_SOLVENT if use_full else INPUT_LABELS_SINGLE_SOLVENT

    df = pd.read_csv(csv_path)
    return df[input_columns], df[TARGET_LABELS]

def load_features(name="spange_descriptors"):
    """Read ``{DATA_PATH}/{name}_lookup.csv``, using its first column as the index.

    The featurizer in this notebook looks rows up by solvent name, so the
    first column is presumably the solvent name.
    """
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    return pd.read_csv(lookup_path, index_col=0)

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out folds in alphabetical order of solvent name.

    Each item is ``((X_train, Y_train), (X_test, Y_test))``; the test part
    contains exactly the rows whose ``"SOLVENT NAME"`` equals the held-out solvent.
    """
    solvent_column = X["SOLVENT NAME"]
    for held_out_name in sorted(solvent_column.unique()):
        is_test = solvent_column == held_out_name
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one fold per distinct (solvent A, solvent B) pair.

    Pairs are visited sorted by A name, then B name. Each item is
    ``((X_train, Y_train), (X_test, Y_test))``; the test part contains every
    row whose A and B names both match the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    distinct_pairs = X[pair_columns].drop_duplicates().sort_values(pair_columns)
    for name_a, name_b in distinct_pairs.itertuples(index=False, name=None):
        is_test = (X["SOLVENT A NAME"] == name_a) & (X["SOLVENT B NAME"] == name_b)
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

In [3]:
# Load all feature sets
SPANGE_DF, DRFP_DF, FRAGPRINTS_DF = (
    load_features(table_name)
    for table_name in ('spange_descriptors', 'drfps_catechol', 'fragprints')
)

# Shapes are (rows in the lookup table, feature columns).
for label, table in (("Spange", SPANGE_DF), ("DRFP", DRFP_DF), ("Fragprints", FRAGPRINTS_DF)):
    print(f"{label}: {table.shape}")

# Check non-zero columns
# A column counts as non-zero when at least one row has a value other than 0.
drfp_nonzero = DRFP_DF.ne(0).any(axis=0).sum()
frag_nonzero = FRAGPRINTS_DF.ne(0).any(axis=0).sum()
print(f"\nDRFP non-zero columns: {drfp_nonzero}")
print(f"Fragprints non-zero columns: {frag_nonzero}")

Spange: (26, 13)
DRFP: (24, 2048)
Fragprints: (24, 2133)

DRFP non-zero columns: 165
Fragprints non-zero columns: 144


In [4]:
# Apply PCA to reduce DRFP and Fragprints
# Note: PCA can return at most min(n_rows, n_columns) components. The DRFP and
# Fragprints tables have one row per solvent (24 rows each per the shapes printed
# when they were loaded; only the Spange table has 26), so N_COMPONENTS must not
# exceed the row count of the table being reduced.

N_COMPONENTS = 20  # Safe value below the 24 rows available


def _standardize_and_project(table, n_components, prefix):
    """Standardize each column of ``table`` and project it onto its leading PCs.

    Args:
        table: DataFrame of raw fingerprint features, one row per index entry.
        n_components: number of principal components to keep.
        prefix: column-name prefix; columns are named ``{prefix}_pc0``, ``{prefix}_pc1``, ...

    Returns:
        ``(scaler, scaled, pca, scores)``: the fitted ``StandardScaler``, the
        standardized array, the fitted ``PCA`` and a DataFrame of PC scores
        that keeps ``table``'s index.

    Raises:
        ValueError: if ``n_components`` exceeds ``min(table.shape)``.
    """
    max_components = min(table.shape)
    if n_components > max_components:
        raise ValueError(
            f"n_components={n_components} exceeds the {max_components} components "
            f"available for a table of shape {table.shape}"
        )
    scaler = StandardScaler()
    scaled = scaler.fit_transform(table)
    # random_state only matters if the randomized/arpack solver is selected; it
    # keeps the projection reproducible should n_components be changed later.
    pca = PCA(n_components=n_components, random_state=0)
    scores = pd.DataFrame(
        pca.fit_transform(scaled),
        index=table.index,
        columns=[f'{prefix}_pc{i}' for i in range(n_components)]
    )
    return scaler, scaled, pca, scores


# DRFP PCA
drfp_scaler, drfp_scaled, drfp_pca, DRFP_PCA = _standardize_and_project(DRFP_DF, N_COMPONENTS, 'drfp')
print(f"DRFP PCA explained variance: {drfp_pca.explained_variance_ratio_.sum():.4f}")

# Fragprints PCA
frag_scaler, frag_scaled, frag_pca, FRAGPRINTS_PCA = _standardize_and_project(FRAGPRINTS_DF, N_COMPONENTS, 'frag')
print(f"Fragprints PCA explained variance: {frag_pca.explained_variance_ratio_.sum():.4f}")

print(f"\nDRFP PCA shape: {DRFP_PCA.shape}")
print(f"Fragprints PCA shape: {FRAGPRINTS_PCA.shape}")

DRFP PCA explained variance: 0.9985
Fragprints PCA explained variance: 0.9905

DRFP PCA shape: (24, 20)
Fragprints PCA shape: (24, 20)


In [5]:
# Extended Featurizer with all feature sets
class ExtendedFeaturizer:
    """Featurizer with Spange + DRFP PCA + Fragprints PCA + Arrhenius kinetics.

    Output columns, left to right:
      * the raw ``INPUT_LABELS_NUMERIC`` columns (residence time, temperature),
      * 3 kinetic terms: ``1000 / T[K]``, ``ln(time)`` and their product,
      * the Spange descriptors,
      * DRFP principal components (when ``use_drfp``),
      * Fragprints principal components (when ``use_fragprints``).

    In mixed mode each solvent table is blended linearly:
    ``A * (1 - pct) + B * pct`` with ``pct = X["SolventB%"]``.

    Attributes:
        feats_dim: number of columns ``featurize`` returns, derived from the
            widths of the lookup tables actually in use.
    """

    # ln(time), 1000/T and their product.
    _N_KINETIC_FEATURES = 3

    def __init__(self, mixed=False, use_drfp=True, use_fragprints=True):
        """
        Args:
            mixed: True for the two-solvent dataset (columns "SOLVENT A NAME",
                "SOLVENT B NAME", "SolventB%"), False for "SOLVENT NAME".
            use_drfp: append the DRFP principal components.
            use_fragprints: append the Fragprints principal components.
        """
        self.mixed = mixed
        self.use_drfp = use_drfp
        self.use_fragprints = use_fragprints

        self.spange = SPANGE_DF
        self.drfp_pca = DRFP_PCA if use_drfp else None
        self.frag_pca = FRAGPRINTS_PCA if use_fragprints else None

        # Derive the width from the tables themselves so it cannot drift from
        # what featurize() really returns (58 with 13 Spange + 20 + 20 PCs).
        self.feats_dim = (
            len(INPUT_LABELS_NUMERIC)
            + self._N_KINETIC_FEATURES
            + sum(table.shape[1] for table in self._solvent_tables())
        )

    def _solvent_tables(self):
        """Return the enabled solvent lookup tables in output-column order."""
        tables = [self.spange]
        if self.use_drfp:
            tables.append(self.drfp_pca)
        if self.use_fragprints:
            tables.append(self.frag_pca)
        return tables

    def featurize(self, X, flip=False):
        """Build the feature matrix for the rows of ``X``.

        Args:
            X: DataFrame with the numeric input columns plus the solvent
                column(s) for the configured mode.
            flip: mixed mode only. Swap solvents A and B and replace ``pct``
                by ``1 - pct``. Ignored in single-solvent mode.
                NOTE(review): because the blend is linear, the flipped features
                equal the unflipped ones up to floating-point rounding, so
                flipping adds no new information. Confirm whether this is intended.

        Returns:
            float64 array of shape ``(len(X), feats_dim)``.

        Raises:
            KeyError: if a solvent name is missing from an enabled lookup table.
        """
        X_vals = X[INPUT_LABELS_NUMERIC].values.astype(np.float64)

        # Arrhenius kinetics features
        # Column 0 is residence time, column 1 is temperature (treated as degrees C).
        temp_c, time_m = X_vals[:, 1:2], X_vals[:, 0:1]
        temp_k = temp_c + 273.15
        inv_temp = 1000.0 / temp_k
        log_time = np.log(time_m + 1e-6)  # epsilon guards against log(0)
        interaction = inv_temp * log_time
        features = [np.hstack([X_vals, inv_temp, log_time, interaction])]

        # Solvent features
        if self.mixed:
            solvent_a = X["SOLVENT A NAME"].values
            solvent_b = X["SOLVENT B NAME"].values
            # The blend below assumes a 0-1 fraction, not 0-100. TODO confirm against the CSV.
            pct = X["SolventB%"].values.reshape(-1, 1)

            if flip:
                solvent_a, solvent_b = solvent_b, solvent_a
                pct = 1 - pct

            # Linear mixing, applied identically to every enabled table.
            for table in self._solvent_tables():
                part_a = table.loc[solvent_a].values
                part_b = table.loc[solvent_b].values
                features.append(part_a * (1 - pct) + part_b * pct)
        else:
            solvent = X["SOLVENT NAME"].values
            for table in self._solvent_tables():
                features.append(table.loc[solvent].values)

        return np.hstack(features)

In [6]:
# Deeper MLP architecture
class DeeperMLP(nn.Module):
    """Fully connected regressor: BatchNorm on the input, then
    Linear -> BatchNorm -> ReLU -> Dropout per hidden width, then a
    3-unit Linear head squashed by a Sigmoid.

    Args:
        input_dim: number of input features.
        hidden_dims: widths of the hidden layers, in order.
        dropout: dropout probability used after every hidden activation.
    """

    def __init__(self, input_dim, hidden_dims=[256, 256, 128, 64], dropout=0.2):
        super().__init__()
        widths = [input_dim, *hidden_dims]

        stack = [nn.BatchNorm1d(input_dim)]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        # Output head: three targets, each constrained to (0, 1).
        stack += [nn.Linear(widths[-1], 3), nn.Sigmoid()]

        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 3)`` outputs."""
        return self.net(x)

In [7]:
# Extended Model with feature expansion
class ExtendedModel(nn.Module):
    """Ensemble of ``DeeperMLP`` networks trained on ``ExtendedFeaturizer`` features.

    ``train_model`` fits ``n_models`` networks that differ only in their random
    seed (``42 + i``); ``predict`` returns the mean of their outputs.

    For ``data='full'`` the training set is stacked with its A/B-swapped copy
    and predictions are averaged over both orientations.
    NOTE(review): ``ExtendedFeaturizer`` blends the two solvents linearly, so the
    swapped features equal the original ones up to rounding. The swap therefore
    duplicates rows rather than adding new views. Confirm whether this is intended.

    Args:
        data: ``'full'`` for the mixed-solvent dataset, anything else for single solvent.
        n_models: number of ensemble members.
        use_drfp, use_fragprints: forwarded to ``ExtendedFeaturizer``.
        hidden_dims, dropout: forwarded to ``DeeperMLP``.
        lr: AdamW learning rate.
        epochs: training epochs per ensemble member.
        batch_size: mini-batch size used during training (must be >= 2).
    """

    def __init__(self, data='single', n_models=5, use_drfp=True, use_fragprints=True,
                 hidden_dims=[256, 256, 128, 64], dropout=0.2, lr=5e-4, epochs=300,
                 batch_size=32):
        super().__init__()
        self.data_type = data
        self.n_models = n_models
        # Copy so instances never share (or mutate) the default list object.
        self.hidden_dims = list(hidden_dims)
        self.dropout = dropout
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size

        self.featurizer = ExtendedFeaturizer(
            mixed=(data=='full'),
            use_drfp=use_drfp,
            use_fragprints=use_fragprints
        )
        self.models = nn.ModuleList()

    def train_model(self, X_train, y_train):
        """Fit a fresh ensemble, discarding any previously trained members.

        Args:
            X_train: input DataFrame accepted by the featurizer.
            y_train: DataFrame of the three targets, row-aligned with ``X_train``.

        Raises:
            ValueError: if ``batch_size`` or the number of training rows is
                below 2 (``BatchNorm1d`` cannot normalise a single-row batch
                in training mode).
        """
        X_feats = self.featurizer.featurize(X_train, flip=False)
        y_vals = y_train.values

        if self.data_type == 'full':
            X_flip = self.featurizer.featurize(X_train, flip=True)
            X_all = np.vstack([X_feats, X_flip])
            y_all = np.vstack([y_vals, y_vals])
        else:
            X_all = X_feats
            y_all = y_vals

        n_samples = X_all.shape[0]
        if self.batch_size < 2 or n_samples < 2:
            raise ValueError(
                "BatchNorm1d needs at least 2 rows per training batch "
                f"(got batch_size={self.batch_size}, n_samples={n_samples})"
            )
        # A trailing batch of exactly one row makes BatchNorm1d raise in training
        # mode; drop it in that case only, so every other run is unchanged.
        drop_last = n_samples > self.batch_size and n_samples % self.batch_size == 1

        # Match the parameter dtype explicitly instead of relying on the numpy
        # dtype happening to equal the global default dtype.
        dtype = torch.get_default_dtype()
        X_tensor = torch.tensor(X_all, dtype=dtype)
        y_tensor = torch.tensor(y_all, dtype=dtype)
        dataset = TensorDataset(X_tensor, y_tensor)

        input_dim = X_all.shape[1]
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.models = nn.ModuleList()
        for i in range(self.n_models):
            # Seeds both weight initialisation and the loader's shuffling order.
            torch.manual_seed(42 + i)
            model = DeeperMLP(input_dim, self.hidden_dims, self.dropout).to(device)
            model.train()
            self.models.append(model)

            loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True,
                                drop_last=drop_last)

            optimizer = torch.optim.AdamW(model.parameters(), lr=self.lr, weight_decay=1e-5)
            criterion = nn.HuberLoss()
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=20)

            for epoch in range(self.epochs):
                epoch_loss = 0.0
                n_seen = 0
                for inputs, targets in loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    epoch_loss += loss.item() * inputs.size(0)
                    n_seen += inputs.size(0)
                # The scheduler watches the mean training loss (no validation split here).
                scheduler.step(epoch_loss / n_seen)

    def predict(self, X_test):
        """Return the ensemble-mean prediction as a CPU tensor of shape ``(len(X_test), 3)``.

        Raises:
            RuntimeError: if called before ``train_model``.
        """
        if len(self.models) == 0:
            raise RuntimeError("predict() called before train_model()")

        reference_param = next(self.models[0].parameters())
        device, dtype = reference_param.device, reference_param.dtype

        # Mixed-solvent data is scored in both A/B orientations and averaged.
        orientations = (False, True) if self.data_type == 'full' else (False,)
        views = [
            torch.tensor(self.featurizer.featurize(X_test, flip=flipped), dtype=dtype).to(device)
            for flipped in orientations
        ]

        pred_sum = torch.zeros((len(X_test), 3), dtype=dtype, device=device)
        with torch.no_grad():
            for model in self.models:
                model.eval()
                member_pred = model(views[0])
                for extra_view in views[1:]:
                    member_pred = member_pred + model(extra_view)
                pred_sum += member_pred / len(views)
        avg_pred = pred_sum / len(self.models)

        return avg_pred.cpu()

In [8]:
# Test on single fold
# Smoke test: first leave-one-solvent-out fold only, with a small, briefly trained ensemble.
print("Testing Extended Model on single fold...")
X, Y = load_data("single_solvent")
split_gen = generate_leave_one_out_splits(X, Y)
first_fold = next(split_gen)
(train_X, train_Y), (test_X, test_Y) = first_fold
print(f"Train: {len(train_X)}, Test: {len(test_X)}")

model = ExtendedModel(data='single', n_models=2, epochs=50)  # Quick test
print(f"Feature dimension: {model.featurizer.feats_dim}")
model.train_model(train_X, train_Y)

preds = model.predict(test_X)
# RMSE pooled over all held-out rows and all three targets.
squared_error = np.square(preds.numpy() - test_Y.values)
rmse = np.sqrt(np.mean(squared_error))
print(f"Single fold RMSE: {rmse:.4f}")

Testing Extended Model on single fold...
Train: 619, Test: 37
Feature dimension: 58


Single fold RMSE: 0.2548


In [9]:
# Full CV for single solvent task
print("\n" + "="*50)
print("TASK 0: Single Solvent (Leave-One-Solvent-Out CV)")
print("="*50)

X, Y = load_data("single_solvent")
split_generator = generate_leave_one_out_splits(X, Y)
# One fold per distinct solvent; derived from the data so the progress bar
# total cannot go stale if the dataset changes.
n_folds = len(X["SOLVENT NAME"].unique())
all_predictions = []
fold_rmses = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=n_folds):
    (train_X, train_Y), (test_X, test_Y) = split

    # A fresh ensemble per fold, so nothing learned on one fold leaks into the next.
    model = ExtendedModel(data='single', n_models=5, epochs=300)
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X).numpy()
    # Per-fold RMSE pooled over the held-out rows and all three targets.
    fold_rmse = np.sqrt(((predictions - test_Y.values) ** 2).mean())
    fold_rmses.append(fold_rmse)

    all_predictions.extend(
        {"task": 0, "fold": fold_idx, "row": row_idx,
         "target_1": row[0], "target_2": row[1], "target_3": row[2]}
        for row_idx, row in enumerate(predictions)
    )

submission_single_solvent = pd.DataFrame(all_predictions)
# Reported score is the unweighted mean of the per-fold RMSEs.
print(f"\nSingle Solvent CV RMSE: {np.mean(fold_rmses):.5f} ± {np.std(fold_rmses):.5f}")


TASK 0: Single Solvent (Leave-One-Solvent-Out CV)


  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [01:01<23:25, 61.13s/it]

  8%|▊         | 2/24 [02:02<22:23, 61.08s/it]

 12%|█▎        | 3/24 [02:59<20:49, 59.51s/it]

 17%|█▋        | 4/24 [03:58<19:39, 59.00s/it]

 21%|██        | 5/24 [05:01<19:10, 60.55s/it]

 25%|██▌       | 6/24 [06:04<18:28, 61.57s/it]

 29%|██▉       | 7/24 [07:08<17:36, 62.17s/it]

 33%|███▎      | 8/24 [08:12<16:43, 62.71s/it]

 38%|███▊      | 9/24 [09:14<15:37, 62.52s/it]

 42%|████▏     | 10/24 [10:15<14:31, 62.22s/it]

 46%|████▌     | 11/24 [11:17<13:24, 61.91s/it]

 50%|█████     | 12/24 [12:18<12:22, 61.84s/it]

 54%|█████▍    | 13/24 [13:19<11:16, 61.49s/it]

 58%|█████▊    | 14/24 [14:20<10:14, 61.45s/it]

 62%|██████▎   | 15/24 [15:22<09:13, 61.49s/it]

 67%|██████▋   | 16/24 [16:22<08:09, 61.23s/it]

 71%|███████   | 17/24 [17:27<07:16, 62.30s/it]

 75%|███████▌  | 18/24 [18:29<06:12, 62.02s/it]

 79%|███████▉  | 19/24 [19:30<05:09, 61.89s/it]

 83%|████████▎ | 20/24 [20:32<04:06, 61.75s/it]

 88%|████████▊ | 21/24 [21:33<03:04, 61.56s/it]

 92%|█████████▏| 22/24 [22:34<02:03, 61.57s/it]

 96%|█████████▌| 23/24 [23:36<01:01, 61.47s/it]

100%|██████████| 24/24 [24:37<00:00, 61.53s/it]

100%|██████████| 24/24 [24:37<00:00, 61.57s/it]


Single Solvent CV RMSE: 0.10092 ± 0.05125





In [None]:
# Full CV for full data task
print("\n" + "="*50)
print("TASK 1: Full Data (Leave-One-Ramp-Out CV)")
print("="*50)

X, Y = load_data("full")
split_generator = generate_leave_one_ramp_out_splits(X, Y)
# One fold per distinct (solvent A, solvent B) pair; derived from the data so
# the progress bar total cannot go stale if the dataset changes.
n_folds = len(X[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates())
all_predictions = []
fold_rmses_full = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator), total=n_folds):
    (train_X, train_Y), (test_X, test_Y) = split

    # A fresh ensemble per fold, so nothing learned on one fold leaks into the next.
    model = ExtendedModel(data='full', n_models=5, epochs=300)
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X).numpy()
    # Per-fold RMSE pooled over the held-out rows and all three targets.
    fold_rmse = np.sqrt(((predictions - test_Y.values) ** 2).mean())
    fold_rmses_full.append(fold_rmse)

    all_predictions.extend(
        {"task": 1, "fold": fold_idx, "row": row_idx,
         "target_1": row[0], "target_2": row[1], "target_3": row[2]}
        for row_idx, row in enumerate(predictions)
    )

submission_full_data = pd.DataFrame(all_predictions)
# Reported score is the unweighted mean of the per-fold RMSEs.
print(f"\nFull Data CV RMSE: {np.mean(fold_rmses_full):.5f} ± {np.std(fold_rmses_full):.5f}")

In [None]:
# Save submission
# reset_index() without drop=True keeps each task's original row index as an
# "index" column; the new running index is then written out under the name "id".
stacked_tasks = pd.concat([submission_single_solvent, submission_full_data])
submission = stacked_tasks.reset_index()
submission.index.name = "id"

# Same file written twice: once next to the experiment code, once to the submission directory.
for output_path in (
    "/home/code/experiments/004_feature_expansion/submission.csv",
    "/home/submission/submission.csv",
):
    submission.to_csv(output_path, index=True)
print(f"Submission saved with {len(submission)} rows")

In [None]:
# Final summary
banner = "=" * 50
print("\n" + banner)
print("FINAL RESULTS - Feature Expansion + Deeper MLP")
print(banner)

# Mean and spread of the per-fold RMSEs for each task.
single_mean, single_std = np.mean(fold_rmses), np.std(fold_rmses)
full_mean, full_std = np.mean(fold_rmses_full), np.std(fold_rmses_full)
print(f"Single Solvent CV RMSE: {single_mean:.5f} ± {single_std:.5f}")
print(f"Full Data CV RMSE: {full_mean:.5f} ± {full_std:.5f}")

# Both tasks weigh equally, regardless of how many folds or rows each has.
overall_rmse = (single_mean + full_mean) / 2
print(f"\nOverall CV RMSE: {overall_rmse:.5f}")
print("\nComparison:")
print("  Baseline MLP (Spange only): 0.08819")
print(f"  Extended (Spange+DRFP+Frag): {overall_rmse:.5f}")
print("  Target:                      0.04740")