# Experiment 016: Hybrid Model with Task-Specific Configurations

**Key insight from exp_015 analysis:**
- Single solvent: exp_015 (0.0638) is BETTER than exp_004 (0.0659)
- Full data: exp_004 (0.0603) is MUCH BETTER than exp_015 (0.1027)

**Solution: Use different configurations for each task:**
- Single solvent: Deep models + MLP + COMBINED features (exp_015 approach)
- Full data: Shallow models + NO MLP + COMBINED features plus Arrhenius kinetics features (exp_004 approach)

**Expected CV**: 0.0638 * 0.35 + 0.0603 * 0.65 = 0.0615 (better than exp_004's 0.0623!)

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor
from abc import ABC
import tqdm
import warnings
# Suppress all library warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Directory that holds the yield CSVs and the descriptor lookup tables.
DATA_PATH = '/home/data'

# Make float64 the default so torch tensors match the numpy/sklearn arrays.
torch.set_default_dtype(torch.double)

# Pick the compute device once; report the GPU name only when one is present.
_cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ok}")
device = torch.device('cuda' if _cuda_ok else 'cpu')
if _cuda_ok:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA available: True
GPU: NVIDIA H100 80GB HBM3


In [2]:
# --- LOAD FEATURES ---
# Target columns, in the order every model output follows.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one of the two yield tables and split it into inputs and targets.

    Args:
        name: "full" for the solvent-mixture table or "single_solvent" for
            the single-solvent table. Any other value fails the assertion.

    Returns:
        (X, Y): X holds the process/solvent input columns for the chosen
        table, Y holds the TARGET_LABELS columns.
    """
    assert name in ["full", "single_solvent"]
    if name == "single_solvent":
        csv_file = f'{DATA_PATH}/catechol_single_solvent_yields.csv'
        input_cols = ["Residence Time", "Temperature", "SOLVENT NAME"]
    else:
        csv_file = f'{DATA_PATH}/catechol_full_data_yields.csv'
        input_cols = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
    table = pd.read_csv(csv_file)
    return table[input_cols], table[TARGET_LABELS]

# Load features
def _descriptor_lookup(table):
    """Map each 'SOLVENT NAME' to the remaining columns of its row as a float array."""
    lookup = {}
    for _, record in table.iterrows():
        lookup[record['SOLVENT NAME']] = record.drop('SOLVENT NAME').values.astype(float)
    return lookup

Spange = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv')
ACS_PCA = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv')

print(f"Spange: {Spange.shape}")
print(f"ACS_PCA: {ACS_PCA.shape}")

# Create lookup dictionaries
Spange_dict = _descriptor_lookup(Spange)
ACS_PCA_dict = _descriptor_lookup(ACS_PCA)

Spange: (26, 14)
ACS_PCA: (24, 6)


In [3]:
# --- LOO UTILITY FUNCTIONS (REQUIRED FOR SUBMISSION) ---
def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out folds for the single-solvent data.

    Solvents are held out in sorted name order. Each item is
    ((X_train, Y_train), (X_test, Y_test)), where the test part contains
    exactly the rows whose "SOLVENT NAME" equals the held-out solvent.
    """
    solvent_col = X["SOLVENT NAME"]
    held_out_order = list(solvent_col.unique())
    held_out_order.sort()
    for held_out in held_out_order:
        is_test = solvent_col == held_out
        is_train = ~is_test
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield leave-one-ramp-out folds for the full (mixture) data.

    A ramp is one distinct ("SOLVENT A NAME", "SOLVENT B NAME") pair. Pairs
    are held out in order of first appearance in X. Each item is
    ((X_train, Y_train), (X_test, Y_test)), where the test part contains
    every row of the held-out pair.
    """
    pair_cols = ["SOLVENT A NAME", "SOLVENT B NAME"]
    unique_pairs = X[pair_cols].drop_duplicates()
    for solvent_a, solvent_b in zip(unique_pairs["SOLVENT A NAME"], unique_pairs["SOLVENT B NAME"]):
        in_ramp = (X["SOLVENT A NAME"] == solvent_a) & (X["SOLVENT B NAME"] == solvent_b)
        out_of_ramp = ~in_ramp
        yield (X[out_of_ramp], Y[out_of_ramp]), (X[in_ramp], Y[in_ramp])

print("LOO utility functions defined")

LOO utility functions defined


In [4]:
# --- BASE CLASSES ---
class SmilesFeaturizer(ABC):
    """Placeholder featurizer interface; every method here is unimplemented."""

    def __init__(self):
        # Constructing this class directly always fails; a subclass has to
        # supply its own __init__.
        raise NotImplementedError

    def featurize(self, X):
        """Convert ``X`` into model features (to be supplied by a subclass)."""
        raise NotImplementedError

class BaseModel(ABC):
    """Minimal model interface used by the cross-validation loops.

    Subclasses override ``train_model`` to fit on one fold's training data
    and ``predict`` to return predictions for that fold's test inputs.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs ``X_train`` and targets ``y_train``."""
        raise NotImplementedError

    def predict(self, X_test=None):
        """Return predictions for ``X_test``.

        The argument is optional only so that the historical zero-argument
        call keeps raising NotImplementedError instead of TypeError; the
        evaluation loops always pass the test inputs.
        """
        raise NotImplementedError

In [5]:
# --- MLP ARCHITECTURE (for single solvent only) ---
class SimpleMLP(nn.Module):
    """MLP with BatchNorm + ReLU + Dropout and a sigmoid output layer.

    Layout: BatchNorm1d on the raw inputs, then one
    Linear -> BatchNorm1d -> ReLU -> Dropout stage per entry of
    ``hidden_dims``, then Linear -> Sigmoid, so every output lies in [0, 1].

    Args:
        input_dim: number of input features.
        hidden_dims: hidden layer widths, in order (any iterable of ints).
        output_dim: number of outputs.
        dropout: dropout probability applied after every hidden stage.
    """
    def __init__(self, input_dim, hidden_dims=(128, 64, 32), output_dim=3, dropout=0.2):
        super().__init__()
        # The default is an immutable tuple so it cannot be shared and mutated
        # between instances.
        layers = [nn.BatchNorm1d(input_dim)]

        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            prev_dim = h_dim

        layers.extend([nn.Linear(prev_dim, output_dim), nn.Sigmoid()])
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run the network on ``x`` (first dimension is the batch)."""
        if x.size(0) == 1 and self.training:
            # BatchNorm cannot compute batch statistics from a single sample in
            # training mode, so a batch of one is evaluated with the running
            # statistics instead (dropout is off for that step as well).
            self.eval()
            try:
                return self.network(x)
            finally:
                # Always restore training mode, even if the forward pass raised.
                self.train()
        return self.network(x)

print("SimpleMLP defined")

SimpleMLP defined


In [6]:
# --- HYBRID TASK-SPECIFIC MODEL ---
class HybridTaskModel(BaseModel):
    """Hybrid model with task-specific configurations.

    Shared by both tasks:
    - HistGradientBoostingRegressor predicts the third target column
      ("SM" when targets follow TARGET_LABELS order).
    - A multi-output ExtraTreesRegressor predicts the first two target
      columns (the two products).
    - Solvent descriptors are the COMBINED block
      (0.8 * ACS_PCA, 0.2 * Spange), standard-scaled together with the
      process variables.
    - Final predictions are clipped to [0, 1].

    Single solvent (exp_015 approach):
    - Unlimited-depth trees plus a SimpleMLP.
    - Output = 0.5 * MLP + 0.5 * tree predictions.

    Full data (exp_004 approach):
    - Shallow trees (HGB depth 7, ETR depth 10), no MLP.
    - Descriptors of solvents A and B are linearly mixed by the B fraction.
    - Extra Arrhenius-style features: 1000/T[K], ln(t), and their product.

    Args:
        data: 'single' or 'full'; selects the configuration above.
        seed: random seed for the tree models and, when not None, for the
            MLP initialisation and mini-batch shuffling.

    Raises:
        ValueError: if ``data`` is neither 'single' nor 'full'.
    """

    # Blend weights of the two descriptor families inside the feature vector.
    SPANGE_WEIGHT = 0.2
    ACS_WEIGHT = 0.8
    # Widths of the all-zero fallback vectors used for solvents missing from a
    # lookup table (the loaded tables have 13 Spange and 5 ACS_PCA columns).
    SPANGE_DIM = 13
    ACS_PCA_DIM = 5

    def __init__(self, data='single', seed=42):
        super().__init__()
        # Fail fast on typos such as 'single_solvent' (the name load_data
        # uses), which would otherwise silently select the full-data setup.
        if data not in ('single', 'full'):
            raise ValueError(f"data must be 'single' or 'full', got {data!r}")
        self.data = data
        self.seed = seed
        self.scaler = StandardScaler()
        self.mlp = None
        self.hgb = None
        self.etr = None

        # Task-specific configurations
        if data == 'single':
            # exp_015 approach: Deep + MLP
            self.use_mlp = True
            self.mlp_weight = 0.5
            self.hgb_depth = None  # Unlimited
            self.etr_depth = None  # Unlimited
            self.hgb_lr = 0.1
            self.hgb_iter = 200
            self.etr_n_estimators = 200
            self.etr_min_samples_leaf = 1
        else:
            # exp_004 approach: Shallow + NO MLP
            self.use_mlp = False
            self.mlp_weight = 0.0
            self.hgb_depth = 7  # Shallow like exp_004
            self.etr_depth = 10  # Shallow like exp_004
            self.hgb_lr = 0.04
            self.hgb_iter = 700
            self.etr_n_estimators = 500
            self.etr_min_samples_leaf = 2

    def _descriptors(self, solvent):
        """Return (acs_pca, spange) vectors for ``solvent``.

        A solvent absent from a lookup table gets an all-zero vector for
        that table rather than raising.
        """
        acs_pca = ACS_PCA_dict.get(solvent, np.zeros(self.ACS_PCA_DIM))
        spange = Spange_dict.get(solvent, np.zeros(self.SPANGE_DIM))
        return acs_pca, spange

    def _single_row_features(self, row):
        """Features for one single-solvent row: [rt, temp, ACS block, Spange block]."""
        acs_pca, spange = self._descriptors(row['SOLVENT NAME'])
        return np.concatenate([
            [row['Residence Time'], row['Temperature']],
            self.ACS_WEIGHT * acs_pca,
            self.SPANGE_WEIGHT * spange,
        ])

    def _full_row_features(self, row):
        """Features for one mixture row, including the Arrhenius-style terms."""
        # NOTE(review): assumes SolventB% is stored as a percentage (0-100);
        # confirm against the CSV, otherwise this division is wrong.
        pct_b = row['SolventB%'] / 100.0

        acs_a, spange_a = self._descriptors(row['SOLVENT A NAME'])
        acs_b, spange_b = self._descriptors(row['SOLVENT B NAME'])

        # Linear interpolation for mixed solvents
        spange_mix = (1 - pct_b) * spange_a + pct_b * spange_b
        acs_mix = (1 - pct_b) * acs_a + pct_b * acs_b

        rt = row['Residence Time']
        temp = row['Temperature']

        # Arrhenius kinetics features (missing in exp_015).
        # Assumes Temperature is in degrees Celsius - TODO confirm.
        temp_k = temp + 273.15
        inv_temp = 1000.0 / temp_k          # 1000 / T[K]
        log_time = np.log(rt + 1e-6)        # ln(t); epsilon guards rt == 0
        interaction = inv_temp * log_time   # ln(t) * 1000 / T[K]

        return np.concatenate([
            [rt, temp, pct_b],
            [inv_temp, log_time, interaction],
            self.ACS_WEIGHT * acs_mix,
            self.SPANGE_WEIGHT * spange_mix,
        ])

    def _get_features(self, X):
        """Build the 2-D feature matrix for ``X``, one row per sample."""
        if self.data == 'single':
            build_row = self._single_row_features
        else:
            build_row = self._full_row_features
        return np.array([build_row(row) for _, row in X.iterrows()])

    def _train_mlp(self, X_scaled, y_np):
        """Fit a SimpleMLP on the scaled features with Adam and MSE loss."""
        if self.seed is not None:
            # Seed weight init, dropout and DataLoader shuffling so repeated
            # runs start from the same state, matching the seeded tree models.
            torch.manual_seed(self.seed)

        input_dim = X_scaled.shape[1]
        self.mlp = SimpleMLP(input_dim, hidden_dims=[128, 64, 32], output_dim=3, dropout=0.2).to(device)
        X_tensor = torch.tensor(X_scaled, dtype=torch.double).to(device)
        y_tensor = torch.tensor(y_np, dtype=torch.double).to(device)
        loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=32, shuffle=True)
        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=1e-3)
        criterion = nn.MSELoss()

        self.mlp.train()
        for epoch in range(100):
            for batch_X, batch_y in loader:
                optimizer.zero_grad()
                loss = criterion(self.mlp(batch_X), batch_y)
                loss.backward()
                optimizer.step()

    def train_model(self, X_train, y_train):
        """Fit the scaler and all sub-models on one fold's training data.

        Args:
            X_train: DataFrame with the input columns of the configured task.
            y_train: DataFrame with three target columns; columns 0-1 go to
                the ExtraTrees model and column 2 to the HGB model.
        """
        X_feat = self._get_features(X_train)
        y_np = y_train.values
        X_scaled = self.scaler.fit_transform(X_feat)

        # 1. Train MLP (only for single solvent)
        if self.use_mlp:
            self._train_mlp(X_scaled, y_np)

        # 2. Train HGB for SM (target 2)
        self.hgb = HistGradientBoostingRegressor(
            max_depth=self.hgb_depth,
            learning_rate=self.hgb_lr,
            max_iter=self.hgb_iter,
            random_state=self.seed
        )
        self.hgb.fit(X_scaled, y_np[:, 2])

        # 3. Train ETR for Products (targets 0, 1)
        self.etr = ExtraTreesRegressor(
            n_estimators=self.etr_n_estimators,
            max_depth=self.etr_depth,
            min_samples_split=2,
            min_samples_leaf=self.etr_min_samples_leaf,
            random_state=self.seed,
            n_jobs=-1
        )
        self.etr.fit(X_scaled, y_np[:, :2])

    def predict(self, X_test):
        """Predict the three targets for ``X_test``.

        Returns:
            torch.Tensor of shape [N, 3] with values clipped to [0, 1],
            columns ordered like the training targets.
        """
        X_feat = self._get_features(X_test)
        X_scaled = self.scaler.transform(X_feat)

        # Tree predictions: ETR gives columns 0-1, HGB gives column 2.
        hgb_pred_sm = self.hgb.predict(X_scaled).reshape(-1, 1)
        etr_pred_products = self.etr.predict(X_scaled)
        tree_pred = np.column_stack([etr_pred_products, hgb_pred_sm])

        if self.use_mlp:
            # MLP prediction
            self.mlp.eval()
            with torch.no_grad():
                X_tensor = torch.tensor(X_scaled, dtype=torch.double).to(device)
                mlp_pred = self.mlp(X_tensor).cpu().numpy()

            # Weighted ensemble
            final_pred = self.mlp_weight * mlp_pred + (1 - self.mlp_weight) * tree_pred
        else:
            # Trees only for full data
            final_pred = tree_pred

        final_pred = np.clip(final_pred, 0, 1)
        return torch.tensor(final_pred)

print("HybridTaskModel defined")

HybridTaskModel defined


In [7]:
# --- QUICK VALIDATION TEST ---
# Smoke test: train and score only the first three folds of each task so a
# broken model is caught before the full cross-validation cells run.
print("Quick test of HybridTaskModel...")

# Test single solvent
X_single, Y_single = load_data("single_solvent")
errors_single = []
single_folds = generate_leave_one_out_splits(X_single, Y_single)
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(single_folds):
    if i >= 3:
        break
    held_out = test_X['SOLVENT NAME'].iloc[0]
    model = HybridTaskModel(data='single')
    model.train_model(train_X, train_Y)
    abs_err = np.abs(model.predict(test_X).numpy() - test_Y.values)
    mae = np.mean(abs_err)
    errors_single.append(mae)
    print(f"Single Fold {i} ({held_out}): MAE = {mae:.4f}")

print(f"\nSingle solvent quick test MAE: {np.mean(errors_single):.4f}")

# Test full data
X_full, Y_full = load_data("full")
errors_full = []
full_folds = generate_leave_one_ramp_out_splits(X_full, Y_full)
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(full_folds):
    if i >= 3:
        break
    model = HybridTaskModel(data='full')
    model.train_model(train_X, train_Y)
    abs_err = np.abs(model.predict(test_X).numpy() - test_Y.values)
    mae = np.mean(abs_err)
    errors_full.append(mae)
    print(f"Full Fold {i}: MAE = {mae:.4f}")

print(f"\nFull data quick test MAE: {np.mean(errors_full):.4f}")

Quick test of HybridTaskModel...


Single Fold 0 (1,1,1,3,3,3-Hexafluoropropan-2-ol): MAE = 0.1666


Single Fold 1 (2,2,2-Trifluoroethanol): MAE = 0.1055


Single Fold 2 (2-Methyltetrahydrofuran [2-MeTHF]): MAE = 0.0404

Single solvent quick test MAE: 0.1042


Full Fold 0: MAE = 0.0764


Full Fold 1: MAE = 0.1203


Full Fold 2: MAE = 0.0973

Full data quick test MAE: 0.0980


In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

# Task 0: single-solvent data, one fold per held-out solvent.
X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

# The split generator has no length, so tqdm shows a running fold count
# rather than a percentage.
for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    # A fresh model is built and trained from scratch for every fold.
    model = HybridTaskModel(data='single') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    # "row" is the position within this fold's test set, not the original
    # DataFrame index.
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:03,  3.23s/it]

2it [00:06,  3.22s/it]

3it [00:09,  3.20s/it]

4it [00:12,  3.15s/it]

5it [00:15,  3.18s/it]

6it [00:19,  3.20s/it]

7it [00:22,  3.21s/it]

8it [00:25,  3.20s/it]

9it [00:28,  3.21s/it]

10it [00:32,  3.22s/it]

11it [00:35,  3.22s/it]

12it [00:38,  3.23s/it]

13it [00:41,  3.22s/it]

14it [00:44,  3.22s/it]

15it [00:48,  3.22s/it]

16it [00:51,  3.23s/it]

17it [00:54,  3.27s/it]

18it [00:57,  3.25s/it]

19it [01:01,  3.24s/it]

20it [01:04,  3.24s/it]

21it [01:07,  3.23s/it]

22it [01:11,  3.29s/it]

23it [01:14,  3.27s/it]

24it [01:17,  3.26s/it]

24it [01:17,  3.23s/it]




In [9]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Task 1: full (solvent-mixture) data, one fold per held-out solvent pair.
X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

# The split generator has no length, so tqdm shows a running fold count
# rather than a percentage.
for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    # A fresh model is built and trained from scratch for every fold.
    model = HybridTaskModel(data='full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    # "row" is the position within this fold's test set, not the original
    # DataFrame index.
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:00,  1.21it/s]

2it [00:01,  1.26it/s]

3it [00:02,  1.26it/s]

4it [00:03,  1.26it/s]

5it [00:03,  1.26it/s]

6it [00:04,  1.25it/s]

7it [00:05,  1.26it/s]

8it [00:06,  1.23it/s]

9it [00:07,  1.24it/s]

10it [00:08,  1.20it/s]

11it [00:08,  1.20it/s]

12it [00:09,  1.21it/s]

13it [00:10,  1.21it/s]

13it [00:10,  1.23it/s]




In [10]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

# Stack the task 0 rows above the task 1 rows; each frame still carries its
# own 0..n-1 index at this point.
submission = pd.concat([submission_single_solvent, submission_full_data])
# reset_index() with the default drop=False moves that per-task index into an
# "index" column and installs a fresh 0..N-1 index.
submission = submission.reset_index()
# The fresh index is written out as the leading "id" column of the CSV.
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################