# Experiment 018: DRFP-Based Ensemble with Prediction Combination

**Key insight from research:**
- Paper arxiv:2512.19530 shows GNN + DRFP achieves MSE 0.0039 on Catechol benchmark
- DRFP (Differential Reaction Fingerprints) captures reaction-level information
- DRFP may generalize better to unseen solvents than the Spange/ACS_PCA descriptors

**Architecture:**
- Build on exp_004's successful dual-model ensemble
- Add DRFP-PCA as THIRD feature set
- Train separate models on each feature set
- Combine PREDICTIONS: 0.35 * drfp_pred + 0.45 * acs_pred + 0.20 * spange_pred

**Expected result:**
- DRFP may capture chemical patterns that generalize better
- Could reduce the 53% CV-LB gap

In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor
from abc import ABC
import tqdm
import warnings
warnings.filterwarnings('ignore')

# Directory holding the yield CSVs and the `*_lookup.csv` descriptor tables.
DATA_PATH = '/home/data'
# Make float64 the default dtype for torch tensors created from here on.
torch.set_default_dtype(torch.double)
print("Setup complete")

Setup complete


In [2]:
# --- UTILITY FUNCTIONS ---
# Numeric process-condition columns that load_data() selects for both datasets.
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Yield columns that load_data() returns as the regression targets, in this order.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Load one catechol yield dataset and split it into inputs and targets.

    Args:
        name: ``"full"`` for the two-solvent (A/B mixture) dataset, or
            ``"single_solvent"`` for the single-solvent dataset.

    Returns:
        A ``(X, Y)`` pair of DataFrames. ``X`` holds residence time,
        temperature and the solvent identification columns of the chosen
        dataset; ``Y`` holds the ``TARGET_LABELS`` columns.

    Raises:
        ValueError: If ``name`` is not one of the two supported dataset names.
    """
    # Explicit raise instead of `assert`: assertions are stripped under
    # `python -O`, which would let an unknown name silently fall through to
    # the single-solvent branch.
    if name == "full":
        csv_name = 'catechol_full_data_yields.csv'
        input_columns = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
    elif name == "single_solvent":
        csv_name = 'catechol_single_solvent_yields.csv'
        input_columns = ["Residence Time", "Temperature", "SOLVENT NAME"]
    else:
        raise ValueError(
            f"Unknown dataset name {name!r}; expected 'full' or 'single_solvent'"
        )

    df = pd.read_csv(f'{DATA_PATH}/{csv_name}')
    X = df[input_columns]
    Y = df[TARGET_LABELS]
    return X, Y

def load_features(name="spange_descriptors"):
    """Read the ``<name>_lookup.csv`` table under DATA_PATH.

    The first CSV column becomes the index of the returned DataFrame.
    """
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    lookup_table = pd.read_csv(lookup_path, index_col=0)
    return lookup_table

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out folds over the "SOLVENT NAME" column.

    Solvents are visited in sorted order. Each fold is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part contains
    exactly the rows of the held-out solvent and the train part all others.
    """
    solvent_column = X["SOLVENT NAME"]
    for held_out in sorted(solvent_column.unique()):
        in_test = solvent_column == held_out
        in_train = ~in_test
        train_part = (X[in_train], Y[in_train])
        test_part = (X[in_test], Y[in_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield folds that hold out one (solvent A, solvent B) pair at a time.

    Pairs are visited in order of first appearance in ``X``. Each fold is
    ``((X_train, Y_train), (X_test, Y_test))``; the test part contains every
    row whose A and B solvent names both match the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    distinct_pairs = X[pair_columns].drop_duplicates()
    for name_a, name_b in distinct_pairs.itertuples(index=False, name=None):
        held_out = (X["SOLVENT A NAME"] == name_a) & (X["SOLVENT B NAME"] == name_b)
        kept = ~held_out
        yield (X[kept], Y[kept]), (X[held_out], Y[held_out])

# Solvent descriptor lookup tables, one row per solvent name.
SPANGE_DF, ACS_PCA_DF, DRFP_DF = (
    load_features(table_name)
    for table_name in ('spange_descriptors', 'acs_pca_descriptors', 'drfps_catechol')
)

print(f"Spange: {SPANGE_DF.shape}, ACS_PCA: {ACS_PCA_DF.shape}, DRFP: {DRFP_DF.shape}")

# Compress the DRFP table to 20 principal components. In the recorded run this
# went from 2048 columns to 20 and kept 99.73% of the variance.
DRFP_PCA = PCA(n_components=20)
DRFP_PCA_VALUES = DRFP_PCA.fit_transform(DRFP_DF)
DRFP_PCA_DF = pd.DataFrame(
    DRFP_PCA_VALUES,
    index=DRFP_DF.index,
    columns=[f'drfp_pca_{i}' for i in range(DRFP_PCA_VALUES.shape[1])],
)
print(f"DRFP-PCA: {DRFP_PCA_DF.shape}, variance explained: {DRFP_PCA.explained_variance_ratio_.sum():.2%}")

Spange: (26, 13), ACS_PCA: (24, 5), DRFP: (24, 2048)
DRFP-PCA: (24, 20), variance explained: 99.73%


In [3]:
# --- BASE CLASSES ---
class SmilesFeaturizer(ABC):
    """Interface placeholder for featurizers.

    Both the constructor and ``featurize`` raise NotImplementedError, so a
    subclass has to override both before it can be instantiated and used.
    """

    def __init__(self):
        raise NotImplementedError

    def featurize(self, X):
        """Turn ``X`` into model features; must be overridden."""
        raise NotImplementedError

class BaseModel(ABC):
    """Minimal model interface used by the evaluation cells.

    Subclasses override ``train_model`` and ``predict``; the base versions
    only raise NotImplementedError.
    """

    def __init__(self):
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs and targets; must be overridden."""
        raise NotImplementedError

    def predict(self, X=None):
        """Return predictions for ``X``; must be overridden.

        ``X`` is accepted (defaulting to None so the old zero-argument call
        still works) so the base signature matches how models are actually
        invoked, i.e. ``model.predict(test_X)``. Without it, calling an
        un-overridden ``predict`` with data raised TypeError instead of the
        intended NotImplementedError.
        """
        raise NotImplementedError

In [4]:
# --- DRFP-ENHANCED ENSEMBLE MODEL ---
class DRFPEnsembleModel(BaseModel):
    """Blend of per-target regressors trained on three solvent descriptor sets.

    Design (extends exp_004):
    - One regressor per (target, descriptor set) pair; the descriptor sets are
      Spange, ACS_PCA and DRFP-PCA.
    - Final output is a weighted average of PREDICTIONS:
      0.35 * drfp_pred + 0.45 * acs_pred + 0.20 * spange_pred
    - 'SM' uses HistGradientBoosting (depth=7, iter=700, lr=0.04).
    - The product targets use ExtraTrees (n_estimators=500, depth=10,
      min_samples_leaf=2).
    - Arrhenius-style kinetic inputs (inverse temperature, log residence time,
      and their product) are prepended to every descriptor set.
    - No test-time augmentation.

    Working hypothesis: DRFP encodes reaction-level patterns that may transfer
    better to held-out solvents.
    """

    def __init__(self, data='single'):
        # Anything other than 'full' is handled as the single-solvent layout.
        self.data_type = data
        self.mixed = (data == 'full')
        self.targets = ['Product 2', 'Product 3', 'SM']

        # Descriptor lookup tables (module-level DataFrames indexed by solvent).
        self.spange, self.acs_pca, self.drfp_pca = SPANGE_DF, ACS_PCA_DF, DRFP_PCA_DF

        # One independent scaler per descriptor set.
        self.scaler_spange = StandardScaler()
        self.scaler_acs = StandardScaler()
        self.scaler_drfp = StandardScaler()

        # Filled by train_model: {target: {'spange' | 'acs' | 'drfp': fitted model}}
        self.models = {}

        # Blend weights applied to the per-set predictions in predict().
        self.drfp_weight = 0.35
        self.acs_weight = 0.45
        self.spange_weight = 0.20

    def _build_features(self, X, feature_df):
        """Assemble the design matrix for one descriptor table.

        Column layout: residence time, temperature, 1000 / (temperature + 273.15),
        log(residence time + 1e-6), the product of the last two, then (mixture
        data only) the solvent-B share, followed by the solvent descriptors.
        For mixtures the descriptors of A and B are interpolated linearly with
        the 'SolventB%' value.
        """
        # NOTE(review): the 273.15 offset assumes Temperature is in deg C, and
        # the interpolation assumes 'SolventB%' is a 0-1 fraction - confirm.
        residence = X['Residence Time'].values.astype(np.float64).reshape(-1, 1)
        temperature = X['Temperature'].values.astype(np.float64).reshape(-1, 1)

        kelvin = temperature + 273.15
        reciprocal_temp = 1000.0 / kelvin
        ln_residence = np.log(residence + 1e-6)  # offset guards against log(0)
        cross_term = reciprocal_temp * ln_residence

        blocks = [residence, temperature, reciprocal_temp, ln_residence, cross_term]

        if not self.mixed:
            blocks.append(feature_df.loc[X['SOLVENT NAME']].values)
            return np.hstack(blocks)

        share_b = X['SolventB%'].values.reshape(-1, 1)
        desc_a = feature_df.loc[X['SOLVENT A NAME']].values
        desc_b = feature_df.loc[X['SOLVENT B NAME']].values
        blocks.append(share_b)
        blocks.append(desc_a * (1 - share_b) + desc_b * share_b)
        return np.hstack(blocks)

    def _make_regressor(self, target):
        """Return a fresh, unfitted regressor of the type used for ``target``."""
        if target == 'SM':
            return HistGradientBoostingRegressor(
                max_depth=7, max_iter=700, learning_rate=0.04, random_state=42
            )
        return ExtraTreesRegressor(
            n_estimators=500, max_depth=10, min_samples_leaf=2,
            random_state=42, n_jobs=-1
        )

    def train_model(self, X_train, y_train):
        """Fit the scalers and one regressor per (target, descriptor set).

        Column ``i`` of ``y_train`` is taken as the values of
        ``self.targets[i]``.
        """
        raw_spange = self._build_features(X_train, self.spange)
        raw_acs = self._build_features(X_train, self.acs_pca)
        raw_drfp = self._build_features(X_train, self.drfp_pca)

        # Keys double as the keys of the per-target model dictionaries.
        designs = {
            'spange': self.scaler_spange.fit_transform(raw_spange),
            'acs': self.scaler_acs.fit_transform(raw_acs),
            'drfp': self.scaler_drfp.fit_transform(raw_drfp),
        }

        labels = y_train.values

        for column, target in enumerate(self.targets):
            target_values = labels[:, column]
            fitted = {}
            for set_name, design in designs.items():
                regressor = self._make_regressor(target)
                regressor.fit(design, target_values)
                fitted[set_name] = regressor
            self.models[target] = fitted

    def predict(self, X):
        """Return the blended predictions as a double tensor clipped to [0, 1].

        The result has one column per entry of ``self.targets``, in that order.
        """
        raw_spange = self._build_features(X, self.spange)
        raw_acs = self._build_features(X, self.acs_pca)
        raw_drfp = self._build_features(X, self.drfp_pca)

        designs = {
            'spange': self.scaler_spange.transform(raw_spange),
            'acs': self.scaler_acs.transform(raw_acs),
            'drfp': self.scaler_drfp.transform(raw_drfp),
        }

        blended_columns = []
        for target in self.targets:
            per_set = {
                set_name: self.models[target][set_name].predict(design)
                for set_name, design in designs.items()
            }
            # Weighted average of the three per-set predictions.
            blend = (self.drfp_weight * per_set['drfp'] +
                     self.acs_weight * per_set['acs'] +
                     self.spange_weight * per_set['spange'])
            blended_columns.append(blend)

        stacked = np.column_stack(blended_columns)
        return torch.tensor(np.clip(stacked, 0, 1), dtype=torch.double)

print("DRFPEnsembleModel defined")

DRFPEnsembleModel defined


In [5]:
# --- QUICK VALIDATION TEST ---
# Smoke test: evaluate only the first three folds of each split scheme.
print("Testing DRFPEnsembleModel...")
X_test, Y_test = load_data("single_solvent")

errors = []
single_splits = generate_leave_one_out_splits(X_test, Y_test)
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(single_splits):
    if i > 2:
        break
    model = DRFPEnsembleModel(data='single')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    # Mean absolute error over all rows and all three targets of the fold.
    mae = np.abs(preds - test_Y.values).mean()
    errors.append(mae)
    solvent = test_X['SOLVENT NAME'].iloc[0]
    print(f"Single Fold {i} ({solvent}): MAE = {mae:.4f}")

print(f"\nSingle solvent quick test MAE: {np.mean(errors):.4f}")

# Same check on the solvent-mixture data, holding out one A/B pair per fold.
print("\nTesting on full data...")
X_full, Y_full = load_data("full")

errors_full = []
ramp_splits = generate_leave_one_ramp_out_splits(X_full, Y_full)
for i, ((train_X, train_Y), (test_X, test_Y)) in enumerate(ramp_splits):
    if i > 2:
        break
    model = DRFPEnsembleModel(data='full')
    model.train_model(train_X, train_Y)
    preds = model.predict(test_X).numpy()
    mae = np.abs(preds - test_Y.values).mean()
    errors_full.append(mae)
    print(f"Full Fold {i}: MAE = {mae:.4f}")

print(f"\nFull data quick test MAE: {np.mean(errors_full):.4f}")

Testing DRFPEnsembleModel...


Single Fold 0 (1,1,1,3,3,3-Hexafluoropropan-2-ol): MAE = 0.1532


Single Fold 1 (2,2,2-Trifluoroethanol): MAE = 0.1015


Single Fold 2 (2-Methyltetrahydrofuran [2-MeTHF]): MAE = 0.0375

Single solvent quick test MAE: 0.0974

Testing on full data...


Full Fold 0: MAE = 0.0673


Full Fold 1: MAE = 0.0963


Full Fold 2: MAE = 0.0618

Full data quick test MAE: 0.0752


In [6]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = DRFPEnsembleModel(data='single') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:02,  2.87s/it]

2it [00:05,  2.92s/it]

3it [00:08,  2.94s/it]

4it [00:11,  2.95s/it]

5it [00:14,  2.96s/it]

6it [00:17,  2.94s/it]

7it [00:20,  2.96s/it]

8it [00:23,  2.93s/it]

9it [00:26,  2.89s/it]

10it [00:29,  2.90s/it]

11it [00:32,  2.91s/it]

12it [00:35,  2.95s/it]

13it [00:38,  2.94s/it]

14it [00:41,  2.99s/it]

15it [00:44,  2.97s/it]

16it [00:47,  2.96s/it]

17it [00:50,  2.97s/it]

18it [00:53,  2.98s/it]

19it [00:55,  2.96s/it]

20it [00:58,  2.98s/it]

21it [01:01,  2.94s/it]

22it [01:04,  2.93s/it]

23it [01:07,  2.92s/it]

24it [01:10,  2.93s/it]

24it [01:10,  2.94s/it]




In [7]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = DRFPEnsembleModel(data = 'full') # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

0it [00:00, ?it/s]

1it [00:03,  3.23s/it]

2it [00:06,  3.14s/it]

3it [00:09,  3.14s/it]

4it [00:12,  3.09s/it]

5it [00:15,  3.13s/it]

6it [00:18,  3.15s/it]

7it [00:21,  3.12s/it]

8it [00:25,  3.13s/it]

9it [00:28,  3.14s/it]

10it [00:31,  3.16s/it]

11it [00:34,  3.15s/it]

12it [00:37,  3.15s/it]

13it [00:40,  3.12s/it]

13it [00:40,  3.14s/it]




In [8]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################