# Experiment 065: Add Fragprints to Best Ensemble

This experiment adds Fragprints features to our best GP + MLP + LGBM ensemble and compares its cross-validation error against the same ensemble without them.

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, Matern
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Single seed shared by NumPy and PyTorch so both RNG streams are initialised
# from the same, easily tunable value.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Directory that holds the descriptor lookup tables and yield CSV files.
DATA_PATH = '/home/data'
print('Libraries loaded')

Libraries loaded


In [2]:
# Descriptor lookup tables; the first CSV column is used as the row index.
SPANGE_DF, DRFPS_DF, ACS_PCA_DF, FRAGPRINTS_DF = (
    pd.read_csv(lookup_path, index_col=0)
    for lookup_path in (
        f'{DATA_PATH}/spange_descriptors_lookup.csv',
        f'{DATA_PATH}/drfps_catechol_lookup.csv',
        f'{DATA_PATH}/acs_pca_descriptors_lookup.csv',
        f'{DATA_PATH}/fragprints_lookup.csv',
    )
)

# Yield tables, read with the default integer index.
SINGLE_SOLVENT_DF, FULL_DATA_DF = (
    pd.read_csv(yield_path)
    for yield_path in (
        f'{DATA_PATH}/catechol_single_solvent_yields.csv',
        f'{DATA_PATH}/catechol_full_data_yields.csv',
    )
)

# Report the (rows, columns) shape of every descriptor table.
print(f'Spange: {SPANGE_DF.shape}')
print(f'DRFPS: {DRFPS_DF.shape}')
print(f'ACS PCA: {ACS_PCA_DF.shape}')
print(f'Fragprints: {FRAGPRINTS_DF.shape}')

Spange: (26, 13)
DRFPS: (24, 2048)
ACS PCA: (24, 5)
Fragprints: (24, 2133)


In [3]:
# Fragprints: keep only the columns whose variance across rows is strictly positive.
frag_variance = FRAGPRINTS_DF.var()
frag_nonzero = [column for column, variance in frag_variance.items() if variance > 0]
FRAGPRINTS_FILTERED = FRAGPRINTS_DF[frag_nonzero]
print(f'Fragprints after filtering: {FRAGPRINTS_FILTERED.shape}')

# DRFP: same strictly-positive-variance filter.
drfp_variance = DRFPS_DF.var()
drfp_nonzero = [column for column, variance in drfp_variance.items() if variance > 0]
DRFPS_FILTERED = DRFPS_DF[drfp_nonzero]
print(f'DRFP after filtering: {DRFPS_FILTERED.shape}')

Fragprints after filtering: (24, 144)
DRFP after filtering: (24, 122)


In [4]:
# MLP Model
class MLPModel(nn.Module):
    """Feed-forward network with two hidden layers and a sigmoid output.

    Layer widths are ``input_dim -> hidden_dim -> hidden_dim // 2 -> output_dim``.
    Each hidden layer is followed by ReLU and dropout; the final sigmoid keeps
    every output strictly between 0 and 1.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the first hidden layer (the second is half of it).
        output_dim: Number of predicted values per sample.
        dropout: Dropout probability applied after each hidden activation.
    """

    def __init__(self, input_dim, hidden_dim=128, output_dim=3, dropout=0.1):
        super().__init__()
        narrow_dim = hidden_dim // 2
        # Layers are created in forward order so parameter initialisation
        # consumes the RNG in the same sequence as the stacked definition.
        stack = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, narrow_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(narrow_dim, output_dim),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid outputs."""
        outputs = self.net(x)
        return outputs

class MLPWrapper:
    """Scikit-learn-style fit/predict wrapper around ``MLPModel``.

    Inputs are standardised with a ``StandardScaler`` fitted on the training
    data. Training is full-batch: every epoch runs one Adam step on the L1
    loss over the whole training set.

    Args:
        input_dim: Number of input features passed to ``MLPModel``.
        hidden_dim: Hidden width passed to ``MLPModel``.
        lr: Adam learning rate.
        epochs: Number of full-batch optimisation steps.
    """

    def __init__(self, input_dim, hidden_dim=128, lr=0.001, epochs=200):
        self.targets = ['Product 2', 'Product 3', 'SM']
        self.input_dim, self.hidden_dim = input_dim, hidden_dim
        self.lr, self.epochs = lr, epochs
        self.scaler = StandardScaler()
        self.model = None  # created in fit()

    def fit(self, X, Y):
        """Fit the scaler on ``X`` and train a fresh network on ``Y[self.targets]``."""
        inputs = torch.FloatTensor(self.scaler.fit_transform(X))
        labels = torch.FloatTensor(Y[self.targets].values)

        # A new network is built on every call, so refitting starts from scratch.
        self.model = MLPModel(self.input_dim, self.hidden_dim)
        adam = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        l1_loss = nn.L1Loss()

        self.model.train()
        for _epoch in range(self.epochs):
            adam.zero_grad()
            batch_loss = l1_loss(self.model(inputs), labels)
            batch_loss.backward()
            adam.step()

    def predict(self, X):
        """Return ``{target name: 1-D prediction array}`` for the rows of ``X``."""
        inputs = torch.FloatTensor(self.scaler.transform(X))
        self.model.eval()  # disables dropout for inference
        with torch.no_grad():
            outputs = self.model(inputs).numpy()

        by_target = {}
        for column, name in enumerate(self.targets):
            by_target[name] = outputs[:, column]
        return by_target

print('MLP defined')

MLP defined


In [5]:
# GP Model
class GPWrapper:
    """One independent Gaussian-process regressor per target column.

    Inputs are standardised with a ``StandardScaler`` fitted on the training
    data. Every target gets its own ``GaussianProcessRegressor`` using a
    ``Matern(nu=2.5) + WhiteKernel(noise_level=0.1)`` kernel.
    """

    def __init__(self):
        self.targets = ['Product 2', 'Product 3', 'SM']
        self.scaler = StandardScaler()
        self.models = {}  # target name -> fitted regressor

    def fit(self, X, Y):
        """Fit the scaler on ``X`` and one GP per target on ``Y[target]``."""
        X_std = self.scaler.fit_transform(X)
        for name in self.targets:
            # A fresh kernel object per target keeps the fits independent.
            gp = self.models[name] = GaussianProcessRegressor(
                kernel=Matern(nu=2.5) + WhiteKernel(noise_level=0.1),
                alpha=1e-6,
                normalize_y=True,
                random_state=42,
            )
            gp.fit(X_std, Y[name].values)

    def predict(self, X):
        """Return ``{target name: predictions}`` for the rows of ``X``."""
        X_std = self.scaler.transform(X)
        by_target = {}
        for name in self.targets:
            by_target[name] = self.models[name].predict(X_std)
        return by_target

print('GP defined')

GP defined


In [6]:
# LGBM Model
class LGBMWrapper:
    """One independent LightGBM regressor per target column.

    Features are passed to LightGBM as given (no scaling). Every target uses
    the same hyperparameters and the MAE objective.
    """

    def __init__(self):
        self.targets = ['Product 2', 'Product 3', 'SM']
        self.models = {}  # target name -> fitted regressor

    def fit(self, X, Y):
        """Train one regressor per target on ``X`` and ``Y[target]``."""
        for name in self.targets:
            hyperparams = dict(
                n_estimators=100,
                learning_rate=0.05,
                max_depth=5,
                num_leaves=31,
                objective='mae',
                verbose=-1,
                random_state=42,
            )
            regressor = self.models[name] = lgb.LGBMRegressor(**hyperparams)
            regressor.fit(X, Y[name].values)

    def predict(self, X):
        """Return ``{target name: predictions}`` for the rows of ``X``."""
        by_target = {}
        for name in self.targets:
            by_target[name] = self.models[name].predict(X)
        return by_target

print('LGBM defined')

LGBM defined


In [7]:
# Ensemble Model
class EnsembleModel:
    """Fixed-weight blend of the GP, MLP and LightGBM wrappers.

    Args:
        input_dim: Number of input features; forwarded to ``MLPWrapper``.
        gp_weight: Multiplier applied to the GP predictions.
        mlp_weight: Multiplier applied to the MLP predictions.
        lgbm_weight: Multiplier applied to the LightGBM predictions.

    The weights are used as given; they are not normalised.
    """

    def __init__(self, input_dim, gp_weight=0.15, mlp_weight=0.55, lgbm_weight=0.30):
        self.targets = ['Product 2', 'Product 3', 'SM']
        self.input_dim = input_dim
        self.gp_weight, self.mlp_weight, self.lgbm_weight = gp_weight, mlp_weight, lgbm_weight
        self.gp = GPWrapper()
        self.mlp = MLPWrapper(input_dim)
        self.lgbm = LGBMWrapper()

    def fit(self, X, Y):
        """Fit the three member models on the same data, GP first, LightGBM last."""
        for member in (self.gp, self.mlp, self.lgbm):
            member.fit(X, Y)

    def predict(self, X):
        """Return ``{target name: weighted sum of the member predictions}``."""
        from_gp = self.gp.predict(X)
        from_mlp = self.mlp.predict(X)
        from_lgbm = self.lgbm.predict(X)

        return {
            name: (self.gp_weight * from_gp[name] +
                   self.mlp_weight * from_mlp[name] +
                   self.lgbm_weight * from_lgbm[name])
            for name in self.targets
        }

print('Ensemble defined')

Ensemble defined


In [8]:
# Build feature table with Fragprints
def build_feature_table(solvents, include_fragprints=True):
    """Assemble one feature row per solvent from the module-level lookup tables.

    Columns are concatenated in the order Spange, ACS PCA, filtered DRFP and,
    when requested, filtered Fragprints.

    Args:
        solvents: Index labels to look up in every descriptor table; the rows
            of the result follow this order.
        include_fragprints: Append the filtered Fragprints columns when True.

    Returns:
        DataFrame indexed by ``solvents``. Its shape is printed as a side effect.
    """
    sources = [SPANGE_DF, ACS_PCA_DF, DRFPS_FILTERED]
    if include_fragprints:
        sources.append(FRAGPRINTS_FILTERED)

    # Label-based selection returns new frames, so the lookups stay untouched.
    combined = pd.concat([table.loc[solvents] for table in sources], axis=1)

    print(f'Feature table shape: {combined.shape}')
    return combined

print('Feature table function defined')

Feature table function defined


In [9]:
# Leave-One-Solvent-Out CV
def run_loso_cv(data_df, include_fragprints=True):
    """Leave-one-solvent-out cross-validation of the ensemble.

    Each unique value of ``data_df['SOLVENT NAME']`` is held out once; a fresh
    ``EnsembleModel`` is trained on the remaining rows and scored on the
    held-out rows. The only model inputs are the per-solvent features from
    ``build_feature_table``.

    Args:
        data_df: DataFrame with a 'SOLVENT NAME' column and the three target
            columns 'Product 2', 'Product 3' and 'SM'.
        include_fragprints: Forwarded to ``build_feature_table``.

    Returns:
        Tuple ``(overall_mae, fold_std, fold_errors, input_dim)`` where
        ``overall_mae`` is the mean absolute error pooled over every held-out
        row and target, ``fold_errors`` holds one target-averaged MAE per fold,
        ``fold_std`` is their standard deviation and ``input_dim`` is the
        number of feature columns.
    """
    target_cols = ['Product 2', 'Product 3', 'SM']
    solvent_col = data_df['SOLVENT NAME']
    solvent_names = solvent_col.unique()

    features = build_feature_table(solvent_names, include_fragprints)
    n_features = features.shape[1]

    pooled_abs_errors = []
    per_fold_mae = []

    for held_out in solvent_names:
        is_held_out = solvent_col == held_out
        fit_rows, eval_rows = data_df[~is_held_out], data_df[is_held_out]

        # Expand the per-solvent features to one row per observation.
        X_fit = features.loc[fit_rows['SOLVENT NAME'].values].values
        X_eval = features.loc[eval_rows['SOLVENT NAME'].values].values
        y_fit, y_eval = fit_rows[target_cols], eval_rows[target_cols]

        ensemble = EnsembleModel(n_features)
        ensemble.fit(X_fit, y_fit)
        predicted = ensemble.predict(X_eval)

        target_maes = []
        for col in target_cols:
            target_maes.append(mean_absolute_error(y_eval[col], predicted[col]))
            pooled_abs_errors.extend(np.abs(y_eval[col].values - predicted[col]))

        per_fold_mae.append(np.mean(target_maes))

    return np.mean(pooled_abs_errors), np.std(per_fold_mae), per_fold_mae, n_features

print('LOSO CV function defined')

LOSO CV function defined


In [10]:
# Baseline: leave-one-solvent-out CV on the feature table without Fragprints.
print('Testing WITHOUT Fragprints (baseline)...')
print('='*60)

(cv_mae_base,
 cv_std_base,
 fold_errors_base,
 input_dim_base) = run_loso_cv(data_df=SINGLE_SOLVENT_DF, include_fragprints=False)

# Pooled MAE, followed by the spread of the per-fold MAEs.
print(f'\nBaseline CV MAE: {cv_mae_base:.6f} +/- {cv_std_base:.6f}')

Testing WITHOUT Fragprints (baseline)...
Feature table shape: (24, 140)



Baseline CV MAE: 0.170319 +/- 0.076907


In [None]:
# Same leave-one-solvent-out protocol, now with the Fragprints columns included.
print('\nTesting WITH Fragprints...')
print('='*60)

(cv_mae_frag,
 cv_std_frag,
 fold_errors_frag,
 input_dim_frag) = run_loso_cv(data_df=SINGLE_SOLVENT_DF, include_fragprints=True)

print(f'\nWith Fragprints CV MAE: {cv_mae_frag:.6f} +/- {cv_std_frag:.6f}')
# Relative change versus the no-Fragprints baseline, in percent (positive = lower MAE).
print(f'\nImprovement: {(cv_mae_base - cv_mae_frag) / cv_mae_base * 100:.2f}%')

In [None]:
# Full data CV
def get_mixture_features(row, feature_table):
    """Blend the feature vectors of a row's two solvents by the solvent-B share.

    Args:
        row: Mapping-like record with 'SOLVENT A NAME', 'SOLVENT B NAME' and
            'SolventB%' entries.
        feature_table: DataFrame indexed by solvent name.

    Returns:
        1-D array ``(1 - b) * features[A] + b * features[B]`` where
        ``b = row['SolventB%'] / 100``.
    """
    name_a, name_b = row['SOLVENT A NAME'], row['SOLVENT B NAME']

    # NOTE(review): dividing by 100 assumes 'SolventB%' is stored on a 0-100
    # scale. If the column already holds a 0-1 fraction, solvent B is
    # under-weighted by a factor of 100 - confirm against the data file.
    share_b = row['SolventB%'] / 100.0
    share_a = 1.0 - share_b

    vector_a = feature_table.loc[name_a].values
    vector_b = feature_table.loc[name_b].values
    blended = share_a * vector_a + share_b * vector_b
    return blended

def run_loro_cv(data_df, include_fragprints=True):
    """Leave-one-ramp-out cross-validation of the ensemble on solvent mixtures.

    Each unique value of ``data_df['RAMP NUM']`` is held out once; a fresh
    ``EnsembleModel`` is trained on the remaining rows and scored on the
    held-out rows. Row features come from ``get_mixture_features``.

    Args:
        data_df: DataFrame with 'SOLVENT A NAME', 'SOLVENT B NAME',
            'SolventB%', 'RAMP NUM' and the target columns 'Product 2',
            'Product 3' and 'SM'.
        include_fragprints: Forwarded to ``build_feature_table``.

    Returns:
        Tuple ``(overall_mae, fold_std, fold_errors)`` where ``overall_mae`` is
        the mean absolute error pooled over every held-out row and target,
        ``fold_errors`` holds one target-averaged MAE per fold and ``fold_std``
        is their standard deviation.
    """
    targets = ['Product 2', 'Product 3', 'SM']

    # Every solvent that appears in either mixture slot needs a feature row.
    solvents_a = data_df['SOLVENT A NAME'].unique()
    solvents_b = data_df['SOLVENT B NAME'].unique()
    all_solvents = list(set(solvents_a) | set(solvents_b))

    feature_table = build_feature_table(all_solvents, include_fragprints)
    input_dim = feature_table.shape[1]

    # A row's mixture features do not depend on the fold, so build them once
    # for the whole frame and slice per fold instead of re-running iterrows()
    # over the data for every held-out ramp.
    X_all = np.array([get_mixture_features(row, feature_table) for _, row in data_df.iterrows()])
    ramp_ids = data_df['RAMP NUM']

    all_errors = []
    fold_errors = []

    for test_ramp in ramp_ids.unique():
        # Positional boolean masks: usable on both X_all and data_df whatever
        # the frame's index looks like.
        train_mask = (ramp_ids != test_ramp).to_numpy()
        test_mask = (ramp_ids == test_ramp).to_numpy()

        X_train, X_test = X_all[train_mask], X_all[test_mask]
        Y_train = data_df.loc[train_mask, targets]
        Y_test = data_df.loc[test_mask, targets]

        model = EnsembleModel(input_dim)
        model.fit(X_train, Y_train)
        preds = model.predict(X_test)

        fold_mae = []
        for target in targets:
            fold_mae.append(mean_absolute_error(Y_test[target], preds[target]))
            all_errors.extend(np.abs(Y_test[target].values - preds[target]))

        fold_errors.append(np.mean(fold_mae))

    return np.mean(all_errors), np.std(fold_errors), fold_errors

print('LORO CV function defined')

In [None]:
# Leave-one-ramp-out CV on the full dataset, with the Fragprints columns included.
print('\nRunning Full Data CV with Fragprints...')
print('='*60)

(full_cv_mae,
 full_cv_std,
 full_fold_errors) = run_loro_cv(data_df=FULL_DATA_DF, include_fragprints=True)

# Pooled MAE, followed by the spread of the per-fold MAEs.
print(f'\nFull Data CV MAE: {full_cv_mae:.6f} +/- {full_cv_std:.6f}')

In [None]:
# Combined CV score

# Reference score that this experiment is compared against. It is kept in one
# place so the printed value and the improvement calculation cannot drift apart.
# NOTE(review): the no-Fragprints LOSO baseline recorded earlier in this
# notebook printed 0.170319, so this reference presumably comes from a
# different experiment or feature set - confirm the two are comparable.
BEST_BASELINE_CV = 0.008194

n_single = len(SINGLE_SOLVENT_DF)
n_full = len(FULL_DATA_DF)

# Average of the two CV MAEs, weighted by the number of rows in each dataset.
weighted_cv = (cv_mae_frag * n_single + full_cv_mae * n_full) / (n_single + n_full)

print('\n' + '='*60)
print('FINAL RESULTS')
print('='*60)
print(f'Single Solvent CV MAE: {cv_mae_frag:.6f} +/- {cv_std_frag:.6f}')
print(f'Full Data CV MAE: {full_cv_mae:.6f} +/- {full_cv_std:.6f}')
print(f'Weighted Combined CV MAE: {weighted_cv:.6f}')
print(f'\nBest baseline CV: {BEST_BASELINE_CV:.6f}')
# Relative change versus the reference score, in percent (positive = lower MAE).
print(f'Improvement: {(BEST_BASELINE_CV - weighted_cv) / BEST_BASELINE_CV * 100:.2f}%')