# Experiment 065: Add Fragprints to Best Ensemble

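This experiment adds Fragprints features to our best GP + MLP + LGBM ensemble and compares leave-one-solvent-out CV MAE on the single-solvent data with and without them.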
Best baseline: CV 0.008194 (exp_032)

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel
import lightgbm as lgb
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Directory that holds the lookup tables and yield CSVs read below.
DATA_PATH = '/home/data'

# Seed the torch and NumPy global generators so reruns start from the same state.
torch.manual_seed(42)
np.random.seed(42)

print('Libraries loaded')

In [None]:
# Per-solvent descriptor lookup tables. The first CSV column becomes the row
# index, which the feature builders query with .loc[<solvent name>].
SPANGE_DF = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
DRFPS_DF = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
ACS_PCA_DF = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)
FRAGPRINTS_DF = pd.read_csv(f'{DATA_PATH}/fragprints_lookup.csv', index_col=0)

# Experimental yield tables (default integer index).
SINGLE_SOLVENT_DF = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
FULL_DATA_DF = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')

# DRFP: keep only columns whose variance over the table rows is strictly
# positive. Constant columns (and columns with NaN variance) are dropped.
drfp_variance = DRFPS_DF.var()
drfp_nonzero = drfp_variance.index[(drfp_variance > 0).to_numpy()].tolist()
DRFPS_FILTERED = DRFPS_DF.loc[:, drfp_nonzero]

# Fragprints: same strictly-positive-variance column filter.
frag_variance = FRAGPRINTS_DF.var()
frag_nonzero = frag_variance.index[(frag_variance > 0).to_numpy()].tolist()
FRAGPRINTS_FILTERED = FRAGPRINTS_DF.loc[:, frag_nonzero]

# Report the (rows, columns) shape of every table used for features.
print(f'Spange: {SPANGE_DF.shape}')
print(f'DRFP filtered: {DRFPS_FILTERED.shape}')
print(f'ACS PCA: {ACS_PCA_DF.shape}')
print(f'Fragprints filtered: {FRAGPRINTS_FILTERED.shape}')

In [None]:
# Feature extraction function with Arrhenius kinetics
def get_features(row, spange_df, drfp_df, acs_df, frag_df=None, include_fragprints=False):
    """Build the flat feature vector for one single-solvent experiment row.

    Args:
        row: Mapping/Series providing 'SOLVENT NAME', 'Temperature' and
            'Residence Time'.
        spange_df, drfp_df, acs_df: Descriptor tables indexed by solvent name.
        frag_df: Optional fragprints table indexed by solvent name.
        include_fragprints: Append the fragprints row when True *and*
            ``frag_df`` is given; otherwise fragprints are left out.

    Returns:
        1-D array laid out as [spange, drfp, acs, 5 kinetics terms] followed,
        when enabled, by the fragprints row.

    Raises:
        KeyError: If the solvent is missing from one of the tables used.
    """
    name = row['SOLVENT NAME']

    # Descriptor lookups, in the order they appear in the output vector.
    parts = [table.loc[name].values for table in (spange_df, drfp_df, acs_df)]

    temp_c = row['Temperature']
    res_time = row['Residence Time']
    temp_k = temp_c + 273.15  # offset by 273.15 (temperature treated as deg C -> K)
    log_time = np.log(res_time + 1e-6)  # epsilon keeps log finite at t == 0

    # Arrhenius-style kinetics terms.
    parts.append(np.array([
        1.0 / temp_k,        # 1/T
        log_time,            # ln(t)
        log_time / temp_k,   # ln(t)/T interaction
        temp_c / 200.0,      # temperature scaled by 200
        res_time / 10.0,     # residence time scaled by 10
    ]))

    if include_fragprints and frag_df is not None:
        parts.append(frag_df.loc[name].values)

    return np.concatenate(parts)

def get_mixture_features(row, spange_df, drfp_df, acs_df, frag_df=None, include_fragprints=False):
    """Build the flat feature vector for one two-solvent mixture row.

    Every descriptor table contributes the linear blend
    ``(1 - b) * table[A] + b * table[B]`` where ``b = SolventB% / 100``.

    Args:
        row: Mapping/Series providing 'SOLVENT A NAME', 'SOLVENT B NAME',
            'SolventB%', 'Temperature' and 'Residence Time'.
        spange_df, drfp_df, acs_df: Descriptor tables indexed by solvent name.
        frag_df: Optional fragprints table indexed by solvent name.
        include_fragprints: Append the blended fragprints when True *and*
            ``frag_df`` is given; otherwise fragprints are left out.

    Returns:
        1-D array laid out as [spange, drfp, acs, 5 kinetics terms] followed,
        when enabled, by the blended fragprints.

    Raises:
        KeyError: If either solvent is missing from one of the tables used.
    """
    first = row['SOLVENT A NAME']
    second = row['SOLVENT B NAME']
    frac_b = row['SolventB%'] / 100.0
    frac_a = 1.0 - frac_b

    def blend(table):
        # Ratio-weighted average of the two solvents' descriptor rows.
        values_a = table.loc[first].values
        values_b = table.loc[second].values
        return frac_a * values_a + frac_b * values_b

    pieces = [blend(spange_df), blend(drfp_df), blend(acs_df)]

    temp_c = row['Temperature']
    res_time = row['Residence Time']
    temp_k = temp_c + 273.15  # offset by 273.15 (temperature treated as deg C -> K)
    log_time = np.log(res_time + 1e-6)  # epsilon keeps log finite at t == 0

    # Arrhenius-style kinetics terms: 1/T, ln(t), ln(t)/T, T/200, t/10.
    pieces.append(np.array([
        1.0 / temp_k,
        log_time,
        log_time / temp_k,
        temp_c / 200.0,
        res_time / 10.0,
    ]))

    if include_fragprints and frag_df is not None:
        pieces.append(blend(frag_df))

    return np.concatenate(pieces)


print('Feature extraction functions defined')

In [None]:
# MLP Model
class MLPModel(nn.Module):
    """Two-hidden-layer MLP with batch norm, dropout and a sigmoid output.

    Layer stack (stored in ``self.net``):
    BatchNorm(input) -> [Linear -> BatchNorm -> ReLU -> Dropout] for widths
    ``hidden_dim`` and ``hidden_dim // 2`` -> Linear(output_dim) -> Sigmoid.
    The sigmoid bounds every output to the interval [0, 1].
    """

    def __init__(self, input_dim, hidden_dim=64, output_dim=3, dropout=0.1):
        super().__init__()

        # Input normalisation comes first, then the hidden blocks in order.
        stack = [nn.BatchNorm1d(input_dim)]
        width_in = input_dim
        for width_out in (hidden_dim, hidden_dim // 2):
            stack.extend([
                nn.Linear(width_in, width_out),
                nn.BatchNorm1d(width_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            width_in = width_out

        # Output head.
        stack.extend([nn.Linear(width_in, output_dim), nn.Sigmoid()])
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the network output."""
        return self.net(x)

class MLPWrapper:
    """Seed-bagged ensemble of ``MLPModel`` networks with input scaling.

    ``fit`` trains ``n_models`` networks that differ only in their random
    seed (42, 43, ...); ``predict`` averages their outputs.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the first hidden layer of each network.
        lr: Adam learning rate.
        epochs: Number of full-batch optimisation steps per network.
        n_models: Number of seeds / networks in the bag.
    """

    def __init__(self, input_dim, hidden_dim=64, lr=0.001, epochs=200, n_models=5):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.lr = lr
        self.epochs = epochs
        self.n_models = n_models
        self.models = []
        self.scalers = []
        self.targets = ['Product 2', 'Product 3', 'SM']

    def fit(self, X, Y):
        """Train the bag from scratch on features ``X`` and target frame ``Y``.

        Args:
            X: 2-D array-like of features.
            Y: DataFrame containing the columns listed in ``self.targets``.

        Note:
            Reseeds the global torch and NumPy generators for every network.
        """
        # Start clean: without this, calling fit() twice would keep the old
        # networks and silently average them with the new ones in predict().
        self.models = []
        self.scalers = []

        # Scaling and tensor conversion do not depend on the seed, so do
        # them once instead of once per network.
        scaler = StandardScaler()
        X_tensor = torch.FloatTensor(scaler.fit_transform(X))
        Y_tensor = torch.FloatTensor(Y[self.targets].values)

        for seed in range(self.n_models):
            torch.manual_seed(42 + seed)
            np.random.seed(42 + seed)

            model = MLPModel(self.input_dim, self.hidden_dim)
            optimizer = torch.optim.Adam(model.parameters(), lr=self.lr, weight_decay=1e-5)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=20, factor=0.5)
            criterion = nn.HuberLoss()

            model.train()
            for _ in range(self.epochs):
                # One full-batch gradient step per epoch.
                optimizer.zero_grad()
                loss = criterion(model(X_tensor), Y_tensor)
                loss.backward()
                optimizer.step()
                # Hand the scheduler a plain float, not a graph-attached tensor.
                scheduler.step(loss.item())

            self.models.append(model)
            # predict() pairs models with scalers, so keep one entry per model
            # (all entries refer to the same fitted scaler).
            self.scalers.append(scaler)

    def predict(self, X):
        """Return ``{target: 1-D array}`` averaged over all trained networks.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if not self.models:
            raise RuntimeError('MLPWrapper.predict called before fit')

        preds = []
        for model, scaler in zip(self.models, self.scalers):
            X_tensor = torch.FloatTensor(scaler.transform(X))
            model.eval()  # inference mode for batch norm and dropout
            with torch.no_grad():
                preds.append(model(X_tensor).numpy())

        avg_pred = np.mean(preds, axis=0)
        return {t: avg_pred[:, i] for i, t in enumerate(self.targets)}


print('MLP defined')

In [None]:
# GP Model
class GPWrapper:
    """One Gaussian-process regressor per target on standardised inputs.

    Each target gets its own ``GaussianProcessRegressor`` with a
    Matern(nu=2.5) + WhiteKernel(noise_level=0.1) kernel; all of them share
    a single ``StandardScaler`` fitted in ``fit``.
    """

    def __init__(self):
        self.models = {}
        self.scaler = StandardScaler()
        self.targets = ['Product 2', 'Product 3', 'SM']

    def fit(self, X, Y):
        """Fit the scaler on ``X`` and one GP per column of ``Y`` in ``self.targets``."""
        scaled = self.scaler.fit_transform(X)
        for name in self.targets:
            regressor = GaussianProcessRegressor(
                kernel=Matern(nu=2.5) + WhiteKernel(noise_level=0.1),
                alpha=1e-6,
                normalize_y=True,
                random_state=42,
            )
            self.models[name] = regressor
            regressor.fit(scaled, Y[name].values)

    def predict(self, X):
        """Return ``{target: predictions}`` for ``X`` using the fitted scaler and GPs."""
        scaled = self.scaler.transform(X)
        output = {}
        for name in self.targets:
            output[name] = self.models[name].predict(scaled)
        return output

# LGBM Model
class LGBMWrapper:
    """One LightGBM regressor per target, trained on unscaled features.

    Every target uses the same hyperparameters: 100 trees, learning rate
    0.05, max depth 5, 31 leaves, MAE objective, fixed random_state.
    """

    def __init__(self):
        self.models = {}
        self.targets = ['Product 2', 'Product 3', 'SM']

    def fit(self, X, Y):
        """Fit one regressor per column of ``Y`` listed in ``self.targets``."""
        for name in self.targets:
            regressor = lgb.LGBMRegressor(
                n_estimators=100,
                learning_rate=0.05,
                max_depth=5,
                num_leaves=31,
                objective='mae',
                verbose=-1,
                random_state=42,
            )
            self.models[name] = regressor
            regressor.fit(X, Y[name].values)

    def predict(self, X):
        """Return ``{target: predictions}`` for ``X`` from the fitted regressors."""
        output = {}
        for name in self.targets:
            output[name] = self.models[name].predict(X)
        return output


print('GP and LGBM defined')

In [None]:
# Ensemble Model (GP 0.15 + MLP 0.55 + LGBM 0.30)
class EnsembleModel:
    """Fixed-weight blend of the GP, MLP and LightGBM wrappers.

    Args:
        input_dim: Feature count, forwarded to the MLP wrapper.
        gp_weight, mlp_weight, lgbm_weight: Blend coefficients applied to the
            respective member's predictions. They are used as given and are
            not normalised.
    """

    def __init__(self, input_dim, gp_weight=0.15, mlp_weight=0.55, lgbm_weight=0.30):
        self.input_dim = input_dim
        self.gp_weight = gp_weight
        self.mlp_weight = mlp_weight
        self.lgbm_weight = lgbm_weight
        self.gp = GPWrapper()
        self.mlp = MLPWrapper(input_dim)
        self.lgbm = LGBMWrapper()
        self.targets = ['Product 2', 'Product 3', 'SM']

    def fit(self, X, Y):
        """Fit the three members on the same data, in GP, MLP, LGBM order."""
        for member in (self.gp, self.mlp, self.lgbm):
            member.fit(X, Y)

    def predict(self, X):
        """Return ``{target: weighted sum of the members' predictions}``."""
        from_gp = self.gp.predict(X)
        from_mlp = self.mlp.predict(X)
        from_lgbm = self.lgbm.predict(X)

        return {
            name: (self.gp_weight * from_gp[name] +
                   self.mlp_weight * from_mlp[name] +
                   self.lgbm_weight * from_lgbm[name])
            for name in self.targets
        }


print('Ensemble defined')

In [None]:
# Leave-One-Solvent-Out CV for single solvent data
def run_loso_cv(data_df, include_fragprints=False):
    """Leave-one-solvent-out cross-validation of ``EnsembleModel``.

    For every distinct value of ``data_df['SOLVENT NAME']`` a fresh ensemble
    is trained on all other solvents and evaluated on the held-out one.

    Args:
        data_df: Single-solvent yield table with 'SOLVENT NAME', the columns
            read by ``get_features`` and the three target columns.
        include_fragprints: Forwarded to ``get_features``.

    Returns:
        Tuple ``(overall_mae, fold_std, fold_errors, input_dim)``:
        the mean absolute error pooled over every held-out sample and
        target, the standard deviation of the per-fold MAEs, the list of
        per-fold MAEs (each the mean over the three targets), and the
        feature dimension.

    Raises:
        ValueError: If ``data_df`` has no rows / no solvents.
    """
    targets = ['Product 2', 'Product 3', 'SM']
    solvent_names = data_df['SOLVENT NAME']
    solvents = solvent_names.unique()
    if len(solvents) == 0:
        # Previously this ended in an UnboundLocalError on ``input_dim``.
        raise ValueError('data_df contains no solvents to cross-validate over')

    # A row's features do not depend on the fold, so extract them once and
    # slice per fold instead of re-running iterrows() for every solvent.
    X_all = np.array([get_features(row, SPANGE_DF, DRFPS_FILTERED, ACS_PCA_DF,
                                   FRAGPRINTS_FILTERED, include_fragprints)
                      for _, row in data_df.iterrows()])
    Y_all = data_df[targets]
    input_dim = X_all.shape[1]

    all_errors = []
    fold_errors = []

    for test_solvent in solvents:
        train_mask = (solvent_names != test_solvent).to_numpy()
        test_mask = (solvent_names == test_solvent).to_numpy()

        X_train, X_test = X_all[train_mask], X_all[test_mask]
        Y_train, Y_test = Y_all.loc[train_mask], Y_all.loc[test_mask]

        # Fresh model per fold so nothing leaks from the held-out solvent.
        model = EnsembleModel(input_dim)
        model.fit(X_train, Y_train)
        preds = model.predict(X_test)

        fold_mae = []
        for target in targets:
            fold_mae.append(mean_absolute_error(Y_test[target], preds[target]))
            # Pooled per-sample errors feed the overall (sample-weighted) MAE.
            all_errors.extend(np.abs(Y_test[target].values - preds[target]))

        fold_errors.append(np.mean(fold_mae))

    return np.mean(all_errors), np.std(fold_errors), fold_errors, input_dim


print('LOSO CV function defined')

In [None]:
# Baseline: leave-one-solvent-out CV on the single-solvent data with the
# Fragprints block left out of the feature vector.
print('Testing WITHOUT Fragprints (baseline)...')
print('=' * 60)

baseline_result = run_loso_cv(SINGLE_SOLVENT_DF, include_fragprints=False)
cv_mae_base, cv_std_base, fold_errors_base, input_dim_base = baseline_result

print(f'\nBaseline CV MAE: {cv_mae_base:.6f} +/- {cv_std_base:.6f}')
print(f'Input dim: {input_dim_base}')

In [None]:
# Same CV protocol with the Fragprints block appended to every feature vector.
print('\nTesting WITH Fragprints...')
print('=' * 60)

fragprints_result = run_loso_cv(SINGLE_SOLVENT_DF, include_fragprints=True)
cv_mae_frag, cv_std_frag, fold_errors_frag, input_dim_frag = fragprints_result

print(f'\nWith Fragprints CV MAE: {cv_mae_frag:.6f} +/- {cv_std_frag:.6f}')
print(f'Input dim: {input_dim_frag}')
# Relative change against the baseline cell's MAE; positive means Fragprints helped.
print(f'\nImprovement: {(cv_mae_base - cv_mae_frag) / cv_mae_base * 100:.2f}%')