# Experiment 064: Ens Model Kernel Approach

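This experiment implements the "Ens Model" kernel approach, which combines:
1. Priority-based correlation filtering of features
2. All feature sources combined (Spange + ACS PCA + DRFP + Fragprints)
3. Different CatBoost/XGBoost ensemble weights for the single-solvent and the full (mixture) data
4. Multi-target normalization (predictions clipped to [0, 1] and rescaled to sum to 1)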

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor
import xgboost as xgb
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Seed the global NumPy and PyTorch random number generators.
np.random.seed(42)
torch.manual_seed(42)

# Directory that holds the lookup tables and yield CSV files.
DATA_PATH = '/home/data'
print('Libraries loaded')

In [None]:
# Descriptor lookup tables; the first CSV column of each becomes the index.
SPANGE_DF, DRFPS_DF, ACS_PCA_DF, FRAGPRINTS_DF = (
    pd.read_csv(lookup_path, index_col=0)
    for lookup_path in (
        f'{DATA_PATH}/spange_descriptors_lookup.csv',
        f'{DATA_PATH}/drfps_catechol_lookup.csv',
        f'{DATA_PATH}/acs_pca_descriptors_lookup.csv',
        f'{DATA_PATH}/fragprints_lookup.csv',
    )
)

# Yield tables (default integer index).
SINGLE_SOLVENT_DF, FULL_DATA_DF = (
    pd.read_csv(yield_path)
    for yield_path in (
        f'{DATA_PATH}/catechol_single_solvent_yields.csv',
        f'{DATA_PATH}/catechol_full_data_yields.csv',
    )
)

# Report the shape of every table and the column names of the yield tables.
print(
    f'Spange: {SPANGE_DF.shape}',
    f'DRFPS: {DRFPS_DF.shape}',
    f'ACS PCA: {ACS_PCA_DF.shape}',
    f'Fragprints: {FRAGPRINTS_DF.shape}',
    f'Single solvent: {SINGLE_SOLVENT_DF.shape}',
    f'Full data: {FULL_DATA_DF.shape}',
    f'\nSingle solvent columns: {list(SINGLE_SOLVENT_DF.columns)}',
    f'\nFull data columns: {list(FULL_DATA_DF.columns)}',
    sep='\n',
)

In [None]:
# Feature priority function (from Ens Model kernel)
def feature_priority(name):
    """Return the retention priority encoded by a feature name's prefix.

    Args:
        name: Feature (column) name.

    Returns:
        5 for ``spange_``, 4 for ``acs_``, 3 for ``drfps_``, 2 for ``frag_``
        and 0 for any other name. The match is case-sensitive.
    """
    ranked_prefixes = (
        ('spange_', 5),
        ('acs_', 4),
        ('drfps_', 3),
        ('frag_', 2),
    )
    for prefix, rank in ranked_prefixes:
        if name.startswith(prefix):
            return rank
    # Unknown feature family.
    return 0

def filter_correlated_features(df, threshold=0.90):
    """Remove correlated features, keeping higher priority ones.

    Every pair of columns whose absolute correlation exceeds ``threshold``
    loses one member: the one with the lower ``feature_priority``. When both
    have the same priority, the column that appears earlier in ``df`` is
    dropped. Pairs whose correlation is NaN are left alone.

    Each pair is judged independently, so a column that is already marked for
    removal can still cause its partner to be removed.

    Args:
        df: Feature table with one column per feature.
        threshold: Absolute correlation above which a pair counts as
            correlated.

    Returns:
        A copy of ``df`` without the dropped columns.
    """
    corr_matrix = df.corr().abs()
    names = list(corr_matrix.columns)

    # Locate the correlated pairs with one vectorised comparison instead of a
    # scalar lookup per matrix cell. NaN compares False, so undefined
    # correlations are skipped.
    with np.errstate(invalid='ignore'):
        above = corr_matrix.values > threshold
    # k=1 keeps the strict upper triangle: each pair once, no self-pairs.
    earlier_pos, later_pos = np.nonzero(np.triu(above, k=1))
    earlier_pos, later_pos = earlier_pos.tolist(), later_pos.tolist()

    # Priorities are only needed for columns that take part in a pair.
    priority = {
        pos: feature_priority(names[pos])
        for pos in set(earlier_pos) | set(later_pos)
    }

    to_drop = set()
    for earlier, later in zip(earlier_pos, later_pos):
        if priority[later] >= priority[earlier]:
            to_drop.add(names[earlier])
        else:
            to_drop.add(names[later])

    print(f'Dropping {len(to_drop)} correlated features')
    return df.drop(columns=list(to_drop))

print('Feature filtering functions defined')

In [None]:
# Build combined feature table
def build_feature_table(solvents, threshold=0.90):
    """Assemble the correlation-filtered feature table for ``solvents``.

    Rows are taken from the four module-level lookup tables (Spange, ACS PCA,
    DRFP, fragprints). Columns are prefixed with ``spange_``, ``acs_``,
    ``drfps_`` and ``frag_`` respectively. DRFP and fragprint columns whose
    variance over the whole lookup table is zero are left out.

    Args:
        solvents: Solvent labels used to select rows from each lookup table.
        threshold: Correlation threshold passed to
            ``filter_correlated_features``.

    Returns:
        The combined feature table after correlation filtering.
    """
    def _select(source, prefix, drop_constant):
        # Optionally keep only columns that vary across the full lookup
        # table, then pick the requested rows and prefix the column names.
        if drop_constant:
            variances = source.var()
            varying = variances[variances > 0].index.tolist()
            block = source.loc[solvents, varying].copy()
        else:
            block = source.loc[solvents].copy()
        block.columns = [f'{prefix}{column}' for column in block.columns]
        return block

    spange_block = _select(SPANGE_DF, 'spange_', drop_constant=False)
    acs_block = _select(ACS_PCA_DF, 'acs_', drop_constant=False)
    drfp_block = _select(DRFPS_DF, 'drfps_', drop_constant=True)
    frag_block = _select(FRAGPRINTS_DF, 'frag_', drop_constant=True)

    print(f'Spange: {spange_block.shape[1]}, ACS: {acs_block.shape[1]}, DRFP: {drfp_block.shape[1]}, Frag: {frag_block.shape[1]}')

    # Side-by-side concatenation, highest-priority source first.
    merged = pd.concat([spange_block, acs_block, drfp_block, frag_block], axis=1)
    print(f'Combined: {merged.shape[1]} features')

    pruned = filter_correlated_features(merged, threshold=threshold)
    print(f'After filtering: {pruned.shape[1]} features')

    return pruned

print('Build feature table function defined')

In [None]:
# Ens Model class
class EnsModel:
    """Weighted CatBoost + XGBoost ensemble with one model pair per target.

    Inputs are standardised with a ``StandardScaler`` fitted in ``fit``. The
    blended predictions for the three targets are clipped to [0, 1] and then
    rescaled row-wise so that they sum to 1.
    """

    def __init__(self, data_type='single'):
        """Set up the blend weights and empty model containers.

        Args:
            data_type: ``'single'`` selects the 7:6 CatBoost:XGBoost blend;
                any other value selects the 1:2 blend.
        """
        self.data_type = data_type
        self.targets = ['Product 2', 'Product 3', 'SM']

        # Relative shares (CatBoost, XGBoost) from the Ens Model kernel.
        cat_share, xgb_share = (7.0, 6.0) if data_type == 'single' else (1.0, 2.0)
        share_total = cat_share + xgb_share
        self.cat_weight = cat_share / share_total
        self.xgb_weight = xgb_share / share_total

        self.cat_models = {}
        self.xgb_models = {}
        self.scaler = StandardScaler()

    def fit(self, X, Y):
        """Fit the scaler and one CatBoost/XGBoost pair per target.

        Args:
            X: Feature matrix.
            Y: Table that can be indexed by each name in ``self.targets`` and
                exposes ``.values`` on the result.
        """
        features = self.scaler.fit_transform(X)

        for name in self.targets:
            labels = Y[name].values

            # Both regressors optimise absolute error with the same seed.
            cat_model = self.cat_models[name] = CatBoostRegressor(
                iterations=500,
                learning_rate=0.05,
                depth=6,
                loss_function='MAE',
                verbose=False,
                random_seed=42,
            )
            cat_model.fit(features, labels)

            xgb_model = self.xgb_models[name] = xgb.XGBRegressor(
                n_estimators=500,
                learning_rate=0.05,
                max_depth=6,
                objective='reg:absoluteerror',
                verbosity=0,
                random_state=42,
            )
            xgb_model.fit(features, labels)

    def predict(self, X):
        """Return blended, normalised predictions keyed by target name.

        Args:
            X: Feature matrix with the same columns as used in ``fit``.

        Returns:
            Dict mapping each target name to a 1-D array of predictions.
        """
        features = self.scaler.transform(X)

        blended = [
            self.cat_weight * self.cat_models[name].predict(features)
            + self.xgb_weight * self.xgb_models[name].predict(features)
            for name in self.targets
        ]

        # One column per target, clipped to the valid range.
        clipped = np.clip(np.column_stack(blended), 0, 1)

        # Row-wise rescale; the 1e-8 floor avoids dividing by zero.
        # NOTE(review): rows are scaled to sum to 1 even when the raw total
        # is below 1 - confirm the three targets are meant to add up to 1.
        row_totals = clipped.sum(axis=1, keepdims=True)
        normalised = clipped / np.maximum(row_totals, 1e-8)

        return {name: normalised[:, col] for col, name in enumerate(self.targets)}

print('EnsModel class defined')

In [None]:
# Leave-One-Solvent-Out CV for single solvent data
def run_loso_cv(feature_table, data_df, model_class, **model_kwargs):
    """Leave-one-solvent-out cross-validation.

    For every distinct value of ``data_df['SOLVENT NAME']`` a fresh model is
    trained on all other rows and scored on the rows of that solvent.

    Args:
        feature_table: Table indexed by solvent name; its rows are the model
            inputs.
        data_df: Data with a 'SOLVENT NAME' column and the three target
            columns.
        model_class: Callable returning an object with ``fit(X, Y)`` and
            ``predict(X)``; ``predict`` must return a mapping from target
            name to predictions.
        **model_kwargs: Passed to ``model_class``.

    Returns:
        Tuple of (mean absolute error pooled over all rows and targets,
        standard deviation of the per-fold errors, list of per-fold errors).
        A fold's error is the mean of its three per-target MAEs.
    """
    targets = ['Product 2', 'Product 3', 'SM']
    solvent_names = data_df['SOLVENT NAME']

    pooled_abs_errors = []
    per_fold_mae = []

    for held_out in solvent_names.unique():
        train_rows = data_df[solvent_names != held_out]
        test_rows = data_df[solvent_names == held_out]

        # Look up one feature row per data row by solvent name.
        X_train = feature_table.loc[train_rows['SOLVENT NAME'].values].values
        X_test = feature_table.loc[test_rows['SOLVENT NAME'].values].values

        model = model_class(**model_kwargs)
        model.fit(X_train, train_rows[targets])
        predictions = model.predict(X_test)

        truth = test_rows[targets]
        target_maes = []
        for target in targets:
            target_maes.append(mean_absolute_error(truth[target], predictions[target]))
            pooled_abs_errors.extend(np.abs(truth[target].values - predictions[target]))

        per_fold_mae.append(np.mean(target_maes))

    return np.mean(pooled_abs_errors), np.std(per_fold_mae), per_fold_mae

print('LOSO CV function defined')

In [None]:
# Leave-one-solvent-out evaluation of the ensemble on the single-solvent data
print('Running Leave-One-Solvent-Out CV on single solvent data...', '=' * 60, sep='\n')

solvents = SINGLE_SOLVENT_DF['SOLVENT NAME'].unique()
print(f'Number of solvents: {len(solvents)}')

# Features for exactly the solvents that occur in the single-solvent data
feature_table = build_feature_table(solvents, threshold=0.90)

cv_mae, cv_std, fold_errors = run_loso_cv(
    feature_table=feature_table,
    data_df=SINGLE_SOLVENT_DF,
    model_class=EnsModel,
    data_type='single',
)

print(f'\nSingle Solvent CV MAE: {cv_mae:.6f} +/- {cv_std:.6f}')

In [None]:
# The full data describes mixtures of two solvents (A and B), so it needs
# different feature handling from the single-solvent data.
# Inspect which solvents occur in each slot and which ramp numbers exist.
print('Full data unique solvents:')
print(f"SOLVENT A: {FULL_DATA_DF['SOLVENT A NAME'].unique()}")
print(f"SOLVENT B: {FULL_DATA_DF['SOLVENT B NAME'].unique()}")
print(f"\nRamps: {FULL_DATA_DF['RAMP NUM'].unique()}")

In [None]:
# Build feature table for full data (mixture solvents)
def get_mixture_features(row, feature_table):
    """Blend the feature vectors of a row's two solvents.

    The result is ``(1 - b) * features(A) + b * features(B)`` where ``b`` is
    ``row['SolventB%'] / 100``.

    Args:
        row: Mapping with 'SOLVENT A NAME', 'SOLVENT B NAME' and 'SolventB%'.
        feature_table: Table indexed by solvent name.

    Returns:
        The weighted average of the two solvents' feature rows.
    """
    # 'SolventB%' is treated as a percentage and turned into a fraction.
    # NOTE(review): confirm the column really is on a 0-100 scale.
    share_b = row['SolventB%'] / 100.0
    share_a = 1.0 - share_b

    vector_a = feature_table.loc[row['SOLVENT A NAME']].values
    vector_b = feature_table.loc[row['SOLVENT B NAME']].values

    return share_a * vector_a + share_b * vector_b

def run_loro_cv(feature_table, data_df, model_class, **model_kwargs):
    """Leave-One-Ramp-Out CV for full data.

    For every distinct value of ``data_df['RAMP NUM']`` a fresh model is
    trained on all other rows and scored on the rows of that ramp.

    Args:
        feature_table: Table indexed by solvent name, used by
            ``get_mixture_features`` to build each row's inputs.
        data_df: Mixture data with 'RAMP NUM', the columns read by
            ``get_mixture_features`` and the three target columns.
        model_class: Callable returning an object with ``fit(X, Y)`` and
            ``predict(X)``; ``predict`` must return a mapping from target
            name to predictions.
        **model_kwargs: Passed to ``model_class``.

    Returns:
        Tuple of (mean absolute error pooled over all rows and targets,
        standard deviation of the per-fold errors, list of per-fold errors).
        A fold's error is the mean of its three per-target MAEs.
    """
    targets = ['Product 2', 'Product 3', 'SM']
    ramp_numbers = data_df['RAMP NUM']

    # A row's mixture features do not depend on the fold, so build them once
    # and slice per fold instead of re-running iterrows for every ramp.
    X_all = np.array([get_mixture_features(row, feature_table) for _, row in data_df.iterrows()])

    all_errors = []
    fold_errors = []

    for test_ramp in ramp_numbers.unique():
        train_mask = (ramp_numbers != test_ramp).values
        test_mask = (ramp_numbers == test_ramp).values

        train_df = data_df[train_mask]
        test_df = data_df[test_mask]

        X_train = X_all[train_mask]
        X_test = X_all[test_mask]

        Y_train = train_df[targets]
        Y_test = test_df[targets]

        model = model_class(**model_kwargs)
        model.fit(X_train, Y_train)
        preds = model.predict(X_test)

        fold_mae = []
        for target in targets:
            fold_mae.append(mean_absolute_error(Y_test[target], preds[target]))
            all_errors.extend(np.abs(Y_test[target].values - preds[target]))

        fold_errors.append(np.mean(fold_mae))

    return np.mean(all_errors), np.std(fold_errors), fold_errors

print('LORO CV function defined')

In [None]:
# Collect every solvent that appears in either slot of a mixture
full_solvents_a = FULL_DATA_DF['SOLVENT A NAME'].unique()
full_solvents_b = FULL_DATA_DF['SOLVENT B NAME'].unique()
# Sort the union: the iteration order of a set of strings changes between
# interpreter runs (hash randomisation), which would make the row order of
# the feature table differ from run to run.
full_solvents = sorted(set(full_solvents_a) | set(full_solvents_b))
print(f'Full data unique solvents: {len(full_solvents)}')

# Build feature table for these solvents
full_feature_table = build_feature_table(full_solvents, threshold=0.90)

In [None]:
# Leave-one-ramp-out evaluation of the ensemble on the full (mixture) data
print('\nRunning Leave-One-Ramp-Out CV on full data...', '=' * 60, sep='\n')

full_cv_mae, full_cv_std, full_fold_errors = run_loro_cv(
    feature_table=full_feature_table,
    data_df=FULL_DATA_DF,
    model_class=EnsModel,
    data_type='full',
)

print(f'\nFull Data CV MAE: {full_cv_mae:.6f} +/- {full_cv_std:.6f}')

In [None]:
# Combined CV score: the two MAEs weighted by the number of rows in each table
n_single = len(SINGLE_SOLVENT_DF)
n_full = len(FULL_DATA_DF)

weighted_cv = (cv_mae * n_single + full_cv_mae * n_full) / (n_single + n_full)

# Reference score to beat; defined once so the printed value and the
# improvement formula cannot drift apart.
BEST_BASELINE_CV = 0.008194
# Positive when the weighted CV error is below the baseline.
improvement_pct = (BEST_BASELINE_CV - weighted_cv) / BEST_BASELINE_CV * 100

print('\n' + '=' * 60)
print('FINAL RESULTS')
print('=' * 60)
print(f'Single Solvent CV MAE: {cv_mae:.6f} +/- {cv_std:.6f}')
print(f'Full Data CV MAE: {full_cv_mae:.6f} +/- {full_cv_std:.6f}')
print(f'Weighted Combined CV MAE: {weighted_cv:.6f}')
print(f'\nBest baseline CV: {BEST_BASELINE_CV:.6f}')
print(f'Improvement: {improvement_pct:.2f}%')