# Experiment 064: Ens Model Kernel Approach

Implementing the "Ens Model" kernel approach with:
1. Feature priority-based correlation filtering
2. Combine ALL feature sources (Spange + ACS PCA + DRFP + Fragprints)
3. Different ensemble weights for single vs full data (CatBoost:XGBoost = 7:6 for single, 1:2 for full)
4. Multi-target normalization (clip predictions to [0, 1], then rescale each row to sum to 1)

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy and PyTorch with the same fixed value
for _seed_fn in (np.random.seed, torch.manual_seed):
    _seed_fn(42)

# Directory the CSV inputs are read from
DATA_PATH = '/home/data'
print('Libraries loaded')

In [None]:
# Descriptor lookup tables; the first CSV column is used as the index
SPANGE_DF, DRFPS_DF, ACS_PCA_DF, FRAGPRINTS_DF, SMILES_DF = (
    pd.read_csv(f'{DATA_PATH}/{_lookup_file}', index_col=0)
    for _lookup_file in (
        'spange_descriptors_lookup.csv',
        'drfps_catechol_lookup.csv',
        'acs_pca_descriptors_lookup.csv',
        'fragprints_lookup.csv',
        'smiles_lookup.csv',
    )
)

# Yield tables (default integer index)
SINGLE_SOLVENT_DF, FULL_DATA_DF = (
    pd.read_csv(f'{DATA_PATH}/{_yield_file}')
    for _yield_file in (
        'catechol_single_solvent_yields.csv',
        'catechol_full_data_yields.csv',
    )
)

# Report the shape of every table that is used downstream
for _label, _frame in (
    ('Spange', SPANGE_DF),
    ('DRFPS', DRFPS_DF),
    ('ACS PCA', ACS_PCA_DF),
    ('Fragprints', FRAGPRINTS_DF),
    ('Single solvent', SINGLE_SOLVENT_DF),
    ('Full data', FULL_DATA_DF),
):
    print(f'{_label}: {_frame.shape}')

In [None]:
# Feature priority function (from Ens Model kernel)
def feature_priority(name):
    """Return the retention priority implied by a feature name's prefix.

    When two features are highly correlated, the one with the higher
    priority is the one that is kept. Names without a recognised prefix
    get priority 0.
    """
    prefix_priorities = (
        ('spange_', 5),  # Highest priority - physical descriptors
        ('acs_', 4),     # ACS PCA descriptors
        ('drfps_', 3),   # DRFP fingerprints
        ('frag_', 2),    # Fragprints
        ('smiles_', 1),  # SMILES-based
    )
    for prefix, priority in prefix_priorities:
        if name.startswith(prefix):
            return priority
    return 0


def filter_correlated_features(df, threshold=0.90):
    """Drop one feature from every pair whose |correlation| exceeds threshold.

    Pairs are visited column by column over the upper triangle of the
    absolute correlation matrix. For each pair the lower-priority feature
    (see ``feature_priority``) is dropped; on a priority tie the earlier
    column is dropped and the later one kept.

    A pair is skipped when either member has already been dropped, so a
    removed feature can no longer evict a feature that is not correlated
    with anything that survives.

    Args:
        df: DataFrame whose columns are the candidate features; column
            names must be strings.
        threshold: Absolute correlation above which a pair counts as
            redundant.

    Returns:
        A new DataFrame without the dropped columns.
    """
    corr_matrix = df.corr().abs()
    names = corr_matrix.columns

    # Strict upper triangle of the "too correlated" mask. NaN correlations
    # (e.g. from constant columns) compare False and are therefore ignored.
    too_correlated = np.triu(corr_matrix.values > threshold, k=1)

    # nonzero() on the transpose enumerates pairs column by column, which is
    # the order the greedy decision below depends on.
    col_positions, row_positions = np.nonzero(too_correlated.T)

    to_drop = set()
    for row_pos, col_pos in zip(row_positions, col_positions):
        earlier, later = names[row_pos], names[col_pos]
        if earlier in to_drop or later in to_drop:
            # This pair is already resolved by a previous removal
            continue
        if feature_priority(later) >= feature_priority(earlier):
            to_drop.add(earlier)
        else:
            to_drop.add(later)

    print(f'Dropping {len(to_drop)} correlated features')
    return df.drop(columns=list(to_drop))

print('Feature filtering functions defined')

In [None]:
# Build combined feature table
def build_feature_table(solvents, threshold=0.90):
    """Assemble one descriptor row per solvent from the four lookup tables.

    Spange, ACS PCA, DRFP and Fragprints columns are selected for the given
    solvents, prefixed with their source name, concatenated side by side and
    passed through ``filter_correlated_features``.

    Args:
        solvents: Solvent names; each is looked up in the index of every
            lookup table.
        threshold: Correlation threshold forwarded to the filter.

    Returns:
        The filtered, combined feature DataFrame.
    """
    def _prefixed(lookup, prefix, columns=None):
        # Select the requested rows (and optionally columns), then tag the
        # column names with their source prefix.
        if columns is None:
            picked = lookup.loc[solvents].copy()
        else:
            picked = lookup.loc[solvents, columns].copy()
        picked.columns = [f'{prefix}{c}' for c in picked.columns]
        return picked

    def _varying_columns(lookup):
        # Columns with positive variance over the WHOLE lookup table.
        # NOTE(review): a column can still be constant on the selected
        # solvents - confirm whether that is intended.
        variances = lookup.var()
        return variances[variances > 0].index.tolist()

    # Sources are listed from highest to lowest filter priority
    spange_features = _prefixed(SPANGE_DF, 'spange_')
    acs_features = _prefixed(ACS_PCA_DF, 'acs_')
    drfp_features = _prefixed(DRFPS_DF, 'drfps_', _varying_columns(DRFPS_DF))
    frag_features = _prefixed(FRAGPRINTS_DF, 'frag_', _varying_columns(FRAGPRINTS_DF))

    sources = [
        ('Spange', spange_features),
        ('ACS PCA', acs_features),
        ('DRFP (non-zero var)', drfp_features),
        ('Fragprints (non-zero var)', frag_features),
    ]
    for label, block in sources:
        print(f'{label}: {block.shape[1]} features')

    # Side-by-side concatenation; rows align on the solvent index
    combined = pd.concat([block for _, block in sources], axis=1)
    print(f'Combined: {combined.shape[1]} features')

    filtered = filter_correlated_features(combined, threshold=threshold)
    print(f'After filtering: {filtered.shape[1]} features')

    return filtered

# Smoke test on the solvents that occur in the single-solvent data
solvents = SINGLE_SOLVENT_DF['Solvent Name'].unique()
print(f'\nBuilding feature table for {len(solvents)} solvents...')
feature_table = build_feature_table(solvents, threshold=0.90)

In [None]:
# Ens Model class
class EnsModel:
    """Weighted CatBoost + XGBoost ensemble with one model pair per target.

    Args:
        data_type: ``'single'`` selects the single-solvent default weights;
            any other value selects the full-data defaults.
        cat_weight: Blend weight for CatBoost predictions, or None.
        xgb_weight: Blend weight for XGBoost predictions, or None.

    If both weights are None the defaults for ``data_type`` are used. If
    exactly one is given, the other is set to its complement (1 - given)
    instead of silently discarding the supplied value. Weights are not
    validated.
    """

    # Default (CatBoost, XGBoost) blend weights (from Ens Model kernel)
    _SINGLE_WEIGHTS = (7.0 / 13.0, 6.0 / 13.0)  # 0.538 / 0.462
    _FULL_WEIGHTS = (1.0 / 3.0, 2.0 / 3.0)      # 0.333 / 0.667

    def __init__(self, data_type='single', cat_weight=None, xgb_weight=None):
        self.data_type = data_type
        self.targets = ['Product 2', 'Product 3', 'SM']

        self.cat_weight, self.xgb_weight = self._resolve_weights(
            data_type, cat_weight, xgb_weight
        )

        self.cat_models = {}
        self.xgb_models = {}
        self.scaler = StandardScaler()

    @staticmethod
    def _resolve_weights(data_type, cat_weight, xgb_weight):
        """Return the (cat_weight, xgb_weight) pair to use for blending."""
        if cat_weight is None and xgb_weight is None:
            if data_type == 'single':
                return EnsModel._SINGLE_WEIGHTS
            return EnsModel._FULL_WEIGHTS
        if cat_weight is None:
            return 1.0 - xgb_weight, xgb_weight
        if xgb_weight is None:
            return cat_weight, 1.0 - cat_weight
        return cat_weight, xgb_weight

    def fit(self, X, Y):
        """Train CatBoost and XGBoost for each target.

        Args:
            X: 2-D feature array; it is standardised with a scaler fitted
                here and reused in ``predict``.
            Y: Mapping from target name to a column exposing ``.values``
                (e.g. a DataFrame with one column per target).
        """
        X_scaled = self.scaler.fit_transform(X)

        for target in self.targets:
            y = Y[target].values

            # CatBoost, trained directly on the MAE objective
            self.cat_models[target] = CatBoostRegressor(
                iterations=500,
                learning_rate=0.05,
                depth=6,
                loss_function='MAE',
                verbose=False,
                random_seed=42
            )
            self.cat_models[target].fit(X_scaled, y)

            # XGBoost with matching size/learning-rate settings
            self.xgb_models[target] = xgb.XGBRegressor(
                n_estimators=500,
                learning_rate=0.05,
                max_depth=6,
                objective='reg:absoluteerror',
                verbosity=0,
                random_state=42
            )
            self.xgb_models[target].fit(X_scaled, y)

    def predict(self, X):
        """Return blended, normalised predictions as ``{target: 1-D array}``.

        Per target, the CatBoost and XGBoost predictions are combined with
        the configured weights. Each row is then clipped to [0, 1] and
        divided by its row total.
        """
        X_scaled = self.scaler.transform(X)

        preds = {}
        for target in self.targets:
            cat_pred = self.cat_models[target].predict(X_scaled)
            xgb_pred = self.xgb_models[target].predict(X_scaled)
            preds[target] = self.cat_weight * cat_pred + self.xgb_weight * xgb_pred

        # One column per target, in self.targets order
        pred_array = np.column_stack([preds[t] for t in self.targets])

        # Multi-target normalization: clip to [0, 1] and renormalize to sum to 1.
        # The 1e-8 floor only guards against division by zero when every
        # clipped prediction in a row is 0.
        # NOTE(review): this rescales rows whose total is below 1 as well as
        # above it - confirm the three targets really are fractions that sum to 1.
        pred_array = np.clip(pred_array, 0, 1)
        totals = pred_array.sum(axis=1, keepdims=True)
        pred_array = pred_array / np.maximum(totals, 1e-8)

        # Convert back to dict
        return {t: pred_array[:, i] for i, t in enumerate(self.targets)}

print('EnsModel class defined')

In [None]:
# Leave-One-Solvent-Out CV for single solvent data
def run_loso_cv(feature_table, data_df, model_class, **model_kwargs):
    """Cross-validate by holding out every solvent in turn.

    Args:
        feature_table: DataFrame of features indexed by solvent name.
        data_df: DataFrame with a 'Solvent Name' column and the target
            columns 'Product 2', 'Product 3' and 'SM'.
        model_class: Class exposing ``fit(X, Y)`` and ``predict(X)``, where
            ``predict`` returns a mapping from target name to predictions.
        **model_kwargs: Passed to ``model_class`` for every fold.

    Returns:
        ``(overall_mae, fold_std, fold_errors)``: the mean absolute error
        over all held-out values, the standard deviation of the per-fold
        MAEs, and the list of per-fold MAEs.
    """
    targets = ['Product 2', 'Product 3', 'SM']
    solvent_names = data_df['Solvent Name']

    all_errors = []
    fold_errors = []

    for held_out in solvent_names.unique():
        # Rows of the held-out solvent form the test fold
        train_df = data_df[solvent_names != held_out]
        test_df = data_df[solvent_names == held_out]

        # NOTE(review): the only inputs are the solvent descriptors, so all
        # rows of one solvent share a feature vector - confirm that no other
        # column of data_df is meant to be used.
        X_train = feature_table.loc[train_df['Solvent Name'].values].values
        X_test = feature_table.loc[test_df['Solvent Name'].values].values

        Y_train, Y_test = train_df[targets], test_df[targets]

        # A fresh model per fold
        model = model_class(**model_kwargs)
        model.fit(X_train, Y_train)
        preds = model.predict(X_test)

        per_target_mae = []
        for target in targets:
            observed, predicted = Y_test[target], preds[target]
            per_target_mae.append(mean_absolute_error(observed, predicted))
            all_errors.extend(np.abs(observed.values - predicted))

        fold_errors.append(np.mean(per_target_mae))

    overall_mae = np.mean(all_errors)
    return overall_mae, np.std(fold_errors), fold_errors

print('CV function defined')

In [None]:
# Evaluate EnsModel with leave-one-solvent-out CV on the single-solvent yields
print('Running Leave-One-Solvent-Out CV on single solvent data...')
print('=' * 60)

# Descriptor table for exactly the solvents present in the data
solvents = SINGLE_SOLVENT_DF['Solvent Name'].unique()
feature_table = build_feature_table(solvents, threshold=0.90)

# data_type='single' selects the single-solvent blend weights
cv_mae, cv_std, fold_errors = run_loso_cv(
    feature_table, SINGLE_SOLVENT_DF, EnsModel, data_type='single'
)

_fold_strings = [f"{e:.6f}" for e in fold_errors]
print(f'\nSingle Solvent CV MAE: {cv_mae:.6f} ± {cv_std:.6f}')
print(f'Fold errors: {_fold_strings}')
print(f'Min fold: {min(fold_errors):.6f}, Max fold: {max(fold_errors):.6f}')

In [None]:
# Leave-One-Ramp-Out CV for full data
def _solvent_feature_rows(solvent_names, feature_table):
    """Return one feature row per solvent name, plus the fallback count.

    Names found in ``feature_table.index`` get their own row; all others
    get the column-wise mean of ``feature_table``.
    """
    known = solvent_names.isin(feature_table.index).values
    # Start from the fallback vector everywhere, then overwrite known rows
    rows = np.tile(feature_table.mean().values, (len(solvent_names), 1))
    if known.any():
        rows[known] = feature_table.loc[solvent_names[known].values].values
    return rows, int((~known).sum())


def run_loro_cv(feature_table, data_df, model_class, **model_kwargs):
    """Leave-One-Ramp-Out cross-validation for full data.

    Args:
        feature_table: DataFrame of features indexed by solvent name.
        data_df: DataFrame with 'Ramp' and 'Solvent Name' columns and the
            target columns 'Product 2', 'Product 3' and 'SM'.
        model_class: Class exposing ``fit(X, Y)`` and ``predict(X)``, where
            ``predict`` returns a mapping from target name to predictions.
        **model_kwargs: Passed to ``model_class`` for every fold.

    Returns:
        ``(overall_mae, fold_std, fold_errors)``: the mean absolute error
        over all held-out values, the standard deviation of the per-fold
        MAEs, and the list of per-fold MAEs.
    """
    targets = ['Product 2', 'Product 3', 'SM']
    ramp = data_df['Ramp']

    # The features do not depend on the fold, so look them up once
    X_all, n_fallback = _solvent_feature_rows(data_df['Solvent Name'], feature_table)
    if n_fallback:
        # Make the imputation visible instead of applying it silently
        print(f'Warning: {n_fallback} rows use the mean feature vector '
              f'(solvent missing from feature table)')

    all_errors = []
    fold_errors = []

    for test_ramp in ramp.unique():
        # Split data
        train_mask = (ramp != test_ramp).values
        test_mask = (ramp == test_ramp).values

        X_train, X_test = X_all[train_mask], X_all[test_mask]
        Y_train = data_df[train_mask][targets]
        Y_test = data_df[test_mask][targets]

        # Train a fresh model for this fold
        model = model_class(**model_kwargs)
        model.fit(X_train, Y_train)

        # Predict
        preds = model.predict(X_test)

        # Calculate errors
        fold_mae = []
        for target in targets:
            mae = mean_absolute_error(Y_test[target], preds[target])
            fold_mae.append(mae)
            all_errors.extend(np.abs(Y_test[target].values - preds[target]))

        fold_errors.append(np.mean(fold_mae))

    overall_mae = np.mean(all_errors)
    fold_std = np.std(fold_errors)

    return overall_mae, fold_std, fold_errors

print('LORO CV function defined')

In [None]:
# Distinct solvent names in the full data, with a preview of the first ten
full_solvents = FULL_DATA_DF['Solvent Name'].unique()
print(f'Full data solvents: {len(full_solvents)}')
print(full_solvents[:10])

# Names that have no row in the single-solvent feature table
missing = []
for _name in full_solvents:
    if _name not in feature_table.index:
        missing.append(_name)

print(f'\nMissing from feature table: {len(missing)}')
if missing:
    print(missing[:5])

In [None]:
# Build feature table for full data (including mixtures)
def build_full_feature_table(data_df, threshold=0.90):
    """Build a feature table covering every 'Solvent Name' in data_df.

    Names present in ``SPANGE_DF.index`` are treated as single solvents.
    Every other name is treated as a mixture: it is split on ':' or '/'
    and given the unweighted mean of its components' features.

    Components are resolved against every solvent the descriptor table is
    built for. That includes components that never occur on their own in
    ``data_df`` but are present in ``SPANGE_DF.index``, which were
    previously ignored. A mixture with no resolvable component falls back
    to the mean over all solvents in the table, and the number of mixtures
    with at least one unresolved component is printed.

    NOTE(review): mixing ratios are not taken into account - confirm that a
    plain average is acceptable.

    Args:
        data_df: DataFrame with a 'Solvent Name' column.
        threshold: Correlation threshold forwarded to ``build_feature_table``.

    Returns:
        DataFrame indexed by the single solvents of data_df followed by its
        mixtures.
    """
    # Get all unique solvents
    all_solvents = data_df['Solvent Name'].unique()

    # Separate single solvents and mixtures
    single_solvents = [s for s in all_solvents if s in SPANGE_DF.index]
    mixture_solvents = [s for s in all_solvents if s not in SPANGE_DF.index]

    print(f'Single solvents: {len(single_solvents)}')
    print(f'Mixture solvents: {len(mixture_solvents)}')

    # Parse mixtures once (e.g., "Solvent1:Solvent2" or "Solvent1/Solvent2")
    mixture_parts = {
        mixture: [p.strip() for p in mixture.replace('/', ':').split(':')]
        for mixture in mixture_solvents
    }

    # Also build descriptors for known components that only occur in mixtures
    lookup_solvents = list(single_solvents)
    seen = set(single_solvents)
    for parts in mixture_parts.values():
        for part in parts:
            if part not in seen and part in SPANGE_DF.index:
                seen.add(part)
                lookup_solvents.append(part)

    lookup_table = build_feature_table(lookup_solvents, threshold=threshold)

    # The returned table keeps only the solvents that occur in data_df
    base_table = lookup_table.loc[single_solvents]

    unresolved = []
    mixture_features = []
    for mixture in mixture_solvents:
        parts = mixture_parts[mixture]
        component_features = [
            lookup_table.loc[part].values
            for part in parts
            if part in lookup_table.index
        ]
        if len(component_features) < len(parts):
            unresolved.append(mixture)

        if component_features:
            # Average the components
            avg_features = np.mean(component_features, axis=0)
        else:
            # Fallback to mean of all solvents
            avg_features = lookup_table.mean().values

        mixture_features.append(avg_features)

    if unresolved:
        print(f'Mixtures with unresolved components: {len(unresolved)}')

    # Create DataFrame for mixtures
    if mixture_features:
        mixture_df = pd.DataFrame(
            mixture_features,
            index=mixture_solvents,
            columns=base_table.columns
        )
        # Combine
        full_table = pd.concat([base_table, mixture_df])
    else:
        full_table = base_table

    return full_table

print('Building full feature table...')
full_feature_table = build_full_feature_table(FULL_DATA_DF, threshold=0.90)

In [None]:
# Evaluate EnsModel with leave-one-ramp-out CV on the full data
print('\nRunning Leave-One-Ramp-Out CV on full data...')
print('=' * 60)

# data_type='full' selects the full-data blend weights
full_cv_mae, full_cv_std, full_fold_errors = run_loro_cv(
    full_feature_table, FULL_DATA_DF, EnsModel, data_type='full'
)

_full_fold_strings = [f"{e:.6f}" for e in full_fold_errors]
print(f'\nFull Data CV MAE: {full_cv_mae:.6f} ± {full_cv_std:.6f}')
print(f'Fold errors: {_full_fold_strings}')
print(f'Min fold: {min(full_fold_errors):.6f}, Max fold: {max(full_fold_errors):.6f}')

In [None]:
# Combined CV score: the two MAEs averaged with weights equal to the number
# of rows in each dataset.
# (Original note: single solvent has 24 solvents, full data has 13 ramps.)
n_single = len(SINGLE_SOLVENT_DF)
n_full = len(FULL_DATA_DF)

_summed_error = cv_mae * n_single + full_cv_mae * n_full
weighted_cv = _summed_error / (n_single + n_full)

# Reference score that the improvement percentage is measured against
_BASELINE_CV = 0.008194

print('\n' + '=' * 60)
print('FINAL RESULTS')
print('=' * 60)
print(f'Single Solvent CV MAE: {cv_mae:.6f} ± {cv_std:.6f}')
print(f'Full Data CV MAE: {full_cv_mae:.6f} ± {full_cv_std:.6f}')
print(f'Weighted Combined CV MAE: {weighted_cv:.6f}')
print('\nBest baseline CV: 0.008194')
print(f'Improvement: {(_BASELINE_CV - weighted_cv) / _BASELINE_CV * 100:.2f}%')

In [None]:
# Make sure the submission directory exists. Nothing is written to it here:
# per the original notes, the submission is a notebook that follows the
# competition template, not a CSV.
import os

_SUBMISSION_DIR = '/home/submission'
os.makedirs(_SUBMISSION_DIR, exist_ok=True)

# Closing summary
print('Experiment complete!', f'\nFinal CV: {weighted_cv:.6f}', sep='\n')