# Loop 3 Analysis: DRFP Failure and Next Steps

## Key Findings from Evaluator
1. **PCA is wrong for sparse fingerprints** - DRFP is 97.4% sparse, PCA treats zeros as informative
2. **DRFP (with PCA) CV 0.017 vs Spange CV 0.011** - lower is better, so DRFP performed WORSE
3. **GNN benchmark used graph architecture, not just DRFP features**

## Research Insights
- Use Truncated SVD instead of PCA for sparse data
- Alternatively, use raw DRFP features with strong regularization
- Alternatively, combine DRFP features with Spange features
- Consider variance-based feature selection to drop low-variance DRFP features

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.feature_selection import VarianceThreshold
import warnings
warnings.filterwarnings('ignore')

# Directory that holds the lookup CSV files.
DATA_PATH = '/home/data'


def _read_lookup(filename):
    """Read one lookup CSV from DATA_PATH, using its first column as the index."""
    return pd.read_csv(f'{DATA_PATH}/{filename}', index_col=0)


# Load all feature sets
spange = _read_lookup('spange_descriptors_lookup.csv')
drfp = _read_lookup('drfps_catechol_lookup.csv')
acs_pca = _read_lookup('acs_pca_descriptors_lookup.csv')

# Report the (rows, columns) shape of every table.
print('Feature dimensions:')
for label, table in (('Spange', spange), ('DRFP', drfp), ('ACS PCA', acs_pca)):
    print(f'  {label}: {table.shape}')

# Fraction of DRFP entries that are exactly zero.
zero_fraction = (drfp.values == 0).mean()
print(f'\nDRFP sparsity: {zero_fraction:.2%}')

In [None]:
# Analyze DRFP feature variance
# ddof=0 gives the population variance, which is the quantity scikit-learn's
# VarianceThreshold compares against its threshold. pandas defaults to ddof=1,
# so without this the "variance > 0.01" counts below could disagree with what
# VarianceThreshold(threshold=0.01) actually keeps.
drfp_var = drfp.var(axis=0, ddof=0)
print('DRFP feature variance statistics:')
print(f'  Min: {drfp_var.min():.6f}')
print(f'  Max: {drfp_var.max():.6f}')
print(f'  Mean: {drfp_var.mean():.6f}')
print(f'  Median: {drfp_var.median():.6f}')

# How many features have non-zero variance?
nonzero_var = (drfp_var > 0).sum()
print(f'\nFeatures with non-zero variance: {nonzero_var} / {len(drfp_var)}')

# How many features have variance > 0.01?
high_var = (drfp_var > 0.01).sum()
print(f'Features with variance > 0.01: {high_var}')

# How many features have variance > 0.05?
very_high_var = (drfp_var > 0.05).sum()
print(f'Features with variance > 0.05: {very_high_var}')

In [None]:
# Compare PCA vs Truncated SVD on DRFP
from sklearn.decomposition import TruncatedSVD, PCA

# Cap the number of components by BOTH matrix dimensions: neither
# decomposition can extract more components than the smaller dimension allows.
# When there are at least as many features as rows this equals the previous
# min(50, n_rows - 1).
n_components = min(50, min(drfp.shape) - 1)

# PCA (what we used - problematic for sparse data)
# random_state pins the result whenever scikit-learn selects a randomized solver.
pca = PCA(n_components=n_components, random_state=42)
drfp_pca = pca.fit_transform(drfp.values)
print(f'PCA explained variance ratio (first 10): {pca.explained_variance_ratio_[:10].round(3)}')
print(f'PCA total explained variance: {pca.explained_variance_ratio_.sum():.3f}')

# Truncated SVD (better for sparse data)
# Its default solver is randomized, so seed it to make the comparison
# reproducible from run to run.
svd = TruncatedSVD(n_components=n_components, random_state=42)
drfp_svd = svd.fit_transform(drfp.values)
print(f'\nTruncated SVD explained variance ratio (first 10): {svd.explained_variance_ratio_[:10].round(3)}')
print(f'Truncated SVD total explained variance: {svd.explained_variance_ratio_.sum():.3f}')

In [None]:
# Variance-based feature selection on the raw DRFP matrix
from sklearn.feature_selection import VarianceThreshold

drfp_matrix = drfp.values

# threshold=0.0: drop only the features whose variance is zero
selector = VarianceThreshold(threshold=0.0)
selector.fit(drfp_matrix)
drfp_selected = selector.transform(drfp_matrix)
print(f'DRFP after removing zero-variance features: {drfp_selected.shape}')

# threshold=0.01: the more aggressive cut, also dropping low-variance features
selector_high = VarianceThreshold(threshold=0.01)
selector_high.fit(drfp_matrix)
drfp_high_var = selector_high.transform(drfp_matrix)
print(f'DRFP after removing low-variance features (threshold=0.01): {drfp_high_var.shape}')

In [None]:
# Summary statistics for the Spange descriptors (the feature set that works well)
print('Spange descriptor statistics:')
spange_summary = spange.describe().T
print(spange_summary[['mean', 'std', 'min', 'max']])

# Pairwise correlations between Spange features
spange_corr = spange.corr()
print(f'\nSpange feature correlations (high correlations > 0.8):')

# Walk the strict upper triangle so each unordered pair is visited once,
# in the same row-major order as a nested i < j loop.
feature_names = spange_corr.columns
corr_values = spange_corr.to_numpy()
row_idx, col_idx = np.triu_indices(len(feature_names), k=1)
high_corr = [
    (feature_names[r], feature_names[c], corr_values[r, c])
    for r, c in zip(row_idx, col_idx)
    if abs(corr_values[r, c]) > 0.8
]
for first, second, value in high_corr:
    print(f'  {first} - {second}: {value:.3f}')

In [None]:
# Idea: pair the Spange descriptors with only the high-variance DRFP columns,
# aiming for the best of both feature sets.

# Column labels of the DRFP features whose variance exceeds 0.01
high_var_mask = drfp_var > 0.01
high_var_indices = drfp_var.loc[high_var_mask].index.tolist()
print(f'Number of high-variance DRFP features: {len(high_var_indices)}')

# Restrict DRFP to those columns and report the resulting feature counts
drfp_subset = drfp[high_var_indices]
n_spange = spange.shape[1]
n_drfp = drfp_subset.shape[1]
print(f'\nCombined feature dimensions:')
print(f'  Spange: {n_spange}')
print(f'  DRFP (high-var): {n_drfp}')
print(f'  Total: {n_spange + n_drfp}')

# Compare the row labels (solvents) of the two tables
shared_solvents = set(spange.index).intersection(drfp.index)
print(f'\nSolvents in Spange: {len(spange.index)}')
print(f'Solvents in DRFP: {len(drfp.index)}')
print(f'Common solvents: {len(shared_solvents)}')

In [None]:
# Report the gap between local CV scores and leaderboard (LB) scores.
# The lines are collected first and emitted with a single print call.
gap_report = [
    'CV-LB Gap Analysis:',
    '=' * 50,
    'Submission 1 (MLP): CV 0.0111 -> LB 0.0982 (gap: 0.0871)',
    'Submission 2 (LightGBM): CV 0.0123 -> LB 0.1065 (gap: 0.0942)',
    '\nThe gap is MASSIVE - 8-9x difference!',
    '\nPossible explanations:',
    '1. Our local CV calculation differs from competition',
    '2. The competition uses different random seeds',
    '3. Model variance between runs',
    '4. Different library versions on Kaggle',
    '\nKey insight: Both MLP and LightGBM show similar gaps.',
    'This suggests the gap is NOT due to model variance.',
]
print('\n'.join(gap_report))

In [None]:
# Strategy summary: what worked, what did not, and candidate next steps.
# Each entry is printed on its own line, in order.
strategy_lines = (
    'STRATEGY ANALYSIS',
    '=' * 50,
    '\nWhat has worked:',
    '  - MLP with Spange + Arrhenius: LB 0.0982 (best)',
    '  - Chemical symmetry handling (TTA)',
    '\nWhat has NOT worked:',
    '  - LightGBM: LB 0.1065 (worse than MLP)',
    '  - DRFP with PCA: CV 0.017 (worse than Spange CV 0.011)',
    '\nWhat to try next:',
    '  1. DRFP without PCA - use raw features with regularization',
    '  2. DRFP with Truncated SVD instead of PCA',
    '  3. Combine Spange + DRFP (high-variance features only)',
    '  4. Hyperparameter tuning on MLP',
    '  5. Ensemble: average MLP predictions from different seeds',
    '\nTarget: 0.0333 (need 3x improvement from 0.0982)',
)
for strategy_line in strategy_lines:
    print(strategy_line)

In [None]:
# Final recommendation: the prioritised list of next experiments.
# Grouped by priority, then printed one entry per line.
priority_1 = [
    '\nPriority 1: Combine Spange + DRFP (high-variance)',
    '  - Spange (13 features) + DRFP high-var (~50-100 features)',
    '  - Use Truncated SVD on DRFP, not PCA',
    '  - Keep Arrhenius kinetics features',
    '  - Same MLP architecture',
]
priority_2 = [
    '\nPriority 2: Hyperparameter tuning on baseline MLP',
    '  - Try different hidden layer sizes',
    '  - Try different dropout rates',
    '  - Try more epochs',
    '  - Try different learning rates',
]
priority_3 = [
    '\nPriority 3: Ensemble multiple MLP models',
    '  - Train 10+ models with different seeds',
    '  - Average predictions',
    '  - This should reduce variance and improve LB',
]
print(
    'RECOMMENDED NEXT EXPERIMENTS',
    '=' * 50,
    *priority_1,
    *priority_2,
    *priority_3,
    sep='\n',
)