# Loop 1 Analysis: Understanding DRFP Features and Improvement Potential

Analyzing:
1. DRFP feature dimensions and sparsity
2. Potential for combining DRFP with Arrhenius features
3. Understanding per-fold MSE variance

In [None]:
import pandas as pd
import numpy as np

# Root directory holding every lookup CSV read by this notebook.
DATA_PATH = '/home/data'

# Read the four featurization lookup tables; in each file the first CSV
# column becomes the DataFrame index.
spange = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
acs_pca = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)
drfp = pd.read_csv(f'{DATA_PATH}/drfps_catechol_lookup.csv', index_col=0)
fragprints = pd.read_csv(f'{DATA_PATH}/fragprints_lookup.csv', index_col=0)

# Report the (rows, columns) shape of each table under its display label.
print('Feature dimensions:')
for label, table in (
    ('Spange descriptors', spange),
    ('ACS PCA descriptors', acs_pca),
    ('DRFP', drfp),
    ('Fragprints', fragprints),
):
    print(f'  {label}: {table.shape}')

In [None]:
# Summarize how sparse the DRFP lookup table is.
print('\nDRFP Analysis:')

drfp_matrix = drfp.values
nonzero_mask = drfp_matrix != 0          # True wherever an entry is not 0
nonzero_count = nonzero_mask.sum()
total_count = drfp_matrix.size
nonzero_per_row = nonzero_mask.sum(axis=1)  # one count per table row

print(f'  Non-zero values: {nonzero_count}')
print(f'  Total values: {total_count}')
# Sparsity is the fraction of entries that equal zero.
print(f'  Sparsity: {1 - nonzero_count / total_count:.2%}')
print(f'  Mean non-zero per solvent: {nonzero_per_row.mean():.1f}')
print(f'  Value range: [{drfp_matrix.min():.3f}, {drfp_matrix.max():.3f}]')

In [None]:
# Compare the index labels of the Spange and DRFP tables to see whether
# both featurizations cover the same entries.
spange_labels = set(spange.index)
drfp_labels = set(drfp.index)
shared_labels = spange_labels & drfp_labels
absent_from_drfp = spange_labels - drfp_labels  # in Spange but not in DRFP

print('\nSolvent coverage:')
print(f'  Spange solvents: {len(spange)}')
print(f'  DRFP solvents: {len(drfp)}')
print(f'  Common solvents: {len(shared_labels)}')
print(f'  Missing in DRFP: {absent_from_drfp}')

In [None]:
# Read the two yield datasets and print basic size / diversity figures.
full_data = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
single_data = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')

# Number of distinct values in the single-solvent name column.
single_solvent_count = single_data["SOLVENT NAME"].nunique()
# Distinct (solvent A, solvent B) name combinations in the full dataset.
solvent_pairs = full_data[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()

print('\nData statistics:')
print(f'  Single solvent samples: {len(single_data)}')
print(f'  Full data samples: {len(full_data)}')
print(f'  Unique solvents in single: {single_solvent_count}')
print(f'  Unique ramps in full: {solvent_pairs.shape[0]}')

In [None]:
# Print mean, standard deviation and min/max of each target column,
# first for the single-solvent dataset and then for the full dataset.
target_columns = ['SM', 'Product 2', 'Product 3']
datasets = (
    ('\nTarget statistics (Single Solvent):', single_data),
    ('\nTarget statistics (Full Data):', full_data),
)

for header, frame in datasets:
    print(header)
    for target in target_columns:
        values = frame[target]
        print(f'  {target}: mean={values.mean():.3f}, std={values.std():.3f}, range=[{values.min():.3f}, {values.max():.3f}]')