# Loop 1 Analysis: Understanding DRFP Features and Improvement Potential

Analyzing:
1. DRFP feature dimensions and sparsity
2. Potential for combining DRFP with Arrhenius features
3. Understanding per-fold MSE variance

In [1]:
import os

import numpy as np
import pandas as pd

# Directory containing the lookup tables and yield CSVs. The default keeps the
# original location; set CATECHOL_DATA_PATH to run the notebook elsewhere.
DATA_PATH = os.environ.get('CATECHOL_DATA_PATH', '/home/data')


def load_lookup(filename):
    """Read one feature lookup CSV from ``DATA_PATH``.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``DATA_PATH``.

    Returns
    -------
    pandas.DataFrame
        The table, with the file's first column used as the row index.
    """
    return pd.read_csv(f'{DATA_PATH}/{filename}', index_col=0)


# Load all feature types
spange = load_lookup('spange_descriptors_lookup.csv')
acs_pca = load_lookup('acs_pca_descriptors_lookup.csv')
drfp = load_lookup('drfps_catechol_lookup.csv')
fragprints = load_lookup('fragprints_lookup.csv')

# Report (rows, columns) for each table so the feature widths can be compared
print('Feature dimensions:')
for label, table in [
    ('Spange descriptors', spange),
    ('ACS PCA descriptors', acs_pca),
    ('DRFP', drfp),
    ('Fragprints', fragprints),
]:
    print(f'  {label}: {table.shape}')

Feature dimensions:
  Spange descriptors: (26, 13)
  ACS PCA descriptors: (24, 5)
  DRFP: (24, 2048)
  Fragprints: (24, 2133)


In [2]:
# Summarise how sparse the DRFP lookup table is
print('\nDRFP Analysis:')

drfp_values = drfp.values
nonzero_mask = drfp_values != 0          # True wherever an entry is non-zero
n_nonzero = nonzero_mask.sum()
n_total = drfp_values.size
nonzero_per_row = nonzero_mask.sum(axis=1)  # non-zero count for each table row

print(f'  Non-zero values: {n_nonzero}')
print(f'  Total values: {n_total}')
print(f'  Sparsity: {1 - n_nonzero / n_total:.2%}')
print(f'  Mean non-zero per solvent: {nonzero_per_row.mean():.1f}')
print(f'  Value range: [{drfp_values.min():.3f}, {drfp_values.max():.3f}]')


DRFP Analysis:
  Non-zero values: 1261
  Total values: 49152
  Sparsity: 97.43%
  Mean non-zero per solvent: 52.5
  Value range: [0.000, 1.000]


In [3]:
# Compare the row index of the Spange table with that of the DRFP table
spange_solvents = set(spange.index)
drfp_solvents = set(drfp.index)
shared_solvents = spange_solvents & drfp_solvents
spange_only = spange_solvents - drfp_solvents  # in Spange but absent from DRFP

print('\nSolvent coverage:')
print(f'  Spange solvents: {len(spange)}')
print(f'  DRFP solvents: {len(drfp)}')
print(f'  Common solvents: {len(shared_solvents)}')
print(f'  Missing in DRFP: {spange_only}')


Solvent coverage:
  Spange solvents: 26
  DRFP solvents: 24
  Common solvents: 24
  Missing in DRFP: {'Water', 'Acetic Acid'}


In [4]:
# Read the yield tables: the "full" table (solvent A / solvent B columns) and
# the single-solvent table
full_data = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
single_data = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')

# Distinct solvents in the single-solvent table, and distinct (A, B) solvent
# pairs ("ramps") in the full table
n_single_solvents = single_data["SOLVENT NAME"].nunique()
ramp_pairs = full_data[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()

print('\nData statistics:')
print(f'  Single solvent samples: {len(single_data)}')
print(f'  Full data samples: {len(full_data)}')
print(f'  Unique solvents in single: {n_single_solvents}')
print(f'  Unique ramps in full: {len(ramp_pairs)}')


Data statistics:
  Single solvent samples: 656
  Full data samples: 1227
  Unique solvents in single: 24
  Unique ramps in full: 13


In [5]:
# Target columns summarised for both datasets
TARGET_COLUMNS = ('SM', 'Product 2', 'Product 3')


def print_target_stats(title, data, columns=TARGET_COLUMNS):
    """Print mean, standard deviation and range for each target column.

    Parameters
    ----------
    title : str
        Dataset label shown in the section heading.
    data : pandas.DataFrame
        Table containing every column named in ``columns``.
    columns : iterable of str, optional
        Columns to summarise; defaults to ``TARGET_COLUMNS``.
    """
    print(f'\nTarget statistics ({title}):')
    for col in columns:
        values = data[col]
        print(f'  {col}: mean={values.mean():.3f}, std={values.std():.3f}, range=[{values.min():.3f}, {values.max():.3f}]')


# Analyze target distributions (same summary for both datasets)
print_target_stats('Single Solvent', single_data)
print_target_stats('Full Data', full_data)


Target statistics (Single Solvent):
  SM: mean=0.522, std=0.360, range=[0.000, 1.000]
  Product 2: mean=0.150, std=0.143, range=[0.000, 0.464]
  Product 3: mean=0.123, std=0.132, range=[0.000, 0.534]

Target statistics (Full Data):
  SM: mean=0.495, std=0.379, range=[0.000, 1.083]
  Product 2: mean=0.165, std=0.153, range=[0.000, 0.464]
  Product 3: mean=0.144, std=0.146, range=[0.000, 0.534]


In [6]:
# Collect every solvent name that appears in the yield tables
single_solvents = set(single_data['SOLVENT NAME'].unique())
full_solvents_a = set(full_data['SOLVENT A NAME'].unique())
full_solvents_b = set(full_data['SOLVENT B NAME'].unique())
all_full_solvents = full_solvents_a | full_solvents_b

# Solvents used in either table whose name is not in the DRFP index
used_solvents = single_solvents | all_full_solvents
missing_drfp = used_solvents - set(drfp.index)

print('Solvents in single solvent data:')
print(f'  {single_solvents}')

print('\nSolvents in full data:')
print(f'  A: {full_solvents_a}')
print(f'  B: {full_solvents_b}')

print('\nSolvents missing DRFP features:')
print(f'  {missing_drfp}')

Solvents in single solvent data:
  {'MTBE [tert-Butylmethylether]', 'Water.Acetonitrile', 'Ethanol', 'Acetonitrile', 'Butanone [MEK]', '2,2,2-Trifluoroethanol', 'Cyclohexane', 'DMA [N,N-Dimethylacetamide]', 'Methanol', '1,1,1,3,3,3-Hexafluoropropan-2-ol', 'Dihydrolevoglucosenone (Cyrene)', 'Acetonitrile.Acetic Acid', '2-Methyltetrahydrofuran [2-MeTHF]', 'Ethyl Acetate', 'Dimethyl Carbonate', 'Water.2,2,2-Trifluoroethanol', 'THF [Tetrahydrofuran]', 'Methyl Propionate', 'Ethyl Lactate', 'Diethyl Ether [Ether]', 'Ethylene Glycol [1,2-Ethanediol]', 'Decanol', 'tert-Butanol [2-Methylpropan-2-ol]', 'IPA [Propan-2-ol]'}

Solvents in full data:
  A: {'MTBE [tert-Butylmethylether]', 'tert-Butanol [2-Methylpropan-2-ol]', 'Methanol', 'Water.Acetonitrile', '1,1,1,3,3,3-Hexafluoropropan-2-ol', 'Methyl Propionate', 'Dihydrolevoglucosenone (Cyrene)', '2-Methyltetrahydrofuran [2-MeTHF]', 'Ethanol', 'Acetonitrile', '2,2,2-Trifluoroethanol', 'Cyclohexane', 'DMA [N,N-Dimethylacetamide]'}
  B: {'Decanol',