In [1]:
import pandas as pd
import numpy as np

# Read the two catechol yield tables from CSV into DataFrames:
#   full_df   - the "full data" table
#   single_df - the single-solvent table
full_df = pd.read_csv(filepath_or_buffer='/home/data/catechol_full_data_yields.csv')
single_df = pd.read_csv(filepath_or_buffer='/home/data/catechol_single_solvent_yields.csv')

# First report the (rows, columns) shape of each table ...
for shape_label, frame in (
    ("Full data shape:", full_df),
    ("Single solvent data shape:", single_df),
):
    print(shape_label, frame.shape)

# ... then list the column names of each table. The leading "\n" in each
# label emits a blank line before the listing.
for columns_label, frame in (
    ("\nFull data columns:", full_df),
    ("\nSingle solvent columns:", single_df),
):
    print(columns_label, list(frame.columns))

Full data shape: (1227, 19)
Single solvent data shape: (656, 13)

Full data columns: ['EXP NUM', 'SOLVENT A NAME', 'SOLVENT B NAME', 'SolventB%', 'Residence Time', 'Temperature', 'SM', 'Product 2', 'Product 3', 'SM SMILES', 'Product 2 SMILES', 'Product 3 SMILES', 'SOLVENT A SMILES', 'SOLVENT B SMILES', 'SOLVENT A Ratio', 'SOLVENT B Ratio', 'Reaction SMILES A', 'Reaction SMILES B', 'RAMP NUM']

Single solvent columns: ['EXP NUM', 'Residence Time', 'Temperature', 'SM', 'Product 2', 'Product 3', 'SM SMILES', 'Product 2 SMILES', 'Product 3 SMILES', 'SOLVENT NAME', 'SOLVENT SMILES', 'SOLVENT Ratio', 'Reaction SMILES']


In [2]:
# Summarise the three yield columns and the solvent columns of both tables.
# Relies on `single_df` and `full_df` having been loaded by an earlier cell.
target_cols = ['SM', 'Product 2', 'Product 3']

# --- Single-solvent table ---------------------------------------------------
solvent_names = single_df['SOLVENT NAME']
print("=== Single Solvent Data ===")
print(single_df[target_cols].describe())
print("\nNumber of unique solvents:", solvent_names.nunique())
print("Solvents:", solvent_names.unique())

# --- Full (mixture) table ---------------------------------------------------
solvent_a_names = full_df['SOLVENT A NAME']
solvent_b_names = full_df['SOLVENT B NAME']
solvent_b_pct = full_df['SolventB%']
print("\n=== Full Data (Mixtures) ===")
print(full_df[target_cols].describe())
print("\nNumber of unique solvent A:", solvent_a_names.nunique())
print("Number of unique solvent B:", solvent_b_names.nunique())
# Smallest and largest value found in the 'SolventB%' column.
print("\nSolventB% range:", solvent_b_pct.min(), "-", solvent_b_pct.max())

=== Single Solvent Data ===
               SM   Product 2   Product 3
count  656.000000  656.000000  656.000000
mean     0.522192    0.149932    0.123380
std      0.360229    0.143136    0.131528
min      0.000000    0.000000    0.000000
25%      0.145001    0.012976    0.009445
50%      0.656558    0.102813    0.078298
75%      0.857019    0.281654    0.193353
max      1.000000    0.463632    0.533768

Number of unique solvents: 24
Solvents: ['Methanol' 'Ethylene Glycol [1,2-Ethanediol]'
 '1,1,1,3,3,3-Hexafluoropropan-2-ol' '2-Methyltetrahydrofuran [2-MeTHF]'
 'Cyclohexane' 'IPA [Propan-2-ol]' 'Water.Acetonitrile' 'Acetonitrile'
 'Acetonitrile.Acetic Acid' 'Diethyl Ether [Ether]'
 '2,2,2-Trifluoroethanol' 'Water.2,2,2-Trifluoroethanol'
 'DMA [N,N-Dimethylacetamide]' 'Decanol' 'Ethanol' 'THF [Tetrahydrofuran]'
 'Dihydrolevoglucosenone (Cyrene)' 'Ethyl Acetate'
 'MTBE [tert-Butylmethylether]' 'Butanone [MEK]'
 'tert-Butanol [2-Methylpropan-2-ol]' 'Dimethyl Carbonate'
 'Methyl Propionate

In [4]:
# Load the four descriptor lookup tables. `index_col=0` makes the first CSV
# column the row index of each frame.
spange = pd.read_csv(
    filepath_or_buffer='/home/data/spange_descriptors_lookup.csv', index_col=0
)
acs = pd.read_csv(
    filepath_or_buffer='/home/data/acs_pca_descriptors_lookup.csv', index_col=0
)
drfps = pd.read_csv(
    filepath_or_buffer='/home/data/drfps_catechol_lookup.csv', index_col=0
)
fragprints = pd.read_csv(
    filepath_or_buffer='/home/data/fragprints_lookup.csv', index_col=0
)

# Spange table: its shape plus at most the first ten column names; the
# trailing "..." is a literal marker printed after the list.
first_spange_columns = spange.columns[:10].tolist()
print("Spange descriptors shape:", spange.shape)
print("Spange columns:", first_spange_columns, "...")

# Shapes of the remaining three lookup tables.
print("\nACS PCA descriptors shape:", acs.shape)
print("DRFPS shape:", drfps.shape)
print("Fragprints shape:", fragprints.shape)

# Preview the first five rows of the Spange descriptors.
print("\nSpange descriptors sample:")
print(spange.head(n=5))

Spange descriptors shape: (26, 13)
Spange columns: ['dielectric constant', 'ET(30)', 'alpha', 'beta', 'pi*', 'SA', 'SB', 'SP', 'SdP', 'N'] ...

ACS PCA descriptors shape: (24, 5)
DRFPS shape: (24, 2048)
Fragprints shape: (24, 2133)

Spange descriptors sample:
                                   dielectric constant  ET(30)  alpha  beta  \
SOLVENT NAME                                                                  
Cyclohexane                                       2.02    30.9   0.00  0.00   
Ethyl Acetate                                     6.02    38.1   0.00  0.45   
Acetic Acid                                       6.15    51.7   1.12  0.45   
2-Methyltetrahydrofuran [2-MeTHF]                 7.58    36.5   0.00  0.45   
1,1,1,3,3,3-Hexafluoropropan-2-ol                16.70    62.1   1.96  0.00   

                                    pi*     SA     SB     SP    SdP        N  \
SOLVENT NAME                                                                   
Cyclohexane               