In [1]:
import pandas as pd
import numpy as np

# Read the two yield tables used throughout the notebook:
# the full dataset and the single-solvent dataset.
full_data = pd.read_csv('/home/data/catechol_full_data_yields.csv')
single_data = pd.read_csv('/home/data/catechol_single_solvent_yields.csv')

# Report the dimensions of each table first ...
for label, frame in (('Full data shape:', full_data),
                     ('Single solvent data shape:', single_data)):
    print(label, frame.shape)

# ... then list the column names of each (blank line before each listing).
for label, frame in (('\nFull data columns:', full_data),
                     ('\nSingle data columns:', single_data)):
    print(label, frame.columns.tolist())

Full data shape: (1227, 19)
Single solvent data shape: (656, 13)

Full data columns: ['EXP NUM', 'SOLVENT A NAME', 'SOLVENT B NAME', 'SolventB%', 'Residence Time', 'Temperature', 'SM', 'Product 2', 'Product 3', 'SM SMILES', 'Product 2 SMILES', 'Product 3 SMILES', 'SOLVENT A SMILES', 'SOLVENT B SMILES', 'SOLVENT A Ratio', 'SOLVENT B Ratio', 'Reaction SMILES A', 'Reaction SMILES B', 'RAMP NUM']

Single data columns: ['EXP NUM', 'Residence Time', 'Temperature', 'SM', 'Product 2', 'Product 3', 'SM SMILES', 'Product 2 SMILES', 'Product 3 SMILES', 'SOLVENT NAME', 'SOLVENT SMILES', 'SOLVENT Ratio', 'Reaction SMILES']


In [3]:
# Summarise the three target columns in both tables, then look at how the
# per-row totals of the targets are distributed in the full table.
target_cols = ['SM', 'Product 2', 'Product 3']
full_targets = full_data[target_cols]
single_targets = single_data[target_cols]

print('=== Target Statistics (Full Data) ===')
print(full_targets.describe())

print('\n=== Target Statistics (Single Solvent) ===')
print(single_targets.describe())

# Do SM + Product 2 + Product 3 add up to 1 on every row?
# The min/max/mean of the row totals answers that at a glance.
print('\n=== Row sums (Full Data) ===')
row_sums = full_targets.sum(axis=1)
print(f'Min: {row_sums.min():.4f}, Max: {row_sums.max():.4f}, Mean: {row_sums.mean():.4f}')

=== Target Statistics (Full Data) ===
                SM    Product 2    Product 3
count  1227.000000  1227.000000  1227.000000
mean      0.495178     0.164626     0.143668
std       0.379425     0.153467     0.145779
min       0.000000     0.000000     0.000000
25%       0.068573     0.012723     0.012260
50%       0.606454     0.117330     0.094413
75%       0.877448     0.308649     0.254630
max       1.083254     0.463632     0.533768

=== Target Statistics (Single Solvent) ===
               SM   Product 2   Product 3
count  656.000000  656.000000  656.000000
mean     0.522192    0.149932    0.123380
std      0.360229    0.143136    0.131528
min      0.000000    0.000000    0.000000
25%      0.145001    0.012976    0.009445
50%      0.656558    0.102813    0.078298
75%      0.857019    0.281654    0.193353
max      1.000000    0.463632    0.533768

=== Row sums (Full Data) ===
Min: 0.0112, Max: 1.1233, Mean: 0.8035


In [5]:
# Inspect the grouping structure of each table: rows per solvent in the
# single-solvent table, and distinct (solvent A, solvent B) pairs in the full table.
print('=== Single Solvent CV Structure ===')
print(f'Unique solvents: {single_data["SOLVENT NAME"].nunique()}')
rows_per_solvent = single_data['SOLVENT NAME'].value_counts()
print(rows_per_solvent)

print('\n=== Full Data CV Structure ===')
# One row per distinct solvent pair; the first occurrence of each pair is kept.
pair_cols = ['SOLVENT A NAME', 'SOLVENT B NAME']
ramps = full_data.loc[:, pair_cols].drop_duplicates()
print(f'Unique ramps (folds): {len(ramps)}')
print(ramps)

=== Single Solvent CV Structure ===
Unique solvents: 24
SOLVENT NAME
Acetonitrile                          59
2-Methyltetrahydrofuran [2-MeTHF]     58
Ethanol                               42
DMA [N,N-Dimethylacetamide]           41
Water.Acetonitrile                    37
2,2,2-Trifluoroethanol                37
1,1,1,3,3,3-Hexafluoropropan-2-ol     37
Methanol                              36
Cyclohexane                           34
Ethylene Glycol [1,2-Ethanediol]      22
Water.2,2,2-Trifluoroethanol          22
Acetonitrile.Acetic Acid              22
Diethyl Ether [Ether]                 22
THF [Tetrahydrofuran]                 21
Decanol                               20
Dihydrolevoglucosenone (Cyrene)       18
Methyl Propionate                     18
Ethyl Acetate                         18
Butanone [MEK]                        18
tert-Butanol [2-Methylpropan-2-ol]    18
Dimethyl Carbonate                    18
Ethyl Lactate                         17
MTBE [tert-Butylmethylether] 

In [6]:
# Describe the numeric process variables in each table.
# 'SolventB%' exists only in the full table, so the single-solvent summary omits it.
full_numeric_cols = ['Residence Time', 'Temperature', 'SolventB%']
single_numeric_cols = ['Residence Time', 'Temperature']

print('=== Numeric Features (Full Data) ===')
print(full_data.loc[:, full_numeric_cols].describe())

print('\n=== Numeric Features (Single Solvent) ===')
print(single_data.loc[:, single_numeric_cols].describe())

# Load the Spange descriptor lookup table, using the first CSV column as the index,
# and show its header, shape and first rows (one item per output line).
spange = pd.read_csv('/home/data/spange_descriptors_lookup.csv', index_col=0)
print('\n=== Spange Descriptors Shape ===', spange.shape, spange.head(), sep='\n')

=== Numeric Features (Full Data) ===
       Residence Time  Temperature    SolventB%
count     1227.000000  1227.000000  1227.000000
mean        13.726828   200.937287     0.435865
std          3.280033    23.118608     0.400336
min          2.001019   175.000000     0.000000
25%         14.988550   175.000000     0.000000
50%         15.000393   200.000000     0.330153
75%         15.017208   225.000000     0.858481
max         15.017208   225.000000     1.000000

=== Numeric Features (Single Solvent) ===
       Residence Time  Temperature
count      656.000000   656.000000
mean        12.621262   201.181480
std          4.184125    21.349418
min          2.001019   175.000000
25%         11.269690   175.000000
50%         15.017208   200.000000
75%         15.017208   225.000000
max         15.017208   225.000000

=== Spange Descriptors Shape ===
(26, 13)
                                   dielectric constant  ET(30)  alpha  beta  \
SOLVENT NAME                                       

In [7]:
# Peek at the remaining descriptor lookup tables: shape plus the first two rows of each.
import os

lookup_files = (
    'acs_pca_descriptors_lookup.csv',
    'drfps_catechol_lookup.csv',
    'fragprints_lookup.csv',
)

for f in lookup_files:
    # The first CSV column is used as the index of each lookup table.
    df = pd.read_csv(f'/home/data/{f}', index_col=0)
    # Single call emitting: summary line, two-row preview, then a blank separator line.
    print(f'{f}: shape={df.shape}', df.head(2), '', sep='\n')

acs_pca_descriptors_lookup.csv: shape=(24, 5)
                                       PC1      PC2      PC3      PC4  \
SOLVENT NAME                                                            
Methanol                          -8.72651 -5.31265  1.25112 -1.85931   
Ethylene Glycol [1,2-Ethanediol] -10.71010 -1.88606  4.37327  1.45192   

                                       PC5  
SOLVENT NAME                                
Methanol                         -0.620266  
Ethylene Glycol [1,2-Ethanediol] -1.034370  

drfps_catechol_lookup.csv: shape=(24, 2048)
                                    0    1    2    3    4    5    6    7    8  \
SOLVENT NAME                                                                    
Methanol                          0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
Ethylene Glycol [1,2-Ethanediol]  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   

                                    9  ...  2038  2039  2040  2041  2042  \
SOLVENT NAME                       