In [1]:
import pandas as pd
import numpy as np

# Read the two yield tables from disk: the full table and the single-solvent table.
full_data = pd.read_csv('/home/data/catechol_full_data_yields.csv')
single_data = pd.read_csv('/home/data/catechol_single_solvent_yields.csv')

# Report (rows, columns) for each table first ...
for label, frame in (
    ('Full data shape:', full_data),
    ('Single solvent data shape:', single_data),
):
    print(label, frame.shape)

# ... then the column names, each preceded by a blank line.
for label, frame in (
    ('\nFull data columns:', full_data),
    ('\nSingle data columns:', single_data),
):
    print(label, frame.columns.tolist())

Full data shape: (1227, 19)
Single solvent data shape: (656, 13)

Full data columns: ['EXP NUM', 'SOLVENT A NAME', 'SOLVENT B NAME', 'SolventB%', 'Residence Time', 'Temperature', 'SM', 'Product 2', 'Product 3', 'SM SMILES', 'Product 2 SMILES', 'Product 3 SMILES', 'SOLVENT A SMILES', 'SOLVENT B SMILES', 'SOLVENT A Ratio', 'SOLVENT B Ratio', 'Reaction SMILES A', 'Reaction SMILES B', 'RAMP NUM']

Single data columns: ['EXP NUM', 'Residence Time', 'Temperature', 'SM', 'Product 2', 'Product 3', 'SM SMILES', 'Product 2 SMILES', 'Product 3 SMILES', 'SOLVENT NAME', 'SOLVENT SMILES', 'SOLVENT Ratio', 'Reaction SMILES']


In [2]:
# Summarise the three target columns in each table.
target_columns = ['SM', 'Product 2', 'Product 3']
for heading, table in (
    ('=== Target Statistics (Full Data) ===', full_data),
    ('\n=== Target Statistics (Single Solvent) ===', single_data),
):
    print(heading)
    print(table[target_columns].describe())

# Count the distinct solvent names found in each solvent-name column.
print('\n=== Unique solvents ===')
for caption, table, column in (
    ('Single solvent unique:', single_data, 'SOLVENT NAME'),
    ('Full data - Solvent A unique:', full_data, 'SOLVENT A NAME'),
    ('Full data - Solvent B unique:', full_data, 'SOLVENT B NAME'),
):
    print(caption, table[column].nunique())

# A "ramp" is counted here as one distinct (solvent A, solvent B) name pair
# in the full table; `ramps` holds one row per such pair.
print('\n=== Unique solvent pairs (ramps) ===')
pair_columns = ['SOLVENT A NAME', 'SOLVENT B NAME']
ramps = full_data[pair_columns].drop_duplicates()
print('Number of unique ramps:', len(ramps))

=== Target Statistics (Full Data) ===
                SM    Product 2    Product 3
count  1227.000000  1227.000000  1227.000000
mean      0.495178     0.164626     0.143668
std       0.379425     0.153467     0.145779
min       0.000000     0.000000     0.000000
25%       0.068573     0.012723     0.012260
50%       0.606454     0.117330     0.094413
75%       0.877448     0.308649     0.254630
max       1.083254     0.463632     0.533768

=== Target Statistics (Single Solvent) ===
               SM   Product 2   Product 3
count  656.000000  656.000000  656.000000
mean     0.522192    0.149932    0.123380
std      0.360229    0.143136    0.131528
min      0.000000    0.000000    0.000000
25%      0.145001    0.012976    0.009445
50%      0.656558    0.102813    0.078298
75%      0.857019    0.281654    0.193353
max      1.000000    0.463632    0.533768

=== Unique solvents ===
Single solvent unique: 24
Full data - Solvent A unique: 13
Full data - Solvent B unique: 13

=== Unique solven

In [3]:
# Load the four descriptor lookup tables. The first CSV column is used as the
# row index (for the Spange table this index holds solvent names).
spange = pd.read_csv('/home/data/spange_descriptors_lookup.csv', index_col=0)
acs_pca = pd.read_csv('/home/data/acs_pca_descriptors_lookup.csv', index_col=0)
drfps = pd.read_csv('/home/data/drfps_catechol_lookup.csv', index_col=0)
fragprints = pd.read_csv('/home/data/fragprints_lookup.csv', index_col=0)

# Report (rows, columns) for each lookup table.
print('Spange descriptors shape:', spange.shape)
print('ACS PCA descriptors shape:', acs_pca.shape)
print('DRFPS shape:', drfps.shape)
print('Fragprints shape:', fragprints.shape)

# Preview the first ten Spange column names and index labels.
# The leading '\n' is a real newline (it was previously written as '\\n',
# which printed a literal backslash-n instead of a blank separator line).
print('\nSpange columns:', spange.columns.tolist()[:10])
print('\nSpange index (solvents):', spange.index.tolist()[:10])

Spange descriptors shape: (26, 13)
ACS PCA descriptors shape: (24, 5)
DRFPS shape: (24, 2048)
Fragprints shape: (24, 2133)
\nSpange columns: ['dielectric constant', 'ET(30)', 'alpha', 'beta', 'pi*', 'SA', 'SB', 'SP', 'SdP', 'N']
\nSpange index (solvents): ['Cyclohexane', 'Ethyl Acetate', 'Acetic Acid', '2-Methyltetrahydrofuran [2-MeTHF]', '1,1,1,3,3,3-Hexafluoropropan-2-ol', 'IPA [Propan-2-ol]', 'Ethanol', 'Methanol', 'Ethylene Glycol [1,2-Ethanediol]', 'Acetonitrile']


In [4]:
# Summarise the process-condition columns of each table.
print('=== Input Ranges (Full Data) ===')
print(full_data[['Residence Time', 'Temperature', 'SolventB%']].describe())

# The leading '\n' is a real newline (it was previously written as '\\n',
# which printed a literal backslash-n instead of a blank separator line).
print('\n=== Input Ranges (Single Solvent) ===')
print(single_data[['Residence Time', 'Temperature']].describe())

# Check whether the three target columns add up to 1 for each row of the
# full table by summarising their row-wise sum.
print('\n=== Target Sum Check (Full Data) ===')
target_sum = full_data['SM'] + full_data['Product 2'] + full_data['Product 3']
# Print the label on its own line so the first row of the describe() output
# stays aligned with the remaining rows.
print('Target sum stats:')
print(target_sum.describe())

=== Input Ranges (Full Data) ===
       Residence Time  Temperature    SolventB%
count     1227.000000  1227.000000  1227.000000
mean        13.726828   200.937287     0.435865
std          3.280033    23.118608     0.400336
min          2.001019   175.000000     0.000000
25%         14.988550   175.000000     0.000000
50%         15.000393   200.000000     0.330153
75%         15.017208   225.000000     0.858481
max         15.017208   225.000000     1.000000
\n=== Input Ranges (Single Solvent) ===
       Residence Time  Temperature
count      656.000000   656.000000
mean        12.621262   201.181480
std          4.184125    21.349418
min          2.001019   175.000000
25%         11.269690   175.000000
50%         15.017208   200.000000
75%         15.017208   225.000000
max         15.017208   225.000000
\n=== Target Sum Check (Full Data) ===
Target sum stats: count    1227.000000
mean        0.803472
std         0.209172
min         0.011194
25%         0.737871
50%         0.8721