In [1]:
import pandas as pd
import numpy as np

# Read the two yield tables from disk: the "full" table (which carries
# solvent A / solvent B columns) and the single-solvent table.
full_data = pd.read_csv('/home/data/catechol_full_data_yields.csv')
single_data = pd.read_csv('/home/data/catechol_single_solvent_yields.csv')

# Table dimensions as (rows, columns), spelled out from the two axes.
print("Full data shape:", (len(full_data.index), len(full_data.columns)))
print("Single solvent data shape:", (len(single_data.index), len(single_data.columns)))

# Column names; sep="\n" puts each list on the line below its heading.
print("\nFull data columns:", full_data.columns.to_list(), sep="\n")
print("\nSingle solvent columns:", single_data.columns.to_list(), sep="\n")

Full data shape: (1227, 19)
Single solvent data shape: (656, 13)

Full data columns:
['EXP NUM', 'SOLVENT A NAME', 'SOLVENT B NAME', 'SolventB%', 'Residence Time', 'Temperature', 'SM', 'Product 2', 'Product 3', 'SM SMILES', 'Product 2 SMILES', 'Product 3 SMILES', 'SOLVENT A SMILES', 'SOLVENT B SMILES', 'SOLVENT A Ratio', 'SOLVENT B Ratio', 'Reaction SMILES A', 'Reaction SMILES B', 'RAMP NUM']

Single solvent columns:
['EXP NUM', 'Residence Time', 'Temperature', 'SM', 'Product 2', 'Product 3', 'SM SMILES', 'Product 2 SMILES', 'Product 3 SMILES', 'SOLVENT NAME', 'SOLVENT SMILES', 'SOLVENT Ratio', 'Reaction SMILES']


In [2]:
# Summary statistics for the three target columns of the single-solvent table.
print(
    "Target statistics (Single Solvent):",
    single_data[['SM', 'Product 2', 'Product 3']].describe(),
    sep="\n",
)

# Count of distinct solvent names, followed by the names themselves.
solvent_names = single_data['SOLVENT NAME']
print("\n\nUnique solvents in single solvent data:", solvent_names.nunique())
print(solvent_names.unique())

# Distinct names appearing in each of the two solvent columns of the full table.
print("\n\nUnique solvent pairs in full data:")
print("Solvent A:", full_data.loc[:, 'SOLVENT A NAME'].nunique())
print("Solvent B:", full_data.loc[:, 'SOLVENT B NAME'].nunique())

# Number of distinct (solvent A, solvent B) name combinations.
# NOTE(review): this is printed as "Unique ramps" but is derived from the name
# pairs only; the 'RAMP NUM' column is not consulted -- confirm the label.
solvent_pairs = full_data[['SOLVENT A NAME', 'SOLVENT B NAME']].drop_duplicates()
print("\nUnique ramps:", len(solvent_pairs))

Target statistics (Single Solvent):
               SM   Product 2   Product 3
count  656.000000  656.000000  656.000000
mean     0.522192    0.149932    0.123380
std      0.360229    0.143136    0.131528
min      0.000000    0.000000    0.000000
25%      0.145001    0.012976    0.009445
50%      0.656558    0.102813    0.078298
75%      0.857019    0.281654    0.193353
max      1.000000    0.463632    0.533768


Unique solvents in single solvent data: 24
['Methanol' 'Ethylene Glycol [1,2-Ethanediol]'
 '1,1,1,3,3,3-Hexafluoropropan-2-ol' '2-Methyltetrahydrofuran [2-MeTHF]'
 'Cyclohexane' 'IPA [Propan-2-ol]' 'Water.Acetonitrile' 'Acetonitrile'
 'Acetonitrile.Acetic Acid' 'Diethyl Ether [Ether]'
 '2,2,2-Trifluoroethanol' 'Water.2,2,2-Trifluoroethanol'
 'DMA [N,N-Dimethylacetamide]' 'Decanol' 'Ethanol' 'THF [Tetrahydrofuran]'
 'Dihydrolevoglucosenone (Cyrene)' 'Ethyl Acetate'
 'MTBE [tert-Butylmethylether]' 'Butanone [MEK]'
 'tert-Butanol [2-Methylpropan-2-ol]' 'Dimethyl Carbonate'
 'Methy

In [3]:
# Load the four descriptor lookup tables; index_col=0 makes the first CSV
# column the row index of each table.
spange = pd.read_csv('/home/data/spange_descriptors_lookup.csv', index_col=0)
acs_pca = pd.read_csv('/home/data/acs_pca_descriptors_lookup.csv', index_col=0)
drfps = pd.read_csv('/home/data/drfps_catechol_lookup.csv', index_col=0)
fragprints = pd.read_csv('/home/data/fragprints_lookup.csv', index_col=0)

# Report every table's (rows, columns), in the order they were loaded.
for lookup_label, lookup_table in (
    ("Spange descriptors shape:", spange),
    ("ACS PCA descriptors shape:", acs_pca),
    ("DRFP shape:", drfps),
    ("Fragprints shape:", fragprints),
):
    print(lookup_label, lookup_table.shape)

# Column names of the Spange table.
print("\nSpange columns:", spange.columns.to_list())

Spange descriptors shape: (26, 13)
ACS PCA descriptors shape: (24, 5)
DRFP shape: (24, 2048)
Fragprints shape: (24, 2133)

Spange columns: ['dielectric constant', 'ET(30)', 'alpha', 'beta', 'pi*', 'SA', 'SB', 'SP', 'SdP', 'N', 'n', 'f(n)', 'delta']


In [4]:
# Print "min - max" for each process variable. Each entry is
# (heading, table, column); the leading "\n" on the first full-data heading
# separates the single-solvent lines from the full-data lines.
for range_label, table, column in (
    ("Temperature range (single):", single_data, 'Temperature'),
    ("Residence Time range (single):", single_data, 'Residence Time'),
    ("\nTemperature range (full):", full_data, 'Temperature'),
    ("Residence Time range (full):", full_data, 'Residence Time'),
    ("SolventB% range:", full_data, 'SolventB%'),
):
    values = table[column]
    print(range_label, values.min(), "-", values.max())

# Row-wise total of the three target columns, summarised with describe() to
# see whether the targets add up to 1.
target_total = (
    single_data['SM']
    .add(single_data['Product 2'])
    .add(single_data['Product 3'])
)
print("\n\nTarget sum statistics (single):", target_total.describe(), sep="\n")

Temperature range (single): 175.0 - 225.0
Residence Time range (single): 2.001019108286073 - 15.017208412882612

Temperature range (full): 175.0 - 225.0
Residence Time range (full): 2.001019108286073 - 15.017208412882612
SolventB% range: 0.0 - 1.0


Target sum statistics (single):
count    656.000000
mean       0.795504
std        0.194306
min        0.028752
25%        0.708417
50%        0.849648
75%        0.927955
max        1.000000
dtype: float64
