In [1]:
import pandas as pd
import numpy as np

# Read the two yield tables from disk: the full dataset and the
# single-solvent dataset.
df_full = pd.read_csv('/home/data/catechol_full_data_yields.csv')
df_single = pd.read_csv('/home/data/catechol_single_solvent_yields.csv')

# First report (rows, columns) for each table ...
shape_report = (
    ("Full data shape:", df_full),
    ("Single solvent data shape:", df_single),
)
for caption, frame in shape_report:
    print(caption, frame.shape)

# ... then list the column names of each table, one blank line before each.
column_report = (
    ("\nFull data columns:", df_full),
    ("\nSingle solvent columns:", df_single),
)
for caption, frame in column_report:
    print(caption, frame.columns.tolist())

Full data shape: (1227, 19)
Single solvent data shape: (656, 13)

Full data columns: ['EXP NUM', 'SOLVENT A NAME', 'SOLVENT B NAME', 'SolventB%', 'Residence Time', 'Temperature', 'SM', 'Product 2', 'Product 3', 'SM SMILES', 'Product 2 SMILES', 'Product 3 SMILES', 'SOLVENT A SMILES', 'SOLVENT B SMILES', 'SOLVENT A Ratio', 'SOLVENT B Ratio', 'Reaction SMILES A', 'Reaction SMILES B', 'RAMP NUM']

Single solvent columns: ['EXP NUM', 'Residence Time', 'Temperature', 'SM', 'Product 2', 'Product 3', 'SM SMILES', 'Product 2 SMILES', 'Product 3 SMILES', 'SOLVENT NAME', 'SOLVENT SMILES', 'SOLVENT Ratio', 'Reaction SMILES']


In [2]:
# Summarise the three target columns, count the distinct solvent names in each
# table, and print the min/max of the numeric process features.
target_columns = ['SM', 'Product 2', 'Product 3']
print("=== Target Statistics (Full Data) ===")
print(df_full[target_columns].describe())

print("\n=== Unique solvents ===")
# (caption, source table, name column); each column is looked up only when
# its own line is printed.
solvent_name_sources = (
    ("Single solvent unique:", df_single, 'SOLVENT NAME'),
    ("Full data - Solvent A unique:", df_full, 'SOLVENT A NAME'),
    ("Full data - Solvent B unique:", df_full, 'SOLVENT B NAME'),
)
for caption, frame, name_column in solvent_name_sources:
    print(caption, frame[name_column].nunique())

print("\n=== Numeric feature ranges ===")
numeric_features = (
    ("Residence Time range:", 'Residence Time'),
    ("Temperature range:", 'Temperature'),
    ("SolventB% range:", 'SolventB%'),
)
for caption, feature in numeric_features:
    values = df_full[feature]
    print(caption, values.min(), "-", values.max())

=== Target Statistics (Full Data) ===
                SM    Product 2    Product 3
count  1227.000000  1227.000000  1227.000000
mean      0.495178     0.164626     0.143668
std       0.379425     0.153467     0.145779
min       0.000000     0.000000     0.000000
25%       0.068573     0.012723     0.012260
50%       0.606454     0.117330     0.094413
75%       0.877448     0.308649     0.254630
max       1.083254     0.463632     0.533768

=== Unique solvents ===
Single solvent unique: 24
Full data - Solvent A unique: 13
Full data - Solvent B unique: 13

=== Numeric feature ranges ===
Residence Time range: 2.001019108286073 - 15.017208412882612
Temperature range: 175.0 - 225.0
SolventB% range: 0.0 - 1.0


In [3]:
# Load the four descriptor lookup tables. In each file the first CSV column
# is used as the row index (index_col=0).
spange = pd.read_csv('/home/data/spange_descriptors_lookup.csv', index_col=0)
acs = pd.read_csv('/home/data/acs_pca_descriptors_lookup.csv', index_col=0)
drfps = pd.read_csv('/home/data/drfps_catechol_lookup.csv', index_col=0)
fragprints = pd.read_csv('/home/data/fragprints_lookup.csv', index_col=0)

# Report (rows, columns) of every lookup table.
lookup_report = (
    ("Spange descriptors shape:", spange),
    ("ACS PCA descriptors shape:", acs),
    ("DRFPS shape:", drfps),
    ("Fragprints shape:", fragprints),
)
for caption, lookup in lookup_report:
    print(caption, lookup.shape)

# Look at the Spange table in more detail: column names, then the first rows.
spange_columns = spange.columns.tolist()
print("\nSpange columns:", spange_columns)
print("\nSpange head:")
spange_preview = spange.head()
print(spange_preview)

Spange descriptors shape: (26, 13)
ACS PCA descriptors shape: (24, 5)
DRFPS shape: (24, 2048)
Fragprints shape: (24, 2133)

Spange columns: ['dielectric constant', 'ET(30)', 'alpha', 'beta', 'pi*', 'SA', 'SB', 'SP', 'SdP', 'N', 'n', 'f(n)', 'delta']

Spange head:
                                   dielectric constant  ET(30)  alpha  beta  \
SOLVENT NAME                                                                  
Cyclohexane                                       2.02    30.9   0.00  0.00   
Ethyl Acetate                                     6.02    38.1   0.00  0.45   
Acetic Acid                                       6.15    51.7   1.12  0.45   
2-Methyltetrahydrofuran [2-MeTHF]                 7.58    36.5   0.00  0.45   
1,1,1,3,3,3-Hexafluoropropan-2-ol                16.70    62.1   1.96  0.00   

                                    pi*     SA     SB     SP    SdP        N  \
SOLVENT NAME                                                                   
Cyclohexane           

In [None]:
# Size up the cross-validation groups: how many groups each table yields and
# how the number of rows per group is distributed.
print("=== CV Split Analysis ===")

# Single-solvent table: one group per distinct solvent name.
print("\nSingle solvent - Leave-one-out splits:")
solvent_names = df_single['SOLVENT NAME']
print("Number of unique solvents:", solvent_names.nunique())
print("Samples per solvent:")
rows_per_solvent = df_single.groupby('SOLVENT NAME').size()
print(rows_per_solvent.describe())

# Full table: a "ramp" here is a distinct (solvent A, solvent B) name pair.
# NOTE(review): the full table also has a 'RAMP NUM' column that is not used
# here; confirm the name pair is the intended ramp key.
# NOTE(review): drop_duplicates keeps rows with missing names, whereas groupby
# drops missing keys by default, so the two figures below could disagree if
# any solvent name is missing; confirm there are none.
print("\nFull data - Leave-one-ramp-out splits:")
ramp_keys = ['SOLVENT A NAME', 'SOLVENT B NAME']
ramps = df_full[ramp_keys].drop_duplicates()
print("Number of unique ramps:", ramps.shape[0])
print("Samples per ramp:")
rows_per_ramp = df_full.groupby(ramp_keys).size()
print(rows_per_ramp.describe())