In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Load datasets
# Both yield tables are read from the same directory. Keeping that directory in
# one constant means the notebook can be pointed at a different data location
# by editing a single line instead of every path literal.
DATA_DIR = Path('/home/data')

full_data = pd.read_csv(DATA_DIR / 'catechol_full_data_yields.csv')
single_data = pd.read_csv(DATA_DIR / 'catechol_single_solvent_yields.csv')

# Quick structural check: (rows, columns) of each table, then its column names.
print('Full data shape:', full_data.shape)
print('Single solvent data shape:', single_data.shape)
print('\nFull data columns:', full_data.columns.tolist())
print('\nSingle data columns:', single_data.columns.tolist())

Full data shape: (1227, 19)
Single solvent data shape: (656, 13)

Full data columns: ['EXP NUM', 'SOLVENT A NAME', 'SOLVENT B NAME', 'SolventB%', 'Residence Time', 'Temperature', 'SM', 'Product 2', 'Product 3', 'SM SMILES', 'Product 2 SMILES', 'Product 3 SMILES', 'SOLVENT A SMILES', 'SOLVENT B SMILES', 'SOLVENT A Ratio', 'SOLVENT B Ratio', 'Reaction SMILES A', 'Reaction SMILES B', 'RAMP NUM']

Single data columns: ['EXP NUM', 'Residence Time', 'Temperature', 'SM', 'Product 2', 'Product 3', 'SM SMILES', 'Product 2 SMILES', 'Product 3 SMILES', 'SOLVENT NAME', 'SOLVENT SMILES', 'SOLVENT Ratio', 'Reaction SMILES']


In [3]:
# Summarise the three target columns (SM, Product 2, Product 3) for each dataset.
target_cols = ['SM', 'Product 2', 'Product 3']

for heading, frame in (
    ('=== Target Statistics (Full Data) ===', full_data),
    ('\n=== Target Statistics (Single Solvent) ===', single_data),
):
    print(heading)
    print(frame[target_cols].describe())

# Per-row total of the three targets, computed on the full dataset only, to see
# whether the targets add up to 1 in every row.
print('\n=== Row sums (Full Data) ===')
row_sums = full_data[target_cols].sum(axis=1)
print(f'Min: {row_sums.min():.4f}, Max: {row_sums.max():.4f}, Mean: {row_sums.mean():.4f}')

=== Target Statistics (Full Data) ===
                SM    Product 2    Product 3
count  1227.000000  1227.000000  1227.000000
mean      0.495178     0.164626     0.143668
std       0.379425     0.153467     0.145779
min       0.000000     0.000000     0.000000
25%       0.068573     0.012723     0.012260
50%       0.606454     0.117330     0.094413
75%       0.877448     0.308649     0.254630
max       1.083254     0.463632     0.533768

=== Target Statistics (Single Solvent) ===
               SM   Product 2   Product 3
count  656.000000  656.000000  656.000000
mean     0.522192    0.149932    0.123380
std      0.360229    0.143136    0.131528
min      0.000000    0.000000    0.000000
25%      0.145001    0.012976    0.009445
50%      0.656558    0.102813    0.078298
75%      0.857019    0.281654    0.193353
max      1.000000    0.463632    0.533768

=== Row sums (Full Data) ===
Min: 0.0112, Max: 1.1233, Mean: 0.8035


In [None]:
# Inspect the groupings this notebook uses to describe the cross-validation structure.

# Single-solvent table: number of distinct solvents and how many rows each has.
print('=== Single Solvent CV Structure ===')
print(f'Unique solvents: {single_data["SOLVENT NAME"].nunique()}')
rows_per_solvent = single_data['SOLVENT NAME'].value_counts()
print(rows_per_solvent)

# Full table: each distinct (solvent A, solvent B) name pair is counted as one
# ramp, which the printed label below equates with one fold.
print('\n=== Full Data CV Structure ===')
pair_cols = ['SOLVENT A NAME', 'SOLVENT B NAME']
solvent_pairs = full_data[pair_cols]
ramps = solvent_pairs.drop_duplicates()
print(f'Unique ramps (folds): {len(ramps)}')
print(ramps)