In [1]:
import pandas as pd
import numpy as np

# Load the two yield tables used throughout the notebook.
FULL_YIELDS_CSV = '/home/data/catechol_full_data_yields.csv'
SINGLE_SOLVENT_YIELDS_CSV = '/home/data/catechol_single_solvent_yields.csv'

full_df = pd.read_csv(FULL_YIELDS_CSV)
single_df = pd.read_csv(SINGLE_SOLVENT_YIELDS_CSV)

# Report the dimensions of both tables first...
for shape_label, table in (
    ('Full data shape:', full_df),
    ('Single solvent data shape:', single_df),
):
    print(shape_label, table.shape)

# ...then the column names of each (labels start with a newline to space the output).
for columns_label, table in (
    ('\nFull data columns:', full_df),
    ('\nSingle solvent columns:', single_df),
):
    print(columns_label, table.columns.tolist())

Full data shape: (1227, 19)
Single solvent data shape: (656, 13)

Full data columns: ['EXP NUM', 'SOLVENT A NAME', 'SOLVENT B NAME', 'SolventB%', 'Residence Time', 'Temperature', 'SM', 'Product 2', 'Product 3', 'SM SMILES', 'Product 2 SMILES', 'Product 3 SMILES', 'SOLVENT A SMILES', 'SOLVENT B SMILES', 'SOLVENT A Ratio', 'SOLVENT B Ratio', 'Reaction SMILES A', 'Reaction SMILES B', 'RAMP NUM']

Single solvent columns: ['EXP NUM', 'Residence Time', 'Temperature', 'SM', 'Product 2', 'Product 3', 'SM SMILES', 'Product 2 SMILES', 'Product 3 SMILES', 'SOLVENT NAME', 'SOLVENT SMILES', 'SOLVENT Ratio', 'Reaction SMILES']


In [2]:
# Summary statistics for the three target columns of the full dataset.
target_columns = ['SM', 'Product 2', 'Product 3']
print('Target columns (SM, Product 2, Product 3):')
print(full_df[target_columns].describe())

# Count the distinct solvent names in each table.
n_single_solvents = single_df['SOLVENT NAME'].nunique()
n_solvent_a = full_df['SOLVENT A NAME'].nunique()
n_solvent_b = full_df['SOLVENT B NAME'].nunique()
print('\nUnique solvents in single data:', n_single_solvents)
print('Unique solvent A in full data:', n_solvent_a)
print('Unique solvent B in full data:', n_solvent_b)

# Min-max span of each process variable in the full dataset.
for range_label, column_name in (
    ('\nResidence Time range:', 'Residence Time'),
    ('Temperature range:', 'Temperature'),
    ('SolventB% range:', 'SolventB%'),
):
    column_values = full_df[column_name]
    print(range_label, column_values.min(), '-', column_values.max())

Target columns (SM, Product 2, Product 3):
                SM    Product 2    Product 3
count  1227.000000  1227.000000  1227.000000
mean      0.495178     0.164626     0.143668
std       0.379425     0.153467     0.145779
min       0.000000     0.000000     0.000000
25%       0.068573     0.012723     0.012260
50%       0.606454     0.117330     0.094413
75%       0.877448     0.308649     0.254630
max       1.083254     0.463632     0.533768

Unique solvents in single data: 24
Unique solvent A in full data: 13
Unique solvent B in full data: 13

Residence Time range: 2.001019108286073 - 15.017208412882612
Temperature range: 175.0 - 225.0
SolventB% range: 0.0 - 1.0


In [None]:
# Check lookup tables
def _load_lookup(csv_path):
    """Read a lookup CSV, using its first column as the index."""
    return pd.read_csv(csv_path, index_col=0)


# Spange lookup: report the shape and list the column names.
spange = _load_lookup('/home/data/spange_descriptors_lookup.csv')
print('Spange descriptors shape:', spange.shape)
print('Spange columns:', spange.columns.tolist())

# Remaining lookups: only the shape is reported. Each table is read and
# reported before the next one is loaded.
acs = _load_lookup('/home/data/acs_pca_descriptors_lookup.csv')
print('\nACS PCA descriptors shape:', acs.shape)

drfps = _load_lookup('/home/data/drfps_catechol_lookup.csv')
print('DRFP shape:', drfps.shape)

fragprints = _load_lookup('/home/data/fragprints_lookup.csv')
print('Fragprints shape:', fragprints.shape)