# Evolver Loop 73 Analysis

Goal: diagnose the AD distance/shrinkage design by measuring how well distance-to-training correlates with fold error, and compare distance featurizations for both the single-solvent and the full-data tasks. The per-row error analysis uses the predictions in the latest `submission.csv` (exp_073).


In [None]:
import numpy as np, pandas as pd
from pathlib import Path

# Locations of the raw data directory and of the submission being analysed.
DATA_PATH = Path('/home/data')
SUB_PATH = Path('/home/submission/submission.csv')

# Ground-truth yield tables for both tasks, plus the submitted predictions.
single_df = pd.read_csv(DATA_PATH / 'catechol_single_solvent_yields.csv')
full_df = pd.read_csv(DATA_PATH / 'catechol_full_data_yields.csv')
sub = pd.read_csv(SUB_PATH)

print('single', single_df.shape, 'full', full_df.shape, 'sub', sub.shape)
print(sub['task'].value_counts())

# Target columns; the row-wise sum of the targets is summarised as a quick
# sanity check on both tables.
TARGETS = ['Product 2', 'Product 3', 'SM']
print('single sum(y) stats', single_df[TARGETS].sum(axis=1).describe())
print('full sum(y) stats', full_df[TARGETS].sum(axis=1).describe())

# Solvent descriptor lookup tables used in the experiments. The first CSV
# column becomes the index, which `solvent_embed` later looks solvents up by.
spange = pd.read_csv(DATA_PATH / 'spange_descriptors_lookup.csv', index_col=0)
drfp = pd.read_csv(DATA_PATH / 'drfps_catechol_lookup.csv', index_col=0)
acs = pd.read_csv(DATA_PATH / 'acs_pca_descriptors_lookup.csv', index_col=0)

# Keep only the DRFP columns with strictly positive variance (as exp_073).
var = drfp.var(axis=0)
drfp_nz = drfp.loc[:, var > 0]

print('spange', spange.shape, 'drfp_nz', drfp_nz.shape, 'acs', acs.shape)

def solvent_embed(names):
    """Return the concatenated descriptor matrix for the given solvent names.

    Each name is looked up in the ``spange``, ``drfp_nz`` and ``acs`` tables
    (in that order) and the three row blocks are stacked side by side.
    """
    lookup_tables = (spange, drfp_nz, acs)
    return np.hstack([table.loc[names].values for table in lookup_tables])


In [None]:
# Template split generators replicated

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per distinct value of ``X['SOLVENT NAME']``.

    Solvents are visited in sorted order. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part holds
    exactly the rows of the held-out solvent.
    """
    for held_out in sorted(X['SOLVENT NAME'].unique()):
        is_train = X['SOLVENT NAME'] != held_out
        is_test = ~is_train
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield (train_part, test_part)

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited sorted by A name, then B name. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part holds
    exactly the rows whose two solvent names both match the held-out pair.
    """
    pair_cols = ['SOLVENT A NAME', 'SOLVENT B NAME']
    unique_pairs = X[pair_cols].drop_duplicates().sort_values(by=pair_cols)
    for _, held_out in unique_pairs.iterrows():
        # A row is in the test fold only when both names equal the pair.
        is_test = (X[pair_cols] == held_out).all(axis=1)
        is_train = ~is_test
        yield ((X[is_train], Y[is_train]), (X[is_test], Y[is_test]))

# Sanity check: count how many folds each split generator produces.
n_single_folds = sum(
    1
    for _ in generate_leave_one_out_splits(
        single_df[['Residence Time', 'Temperature', 'SOLVENT NAME']],
        single_df[TARGETS],
    )
)
print('single folds', n_single_folds)

n_full_folds = sum(
    1
    for _ in generate_leave_one_ramp_out_splits(
        full_df[['Residence Time', 'Temperature', 'SOLVENT A NAME', 'SOLVENT B NAME', 'SolventB%']],
        full_df[TARGETS],
    )
)
print('full folds', n_full_folds)


In [None]:
# Template split generators replicated

def generate_leave_one_out_splits(X, Y):
    """Yield one train/test split per distinct value of ``X['SOLVENT NAME']``.

    Solvents are visited in sorted order. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part holds
    exactly the rows of the held-out solvent.

    NOTE: this re-declares the generator of the same name from the previous
    cell with the same behaviour.
    """
    for held_out in sorted(X['SOLVENT NAME'].unique()):
        is_train = X['SOLVENT NAME'] != held_out
        is_test = ~is_train
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield (train_part, test_part)

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield one train/test split per distinct (solvent A, solvent B) pair.

    Pairs are visited sorted by A name, then B name. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` where the test part holds
    exactly the rows whose two solvent names both match the held-out pair.

    NOTE: this re-declares the generator of the same name from the previous
    cell with the same behaviour.
    """
    pair_cols = ['SOLVENT A NAME', 'SOLVENT B NAME']
    unique_pairs = X[pair_cols].drop_duplicates().sort_values(by=pair_cols)
    for _, held_out in unique_pairs.iterrows():
        # A row is in the test fold only when both names equal the pair.
        is_test = (X[pair_cols] == held_out).all(axis=1)
        is_train = ~is_test
        yield ((X[is_train], Y[is_train]), (X[is_test], Y[is_test]))

# Sanity check: count how many folds each split generator produces.
n_single_folds = sum(
    1
    for _ in generate_leave_one_out_splits(
        single_df[['Residence Time', 'Temperature', 'SOLVENT NAME']],
        single_df[TARGETS],
    )
)
print('single folds', n_single_folds)

n_full_folds = sum(
    1
    for _ in generate_leave_one_ramp_out_splits(
        full_df[['Residence Time', 'Temperature', 'SOLVENT A NAME', 'SOLVENT B NAME', 'SolventB%']],
        full_df[TARGETS],
    )
)
print('full folds', n_full_folds)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

def dist_single(X):
    """Distance features for single-solvent rows: the solvent's embedding."""
    solvent_names = X['SOLVENT NAME']
    return solvent_embed(solvent_names)

def dist_full_blend(X):
    """Distance features for solvent-pair rows via a linear embedding blend.

    The two solvent embeddings are mixed as ``(1 - p) * A + p * B`` with
    ``p = X['SolventB%']``, and ``p`` itself is appended as a last column.
    """
    # NOTE(review): the blend weights assume SolventB% is a fraction in
    # [0, 1] rather than a 0-100 percentage — confirm against the data.
    frac_b = X['SolventB%'].values.reshape(-1, 1)
    emb_a = solvent_embed(X['SOLVENT A NAME'])
    emb_b = solvent_embed(X['SOLVENT B NAME'])
    mixed = (1 - frac_b) * emb_a + frac_b * emb_b
    return np.hstack([mixed, frac_b])

def dist_full_symm(X):
    """Distance features for solvent-pair rows, invariant to A/B ordering.

    Each row's two solvents are put into canonical order (smaller name
    first). For rows that get swapped, the ``SolventB%`` value ``p`` is
    replaced by ``1 - p``. The returned matrix stacks, column-wise:
    both embeddings, their absolute difference, their element-wise product,
    the canonical ``p`` and ``p * (1 - p)``.
    """
    name_a = X['SOLVENT A NAME'].astype(str).values
    name_b = X['SOLVENT B NAME'].astype(str).values

    # Rows whose A name sorts after the B name must be flipped.
    needs_swap = name_a > name_b
    first = name_a.copy()
    second = name_b.copy()
    first[needs_swap] = name_b[needs_swap]
    second[needs_swap] = name_a[needs_swap]

    # NOTE(review): `1 - p` assumes SolventB% is a fraction in [0, 1] — confirm.
    frac = X['SolventB%'].values.reshape(-1, 1).astype(float)
    frac_canon = frac.copy()
    frac_canon[needs_swap] = 1.0 - frac_canon[needs_swap]

    emb_first = solvent_embed(first)
    emb_second = solvent_embed(second)

    # symmetric features (a, b, |a-b|, a*b) + composition scalars
    feature_blocks = [
        emb_first,
        emb_second,
        np.abs(emb_first - emb_second),
        emb_first * emb_second,
        frac_canon,
        frac_canon * (1 - frac_canon),
    ]
    return np.hstack(feature_blocks)

# Smoke test: feature-matrix shape of each featurizer on the first two rows.
for label, featurize, frame in (
    ('dist dims single', dist_single, single_df),
    ('dist dims full blend', dist_full_blend, full_df),
    ('dist dims full symm', dist_full_symm, full_df),
):
    print(label, featurize(frame.head(2)).shape)


In [None]:
# Compute per-fold distance + per-fold squared error using current submission preds

from scipy.stats import spearmanr

def fold_error_distance_single(n_neighbors=10):
    """Pair each held-out single-solvent row's distance-to-training with its error.

    For every leave-one-solvent-out fold, the submission rows with
    ``task == 0`` and ``fold == fold_idx`` are compared positionally against
    the held-out targets, so the submission must list a fold's rows in the
    same order as the split generator yields them. The distance is the mean
    Euclidean distance to the nearest training rows in ``dist_single``
    feature space, standardized with statistics from the training part only.

    Args:
        n_neighbors: number of nearest training rows to average over; capped
            at the number of training rows in the fold.

    Returns:
        DataFrame with one row per held-out sample and the columns
        ``fold``, ``solvent``, ``dist`` and ``se`` (squared error averaged
        over the three targets).

    Raises:
        ValueError: if the submission's prediction block for a fold does not
            have the same shape as the fold's held-out targets.
    """
    X = single_df[['Residence Time', 'Temperature', 'SOLVENT NAME']]
    Y = single_df[TARGETS]
    sub_s = sub[sub.task == 0]
    pred_cols = ['target_1', 'target_2', 'target_3']
    rows = []
    for fold_idx, ((Xtr, _), (Xte, Yte)) in enumerate(generate_leave_one_out_splits(X, Y)):
        pred = sub_s.loc[sub_s.fold == fold_idx, pred_cols].values
        true = Yte.values
        # Without this check a row-count mismatch (e.g. 1 vs n, or 0 vs 1)
        # would broadcast silently and yield wrong or empty errors.
        if pred.shape != true.shape:
            raise ValueError(
                f'single fold {fold_idx}: submission predictions have shape '
                f'{pred.shape} but held-out targets have shape {true.shape}'
            )
        se = ((true - pred) ** 2).mean(axis=1)  # per row mean over targets

        # distance to training in descriptor space
        Dtr = dist_single(Xtr)
        Dte = dist_single(Xte)
        sc = StandardScaler().fit(Dtr)
        nn = NearestNeighbors(n_neighbors=min(n_neighbors, len(Dtr))).fit(sc.transform(Dtr))
        d = nn.kneighbors(sc.transform(Dte), return_distance=True)[0].mean(axis=1)

        rows.extend(
            (fold_idx, solvent, float(dist), float(err))
            for solvent, dist, err in zip(Xte['SOLVENT NAME'], d, se)
        )
    return pd.DataFrame(rows, columns=['fold', 'solvent', 'dist', 'se'])

# Row-level (distance, squared error) table for the single-solvent task.
single_stats = fold_error_distance_single()
print(single_stats.describe())
print('spearman dist vs se', spearmanr(single_stats['dist'], single_stats['se']))

# contribution of top distances: share of the summed squared error carried by
# the rows at or above each distance quantile
q = np.quantile(single_stats['dist'], [0.5, 0.7, 0.8, 0.9, 0.95])
print('dist quantiles', q)
for thr in (0.7, 0.8, 0.9):
    t = np.quantile(single_stats['dist'], thr)
    m = single_stats['dist'] >= t
    far_se = single_stats.loc[m, 'se'].sum()
    print('top', thr, 'fraction', m.mean(), 'MSE share', far_se / single_stats['se'].sum())


In [None]:
def fold_error_distance_full(dist_fn, n_neighbors=10):
    """Pair each held-out full-data row's distance-to-training with its error.

    For every leave-one-ramp-out fold, the submission rows with ``task == 1``
    and ``fold == fold_idx`` are compared positionally against the held-out
    targets, so the submission must list a fold's rows in the same order as
    the split generator yields them. The distance is the mean Euclidean
    distance to the nearest training rows in ``dist_fn`` feature space,
    standardized with statistics from the training part only.

    Args:
        dist_fn: callable mapping a feature DataFrame to a 2-D feature array
            (e.g. ``dist_full_blend`` or ``dist_full_symm``).
        n_neighbors: number of nearest training rows to average over; capped
            at the number of training rows in the fold.

    Returns:
        DataFrame with one row per held-out sample and the columns ``fold``,
        ``dist`` and ``se`` (squared error averaged over the three targets).

    Raises:
        ValueError: if the submission's prediction block for a fold does not
            have the same shape as the fold's held-out targets.
    """
    X = full_df[['Residence Time', 'Temperature', 'SOLVENT A NAME', 'SOLVENT B NAME', 'SolventB%']]
    Y = full_df[TARGETS]
    sub_f = sub[sub.task == 1]
    pred_cols = ['target_1', 'target_2', 'target_3']
    rows = []
    for fold_idx, ((Xtr, _), (Xte, Yte)) in enumerate(generate_leave_one_ramp_out_splits(X, Y)):
        pred = sub_f.loc[sub_f.fold == fold_idx, pred_cols].values
        true = Yte.values
        # Without this check a row-count mismatch (e.g. 1 vs n, or 0 vs 1)
        # would broadcast silently and yield wrong or empty errors.
        if pred.shape != true.shape:
            raise ValueError(
                f'full fold {fold_idx}: submission predictions have shape '
                f'{pred.shape} but held-out targets have shape {true.shape}'
            )
        se = ((true - pred) ** 2).mean(axis=1)  # per row mean over targets

        Dtr = dist_fn(Xtr)
        Dte = dist_fn(Xte)
        sc = StandardScaler().fit(Dtr)
        nn = NearestNeighbors(n_neighbors=min(n_neighbors, len(Dtr))).fit(sc.transform(Dtr))
        d = nn.kneighbors(sc.transform(Dte), return_distance=True)[0].mean(axis=1)

        rows.extend((fold_idx, float(dist), float(err)) for dist, err in zip(d, se))
    return pd.DataFrame(rows, columns=['fold', 'dist', 'se'])

# Row-level (distance, squared error) tables for both full-data featurizers.
full_blend = fold_error_distance_full(dist_full_blend)
full_symm = fold_error_distance_full(dist_full_symm)

print('FULL blend spearman', spearmanr(full_blend['dist'], full_blend['se']))
print('FULL symm spearman', spearmanr(full_symm['dist'], full_symm['se']))

# Per featurizer: distance summary, then the share of the summed squared
# error carried by the rows at or above each distance quantile.
for name, dfx in (('blend', full_blend), ('symm', full_symm)):
    print('\n', name, dfx['dist'].describe())
    for thr in (0.7, 0.8, 0.9):
        t = np.quantile(dfx['dist'], thr)
        m = dfx['dist'] >= t
        far_se = dfx.loc[m, 'se'].sum()
        print('top', thr, 'fraction', m.mean(), 'MSE share', far_se / dfx['se'].sum())


In [None]:
# Compare per-fold average error vs distance to see if some folds (solvents/ramps) are the true OODs

def _per_fold_means(stats):
    """Return per-fold mean distance, mean squared error and row count."""
    grouped = stats.groupby('fold')
    summary = grouped.agg(
        dist_mean=('dist', 'mean'),
        se_mean=('se', 'mean'),
        n=('se', 'size'),
    )
    return summary.reset_index()


single_fold = _per_fold_means(single_stats)
print('single fold-level spearman', spearmanr(single_fold['dist_mean'], single_fold['se_mean']))

full_fold_blend = _per_fold_means(full_blend)
full_fold_symm = _per_fold_means(full_symm)

print('full fold blend spearman', spearmanr(full_fold_blend['dist_mean'], full_fold_blend['se_mean']))
print('full fold symm spearman', spearmanr(full_fold_symm['dist_mean'], full_fold_symm['se_mean']))

# Folds ranked by mean squared error, worst first.
print('worst single folds by error')
print(single_fold.sort_values('se_mean', ascending=False).head(5))
print('worst full folds by error (symm dist)')
print(full_fold_symm.sort_values('se_mean', ascending=False).head(5))
