# Evolver Loop 70 Analysis

Goal: validate whether a solvent **distance-to-training** signal is predictive of **leave-one-solvent-out error**, supporting applicability-domain (AD) shrinkage/calibration.

We will:
1. Load single-solvent training data.
2. Build a simple, fast baseline model (Ridge) using kinetics + (Spange+ACS) solvent descriptors.
3. For each held-out solvent (group), compute:
   - nearest-neighbor (minimum) and mean Euclidean distance to the remaining solvents in raw descriptor space
   - MSE on that solvent
4. Correlate distance vs error and identify outlier solvents.

This is **not** intended to beat the baseline; it only checks whether AD has a measurable signal.

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Directory that holds the yield table and the two solvent-descriptor lookups.
DATA_PATH = '/home/data'

# Read the three CSV inputs; both lookups use their first CSV column as the index.
single = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
spange = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
acs = pd.read_csv(f'{DATA_PATH}/acs_pca_descriptors_lookup.csv', index_col=0)

print('Single columns:', list(single.columns))

# Distinct solvent labels, in order of first appearance in the yield table.
solvent_col = 'SOLVENT NAME'
solvents = single[solvent_col].unique().tolist()
print('n rows:', len(single), 'n solvents:', len(solvents))
print('spange rows:', spange.shape, 'acs rows:', acs.shape)

# One descriptor row per solvent: align each lookup to `solvents`, then place
# the two tables side by side. Labels missing from a lookup come back from
# reindex() as NaN rows and are zero-filled here.
lookup_frames = [table.reindex(solvents).fillna(0.0) for table in (spange, acs)]
desc = pd.concat(lookup_frames, axis=1)
print('descriptor dim:', desc.shape)

# Kinetic features, each kept as an (n_rows, 1) column so they can be stacked.
rt = single['Residence Time'].to_numpy().reshape(-1, 1)
tc = single['Temperature'].to_numpy().reshape(-1, 1)
tk = tc + 273.15            # assumes Temperature is in degrees Celsius - TODO confirm
invT = 1000.0 / tk
logt = np.log(rt + 1e-6)    # small offset keeps log() finite when rt == 0
inter = invT * logt         # interaction term between the two transforms
kin = np.concatenate([rt, tc, invT, logt, inter], axis=1)

# Broadcast the per-solvent descriptors to one row per experiment and append
# them to the kinetic columns.
X_desc = desc.reindex(single[solvent_col]).to_numpy()
X = np.concatenate([kin, X_desc], axis=1)

# Target columns are discovered by name: any column containing 'Yield',
# excluding the two kinetic input columns.
non_targets = ('Residence Time', 'Temperature')
y_cols = [name for name in single.columns if 'Yield' in name and name not in non_targets]
print('Detected yield cols:', y_cols)

# There is no fallback naming scheme: fewer than three matches is an error,
# and any matches beyond the first three are dropped.
if len(y_cols) < 3:
    raise ValueError('Could not detect 3 target columns')
y_cols = y_cols[:3]

Y = single[y_cols].to_numpy()

# Standardise every feature column for ridge; the scaler is fitted on all rows.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print('X shape:', X_scaled.shape, 'Y shape:', Y.shape)

In [None]:
from sklearn.metrics import mean_squared_error

def nearest_neighbor_dist(solvent_name, train_solvents, desc_df):
    """Return the minimum and mean Euclidean distance from one solvent to others.

    Distances are computed between rows of ``desc_df`` exactly as stored; no
    scaling or normalisation is applied, so columns with a larger numeric
    range weigh more heavily.

    Args:
        solvent_name: Index label of the query row in ``desc_df``.
        train_solvents: Iterable of index labels for the reference rows.
        desc_df: DataFrame indexed by solvent label whose columns can be cast
            to float.

    Returns:
        Tuple ``(d_min, d_mean)`` of Python floats: the smallest distance
        and the average distance from the query row to the reference rows.

    Raises:
        ValueError: If ``train_solvents`` is empty (there is nothing to
            measure a distance to).
        KeyError: Propagated from ``.loc`` when a label is not in the index.
    """
    # Materialise once: a tuple would otherwise be read by .loc as a
    # multi-axis key, and a generator could not be checked for emptiness.
    train_solvents = list(train_solvents)
    if not train_solvents:
        raise ValueError(
            'train_solvents is empty; cannot compute a distance for '
            f'{solvent_name!r}'
        )

    query = desc_df.loc[solvent_name].values.astype(float)
    reference = desc_df.loc[train_solvents].values.astype(float)

    # Row-wise Euclidean norm of the difference to the query vector.
    dists = np.sqrt(np.square(reference - query).sum(axis=1))
    return float(dists.min()), float(dists.mean())

# Leave-one-solvent-out (LOSO) evaluation.
# For every solvent: hold out all of its rows, fit ridge on the remaining rows,
# and record the held-out MSE together with the solvent's distance to the
# training solvents in descriptor space.
results = []
alpha = 1.0
n_targets = Y.shape[1]  # derived from the data instead of a hard-coded 3

for s in solvents:
    mask_te = single[solvent_col].values == s
    mask_tr = ~mask_te

    # unique() already de-duplicates and preserves first-appearance order; a
    # round-trip through set() would only make the order depend on hashing.
    tr_solvents = single.loc[mask_tr, solvent_col].unique().tolist()
    d_min, d_mean = nearest_neighbor_dist(s, tr_solvents, desc)

    # Fit the scaler on the training fold only. Scaling with statistics that
    # include the held-out solvent would leak test-fold information into the
    # model and bias the LOSO error this analysis is trying to measure.
    fold_scaler = StandardScaler().fit(X[mask_tr])
    X_tr = fold_scaler.transform(X[mask_tr])
    X_te = fold_scaler.transform(X[mask_te])

    # One independent ridge model per target column.
    preds = np.zeros((mask_te.sum(), n_targets))
    for t in range(n_targets):
        model = Ridge(alpha=alpha, random_state=0)
        model.fit(X_tr, Y[mask_tr, t])
        preds[:, t] = model.predict(X_te)

    # Mean squared error on the held-out rows, over all target columns.
    mse = mean_squared_error(Y[mask_te], preds)
    results.append({'solvent': s, 'n_rows': int(mask_te.sum()), 'mse': mse, 'd_nn_min': d_min, 'd_nn_mean': d_mean})

# Worst-predicted solvents first; the bare expression displays the table in a notebook.
res = pd.DataFrame(results).sort_values('mse', ascending=False)
res.head(10)

In [None]:
# Correlation between per-solvent held-out error and each distance summary.
# DataFrame.corr() yields a 2x2 matrix; the off-diagonal entry is picked by label.
corr_min = res[['mse', 'd_nn_min']].corr().loc['mse', 'd_nn_min']
corr_mean = res[['mse', 'd_nn_mean']].corr().loc['mse', 'd_nn_mean']

print('corr(mse, nearest-neighbor distance):', corr_min)
print('corr(mse, mean distance):', corr_mean)

# Tabular stand-in for a scatter plot: the extremes along each axis.
report_cols = ['solvent', 'mse', 'd_nn_min', 'd_nn_mean', 'n_rows']

print('\nTop 8 high-distance solvents:')
by_distance = res.sort_values('d_nn_min', ascending=False)
print(by_distance.head(8)[report_cols])

print('\nTop 8 high-error solvents:')
# Uses the existing row order of `res`, which is expected to be descending mse.
print(res.head(8)[report_cols])