# Evolver Loop 69 Analysis

Goals:
1. Confirm the CV→LB linear mapping from all *completed* submissions.
2. Audit mixture representation: compare linear blending vs pairwise interaction features in *training CV* (proxy for ability to model mixture nonlinearity).
3. Prototype an **applicability-domain (AD) correction**: distance-based shrinkage to baseline, trained **cross-fitted** (no leakage).

We will not tune hyperparameters; we want a structural change likely to reduce LB intercept.

In [None]:
import json, numpy as np, pandas as pd
from pathlib import Path

# Load the persisted session state. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's locale-dependent default encoding.
state = json.loads(Path('/home/code/session_state.json').read_text(encoding='utf-8'))
# Treat both a missing key and an explicit null as "no submissions" so the
# len() call below cannot fail on None.
submissions = state.get('submissions') or []
print('submissions in state:', len(submissions))

# The (CV, LB) history is written out by hand below; submissions whose LB
# score is still pending are not listed.
paired = []
# (cv_score, lb_score) pairs from the prompt history, completed LB only.
hist = [
    (0.0111, 0.0982),
    (0.0123, 0.1065),
    (0.0105, 0.0972),
    (0.0104, 0.0969),
    (0.0097, 0.0946),
    (0.0093, 0.0932),
    (0.0092, 0.0936),
    (0.0090, 0.0913),
    (0.0087, 0.0893),
    (0.0085, 0.0887),
    (0.0083, 0.0877),
    (0.0098, 0.0970),
]
# Split the pairs into one array per score type.
cv, lb = (np.array(column) for column in zip(*hist))
print('n paired:', len(cv))

# Least-squares fit of LB = slope * CV + intercept; the column of ones in the
# design matrix carries the intercept.
A = np.column_stack((cv, np.ones_like(cv)))
coef, *_ = np.linalg.lstsq(A, lb, rcond=None)
slope, intercept = coef
pred = intercept + slope*cv
# Coefficient of determination: 1 - SS_res / SS_tot.
ss_res = np.sum((lb - pred)**2)
ss_tot = np.sum((lb - lb.mean())**2)
r2 = 1 - ss_res/ss_tot
print('Fit: LB = %.3f * CV + %.4f | R2=%.4f' % (slope, intercept, r2))

# Invert the fitted line to find the CV score that would map onto the LB target.
TARGET = 0.0347
req_cv = (TARGET - intercept)/slope
print('Required CV for target given this fit:', req_cv)

In [None]:
import pandas as pd, numpy as np
# Directory that holds every CSV read below.
DATA_PATH='/home/data'


def _read_table(filename):
    """Load one CSV that sits directly under DATA_PATH."""
    return pd.read_csv(f'{DATA_PATH}/{filename}')


# Yield tables (single-solvent file and full-data file).
single = _read_table('catechol_single_solvent_yields.csv')
full = _read_table('catechol_full_data_yields.csv')
# Lookup tables (descriptor sets and SMILES, going by the file names).
spange = _read_table('spange_descriptors_lookup.csv')
drfp = _read_table('drfps_catechol_lookup.csv')
acs = _read_table('acs_pca_descriptors_lookup.csv')
smiles = _read_table('smiles_lookup.csv')

# Size report for everything that was just loaded.
print(single.shape, full.shape)
print('spange', spange.shape, 'drfp', drfp.shape, 'acs', acs.shape, 'smiles', smiles.shape)
print('single solvents:', single['SOLVENT NAME'].nunique())
# One row per distinct (solvent A, solvent B) combination.
ramp_pairs = full[['SOLVENT A NAME','SOLVENT B NAME']].drop_duplicates()
print('full ramps:', ramp_pairs.shape)


import pandas as pd, numpy as np
# Directory that holds every CSV read below.
DATA_PATH='/home/data'

# Variable name -> CSV file name; the files are read in the listed order.
_csv_files = {
    'single': 'catechol_single_solvent_yields.csv',
    'full': 'catechol_full_data_yields.csv',
    'spange': 'spange_descriptors_lookup.csv',
    'drfp': 'drfps_catechol_lookup.csv',
    'acs': 'acs_pca_descriptors_lookup.csv',
    'smiles': 'smiles_lookup.csv',
}
_frames = {key: pd.read_csv(f'{DATA_PATH}/{fname}') for key, fname in _csv_files.items()}
single, full, spange, drfp, acs, smiles = (_frames[key] for key in _csv_files)

# Shapes of the two yield tables, then of the four lookup tables.
print(single.shape, full.shape)
print('spange', spange.shape, 'drfp', drfp.shape, 'acs', acs.shape, 'smiles', smiles.shape)
# Number of distinct solvents in the single-solvent table.
n_single_solvents = single['SOLVENT NAME'].nunique()
print('single solvents:', n_single_solvents)
# Shape of the table of distinct (solvent A, solvent B) combinations.
distinct_pairs = full[['SOLVENT A NAME','SOLVENT B NAME']].drop_duplicates()
print('full ramps:', distinct_pairs.shape)


In [None]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

# Build the solvent descriptor lookup, indexed by solvent name so that rows can
# be fetched with `spange.loc[<solvent names>]`.
# The key column is 'SOLVENT NAME': that is the name the other descriptor load
# in this file uses, and it matches the 'SOLVENT NAME' / 'SOLVENT A NAME' /
# 'SOLVENT B NAME' columns of the yield tables.
# NOTE(review): confirm against the CSV header.
spange = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv').set_index('SOLVENT NAME')
# keep numeric columns only: the featurizers feed `.values` straight into
# arithmetic and scaling, so a stray text column would break them.
sp_cols = spange.select_dtypes(include='number').columns
spange = spange[sp_cols]

def arrhenius_feats(df):
    """Build kinetics-inspired features from temperature and residence time.

    Reads the 'Temperature' and 'Residence Time' columns of ``df`` and returns
    a float array of shape (len(df), 5) whose columns are, in order:
    residence time t, temperature T, 1 / (T + 273.15), ln(t), and the product
    of the last two. t is clipped from below at 1e-6 before the logarithm so
    that zero residence times stay finite.
    """
    # presumably T is in degrees Celsius (273.15 offset) — verify against data
    temperature = df['Temperature'].to_numpy(dtype=float)
    res_time = df['Residence Time'].to_numpy(dtype=float)
    inv_kelvin = 1.0 / (temperature + 273.15)
    log_time = np.log(np.clip(res_time, 1e-6, None))
    columns = (res_time, temperature, inv_kelvin, log_time, inv_kelvin * log_time)
    return np.column_stack(columns)


def full_feats_blended(df):
    """Featurize mixture rows with a linear blend of the two solvents' descriptors.

    For each row, the descriptor rows of 'SOLVENT A NAME' and 'SOLVENT B NAME'
    are looked up in the module-level ``spange`` table and mixed as
    (1 - p) * A + p * B, where p is 'SolventB%' divided by 100. The result is
    the horizontal stack of ``arrhenius_feats(df)``, p, and the blend.
    """
    frac_b = df['SolventB%'].to_numpy(dtype=float)[:, None] / 100.0
    desc_a = spange.loc[df['SOLVENT A NAME']].to_numpy()
    desc_b = spange.loc[df['SOLVENT B NAME']].to_numpy()
    blended = (1-frac_b)*desc_a + frac_b*desc_b
    return np.hstack([arrhenius_feats(df), frac_b, blended])


def full_feats_pairwise(df):
    """Featurize mixture rows with explicit pairwise solvent terms.

    Looks up the descriptor rows of both solvents in the module-level
    ``spange`` table and returns the horizontal stack of
    ``arrhenius_feats(df)`` followed by: A, B, p, p * (1 - p), A - B and A * B,
    where p is 'SolventB%' divided by 100.
    """
    frac_b = df['SolventB%'].to_numpy(dtype=float)[:, None] / 100.0
    desc_a = spange.loc[df['SOLVENT A NAME']].to_numpy()
    desc_b = spange.loc[df['SOLVENT B NAME']].to_numpy()
    pair_terms = [
        desc_a,
        desc_b,
        frac_b,
        frac_b*(1-frac_b),   # zero for a pure solvent, largest at a 50/50 mix
        desc_a - desc_b,
        desc_a * desc_b,
    ]
    return np.hstack([arrhenius_feats(df), *pair_terms])

# Three target columns, plus one group label per row built from the solvent
# pair so that GroupKFold never splits a pair across train and test.
Y = full[['Product 2','Product 3','SM']].to_numpy()
ramps = full['SOLVENT A NAME'].astype(str).str.cat(full['SOLVENT B NAME'].astype(str), sep='_')

# Compare both featurizations under the same grouped CV and the same model.
for name, featurizer in (('blended', full_feats_blended), ('pairwise', full_feats_pairwise)):
    gkf = GroupKFold(n_splits=13)
    preds = np.zeros_like(Y)
    for tr, te in gkf.split(full, Y, groups=ramps):
        # Standardize with statistics from the training fold only.
        s = StandardScaler()
        Xtr = s.fit_transform(featurizer(full.iloc[tr]))
        Xte = s.transform(featurizer(full.iloc[te]))
        # Simple baseline model to isolate feature signal; each target is
        # fitted independently.
        model = Ridge(alpha=1.0)
        for k in range(Y.shape[1]):
            preds[te, k] = model.fit(Xtr, Y[tr, k]).predict(Xte)
    # Mean squared error over all rows and all targets.
    mse = np.mean((Y - preds)**2)
    print(name, 'full-data Ridge CV MSE:', mse)


from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

# Solvent descriptor lookup: read the CSV, then key the rows by the
# 'SOLVENT NAME' column so descriptors can be fetched with `spange.loc[...]`.
_spange_raw = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv')
spange = _spange_raw.set_index('SOLVENT NAME')


def arrhenius_feats(df):
    """Return kinetics-style features for each row of ``df``.

    Uses the 'Temperature' and 'Residence Time' columns. The output has one
    row per input row and five columns: t, T, 1 / (T + 273.15), ln(t) and
    ln(t) * 1 / (T + 273.15). The time is floored at 1e-6 inside the logarithm.
    """
    temp = np.asarray(df['Temperature'].values, dtype=float)
    time = np.asarray(df['Residence Time'].values, dtype=float)
    # 273.15 offset: presumably converts Celsius to Kelvin — verify against data
    inv_abs_temp = 1.0 / (temp + 273.15)
    ln_time = np.log(np.clip(time, 1e-6, None))
    cross = inv_abs_temp * ln_time
    return np.stack([time, temp, inv_abs_temp, ln_time, cross], axis=1)


def full_feats_blended(df):
    """Linear-blend featurization of solvent mixtures.

    p is 'SolventB%' / 100 as a column vector; A and B are the rows of the
    module-level ``spange`` table for 'SOLVENT A NAME' and 'SOLVENT B NAME'.
    Returns [arrhenius_feats(df) | p | (1 - p) * A + p * B] stacked column-wise.
    """
    pct_b = df['SolventB%'].values.astype(float)
    p = (pct_b / 100.0).reshape(-1, 1)
    solvent_a = spange.loc[df['SOLVENT A NAME']].values
    solvent_b = spange.loc[df['SOLVENT B NAME']].values
    mixture = (1-p)*solvent_a + p*solvent_b
    blocks = (arrhenius_feats(df), p, mixture)
    return np.concatenate(blocks, axis=1)


def full_feats_pairwise(df):
    """Pairwise-interaction featurization of solvent mixtures.

    p is 'SolventB%' / 100 as a column vector; A and B are the rows of the
    module-level ``spange`` table for the two solvents. Returns
    [arrhenius_feats(df) | A | B | p | p * (1 - p) | A - B | A * B]
    stacked column-wise.
    """
    pct_b = df['SolventB%'].values.astype(float)
    p = (pct_b / 100.0).reshape(-1, 1)
    solvent_a = spange.loc[df['SOLVENT A NAME']].values
    solvent_b = spange.loc[df['SOLVENT B NAME']].values
    difference = solvent_a - solvent_b
    product = solvent_a * solvent_b
    blocks = (arrhenius_feats(df), solvent_a, solvent_b, p, p*(1-p), difference, product)
    return np.concatenate(blocks, axis=1)

# Targets (three columns) and the per-row group label: rows that share the same
# (solvent A, solvent B) pair always land in the same fold.
target_columns = ['Product 2','Product 3','SM']
Y = full[target_columns].values
ramps = full['SOLVENT A NAME'].astype(str) + '_' + full['SOLVENT B NAME'].astype(str)

featurizers = {'blended': full_feats_blended, 'pairwise': full_feats_pairwise}
for name, featurizer in featurizers.items():
    gkf = GroupKFold(n_splits=13)
    preds = np.zeros_like(Y)
    for tr, te in gkf.split(full, Y, groups=ramps):
        Xtr = featurizer(full.iloc[tr])
        Xte = featurizer(full.iloc[te])
        # Scaling statistics come from the training fold only.
        s = StandardScaler().fit(Xtr)
        Xtr, Xte = s.transform(Xtr), s.transform(Xte)
        # Simple baseline model to isolate feature signal
        model = Ridge(alpha=1.0)
        for k in range(len(target_columns)):
            model.fit(Xtr, Y[tr, k])
            preds[te, k] = model.predict(Xte)
    # Average squared error across every row and target.
    squared_error = (Y - preds)**2
    mse = squared_error.mean()
    print(name, 'full-data Ridge CV MSE:', mse)


In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.isotonic import IsotonicRegression
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

# AD correction for FULL data only, using blended featurization + Ridge.

def solventpair_vector(df):
    """Describe each row's solvent pair and composition as one vector.

    p is 'SolventB%' / 100 as a column vector; A and B are the rows of the
    module-level ``spange`` table for 'SOLVENT A NAME' and 'SOLVENT B NAME'.
    Returns [A | B | p | p * (1 - p) | A - B | A * B] stacked column-wise.
    Temperature and residence time are not part of the vector.
    """
    frac_b = df['SolventB%'].to_numpy(dtype=float)[:, None] / 100.0
    desc_a = spange.loc[df['SOLVENT A NAME']].to_numpy()
    desc_b = spange.loc[df['SOLVENT B NAME']].to_numpy()
    parts = [desc_a, desc_b, frac_b, frac_b*(1-frac_b), desc_a - desc_b, desc_a * desc_b]
    return np.hstack(parts)

# Targets and group labels. Every row of one (solvent A, solvent B) pair shares
# a label, so a pair ("ramp") is never split between train and test.
Y = full[['Product 2','Product 3','SM']].values
ramps = full['SOLVENT A NAME'].astype(str) + '_' + full['SOLVENT B NAME'].astype(str)

gkf = GroupKFold(n_splits=13)
# Explicit float buffers: zeros_like(Y) would inherit Y's dtype and silently
# truncate predictions if the target columns were ever parsed as integers.
preds_base = np.zeros(Y.shape, dtype=float)
preds_ad = np.zeros(Y.shape, dtype=float)

for tr, te in gkf.split(full, Y, groups=ramps):
    train_df = full.iloc[tr].reset_index(drop=True)
    test_df = full.iloc[te].reset_index(drop=True)
    y_train = Y[tr]

    # Base model: one Ridge per target on standardized blended features.
    Xtr = full_feats_blended(train_df)
    Xte = full_feats_blended(test_df)
    scaler = StandardScaler().fit(Xtr)
    Xtr_s = scaler.transform(Xtr); Xte_s = scaler.transform(Xte)

    for k in range(3):
        m = Ridge(alpha=1.0)
        m.fit(Xtr_s, y_train[:, k])
        preds_base[te, k] = m.predict(Xte_s)

    # Solvent-pair vectors, standardized with outer-train statistics. The same
    # scaling is reused for the inner folds so that train-side and test-side
    # distances live in one metric (features only; no labels are involved).
    Vtr = solventpair_vector(train_df)
    Vte = solventpair_vector(test_df)
    Vsc = StandardScaler().fit(Vtr)
    Vtr_s = Vsc.transform(Vtr); Vte_s = Vsc.transform(Vte)

    # Test side: mean distance to, and mean label of, the nearest train rows.
    # The test ramp is absent from train, so these are cross-ramp quantities.
    nn = NearestNeighbors(n_neighbors=min(10, len(Vtr_s))).fit(Vtr_s)
    d_te, nbr_te = nn.kneighbors(Vte_s)
    d_te = d_te.mean(axis=1)
    baseline_te = y_train[nbr_te].mean(axis=1)

    # Inner CV to learn alpha(d)
    train_groups = train_df['SOLVENT A NAME'].astype(str) + '_' + train_df['SOLVENT B NAME'].astype(str)
    n_splits_inner = min(5, len(train_groups.unique()))
    inner = GroupKFold(n_splits=n_splits_inner)

    oof_pred = np.zeros((len(train_df), 3))
    d_tr = np.zeros(len(train_df))
    baseline_tr = np.zeros((len(train_df), 3))
    for tr2, va2 in inner.split(train_df, y_train, groups=train_groups):
        Xtr2 = full_feats_blended(train_df.iloc[tr2])
        Xva2 = full_feats_blended(train_df.iloc[va2])
        sc2 = StandardScaler().fit(Xtr2)
        Xtr2_s = sc2.transform(Xtr2); Xva2_s = sc2.transform(Xva2)
        for k in range(3):
            m2 = Ridge(alpha=1.0)
            m2.fit(Xtr2_s, y_train[tr2, k])
            oof_pred[va2, k] = m2.predict(Xva2_s)

        # Cross-fitted distances and kNN baselines: neighbours are searched in
        # the inner-train rows only, so a validation row never sees its own
        # ramp (or itself). Searching all train rows instead would return
        # same-ramp neighbours at near-zero distance with near-identical
        # labels, which never happens for a held-out test ramp; alpha(d) would
        # then be fitted on the wrong distance range with a leaked baseline.
        nn2 = NearestNeighbors(n_neighbors=min(10, len(tr2))).fit(Vtr_s[tr2])
        d_va, nbr_va = nn2.kneighbors(Vtr_s[va2])
        d_tr[va2] = d_va.mean(axis=1)
        # nbr_va indexes rows of the inner-train subset, hence y_train[tr2].
        baseline_tr[va2] = y_train[tr2][nbr_va].mean(axis=1)

    # Optimal alpha per train sample and target: the least-squares weight that
    # moves the OOF prediction toward the kNN baseline, clipped to [0, 1] and
    # then averaged over the three targets.
    eps = 1e-12  # keeps the ratio finite when baseline and prediction coincide
    bp = baseline_tr - oof_pred
    num = (y_train - oof_pred)*bp
    den = (bp*bp) + eps
    a_opt = np.clip(num/den, 0, 1)
    a_opt_scalar = a_opt.mean(axis=1)

    # Monotone map distance -> shrinkage weight; test distances outside the
    # fitted range are clipped to the boundary values.
    iso = IsotonicRegression(y_min=0, y_max=1, increasing=True, out_of_bounds='clip')
    iso.fit(d_tr, a_opt_scalar)
    a_te = iso.predict(d_te)

    # Blend: far-from-train rows lean on the kNN baseline, near rows on Ridge.
    preds_ad[te,:] = (1-a_te)[:,None]*preds_base[te,:] + a_te[:,None]*baseline_te

mse_base = ((Y - preds_base)**2).mean()
mse_ad = ((Y - preds_ad)**2).mean()
print('FULL only: base Ridge blended MSE:', mse_base)
print('FULL only: AD-corrected MSE:', mse_ad)
print('delta:', mse_ad - mse_base)
