# Evolver Loop 71 Analysis

Goals:
1. Verify *template-compliant* split generation vs current exp_030 control scaffold.
2. Confirm mixture flip/TTA bug and implement a correct reference implementation.
3. Identify minimal-delta changes that can reduce OOD intercept (AD shrinkage / conservative blending).

In [None]:
import pandas as pd, numpy as np
from pathlib import Path
DATA_PATH = Path('/home/data')

def load_single():
    """Load the single-solvent yields table and split it into inputs/targets.

    Reads ``catechol_single_solvent_yields.csv`` from ``DATA_PATH``.

    Returns:
        A ``(X, Y)`` tuple of DataFrames: ``X`` holds the residence time,
        temperature and solvent-name columns, ``Y`` holds the three yield
        columns ("Product 2", "Product 3", "SM").
    """
    feature_cols = ["Residence Time", "Temperature", "SOLVENT NAME"]
    target_cols = ["Product 2", "Product 3", "SM"]
    table = pd.read_csv(DATA_PATH / 'catechol_single_solvent_yields.csv')
    return table[feature_cols], table[target_cols]

def load_full():
    """Load the full (solvent-mixture) yields table and split it into inputs/targets.

    Reads ``catechol_full_data_yields.csv`` from ``DATA_PATH``.

    Returns:
        A ``(X, Y)`` tuple of DataFrames: ``X`` holds the residence time,
        temperature, both solvent-name columns and "SolventB%"; ``Y`` holds
        the three yield columns ("Product 2", "Product 3", "SM").
    """
    feature_cols = [
        "Residence Time",
        "Temperature",
        "SOLVENT A NAME",
        "SOLVENT B NAME",
        "SolventB%",
    ]
    target_cols = ["Product 2", "Product 3", "SM"]
    table = pd.read_csv(DATA_PATH / 'catechol_full_data_yields.csv')
    return table[feature_cols], table[target_cols]

# Load both datasets once; the later cells reuse X_f / Y_f.
X_s, Y_s = load_single()
X_f, Y_f = load_full()
# Cell output: (single-solvent rows, full-data rows,
#               distinct single solvents, distinct (A, B) solvent pairs).
(
    len(X_s),
    len(X_f),
    X_s['SOLVENT NAME'].nunique(),
    len(X_f[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()),
)

In [None]:
# Re-implement two ramp split generators:
# A) Template-ish: sort ramps, and use `.any(axis=1)` mask on inequality
# B) Control-ish: drop_duplicates without sorting, and use `~((A==... ) & (B==...))`

def ramps_template(X):
    """Return the distinct (solvent A, solvent B) pairs of ``X`` in sorted order.

    Args:
        X: DataFrame with "SOLVENT A NAME" and "SOLVENT B NAME" columns.

    Returns:
        A two-column DataFrame of unique pairs, sorted by solvent A name and
        then solvent B name, with a fresh 0..n-1 index.
    """
    pair_cols = ["SOLVENT A NAME", "SOLVENT B NAME"]
    unique_pairs = X[pair_cols].drop_duplicates()
    ordered_pairs = unique_pairs.sort_values(pair_cols)
    return ordered_pairs.reset_index(drop=True)

def splits_template(X,Y):
    """Yield one boolean training mask per ramp, in ``ramps_template`` order.

    For each held-out solvent pair, a row of ``X`` is marked True (kept for
    training) when at least one of its two solvent names differs from that
    pair. This is the ``.any(axis=1)``-on-inequality formulation.

    Args:
        X: DataFrame with "SOLVENT A NAME" and "SOLVENT B NAME" columns.
        Y: Accepted for signature parity; not used here.

    Yields:
        A boolean Series aligned with ``X``'s index.
    """
    pair_cols = ["SOLVENT A NAME", "SOLVENT B NAME"]
    for _, held_out in ramps_template(X).iterrows():
        # Element-wise inequality against the held-out (A, B) pair.
        differs = X[pair_cols] != held_out.values
        yield differs.any(axis=1)

def ramps_control(X):
    """Return the distinct (solvent A, solvent B) pairs of ``X`` without sorting.

    Args:
        X: DataFrame with "SOLVENT A NAME" and "SOLVENT B NAME" columns.

    Returns:
        A two-column DataFrame of unique pairs in the order left by
        ``drop_duplicates`` (no explicit sort), with a fresh 0..n-1 index.
    """
    pair_cols = ["SOLVENT A NAME", "SOLVENT B NAME"]
    unique_pairs = X[pair_cols].drop_duplicates()
    return unique_pairs.reset_index(drop=True)

def splits_control(X,Y):
    """Yield one boolean training mask per ramp, in ``ramps_control`` order.

    For each held-out solvent pair, a row of ``X`` is marked True (kept for
    training) unless both its solvent A and solvent B names equal that pair.
    This is the ``~((A == ...) & (B == ...))`` formulation.

    Args:
        X: DataFrame with "SOLVENT A NAME" and "SOLVENT B NAME" columns.
        Y: Accepted for signature parity; not used here.

    Yields:
        A boolean Series aligned with ``X``'s index.
    """
    for _, held_out in ramps_control(X).iterrows():
        same_a = X["SOLVENT A NAME"] == held_out["SOLVENT A NAME"]
        same_b = X["SOLVENT B NAME"] == held_out["SOLVENT B NAME"]
        # Rows matching the held-out pair on both names are the test fold.
        is_held_out = same_a & same_b
        yield ~is_held_out

# Build both ramp tables: template (sorted) and control (unsorted).
ramps_A = ramps_template(X_f)
ramps_B = ramps_control(X_f)
print('template ramps head:\n', ramps_A.head())
print('control ramps head:\n', ramps_B.head())

# Do the two generators enumerate the ramps in the same order?
same_order = ramps_A.equals(ramps_B)
print('same ramp ordering?', same_order)

# Materialise the per-fold training masks produced by each generator.
masks_A = [*splits_template(X_f, Y_f)]
masks_B = [*splits_control(X_f, Y_f)]
print('num folds A,B', len(masks_A), len(masks_B))

# Map every control ramp (as a tuple) to its fold position, then walk the
# template ramps and compare both the fold position and the mask membership.
ramp_to_idx_B = {
    tuple(r.values): idx for idx, (_, r) in enumerate(ramps_B.iterrows())
}
order_mismatch = 0
membership_mismatch = 0
for i, ramp in enumerate(tuple(r.values) for _, r in ramps_A.iterrows()):
    j = ramp_to_idx_B[ramp]
    # Same ramp sitting at a different fold index in the two orderings.
    order_mismatch += int(i != j)
    # Same ramp but a different set of training rows.
    same_members = np.array_equal(masks_A[i].values, masks_B[j].values)
    membership_mismatch += int(not same_members)

print('fold order mismatches:', order_mismatch)
print('membership mismatches (should be 0):', membership_mismatch)

In [None]:
# Sanity check: the two mask formulations should agree on the first template
# ramp (ramps_A.iloc[0]).
row = ramps_A.iloc[0]
solvent_pair = row.values

# Formulation 1: keep a row when either solvent name differs from the pair.
mask_any = (X_f[["SOLVENT A NAME", "SOLVENT B NAME"]] != solvent_pair).any(axis=1)

# Formulation 2: keep a row unless both solvent names equal the pair.
mask_not_and = ~(
    (X_f["SOLVENT A NAME"] == row["SOLVENT A NAME"])
    & (X_f["SOLVENT B NAME"] == row["SOLVENT B NAME"])
)

print('equivalent?', np.array_equal(mask_any.values, mask_not_and.values))
print('diff count', (mask_any.values != mask_not_and.values).sum())

In [None]:
# Investigate the mixture flip bug on a toy example using the spange descriptors.
# The lookup table is indexed by its first CSV column.
SPANGE = pd.read_csv(
    DATA_PATH.joinpath('spange_descriptors_lookup.csv'),
    index_col=0,
)

def blend_linear(A,B,pctB):
    """Linearly interpolate the SPANGE descriptor rows of two solvents.

    Args:
        A: Index label of the first solvent in ``SPANGE``.
        B: Index label of the second solvent in ``SPANGE``.
        pctB: Weight given to solvent B; solvent A gets ``1 - pctB``.

    Returns:
        ``(1 - pctB) * SPANGE[A] + pctB * SPANGE[B]`` as an array.
    """
    desc_a = SPANGE.loc[A].values
    desc_b = SPANGE.loc[B].values
    weight_a = 1 - pctB
    return weight_a * desc_a + pctB * desc_b

# Intended flip: swap A and B, and replace pct with 1 - pct.
# The buggy flip appears to produce the same result as the non-flipped blend.

def blend_buggy_flip(A,B,pctB):
    """Reproduce the suspected buggy "flip" blend for comparison.

    The weights are applied exactly as in the non-flipped blend (``pctB`` on
    solvent B, ``1 - pctB`` on solvent A); only the order of the two terms in
    the sum is swapped.

    Args:
        A: Index label of the first solvent in ``SPANGE``.
        B: Index label of the second solvent in ``SPANGE``.
        pctB: Weight given to solvent B.

    Returns:
        ``SPANGE[B] * pctB + SPANGE[A] * (1 - pctB)`` as an array.
    """
    term_b = SPANGE.loc[B].values * pctB
    term_a = SPANGE.loc[A].values * (1 - pctB)
    return term_b + term_a

def blend_true_flip(A,B,pctB):
    """Blend with the intended flip: swap the solvents and use ``1 - pctB``.

    Delegates to ``blend_linear`` with the two solvent labels exchanged and the
    complementary fraction as the weight of the (new) second solvent.

    Args:
        A: Index label of the first solvent in ``SPANGE``.
        B: Index label of the second solvent in ``SPANGE``.
        pctB: Weight given to solvent B in the un-flipped description.

    Returns:
        The result of ``blend_linear(B, A, 1 - pctB)``.
    """
    flipped_fraction = 1 - pctB
    return blend_linear(B, A, flipped_fraction)

# Use the first two solvents in the lookup table as the toy pair.
A = SPANGE.index[0]
B = SPANGE.index[1]
# NOTE: for a purely linear blend, blend_linear(B, A, 1 - p) expands to
# p * SPANGE[B] + (1 - p) * SPANGE[A], i.e. the same expression as the
# non-flipped blend, so all three variants should agree up to float rounding.
for pctB in (0.1, 0.3, 0.7, 0.9):
    x = blend_linear(A, B, pctB)
    xb = blend_buggy_flip(A, B, pctB)
    xt = blend_true_flip(A, B, pctB)
    print(
        pctB,
        'buggy same as nonflip?', np.allclose(x, xb),
        'trueflip same as nonflip?', np.allclose(x, xt),
        'max|x-true|', np.abs(x - xt).max(),
    )