# Loop 96 Analysis: Strategic Assessment

## Key Questions:
1. Why have all 96 experiments fallen on the same CV-LB line?
2. What fundamentally different approaches haven't been tried?
3. What does the benchmark paper's success (MSE 0.0039) tell us?

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

# Load session state (JSON document; read explicitly as UTF-8)
with open('/home/code/session_state.json', 'r', encoding='utf-8') as f:
    state = json.load(f)

# Extract submission data (an absent 'submissions' key yields an empty list)
submissions = state.get('submissions', [])
print(f'Total submissions: {len(submissions)}')

# Filtering rules for the CV-LB fit, named so they are easy to find and tune.
MAX_VALID_LB = 0.15                 # submissions with LB at or above this are dropped
EXCLUDED_EXPERIMENTS = ('exp_073',)  # manually excluded outlier

# Get valid CV-LB pairs as (cv, lb, experiment_id) tuples
cv_lb_pairs = []
for sub in submissions:
    cv = sub.get('cv_score')
    lb = sub.get('lb_score')
    exp_id = sub.get('experiment_id', 'unknown')
    if cv is None or lb is None:
        continue
    try:
        cv_float = float(cv)
        lb_float = float(lb)
    except (TypeError, ValueError):
        # Score is not convertible to a number: skip this submission.
        # Only conversion failures are swallowed, so genuine errors
        # (and Ctrl-C) still surface instead of being silently ignored.
        continue
    if lb_float < MAX_VALID_LB and exp_id not in EXCLUDED_EXPERIMENTS:
        cv_lb_pairs.append((cv_float, lb_float, exp_id))

print(f'Valid CV-LB pairs: {len(cv_lb_pairs)}')
# List the pairs in ascending CV order
for cv, lb, exp_id in sorted(cv_lb_pairs, key=lambda x: x[0]):
    print(f'  {exp_id}: CV={cv:.6f}, LB={lb:.6f}')

In [None]:
# Fit linear regression: LB = slope * CV + intercept over the valid pairs,
# report the fit quality, plot it, and compare the intercept to the target.
TARGET_LB = 0.0347  # leaderboard score we are trying to reach

cv_arr = np.array([x[0] for x in cv_lb_pairs])
lb_arr = np.array([x[1] for x in cv_lb_pairs])

# A straight-line fit needs at least two points; fail with a clear message
# instead of an opaque polyfit error or a meaningless single-point "fit".
if cv_arr.size < 2:
    raise ValueError(
        f'Need at least 2 valid CV-LB pairs to fit a line, got {cv_arr.size}'
    )

slope, intercept = np.polyfit(cv_arr, lb_arr, 1)
pred = slope * cv_arr + intercept
ss_res = np.sum((lb_arr - pred) ** 2)
ss_tot = np.sum((lb_arr - np.mean(lb_arr)) ** 2)
# R^2 is undefined when every LB value is identical (ss_tot == 0)
r_squared = 1 - (ss_res / ss_tot) if ss_tot > 0 else float('nan')
# CV at which the fitted line crosses the target; undefined for a flat line
required_cv = (TARGET_LB - intercept) / slope if slope != 0 else float('nan')

print('=== CV-LB RELATIONSHIP ===')
print(f'Linear fit: LB = {slope:.4f} * CV + {intercept:.6f}')
print(f'R-squared: {r_squared:.4f}')
print(f'Intercept: {intercept:.6f}')
print(f'Target LB: {TARGET_LB}')
print(f'Required CV for target: {required_cv:.6f}')

# Plot the submissions, the fitted line (drawn from CV=0 to 10% past the
# largest observed CV), and horizontal guides for the target and intercept.
x_max = cv_arr.max() * 1.1
plt.figure(figsize=(10, 6))
plt.scatter(cv_arr, lb_arr, c='blue', alpha=0.7, label='Submissions')
plt.plot([0, x_max], [intercept, slope * x_max + intercept], 'r--', label=f'Fit: LB = {slope:.2f}*CV + {intercept:.4f}')
plt.axhline(y=TARGET_LB, color='green', linestyle=':', label=f'Target LB = {TARGET_LB}')
plt.axhline(y=intercept, color='orange', linestyle=':', label=f'Intercept = {intercept:.4f}')
plt.xlabel('CV Score')
plt.ylabel('LB Score')
# Title reports the number of points actually plotted, not a fixed count
plt.title(f'CV vs LB Relationship ({len(cv_arr)} submissions)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig('/home/code/exploration/cv_lb_relationship.png', dpi=150, bbox_inches='tight')
plt.show()

# Only claim the target is unreachable when the fit actually says so
print(f'\n=== CRITICAL INSIGHT ===')
if intercept > TARGET_LB:
    print(f'Intercept ({intercept:.4f}) > Target ({TARGET_LB})')
    print(f'This means even with CV=0, expected LB would be {intercept:.4f}')
    print(f'The target is UNREACHABLE with current approaches!')
else:
    print(f'Intercept ({intercept:.4f}) <= Target ({TARGET_LB})')
    print(f'The fit does not rule out reaching the target (required CV: {required_cv:.6f})')

In [None]:
# Analyze experiment types
experiments = state.get('experiments', [])
print(f'Total experiments logged: {len(experiments)}')

# Group experiments by their 'model_type' field ('unknown' when the key is absent)
model_types = {}
for exp in experiments:
    model_types.setdefault(exp.get('model_type', 'unknown'), []).append(exp)

print('\n=== EXPERIMENTS BY MODEL TYPE ===')
# Largest groups first
for mt, exps in sorted(model_types.items(), key=lambda x: -len(x[1])):
    # Test against None rather than truthiness so that a legitimate score of
    # 0.0 is not silently discarded along with the missing ones.
    scores = [e['score'] for e in exps if e.get('score') is not None]
    if scores:
        # The smallest score is reported as the best CV
        print(f'{mt}: {len(exps)} experiments, best CV: {min(scores):.6f}')
    else:
        print(f'{mt}: {len(exps)} experiments')

In [None]:
# Check what approaches have been tried by counting, for each keyword, the
# experiments whose full string representation mentions it (case-insensitive).
print('=== APPROACHES TRIED ===')
approaches = [
    'MLP', 'LightGBM', 'XGBoost', 'CatBoost', 'RandomForest',
    'Ridge', 'GP', 'GNN', 'ChemBERTa', 'Transformer',
    'Ensemble', 'Stacking'
]

# Stringify and lowercase every experiment once, not once per keyword
experiment_texts = [str(exp).lower() for exp in experiments]

# NOTE: this is a plain substring match over the whole record, so short
# keywords can over-count (e.g. 'gp' or 'ridge' appearing inside other words).
for approach in approaches:
    needle = approach.lower()
    count = sum(1 for text in experiment_texts if needle in text)
    print(f'{approach}: {count} experiments')

In [None]:
# Read both yield tables from DATA_PATH and print a short summary of each:
# table shapes, distinct solvent counts, and descriptive statistics of the
# three target columns in the single-solvent table.
DATA_PATH = '/home/data'

df_single = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
df_full = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')

print('=== DATA OVERVIEW ===')
print(f'Single solvent: {df_single.shape}')
print(f'Full data: {df_full.shape}')

# Distinct solvents in the single-solvent table
single_solvent_count = df_single["SOLVENT NAME"].nunique()
print(f'\nUnique solvents in single: {single_solvent_count}')

# Distinct (A, B) solvent combinations in the full table
distinct_pairs = df_full[["SOLVENT A NAME", "SOLVENT B NAME"]].drop_duplicates()
print(f'Unique solvent pairs in full: {len(distinct_pairs)}')

# Summary statistics for the target columns
target_columns = ['Product 2', 'Product 3', 'SM']
print(f'\nTarget statistics (single):')
print(df_single[target_columns].describe())

In [None]:
# The key insight: Why does the intercept exist?
# The intercept represents the STRUCTURAL GAP between CV and LB
# This gap exists because:
# 1. CV uses Leave-One-Solvent-Out, but test solvents are "harder"
# 2. The model extrapolates poorly to unseen solvents
# 3. Tabular features don't capture the full molecular structure

print('=== ROOT CAUSE ANALYSIS ===')
print()
# Report the intercept from the current fit rather than a pasted number,
# so this message cannot drift out of sync with the regression output.
print(f'The CV-LB intercept ({intercept:.4f}) represents DISTRIBUTION SHIFT:')
print('1. CV uses Leave-One-Solvent-Out validation')
print('2. Test solvents are structurally different from training')
print('3. Tabular models extrapolate poorly to unseen chemical space')
print()
print('The benchmark paper achieved MSE 0.0039 by:')
print('1. Using Graph Neural Networks (GATs) on molecular graphs')
print('2. Operating on MOLECULAR STRUCTURE, not tabular features')
print('3. Learning representations that generalize to unseen solvents')
print()
print('Our GNN attempts failed (CV 0.018-0.068) likely because:')
print('1. Model class mismatch in submission cells')
print('2. Missing DRFP integration with graph representation')
print('3. Missing learned mixture-aware encodings')

In [None]:
# What approaches could CHANGE the CV-LB relationship?
# The report is kept as data (one entry per output line, '' for a blank
# line) and emitted line by line.
report_lines = [
    '=== APPROACHES TO CHANGE CV-LB RELATIONSHIP ===',
    '',
    '1. PROPER GNN IMPLEMENTATION',
    '   - Use PyTorch Geometric with GATConv',
    '   - Convert SMILES to molecular graphs',
    '   - Integrate DRFP features',
    '   - VERIFY submission cells use same model class',
    '',
    '2. PRE-TRAINED MOLECULAR EMBEDDINGS',
    '   - ChemBERTa or MolBERT embeddings',
    '   - These capture chemical knowledge from large corpora',
    '   - May generalize better to unseen solvents',
    '',
    '3. EXTRAPOLATION DETECTION + CONSERVATIVE PREDICTIONS',
    '   - Detect when test sample is far from training',
    '   - Blend toward population mean for extrapolation',
    '   - This could reduce the intercept',
    '',
    '4. DOMAIN CONSTRAINTS',
    '   - Yields must sum to ~1 (mass balance)',
    '   - Arrhenius kinetics constraints',
    '   - These constraints hold even for unseen solvents',
]
for report_line in report_lines:
    print(report_line)

In [None]:
# Check the best experiment details: pick the experiment with the lowest
# numeric 'score' (first one wins on ties) and print its key fields.
best_exp = None
best_cv = float('inf')
for exp in experiments:
    try:
        score = float(exp.get('score'))
    except (TypeError, ValueError):
        # Missing, None, or non-numeric score: this experiment cannot be
        # ranked, so skip it instead of crashing on the comparison below.
        continue
    if score < best_cv:
        best_cv = score
        best_exp = exp

# Explicit None check: "no experiment found" is the only case to skip
if best_exp is not None:
    # 'notes' may be absent or None; fall back to an empty string before slicing
    notes = best_exp.get('notes') or ''
    print('=== BEST EXPERIMENT ===')
    print(f"ID: {best_exp.get('id')}")
    print(f"Name: {best_exp.get('name')}")
    print(f"Model Type: {best_exp.get('model_type')}")
    print(f"CV Score: {best_exp.get('score')}")
    print(f"Notes: {notes[:500]}...")