# Evolver Loop 11 Analysis

## Key Findings:
1. exp_010 (safe ensemble) has CV=70.365 but NOT YET SUBMITTED
2. 4/6 submissions FAILED Kaggle validation (67% failure rate)
3. External SmartManoj data downloaded - need to compare per-N scores
4. Top kernel uses C++ optimizer with fractional translation

## Priority: Submit exp_010 to validate the "safe" approach

In [1]:
import pandas as pd
import numpy as np
import os
import glob
from numba import njit
import math

# Load baseline and exp_010
baseline_path = '/home/code/experiments/001_valid_baseline/submission.csv'
exp010_path = '/home/code/experiments/010_safe_ensemble/submission.csv'
external_path = '/home/code/external_smartmanoj.csv'

print('Loading submissions...')
baseline = pd.read_csv(baseline_path)
exp010 = pd.read_csv(exp010_path)
external = pd.read_csv(external_path)

print(f'Baseline rows: {len(baseline)}')
print(f'Exp010 rows: {len(exp010)}')
print(f'External rows: {len(external)}')

# Check for 's' prefix
print(f"\nBaseline x sample: {baseline['x'].iloc[0]}")
print(f"Exp010 x sample: {exp010['x'].iloc[0]}")
print(f"External x sample: {external['x'].iloc[0]}")

Loading submissions...
Baseline rows: 20100
Exp010 rows: 20100
External rows: 20100

Baseline x sample: s-48.196086194214245779
Exp010 x sample: s-48.19608619421424577922
External x sample: s-48.19608619421424578


In [2]:
# Score calculation
@njit
def make_polygon_template():
    tw=0.15; th=0.2; bw=0.7; mw=0.4; ow=0.25
    tip=0.8; t1=0.5; t2=0.25; base=0.0; tbot=-th
    x=np.array([0,ow/2,ow/4,mw/2,mw/4,bw/2,tw/2,tw/2,-tw/2,-tw/2,-bw/2,-mw/4,-mw/2,-ow/4,-ow/2],np.float64)
    y=np.array([tip,t1,t1,t2,t2,base,base,tbot,tbot,base,base,t2,t2,t1,t1],np.float64)
    return x,y

@njit
def score_group(xs,ys,degs,tx,ty):
    n=xs.size; V=tx.size
    mnx=1e300; mny=1e300; mxx=-1e300; mxy=-1e300
    for i in range(n):
        r=degs[i]*math.pi/180.0
        c=math.cos(r); s=math.sin(r)
        xi=xs[i]; yi=ys[i]
        for j in range(V):
            X=c*tx[j]-s*ty[j]+xi
            Y=s*tx[j]+c*ty[j]+yi
            if X<mnx: mnx=X
            if X>mxx: mxx=X
            if Y<mny: mny=Y
            if Y>mxy: mxy=Y
    side=max(mxx-mnx,mxy-mny)
    return side*side/ n

def strip(a):
    return np.array([float(str(v).replace('s','')) for v in a],np.float64)

tx, ty = make_polygon_template()

def compute_per_n_scores(df):
    df = df.copy()
    df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
    scores = {}
    for n, g in df.groupby('N'):
        xs = strip(g['x'].to_numpy())
        ys = strip(g['y'].to_numpy())
        ds = strip(g['deg'].to_numpy())
        scores[n] = score_group(xs, ys, ds, tx, ty)
    return scores

print('Computing per-N scores...')
baseline_scores = compute_per_n_scores(baseline)
exp010_scores = compute_per_n_scores(exp010)
external_scores = compute_per_n_scores(external)

print(f'Baseline total: {sum(baseline_scores.values()):.6f}')
print(f'Exp010 total: {sum(exp010_scores.values()):.6f}')
print(f'External total: {sum(external_scores.values()):.6f}')

Computing per-N scores...


Baseline total: 70.615102
Exp010 total: 70.365091
External total: 70.743774


In [3]:
# Compare per-N: Where is external BETTER than exp010?
print('\n=== External vs Exp010 (where external is better) ===')
external_better = []
for n in range(1, 201):
    diff = exp010_scores[n] - external_scores[n]
    if diff > 1e-6:  # External is better
        external_better.append((n, diff, exp010_scores[n], external_scores[n]))
        if diff > 0.001:
            print(f'N={n}: exp010={exp010_scores[n]:.6f}, external={external_scores[n]:.6f}, diff={diff:.6f}')

print(f'\nTotal N values where external is better: {len(external_better)}')
print(f'Total potential improvement: {sum(d[1] for d in external_better):.6f}')

# Where is exp010 BETTER than external?
exp010_better = []
for n in range(1, 201):
    diff = external_scores[n] - exp010_scores[n]
    if diff > 1e-6:  # Exp010 is better
        exp010_better.append((n, diff))

print(f'\nTotal N values where exp010 is better: {len(exp010_better)}')
print(f'Total exp010 advantage: {sum(d[1] for d in exp010_better):.6f}')


=== External vs Exp010 (where external is better) ===

Total N values where external is better: 0
Total potential improvement: 0.000000

Total N values where exp010 is better: 156
Total exp010 advantage: 0.378677


In [4]:
# Create best-of-both ensemble
print('\n=== Creating best-of-both ensemble ===')

best_scores = {}
best_source = {}
for n in range(1, 201):
    if exp010_scores[n] <= external_scores[n]:
        best_scores[n] = exp010_scores[n]
        best_source[n] = 'exp010'
    else:
        best_scores[n] = external_scores[n]
        best_source[n] = 'external'

print(f'Best ensemble total: {sum(best_scores.values()):.6f}')
print(f'From exp010: {sum(1 for s in best_source.values() if s == "exp010")}')
print(f'From external: {sum(1 for s in best_source.values() if s == "external")}')

# Improvement over exp010
improvement = sum(exp010_scores.values()) - sum(best_scores.values())
print(f'\nImprovement over exp010: {improvement:.6f}')


=== Creating best-of-both ensemble ===
Best ensemble total: 70.365091
From exp010: 200
From external: 0

Improvement over exp010: 0.000000


In [5]:
# Check what N values have the biggest improvements from external
print('\n=== Top 20 improvements from external ===')
external_better_sorted = sorted(external_better, key=lambda x: x[1], reverse=True)[:20]
for n, diff, exp_score, ext_score in external_better_sorted:
    print(f'N={n}: improvement={diff:.6f} (exp010={exp_score:.6f}, external={ext_score:.6f})')


=== Top 20 improvements from external ===


In [6]:
# Check all snapshot files to see if we can find even better solutions
import os
import glob

snapshot_dir = '/home/nonroot/snapshots/'
snapshot_files = glob.glob(f'{snapshot_dir}/**/*.csv', recursive=True)
print(f'Total snapshot files: {len(snapshot_files)}')

# Sample a few to see their scores
if snapshot_files:
    for f in snapshot_files[:5]:
        print(f'\\n{f}')
        try:
            df = pd.read_csv(f)
            scores = compute_per_n_scores(df)
            print(f'  Total score: {sum(scores.values()):.6f}')
        except Exception as e:
            print(f'  Error: {e}')

Total snapshot files: 3776
\n/home/nonroot/snapshots/santa-2025/21116303805/code/submission.csv
  Total score: 70.676102
\n/home/nonroot/snapshots/santa-2025/21116303805/code/experiments/004_sa_v1_parallel/submission_best.csv
  Total score: 70.676102
\n/home/nonroot/snapshots/santa-2025/21116303805/code/experiments/004_sa_v1_parallel/submission_v18.csv
  Total score: 70.676102
\n/home/nonroot/snapshots/santa-2025/21116303805/code/experiments/005_backward_propagation/submission.csv


  Total score: 70.676102
\n/home/nonroot/snapshots/santa-2025/21116303805/code/experiments/005_backward_propagation/submission_v21.csv
  Total score: 70.676102


In [7]:
# The key insight: exp_010 already has the best ensemble from 3700+ snapshots\n# Score: 70.365 vs target 68.88 - gap of 1.49 points\n\n# Let's check what the theoretical minimum might be\n# N=1 optimal is 0.6612 (45 degree rotation)\n# For larger N, the minimum is harder to compute\n\nprint('=== Per-N score analysis ===')\nprint('\\nTop 20 N values by score contribution:')\nsorted_scores = sorted(exp010_scores.items(), key=lambda x: x[1], reverse=True)[:20]\nfor n, score in sorted_scores:\n    print(f'N={n}: {score:.6f}')\n\nprint('\\n\\nBottom 20 N values by score contribution:')\nsorted_scores = sorted(exp010_scores.items(), key=lambda x: x[1])[:20]\nfor n, score in sorted_scores:\n    print(f'N={n}: {score:.6f}')\n\nprint(f'\\n\\nN=1 score: {exp010_scores[1]:.6f} (optimal is 0.6612)')\nprint(f'N=2 score: {exp010_scores[2]:.6f}')\nprint(f'N=3 score: {exp010_scores[3]:.6f}')