# Evolver Loop 3 Analysis: Understanding the Gap

## Key Findings So Far
1. Baseline (santa-2025.csv) scores 70.734327 - no overlaps
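2. Ensemble from cpp_parallel_sa scores 67.77 but has overlaps in 30 N values, so it is not a valid submission
3. Target is 68.931058 (lower is better) - the baseline must improve by about 1.80 points (70.734327 - 68.931058 = 1.803269)
4. Simulated annealing (SA) with collision constraints, started from the baseline, shows NO improvement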
5. The baseline is already at a local optimum

## New Strategy: Use External Datasets
The jonathanchan kernel shows that top solutions ensemble from MANY sources:
- bucket-of-chump dataset
- santa25-public dataset
- telegram-public dataset
- Multiple kernel outputs

Let's analyze what's available and build a proper ensemble.

In [None]:
import numpy as np
import pandas as pd
import math
from numba import njit
from shapely.geometry import Polygon
from shapely.strtree import STRtree
import os
import glob
from tqdm import tqdm

# Tree polygon template
@njit
def make_polygon_template():
    """Return the (x, y) vertex coordinates of the tree outline.

    The outline has 15 vertices. It starts at the tip (x = 0), lists the
    positive-x side from top to bottom, crosses the trunk, and then lists
    the negative-x side from bottom to top. Both arrays are float64.
    """
    trunk_w = 0.15
    trunk_h = 0.2
    base_w = 0.7
    mid_w = 0.4
    top_w = 0.25

    tip_y = 0.8
    tier1_y = 0.5
    tier2_y = 0.25
    base_y = 0.0
    trunk_bottom_y = -trunk_h

    x = np.array([
        0, top_w / 2, top_w / 4, mid_w / 2, mid_w / 4, base_w / 2,
        trunk_w / 2, trunk_w / 2, -trunk_w / 2, -trunk_w / 2,
        -base_w / 2, -mid_w / 4, -mid_w / 2, -top_w / 4, -top_w / 2,
    ], np.float64)
    y = np.array([
        tip_y, tier1_y, tier1_y, tier2_y, tier2_y, base_y,
        base_y, trunk_bottom_y, trunk_bottom_y, base_y,
        base_y, tier2_y, tier2_y, tier1_y, tier1_y,
    ], np.float64)
    return x, y

@njit
def score_group(xs, ys, degs, tx, ty):
    """Score one group of placed trees.

    Each tree i is the template (tx, ty) rotated by degs[i] degrees and
    translated to (xs[i], ys[i]). The score is the squared side of the
    smallest axis-aligned square covering every transformed vertex,
    divided by the number of trees in the group.
    """
    count = xs.size
    n_vertices = tx.size

    min_x = 1e300
    min_y = 1e300
    max_x = -1e300
    max_y = -1e300

    for k in range(count):
        theta = degs[k] * math.pi / 180.0
        cos_t = math.cos(theta)
        sin_t = math.sin(theta)
        cx = xs[k]
        cy = ys[k]
        for v in range(n_vertices):
            px = cos_t * tx[v] - sin_t * ty[v] + cx
            py = sin_t * tx[v] + cos_t * ty[v] + cy
            # Independent checks: the very first vertex must update both
            # the minimum and the maximum, so these cannot be elif chains.
            if px < min_x:
                min_x = px
            if px > max_x:
                max_x = px
            if py < min_y:
                min_y = py
            if py > max_y:
                max_y = py

    width = max_x - min_x
    height = max_y - min_y
    side = max(width, height)
    return side * side / count

def strip(a):
    """Convert 's'-prefixed values into a float64 array.

    Each element is converted to a string, every 's' character is removed,
    and the remainder is parsed as a float.
    """
    cleaned = (str(value).replace('s', '') for value in a)
    return np.array([float(text) for text in cleaned], np.float64)

# Build the tree template once; tx/ty are passed to score_group and the
# overlap helpers by the cells below.
tx, ty = make_polygon_template()
print('Template loaded')

In [None]:
# Overlap detection functions
def get_shapely_polygon(cx, cy, deg, tx, ty):
    """Return the template as a shapely Polygon placed at (cx, cy).

    The template vertices (tx, ty) are rotated by ``deg`` degrees about the
    origin and then translated by (cx, cy).
    """
    theta = deg * np.pi / 180.0
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    world_x = cos_t * tx - sin_t * ty + cx
    world_y = sin_t * tx + cos_t * ty + cy
    vertices = zip(world_x, world_y)
    return Polygon(vertices)

def has_overlap(xs, ys, degs, tx, ty):
    """Return True if any two placed trees overlap with non-zero interior.

    Args:
        xs, ys: per-tree translation coordinates.
        degs: per-tree rotation in degrees.
        tx, ty: template vertex coordinates.

    Returns:
        True as soon as one pair of polygons intersects without merely
        touching; False otherwise (including for zero or one tree).

    Note:
        The values returned by ``STRtree.query`` are used as integer
        positions into ``polygons`` (the Shapely 2.x behaviour).
    """
    n = len(xs)
    if n <= 1:
        return False

    polygons = [
        get_shapely_polygon(x, y, deg, tx, ty)
        for x, y, deg in zip(xs, ys, degs)
    ]
    tree_index = STRtree(polygons)

    for i, poly in enumerate(polygons):
        for idx in tree_index.query(poly):
            # The overlap predicate is symmetric, so each unordered pair is
            # examined once, from its lower index. This also skips idx == i.
            if idx <= i:
                continue
            other = polygons[idx]
            # Shared edges/points (touches) are allowed; anything more is
            # an overlap.
            if poly.intersects(other) and not poly.touches(other):
                return True
    return False

# Progress marker: the overlap helpers above are now defined.
print('Overlap detection loaded')

In [None]:
# Collect every top-level CSV file from the external dataset directories.
# Directories that do not exist are skipped without a message.
dataset_dirs = [
    '/home/nonroot/snapshots/santa-2025/21105319338/code/datasets/bucket-of-chump',
    '/home/nonroot/snapshots/santa-2025/21105319338/code/datasets/santa25-public',
    '/home/nonroot/snapshots/santa-2025/21105319338/code/datasets/santa-2025-csv',
    '/home/nonroot/snapshots/santa-2025/21105319338/code/datasets/telegram-public',
    '/home/nonroot/snapshots/santa-2025/21105319338/code/datasets/santa-2025-try3',
]

all_csv_files = []
for directory in dataset_dirs:
    if not os.path.exists(directory):
        continue
    found = glob.glob(os.path.join(directory, '*.csv'))
    all_csv_files += found
    print(f'{directory}: {len(found)} CSV files')

print(f'\nTotal CSV files found: {len(all_csv_files)}')
for csv_path in all_csv_files:
    print(f'  {os.path.basename(csv_path)}')

In [None]:
# Analyze each CSV file - score and overlap status.
# For every readable submission-style CSV this records the total score over
# N = 1..200 and the number of N groups that contain overlapping trees.
file_analysis = []

for fp in tqdm(all_csv_files, desc='Analyzing'):
    try:
        df = pd.read_csv(fp)
    except Exception as e:
        print(f'Error reading {fp}: {e}')
        continue

    if not {'id', 'x', 'y', 'deg'}.issubset(df.columns):
        print(f'Missing columns in {fp}')
        continue

    # A header-only file has no first row; iloc[0] below would raise.
    if df.empty:
        print(f'No rows in {fp}')
        continue

    # Check if values have 's' prefix
    sample_x = str(df['x'].iloc[0])
    if not sample_x.startswith('s'):
        print(f'No s prefix in {fp}')
        continue

    # The group number N is the part of the id before the first underscore.
    # A malformed id must not abort the analysis of the remaining files.
    try:
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
    except ValueError as e:
        print(f'Bad id values in {fp}: {e}')
        continue

    total_score = 0.0
    n_with_overlaps = 0

    for n, g in df.groupby('N'):
        if n < 1 or n > 200:
            continue

        xs = strip(g['x'].to_numpy())
        ys = strip(g['y'].to_numpy())
        ds = strip(g['deg'].to_numpy())

        sc = score_group(xs, ys, ds, tx, ty)
        total_score += sc

        if has_overlap(xs, ys, ds, tx, ty):
            n_with_overlaps += 1

    file_analysis.append({
        'file': os.path.basename(fp),
        'path': fp,
        'score': total_score,
        'n_overlaps': n_with_overlaps,
        'valid': n_with_overlaps == 0
    })

# Naming the columns keeps the frame sortable even when no file qualified;
# without them an empty frame has no 'score' column and sort_values raises.
df_files = pd.DataFrame(
    file_analysis,
    columns=['file', 'path', 'score', 'n_overlaps', 'valid'],
)
df_files = df_files.sort_values('score')
print('\nFile analysis (sorted by score):')
print(df_files.to_string())

In [None]:
# Build the best VALID ensemble from all sources.
# best[n] keeps the lowest-scoring overlap-free configuration seen for each
# N in 1..200, together with the file it came from.
best = {n: {'score': 1e300, 'data': None, 'src': None} for n in range(1, 201)}

for fp in tqdm(all_csv_files, desc='Building ensemble'):
    try:
        df = pd.read_csv(fp)
    except Exception:
        # Unreadable files are reported by the analysis cell; skip them here.
        continue

    if not {'id', 'x', 'y', 'deg'}.issubset(df.columns):
        continue

    # A header-only file has no first row; iloc[0] below would raise.
    if df.empty:
        continue

    sample_x = str(df['x'].iloc[0])
    if not sample_x.startswith('s'):
        continue

    # The group number N is the part of the id before the first underscore.
    # A malformed id must not abort the whole ensemble build.
    try:
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
    except ValueError:
        continue

    for n, g in df.groupby('N'):
        if n < 1 or n > 200:
            continue

        # NOTE(review): assumes group N must contain exactly N rows (one per
        # tree) - confirm against the submission format. A truncated or
        # duplicated group would otherwise be scored and could be selected,
        # producing an invalid submission.
        if len(g) != n:
            continue

        xs = strip(g['x'].to_numpy())
        ys = strip(g['y'].to_numpy())
        ds = strip(g['deg'].to_numpy())

        # Check for overlaps - only keep valid configs
        if has_overlap(xs, ys, ds, tx, ty):
            continue

        sc = score_group(xs, ys, ds, tx, ty)
        if sc < best[n]['score']:
            best[n]['score'] = float(sc)
            best[n]['data'] = g.drop(columns=['N']).copy()
            best[n]['src'] = os.path.basename(fp)

print('\nBest valid ensemble built!')

In [None]:
# Calculate ensemble score and show improvements.
# N values without any valid configuration still hold the 1e300 sentinel
# score; summing it would make the total and the gap meaningless, so those
# N values are excluded from the sum and reported instead.
missing_n = [n for n in range(1, 201) if best[n]['data'] is None]
ensemble_score = sum(
    best[n]['score'] for n in range(1, 201) if best[n]['data'] is not None
)

if missing_n:
    print(f'WARNING: no valid configuration for {len(missing_n)} N values: {missing_n}')
    print('The ensemble score below covers only the N values that were found.')

print(f'Ensemble score: {ensemble_score:.6f}')
print('Baseline score: 70.734327')
print('Target: 68.931058')
print(f'Gap to target: {ensemble_score - 68.931058:.6f}')

# Show source distribution: how many N values each file contributed.
source_counts = {}
for n in range(1, 201):
    src = best[n]['src']
    if src:
        source_counts[src] = source_counts.get(src, 0) + 1

print('\nSource distribution:')
for src, count in sorted(source_counts.items(), key=lambda item: -item[1]):
    print(f'  {src}: {count} N values')

In [None]:
# Compare with baseline for each N.
# For every N in 1..200 the baseline group is re-scored and compared with the
# ensemble's best score; only gains above 0.0001 are listed.
baseline_path = '/home/nonroot/snapshots/santa-2025/21105319338/code/datasets/santa-2025-csv/santa-2025.csv'
df_baseline = pd.read_csv(baseline_path)
df_baseline['N'] = df_baseline['id'].astype(str).str.split('_').str[0].astype(int)

improvements = []
for n in range(1, 201):
    rows_n = df_baseline.loc[df_baseline['N'] == n]
    baseline_score = score_group(
        strip(rows_n['x'].to_numpy()),
        strip(rows_n['y'].to_numpy()),
        strip(rows_n['deg'].to_numpy()),
        tx,
        ty,
    )
    ensemble_n_score = best[n]['score']
    gain = baseline_score - ensemble_n_score

    # Written as "not >" so a NaN gain is skipped, exactly as a plain
    # "gain > 0.0001" test would skip it.
    if not gain > 0.0001:
        continue

    improvements.append({
        'n': n,
        'baseline': baseline_score,
        'ensemble': ensemble_n_score,
        'improvement': gain,
        'source': best[n]['src']
    })

if not improvements:
    print('\nNo improvements found over baseline!')
else:
    df_imp = pd.DataFrame(improvements)
    print(f'\nN values with improvement: {len(df_imp)}')
    print(f'Total improvement: {df_imp["improvement"].sum():.6f}')
    print('\nTop 20 improvements:')
    ranked = df_imp.sort_values('improvement', ascending=False)
    print(ranked.head(20).to_string())

In [None]:
# Save the ensemble submission.
# Concatenates the best configuration of every N, orders rows by group number
# and then by index within the group (both parsed from the id), and writes
# the id/x/y/deg columns to CSV.
rows = [best[n]['data'] for n in range(1, 201) if best[n]['data'] is not None]

if rows:
    if len(rows) < 200:
        # A partial ensemble is still written, but must not be mistaken for
        # a complete submission.
        print(f'WARNING: ensemble covers only {len(rows)} of 200 N values')

    out = pd.concat(rows, ignore_index=True)
    id_parts = out['id'].str.split('_')
    out['sn'] = id_parts.str[0].astype(int)
    out['si'] = id_parts.str[1].astype(int)
    out = out.sort_values(['sn', 'si']).drop(columns=['sn', 'si'])
    out = out[['id', 'x', 'y', 'deg']]

    out_path = '/home/code/experiments/003_proper_sa/ensemble_submission.csv'
    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    out.to_csv(out_path, index=False)
    print(f'Saved ensemble to {out_path}')
    print(f'Ensemble score: {ensemble_score:.6f}')
else:
    print('No valid ensemble data!')