# Loop 4 Analysis: Ensemble Approach from Multiple Sources

## Goal
Implement the ensemble approach from the jonathanchan kernel: for each N, pick the best-scoring configuration found across ALL available CSV files in the snapshots.

## Key Insight
The target score (68.896973) might be achievable by combining the best per-N configurations from different sources. Different optimization runs may have found better solutions for different N values.

In [None]:
import os
import glob
import pandas as pd
import numpy as np
from numba import njit
import math
from tqdm import tqdm

# Tree polygon template
@njit
def make_polygon_template():
    """Return the (x, y) vertex arrays of the unrotated tree outline.

    The outline has 15 vertices stored as two float64 arrays.  It starts at
    the tip (0, 0.8), runs down the right-hand side through the three tiers
    and the trunk, and comes back up the left-hand side, so the x values are
    mirrored about 0.
    """
    # Widths of the trunk and of the three tiers (bottom, middle, top)
    trunk_w = 0.15
    trunk_h = 0.2
    base_w = 0.7
    mid_w = 0.4
    top_w = 0.25

    # Heights at which the outline changes direction
    tip_y = 0.8
    tier1_y = 0.5
    tier2_y = 0.25
    base_y = 0.0
    trunk_bottom_y = -trunk_h

    x = np.array([
        0.0,
        top_w / 2, top_w / 4,
        mid_w / 2, mid_w / 4,
        base_w / 2,
        trunk_w / 2, trunk_w / 2,
        -trunk_w / 2, -trunk_w / 2,
        -base_w / 2,
        -mid_w / 4, -mid_w / 2,
        -top_w / 4, -top_w / 2,
    ], np.float64)
    y = np.array([
        tip_y,
        tier1_y, tier1_y,
        tier2_y, tier2_y,
        base_y,
        base_y, trunk_bottom_y,
        trunk_bottom_y, base_y,
        base_y,
        tier2_y, tier2_y,
        tier1_y, tier1_y,
    ], np.float64)
    return x, y

@njit
def score_group(xs, ys, degs, tx, ty):
    """Return the squared bounding-square side per tree for one group.

    Tree ``i`` is the template polygon ``(tx, ty)`` rotated by ``degs[i]``
    degrees about the origin and then shifted by ``(xs[i], ys[i])``.  The
    axis-aligned bounding box of all transformed vertices is computed, the
    longer of its two sides is squared, and the result is divided by the
    number of trees (``xs.size``).
    """
    count = xs.size
    n_vertices = tx.size

    # Sentinels far outside any realistic coordinate, replaced by the first vertex
    min_x = 1e300
    min_y = 1e300
    max_x = -1e300
    max_y = -1e300

    for tree in range(count):
        angle = degs[tree] * math.pi / 180.0
        cos_a = math.cos(angle)
        sin_a = math.sin(angle)
        shift_x = xs[tree]
        shift_y = ys[tree]
        for k in range(n_vertices):
            # Rotate the template vertex, then translate it to the tree position
            px = cos_a * tx[k] - sin_a * ty[k] + shift_x
            py = sin_a * tx[k] + cos_a * ty[k] + shift_y
            if px < min_x:
                min_x = px
            if px > max_x:
                max_x = px
            if py < min_y:
                min_y = py
            if py > max_y:
                max_y = py

    width = max_x - min_x
    height = max_y - min_y
    side = max(width, height)
    return side * side / count

def strip(a):
    """Parse the values of ``a`` into a float64 array, ignoring 's' characters.

    Each element is converted with ``str()``, every ``'s'`` character in it is
    removed, and the remainder is parsed with ``float()``.  The result is a
    1-D ``float64`` array with one entry per element of ``a``.
    """
    parsed = (float(str(value).replace('s', '')) for value in a)
    return np.fromiter(parsed, dtype=np.float64)

# Build the tree outline once up front; tx / ty are its vertex x / y arrays,
# passed to every score_group call below.
tx, ty = make_polygon_template()
print("Template loaded")

In [None]:
# Recursively collect every CSV under the snapshot archive
all_csv_files = glob.glob(
    '/home/nonroot/snapshots/santa-2025/**/*.csv',
    recursive=True,
)
print(f"Found {len(all_csv_files)} CSV files in snapshots")

# Do the same for the current code directory
code_csvs = glob.glob(
    '/home/code/**/*.csv',
    recursive=True,
)
print(f"Found {len(code_csvs)} CSV files in /home/code")

# Snapshot files first, then the code-directory files
all_files = [*all_csv_files, *code_csvs]
print(f"Total: {len(all_files)} CSV files to scan")

In [None]:
# Best configuration seen so far for every N in 1..200:
#   'score' - value returned by score_group for the group (lower is better)
#   'data'  - the rows of the winning group (without the helper 'N' column),
#             or None while no valid group has been found
#   'src'   - base name of the CSV file the winning rows came from
best = {n: {'score': 1e300, 'data': None, 'src': None} for n in range(1, 201)}

required_columns = {'id', 'x', 'y', 'deg'}

# Scan all CSV files (best effort: anything unreadable or malformed is skipped)
valid_files = 0
for fp in tqdm(all_files, desc='Scanning CSV files'):
    try:
        df = pd.read_csv(fp)
    except Exception:
        # Not something pandas can parse as CSV; Exception (rather than a bare
        # except) keeps Ctrl-C / SystemExit working during a long scan.
        continue

    # Check if it's a valid submission file
    if not required_columns.issubset(df.columns):
        continue

    valid_files += 1

    # Extract N from the part of the id before the first underscore
    try:
        group_numbers = df['id'].astype(str).str.split('_').str[0].astype(int)
    except Exception:
        continue  # ids do not start with an integer
    df = df.assign(N=group_numbers)

    # Score each N group
    for n, g in df.groupby('N'):
        if n < 1 or n > 200:
            continue
        if len(g) != n:
            continue  # Invalid group: group N must contain exactly N rows

        try:
            xs = strip(g['x'].to_numpy())
            ys = strip(g['y'].to_numpy())
            ds = strip(g['deg'].to_numpy())
        except Exception:
            continue  # a value could not be parsed as a float

        # score_group's min/max comparisons are all False for NaN, so a row
        # with a NaN coordinate would simply be left out of the bounding box
        # and the group would look better than it is.  Reject such groups.
        if not (np.isfinite(xs).all() and np.isfinite(ys).all() and np.isfinite(ds).all()):
            continue

        sc = score_group(xs, ys, ds, tx, ty)

        # Strict '<' keeps the first file that reached a given score
        if sc < best[n]['score']:
            best[n]['score'] = float(sc)
            best[n]['data'] = g.drop(columns=['N']).copy()
            best[n]['src'] = os.path.basename(fp)

print(f"\nScanned {valid_files} valid submission files")

In [None]:
# Add up the best score of every N that has a configuration and record
# the N values for which nothing was found
total_score = 0
missing = []
for n in range(1, 201):
    if best[n]['data'] is None:
        missing.append(n)
        continue
    total_score += best[n]['score']

print(f"Total ensemble score: {total_score:.6f}")
print(f"Target: 68.896973")
print(f"Gap: {total_score - 68.896973:.6f}")
print(f"Missing N values: {missing}")

# Subtotals over inclusive N ranges; N values without data contribute nothing
ranges = [(1, 10), (11, 50), (51, 100), (101, 150), (151, 200)]
for start, end in ranges:
    found_scores = [
        best[n]['score']
        for n in range(start, end + 1)
        if best[n]['data'] is not None
    ]
    range_score = sum(found_scores)
    print(f"N={start}-{end}: {range_score:.4f}")

In [None]:
# Report the winning score and source file for N = 1..20,
# skipping any N that has no configuration
print("\nBest sources per N (showing first 20):")
for n in range(1, 21):
    if best[n]['data'] is None:
        continue
    print(f"N={n:3d}: score={best[n]['score']:.6f}, src={best[n]['src']}")

In [None]:
# Load the current submission and tag each row with its group number N
# (the integer before the first underscore of the id)
current_df = pd.read_csv('/home/submission/submission.csv')
current_df['N'] = current_df['id'].astype(str).str.split('_').str[0].astype(int)

# Score every group of the current submission with the same scorer
current_scores = {}
for n, g in current_df.groupby('N'):
    xs, ys, ds = (strip(g[col].to_numpy()) for col in ('x', 'y', 'deg'))
    current_scores[n] = score_group(xs, ys, ds, tx, ty)

current_total = sum(current_scores.values())
print(f"Current submission score: {current_total:.6f}")
print(f"Ensemble score: {total_score:.6f}")
print(f"Improvement: {current_total - total_score:.6f}")

# Collect the N values where the ensemble beats the current submission
# by more than 1e-9, then list the 20 largest gains
print("\nImprovements by N (top 20):")
improvements = []
for n in range(1, 201):
    if best[n]['data'] is None or n not in current_scores:
        continue
    imp = current_scores[n] - best[n]['score']
    if imp > 1e-9:
        improvements.append((n, imp, best[n]['src']))

improvements.sort(key=lambda item: item[1], reverse=True)
for n, imp, src in improvements[:20]:
    print(f"N={n:3d}: improvement={imp:.9f}, src={src}")

In [None]:
# Create ensemble submission
#
# Only write the ensemble when it covers every N and is strictly better.
# An N without data adds nothing to total_score, so an incomplete ensemble
# would compare as "better" than the current submission while not being a
# full set of groups; with no data at all, pd.concat would raise on an empty list.
unfilled = [n for n in range(1, 201) if best[n]['data'] is None]
if unfilled:
    print(f"Ensemble is incomplete, not saving; missing N values: {unfilled}")
elif total_score < current_total:
    print("Creating ensemble submission...")
    rows = [best[n]['data'] for n in range(1, 201)]

    ensemble_df = pd.concat(rows, ignore_index=True)

    # Order rows numerically by group number, then by index within the group
    # (ids have the form "<N>_<index>"); astype(str) matches how ids are
    # parsed elsewhere in this notebook.
    id_parts = ensemble_df['id'].astype(str).str.split('_')
    ensemble_df['sn'] = id_parts.str[0].astype(int)
    ensemble_df['si'] = id_parts.str[1].astype(int)
    ensemble_df = ensemble_df.sort_values(['sn', 'si']).drop(columns=['sn', 'si'])
    ensemble_df = ensemble_df[['id', 'x', 'y', 'deg']]

    # Save (creating the target directory if it does not exist yet)
    out_path = '/home/code/experiments/ensemble_best.csv'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    ensemble_df.to_csv(out_path, index=False)
    print(f"Saved ensemble to {out_path}")
    print(f"Ensemble score: {total_score:.6f}")
else:
    print("No improvement from ensemble")