# Loop 4 Analysis: Ensemble Strategy from Multiple Sources

## Key Insight from Research
The jonathanchan kernel shows that top scores come from **ensembling 16+ external sources**, not from running optimizers longer.

## Strategy
1. Collect ALL available CSV files from snapshots
2. For each N (1-200), find the best (lowest-scoring) solution across all sources
3. Combine the per-N winners into a single ensemble
4. Validate for overlaps
5. Submit

In [None]:
import pandas as pd
import numpy as np
import math
import os
import glob
from numba import njit
from collections import defaultdict

# Tree geometry: the outline of one tree as 15 (x, y) vertices, where vertex i
# is (TX[i], TY[i]).  score_group() rotates these about the origin and then
# translates them for each placed tree.
TX = np.array([
    0.0, 0.125, 0.0625, 0.2, 0.1,
    0.35, 0.075, 0.075, -0.075, -0.075,
    -0.35, -0.1, -0.2, -0.0625, -0.125,
])
TY = np.array([
    0.8, 0.5, 0.5, 0.25, 0.25,
    0.0, 0.0, -0.2, -0.2, 0.0,
    0.0, 0.25, 0.25, 0.5, 0.5,
])

@njit
def score_group(xs, ys, degs, tx, ty):
    """Return the squared bounding-square side per polygon for one group.

    Polygon ``k`` is the template outline ``(tx, ty)`` rotated by ``degs[k]``
    degrees about the origin and then shifted by ``(xs[k], ys[k])``.  The
    side is the larger of the x- and y-extents over all transformed vertices;
    the result is ``side * side`` divided by the number of polygons.
    """
    count = xs.size
    n_vertices = tx.size
    # Sentinels far outside any coordinate so the first vertex always wins.
    lo_x = 1e300
    lo_y = 1e300
    hi_x = -1e300
    hi_y = -1e300
    for k in range(count):
        theta = degs[k] * math.pi / 180.0
        cos_t = math.cos(theta)
        sin_t = math.sin(theta)
        for v in range(n_vertices):
            px = cos_t * tx[v] - sin_t * ty[v] + xs[k]
            py = sin_t * tx[v] + cos_t * ty[v] + ys[k]
            # Two independent checks (not elif): the first vertex must be able
            # to update both the lower and the upper bound.
            if px < lo_x:
                lo_x = px
            if px > hi_x:
                hi_x = px
            if py < lo_y:
                lo_y = py
            if py > hi_y:
                hi_y = py
    width = hi_x - lo_x
    height = hi_y - lo_y
    side = height if height > width else width
    return side * side / count

def strip(a):
    """Convert each item of ``a`` to float and return a float64 array.

    Every item is turned into its string form and all ``'s'`` characters are
    removed before parsing (presumably the ``s`` prefix of submission values
    such as ``'s0.5'`` -- confirm against the CSV format).
    """
    parsed = []
    for raw in a:
        text = str(raw).replace('s', '')
        parsed.append(float(text))
    return np.array(parsed, dtype=np.float64)

print('Functions defined')

In [None]:
# Find ALL CSV files in snapshots
# Walk the snapshot tree recursively and keep every path whose file name ends
# in '.csv', in the order os.walk yields them.
all_csvs = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/home/nonroot/snapshots/santa-2025')
    for name in names
    if name.endswith('.csv')
]

print(f'Found {len(all_csvs)} CSV files in snapshots')
print('\nSample files:')
for sample_path in all_csvs[:10]:
    print(f'  {sample_path}')

In [None]:
# Score each CSV file and find best per-N solutions
# best_per_n[n] holds, for group size n, the lowest score seen so far, the
# rows that produced it, and the name of the file they came from.
best_per_n = {n: {'score': 1e300, 'data': None, 'src': None} for n in range(1, 201)}

valid_files = 0
skipped_files = []  # (path, reason) for every file that raised while being read or scored
for fp in all_csvs:
    try:
        df = pd.read_csv(fp)
        if not {'id', 'x', 'y', 'deg'}.issubset(df.columns):
            continue

        # The group size N is the integer before the first '_' in each id.
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)

        # Counted as soon as the file parses; a later scoring error leaves it
        # counted here and also listed in skipped_files.
        valid_files += 1
        src_name = os.path.basename(fp)

        for n, g in df.groupby('N'):
            # Accept only complete groups: exactly n rows for group size n.
            if n < 1 or n > 200 or len(g) != n:
                continue

            xs = strip(g['x'].values)
            ys = strip(g['y'].values)
            ds = strip(g['deg'].values)

            sc = score_group(xs, ys, ds, TX, TY)

            # Strict '<' keeps the first file seen when two scores tie, and
            # rejects NaN scores because every comparison with NaN is False.
            if sc < best_per_n[n]['score']:
                best_per_n[n]['score'] = float(sc)
                best_per_n[n]['data'] = g.drop(columns=['N']).copy()
                best_per_n[n]['src'] = src_name
    except Exception as exc:
        # Best-effort scan: one unreadable or malformed file must not abort
        # the run, but record why it was skipped instead of discarding the
        # error silently (which would also hide genuine bugs in scoring).
        skipped_files.append((fp, f'{type(exc).__name__}: {exc}'))

print(f'Processed {valid_files} valid CSV files')
if skipped_files:
    print(f'Skipped {len(skipped_files)} files because of errors (first 10 shown):')
    for skipped_path, reason in skipped_files[:10]:
        print(f'  {skipped_path}: {reason}')

In [None]:
# Calculate total ensemble score
# Only N values that have a solution contribute to the sum.  A partial total
# is lower than a complete one and is not comparable with the target, so any
# N that no source covered is reported explicitly.
missing_n = [n for n in range(1, 201) if best_per_n[n]['data'] is None]
total_score = sum(best_per_n[n]['score'] for n in range(1, 201) if best_per_n[n]['data'] is not None)
print(f'\nTotal ensemble score: {total_score:.6f}')
if missing_n:
    print(f'WARNING: no valid solution for {len(missing_n)} N values '
          f'(first few: {missing_n[:10]}); the total above is incomplete')
print(f'Target: 68.892266')
print(f'Gap: {total_score - 68.892266:.6f}')

# Show best sources
# Count how many N values each source file name won.
from collections import Counter
sources = Counter(best_per_n[n]['src'] for n in range(1, 201) if best_per_n[n]['src'])
print(f'\nTop sources:')
for src, count in sources.most_common(10):
    print(f'  {src}: {count} N values')

In [None]:
# Show per-N scores for first 20 N values
# One line per N: the winning score and the file name it came from.
print('\nPer-N scores (N=1-20):')
for n in range(1, 21):
    top_score = best_per_n[n]['score']
    top_src = best_per_n[n]['src']
    print(f'  N={n:3d}: score={top_score:.6f}, src={top_src}')

In [None]:
# Compare with our current best (70.622435)
# Lower is better: the ensemble wins only if its total is strictly smaller.
current_best = 70.622435
improvement = current_best - total_score

print('\nComparison:')
print(f'  Current best: {current_best:.6f}')
print(f'  Ensemble:     {total_score:.6f}')
print(f'  Improvement:  {improvement:.6f}')

verdict = (
    '\n*** ENSEMBLE IS BETTER! ***'
    if total_score < current_best
    else '\n*** Current solution is already the best ***'
)
print(verdict)