# Loop 4 Analysis: Ensemble of Best-Per-N from Multiple Sources

The evaluator has confirmed that local optimization is exhausted. The strategy now is to:
1. Collect ALL available pre-optimized CSVs
2. For each N (1-200), find the best (lowest-scoring) solution across ALL sources
3. Combine into a single submission

This exploits the fact that different optimization runs may have found better solutions for different N values.

In [None]:
import pandas as pd
import numpy as np
import glob
import os
from numba import njit
import math

# Tree geometry
@njit
def make_polygon_template():
    """Build the un-rotated tree outline as two parallel coordinate arrays.

    The outline has 15 vertices. It starts at the tip (0, 0.8), walks down
    the right-hand side to the bottom of the trunk, crosses over, and walks
    back up the left-hand side, which mirrors the right.

    Returns:
        (x, y): two float64 arrays of length 15; vertex k is (x[k], y[k]).
    """
    # Full widths of the four horizontal features (presumably trunk, bottom
    # tier, middle tier and top tier -- the original used tw/bw/mw/ow).
    trunk_w = 0.15
    bottom_w = 0.7
    middle_w = 0.4
    top_w = 0.25

    # y-levels from the tip down to the lower edge of the trunk.
    tip_y = 0.8
    upper_y = 0.5
    lower_y = 0.25
    base_y = 0.0
    trunk_bottom_y = -0.2

    x = np.array(
        [
            0.0,
            top_w / 2, top_w / 4,
            middle_w / 2, middle_w / 4,
            bottom_w / 2,
            trunk_w / 2, trunk_w / 2,
            -trunk_w / 2, -trunk_w / 2,
            -bottom_w / 2,
            -middle_w / 4, -middle_w / 2,
            -top_w / 4, -top_w / 2,
        ],
        np.float64,
    )
    y = np.array(
        [
            tip_y,
            upper_y, upper_y,
            lower_y, lower_y,
            base_y,
            base_y, trunk_bottom_y,
            trunk_bottom_y, base_y,
            base_y,
            lower_y, lower_y,
            upper_y, upper_y,
        ],
        np.float64,
    )
    return x, y

@njit
def score_group(xs, ys, degs, tx, ty):
    """Return the bounding-square area per tree for one group of trees.

    Tree ``k`` is the template outline ``(tx, ty)`` rotated by ``degs[k]``
    degrees with the standard 2-D rotation matrix and then shifted by
    ``(xs[k], ys[k])``. The axis-aligned bounding box of every transformed
    vertex is tracked; the score is the square of its longer side divided
    by the number of trees.

    Args:
        xs, ys: per-tree translation, arrays of equal length.
        degs: per-tree rotation angle in degrees.
        tx, ty: template vertex coordinates, arrays of equal length.

    Returns:
        ``side * side / xs.size`` where ``side`` is the larger of the
        bounding box's width and height.
    """
    tree_count = xs.size
    vertex_count = tx.size

    # Sentinels far outside any realistic coordinate so the first vertex
    # always replaces them.
    x_lo = 1e300
    y_lo = 1e300
    x_hi = -1e300
    y_hi = -1e300

    for k in range(tree_count):
        theta = degs[k] * math.pi / 180.0
        cos_t = math.cos(theta)
        sin_t = math.sin(theta)
        shift_x = xs[k]
        shift_y = ys[k]
        for v in range(vertex_count):
            px = cos_t * tx[v] - sin_t * ty[v] + shift_x
            py = sin_t * tx[v] + cos_t * ty[v] + shift_y
            x_lo = px if px < x_lo else x_lo
            x_hi = px if px > x_hi else x_hi
            y_lo = py if py < y_lo else y_lo
            y_hi = py if py > y_hi else y_hi

    width = x_hi - x_lo
    height = y_hi - y_lo
    side = max(width, height)
    return side * side / tree_count

def strip(a):
    """Convert an iterable of cell values to a float64 array.

    Each value is turned into text, every ``'s'`` character is removed
    (presumably the CSVs prefix numbers with ``s`` to keep them as strings),
    and the remainder is parsed with ``float``.

    Raises:
        ValueError: if a value is not a valid float once the ``s`` is removed.
    """
    def _to_float(cell):
        return float(str(cell).replace('s', ''))

    return np.array(list(map(_to_float, a)), np.float64)

# Build the template outline once; the module-level tx / ty arrays are passed
# to score_group() by the scoring and verification cells further down.
tx, ty = make_polygon_template()
print('Functions defined')

In [None]:
# Collect all CSV files from all sources
sources = [
    '/home/code/exploration/datasets/saspav_latest/*.csv',
    '/home/code/exploration/datasets/bucket_of_chump/*.csv',
    '/home/code/exploration/datasets/telegram/*.csv',
    '/home/code/exploration/datasets/telegram_extracted/*.csv',
    '/home/code/exploration/datasets/chistyakov/*.csv',
    '/home/code/exploration/datasets/chistyakov_packed/*.csv',
    '/home/code/exploration/datasets/saspav/*.csv',
    '/home/code/exploration/datasets/*.csv',
    '/home/submission/submission.csv',  # Current best
    '/home/code/submission_candidates/*.csv',  # Previous candidates
]


def collect_csv_files(patterns):
    """Expand glob patterns into a sorted list of unique paths.

    Args:
        patterns: iterable of glob patterns (a plain path also works and
            matches only if that file exists).

    Returns:
        Sorted list of every path matched by at least one pattern, with
        duplicates removed. Patterns that match nothing contribute nothing.
    """
    matched = set()
    for pattern in patterns:
        matched.update(glob.glob(pattern))
    # Sort so the scan order is the same on every run: the best-per-N
    # selection keeps the first file seen when two files tie on score, and
    # a bare set would make that choice vary between runs.
    return sorted(matched)


all_files = collect_csv_files(sources)
print(f'Found {len(all_files)} CSV files:')
for f in all_files:
    print(f'  {f}')

In [None]:
# For each N, find the best solution across all sources
# best[n] holds the lowest score seen so far for N, the rows that produced it,
# and the file name they came from. 1e300 / None mean "nothing found yet".
best = {n: {'score': 1e300, 'data': None, 'src': None} for n in range(1, 201)}

# NOTE(review): only the bounding-square score is compared here; nothing in
# this cell checks that the trees of a candidate group do not overlap, so the
# source files are assumed to contain valid packings -- confirm.
for fp in all_files:
    try:
        df = pd.read_csv(fp)
    except Exception as e:
        # Best-effort scan: one unreadable file must not stop the others.
        print(f'Error reading {fp}: {e}')
        continue

    if not {'id', 'x', 'y', 'deg'}.issubset(df.columns):
        print(f'Skipping {fp}: missing columns')
        continue

    # The group size N is the part of the id before the first underscore.
    try:
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
    except (ValueError, TypeError) as e:
        # A single malformed id would otherwise abort the whole scan.
        print(f'Skipping {fp}: malformed id column ({e})')
        continue

    for n, g in df.groupby('N'):
        if n < 1 or n > 200:
            continue

        # A solution for N must contain exactly N trees; a truncated or
        # duplicated group is not a usable candidate.
        if len(g) != n:
            print(f'Skipping N={n} in {fp}: expected {n} rows, found {len(g)}')
            continue

        try:
            xs = strip(g['x'].to_numpy())
            ys = strip(g['y'].to_numpy())
            ds = strip(g['deg'].to_numpy())
        except ValueError as e:
            print(f'Skipping N={n} in {fp}: non-numeric value ({e})')
            continue

        # NaN/inf coordinates never win a "<" / ">" comparison inside
        # score_group, so such trees would silently drop out of the bounding
        # box and make the group look better than it is.
        if not (np.isfinite(xs).all() and np.isfinite(ys).all() and np.isfinite(ds).all()):
            print(f'Skipping N={n} in {fp}: non-finite value')
            continue

        sc = score_group(xs, ys, ds, tx, ty)

        # Strict "<": on an exact tie the file scanned first is kept.
        if sc < best[n]['score']:
            best[n]['score'] = float(sc)
            best[n]['data'] = g.drop(columns=['N']).copy()
            best[n]['src'] = os.path.basename(fp)

print('\nBest sources per N:')
print(f"{'N':>3} {'Score':>12} {'Source':>30}")
print('-' * 50)
for n in range(1, 21):  # Show first 20
    entry = best[n]
    if entry['data'] is not None:
        print(f"{n:>3} {entry['score']:>12.6f} {entry['src']:>30}")

In [None]:
# Count how many N values come from each source
# Tally, per source file name, how many of the 200 group sizes it won.
source_counts = {}
for n in range(1, 201):
    src = best[n]['src']
    if not src:
        # No source supplied a solution for this N.
        continue
    if src not in source_counts:
        source_counts[src] = 0
    source_counts[src] += 1

print('\nSource distribution:')
# Largest tally first; sources with equal tallies keep first-seen order.
ranked = sorted(source_counts.items(), key=lambda item: item[1], reverse=True)
for src, count in ranked:
    print(f'  {src}: {count} N values')

In [None]:
# Calculate total score from ensemble
# Baseline total that the ensemble is compared against.
baseline_score = 70.659437

# An N for which no source supplied a solution still holds the 1e300
# placeholder score. Adding that to the total would swamp every real value,
# so such entries are left out of the sum and reported instead.
missing_ns = [n for n in range(1, 201) if best[n]['data'] is None]
total_score = sum(
    best[n]['score'] for n in range(1, 201) if best[n]['data'] is not None
)

if missing_ns:
    print(
        f'WARNING: no solution for {len(missing_ns)} N values '
        f'(first few: {missing_ns[:10]}); the total below is partial'
    )

print(f'\nEnsemble total score: {total_score:.6f}')
print(f'Current baseline: {baseline_score:.6f}')
if missing_ns:
    # A partial total is not comparable with a full 200-group baseline.
    print('Improvement: n/a (ensemble is incomplete)')
else:
    print(f'Improvement: {baseline_score - total_score:.6f}')

In [None]:
# Build the ensemble submission
# Gather the winning rows for each N in ascending N order, warning about any
# N that has no candidate.
rows = []
for n in range(1, 201):
    chosen = best[n]['data']
    if chosen is None:
        print(f'WARNING: No data for N={n}')
        continue
    rows.append(chosen)

if not rows:
    print('ERROR: No rows collected')
else:
    out = pd.concat(rows, ignore_index=True)

    # Order rows numerically by the two underscore-separated id fields; the
    # temporary sort keys are dropped again straight away.
    id_parts = out['id'].str.split('_')
    out['sn'] = id_parts.str[0].astype(int)
    out['si'] = id_parts.str[1].astype(int)
    out = out.sort_values(['sn', 'si'])
    out = out.drop(columns=['sn', 'si'])
    out = out[['id', 'x', 'y', 'deg']]

    # Save ensemble
    ensemble_path = '/home/code/exploration/datasets/ensemble_best_per_n.csv'
    out.to_csv(ensemble_path, index=False)
    print(f'\nSaved ensemble to {ensemble_path}')
    print(f'Total rows: {len(out)}')

In [None]:
# Verify the ensemble score
# Re-read the saved file and recompute the total from scratch so the figure
# reflects what was actually written to disk.
df = pd.read_csv('/home/code/exploration/datasets/ensemble_best_per_n.csv')
df['N'] = df['id'].str.split('_').str[0].astype(int)

total = 0
for _, grp in df.groupby('N'):
    # Parse the three numeric columns of this group in x, y, deg order.
    gx, gy, gdeg = (strip(grp[col].to_numpy()) for col in ('x', 'y', 'deg'))
    total += score_group(gx, gy, gdeg, tx, ty)

print(f'Verified ensemble score: {total:.6f}')
print(f'Current baseline: 70.659437')
print(f'Improvement: {70.659437 - total:.6f}')