# Loop 1 Analysis: Snapshot Ensemble Strategy

Goal: Score all 105 snapshots and build an ensemble that, for each N, takes the best-scoring configuration found in any snapshot.
This is the key strategy from the jonathanchan kernel.

In [None]:
import pandas as pd
import numpy as np
import math
import os
import glob
from numba import njit
from tqdm import tqdm

# Tree geometry: x (TX) and y (TY) coordinates of the 15 vertices of the tree
# shape, paired by index. They are passed to score_group, which rotates and
# translates them for every placed tree.
TX = np.array([0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125])
TY = np.array([0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5])

@njit
def score_group(xs, ys, degs, tx, ty):
    """Score one configuration of trees.

    Every tree ``k`` is the vertex set ``(tx, ty)`` rotated by ``degs[k]``
    degrees and then shifted by ``(xs[k], ys[k])``. The result is
    ``side ** 2 / n`` where ``side`` is the larger of the width and height of
    the axis-aligned box enclosing all transformed vertices and ``n`` is
    ``xs.size``.
    """
    count = xs.size
    n_vertices = tx.size
    # Sentinel extremes: the first transformed vertex always replaces them.
    min_x = 1e300
    min_y = 1e300
    max_x = -1e300
    max_y = -1e300
    for k in range(count):
        theta = degs[k] * math.pi / 180.0  # degrees -> radians
        cos_t = math.cos(theta)
        sin_t = math.sin(theta)
        for v in range(n_vertices):
            # Rotate the vertex about the origin, then translate it.
            px = cos_t * tx[v] - sin_t * ty[v] + xs[k]
            py = sin_t * tx[v] + cos_t * ty[v] + ys[k]
            min_x = min(min_x, px)
            max_x = max(max_x, px)
            min_y = min(min_y, py)
            max_y = max(max_y, py)
    side = max(max_x - min_x, max_y - min_y)
    return side * side / count

def strip(a):
    """Convert values such as ``'s1.25'`` to a float64 array.

    Each element is turned into a string, every ``'s'`` character in it is
    removed (not only a leading one), and the remainder is parsed as a float.
    """
    cleaned = []
    for raw in a:
        text = str(raw)
        cleaned.append(float(text.replace('s', '')))
    return np.array(cleaned, np.float64)

# Progress marker: reaching this line means the definitions in this cell executed.
print('Functions defined')

In [None]:
# Collect the path of every .csv file found anywhere under the snapshot directory
snapshot_dir = '/home/nonroot/snapshots/santa-2025/'
all_csvs = []

for root, dirs, files in os.walk(snapshot_dir):
    # Keep only CSV files, stored as full paths in directory-walk order.
    all_csvs.extend(
        os.path.join(root, name) for name in files if name.endswith('.csv')
    )

print(f'Found {len(all_csvs)} CSV files in snapshots')

In [None]:
# Score each CSV and track the best configuration per N.
#
# best[n]       -> lowest score seen for group size n, its rows, and its file.
# csv_scores    -> total score per file, recorded ONLY when all 200 groups of
#                  that file were scored (a partial total would look
#                  artificially good when ranking whole submissions).
# NOTE(review): only the bounding-box score is evaluated here; nothing in this
# cell checks that a configuration is otherwise valid (e.g. trees not
# overlapping) - confirm the snapshots are valid before trusting the winners.
best = {n: {'score': 1e300, 'data': None, 'src': None} for n in range(1, 201)}
csv_scores = {}  # Track total score per fully scored CSV
unreadable = []  # (path, error) for files pandas could not read
partial = []  # Files that passed the gate below but had unusable groups

for fp in tqdm(all_csvs, desc='Scoring snapshots'):
    try:
        df = pd.read_csv(fp)
    except Exception as exc:
        # Best effort: the snapshot tree may hold arbitrary CSVs. Remember the
        # failure instead of discarding it silently.
        unreadable.append((fp, repr(exc)))
        continue

    if not {'id', 'x', 'y', 'deg'}.issubset(df.columns):
        continue

    # The group size N is the part of the id before the first underscore.
    try:
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
    except (ValueError, TypeError, OverflowError):
        # ids that do not start with an integer: not a submission file.
        continue

    n_values = df['N'].unique()
    if len(n_values) < 200:
        continue  # Skip incomplete submissions

    total_score = 0.0
    complete = True
    for n in range(1, 201):
        g = df[df['N'] == n]
        if len(g) != n:
            # Wrong number of trees for this N: unusable group.
            complete = False
            continue

        try:
            xs = strip(g['x'].values)
            ys = strip(g['y'].values)
            ds = strip(g['deg'].values)
        except ValueError:
            # A coordinate or angle could not be parsed as a number.
            complete = False
            continue

        sc = float(score_group(xs, ys, ds, TX, TY))
        if not math.isfinite(sc):
            # NaN/inf input would poison the total and can never be a winner.
            complete = False
            continue
        total_score += sc

        if sc < best[n]['score']:
            best[n]['score'] = sc
            best[n]['data'] = g.drop(columns=['N']).copy()
            best[n]['src'] = fp

    if complete:
        csv_scores[fp] = total_score
    else:
        partial.append(fp)

print(f'\nScored {len(csv_scores)} complete submissions')
if unreadable or partial:
    print(f'Skipped {len(unreadable)} unreadable files; '
          f'{len(partial)} files had unusable groups and are excluded from totals')

In [None]:
# Show top 10 submissions by total score (lower is better)
ranked_submissions = sorted(csv_scores.items(), key=lambda item: item[1])
print('Top 10 submissions by total score:')
for i, (fp, score) in enumerate(ranked_submissions[:10]):
    # The third-from-last path component is shown as the submission's label.
    print(f'{i+1}. {score:.6f} - {fp.split("/")[-3]}')

# Calculate ensemble score: sum of the best score found for every N
ensemble_score = sum(best[n]['score'] for n in range(1, 201))
print(f'\nEnsemble score (best per N): {ensemble_score:.6f}')

# Any N that never received data still carries the 1e300 sentinel, which makes
# the sum above meaningless - say so explicitly.
unfilled = [n for n in range(1, 201) if best[n]['data'] is None]
if unfilled:
    print(f'WARNING: no configuration found for N values {unfilled}; '
          'ensemble score includes placeholder values')

if ranked_submissions:
    best_single = ranked_submissions[0][1]
    print(f'Best single submission: {best_single:.6f}')
    print(f'Improvement from ensemble: {best_single - ensemble_score:.6f}')
else:
    # min() over an empty dict would raise; report the situation instead.
    print('No complete submissions were scored; nothing to compare against')

In [None]:
# Candidate for N=1: a single tree at the origin rotated by 45 degrees.
# Values carry the 's' prefix that strip() removes before scoring.
manual_data = pd.DataFrame({
    'id': ['001_0'],
    'x': ['s0.0'],
    'y': ['s0.0'],
    'deg': ['s45.0']
})
xs = strip(manual_data['x'].values)
ys = strip(manual_data['y'].values)
ds = strip(manual_data['deg'].values)
sc = score_group(xs, ys, ds, TX, TY)

print(f'N=1 optimal score: {sc:.6f}')

# Install the manual entry unless a snapshot already scored strictly better;
# overriding unconditionally could make the ensemble worse.
if sc <= best[1]['score']:
    best[1]['score'] = float(sc)
    best[1]['data'] = manual_data.copy()
    best[1]['src'] = 'optimal_n1'
else:
    print(f"Keeping snapshot N=1 entry with better score {best[1]['score']:.6f}")

# Recalculate ensemble score
ensemble_score = sum(best[n]['score'] for n in range(1, 201))
print(f'Ensemble score with optimal N=1: {ensemble_score:.6f}')

In [None]:
# Rank the N values by their best score, largest first: these are the groups
# where an improvement would matter most.
print('\nN values with highest scores (most room for improvement):')
n_scores = []
for n in range(1, 201):
    n_scores.append((n, best[n]['score']))

# Sort a copy so n_scores keeps its original N order.
n_scores_sorted = list(n_scores)
n_scores_sorted.sort(key=lambda pair: pair[1], reverse=True)

for n, score in n_scores_sorted[:20]:
    print(f'  N={n:3d}: {score:.6f}')

In [None]:
# Tally how many per-N winners each source supplied
source_counts = {}
for n in range(1, 201):
    src = best[n]['src']
    if not src:
        continue  # no winner recorded for this N
    if src in source_counts:
        source_counts[src] += 1
    else:
        source_counts[src] = 1

print('\nSources contributing to ensemble:')
top_sources = sorted(source_counts.items(), key=lambda item: item[1], reverse=True)
for src, count in top_sources[:10]:
    # File paths are shortened to their third-from-last component; non-path
    # labels are printed unchanged.
    label = src.split("/")[-3] if "/" in src else src
    print(f'  {count:3d} N values from {label}')

In [None]:
# Create ensemble submission from the best rows recorded for every N.
# Refuse to build a submission with holes: a missing N would otherwise be
# dropped silently (or make pd.concat fail with an unrelated-looking error
# when nothing at all was found).
missing_n = [n for n in range(1, 201) if best[n]['data'] is None]
if missing_n:
    raise ValueError(f'No configuration available for N values: {missing_n}')

rows = [best[n]['data'] for n in range(1, 201)]
ensemble_df = pd.concat(rows, ignore_index=True)

# Sort by N and index, compared numerically rather than as text.
# The id is split once and both parts are taken from the same result.
id_parts = ensemble_df['id'].str.split('_')
ensemble_df['sn'] = id_parts.str[0].astype(int)
ensemble_df['si'] = id_parts.str[1].astype(int)
ensemble_df = ensemble_df.sort_values(['sn', 'si']).drop(columns=['sn', 'si'])
# Keep only the submission columns, in submission order.
ensemble_df = ensemble_df[['id', 'x', 'y', 'deg']]

print(f'Ensemble submission has {len(ensemble_df)} rows')
print(ensemble_df.head())

In [None]:
# Save ensemble submission
submission_path = '/home/submission/submission.csv'
target_score = 68.894234  # Reference score the ensemble is compared against

os.makedirs('/home/submission', exist_ok=True)
ensemble_df.to_csv(submission_path, index=False)
print(f'Saved ensemble to {submission_path}')

# Verify score by re-reading the file that was just written
verify_df = pd.read_csv(submission_path)
verify_df['N'] = verify_df['id'].str.split('_').str[0].astype(int)
verify_score = 0.0
for n in range(1, 201):
    g = verify_df[verify_df['N'] == n]
    if len(g) != n:
        # score_group divides by the group size, so an empty group fails and
        # a wrong-sized one yields a misleading score; fail loudly instead.
        raise ValueError(
            f'Saved submission has {len(g)} rows for N={n}, expected {n}'
        )
    xs = strip(g['x'].values)
    ys = strip(g['y'].values)
    ds = strip(g['deg'].values)
    verify_score += score_group(xs, ys, ds, TX, TY)

print(f'Verified ensemble score: {verify_score:.6f}')
print(f'Target: {target_score:.6f}')
print(f'Gap: {verify_score - target_score:.6f}')