# Ensemble from ALL Available Sources

Build an ensemble by scanning all 3213 CSV files and keeping the best (lowest-score) configuration for each N (1-200).

In [None]:
import numpy as np
import pandas as pd
import math
from numba import njit
import glob
import os
from tqdm import tqdm
import json

# Tree geometry: outline vertex j of the unrotated tree is (TX[j], TY[j]).
# Both arrays hold 15 float64 values, one per vertex.
TX = np.asarray(
    [
        0, 0.125, 0.0625, 0.2, 0.1,
        0.35, 0.075, 0.075, -0.075, -0.075,
        -0.35, -0.1, -0.2, -0.0625, -0.125,
    ],
    dtype=np.float64,
)
TY = np.asarray(
    [
        0.8, 0.5, 0.5, 0.25, 0.25,
        0, 0, -0.2, -0.2, 0,
        0, 0.25, 0.25, 0.5, 0.5,
    ],
    dtype=np.float64,
)

In [None]:
@njit
def score_group(xs, ys, degs, tx, ty):
    """Score one group of trees (lower is smaller footprint per tree).

    Tree ``i`` is the outline ``(tx, ty)`` rotated by ``degs[i]`` degrees and
    then shifted by ``(xs[i], ys[i])``. The result is the square of the longer
    side of the axis-aligned bounding box of all transformed vertices,
    divided by the number of trees.

    Args:
        xs, ys: per-tree translation, one entry per tree.
        degs: per-tree rotation angle in degrees.
        tx, ty: outline vertex coordinates of the unrotated tree.

    Returns:
        ``side * side / xs.size`` as a float.
    """
    tree_count = xs.size
    vertex_count = tx.size

    # Running bounding box, started from sentinels that any real
    # coordinate will replace.
    x_lo = 1e300
    x_hi = -1e300
    y_lo = 1e300
    y_hi = -1e300

    for t in range(tree_count):
        theta = degs[t] * math.pi / 180.0
        cos_t = math.cos(theta)
        sin_t = math.sin(theta)
        shift_x = xs[t]
        shift_y = ys[t]
        for k in range(vertex_count):
            # Rotate the outline vertex, then translate it to the tree position.
            px = cos_t * tx[k] - sin_t * ty[k] + shift_x
            py = sin_t * tx[k] + cos_t * ty[k] + shift_y
            x_lo = px if px < x_lo else x_lo
            x_hi = px if px > x_hi else x_hi
            y_lo = py if py < y_lo else y_lo
            y_hi = py if py > y_hi else y_hi

    side = max(x_hi - x_lo, y_hi - y_lo)
    return side * side / tree_count

def strip(a):
    """Convert submission-style values to a float64 array.

    Each value may be a number or a string carrying a single leading ``'s'``
    marker (e.g. ``'s0.25'``). Only that leading marker is removed; an ``'s'``
    anywhere else is treated as malformed input rather than silently deleted,
    so corrupted text such as ``'1s5'`` is rejected instead of parsed as 15.

    Args:
        a: iterable of values (strings or numbers).

    Returns:
        1-D ``np.float64`` array with one entry per input value.

    Raises:
        ValueError: if a value is not a valid float once the marker is removed.
    """
    values = []
    for v in a:
        text = str(v).strip()
        if text.startswith('s'):
            text = text[1:]
        values.append(float(text))
    return np.array(values, np.float64)

# Warm up numba: call score_group once on a single tree (origin, 45 degrees)
# so the first-call cost is paid here rather than inside the scan.
_ = score_group(np.zeros(1), np.zeros(1), np.full(1, 45.0), TX, TY)
print("Numba warmed up")

In [None]:
# Find ALL CSV files in snapshots.
# glob yields paths in arbitrary (filesystem-dependent) order; sort them so the
# scan order -- and therefore which file wins when two files tie on score --
# is the same on every run.
csv_files = sorted(
    glob.glob('/home/nonroot/snapshots/santa-2025/**/*.csv', recursive=True)
)
print(f"Found {len(csv_files)} CSV files")

In [None]:
# Initialize best configurations for each N.
# score=1e300 / data=None means "nothing found yet" for that N.
best = {n: {'score': 1e300, 'data': None, 'src': None} for n in range(1, 201)}

# Scan all CSV files, keeping for each N the lowest-scoring group seen.
# The scan is deliberately best-effort: anything unreadable or malformed is
# skipped. Only Exception is caught, so Ctrl-C (KeyboardInterrupt) and
# SystemExit still stop the loop.
for fp in tqdm(csv_files, desc="Scanning CSV files"):
    try:
        df = pd.read_csv(fp)
    except Exception:
        continue

    # Check if it has the required columns
    if not {'id', 'x', 'y', 'deg'}.issubset(df.columns):
        continue

    # Parse N from the part of the id before the first underscore
    try:
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
    except Exception:
        continue

    # Process each N group
    for n, g in df.groupby('N'):
        if n < 1 or n > 200:
            continue
        if len(g) != n:  # Must have exactly n trees
            continue

        try:
            xs = strip(g['x'].to_numpy())
            ys = strip(g['y'].to_numpy())
            ds = strip(g['deg'].to_numpy())
        except Exception:
            continue

        # NaN/inf values never satisfy the </> tests inside score_group, so a
        # tree with a missing coordinate would be ignored and the group would
        # look smaller than it is. Reject such groups outright.
        if not (
            np.isfinite(xs).all()
            and np.isfinite(ys).all()
            and np.isfinite(ds).all()
        ):
            continue

        try:
            sc = score_group(xs, ys, ds, TX, TY)
        except Exception:
            continue

        # Strict '<' keeps the first file seen when two files tie.
        if sc < best[n]['score']:
            best[n]['score'] = float(sc)
            best[n]['data'] = g.drop(columns=['N']).copy()
            best[n]['src'] = fp.split('/')[-1]

print("\nScanning complete!")

In [None]:
# Override N=1 with a single tree at the origin rotated by 45 degrees, which
# this notebook treats as the theoretical minimum for one tree.
# NOTE(review): this replaces whatever the scan found for N=1 unconditionally.
manual_data = pd.DataFrame({
    'id': ['001_0'],
    'x': ['s0.0'],
    'y': ['s0.0'],
    'deg': ['s45.0']
})
xs = strip(manual_data['x'].to_numpy())
ys = strip(manual_data['y'].to_numpy())
ds = strip(manual_data['deg'].to_numpy())
sc = score_group(xs, ys, ds, TX, TY)
best[1]['score'] = float(sc)
best[1]['data'] = manual_data.copy()
best[1]['src'] = 'optimal_45deg'
# The degree sign is written as an escape so the message cannot be garbled
# again if this file is re-saved or read with a non-UTF-8 encoding.
print(f"N=1 set to optimal 45\u00b0 rotation, score: {sc:.6f}")

In [None]:
# Build the ensemble submission: collect the winning frame for every N,
# accumulate the total score, and tally how many N values each source won.
rows = []
total_score = 0.0
source_counts = {}

for tree_count in range(1, 201):
    chosen = best[tree_count]
    frame = chosen['data']
    if frame is None:
        # Nothing was found for this N; it is left out of the ensemble.
        print(f"WARNING: No data for N={tree_count}")
        continue
    rows.append(frame)
    total_score += chosen['score']
    origin = chosen['src']
    source_counts[origin] = source_counts.get(origin, 0) + 1

print(f"\nTotal ensemble score: {total_score:.6f}")
print(f"\nTop sources by contribution:")
# Descending by count; the stable sort keeps first-seen order among ties.
ranked_sources = sorted(source_counts.items(), key=lambda item: item[1], reverse=True)
for origin, wins in ranked_sources[:10]:
    print(f"  {origin}: {wins} N values")

In [None]:
# Concatenate the per-N frames and order rows by (N, tree index), both taken
# from the numeric parts of the id ("<N>_<index>").
ensemble_df = pd.concat(rows, ignore_index=True)
id_parts = ensemble_df['id'].str.split('_')
ensemble_df = (
    ensemble_df
    .assign(
        sn=id_parts.str[0].astype(int),
        si=id_parts.str[1].astype(int),
    )
    .sort_values(['sn', 'si'])
    .drop(columns=['sn', 'si'])
)
# Keep only the submission columns, in submission order.
ensemble_df = ensemble_df[['id', 'x', 'y', 'deg']]

print(f"Ensemble has {len(ensemble_df)} rows")
print(f"Expected: {sum(range(1, 201))} rows")

In [None]:
# Save the ensemble to the experiment folder and to the submission folder.
for out_path in (
    '/home/code/experiments/002_ensemble/ensemble.csv',
    '/home/submission/submission.csv',
):
    # Create the target folder if it is missing so the save does not fail
    # after the whole scan has already been done.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    ensemble_df.to_csv(out_path, index=False)
print("Saved ensemble to experiment folder and submission folder")

In [None]:
# Compare with baseline: absolute and relative gain of the ensemble total
# over the hard-coded baseline score (positive improvement = lower score).
baseline_score = 70.619825
improvement = baseline_score - total_score
improvement_pct = improvement / baseline_score * 100
print(
    f"\nBaseline score: {baseline_score:.6f}",
    f"Ensemble score: {total_score:.6f}",
    f"Improvement: {improvement:.6f} ({improvement_pct:.4f}%)",
    sep="\n",
)

In [None]:
# Score breakdown by N range: each range's share of the total ensemble score.
print("\nScore breakdown by N range:")
ranges = [(1, 10), (11, 50), (51, 100), (101, 150), (151, 200)]
for start, end in ranges:
    # Skip N values that have no data: their 'score' is still the 1e300
    # placeholder and they were not counted in total_score either.
    range_score = sum(
        best[n]['score']
        for n in range(start, end + 1)
        if best[n]['data'] is not None
    )
    # Guard against a zero total (e.g. nothing was found at all).
    pct = range_score / total_score * 100 if total_score else 0.0
    print(f"  N={start}-{end}: {range_score:.4f} ({pct:.1f}%)")

In [None]:
# Save metrics: write the total ensemble score as JSON next to the experiment.
metrics = {'cv_score': total_score}
metrics_path = '/home/code/experiments/002_ensemble/metrics.json'
with open(metrics_path, 'w') as fh:
    fh.write(json.dumps(metrics))
print(f"Saved metrics: {metrics}")