# Experiment 005: Multi-Source Ensemble

The key insight from top kernels is to ensemble solutions from 15+ different sources.
We have 88 snapshot submissions, so let's use ALL of them!

Approach:
1. Load ALL snapshot submissions
2. For each N, take the BEST (lowest-scoring) solution across all sources
3. Track which source contributed to each N
4. Validate and create submission

In [1]:
import pandas as pd
import numpy as np
from numba import njit
import os
import json
import time
import warnings
# Suppress every warning category for the rest of the kernel session so that
# cell output stays compact.
# NOTE(review): this also hides deprecation warnings from the imported
# libraries — consider narrowing the filter if results look suspicious.
warnings.filterwarnings('ignore')

# Marker so the rendered notebook shows that the import cell ran.
print("Setup complete")

Setup complete


In [2]:
# Fast scoring using numba (from jonathanchan kernel)
@njit
def make_polygon_template():
    """Build the 15-vertex polygon template that every placed shape shares.

    Returns:
        (x, y): two float64 arrays of length 15 holding the vertex coordinates
        of the unrotated, untranslated outline. The x values are mirrored
        about x = 0; y runs from -0.2 (lowest vertices) up to 0.8 (the tip).
    """
    # Horizontal extents. The local names reflect what the values appear to
    # describe (trunk / bottom / middle / top tier widths) — TODO confirm
    # against the competition's shape definition.
    trunk_w = 0.15
    trunk_h = 0.2
    bottom_w = 0.7
    middle_w = 0.4
    top_w = 0.25

    # Vertical levels of the outline.
    tip_y = 0.8
    upper_y = 0.5
    lower_y = 0.25
    base_y = 0.0
    trunk_y = -trunk_h

    # Walk the outline starting at the tip: first the positive-x side, then
    # across the lowest edge, then back up the negative-x side.
    x = np.array([
        0, top_w / 2, top_w / 4, middle_w / 2, middle_w / 4, bottom_w / 2,
        trunk_w / 2, trunk_w / 2, -trunk_w / 2, -trunk_w / 2,
        -bottom_w / 2, -middle_w / 4, -middle_w / 2, -top_w / 4, -top_w / 2,
    ], np.float64)
    y = np.array([
        tip_y, upper_y, upper_y, lower_y, lower_y, base_y,
        base_y, trunk_y, trunk_y, base_y,
        base_y, lower_y, lower_y, upper_y, upper_y,
    ], np.float64)
    return x, y

@njit
def score_group(xs, ys, degs, tx, ty):
    """Score one group of placed shapes as (bounding-square side)^2 / count.

    Each shape i is the template (tx, ty) rotated by degs[i] degrees and then
    shifted by (xs[i], ys[i]). The axis-aligned bounding box over all
    transformed vertices is tracked; the square side is the larger of its
    width and height.

    Args:
        xs, ys: per-shape translation offsets (indexable, same length).
        degs: per-shape rotation angles in degrees.
        tx, ty: template vertex coordinates (same length).

    Returns:
        side * side / n, where n is the number of shapes (xs.size).
    """
    count = xs.size
    n_vertices = tx.size

    # Running bounding box, seeded with sentinels that any real vertex beats.
    x_lo = 1e300
    y_lo = 1e300
    x_hi = -1e300
    y_hi = -1e300

    for k in range(count):
        theta = degs[k] * np.pi / 180.0
        cos_t = np.cos(theta)
        sin_t = np.sin(theta)
        shift_x = xs[k]
        shift_y = ys[k]
        for v in range(n_vertices):
            # Standard 2-D rotation followed by translation.
            px = cos_t * tx[v] - sin_t * ty[v] + shift_x
            py = sin_t * tx[v] + cos_t * ty[v] + shift_y
            # The four checks are deliberately independent: a single vertex
            # may update both the lower and the upper bound.
            if px < x_lo:
                x_lo = px
            if px > x_hi:
                x_hi = px
            if py < y_lo:
                y_lo = py
            if py > y_hi:
                y_hi = py

    width = x_hi - x_lo
    height = y_hi - y_lo
    side = max(width, height)
    return side * side / count

def strip(a):
    """Parse submission cells such as 's1.25' into a float64 array.

    Every value is converted to text, all 's' characters are removed, and the
    remainder is parsed with float().

    Args:
        a: iterable of values (strings with an 's' prefix, or plain numbers).

    Returns:
        1-D numpy float64 array with one entry per input value.
    """
    def _parsed():
        for raw in a:
            yield float(str(raw).replace("s", ""))

    return np.fromiter(_parsed(), dtype=np.float64)

# Build the shared polygon template once; tx/ty are reused by every
# score_group call later in the notebook.
# NOTE(review): numba compiles @njit functions lazily on first call, so only
# make_polygon_template is actually compiled here; score_group compiles the
# first time it is called.
tx, ty = make_polygon_template()
print("Numba scoring functions compiled")

Numba scoring functions compiled


In [3]:
# Locate every snapshot that ships a submission file
print("=" * 60)
print("STEP 1: COLLECTING ALL SNAPSHOT SUBMISSIONS")
print("=" * 60)

snapshot_dir = '/home/nonroot/snapshots/santa-2025/'

# Each snapshot is expected at <snapshot_dir>/<name>/submission/submission.csv;
# directory names are visited in sorted order and entries without that file
# are dropped.
candidate_paths = (
    os.path.join(snapshot_dir, entry, 'submission', 'submission.csv')
    for entry in sorted(os.listdir(snapshot_dir))
)
sources = [path for path in candidate_paths if os.path.exists(path)]

print(f"Found {len(sources)} snapshot submissions")

STEP 1: COLLECTING ALL SNAPSHOT SUBMISSIONS
Found 88 snapshot submissions


In [4]:
# For each N, track best solution across ALL sources
print("\n" + "=" * 60)
print("STEP 2: ENSEMBLE - FIND BEST PER-N FROM ALL SOURCES")
print("=" * 60)

# Columns a submission must provide; only these are kept in the ensemble so
# that stray extra columns in one source cannot leak into the final file.
REQUIRED_COLUMNS = ["id", "x", "y", "deg"]

# best[n] holds the lowest score seen so far for group size n, the rows that
# produced it, and the path of the submission they came from. The 1e300
# score is a sentinel meaning "no valid solution yet".
best = {n: {"score": 1e300, "data": None, "src": None} for n in range(1, 201)}

start_time = time.time()
processed = 0
errors = 0
failed_sources = []  # (path, reason) pairs, so skipped files are visible instead of silently dropped

for source_path in sources:
    try:
        df = pd.read_csv(source_path)
        missing_columns = [c for c in REQUIRED_COLUMNS if c not in df.columns]
        if missing_columns:
            errors += 1
            failed_sources.append((source_path, f"missing columns {missing_columns}"))
            continue

        # Parse N from id column (the part of the id before the first "_")
        df["N"] = df["id"].astype(str).str.split("_").str[0].astype(int)

        for n, g in df.groupby("N"):
            if n < 1 or n > 200:
                continue
            if len(g) != n:
                continue  # Invalid - wrong number of trees

            xs = strip(g["x"].to_numpy())
            ys = strip(g["y"].to_numpy())
            ds = strip(g["deg"].to_numpy())
            sc = score_group(xs, ys, ds, tx, ty)

            # Strict "<" keeps the first source on ties; a NaN score never
            # compares smaller, so it is never selected.
            if sc < best[n]["score"]:
                best[n]["score"] = float(sc)
                best[n]["data"] = g[REQUIRED_COLUMNS].copy()
                best[n]["src"] = source_path

        processed += 1
        if processed % 20 == 0:
            print(f"  Processed {processed}/{len(sources)} sources...")

    except Exception as e:
        # Best effort: one unreadable or malformed file must not abort the
        # ensemble, but the failure is recorded so it can be inspected.
        # Groups scored before the failure keep any improvement they made.
        errors += 1
        failed_sources.append((source_path, f"{type(e).__name__}: {e}"))

print(f"\nProcessed {processed} sources, {errors} errors")
for failed_path, reason in failed_sources:
    print(f"  SKIPPED {failed_path}: {reason}")
print(f"Time: {time.time() - start_time:.1f}s")


STEP 2: ENSEMBLE - FIND BEST PER-N FROM ALL SOURCES


  Processed 20/88 sources...


  Processed 40/88 sources...


  Processed 60/88 sources...


  Processed 80/88 sources...



Processed 87 sources, 1 errors
Time: 5.7s


In [None]:
# Calculate ensemble score
# A group size that never received a valid solution still carries the 1e300
# sentinel score, which would make the total meaningless — say so explicitly.
missing_n = [n for n in range(1, 201) if best[n]["data"] is None]
if missing_n:
    print(f"WARNING: no valid solution for N values {missing_n}; "
          "the ensemble total below includes the 1e300 sentinel")

ensemble_total = sum(best[n]["score"] for n in range(1, 201))
print(f"\nEnsemble total score: {ensemble_total:.6f}")

# Compare to baseline
baseline_path = '/home/code/experiments/001_fix_overlaps/submission.csv'
baseline_df = pd.read_csv(baseline_path)
# Group size N is the part of the id before the first "_".
baseline_df["N"] = baseline_df["id"].astype(str).str.split("_").str[0].astype(int)

# Score every baseline group with the same scorer used for the ensemble.
baseline_scores = {}
for n, g in baseline_df.groupby("N"):
    xs = strip(g["x"].to_numpy())
    ys = strip(g["y"].to_numpy())
    ds = strip(g["deg"].to_numpy())
    baseline_scores[n] = float(score_group(xs, ys, ds, tx, ty))

# Later cells index baseline_scores by every N in 1..200; flag gaps here.
missing_baseline_n = [n for n in range(1, 201) if n not in baseline_scores]
if missing_baseline_n:
    print(f"WARNING: baseline has no rows for N values {missing_baseline_n}")

baseline_total = sum(baseline_scores.values())
print(f"Baseline total score: {baseline_total:.6f}")
# Lower is better, so a positive difference means the ensemble improved.
print(f"Improvement: {baseline_total - ensemble_total:.6f}")

In [None]:
# Break down which snapshot supplied the winning solution for each N
print("\n" + "=" * 60)
print("STEP 3: SOURCE ANALYSIS")
print("=" * 60)

from collections import Counter

# Tally, per source path, how many group sizes it won. A key of None counts
# group sizes for which no source was recorded.
source_counts = Counter()
for n in range(1, 201):
    source_counts[best[n]["src"]] += 1

print(f"\nUnique sources used: {len(source_counts)}")
print("\nTop 10 contributing sources:")
for src, count in source_counts.most_common(10):
    # The third path component from the end is the snapshot directory name
    # (<snapshot>/submission/submission.csv).
    if src:
        src_name = src.split('/')[-3]
    else:
        src_name = 'None'
    print(f"  {src_name}: {count} N values")

In [None]:
# List the group sizes where the ensemble beats the baseline
print("\n" + "=" * 60)
print("STEP 4: PER-N IMPROVEMENTS")
print("=" * 60)

# Collect (N, gain, winning source) for every group size whose ensemble score
# is lower than the baseline score by more than a 1e-9 tolerance.
improvements = []
for n in range(1, 201):
    winner = best[n]
    diff = baseline_scores[n] - winner["score"]
    if not diff > 1e-9:
        continue
    improvements.append((n, diff, winner["src"]))

print(f"\nN values improved: {len(improvements)}")
if not improvements:
    print("No improvements found - all sources have same or worse solutions")
else:
    print("\nTop 20 improvements:")
    # Largest gain first.
    ranked = sorted(improvements, key=lambda entry: -entry[1])
    for n, diff, src in ranked[:20]:
        src_name = 'None' if not src else src.split('/')[-3]
        print(f"  N={n:3d}: +{diff:.6f} from {src_name}")

    total_improvement = sum(entry[1] for entry in improvements)
    print(f"\nTotal improvement from per-N selection: {total_improvement:.6f}")

In [None]:
# Create submission from ensemble
print("\n" + "=" * 60)
print("STEP 5: CREATE SUBMISSION")
print("=" * 60)

# Column set and order of the submission file; selecting them explicitly
# keeps helper or extra columns from any source out of the output.
SUBMISSION_COLUMNS = ["id", "x", "y", "deg"]

# Gather the winning rows for each N in ascending order of N.
frames = []
for n in range(1, 201):
    data = best[n]["data"]
    if data is not None:
        frames.append(data[SUBMISSION_COLUMNS])
    else:
        print(f"WARNING: No data for N={n}")

# Concatenate whole frames once instead of rebuilding the table row by row;
# ignore_index gives a fresh 0..len-1 index. pd.concat rejects an empty list,
# hence the explicit fallback.
if frames:
    submission_df = pd.concat(frames, ignore_index=True)
else:
    submission_df = pd.DataFrame(columns=SUBMISSION_COLUMNS)

# 1 + 2 + ... + 200 = 20100 rows, 4 columns.
print(f"Submission shape: {submission_df.shape}")
print("Expected: (20100, 4)")
if submission_df.shape != (20100, 4):
    print("WARNING: submission shape does not match the expected (20100, 4)")

# Verify format
print("\nFirst few rows:")
print(submission_df.head())

In [None]:
# Save submission
# The same file is written to the experiment folder and to the submission
# location.
output_paths = [
    '/home/code/experiments/005_multi_source_ensemble/submission.csv',
    '/home/submission/submission.csv',
]
for output_path in output_paths:
    # to_csv does not create missing parent directories, so make sure the
    # target folder exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    submission_df.to_csv(output_path, index=False)
print("Submission saved!")

In [None]:
# Save metrics
# Score we are trying to reach; named once instead of being repeated inline.
TARGET_SCORE = 68.888293

# Lower scores are better: a positive improvement means the ensemble beats
# the baseline, and a positive gap means the target has not been reached.
improvement = baseline_total - ensemble_total
gap_to_target = ensemble_total - TARGET_SCORE

metrics = {
    'cv_score': ensemble_total,
    'baseline_score': baseline_total,
    'improvement': improvement,
    'n_improved': len(improvements),
    'sources_used': len(source_counts),
    'total_sources': len(sources),
    'target': TARGET_SCORE,
    'gap': gap_to_target
}

with open('/home/code/experiments/005_multi_source_ensemble/metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)

print("\nMetrics saved!")
print("\n" + "=" * 60)
print("FINAL RESULTS")
print("=" * 60)
print(f"Baseline score: {baseline_total:.6f}")
print(f"Ensemble score: {ensemble_total:.6f}")
print(f"Improvement: {improvement:.6f}")
print(f"N values improved: {len(improvements)}")
print(f"Target: {TARGET_SCORE:.6f}")
print(f"Gap to target: {gap_to_target:.6f}")