# Loop 5 Analysis: Comprehensive Ensemble from All Sources

## Key Insight from jonathanchan kernel:
1. Ensemble from 19+ sources (GitHub, Kaggle datasets, notebooks)
2. Override N=1 with optimal value (x=0, y=0, deg=45)
3. Apply fractional translation (0.001 to 0.00001 steps)

## Available Sources:
- 88 snapshots (already scanned)
- Preoptimized folder: telegram, santa25-public, bucket-of-chump, etc.
- 3434 total CSV files

In [1]:
import pandas as pd
import numpy as np
import os
import glob
from shapely.geometry import Polygon
from shapely.affinity import rotate, translate
import math
from tqdm import tqdm
import json

# Tree geometry
def make_tree_polygon():
    tw, th = 0.15, 0.2
    bw, mw, ow = 0.7, 0.4, 0.25
    tip, t1, t2, base, tbot = 0.8, 0.5, 0.25, 0.0, -0.2
    x = [0, ow/2, ow/4, mw/2, mw/4, bw/2, tw/2, tw/2, -tw/2, -tw/2, -bw/2, -mw/4, -mw/2, -ow/4, -ow/2]
    y = [tip, t1, t1, t2, t2, base, base, tbot, tbot, base, base, t2, t2, t1, t1]
    return list(zip(x, y))

TREE_TEMPLATE = make_tree_polygon()

def get_tree_polygon(x, y, deg):
    poly = Polygon(TREE_TEMPLATE)
    poly = rotate(poly, deg, origin=(0, 0))
    poly = translate(poly, x, y)
    return poly

def score_group(xs, ys, degs):
    n = len(xs)
    all_x, all_y = [], []
    for i in range(n):
        r = math.radians(degs[i])
        c, s = math.cos(r), math.sin(r)
        for tx, ty in TREE_TEMPLATE:
            X = c * tx - s * ty + xs[i]
            Y = s * tx + c * ty + ys[i]
            all_x.append(X)
            all_y.append(Y)
    side = max(max(all_x) - min(all_x), max(all_y) - min(all_y))
    return side * side / n

def check_overlaps(xs, ys, degs):
    n = len(xs)
    polys = [get_tree_polygon(xs[i], ys[i], degs[i]) for i in range(n)]
    for i in range(n):
        for j in range(i+1, n):
            if polys[i].intersects(polys[j]) and not polys[i].touches(polys[j]):
                inter = polys[i].intersection(polys[j])
                if inter.area > 1e-10:  # Significant overlap
                    return True, f"Trees {i} and {j} overlap (area={inter.area})"
    return False, "OK"

print("Functions defined")

Functions defined


In [2]:
# Find ALL CSV files in preoptimized folder
preopt_base = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/'

preopt_files = []
for root, dirs, files in os.walk(preopt_base):
    for f in files:
        if f.endswith('.csv'):
            preopt_files.append(os.path.join(root, f))

print(f"Found {len(preopt_files)} preoptimized CSV files")
for f in preopt_files[:20]:
    print(f"  {f}")

Found 30 preoptimized CSV files
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/ensemble.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/submission.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa-2025.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/best_ensemble.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/72.49.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/71.97.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/telegram_extracted/72.49.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/telegram_extracted/71.97.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public/submission_JKoT4.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public/New_Tree_144_196.csv
  /home/nonroot/snapshots/santa-2025/21116303805/co

In [3]:
# Score each preoptimized file
def strip_s(val):
    s = str(val)
    return float(s[1:] if s.startswith('s') else s)

def load_and_score_csv(filepath):
    try:
        df = pd.read_csv(filepath)
        if not {'id', 'x', 'y', 'deg'}.issubset(df.columns):
            return None, None
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
        
        scores = {}
        for n, g in df.groupby('N'):
            if n < 1 or n > 200:
                continue
            xs = [strip_s(v) for v in g['x'].values]
            ys = [strip_s(v) for v in g['y'].values]
            ds = [strip_s(v) for v in g['deg'].values]
            scores[n] = score_group(xs, ys, ds)
        
        total = sum(scores.values())
        return total, scores
    except Exception as e:
        return None, None

# Score all preoptimized files
preopt_scores = {}
for fp in tqdm(preopt_files, desc="Scoring preoptimized"):
    total, per_n = load_and_score_csv(fp)
    if total is not None:
        preopt_scores[fp] = {'total': total, 'per_n': per_n}

print(f"\nScored {len(preopt_scores)} files")

# Sort by total score
sorted_preopt = sorted(preopt_scores.items(), key=lambda x: x[1]['total'])
print("\nTop 10 preoptimized solutions:")
for fp, data in sorted_preopt[:10]:
    print(f"  {data['total']:.6f}: {os.path.basename(fp)}")

Scoring preoptimized:   0%|          | 0/30 [00:00<?, ?it/s]

Scoring preoptimized:   3%|▎         | 1/30 [00:00<00:03,  8.18it/s]

Scoring preoptimized:   7%|▋         | 2/30 [00:00<00:03,  8.32it/s]

Scoring preoptimized:  10%|█         | 3/30 [00:00<00:03,  7.71it/s]

Scoring preoptimized:  13%|█▎        | 4/30 [00:00<00:03,  8.00it/s]

Scoring preoptimized:  17%|█▋        | 5/30 [00:00<00:02,  8.34it/s]

Scoring preoptimized:  20%|██        | 6/30 [00:00<00:02,  8.32it/s]

Scoring preoptimized:  23%|██▎       | 7/30 [00:00<00:02,  8.04it/s]

Scoring preoptimized:  27%|██▋       | 8/30 [00:00<00:02,  8.15it/s]

Scoring preoptimized:  30%|███       | 9/30 [00:01<00:02,  8.44it/s]

Scoring preoptimized:  37%|███▋      | 11/30 [00:01<00:02,  8.96it/s]

Scoring preoptimized:  40%|████      | 12/30 [00:01<00:02,  8.59it/s]

Scoring preoptimized:  43%|████▎     | 13/30 [00:01<00:01,  8.72it/s]

Scoring preoptimized:  47%|████▋     | 14/30 [00:01<00:01,  8.87it/s]

Scoring preoptimized:  50%|█████     | 15/30 [00:01<00:01,  8.96it/s]

Scoring preoptimized:  53%|█████▎    | 16/30 [00:01<00:01,  9.00it/s]

Scoring preoptimized:  57%|█████▋    | 17/30 [00:02<00:01,  8.47it/s]

Scoring preoptimized:  60%|██████    | 18/30 [00:02<00:01,  8.63it/s]

Scoring preoptimized:  63%|██████▎   | 19/30 [00:02<00:01,  8.81it/s]

Scoring preoptimized:  67%|██████▋   | 20/30 [00:02<00:01,  8.95it/s]

Scoring preoptimized:  70%|███████   | 21/30 [00:02<00:01,  8.42it/s]

Scoring preoptimized:  73%|███████▎  | 22/30 [00:02<00:00,  8.61it/s]

Scoring preoptimized:  77%|███████▋  | 23/30 [00:02<00:00,  8.98it/s]

Scoring preoptimized:  80%|████████  | 24/30 [00:02<00:00,  9.07it/s]

Scoring preoptimized:  83%|████████▎ | 25/30 [00:02<00:00,  8.93it/s]

Scoring preoptimized:  87%|████████▋ | 26/30 [00:03<00:00,  8.36it/s]

Scoring preoptimized:  90%|█████████ | 27/30 [00:03<00:00,  8.43it/s]

Scoring preoptimized:  93%|█████████▎| 28/30 [00:03<00:00,  8.67it/s]

Scoring preoptimized:  97%|█████████▋| 29/30 [00:03<00:00,  8.84it/s]

Scoring preoptimized: 100%|██████████| 30/30 [00:03<00:00,  8.36it/s]

Scoring preoptimized: 100%|██████████| 30/30 [00:03<00:00,  8.57it/s]


Scored 30 files

Top 10 preoptimized solutions:
  70.676102: ensemble.csv
  70.676102: santa-2025.csv
  70.676102: best_ensemble.csv
  70.676102: santa-2025.csv
  70.676501: submission.csv
  70.676501: submission.csv
  70.926150: submission_70_926149550346.csv
  70.926150: submission_best.csv
  70.936674: submission_70_936673758122.csv
  70.990692: submission_opt1.csv





In [4]:
# Now let's build the BEST ensemble from ALL sources
# 1. Load all snapshot submissions
# 2. Load all preoptimized files
# 3. For each N, find the best VALID solution

# First, let's load the current valid baseline
valid_baseline_path = '/home/nonroot/snapshots/santa-2025/21328309254/submission/submission.csv'
baseline_df = pd.read_csv(valid_baseline_path)
baseline_df['N'] = baseline_df['id'].astype(str).str.split('_').str[0].astype(int)

baseline_per_n = {}
for n, g in baseline_df.groupby('N'):
    xs = [strip_s(v) for v in g['x'].values]
    ys = [strip_s(v) for v in g['y'].values]
    ds = [strip_s(v) for v in g['deg'].values]
    baseline_per_n[n] = {
        'score': score_group(xs, ys, ds),
        'data': g.drop(columns=['N']).copy(),
        'source': 'valid_baseline'
    }

print(f"Baseline total: {sum(d['score'] for d in baseline_per_n.values()):.6f}")

Baseline total: 70.647327


In [5]:
# Build best-per-N from ALL sources (including preoptimized)
best_per_n = {n: baseline_per_n[n].copy() for n in range(1, 201)}

# Add preoptimized solutions
for fp, data in tqdm(preopt_scores.items(), desc="Processing preoptimized"):
    try:
        df = pd.read_csv(fp)
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
        
        for n, g in df.groupby('N'):
            if n < 1 or n > 200:
                continue
            xs = [strip_s(v) for v in g['x'].values]
            ys = [strip_s(v) for v in g['y'].values]
            ds = [strip_s(v) for v in g['deg'].values]
            score = score_group(xs, ys, ds)
            
            if score < best_per_n[n]['score']:
                best_per_n[n] = {
                    'score': score,
                    'data': g.drop(columns=['N']).copy(),
                    'source': os.path.basename(fp)
                }
    except:
        continue

# Calculate total score (ignoring overlaps for now)
total_best = sum(d['score'] for d in best_per_n.values())
print(f"\nBest ensemble (ignoring overlaps): {total_best:.6f}")
print(f"Improvement from baseline: {sum(d['score'] for d in baseline_per_n.values()) - total_best:.6f}")

Processing preoptimized:   0%|          | 0/30 [00:00<?, ?it/s]

Processing preoptimized:   3%|▎         | 1/30 [00:00<00:03,  8.42it/s]

Processing preoptimized:   7%|▋         | 2/30 [00:00<00:03,  8.34it/s]

Processing preoptimized:  10%|█         | 3/30 [00:00<00:03,  7.62it/s]

Processing preoptimized:  13%|█▎        | 4/30 [00:00<00:03,  7.89it/s]

Processing preoptimized:  17%|█▋        | 5/30 [00:00<00:03,  8.26it/s]

Processing preoptimized:  20%|██        | 6/30 [00:00<00:02,  8.26it/s]

Processing preoptimized:  23%|██▎       | 7/30 [00:00<00:02,  8.49it/s]

Processing preoptimized:  27%|██▋       | 8/30 [00:00<00:02,  7.90it/s]

Processing preoptimized:  30%|███       | 9/30 [00:01<00:02,  8.18it/s]

Processing preoptimized:  33%|███▎      | 10/30 [00:01<00:02,  8.62it/s]

Processing preoptimized:  37%|███▋      | 11/30 [00:01<00:02,  8.74it/s]

Processing preoptimized:  40%|████      | 12/30 [00:01<00:02,  8.31it/s]

Processing preoptimized:  43%|████▎     | 13/30 [00:01<00:02,  8.49it/s]

Processing preoptimized:  47%|████▋     | 14/30 [00:01<00:01,  8.67it/s]

Processing preoptimized:  50%|█████     | 15/30 [00:01<00:01,  8.79it/s]

Processing preoptimized:  53%|█████▎    | 16/30 [00:01<00:01,  8.85it/s]

Processing preoptimized:  57%|█████▋    | 17/30 [00:02<00:01,  8.37it/s]

Processing preoptimized:  60%|██████    | 18/30 [00:02<00:01,  8.52it/s]

Processing preoptimized:  63%|██████▎   | 19/30 [00:02<00:01,  8.68it/s]

Processing preoptimized:  67%|██████▋   | 20/30 [00:02<00:01,  8.80it/s]

Processing preoptimized:  70%|███████   | 21/30 [00:02<00:01,  8.35it/s]

Processing preoptimized:  73%|███████▎  | 22/30 [00:02<00:00,  8.52it/s]

Processing preoptimized:  77%|███████▋  | 23/30 [00:02<00:00,  8.84it/s]

Processing preoptimized:  80%|████████  | 24/30 [00:02<00:00,  8.92it/s]

Processing preoptimized:  83%|████████▎ | 25/30 [00:02<00:00,  8.79it/s]

Processing preoptimized:  87%|████████▋ | 26/30 [00:03<00:00,  8.22it/s]

Processing preoptimized:  90%|█████████ | 27/30 [00:03<00:00,  8.27it/s]

Processing preoptimized:  93%|█████████▎| 28/30 [00:03<00:00,  8.48it/s]

Processing preoptimized:  97%|█████████▋| 29/30 [00:03<00:00,  8.65it/s]

Processing preoptimized: 100%|██████████| 30/30 [00:03<00:00,  8.13it/s]

Processing preoptimized: 100%|██████████| 30/30 [00:03<00:00,  8.43it/s]


Best ensemble (ignoring overlaps): 70.647327
Improvement from baseline: 0.000000





In [6]:
# Now check which solutions have overlaps
print("Checking for overlaps in best solutions...")

overlap_ns = []
for n in tqdm(range(1, 201), desc="Checking overlaps"):
    g = best_per_n[n]['data']
    xs = [strip_s(v) for v in g['x'].values]
    ys = [strip_s(v) for v in g['y'].values]
    ds = [strip_s(v) for v in g['deg'].values]
    
    has_overlap, msg = check_overlaps(xs, ys, ds)
    if has_overlap:
        overlap_ns.append(n)
        # Revert to baseline
        best_per_n[n] = baseline_per_n[n].copy()

print(f"\nFound {len(overlap_ns)} N values with overlaps: {overlap_ns[:20]}...")

# Calculate valid total
valid_total = sum(d['score'] for d in best_per_n.values())
print(f"\nValid ensemble score: {valid_total:.6f}")
print(f"Improvement from baseline: {sum(d['score'] for d in baseline_per_n.values()) - valid_total:.6f}")

Checking for overlaps in best solutions...


Checking overlaps:   0%|          | 0/200 [00:00<?, ?it/s]

Checking overlaps:  17%|█▋        | 34/200 [00:00<00:00, 338.07it/s]

Checking overlaps:  34%|███▍      | 68/200 [00:00<00:01, 112.62it/s]

Checking overlaps:  44%|████▎     | 87/200 [00:01<00:01, 70.59it/s] 

Checking overlaps:  50%|████▉     | 99/200 [00:01<00:01, 53.14it/s]

Checking overlaps:  54%|█████▍    | 108/200 [00:01<00:02, 42.82it/s]

Checking overlaps:  57%|█████▊    | 115/200 [00:02<00:02, 35.89it/s]

Checking overlaps:  60%|██████    | 120/200 [00:02<00:02, 31.43it/s]

Checking overlaps:  62%|██████▏   | 124/200 [00:02<00:02, 28.16it/s]

Checking overlaps:  64%|██████▍   | 128/200 [00:02<00:02, 24.98it/s]

Checking overlaps:  66%|██████▌   | 131/200 [00:03<00:03, 22.77it/s]

Checking overlaps:  67%|██████▋   | 134/200 [00:03<00:03, 20.72it/s]

Checking overlaps:  68%|██████▊   | 137/200 [00:03<00:03, 18.97it/s]

Checking overlaps:  70%|██████▉   | 139/200 [00:03<00:03, 17.80it/s]

Checking overlaps:  70%|███████   | 141/200 [00:03<00:03, 16.64it/s]

Checking overlaps:  72%|███████▏  | 143/200 [00:03<00:03, 15.87it/s]

Checking overlaps:  72%|███████▎  | 145/200 [00:04<00:03, 14.92it/s]

Checking overlaps:  74%|███████▎  | 147/200 [00:04<00:03, 14.25it/s]

Checking overlaps:  74%|███████▍  | 149/200 [00:04<00:03, 13.56it/s]

Checking overlaps:  76%|███████▌  | 151/200 [00:04<00:03, 13.00it/s]

Checking overlaps:  76%|███████▋  | 153/200 [00:04<00:03, 12.55it/s]

Checking overlaps:  78%|███████▊  | 155/200 [00:04<00:03, 12.09it/s]

Checking overlaps:  78%|███████▊  | 157/200 [00:05<00:03, 11.71it/s]

Checking overlaps:  80%|███████▉  | 159/200 [00:05<00:03, 11.48it/s]

Checking overlaps:  80%|████████  | 161/200 [00:05<00:03, 11.10it/s]

Checking overlaps:  82%|████████▏ | 163/200 [00:05<00:03, 10.77it/s]

Checking overlaps:  82%|████████▎ | 165/200 [00:05<00:03, 10.62it/s]

Checking overlaps:  84%|████████▎ | 167/200 [00:06<00:03, 10.33it/s]

Checking overlaps:  84%|████████▍ | 169/200 [00:06<00:03, 10.21it/s]

Checking overlaps:  86%|████████▌ | 171/200 [00:06<00:02,  9.93it/s]

Checking overlaps:  86%|████████▌ | 172/200 [00:06<00:02,  9.71it/s]

Checking overlaps:  86%|████████▋ | 173/200 [00:06<00:02,  9.54it/s]

Checking overlaps:  87%|████████▋ | 174/200 [00:06<00:02,  9.44it/s]

Checking overlaps:  88%|████████▊ | 175/200 [00:07<00:02,  9.16it/s]

Checking overlaps:  88%|████████▊ | 176/200 [00:07<00:02,  8.92it/s]

Checking overlaps:  88%|████████▊ | 177/200 [00:07<00:02,  8.71it/s]

Checking overlaps:  89%|████████▉ | 178/200 [00:07<00:02,  8.55it/s]

Checking overlaps:  90%|████████▉ | 179/200 [00:07<00:02,  8.43it/s]

Checking overlaps:  90%|█████████ | 180/200 [00:07<00:02,  8.45it/s]

Checking overlaps:  90%|█████████ | 181/200 [00:07<00:02,  8.27it/s]

Checking overlaps:  91%|█████████ | 182/200 [00:07<00:02,  8.13it/s]

Checking overlaps:  92%|█████████▏| 183/200 [00:07<00:02,  7.98it/s]

Checking overlaps:  92%|█████████▏| 184/200 [00:08<00:01,  8.02it/s]

Checking overlaps:  92%|█████████▎| 185/200 [00:08<00:01,  8.02it/s]

Checking overlaps:  93%|█████████▎| 186/200 [00:08<00:01,  8.01it/s]

Checking overlaps:  94%|█████████▎| 187/200 [00:08<00:01,  7.82it/s]

Checking overlaps:  94%|█████████▍| 188/200 [00:08<00:01,  7.88it/s]

Checking overlaps:  94%|█████████▍| 189/200 [00:08<00:01,  7.74it/s]

Checking overlaps:  95%|█████████▌| 190/200 [00:08<00:01,  7.58it/s]

Checking overlaps:  96%|█████████▌| 191/200 [00:09<00:01,  7.44it/s]

Checking overlaps:  96%|█████████▌| 192/200 [00:09<00:01,  7.42it/s]

Checking overlaps:  96%|█████████▋| 193/200 [00:09<00:00,  7.43it/s]

Checking overlaps:  97%|█████████▋| 194/200 [00:09<00:00,  7.29it/s]

Checking overlaps:  98%|█████████▊| 195/200 [00:09<00:00,  7.31it/s]

Checking overlaps:  98%|█████████▊| 196/200 [00:09<00:00,  7.36it/s]

Checking overlaps:  98%|█████████▊| 197/200 [00:09<00:00,  7.34it/s]

Checking overlaps:  99%|█████████▉| 198/200 [00:09<00:00,  7.33it/s]

Checking overlaps: 100%|█████████▉| 199/200 [00:10<00:00,  7.27it/s]

Checking overlaps: 100%|██████████| 200/200 [00:10<00:00,  7.06it/s]

Checking overlaps: 100%|██████████| 200/200 [00:10<00:00, 19.44it/s]


Found 0 N values with overlaps: []...

Valid ensemble score: 70.647327
Improvement from baseline: 0.000000





In [7]:
# Override N=1 with optimal value (from jonathanchan kernel)
# x=0, y=0, deg=45 gives score 0.6612499...
optimal_n1 = pd.DataFrame({
    'id': ['001_0'],
    'x': ['s0.0'],
    'y': ['s0.0'],
    'deg': ['s45.0']
})

xs = [strip_s(v) for v in optimal_n1['x'].values]
ys = [strip_s(v) for v in optimal_n1['y'].values]
ds = [strip_s(v) for v in optimal_n1['deg'].values]
n1_score = score_group(xs, ys, ds)

print(f"Optimal N=1 score: {n1_score:.10f}")
print(f"Current N=1 score: {best_per_n[1]['score']:.10f}")

if n1_score < best_per_n[1]['score']:
    print(f"Improvement: {best_per_n[1]['score'] - n1_score:.10f}")
    best_per_n[1] = {
        'score': n1_score,
        'data': optimal_n1.copy(),
        'source': 'optimal_n1'
    }

# Final score
final_total = sum(d['score'] for d in best_per_n.values())
print(f"\nFinal ensemble score: {final_total:.6f}")

Optimal N=1 score: 0.6612500000
Current N=1 score: 0.6612500000

Final ensemble score: 70.647327


In [8]:
# Show which sources contributed
source_counts = {}
for n in range(1, 201):
    src = best_per_n[n]['source']
    source_counts[src] = source_counts.get(src, 0) + 1

print("Sources contributing to ensemble:")
for src, count in sorted(source_counts.items(), key=lambda x: -x[1]):
    print(f"  {src}: {count} N values")

Sources contributing to ensemble:
  valid_baseline: 198 N values
  ensemble.csv: 1 N values
  submission.csv: 1 N values


In [9]:
# Create the final submission
rows = []
for n in range(1, 201):
    rows.append(best_per_n[n]['data'])

final_df = pd.concat(rows, ignore_index=True)
final_df['sn'] = final_df['id'].str.split('_').str[0].astype(int)
final_df['si'] = final_df['id'].str.split('_').str[1].astype(int)
final_df = final_df.sort_values(['sn', 'si']).drop(columns=['sn', 'si'])
final_df = final_df[['id', 'x', 'y', 'deg']]

print(f"Final submission shape: {final_df.shape}")
print(f"Expected: (20100, 4)")

# Save
os.makedirs('/home/code/experiments/005_comprehensive_ensemble', exist_ok=True)
final_df.to_csv('/home/code/experiments/005_comprehensive_ensemble/submission.csv', index=False)

# Save metrics
metrics = {
    'cv_score': final_total,
    'baseline_score': sum(d['score'] for d in baseline_per_n.values()),
    'improvement': sum(d['score'] for d in baseline_per_n.values()) - final_total,
    'target': 68.888293,
    'gap': final_total - 68.888293,
    'num_sources': len(source_counts),
    'overlap_ns_reverted': len(overlap_ns)
}

with open('/home/code/experiments/005_comprehensive_ensemble/metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)

print(f"\nMetrics: {metrics}")

Final submission shape: (20100, 4)
Expected: (20100, 4)

Metrics: {'cv_score': 70.6473268976368, 'baseline_score': 70.64732689763682, 'improvement': 1.4210854715202004e-14, 'target': 68.888293, 'gap': 1.7590338976367974, 'num_sources': 3, 'overlap_ns_reverted': 0}


In [10]:
# Final validation - check ALL N values for overlaps
print("Final validation - checking all N values...")

final_overlaps = []
for n in tqdm(range(1, 201), desc="Final validation"):
    g = best_per_n[n]['data']
    xs = [strip_s(v) for v in g['x'].values]
    ys = [strip_s(v) for v in g['y'].values]
    ds = [strip_s(v) for v in g['deg'].values]
    
    has_overlap, msg = check_overlaps(xs, ys, ds)
    if has_overlap:
        final_overlaps.append((n, msg))

if final_overlaps:
    print(f"\nWARNING: {len(final_overlaps)} N values still have overlaps!")
    for n, msg in final_overlaps[:10]:
        print(f"  N={n}: {msg}")
else:
    print("\n✓ All N values pass overlap validation!")
    print(f"\nFinal score: {final_total:.6f}")
    print(f"Gap to target: {final_total - 68.888293:.6f}")

Final validation - checking all N values...


Final validation:   0%|          | 0/200 [00:00<?, ?it/s]

Final validation:  18%|█▊        | 35/200 [00:00<00:00, 339.99it/s]

Final validation:  34%|███▍      | 69/200 [00:00<00:01, 110.53it/s]

Final validation:  44%|████▍     | 88/200 [00:01<00:01, 69.19it/s] 

Final validation:  50%|█████     | 100/200 [00:01<00:01, 52.04it/s]

Final validation:  55%|█████▍    | 109/200 [00:01<00:02, 41.88it/s]

Final validation:  58%|█████▊    | 116/200 [00:02<00:02, 34.97it/s]

Final validation:  60%|██████    | 121/200 [00:02<00:02, 30.69it/s]

Final validation:  62%|██████▎   | 125/200 [00:02<00:02, 27.42it/s]

Final validation:  64%|██████▍   | 129/200 [00:02<00:02, 24.39it/s]

Final validation:  66%|██████▌   | 132/200 [00:03<00:03, 22.21it/s]

Final validation:  68%|██████▊   | 135/200 [00:03<00:03, 20.26it/s]

Final validation:  69%|██████▉   | 138/200 [00:03<00:03, 18.72it/s]

Final validation:  70%|███████   | 140/200 [00:03<00:03, 17.70it/s]

Final validation:  71%|███████   | 142/200 [00:03<00:03, 16.51it/s]

Final validation:  72%|███████▏  | 144/200 [00:04<00:03, 15.59it/s]

Final validation:  73%|███████▎  | 146/200 [00:04<00:03, 14.68it/s]

Final validation:  74%|███████▍  | 148/200 [00:04<00:03, 14.12it/s]

Final validation:  75%|███████▌  | 150/200 [00:04<00:03, 13.42it/s]

Final validation:  76%|███████▌  | 152/200 [00:04<00:03, 12.80it/s]

Final validation:  77%|███████▋  | 154/200 [00:04<00:03, 12.29it/s]

Final validation:  78%|███████▊  | 156/200 [00:05<00:03, 11.90it/s]

Final validation:  79%|███████▉  | 158/200 [00:05<00:03, 11.49it/s]

Final validation:  80%|████████  | 160/200 [00:05<00:03, 11.23it/s]

Final validation:  81%|████████  | 162/200 [00:05<00:03, 10.91it/s]

Final validation:  82%|████████▏ | 164/200 [00:05<00:03, 10.71it/s]

Final validation:  83%|████████▎ | 166/200 [00:06<00:03, 10.44it/s]

Final validation:  84%|████████▍ | 168/200 [00:06<00:03, 10.21it/s]

Final validation:  85%|████████▌ | 170/200 [00:06<00:03,  9.96it/s]

Final validation:  86%|████████▌ | 171/200 [00:06<00:02,  9.82it/s]

Final validation:  86%|████████▌ | 172/200 [00:06<00:02,  9.59it/s]

Final validation:  86%|████████▋ | 173/200 [00:06<00:02,  9.42it/s]

Final validation:  87%|████████▋ | 174/200 [00:06<00:02,  9.37it/s]

Final validation:  88%|████████▊ | 175/200 [00:07<00:02,  9.16it/s]

Final validation:  88%|████████▊ | 176/200 [00:07<00:02,  8.94it/s]

Final validation:  88%|████████▊ | 177/200 [00:07<00:02,  8.87it/s]

Final validation:  89%|████████▉ | 178/200 [00:07<00:02,  8.79it/s]

Final validation:  90%|████████▉ | 179/200 [00:07<00:02,  8.65it/s]

Final validation:  90%|█████████ | 180/200 [00:07<00:02,  8.43it/s]

Final validation:  90%|█████████ | 181/200 [00:07<00:02,  8.44it/s]

Final validation:  91%|█████████ | 182/200 [00:07<00:02,  8.33it/s]

Final validation:  92%|█████████▏| 183/200 [00:07<00:02,  8.27it/s]

Final validation:  92%|█████████▏| 184/200 [00:08<00:01,  8.18it/s]

Final validation:  92%|█████████▎| 185/200 [00:08<00:01,  8.00it/s]

Final validation:  93%|█████████▎| 186/200 [00:08<00:01,  8.00it/s]

Final validation:  94%|█████████▎| 187/200 [00:08<00:01,  7.82it/s]

Final validation:  94%|█████████▍| 188/200 [00:08<00:01,  7.69it/s]

Final validation:  94%|█████████▍| 189/200 [00:08<00:01,  7.60it/s]

Final validation:  95%|█████████▌| 190/200 [00:08<00:01,  7.53it/s]

Final validation:  96%|█████████▌| 191/200 [00:09<00:01,  7.62it/s]

Final validation:  96%|█████████▌| 192/200 [00:09<00:01,  7.59it/s]

Final validation:  96%|█████████▋| 193/200 [00:09<00:00,  7.45it/s]

Final validation:  97%|█████████▋| 194/200 [00:09<00:00,  7.39it/s]

Final validation:  98%|█████████▊| 195/200 [00:09<00:00,  7.26it/s]

Final validation:  98%|█████████▊| 196/200 [00:09<00:00,  7.13it/s]

Final validation:  98%|█████████▊| 197/200 [00:09<00:00,  7.18it/s]

Final validation:  99%|█████████▉| 198/200 [00:10<00:00,  7.08it/s]

Final validation: 100%|█████████▉| 199/200 [00:10<00:00,  7.04it/s]

Final validation: 100%|██████████| 200/200 [00:10<00:00,  7.02it/s]

Final validation: 100%|██████████| 200/200 [00:10<00:00, 19.39it/s]


✓ All N values pass overlap validation!

Final score: 70.647327
Gap to target: 1.759034





In [None]:
# Let's scan ALL snapshots more comprehensively
# The previous exp_004 found improvements from snapshots 21336527339, 21331543270, 21145966992

snapshot_base = '/home/nonroot/snapshots/santa-2025/'
snapshot_dirs = [d for d in os.listdir(snapshot_base) if os.path.isdir(os.path.join(snapshot_base, d))]
print(f"Found {len(snapshot_dirs)} snapshot directories")

# Find all submission.csv files in snapshots
snapshot_submissions = []
for snap_dir in snapshot_dirs:
    snap_path = os.path.join(snapshot_base, snap_dir)
    for root, dirs, files in os.walk(snap_path):
        for f in files:
            if f.endswith('.csv'):
                snapshot_submissions.append(os.path.join(root, f))

print(f"Found {len(snapshot_submissions)} CSV files in snapshots")