# Evolver Loop 10 Analysis

## Goal: Find more diverse sources and ensemble to beat 70.364

Current best: 70.364392 (exp_010_full_ensemble)
Target: 68.879467
Gap: 1.485 points

In [2]:
import pandas as pd
import numpy as np
import os
import math
from numba import njit
from decimal import Decimal, getcontext
from shapely.geometry import Polygon
from shapely import affinity

getcontext().prec = 30
SCALE = 10**18

@njit
def make_polygon_template():
    tw=0.15; th=0.2; bw=0.7; mw=0.4; ow=0.25
    tip=0.8; t1=0.5; t2=0.25; base=0.0; tbot=-th
    x=np.array([0,ow/2,ow/4,mw/2,mw/4,bw/2,tw/2,tw/2,-tw/2,-tw/2,-bw/2,-mw/4,-mw/2,-ow/4,-ow/2],np.float64)
    y=np.array([tip,t1,t1,t2,t2,base,base,tbot,tbot,base,base,t2,t2,t1,t1],np.float64)
    return x,y

@njit
def score_group(xs, ys, degs, tx, ty):
    """Compute score for a group of trees."""
    n = xs.size
    V = tx.size
    mnx = 1e300; mny = 1e300; mxx = -1e300; mxy = -1e300
    for i in range(n):
        r = degs[i] * math.pi / 180.0
        c = math.cos(r); s = math.sin(r)
        xi = xs[i]; yi = ys[i]
        for j in range(V):
            X = c * tx[j] - s * ty[j] + xi
            Y = s * tx[j] + c * ty[j] + yi
            if X < mnx: mnx = X
            if X > mxx: mxx = X
            if Y < mny: mny = Y
            if Y > mxy: mxy = Y
    side = max(mxx - mnx, mxy - mny)
    return side * side / n

def strip(a):
    return np.array([float(str(v).replace("s", "")) for v in a], np.float64)

tx, ty = make_polygon_template()

def compute_score_for_n(df, n):
    """Compute score for a specific N."""
    n_df = df[df['N'] == n]
    if len(n_df) == 0:
        return float('inf')
    xs = strip(n_df['x'].to_numpy())
    ys = strip(n_df['y'].to_numpy())
    ds = strip(n_df['deg'].to_numpy())
    return score_group(xs, ys, ds, tx, ty)

def compute_total_score(df):
    """Compute total score for all N."""
    df = df.copy()
    df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
    total = 0
    for n in range(1, 201):
        score = compute_score_for_n(df, n)
        total += score
    return total

def load_and_prepare(filepath):
    """Load CSV and add N column."""
    df = pd.read_csv(filepath)
    df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
    return df

print("Functions loaded successfully")

Functions loaded successfully


In [3]:
# Check all available CSV files in kaggle_datasets
import glob

csv_files = []
for root, dirs, files in os.walk('/home/code/kaggle_datasets'):
    for f in files:
        if f.endswith('.csv'):
            csv_files.append(os.path.join(root, f))

print(f"Found {len(csv_files)} CSV files:")
for f in csv_files:
    print(f"  {f}")

Found 42 CSV files:
  /home/code/kaggle_datasets/submission.csv
  /home/code/kaggle_datasets/santa-2025.csv
  /home/code/kaggle_datasets/72.49.csv
  /home/code/kaggle_datasets/71.97.csv
  /home/code/kaggle_datasets/santa25-public/submission_JKoT4.csv
  /home/code/kaggle_datasets/santa25-public/New_Tree_144_196.csv
  /home/code/kaggle_datasets/santa25-public/submission_JKoT3.csv
  /home/code/kaggle_datasets/santa25-public/santa2025_ver2_v61.csv
  /home/code/kaggle_datasets/santa25-public/submission_JKoT2.csv
  /home/code/kaggle_datasets/santa25-public/santa2025_ver2_v67.csv
  /home/code/kaggle_datasets/santa25-public/santa2025_ver2_v76.csv
  /home/code/kaggle_datasets/santa25-public/submission_70_936673758122.csv
  /home/code/kaggle_datasets/santa25-public/santa2025_ver2_v65.csv
  /home/code/kaggle_datasets/santa25-public/submission_70_926149550346.csv
  /home/code/kaggle_datasets/santa25-public/santa2025_ver2_v66.csv
  /home/code/kaggle_datasets/santa25-public/santa2025_ver2_v63.csv
  

In [5]:
# Score each unique CSV file (remove duplicates from different directories)
unique_files = {}
for f in csv_files:
    basename = os.path.basename(f)
    if basename not in unique_files:
        unique_files[basename] = f

print(f"Found {len(unique_files)} unique CSV files")

scores = {}
for basename, csv_file in unique_files.items():
    try:
        df = pd.read_csv(csv_file)
        if 'id' in df.columns and 'x' in df.columns:
            df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
            score = compute_total_score(df)
            scores[basename] = score
            print(f"{basename}: {score:.6f}")
    except Exception as e:
        print(f"{basename}: ERROR - {e}")

print(f"\nScored {len(scores)} files")

Found 22 unique CSV files


submission.csv: 70.647327
santa-2025.csv: 70.376051


72.49.csv: 72.495739
71.97.csv: 71.972027


submission_JKoT4.csv: 72.489504
New_Tree_144_196.csv: 72.927920
submission_JKoT3.csv: 72.489488


santa2025_ver2_v61.csv: 72.951925
submission_JKoT2.csv: 72.489348
santa2025_ver2_v67.csv: 72.938567


santa2025_ver2_v76.csv: 72.826444
submission_70_936673758122.csv: 70.936674


santa2025_ver2_v65.csv: 72.935294
submission_70_926149550346.csv: 70.926150
santa2025_ver2_v66.csv: 72.938599


santa2025_ver2_v63.csv: 72.947427
santa2025_ver2_v69.csv: 72.850110
submission_JKoT1.csv: 72.489483


submission_opt1.csv: 70.990692
santa2025_ver2_v68.csv: 72.939233
submission_best.csv: 70.926150


70.378875862989_20260126_045659.csv: 70.378876

Scored 22 files


In [6]:
# Load current best (exp_010)
exp010_df = load_and_prepare('/home/code/experiments/010_full_ensemble/submission.csv')
exp010_score = compute_total_score(exp010_df)
print(f"Current best (exp_010): {exp010_score:.6f}")

# Load exp_009 for comparison
exp009_df = load_and_prepare('/home/code/experiments/009_santa_ensemble/submission.csv')
exp009_score = compute_total_score(exp009_df)
print(f"exp_009: {exp009_score:.6f}")

# Target
target = 68.879467
print(f"\nTarget: {target:.6f}")
print(f"Gap: {exp010_score - target:.6f}")

Current best (exp_010): 70.364392
exp_009: 70.373334

Target: 68.879467
Gap: 1.484925


In [7]:
# Strict overlap checking using 1e18 scaling
@njit
def check_overlap_numba(xs1, ys1, degs1, xs2, ys2, degs2, tx, ty):
    """Check if two tree placements overlap using integer scaling."""
    SCALE = 10**15  # Use 1e15 for numba compatibility
    
    # Get vertices for tree 1
    r1 = degs1 * math.pi / 180.0
    c1, s1 = math.cos(r1), math.sin(r1)
    verts1 = []
    for j in range(len(tx)):
        X = int((c1 * tx[j] - s1 * ty[j] + xs1) * SCALE)
        Y = int((s1 * tx[j] + c1 * ty[j] + ys1) * SCALE)
        verts1.append((X, Y))
    
    # Get vertices for tree 2
    r2 = degs2 * math.pi / 180.0
    c2, s2 = math.cos(r2), math.sin(r2)
    verts2 = []
    for j in range(len(tx)):
        X = int((c2 * tx[j] - s2 * ty[j] + xs2) * SCALE)
        Y = int((s2 * tx[j] + c2 * ty[j] + ys2) * SCALE)
        verts2.append((X, Y))
    
    return verts1, verts2

def check_overlaps_for_n(df, n):
    """Check for overlaps in a specific N group using Shapely with high precision."""
    n_df = df[df['N'] == n]
    if len(n_df) <= 1:
        return True, []
    
    xs = strip(n_df['x'].to_numpy())
    ys = strip(n_df['y'].to_numpy())
    ds = strip(n_df['deg'].to_numpy())
    
    # Create polygons with high precision
    polygons = []
    for i in range(len(xs)):
        r = ds[i] * math.pi / 180.0
        c, s = math.cos(r), math.sin(r)
        coords = []
        for j in range(len(tx)):
            X = c * tx[j] - s * ty[j] + xs[i]
            Y = s * tx[j] + c * ty[j] + ys[i]
            # Scale to integers for strict comparison
            coords.append((int(Decimal(str(X)) * SCALE), int(Decimal(str(Y)) * SCALE)))
        polygons.append(Polygon(coords))
    
    overlaps = []
    for i in range(len(polygons)):
        for j in range(i+1, len(polygons)):
            if polygons[i].intersects(polygons[j]) and not polygons[i].touches(polygons[j]):
                overlaps.append((i, j))
    
    return len(overlaps) == 0, overlaps

# Test on a few N values
print("Testing overlap detection on exp_010...")
for n in [1, 2, 5, 10, 50, 100, 200]:
    ok, overlaps = check_overlaps_for_n(exp010_df, n)
    status = "✓" if ok else f"✗ ({len(overlaps)} overlaps)"
    print(f"  N={n}: {status}")

Testing overlap detection on exp_010...
  N=1: ✓
  N=2: ✓
  N=5: ✓
  N=10: ✓
  N=50: ✓
  N=100: ✓
  N=200: ✓


In [8]:
# Load all sources and find best per-N
print("Loading all sources...")

sources = {}

# Current best
sources['exp_010'] = exp010_df

# All kaggle_datasets CSVs
for basename, csv_file in unique_files.items():
    try:
        df = pd.read_csv(csv_file)
        if 'id' in df.columns and 'x' in df.columns:
            df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
            sources[basename] = df
    except:
        pass

# Also load from snapshots
snapshot_dir = '/home/nonroot/snapshots/santa-2025'
if os.path.exists(snapshot_dir):
    for snap in os.listdir(snapshot_dir)[:20]:  # Limit to 20 snapshots
        snap_path = os.path.join(snapshot_dir, snap, 'submission', 'submission.csv')
        if os.path.exists(snap_path):
            try:
                df = pd.read_csv(snap_path)
                if 'id' in df.columns and 'x' in df.columns:
                    df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
                    sources[f'snap_{snap}'] = df
            except:
                pass

print(f"Loaded {len(sources)} sources")

Loading all sources...


Loaded 37 sources


In [11]:
# Find best per-N from all sources with strict overlap validation
from tqdm import tqdm

print("Finding best per-N from all sources with strict validation...")

best_per_n = {}
improvements = []

for n in tqdm(range(1, 201), desc="Processing N"):
    best_score = float('inf')
    best_source = None
    best_rows = None
    
    for name, df in sources.items():
        n_df = df[df['N'] == n]
        if len(n_df) == 0:
            continue
        
        # Check if df has required columns
        if 'deg' not in df.columns:
            continue
        
        try:
            score = compute_score_for_n(df, n)
            if score < best_score:
                # Check for overlaps
                ok, _ = check_overlaps_for_n(df, n)
                if ok:
                    best_score = score
                    best_source = name
                    best_rows = n_df.copy()
        except Exception as e:
            continue
    
    best_per_n[n] = {
        'source': best_source,
        'score': best_score,
        'rows': best_rows
    }
    
    # Check if this is an improvement over exp_010
    exp010_score_n = compute_score_for_n(exp010_df, n)
    if best_score < exp010_score_n - 1e-9:
        improvements.append((n, best_source, exp010_score_n - best_score))

print(f"\nFound {len(improvements)} improvements over exp_010:")

Finding best per-N from all sources with strict validation...


Processing N:   0%|          | 0/200 [00:00<?, ?it/s]

Processing N:   4%|▍         | 9/200 [00:00<00:02, 84.30it/s]

Processing N:   9%|▉         | 18/200 [00:00<00:02, 79.53it/s]

Processing N:  13%|█▎        | 26/200 [00:00<00:02, 72.36it/s]

Processing N:  17%|█▋        | 34/200 [00:00<00:02, 63.05it/s]

Processing N:  20%|██        | 41/200 [00:00<00:02, 60.37it/s]

Processing N:  24%|██▍       | 48/200 [00:00<00:02, 55.09it/s]

Processing N:  27%|██▋       | 54/200 [00:00<00:02, 51.39it/s]

Processing N:  30%|███       | 60/200 [00:01<00:02, 47.26it/s]

Processing N:  32%|███▎      | 65/200 [00:01<00:02, 45.07it/s]

Processing N:  35%|███▌      | 70/200 [00:01<00:03, 42.28it/s]

Processing N:  38%|███▊      | 75/200 [00:01<00:03, 38.30it/s]

Processing N:  40%|███▉      | 79/200 [00:01<00:03, 36.42it/s]

Processing N:  42%|████▏     | 83/200 [00:01<00:03, 34.42it/s]

Processing N:  44%|████▎     | 87/200 [00:01<00:04, 27.41it/s]

Processing N:  45%|████▌     | 90/200 [00:02<00:04, 26.03it/s]

Processing N:  46%|████▋     | 93/200 [00:02<00:04, 26.03it/s]

Processing N:  48%|████▊     | 96/200 [00:02<00:04, 24.17it/s]

Processing N:  50%|████▉     | 99/200 [00:02<00:04, 24.14it/s]

Processing N:  51%|█████     | 102/200 [00:02<00:05, 18.72it/s]

Processing N:  52%|█████▎    | 105/200 [00:02<00:05, 17.55it/s]

Processing N:  54%|█████▍    | 108/200 [00:03<00:05, 18.27it/s]

Processing N:  55%|█████▌    | 110/200 [00:03<00:05, 17.51it/s]

Processing N:  56%|█████▌    | 112/200 [00:03<00:05, 17.37it/s]

Processing N:  57%|█████▋    | 114/200 [00:03<00:04, 17.90it/s]

Processing N:  58%|█████▊    | 116/200 [00:03<00:05, 14.32it/s]

Processing N:  59%|█████▉    | 118/200 [00:03<00:06, 12.25it/s]

Processing N:  60%|██████    | 120/200 [00:04<00:05, 13.52it/s]

Processing N:  61%|██████    | 122/200 [00:04<00:05, 14.50it/s]

Processing N:  62%|██████▏   | 124/200 [00:04<00:04, 15.27it/s]

Processing N:  63%|██████▎   | 126/200 [00:04<00:04, 15.63it/s]

Processing N:  64%|██████▍   | 128/200 [00:04<00:04, 15.96it/s]

Processing N:  65%|██████▌   | 130/200 [00:04<00:05, 12.07it/s]

Processing N:  66%|██████▌   | 132/200 [00:04<00:05, 12.94it/s]

Processing N:  67%|██████▋   | 134/200 [00:05<00:04, 13.56it/s]

Processing N:  68%|██████▊   | 136/200 [00:05<00:05, 12.65it/s]

Processing N:  69%|██████▉   | 138/200 [00:05<00:06, 10.15it/s]

Processing N:  70%|███████   | 140/200 [00:05<00:05, 10.25it/s]

Processing N:  71%|███████   | 142/200 [00:05<00:05, 11.14it/s]

Processing N:  72%|███████▏  | 144/200 [00:06<00:06,  9.12it/s]

Processing N:  73%|███████▎  | 146/200 [00:06<00:05, 10.14it/s]

Processing N:  74%|███████▍  | 148/200 [00:06<00:04, 10.99it/s]

Processing N:  75%|███████▌  | 150/200 [00:06<00:04, 11.59it/s]

Processing N:  76%|███████▌  | 152/200 [00:06<00:04, 11.98it/s]

Processing N:  77%|███████▋  | 154/200 [00:06<00:03, 12.10it/s]

Processing N:  78%|███████▊  | 156/200 [00:07<00:03, 12.21it/s]

Processing N:  79%|███████▉  | 158/200 [00:07<00:03, 12.15it/s]

Processing N:  80%|████████  | 160/200 [00:07<00:03, 12.09it/s]

Processing N:  81%|████████  | 162/200 [00:07<00:03, 11.94it/s]

Processing N:  82%|████████▏ | 164/200 [00:07<00:03, 10.41it/s]

Processing N:  83%|████████▎ | 166/200 [00:08<00:03,  9.54it/s]

Processing N:  84%|████████▎ | 167/200 [00:08<00:04,  6.65it/s]

Processing N:  84%|████████▍ | 168/200 [00:08<00:04,  7.12it/s]

Processing N:  85%|████████▌ | 170/200 [00:08<00:04,  6.04it/s]

Processing N:  86%|████████▌ | 171/200 [00:09<00:05,  5.02it/s]

Processing N:  86%|████████▋ | 173/200 [00:09<00:04,  6.23it/s]

Processing N:  87%|████████▋ | 174/200 [00:09<00:05,  4.56it/s]

Processing N:  88%|████████▊ | 176/200 [00:10<00:04,  5.76it/s]

Processing N:  88%|████████▊ | 177/200 [00:10<00:03,  6.32it/s]

Processing N:  89%|████████▉ | 178/200 [00:10<00:03,  6.87it/s]

Processing N:  90%|████████▉ | 179/200 [00:10<00:02,  7.43it/s]

Processing N:  90%|█████████ | 180/200 [00:10<00:02,  7.92it/s]

Processing N:  90%|█████████ | 181/200 [00:10<00:02,  8.26it/s]

Processing N:  91%|█████████ | 182/200 [00:10<00:02,  8.61it/s]

Processing N:  92%|█████████▏| 183/200 [00:10<00:01,  8.78it/s]

Processing N:  92%|█████████▏| 184/200 [00:11<00:03,  5.22it/s]

Processing N:  92%|█████████▎| 185/200 [00:11<00:02,  5.78it/s]

Processing N:  93%|█████████▎| 186/200 [00:11<00:02,  6.49it/s]

Processing N:  94%|█████████▎| 187/200 [00:11<00:01,  7.10it/s]

Processing N:  94%|█████████▍| 188/200 [00:11<00:01,  7.55it/s]

Processing N:  94%|█████████▍| 189/200 [00:11<00:01,  7.94it/s]

Processing N:  95%|█████████▌| 190/200 [00:12<00:01,  5.56it/s]

Processing N:  96%|█████████▌| 191/200 [00:12<00:01,  6.27it/s]

Processing N:  96%|█████████▌| 192/200 [00:12<00:01,  6.80it/s]

Processing N:  96%|█████████▋| 193/200 [00:12<00:00,  7.27it/s]

Processing N:  97%|█████████▋| 194/200 [00:12<00:00,  7.63it/s]

Processing N:  98%|█████████▊| 195/200 [00:12<00:00,  7.87it/s]

Processing N:  98%|█████████▊| 196/200 [00:12<00:00,  8.01it/s]

Processing N:  98%|█████████▊| 197/200 [00:12<00:00,  8.07it/s]

Processing N:  99%|█████████▉| 198/200 [00:13<00:00,  8.15it/s]

Processing N: 100%|█████████▉| 199/200 [00:13<00:00,  3.85it/s]

Processing N: 100%|██████████| 200/200 [00:13<00:00,  4.57it/s]

Processing N: 100%|██████████| 200/200 [00:13<00:00, 14.59it/s]


Found 0 improvements over exp_010:





In [10]:
# Debug: check which sources have the right columns
for name, df in list(sources.items())[:5]:
    print(f"{name}: columns = {list(df.columns)}")

exp_010: columns = ['id', 'x', 'y', 'deg', 'N']
submission.csv: columns = ['id', 'x', 'y', 'deg', 'N']
santa-2025.csv: columns = ['id', 'x', 'y', 'deg', 'N']
72.49.csv: columns = ['id', 'x', 'y', 'deg', 'N']
71.97.csv: columns = ['id', 'x', 'y', 'deg', 'N']


In [13]:
# Check which sources have better scores but with overlaps
print("Checking for sources with better scores (even with overlaps)...")

potential_improvements = []

for n in range(1, 201):
    exp010_score_n = compute_score_for_n(exp010_df, n)
    
    for name, df in sources.items():
        n_df = df[df['N'] == n]
        if len(n_df) == 0 or 'deg' not in df.columns:
            continue
        
        try:
            score = compute_score_for_n(df, n)
            if score < exp010_score_n - 1e-6:
                ok, overlaps = check_overlaps_for_n(df, n)
                potential_improvements.append({
                    'n': n,
                    'source': name,
                    'score': score,
                    'exp010_score': exp010_score_n,
                    'improvement': exp010_score_n - score,
                    'has_overlaps': not ok,
                    'num_overlaps': len(overlaps)
                })
        except:
            continue

# Sort by improvement
potential_improvements.sort(key=lambda x: -x['improvement'])

print(f"\nFound {len(potential_improvements)} potential improvements:")
print("\nTop 20 (including those with overlaps):")
for imp in potential_improvements[:20]:
    overlap_str = f"OVERLAPS ({imp['num_overlaps']})" if imp['has_overlaps'] else "OK"
    print(f"  N={imp['n']:3d}: {imp['source']:30s} score={imp['score']:.6f} (improvement={imp['improvement']:.6f}) {overlap_str}")

Checking for sources with better scores (even with overlaps)...



Found 0 potential improvements:

Top 20 (including those with overlaps):


In [None]:
# Summary of current state
print("=== CURRENT STATE SUMMARY ===")
print(f"Current best (exp_010): {exp010_score:.6f}")
print(f"Target: {target:.6f}")
print(f"Gap: {exp010_score - target:.6f} ({(exp010_score - target) / target * 100:.2f}%)")

print("\n=== BEST SOURCES BY TOTAL SCORE ===")
sorted_scores = sorted(scores.items(), key=lambda x: x[1])
for name, score in sorted_scores[:10]:
    print(f"  {name}: {score:.6f}")

print("\n=== ANALYSIS ===")
print("exp_010 is already the best ensemble from all available sources.")
print("No further improvements possible from current datasets.")
print("\nTo improve further, we need:")
print("1. Download more Kaggle datasets")
print("2. Run C++ optimizer (sa_v1_parallel) with overlap fixing")
print("3. Implement novel algorithms (NFP, branch-and-bound)")

In [None]:
# Load current best (exp_010)
best_df = pd.read_csv('/home/code/experiments/010_full_ensemble/submission.csv')
best_score = compute_total_score(best_df)
print(f"Current best (exp_010): {best_score:.6f}")

# Also load exp_009 for comparison
exp009_df = pd.read_csv('/home/code/experiments/009_santa_ensemble/submission.csv')
exp009_score = compute_total_score(exp009_df)
print(f"exp_009: {exp009_score:.6f}")

In [None]:
# Check chistyakov submission_best.csv specifically
chistyakov_best = pd.read_csv('/home/code/kaggle_datasets/chistyakov_best_public/submission_best.csv')
chistyakov_score = compute_total_score(chistyakov_best)
print(f"chistyakov submission_best.csv: {chistyakov_score:.6f}")

# Check if it has overlaps
def check_overlaps_strict(df, n):
    """Check for overlaps using strict 1e18 scaling."""
    n_str = str(n).zfill(3)
    n_df = df[df['id'].str.startswith(n_str + '_')]
    if len(n_df) == 0:
        return True, []
    
    polygons = []
    for _, row in n_df.iterrows():
        tree_idx = int(row['id'].split('_')[1])
        tree_id = tree_idx + 1
        x = parse_value(row['x'])
        y = parse_value(row['y'])
        deg = parse_value(row['deg'])
        
        poly = get_tree_polygon(tree_id, x, y, deg)
        # Scale to integers for strict comparison
        coords = [(int(Decimal(str(c[0])) * SCALE), int(Decimal(str(c[1])) * SCALE)) 
                  for c in poly.exterior.coords]
        polygons.append(Polygon(coords))
    
    overlaps = []
    for i in range(len(polygons)):
        for j in range(i+1, len(polygons)):
            if polygons[i].intersects(polygons[j]) and not polygons[i].touches(polygons[j]):
                overlaps.append((i, j))
    
    return len(overlaps) == 0, overlaps

# Check overlaps for chistyakov
overlap_ns = []
for n in range(1, 201):
    ok, overlaps = check_overlaps_strict(chistyakov_best, n)
    if not ok:
        overlap_ns.append(n)

print(f"\nchistyakov submission_best.csv has overlaps in {len(overlap_ns)} N values: {overlap_ns[:10]}...")

In [None]:
# Find the best source for each N
print("\nFinding best source for each N...")

# Load all valid sources
sources = {}

# exp_010 (current best)
sources['exp_010'] = pd.read_csv('/home/code/experiments/010_full_ensemble/submission.csv')

# All kaggle_datasets CSVs
for csv_file in csv_files:
    try:
        df = pd.read_csv(csv_file)
        if 'id' in df.columns and 'x' in df.columns:
            name = os.path.basename(csv_file)
            sources[name] = df
    except:
        pass

print(f"Loaded {len(sources)} sources")

# For each N, find best valid source
best_per_n = {}
for n in range(1, 201):
    best_score = float('inf')
    best_source = None
    
    for name, df in sources.items():
        score = compute_score_for_n(df, n)
        if score < best_score:
            # Check for overlaps
            ok, _ = check_overlaps_strict(df, n)
            if ok:
                best_score = score
                best_source = name
    
    best_per_n[n] = (best_source, best_score)
    if n <= 10 or n % 20 == 0:
        print(f"N={n}: best={best_source} score={best_score:.6f}")

In [None]:
# Count how many N values each source contributes
source_counts = {}
for n, (source, score) in best_per_n.items():
    if source not in source_counts:
        source_counts[source] = 0
    source_counts[source] += 1

print("\nSource contributions:")
for source, count in sorted(source_counts.items(), key=lambda x: -x[1]):
    print(f"  {source}: {count} N values")

# Calculate total score from best per N
total_best = sum(score for _, score in best_per_n.values())
print(f"\nTotal score from best per N: {total_best:.6f}")
print(f"Current best (exp_010): {best_score:.6f}")
print(f"Improvement: {best_score - total_best:.6f}")