# Loop 4 Analysis: Ensemble Strategy from Multiple Sources

## Key Insight from Research
The jonathanchan kernel shows that top scores come from **ensembling 16+ external sources**, not from running optimizers longer.

## Strategy
1. Collect ALL available CSV files from snapshots
2. For each N (1-200), find the best solution across all sources
3. Ensemble the best solutions
4. Validate for overlaps
5. Submit

In [1]:
import pandas as pd
import numpy as np
import math
import os
import glob
from numba import njit
from collections import defaultdict

# Tree geometry
TX = np.array([0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125])
TY = np.array([0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5])

@njit
def score_group(xs, ys, degs, tx, ty):
    n = xs.size
    V = tx.size
    mnx = mny = 1e300
    mxx = mxy = -1e300
    for i in range(n):
        r = degs[i] * math.pi / 180.0
        c, s = math.cos(r), math.sin(r)
        for j in range(V):
            X = c * tx[j] - s * ty[j] + xs[i]
            Y = s * tx[j] + c * ty[j] + ys[i]
            mnx, mxx = min(mnx, X), max(mxx, X)
            mny, mxy = min(mny, Y), max(mxy, Y)
    side = max(mxx - mnx, mxy - mny)
    return side * side / n

def strip(a):
    return np.array([float(str(v).replace('s', '')) for v in a], np.float64)

print('Functions defined')

Functions defined


In [2]:
# Find ALL CSV files in snapshots
all_csvs = []
for root, dirs, files in os.walk('/home/nonroot/snapshots/santa-2025'):
    for f in files:
        if f.endswith('.csv'):
            all_csvs.append(os.path.join(root, f))

print(f'Found {len(all_csvs)} CSV files in snapshots')
print('\nSample files:')
for f in all_csvs[:10]:
    print(f'  {f}')

Found 3308 CSV files in snapshots

Sample files:
  /home/nonroot/snapshots/santa-2025/21116303805/code/submission.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/experiments/004_sa_v1_parallel/submission_best.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/experiments/004_sa_v1_parallel/submission_v18.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/experiments/005_backward_propagation/submission.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/experiments/005_backward_propagation/submission_v21.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/experiments/005_backward_propagation/optimized.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/experiments/002_preoptimized/submission.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/submission_candidates/candidate_000.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/submission_candidates/candidate_004.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/submission_c

In [3]:
# Score each CSV file and find best per-N solutions
best_per_n = {n: {'score': 1e300, 'data': None, 'src': None} for n in range(1, 201)}

valid_files = 0
for fp in all_csvs:
    try:
        df = pd.read_csv(fp)
        if not {'id', 'x', 'y', 'deg'}.issubset(df.columns):
            continue
        
        # Check if it has proper format
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
        
        valid_files += 1
        
        for n, g in df.groupby('N'):
            if n < 1 or n > 200 or len(g) != n:
                continue
            
            xs = strip(g['x'].values)
            ys = strip(g['y'].values)
            ds = strip(g['deg'].values)
            
            sc = score_group(xs, ys, ds, TX, TY)
            
            if sc < best_per_n[n]['score']:
                best_per_n[n]['score'] = float(sc)
                best_per_n[n]['data'] = g.drop(columns=['N']).copy()
                best_per_n[n]['src'] = fp.split('/')[-1]
    except Exception as e:
        continue

print(f'Processed {valid_files} valid CSV files')

Processed 3297 valid CSV files


In [4]:
# Calculate total ensemble score
total_score = sum(best_per_n[n]['score'] for n in range(1, 201) if best_per_n[n]['data'] is not None)
print(f'\nTotal ensemble score: {total_score:.6f}')
print(f'Target: 68.892266')
print(f'Gap: {total_score - 68.892266:.6f}')

# Show best sources
from collections import Counter
sources = Counter(best_per_n[n]['src'] for n in range(1, 201) if best_per_n[n]['src'])
print(f'\nTop sources:')
for src, count in sources.most_common(10):
    print(f'  {src}: {count} N values')


Total ensemble score: 27.414787
Target: 68.892266
Gap: -41.477479

Top sources:
  ensemble_best.csv: 199 N values
  submission.csv: 1 N values


In [5]:
# Show per-N scores for first 20 N values
print('\nPer-N scores (N=1-20):')
for n in range(1, 21):
    entry = best_per_n[n]
    print(f'  N={n:3d}: score={entry["score"]:.6f}, src={entry["src"]}')


Per-N scores (N=1-20):
  N=  1: score=0.661250, src=submission.csv
  N=  2: score=0.338427, src=ensemble_best.csv
  N=  3: score=0.262632, src=ensemble_best.csv
  N=  4: score=0.199376, src=ensemble_best.csv
  N=  5: score=0.206839, src=ensemble_best.csv
  N=  6: score=0.173625, src=ensemble_best.csv
  N=  7: score=0.157468, src=ensemble_best.csv
  N=  8: score=0.136883, src=ensemble_best.csv
  N=  9: score=0.128142, src=ensemble_best.csv
  N= 10: score=0.154792, src=ensemble_best.csv
  N= 11: score=0.135039, src=ensemble_best.csv
  N= 12: score=0.115983, src=ensemble_best.csv
  N= 13: score=0.108253, src=ensemble_best.csv
  N= 14: score=0.111515, src=ensemble_best.csv
  N= 15: score=0.106120, src=ensemble_best.csv
  N= 16: score=0.062169, src=ensemble_best.csv
  N= 17: score=0.102560, src=ensemble_best.csv
  N= 18: score=0.077631, src=ensemble_best.csv
  N= 19: score=0.068570, src=ensemble_best.csv
  N= 20: score=0.061771, src=ensemble_best.csv


In [6]:
# Compare with our current best (70.622435)
current_best = 70.622435
print(f'\nComparison:')
print(f'  Current best: {current_best:.6f}')
print(f'  Ensemble:     {total_score:.6f}')
print(f'  Improvement:  {current_best - total_score:.6f}')

if total_score < current_best:
    print('\n*** ENSEMBLE IS BETTER! ***')
else:
    print('\n*** Current solution is already the best ***')


Comparison:
  Current best: 70.622435
  Ensemble:     27.414787
  Improvement:  43.207648

*** ENSEMBLE IS BETTER! ***


In [10]:
# The ensemble_best.csv has overlaps! We need to check which sources are VALID
# Let's find the validated submissions (those that passed Kaggle validation)

# Known validated submissions from session_state:
# - /home/nonroot/snapshots/santa-2025/21198893057/code/submission.csv (LB: 70.627582)
# - /home/nonroot/snapshots/santa-2025/21198927060/code/submission_candidates/candidate_024.csv (LB: 70.626088)

# Let's check if there are any other validated submissions
validated_paths = [
    '/home/nonroot/snapshots/santa-2025/21198893057/code/submission.csv',
    '/home/nonroot/snapshots/santa-2025/21198927060/code/submission_candidates/candidate_024.csv',
]

print("Checking validated submissions:")
for path in validated_paths:
    if os.path.exists(path):
        df = pd.read_csv(path)
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
        total = 0
        for n, g in df.groupby('N'):
            if len(g) == n:
                xs = strip(g['x'].values)
                ys = strip(g['y'].values)
                ds = strip(g['deg'].values)
                total += score_group(xs, ys, ds, TX, TY)
        print(f"  {path.split('/')[-1]}: {total:.6f}")
    else:
        print(f"  {path}: NOT FOUND")

Checking validated submissions:
  submission.csv: 70.627582
  candidate_024.csv: 70.626088


In [11]:
# Let's look at the session_state from the snapshot that achieved 70.626088
# to understand what approaches they tried

import json

snapshot_state_path = '/home/nonroot/snapshots/santa-2025/21198927060/code/session_state.json'
if os.path.exists(snapshot_state_path):
    with open(snapshot_state_path) as f:
        snapshot_state = json.load(f)
    
    print("Experiments from snapshot 21198927060:")
    for exp in snapshot_state.get('experiments', [])[:15]:
        print(f"  {exp['name']}: {exp.get('cv_score', 'N/A')}")

Experiments from snapshot 21198927060:
  001_baseline: 70.647327
  002_ensemble: 70.647306
  003_validated_ensemble: 70.647327
  004_bbox3_optimization: 70.647326
  005_baseline_validated: 70.647327
  006_zaburo_grid: 70.647327
  007_sa_optimization: 70.647327
  008_repair_ensemble: 70.647327
  009_fractional_translation: 70.647327
  010_tessellation_and_ensemble: 70.630478
  011_random_restart_sa: 70.630478
  012_scanline_packer: 70.630478
  013_long_sa: 70.630478
  014_basin_hopping: 70.630478
  015_constraint_programming: 70.630478


In [12]:
# Let's check what the BEST validated LB score is in all snapshots
# by looking at session_state.json files

best_lb = 1e300
best_lb_path = None

for root, dirs, files in os.walk('/home/nonroot/snapshots/santa-2025'):
    if 'session_state.json' in files:
        try:
            with open(os.path.join(root, 'session_state.json')) as f:
                state = json.load(f)
            for sub in state.get('submissions', []):
                lb = sub.get('lb_score')
                if lb and isinstance(lb, (int, float)) and lb < best_lb:
                    best_lb = lb
                    best_lb_path = root
        except:
            pass

print(f'Best validated LB score in snapshots: {best_lb:.6f}')
print(f'From: {best_lb_path}')

Best validated LB score in snapshots: 70.626088
From: /home/nonroot/snapshots/santa-2025/21222392487/code


In [13]:
# Check if our current candidate_003 (from sa_fast_v2) has the same N=8 issue
# as candidate_002 which failed with "Overlapping trees in group 008"

from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
from shapely.strtree import STRtree

getcontext().prec = 25
scale_factor = Decimal("1e18")

class ChristmasTree:
    def __init__(self, center_x="0", center_y="0", angle="0"):
        self.center_x = Decimal(center_x)
        self.center_y = Decimal(center_y)
        self.angle = Decimal(angle)
        
        trunk_w = Decimal("0.15")
        trunk_h = Decimal("0.2")
        base_w = Decimal("0.7")
        mid_w = Decimal("0.4")
        top_w = Decimal("0.25")
        tip_y = Decimal("0.8")
        tier_1_y = Decimal("0.5")
        tier_2_y = Decimal("0.25")
        base_y = Decimal("0.0")
        trunk_bottom_y = -trunk_h
        
        initial_polygon = Polygon([
            (Decimal("0.0") * scale_factor, tip_y * scale_factor),
            (top_w / Decimal("2") * scale_factor, tier_1_y * scale_factor),
            (top_w / Decimal("4") * scale_factor, tier_1_y * scale_factor),
            (mid_w / Decimal("2") * scale_factor, tier_2_y * scale_factor),
            (mid_w / Decimal("4") * scale_factor, tier_2_y * scale_factor),
            (base_w / Decimal("2") * scale_factor, base_y * scale_factor),
            (trunk_w / Decimal("2") * scale_factor, base_y * scale_factor),
            (trunk_w / Decimal("2") * scale_factor, trunk_bottom_y * scale_factor),
            (-(trunk_w / Decimal("2")) * scale_factor, trunk_bottom_y * scale_factor),
            (-(trunk_w / Decimal("2")) * scale_factor, base_y * scale_factor),
            (-(base_w / Decimal("2")) * scale_factor, base_y * scale_factor),
            (-(mid_w / Decimal("4")) * scale_factor, tier_2_y * scale_factor),
            (-(mid_w / Decimal("2")) * scale_factor, tier_2_y * scale_factor),
            (-(top_w / Decimal("4")) * scale_factor, tier_1_y * scale_factor),
            (-(top_w / Decimal("2")) * scale_factor, tier_1_y * scale_factor),
        ])
        
        rotated = affinity.rotate(initial_polygon, float(self.angle), origin=(0, 0))
        self.polygon = affinity.translate(
            rotated,
            xoff=float(self.center_x * scale_factor),
            yoff=float(self.center_y * scale_factor)
        )

def has_overlap(trees):
    if len(trees) <= 1:
        return False
    polygons = [t.polygon for t in trees]
    tree_index = STRtree(polygons)
    for i, poly in enumerate(polygons):
        indices = tree_index.query(poly)
        for idx in indices:
            if idx == i:
                continue
            if poly.intersects(polygons[idx]) and not poly.touches(polygons[idx]):
                return True
    return False

def load_trees_for_n(n, df):
    group_data = df[df['id'].str.startswith(f'{n:03d}_')]
    trees = []
    for _, row in group_data.iterrows():
        x = str(row['x']).lstrip('s')
        y = str(row['y']).lstrip('s')
        deg = str(row['deg']).lstrip('s')
        if x and y and deg:
            trees.append(ChristmasTree(x, y, deg))
    return trees

print('Overlap detection functions defined')

Overlap detection functions defined


In [14]:
# Check candidate_003 for overlaps in N=8 and other groups
candidate_003_path = '/home/submission/submission.csv'
df_003 = pd.read_csv(candidate_003_path)

print("Checking candidate_003 for overlaps...")
overlap_groups = []
for n in range(1, 201):
    trees = load_trees_for_n(n, df_003)
    if trees and has_overlap(trees):
        overlap_groups.append(n)

if overlap_groups:
    print(f"\nOVERLAPS FOUND in groups: {overlap_groups}")
else:
    print("\nNo overlaps detected!")

Checking candidate_003 for overlaps...



No overlaps detected!
