# Loop 5 Analysis: Comprehensive Ensemble from All Sources

## Key insights from the jonathanchan kernel:
1. Ensemble from 19+ sources (GitHub, Kaggle datasets, notebooks)
2. Override N=1 with optimal value (x=0, y=0, deg=45)
3. Apply fractional translation (0.001 to 0.00001 steps)

## Available Sources:
- 88 snapshots (already scanned)
- Preoptimized folder: telegram, santa25-public, bucket-of-chump, etc.
- 3434 total CSV files

In [None]:
import pandas as pd
import numpy as np
import os
import glob
from shapely.geometry import Polygon
from shapely.affinity import rotate, translate
import math
from tqdm import tqdm
import json

# Tree geometry
def make_tree_polygon():
    """Return the tree outline as a list of 15 ``(x, y)`` vertices.

    The outline is mirror-symmetric about the y-axis. It starts at the tip,
    walks down the right-hand side (three widths, then the trunk) and comes
    back up the left-hand side. The first vertex is not repeated at the end.
    """
    trunk_w, trunk_h = 0.15, 0.2
    base_w, mid_w, top_w = 0.7, 0.4, 0.25
    tip_y, tier1_y, tier2_y, base_y = 0.8, 0.5, 0.25, 0.0
    trunk_bottom_y = -trunk_h

    # Right-hand half, from just below the tip down to the trunk bottom.
    right_side = [
        (top_w / 2, tier1_y),
        (top_w / 4, tier1_y),
        (mid_w / 2, tier2_y),
        (mid_w / 4, tier2_y),
        (base_w / 2, base_y),
        (trunk_w / 2, base_y),
        (trunk_w / 2, trunk_bottom_y),
    ]
    # Left-hand half is the right-hand half mirrored and walked bottom-up.
    left_side = [(-px, py) for px, py in reversed(right_side)]

    return [(0, tip_y)] + right_side + left_side

# Vertex list shared by every tree; built once when this cell runs.
TREE_TEMPLATE = make_tree_polygon()

def get_tree_polygon(x, y, deg):
    """Return a shapely Polygon of the tree rotated by ``deg`` and moved to ``(x, y)``.

    The template is first rotated about its own origin ``(0, 0)`` by ``deg``
    degrees, then translated by ``(x, y)``.
    """
    upright = Polygon(TREE_TEMPLATE)
    turned = rotate(upright, deg, origin=(0, 0))
    return translate(turned, xoff=x, yoff=y)

def score_group(xs, ys, degs):
    """Return ``side**2 / n`` for one group of ``n`` trees.

    ``side`` is the larger of the width and height of the axis-aligned
    bounding box around every transformed template vertex. Each tree ``i``
    is the template rotated by ``degs[i]`` degrees and shifted by
    ``(xs[i], ys[i])``.

    Raises ``ValueError`` for an empty group (there are no vertices to bound).
    """
    count = len(xs)
    vertices = []
    for k in range(count):
        theta = math.radians(degs[k])
        cos_t, sin_t = math.cos(theta), math.sin(theta)
        # Standard 2-D rotation followed by the per-tree offset.
        vertices.extend(
            (cos_t * vx - sin_t * vy + xs[k], sin_t * vx + cos_t * vy + ys[k])
            for vx, vy in TREE_TEMPLATE
        )

    px = [p[0] for p in vertices]
    py = [p[1] for p in vertices]
    width = max(px) - min(px)
    height = max(py) - min(py)
    side = max(width, height)
    return side * side / count

def check_overlaps(xs, ys, degs):
    """Report the first pair of trees whose interiors overlap.

    Returns ``(True, message)`` for the first pair ``(i, j)`` with ``i < j``
    that intersects, does not merely touch, and whose intersection area
    exceeds ``1e-10``. Returns ``(False, "OK")`` when no such pair exists.
    """
    count = len(xs)
    shapes = [get_tree_polygon(xs[k], ys[k], degs[k]) for k in range(count)]

    for i, first in enumerate(shapes):
        for j in range(i + 1, count):
            second = shapes[j]
            # Disjoint or boundary-only contact is not an overlap.
            if not first.intersects(second) or first.touches(second):
                continue
            overlap_area = first.intersection(second).area
            if overlap_area > 1e-10:  # Significant overlap
                return True, f"Trees {i} and {j} overlap (area={overlap_area})"

    return False, "OK"

# Progress marker: printed once the helper definitions in this cell have run.
print("Functions defined")

In [None]:
# Recursively collect every CSV file under the preoptimized folder
preopt_base = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/'

preopt_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(preopt_base)
    for name in names
    if name.endswith('.csv')
]

print(f"Found {len(preopt_files)} preoptimized CSV files")
# Preview only the first 20 paths to keep the output short
for path in preopt_files[:20]:
    print(f"  {path}")

In [None]:
# Score each preoptimized file
def strip_s(val):
    """Convert ``val`` to float, dropping a single leading ``'s'`` if present.

    ``val`` is stringified first, so numbers and strings are both accepted.
    """
    text = str(val)
    if text.startswith('s'):
        text = text[1:]
    return float(text)

def load_and_score_csv(filepath):
    """Load a submission-style CSV and score every N group it contains.

    Returns ``(total, scores)`` where ``scores`` maps each N in 1..200 found
    in the file to its ``score_group`` value and ``total`` is their sum.
    Note that ``total`` only covers the groups present in the file.

    Returns ``(None, None)`` when the file lacks any of the ``id``, ``x``,
    ``y``, ``deg`` columns, or when reading/parsing/scoring raises.
    """
    required = {'id', 'x', 'y', 'deg'}
    try:
        frame = pd.read_csv(filepath)
        if not required.issubset(frame.columns):
            return None, None

        # The part of the id before the first underscore is the group number N.
        frame['N'] = frame['id'].astype(str).str.split('_').str[0].astype(int)

        per_n = {
            n: score_group(
                [strip_s(v) for v in grp['x'].values],
                [strip_s(v) for v in grp['y'].values],
                [strip_s(v) for v in grp['deg'].values],
            )
            for n, grp in frame.groupby('N')
            if 1 <= n <= 200
        }
        return sum(per_n.values()), per_n
    except Exception:
        # Best-effort scan: any unreadable or malformed file is simply skipped.
        return None, None

# Score every preoptimized file, keeping only those that loaded successfully
preopt_scores = {}
for fp in tqdm(preopt_files, desc="Scoring preoptimized"):
    total, per_n = load_and_score_csv(fp)
    if total is None:
        continue
    preopt_scores[fp] = {'total': total, 'per_n': per_n}

print(f"\nScored {len(preopt_scores)} files")


def _total_of(entry):
    """Sort key: the summed score stored for one (path, data) entry."""
    return entry[1]['total']


# Rank by total score, lowest first.
# NOTE: a file's total only sums the N groups it contains, so a file that
# covers fewer N values can rank above a complete one.
sorted_preopt = sorted(preopt_scores.items(), key=_total_of)
print("\nTop 10 preoptimized solutions:")
for fp, data in sorted_preopt[:10]:
    print(f"  {data['total']:.6f}: {os.path.basename(fp)}")

In [None]:
# Plan for the BEST ensemble from ALL sources:
#   1. Load all snapshot submissions
#   2. Load all preoptimized files
#   3. For each N, keep the best VALID solution
#
# This cell loads the current valid baseline and scores it per N.
valid_baseline_path = '/home/nonroot/snapshots/santa-2025/21328309254/submission/submission.csv'
baseline_df = pd.read_csv(valid_baseline_path)
# The part of the id before the first underscore is the group number N.
baseline_df['N'] = baseline_df['id'].astype(str).str.split('_').str[0].astype(int)

baseline_per_n = {}
for n, g in baseline_df.groupby('N'):
    xs, ys, ds = (
        [strip_s(v) for v in g[col].values] for col in ('x', 'y', 'deg')
    )
    group_score = score_group(xs, ys, ds)
    baseline_per_n[n] = {
        'score': group_score,
        'data': g.drop(columns=['N']).copy(),
        'source': 'valid_baseline'
    }

baseline_total = sum(entry['score'] for entry in baseline_per_n.values())
print(f"Baseline total: {baseline_total:.6f}")

In [None]:
# Build best-per-N from ALL sources (including preoptimized).
# Every N in 1..200 starts from the valid baseline entry.
best_per_n = {n: baseline_per_n[n].copy() for n in range(1, 201)}
baseline_total = sum(d['score'] for d in baseline_per_n.values())

# Files that could not be re-read; reported below instead of being dropped silently.
unreadable_files = []

# Add preoptimized solutions
for fp, data in tqdm(preopt_scores.items(), desc="Processing preoptimized"):
    # The scoring pass already stored one score per N for this file, so the
    # CSV is only re-read when at least one of them beats the current best.
    improving = {
        n: score
        for n, score in data['per_n'].items()
        if score < best_per_n[n]['score']
    }
    if not improving:
        continue

    try:
        df = pd.read_csv(fp)
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
    except Exception as exc:
        # Previously a bare `except:`, which also swallowed KeyboardInterrupt
        # and hid every failure. Keep going, but remember what was skipped.
        unreadable_files.append((fp, exc))
        continue

    for n, g in df.groupby('N'):
        if n not in improving:
            continue
        # Group N must hold exactly N rows (the final submission is expected
        # to have 1 + 2 + ... + 200 = 20100 rows). A short group divides by
        # fewer trees and covers less area, so it would look "better" while
        # producing an invalid submission.
        if len(g) != n:
            continue
        best_per_n[n] = {
            'score': improving[n],
            'data': g.drop(columns=['N']).copy(),
            'source': os.path.basename(fp)
        }

if unreadable_files:
    print(f"\nSkipped {len(unreadable_files)} unreadable files")
    for bad_fp, bad_exc in unreadable_files[:5]:
        print(f"  {os.path.basename(bad_fp)}: {bad_exc!r}")

# Calculate total score (ignoring overlaps for now)
total_best = sum(d['score'] for d in best_per_n.values())
print(f"\nBest ensemble (ignoring overlaps): {total_best:.6f}")
print(f"Improvement from baseline: {baseline_total - total_best:.6f}")

In [None]:
# Validate the per-N winners: any N whose chosen layout overlaps falls back
# to the baseline entry for that N.
print("Checking for overlaps in best solutions...")

overlap_ns = []
for n in tqdm(range(1, 201), desc="Checking overlaps"):
    g = best_per_n[n]['data']
    xs, ys, ds = (
        [strip_s(v) for v in g[col].values] for col in ('x', 'y', 'deg')
    )

    has_overlap, _reason = check_overlaps(xs, ys, ds)
    if not has_overlap:
        continue

    # Revert to baseline
    overlap_ns.append(n)
    best_per_n[n] = baseline_per_n[n].copy()

print(f"\nFound {len(overlap_ns)} N values with overlaps: {overlap_ns[:20]}...")

# Total after the reverts
valid_total = sum(entry['score'] for entry in best_per_n.values())
baseline_total = sum(entry['score'] for entry in baseline_per_n.values())
print(f"\nValid ensemble score: {valid_total:.6f}")
print(f"Improvement from baseline: {baseline_total - valid_total:.6f}")

In [None]:
# Override N=1 with the single-tree layout from the jonathanchan kernel:
# x=0, y=0, deg=45 (noted there as scoring 0.6612499...).
optimal_n1 = pd.DataFrame({
    'id': ['001_0'],
    'x': ['s0.0'],
    'y': ['s0.0'],
    'deg': ['s45.0'],
})

xs, ys, ds = (
    [strip_s(v) for v in optimal_n1[col].values] for col in ('x', 'y', 'deg')
)
n1_score = score_group(xs, ys, ds)
current_n1_score = best_per_n[1]['score']

print(f"Optimal N=1 score: {n1_score:.10f}")
print(f"Current N=1 score: {current_n1_score:.10f}")

# Only replace the current entry when the override is strictly better.
if n1_score < current_n1_score:
    print(f"Improvement: {current_n1_score - n1_score:.10f}")
    best_per_n[1] = {
        'score': n1_score,
        'data': optimal_n1.copy(),
        'source': 'optimal_n1'
    }

# Final score
final_total = sum(entry['score'] for entry in best_per_n.values())
print(f"\nFinal ensemble score: {final_total:.6f}")

In [None]:
# Tally how many N values each source contributes to the ensemble
source_counts = {}
for entry in (best_per_n[n] for n in range(1, 201)):
    src = entry['source']
    source_counts.setdefault(src, 0)
    source_counts[src] += 1

print("Sources contributing to ensemble:")
# Largest contributors first
ranked_sources = sorted(source_counts.items(), key=lambda pair: -pair[1])
for src, count in ranked_sources:
    print(f"  {src}: {count} N values")

In [None]:
# Assemble the final submission from the per-N winners
rows = [best_per_n[n]['data'] for n in range(1, 201)]

final_df = pd.concat(rows, ignore_index=True)
# Temporary numeric sort keys parsed from the "<N>_<index>" id
id_parts = final_df['id'].str.split('_')
final_df['sn'] = id_parts.str[0].astype(int)
final_df['si'] = id_parts.str[1].astype(int)
final_df = final_df.sort_values(['sn', 'si']).drop(columns=['sn', 'si'])
final_df = final_df[['id', 'x', 'y', 'deg']]

print(f"Final submission shape: {final_df.shape}")
print(f"Expected: (20100, 4)")

# Save
out_dir = '/home/code/experiments/005_comprehensive_ensemble'
os.makedirs(out_dir, exist_ok=True)
final_df.to_csv(f'{out_dir}/submission.csv', index=False)

# Save metrics
baseline_score = sum(entry['score'] for entry in baseline_per_n.values())
target_score = 68.888293
metrics = {
    'cv_score': final_total,
    'baseline_score': baseline_score,
    'improvement': baseline_score - final_total,
    'target': target_score,
    'gap': final_total - target_score,
    'num_sources': len(source_counts),
    'overlap_ns_reverted': len(overlap_ns)
}

with open(f'{out_dir}/metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)

print(f"\nMetrics: {metrics}")

In [None]:
# Final validation - check ALL N values for overlaps.
# NOTE: this runs after the submission file has been written, so a warning
# here means the saved file contains overlapping trees.
print("Final validation - checking all N values...")

final_overlaps = []
for n in tqdm(range(1, 201), desc="Final validation"):
    g = best_per_n[n]['data']
    xs = [strip_s(v) for v in g['x'].values]
    ys = [strip_s(v) for v in g['y'].values]
    ds = [strip_s(v) for v in g['deg'].values]

    has_overlap, msg = check_overlaps(xs, ys, ds)
    if has_overlap:
        final_overlaps.append((n, msg))

if final_overlaps:
    print(f"\nWARNING: {len(final_overlaps)} N values still have overlaps!")
    # Show at most the first 10 offending groups
    for n, msg in final_overlaps[:10]:
        print(f"  N={n}: {msg}")
else:
    # The check mark was mojibake ("âœ“": UTF-8 bytes decoded as cp1252).
    print("\n✓ All N values pass overlap validation!")
    print(f"\nFinal score: {final_total:.6f}")
    print(f"Gap to target: {final_total - 68.888293:.6f}")