# Experiment 005: Per-N Ensemble from 116 Snapshots

Systematically scan ALL 116 snapshots and select the BEST solution for EACH N value.
This is the highest-leverage, lowest-risk improvement available.

In [None]:
import os
import pandas as pd
import math
from collections import defaultdict
import json
import glob

# Tree shape constants: parallel lists where (TX[i], TY[i]) is the i-th
# vertex of the tree polygon in its own local (unrotated, untranslated) frame.
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

def get_polygon_bounds(cx, cy, deg):
    """Return the axis-aligned bounding box of one placed tree polygon.

    Every vertex (TX[i], TY[i]) is rotated about the local origin by ``deg``
    degrees using the standard 2-D rotation matrix, then shifted by (cx, cy).

    Args:
        cx: x translation applied after rotation.
        cy: y translation applied after rotation.
        deg: rotation angle in degrees.

    Returns:
        Tuple ``(x_min, x_max, y_min, y_max)`` over all transformed vertices.
    """
    rad = deg * math.pi / 180.0
    s, c = math.sin(rad), math.cos(rad)
    # Walk the two vertex lists in lockstep instead of indexing by position.
    x_coords = [tx * c - ty * s + cx for tx, ty in zip(TX, TY)]
    y_coords = [tx * s + ty * c + cy for tx, ty in zip(TX, TY)]
    return min(x_coords), max(x_coords), min(y_coords), max(y_coords)

def calculate_score_for_n(trees):
    """Score one configuration: squared side of the enclosing square per tree.

    Args:
        trees: sequence of ``(idx, cx, cy, deg)`` tuples; ``idx`` is ignored.

    Returns:
        ``side * side / len(trees)`` where ``side`` is the larger of the width
        and height of the box enclosing every tree's bounding box, or
        ``float('inf')`` when ``trees`` is empty/falsy.
    """
    if not trees:
        return float('inf')

    lo_x = lo_y = float('inf')
    hi_x = hi_y = float('-inf')

    # Grow the enclosing box one tree at a time.
    for _, cx, cy, deg in trees:
        x0, x1, y0, y1 = get_polygon_bounds(cx, cy, deg)
        if x0 < lo_x:
            lo_x = x0
        if x1 > hi_x:
            hi_x = x1
        if y0 < lo_y:
            lo_y = y0
        if y1 > hi_y:
            hi_y = y1

    width = hi_x - lo_x
    height = hi_y - lo_y
    side = max(width, height)
    return side * side / len(trees)

print("Functions defined")

In [None]:
def load_submission(filepath):
    """Load a submission CSV into per-N tree configurations.

    The file must have columns ``id``, ``x``, ``y`` and ``deg``. Each ``id`` is
    ``"<n>_<idx>"``; the numeric columns may carry an ``s`` marker (as written
    by this notebook's own save step), which is removed before conversion.

    Args:
        filepath: path of the CSV file to read.

    Returns:
        Dict mapping ``n`` to a list of ``(idx, x, y, deg)`` tuples sorted by
        ``idx``. Returns ``{}`` when the file has fewer than 20000 rows or
        cannot be read/parsed; in the latter case the reason is printed so a
        skipped file is not silently indistinguishable from an empty one.
    """
    try:
        df = pd.read_csv(filepath)
        if len(df) < 20000:  # Skip incomplete files
            return {}

        configurations = defaultdict(list)
        # Iterate the four columns in lockstep; this avoids constructing a
        # pandas Series for every row, which is what iterrows() does.
        for row_id, raw_x, raw_y, raw_deg in zip(df['id'], df['x'], df['y'], df['deg']):
            id_parts = row_id.split('_')
            n = int(id_parts[0])
            idx = int(id_parts[1])
            x = float(str(raw_x).replace('s', ''))
            y = float(str(raw_y).replace('s', ''))
            deg = float(str(raw_deg).replace('s', ''))
            configurations[n].append((idx, x, y, deg))

        for trees in configurations.values():
            trees.sort(key=lambda t: t[0])
        return dict(configurations)
    except Exception as exc:
        # Best-effort loader: a bad file is skipped, but say why.
        print(f"Skipping {filepath}: {type(exc).__name__}: {exc}")
        return {}

print("Load function defined")

In [None]:
# Find ALL submission files in snapshots
SNAPSHOT_DIR = '/home/nonroot/snapshots/santa-2025'
# os.listdir returns entries in arbitrary order; sort so the scan order (and
# therefore which file is credited when two candidates are within the
# improvement threshold of each other) is the same on every run.
snapshots = sorted(os.listdir(SNAPSHOT_DIR))
print(f"Found {len(snapshots)} snapshot directories")

# Find all submission.csv files
all_submission_files = []
for snapshot_id in snapshots:
    # Fixed locations: the standard submission folder, then the code folder.
    fixed_candidates = (
        f'{SNAPSHOT_DIR}/{snapshot_id}/submission/submission.csv',
        f'{SNAPSHOT_DIR}/{snapshot_id}/code/submission.csv',
    )
    for candidate in fixed_candidates:
        if os.path.exists(candidate):
            all_submission_files.append(candidate)

    # Check experiment folders (glob order is also arbitrary, so sort it)
    exp_csvs = sorted(glob.glob(f'{SNAPSHOT_DIR}/{snapshot_id}/code/experiments/*/submission.csv'))
    all_submission_files.extend(exp_csvs)

print(f"Found {len(all_submission_files)} submission files to scan")

In [None]:
# Seed the per-N table from the baseline submission
baseline_path = '/home/code/experiments/002_valid_baseline/submission.csv'
baseline_configs = load_submission(baseline_path)

# best_per_n[n] holds the best known score, its trees and where they came from.
best_per_n = {}
for n in range(1, 201):
    trees = baseline_configs.get(n)
    # A usable configuration for N must contain exactly N trees.
    if trees is None or len(trees) != n:
        best_per_n[n] = {'score': float('inf'), 'trees': None, 'source': None}
        continue
    best_per_n[n] = {
        'score': calculate_score_for_n(trees),
        'trees': trees,
        'source': 'baseline',
    }

per_n_baseline_scores = [best_per_n[n]['score'] for n in range(1, 201)]
baseline_total = sum(per_n_baseline_scores)
print(f"Baseline total score: {baseline_total:.6f}")

In [None]:
# Scan ALL submission files
print(f"\nScanning {len(all_submission_files)} submission files...")
print("="*60)

# Each entry is (n, improvement over the previous best, file that supplied it).
improvements_found = []
files_processed = 0

# NOTE(review): candidates are ranked purely by bounding-box score; nothing in
# this loop checks that the trees of a candidate do not overlap. Confirm the
# snapshot submissions are already validated before trusting the ensemble.
for filepath in all_submission_files:
    configs = load_submission(filepath)
    if not configs:
        continue

    files_processed += 1

    for n in range(1, 201):
        # Only consider configurations that contain exactly N trees.
        if n not in configs or len(configs[n]) != n:
            continue

        score = calculate_score_for_n(configs[n])
        # Capture the previous best before it is overwritten so the report
        # shows the real old value rather than one rebuilt from the delta.
        old_score = best_per_n[n]['score']

        if score < old_score - 1e-10:  # Meaningful improvement
            improvement = old_score - score

            best_per_n[n] = {'score': score, 'trees': configs[n], 'source': filepath}
            improvements_found.append((n, improvement, filepath))

            if improvement > 0.0001:  # Only print significant improvements
                print(f"✅ N={n}: {old_score:.6f} -> {score:.6f} (improved by {improvement:.6f})")

    # Progress counts only files that loaded successfully.
    if files_processed % 50 == 0:
        print(f"  Processed {files_processed}/{len(all_submission_files)} files...")

print("="*60)
print(f"Processed {files_processed} valid submission files")
print(f"Found {len(improvements_found)} improvements")

In [None]:
# Total the best score found for every N and compare with the baseline
per_n_best_scores = [best_per_n[n]['score'] for n in range(1, 201)]
ensemble_score = sum(per_n_best_scores)
total_improvement = baseline_total - ensemble_score
print(f"\nBaseline score: {baseline_total:.6f}")
print(f"Ensemble score: {ensemble_score:.6f}")
print(f"Total improvement: {total_improvement:.6f}")

# Tally how many N values each source file (or the baseline) supplies
source_counts = defaultdict(int)
for n in range(1, 201):
    winner = best_per_n[n]['source']
    source_counts[winner] += 1

# Largest contributors first; ties keep first-seen order.
ranked_sources = sorted(source_counts.items(), key=lambda item: item[1], reverse=True)
print("\nSource distribution (top 10):")
for source, count in ranked_sources[:10]:
    print(f"  {source}: {count} N values")

In [None]:
# Report the largest improvements (at most 20), if any exceeded 0.0001
if not improvements_found:
    print("\nNo improvements found at all")
else:
    # Entries are (n, improvement, source); keep those above the threshold.
    significant = [entry for entry in improvements_found if entry[1] > 0.0001]
    if not significant:
        print("\nNo significant improvements found (all < 0.0001)")
    else:
        print("\nSignificant improvements (> 0.0001):")
        largest_first = sorted(significant, key=lambda entry: entry[1], reverse=True)
        for n, imp, src in largest_first[:20]:
            print(f"  N={n}: improved by {imp:.6f}")

In [None]:
# Save ensemble submission
import csv

print("\nSaving ensemble submission...")

# The same rows are written to the experiment folder and the submission folder.
output_paths = [
    '/home/code/experiments/005_per_n_ensemble/submission.csv',
    '/home/submission/submission.csv',
]

# Build the rows once instead of repeating the formatting loop per output file.
submission_rows = []
missing_n = []
for n in range(1, 201):
    trees = best_per_n[n]['trees']
    if not trees:
        # No configuration was found for this N; record it rather than
        # dropping it without a trace.
        missing_n.append(n)
        continue
    for idx, x, y, deg in trees:
        # Values are written with an 's' marker and 17 decimal places.
        submission_rows.append([f'{n:03d}_{idx}', f's{x:.17f}', f's{y:.17f}', f's{deg:.17f}'])

if missing_n:
    print(f"WARNING: no configuration for N in {missing_n}; submission is incomplete")

for output_path in output_paths:
    # Create the target folder for both outputs, not just the submission one.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'x', 'y', 'deg'])
        writer.writerows(submission_rows)

print(f"Saved ensemble submission")
print(f"Final score: {ensemble_score:.6f}")

In [None]:
# Persist a summary of this run next to the experiment's submission
score_gain = baseline_total - ensemble_score
metrics = dict(
    cv_score=ensemble_score,
    baseline_score=baseline_total,
    improvement=score_gain,
    improvements_found=len(improvements_found),
    files_scanned=len(all_submission_files),
    files_processed=files_processed,
)

metrics_path = '/home/code/experiments/005_per_n_ensemble/metrics.json'
with open(metrics_path, 'w') as metrics_file:
    json.dump(metrics, metrics_file, indent=2)

print("\nMetrics saved")
print(f"CV Score: {ensemble_score:.6f}")
print(f"Improvement: {score_gain:.6f}")