# Experiment 002: Ensemble Best Solutions from Multiple Sources

Strategy: For each N, select the overlap-free solution with the smallest bounding-square side length from all available sources.

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
from shapely.ops import unary_union
import os
import glob
import json
from collections import defaultdict

# Tree shape definition (closed polygon).
# TX[i] / TY[i] are the x / y coordinates of the i-th outline vertex in the
# tree's local frame. The first and last vertices are both (0, 0.8), so the
# ring is explicitly closed.
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125, 0]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5, 0.8]

def get_tree_polygon(x, y, angle_deg):
    """Build the tree outline rotated about its local origin, then moved to (x, y).

    The outline comes from the module-level TX / TY vertex lists. Rotation is
    applied first (about the local origin), translation second.
    """
    outline = Polygon([(vx, vy) for vx, vy in zip(TX, TY)])
    turned = affinity.rotate(outline, angle_deg, origin=(0, 0))
    return affinity.translate(turned, xoff=x, yoff=y)

def has_overlap(poly1, poly2, tolerance=1e-9):
    """Tell whether two polygons share interior area rather than merely touching.

    Returns True only when the polygons intersect and the area of their
    intersection is strictly greater than ``tolerance``.
    """
    if poly1.intersects(poly2):
        shared_area = poly1.intersection(poly2).area
        return shared_area > tolerance
    return False

def get_side_length(polys):
    """Return the side of the smallest axis-aligned square enclosing all polygons.

    The enclosing box is derived from each polygon's own ``bounds``
    (minx, miny, maxx, maxy) instead of building a geometric union first:
    the bounding box of a union is just the min/max over the individual
    boxes, so the costly union is unnecessary.

    Args:
        polys: Iterable of objects exposing a ``bounds`` 4-tuple.

    Returns:
        The larger of the combined box's width and height.

    Raises:
        ValueError: If ``polys`` is empty (no box can be defined).
    """
    boxes = [poly.bounds for poly in polys]
    if not boxes:
        raise ValueError("get_side_length() requires at least one polygon")
    min_xs, min_ys, max_xs, max_ys = zip(*boxes)
    width = max(max_xs) - min(min_xs)
    height = max(max_ys) - min(min_ys)
    return max(width, height)

def check_overlaps_for_n(polys):
    """Return True as soon as any unordered pair of polygons overlaps, else False."""
    return any(
        has_overlap(first, second)
        for idx, first in enumerate(polys)
        for second in polys[idx + 1:]
    )

# Emit a marker in the cell output once the helper definitions above have run.
print("Functions defined")

Functions defined


In [2]:
# Find all unique CSV files (excluding sample_submission).
# The recursive '*.csv' pattern already matches every 'submission*.csv' file,
# so a single walk of the snapshot tree is enough.
all_csvs = glob.glob('/home/nonroot/snapshots/santa-2025/**/*.csv', recursive=True)

# Filter out sample_submission and duplicates. Sorting gives a stable order,
# so anything that processes the sources in sequence is reproducible across
# runs (plain set iteration order is not).
csv_files = sorted({f for f in all_csvs if 'sample_submission' not in f})
print(f"Found {len(csv_files)} unique CSV files")

# Also add the valid baseline
csv_files.append('/home/code/valid_baseline.csv')
print(f"Total sources: {len(csv_files)}")

Found 3181 unique CSV files
Total sources: 3182


In [3]:
def parse_submission(csv_path, max_n=200):
    """Parse a submission CSV into ``{n: [(x, y, deg), ...]}``.

    Values in the ``x``, ``y`` and ``deg`` columns may carry an ``s`` marker
    (e.g. ``s0.25``); it is removed before conversion to float. The group
    number ``n`` is the integer before the first underscore of ``id``.

    Args:
        csv_path: Path of the CSV file to read.
        max_n: Largest group number required. Every group 1..max_n must be
            present and contain exactly ``n`` rows.

    Returns:
        Dict mapping each n in 1..max_n to its list of ``(x, y, deg)`` float
        tuples in file order, or ``None`` when the file cannot be read or
        parsed, lacks a required column, contains a non-finite value, or has
        a group with the wrong number of rows.
    """
    value_columns = ('x', 'y', 'deg')
    try:
        df = pd.read_csv(csv_path)
        if not {'id', *value_columns}.issubset(df.columns):
            return None

        parsed = pd.DataFrame({
            col: df[col].astype(str).str.replace('s', '', regex=False).astype(float)
            for col in value_columns
        })
        # NaN / inf coordinates cannot describe a real placement.
        if not np.isfinite(parsed.to_numpy()).all():
            return None
        parsed['n'] = df['id'].map(lambda item: int(str(item).split('_')[0]))

        # One pass over the rows; groupby keeps the file order inside each group.
        groups = {key: frame for key, frame in parsed.groupby('n', sort=False)}

        result = {}
        for n in range(1, max_n + 1):
            group = groups.get(n)
            if group is None or len(group) != n:
                return None  # Invalid submission
            result[n] = list(zip(*(group[col].tolist() for col in value_columns)))
        return result
    except Exception:
        # Deliberate best-effort: callers scan arbitrary CSV files, and one
        # unreadable or malformed file must not abort the whole scan.
        return None

# Parse every candidate file; keep only those that yield a complete solution set
print("Parsing all submissions...")
all_solutions = {}
for idx, path in enumerate(csv_files):
    # Progress line every 100 files
    if idx % 100 == 0:
        print(f"Processing {idx}/{len(csv_files)}...")
    parsed = parse_submission(path)
    if parsed is None:
        continue
    all_solutions[path] = parsed

print(f"\nSuccessfully parsed {len(all_solutions)} valid submissions")

Parsing all submissions...
Processing 0/3182...


Processing 100/3182...


Processing 200/3182...


Processing 300/3182...


Processing 400/3182...


Processing 500/3182...


Processing 600/3182...


Processing 700/3182...


Processing 800/3182...


Processing 900/3182...


Processing 1000/3182...


Processing 1100/3182...


Processing 1200/3182...


Processing 1300/3182...


Processing 1400/3182...


Processing 1500/3182...


Processing 1600/3182...


Processing 1700/3182...


Processing 1800/3182...


Processing 1900/3182...


Processing 2000/3182...


Processing 2100/3182...


Processing 2200/3182...


Processing 2300/3182...


Processing 2400/3182...


Processing 2500/3182...


Processing 2600/3182...


Processing 2700/3182...


Processing 2800/3182...


Processing 2900/3182...


Processing 3000/3182...


Processing 3100/3182...



Successfully parsed 3146 valid submissions


In [None]:
# For each N, find the best solution (smallest bounding box)
print("Finding best solution for each N...")

best_per_n = {}  # n -> (side_length, source_path, solution)
baseline_scores = {}  # n -> baseline side length, kept for comparison

# First, get baseline scores. The baseline seeds best_per_n for every N, so
# without it nothing downstream can run; fail here with a clear message
# rather than with a KeyError later on.
baseline_path = '/home/code/valid_baseline.csv'
baseline_sol = all_solutions.get(baseline_path)
if baseline_sol is None:
    raise RuntimeError(f"Baseline submission could not be parsed: {baseline_path}")

for n in range(1, 201):
    trees = baseline_sol[n]
    polys = [get_tree_polygon(x, y, deg) for x, y, deg in trees]
    side = get_side_length(polys)
    baseline_scores[n] = side
    best_per_n[n] = (side, baseline_path, trees)

# Score contribution of each N is side^2 / N
print(f"Baseline total score: {sum(s**2/n for n, s in baseline_scores.items()):.6f}")

In [None]:
# Now find better solutions from other sources
for source_path, sol in all_solutions.items():
    if source_path == baseline_path:
        continue

    for n in range(1, 201):
        trees = sol[n]
        polys = [get_tree_polygon(x, y, deg) for x, y, deg in trees]
        side = get_side_length(polys)

        # Accept only if better than the current best by more than float
        # noise AND overlap-free. The pairwise overlap check is the expensive
        # part, so it is evaluated last and only for real candidates.
        if side < best_per_n[n][0] - 1e-9 and not check_overlaps_for_n(polys):
            best_per_n[n] = (side, source_path, trees)

# Tally from the final winners rather than from every intermediate
# acceptance: the counts then do not depend on the order in which sources
# were visited, and a source is credited only for solutions that actually
# ended up in the ensemble.
source_counts = defaultdict(int)
for n in range(1, 201):
    winner = best_per_n[n][1]
    if winner != baseline_path:
        source_counts[winner] += 1
improvement_count = sum(source_counts.values())

print(f"Found {improvement_count} improvements over baseline")
print(f"\nTop sources by improvement count:")
for source, count in sorted(source_counts.items(), key=lambda x: -x[1])[:10]:
    print(f"  {count}: {source.split('/')[-1]}")

In [None]:
# Total score is the sum over N of side^2 / N, for ensemble and baseline alike
all_n = range(1, 201)
new_total_score = sum(best_per_n[n][0]**2 / n for n in all_n)
baseline_total = sum(baseline_scores[n]**2 / n for n in all_n)

print(f"Baseline score: {baseline_total:.6f}")
print(f"Ensemble score: {new_total_score:.6f}")
print(f"Improvement: {baseline_total - new_total_score:.6f}")

# Per-N comparison: (n, baseline contribution, ensemble contribution)
print("\nTop 10 improvements by N:")
score_pairs = [
    (n, baseline_scores[n]**2 / n, best_per_n[n][0]**2 / n)
    for n in all_n
]
# Keep only the N values whose contribution dropped by more than float noise
improvements = [
    (n, before - after, best_per_n[n][1])
    for n, before, after in score_pairs
    if after < before - 1e-9
]

ranked = sorted(improvements, key=lambda entry: -entry[1])
for n, gain, origin in ranked[:10]:
    print(f"  N={n}: improved by {gain:.6f} from {origin.split('/')[-1]}")

In [None]:
# Build one output row per tree: id is "<n zero-padded to 3>_<index>", and each
# numeric value is written with an 's' prefix
rows = [
    {'id': f'{n:03d}_{i}', 'x': f's{x}', 'y': f's{y}', 'deg': f's{deg}'}
    for n in range(1, 201)
    for i, (x, y, deg) in enumerate(best_per_n[n][2])
]

ensemble_df = pd.DataFrame(rows)
print(f"Created ensemble with {len(ensemble_df)} rows")
print(ensemble_df.head())

In [None]:
# Re-check every selected configuration for overlapping trees
print("Validating ensemble submission...")
overlap_count = 0
for n in range(1, 201):
    polys = [get_tree_polygon(*tree) for tree in best_per_n[n][2]]
    if not check_overlaps_for_n(polys):
        continue
    overlap_count += 1
    print(f"  OVERLAP at N={n}")

print(f"\nTotal N values with overlaps: {overlap_count}")

In [None]:
# Save the ensemble submission.
# Create the target directories first so a missing folder cannot lose the
# result at the very last step.
experiment_dir = '/home/code/experiments/002_ensemble_sources'
submission_dir = '/home/submission'
for out_dir in (experiment_dir, submission_dir):
    os.makedirs(out_dir, exist_ok=True)

ensemble_df.to_csv(os.path.join(experiment_dir, 'submission.csv'), index=False)
ensemble_df.to_csv(os.path.join(submission_dir, 'submission.csv'), index=False)

# Save metrics (source_counts is trimmed to the 20 largest entries)
metrics = {
    'cv_score': new_total_score,
    'baseline_score': baseline_total,
    'improvement': baseline_total - new_total_score,
    'overlaps': overlap_count,
    'num_improvements': len(improvements),
    'source_counts': dict(sorted(source_counts.items(), key=lambda x: -x[1])[:20])
}

with open(os.path.join(experiment_dir, 'metrics.json'), 'w') as f:
    json.dump(metrics, f, indent=2)

print("Saved ensemble submission")
print(f"Final CV Score: {new_total_score:.6f}")