# Experiment 002: Full Ensemble from ALL Sources

Load every pre-optimized submission CSV (30+ files) and, for each N from 1 to 200, keep the configuration with the smallest bounding-square side.

In [None]:
import os
import glob
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
from shapely.strtree import STRtree

# Tree geometry: TX[i] and TY[i] are the x and y coordinates of the i-th
# vertex of the tree outline (15 vertices). get_tree_polygon zips the two
# lists into a Shapely Polygon, so they must stay the same length and keep
# their ring order. The outline is mirror-symmetric about x = 0 and spans
# x in [-0.35, 0.35] and y in [-0.2, 0.8].
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

def get_tree_polygon(x, y, deg):
    """Return the tree outline rotated by ``deg`` and moved to (x, y).

    The base outline built from TX/TY is first rotated about its own origin
    (0, 0) and only then shifted, so (x, y) is where the outline's origin
    ends up. ``deg`` is handed to ``shapely.affinity.rotate`` unchanged.
    """
    outline = Polygon(list(zip(TX, TY)))
    return affinity.translate(
        affinity.rotate(outline, deg, origin=(0, 0)),
        xoff=x,
        yoff=y,
    )

def load_trees_for_n(df, n):
    """Collect the (x, y, deg) tuples of the N-tree configuration in ``df``.

    Rows belong to configuration ``n`` when their 'id' starts with the
    zero-padded prefix ``f'{n:03d}_'``. The 'x', 'y' and 'deg' values are
    converted to float after every 's' character has been removed from
    their string form.

    Returns:
        A list of (x, y, deg) float tuples in dataframe row order; empty
        when no row matches.
    """
    def _to_float(value):
        # Values may be written with an 's' marker (e.g. 's0.5').
        return float(str(value).replace('s', ''))

    selected = df[df['id'].str.startswith(f'{n:03d}_')]
    columns = zip(selected['x'], selected['y'], selected['deg'])
    return [
        (_to_float(raw_x), _to_float(raw_y), _to_float(raw_deg))
        for raw_x, raw_y, raw_deg in columns
    ]

def get_bounding_box_side(trees):
    """Return the side of the smallest axis-aligned square enclosing all trees.

    Args:
        trees: sequence of (x, y, deg) tuples.

    Returns:
        The larger of the overall width and height of the trees' polygon
        bounds, or ``float('inf')`` when ``trees`` is empty.
    """
    if not trees:
        return float('inf')

    # Each entry is (minx, miny, maxx, maxy) for one placed tree.
    boxes = [get_tree_polygon(x, y, deg).bounds for x, y, deg in trees]
    xs = [value for minx, _, maxx, _ in boxes for value in (minx, maxx)]
    ys = [value for _, miny, _, maxy in boxes for value in (miny, maxy)]

    return max(max(xs) - min(xs), max(ys) - min(ys))

def has_overlap(trees):
    """Report whether any two trees intersect beyond merely touching.

    Args:
        trees: sequence of (x, y, deg) tuples.

    Returns:
        True as soon as one pair of polygons intersects without just
        touching; False otherwise, including for zero or one tree.
    """
    if len(trees) <= 1:
        return False

    shapes = [get_tree_polygon(x, y, deg) for x, y, deg in trees]
    spatial_index = STRtree(shapes)

    def _collides(position, shape):
        # The query results are used to index `shapes`, i.e. this relies on
        # STRtree.query returning integer positions (Shapely 2.x behaviour).
        for other_position in spatial_index.query(shape):
            if other_position == position:
                continue
            other = shapes[other_position]
            if not shape.intersects(other) or shape.touches(other):
                continue
            return True
        return False

    return any(_collides(position, shape) for position, shape in enumerate(shapes))

# Progress marker: reaching this line means the helper definitions above ran.
print("Functions defined successfully!")

In [None]:
# Find all CSV files in the preoptimized directory.
preopt_dir = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized'
# glob returns matches in arbitrary, filesystem-dependent order. Sorting makes
# the load log and the insertion order of all_dfs reproducible; that order
# decides which source wins when two sources tie on bounding-box side, because
# the selection loop only replaces the current best on a strict `<`.
csv_files = sorted(glob.glob(f'{preopt_dir}/**/*.csv', recursive=True))
print(f"Found {len(csv_files)} CSV files")

# A complete submission has N rows for every N in 1..200: 1 + 2 + ... + 200.
EXPECTED_ROWS = sum(range(1, 201))  # 20100
REQUIRED_COLUMNS = ('id', 'x', 'y', 'deg')

# Load all CSVs that look like complete submissions: path -> dataframe.
all_dfs = {}
for csv_path in csv_files:
    # Best-effort load: an unreadable file is reported and skipped so one bad
    # CSV does not abort the whole ensemble.
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        print(f"  Error loading {csv_path}: {e}")
        continue

    if not all(column in df.columns for column in REQUIRED_COLUMNS):
        print(f"  Skipped (missing columns): {os.path.basename(csv_path)}")
    elif len(df) != EXPECTED_ROWS:
        print(f"  Skipped (wrong size): {os.path.basename(csv_path)} ({len(df)} rows)")
    else:
        all_dfs[csv_path] = df
        print(f"  Loaded: {os.path.basename(csv_path)} ({len(df)} rows)")

print(f"\nLoaded {len(all_dfs)} valid submission files")

In [None]:
# For each N, find the best configuration across all sources.
# "Best" means the smallest bounding-square side; ties keep the source seen
# first, because a candidate only replaces the current best on a strict `<`.
# NOTE(review): candidates are not checked for overlaps here, so a source with
# an invalid (overlapping) layout can win on side length alone.
best_configs = {}  # n -> (side, source_path, trees)
best_scores = {}   # n -> score (side^2/n)

for n in range(1, 201):
    best_side = float('inf')
    best_source = None
    best_trees = None

    for source_path, df in all_dfs.items():
        trees = load_trees_for_n(df, n)
        if len(trees) != n:  # Not a valid configuration for this N
            continue
        side = get_bounding_box_side(trees)
        if side < best_side:
            best_side = side
            best_source = source_path
            best_trees = trees

    if best_trees is None:
        # No source has exactly n trees for this N. Report it and move on
        # instead of failing in the progress print below with a KeyError on
        # best_scores[n].
        print(f"N={n}: no valid configuration found in any source")
        continue

    best_configs[n] = (best_side, best_source, best_trees)
    best_scores[n] = best_side**2 / n

    if n % 20 == 0:
        print(f"N={n}: best_side={best_side:.6f}, score={best_scores[n]:.6f}, source={os.path.basename(best_source)}")

print("\nProcessed all N values")

In [None]:
# Calculate total score: the sum of side^2 / n over every N that has a
# selected configuration.
total_score = sum(best_scores.values())
print(f"Total Ensemble Score: {total_score:.6f}")

# A missing N contributes nothing to the sum, which would otherwise show up
# below as a spurious improvement over the baseline. Make that visible.
missing_ns = [n for n in range(1, 201) if n not in best_scores]
if missing_ns:
    print(f"WARNING: no configuration for N in {missing_ns}; "
          "total score is incomplete and not comparable to the baseline")

# Compare to baseline (positive improvement means the ensemble scores lower).
baseline_score = 70.676102
improvement = baseline_score - total_score
print(f"Baseline Score: {baseline_score:.6f}")
print(f"Improvement: {improvement:.6f}")

# Show per-N breakdown for first 20
print("\nPer-N scores (first 20):")
for n in range(1, 21):
    if n not in best_configs:
        # Avoid a KeyError when the selection step found nothing for this N.
        print(f"  N={n:3d}: no configuration available")
        continue
    side, source, _ = best_configs[n]
    score = best_scores[n]
    print(f"  N={n:3d}: side={side:.6f}, score={score:.6f}, source={os.path.basename(source)}")

In [None]:
# Assemble the ensemble submission: one row per tree, for N = 1..200 in order.
# Ids are '<N zero-padded to 3 digits>_<tree index>' and every numeric value is
# written back as a string with an 's' prefix.
ensemble_rows = []
for n in range(1, 201):
    trees = best_configs[n][2]
    ensemble_rows.extend(
        {
            'id': f'{n:03d}_{i}',
            'x': f's{x}',
            'y': f's{y}',
            'deg': f's{deg}',
        }
        for i, (x, y, deg) in enumerate(trees)
    )

ensemble_df = pd.DataFrame(ensemble_rows)
print(f"Created ensemble submission with {len(ensemble_df)} rows")
print(ensemble_df.head())

In [None]:
# Spot-check a handful of N values for overlapping trees. This is a sample
# only: configurations for the other N values are not examined here.
print("Checking for overlaps (sample)...")
sample_ns = (1, 10, 50, 100, 150, 200)
overlap_found = False
for n in sample_ns:
    trees = best_configs[n][2]
    overlapping = has_overlap(trees)
    print(f"  N={n}: OVERLAP DETECTED!" if overlapping else f"  N={n}: OK")
    overlap_found = overlap_found or overlapping

if not overlap_found:
    print("\nNo overlaps detected in sampled configurations.")

In [None]:
# Save the ensemble submission to both output locations.
# NOTE(review): the file is written regardless of the overlap sample check
# result; confirm that is intended before submitting.
output_paths = [
    '/home/submission/submission.csv',
    '/home/code/experiments/002_full_ensemble/submission.csv',
]
for output_path in output_paths:
    # to_csv does not create parent directories; make sure they exist so a
    # fresh environment does not lose the result to an OSError at the very end.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    ensemble_df.to_csv(output_path, index=False)
print("Saved ensemble submission")

print(f"\n=== ENSEMBLE SCORE: {total_score:.6f} ===")
print(f"=== IMPROVEMENT OVER BASELINE: {improvement:.6f} ===")