# Experiment 009: Full Ensemble from ALL 727 CSV Files

Build the ensemble by selecting, for each N from 1 to 200, the lowest-scoring configuration that passes the overlap check, across all source submissions.

In [None]:
import os
import glob
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
from shapely.strtree import STRtree
from collections import defaultdict

# Tree geometry
# TX[i] / TY[i] are the x / y coordinates of the i-th vertex of the tree
# outline in its local frame (15 vertices, starting at (0, 0.8)).
# The outline spans x in [-0.35, 0.35] and y in [-0.2, 0.8].
# get_tree_polygon() pairs the two arrays element-wise, so they must keep the
# same length and ordering.
TX = np.array([0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125])
TY = np.array([0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5])

def get_tree_polygon(x, y, deg):
    """Return the tree outline rotated by ``deg`` and moved to ``(x, y)``.

    The outline comes from the module-level TX/TY vertex arrays. It is rotated
    about the local origin (0, 0) first and translated afterwards.
    """
    outline = [(vx, vy) for vx, vy in zip(TX, TY)]
    # Repeat the first vertex so the ring is explicitly closed.
    ring = outline + outline[:1]
    turned = affinity.rotate(Polygon(ring), deg, origin=(0, 0))
    return affinity.translate(turned, xoff=x, yoff=y)

def get_bounding_box_side(trees):
    """Return the side of the axis-aligned square that encloses every tree.

    ``trees`` is a sequence of ``(x, y, deg)`` tuples. The result is the larger
    of the combined width and height of all tree bounds; an empty sequence
    yields ``inf``.
    """
    if not trees:
        return float('inf')
    # bounds is (minx, miny, maxx, maxy) for each placed tree
    boxes = [get_tree_polygon(x, y, deg).bounds for x, y, deg in trees]
    xs = [value for box in boxes for value in (box[0], box[2])]
    ys = [value for box in boxes for value in (box[1], box[3])]
    width = max(xs) - min(xs)
    height = max(ys) - min(ys)
    return max(width, height)

def has_overlap(trees):
    """Return True if any two trees intersect in more than a boundary touch.

    ``trees`` is a sequence of ``(x, y, deg)`` tuples. Fewer than two trees can
    never overlap, so those inputs return False without building geometry.
    """
    if len(trees) <= 1:
        return False
    shapes = [get_tree_polygon(x, y, deg) for x, y, deg in trees]
    spatial_index = STRtree(shapes)
    for idx, shape in enumerate(shapes):
        # NOTE(review): the results of query() are used as positions into
        # ``shapes``, i.e. this expects the index-returning STRtree API.
        for other_idx in spatial_index.query(shape):
            if other_idx == idx:
                continue
            neighbour = shapes[other_idx]
            if shape.intersects(neighbour) and not shape.touches(neighbour):
                return True
    return False

# Progress marker: shows in the cell output that the helper definitions ran.
print("Functions defined!")

In [None]:
# Find all CSV files. The paths are sorted because glob order is arbitrary and
# the ensemble keeps the first source that reaches a given score, so a stable
# order makes tie-breaking reproducible.
csv_files = sorted(glob.glob('/home/nonroot/snapshots/santa-2025/**/*.csv', recursive=True))
print(f"Found {len(csv_files)} CSV files")

# Load all valid submissions: id/x/y/deg columns and exactly 20100 rows
# (1 + 2 + ... + 200 trees).
REQUIRED_COLUMNS = {'id', 'x', 'y', 'deg'}
all_dfs = {}
unreadable_files = []  # (path, exception) for files that could not be parsed
for csv_path in csv_files:
    try:
        # dtype=str keeps the original text of every value so it can be
        # written back later without losing precision.
        df = pd.read_csv(csv_path, dtype=str)
    except (OSError, ValueError) as exc:
        # pandas parser / empty-file errors and decoding errors are all
        # ValueError subclasses; OSError covers unreadable paths.
        unreadable_files.append((csv_path, exc))
        continue
    if REQUIRED_COLUMNS.issubset(df.columns) and len(df) == 20100:
        all_dfs[csv_path] = df

print(f"Loaded {len(all_dfs)} valid submission files")
if unreadable_files:
    print(f"Skipped {len(unreadable_files)} unreadable CSV files")

In [None]:
# For each N, find the best VALID configuration across all sources
def load_trees_for_n(df, n):
    """Extract the rows of the ``n``-tree configuration from a submission frame.

    Rows are selected by the zero-padded id prefix (``'007_'`` for ``n=7``).

    Args:
        df: DataFrame with ``id``, ``x``, ``y`` and ``deg`` columns.
        n: Configuration size whose rows should be extracted.

    Returns:
        A ``(trees, raw_data)`` pair. ``trees`` is a list of ``(x, y, deg)``
        float tuples, parsed after removing every ``'s'`` character from the
        cell text. ``raw_data`` holds the untouched cell values in the same
        order so they can be written back without losing precision.

    Raises:
        ValueError: If a selected cell cannot be parsed as a float.
    """
    prefix = f'{n:03d}_'
    # na=False: a missing id simply does not match. Without it the mask holds
    # NaN and the boolean row selection raises.
    rows = df[df['id'].str.startswith(prefix, na=False)]
    trees = []
    raw_data = []  # Store raw cell values for precision
    # Walk the three columns in lockstep instead of building a Series per row.
    for raw_x, raw_y, raw_deg in zip(rows['x'], rows['y'], rows['deg']):
        x = float(str(raw_x).replace('s', ''))
        y = float(str(raw_y).replace('s', ''))
        deg = float(str(raw_deg).replace('s', ''))
        trees.append((x, y, deg))
        raw_data.append((raw_x, raw_y, raw_deg))
    return trees, raw_data

best_configs = {}  # n -> (score, source_path, raw_data)
best_scores = {}   # n -> score

print("Finding best configuration for each N...")
for n in range(1, 201):
    best_score = float('inf')
    best_source = None
    best_raw = None

    for source_path, df in all_dfs.items():
        try:
            trees, raw_data = load_trees_for_n(df, n)
        except ValueError:
            # A cell that cannot be parsed makes this source unusable for this
            # N only; the remaining sources are still considered.
            continue
        if len(trees) != n:
            continue
        side = get_bounding_box_side(trees)
        score = side**2 / n
        # The overlap test is the expensive step, so it only runs for
        # candidates that would actually replace the current best. The strict
        # "<" keeps the first source seen when scores tie.
        if score < best_score and not has_overlap(trees):
            best_score = score
            best_source = source_path
            best_raw = raw_data

    if best_raw is not None:
        best_configs[n] = (best_score, best_source, best_raw)
        best_scores[n] = best_score

    if n % 20 == 0:
        print(f"N={n}: best_score={best_score:.6f}, source={os.path.basename(best_source) if best_source else 'None'}")

print(f"\nProcessed all N values")

# Surface gaps now: an N without any valid configuration cannot be submitted.
missing_n = [size for size in range(1, 201) if size not in best_configs]
if missing_n:
    print(f"WARNING: no valid configuration found for N values: {missing_n}")

In [None]:
# Total ensemble score and its difference to the hard-coded baseline
total_score = sum(best_scores.values())
baseline_score = 70.676102
improvement = baseline_score - total_score

print(f"Total Ensemble Score: {total_score:.6f}")
print(f"Baseline Score: {baseline_score:.6f}")
print(f"Improvement: {improvement:.6f}")

# Tally how many N values each source file (by basename) contributed
source_counts = defaultdict(int)
for _score, chosen_source, _raw in best_configs.values():
    source_counts[os.path.basename(chosen_source)] += 1

print(f"\nSources used:")
# Stable descending sort by count, limited to the ten largest contributors
top_sources = sorted(source_counts.items(), key=lambda item: item[1], reverse=True)[:10]
for source, count in top_sources:
    print(f"  {source}: {count} N values")

In [None]:
# Create submission using raw string data to preserve precision.
# Every N from 1 to 200 must have a selected configuration. Report all gaps at
# once instead of failing with a bare KeyError on the first missing N.
missing_n = [size for size in range(1, 201) if size not in best_configs]
if missing_n:
    raise KeyError(f"No valid configuration available for N values: {missing_n}")

# Row ids are regenerated as '<NNN>_<index>' in the stored row order; x/y/deg
# are the untouched values taken from the winning source file.
submission_rows = [
    {'id': f'{n:03d}_{i}', 'x': x_str, 'y': y_str, 'deg': deg_str}
    for n in range(1, 201)
    for i, (x_str, y_str, deg_str) in enumerate(best_configs[n][2])
]

submission_df = pd.DataFrame(submission_rows, columns=['id', 'x', 'y', 'deg'])
print(f"Created submission with {len(submission_df)} rows")
print(submission_df.head())

In [None]:
# Spot-check a handful of N values of the assembled submission for overlaps
print("Validating for overlaps (sample)...")
overlap_found = False
for n in (1, 10, 50, 100, 150, 200):
    trees = load_trees_for_n(submission_df, n)[0]
    if not has_overlap(trees):
        print(f"  N={n}: OK")
        continue
    print(f"  N={n}: OVERLAP DETECTED!")
    overlap_found = True

if not overlap_found:
    print("\nNo overlaps detected in sampled configurations.")

In [None]:
# Save submission to both the submission directory and the experiment folder
output_paths = [
    '/home/submission/submission.csv',
    '/home/code/experiments/009_full_ensemble_v2/submission.csv',
]
for output_path in output_paths:
    # to_csv does not create missing parent directories, so make sure they
    # exist before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    submission_df.to_csv(output_path, index=False)
print(f"Saved submission")

print(f"\n=== FINAL SCORE: {total_score:.6f} ===")
print(f"=== IMPROVEMENT: {improvement:.6f} ===")