# Experiment 002: Full Ensemble from ALL Sources

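Load every pre-optimized submission CSV (30 or more files), keep only the complete submissions, and for each N from 1 to 200 select the configuration with the smallest bounding-square side across all sources.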

In [1]:
import os
import glob
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
from shapely.strtree import STRtree

# Tree geometry: outline of a single tree as two parallel coordinate lists,
# where (TX[i], TY[i]) is the i-th vertex. The outline starts at the tip
# (0, 0.8), runs down the right-hand side to the bottom of the trunk
# (y = -0.2) and comes back up the mirrored left-hand side.
TX = [
    0,
    0.125, 0.0625,
    0.2, 0.1,
    0.35, 0.075,
    0.075, -0.075,
    -0.075, -0.35,
    -0.1, -0.2,
    -0.0625, -0.125,
]
TY = [
    0.8,
    0.5, 0.5,
    0.25, 0.25,
    0, 0,
    -0.2, -0.2,
    0, 0,
    0.25, 0.25,
    0.5, 0.5,
]

def get_tree_polygon(x, y, deg):
    """Build the Shapely polygon of one tree placed at (x, y).

    The base outline given by the module-level TX/TY vertex lists is rotated
    by ``deg`` about the origin (0, 0) and the rotated shape is then shifted
    by ``x`` horizontally and ``y`` vertically.

    Args:
        x: Horizontal offset applied after the rotation.
        y: Vertical offset applied after the rotation.
        deg: Rotation angle passed to ``affinity.rotate``.

    Returns:
        The rotated and translated polygon.
    """
    outline = Polygon(list(zip(TX, TY)))
    return affinity.translate(
        affinity.rotate(outline, deg, origin=(0, 0)),
        xoff=x,
        yoff=y,
    )

def load_trees_for_n(df, n):
    """Extract the tree placements of the N-tree puzzle from a submission.

    Rows belonging to puzzle ``n`` are those whose ``id`` starts with the
    zero-padded prefix ``'{n:03d}_'``. The ``x``, ``y`` and ``deg`` cells are
    converted to float after every ``'s'`` character has been removed from
    their string form.

    Args:
        df: Submission dataframe with ``id``, ``x``, ``y`` and ``deg`` columns.
        n: Puzzle size whose rows should be extracted.

    Returns:
        A list of ``(x, y, deg)`` float tuples in dataframe row order; empty
        when no id matches the prefix.
    """
    def _to_float(cell):
        # Cells carry an 's' marker (e.g. 's0.5'); drop it before parsing.
        return float(str(cell).replace('s', ''))

    id_prefix = f'{n:03d}_'
    selected = df[df['id'].str.startswith(id_prefix)]
    return [
        (_to_float(raw_x), _to_float(raw_y), _to_float(raw_deg))
        for raw_x, raw_y, raw_deg in zip(selected['x'], selected['y'], selected['deg'])
    ]

def get_bounding_box_side(trees):
    """Return the side length of the axis-aligned square enclosing all trees.

    Each tree's polygon bounds are collected; the result is the larger of the
    overall horizontal and vertical extents.

    Args:
        trees: Sequence of ``(x, y, deg)`` placements.

    Returns:
        The larger of the combined width and height, or ``float('inf')`` when
        ``trees`` is empty.
    """
    if not trees:
        return float('inf')

    # Each bounds tuple is (minx, miny, maxx, maxy).
    boxes = [get_tree_polygon(x, y, deg).bounds for x, y, deg in trees]
    min_xs, min_ys, max_xs, max_ys = zip(*boxes)

    span_x = max(max_xs) - min(min_xs)
    span_y = max(max_ys) - min(min_ys)
    return span_x if span_x >= span_y else span_y

def has_overlap(trees):
    """Report whether any two trees overlap; merely touching does not count.

    Candidate pairs come from an STRtree query and are then tested exactly:
    a pair counts as overlapping when the polygons intersect and do not
    merely touch. The query results are used as positions in the polygon
    list.

    Args:
        trees: Sequence of ``(x, y, deg)`` placements.

    Returns:
        True as soon as one overlapping pair is found, otherwise False.
        Inputs with fewer than two trees are always False.
    """
    if len(trees) < 2:
        return False

    shapes = [get_tree_polygon(x, y, deg) for x, y, deg in trees]
    spatial_index = STRtree(shapes)

    def _collides(idx):
        # True when shape `idx` overlaps any other shape returned by the index.
        shape = shapes[idx]
        for other_idx in spatial_index.query(shape):
            if other_idx == idx:
                continue
            other = shapes[other_idx]
            if shape.intersects(other) and not shape.touches(other):
                return True
        return False

    return any(_collides(idx) for idx in range(len(shapes)))

# Confirmation that the geometry constants and helper functions above were defined.
print("Functions defined successfully!")

Functions defined successfully!


In [2]:
# Find all CSV files in preoptimized directory
preopt_dir = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized'
# glob returns matches in filesystem-dependent order; sort them so the load
# order (and any later tie-breaking between equally good sources) is the same
# on every run.
csv_files = sorted(glob.glob(f'{preopt_dir}/**/*.csv', recursive=True))
print(f"Found {len(csv_files)} CSV files")

# A complete submission has N rows for every N in 1..200: 1 + 2 + ... + 200 = 20100.
expected_rows = sum(range(1, 201))
required_columns = {'id', 'x', 'y', 'deg'}

# Load all CSVs, keeping only the files that look like complete submissions
all_dfs = {}  # csv path -> submission dataframe
for csv_path in csv_files:
    file_name = os.path.basename(csv_path)
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest.
        print(f"  Error loading {csv_path}: {e}")
        continue

    if not required_columns.issubset(df.columns):
        print(f"  Skipped (missing columns): {file_name}")
    elif len(df) != expected_rows:
        print(f"  Skipped (wrong size): {file_name} ({len(df)} rows)")
    else:
        all_dfs[csv_path] = df
        print(f"  Loaded: {file_name} ({len(df)} rows)")

print(f"\nLoaded {len(all_dfs)} valid submission files")

Found 30 CSV files
  Loaded: ensemble.csv (20100 rows)
  Loaded: submission.csv (20100 rows)
  Loaded: santa-2025.csv (20100 rows)
  Loaded: best_ensemble.csv (20100 rows)
  Loaded: 72.49.csv (20100 rows)
  Loaded: 71.97.csv (20100 rows)
  Loaded: 72.49.csv (20100 rows)


  Loaded: 71.97.csv (20100 rows)
  Loaded: submission_JKoT4.csv (20100 rows)
  Loaded: New_Tree_144_196.csv (20100 rows)
  Loaded: submission_JKoT3.csv (20100 rows)
  Loaded: santa2025_ver2_v61.csv (20100 rows)
  Loaded: submission_JKoT2.csv (20100 rows)
  Loaded: santa2025_ver2_v67.csv (20100 rows)
  Loaded: santa2025_ver2_v76.csv (20100 rows)
  Loaded: submission_70_936673758122.csv (20100 rows)


  Loaded: santa2025_ver2_v65.csv (20100 rows)
  Loaded: submission_70_926149550346.csv (20100 rows)
  Loaded: santa2025_ver2_v66.csv (20100 rows)
  Loaded: santa2025_ver2_v63.csv (20100 rows)
  Loaded: santa2025_ver2_v69.csv (20100 rows)
  Loaded: submission_JKoT1.csv (20100 rows)
  Loaded: submission_opt1.csv (20100 rows)
  Loaded: santa2025_ver2_v68.csv (20100 rows)
  Loaded: santa-2025.csv (20100 rows)


  Loaded: submission.csv (20100 rows)
  Loaded: submission (77).csv (20100 rows)
  Loaded: submission.csv (20100 rows)
  Loaded: submission_sa.csv (20100 rows)
  Loaded: submission_best.csv (20100 rows)

Loaded 30 valid submission files


In [3]:
# For each N, find the best configuration across all sources
best_configs = {}  # n -> (side, source_path, trees)
best_scores = {}   # n -> score (side^2/n)

# NOTE(review): candidates are ranked by bounding-box side only; overlaps are
# not checked inside this loop.
for n in range(1, 201):
    best_side = float('inf')
    best_source = None
    best_trees = None

    for source_path, df in all_dfs.items():
        trees = load_trees_for_n(df, n)
        if len(trees) != n:  # Not a valid configuration for this N
            continue
        side = get_bounding_box_side(trees)
        # Strict '<' keeps the earliest source when several tie.
        if side < best_side:
            best_side = side
            best_source = source_path
            best_trees = trees

    if best_trees is None:
        # No source had exactly n trees for this N. Without this guard the
        # progress print below would fail on the missing score / None source.
        print(f"N={n}: WARNING - no valid configuration found in any source")
        continue

    best_configs[n] = (best_side, best_source, best_trees)
    best_scores[n] = best_side**2 / n

    # Progress report every 20th N
    if n % 20 == 0:
        print(f"N={n}: best_side={best_side:.6f}, score={best_scores[n]:.6f}, source={os.path.basename(best_source)}")

print("\nProcessed all N values")

N=20: best_side=2.742469, score=0.376057, source=ensemble.csv


N=40: best_side=3.806037, score=0.362148, source=ensemble.csv


N=60: best_side=4.629849, score=0.357258, source=ensemble.csv


N=80: best_side=5.252660, score=0.344881, source=ensemble.csv


N=100: best_side=5.878188, score=0.345531, source=ensemble.csv


N=120: best_side=6.365691, score=0.337684, source=ensemble.csv


N=140: best_side=6.900273, score=0.340098, source=ensemble.csv


N=160: best_side=7.369199, score=0.339407, source=ensemble.csv


N=180: best_side=7.718831, score=0.331002, source=ensemble.csv


N=200: best_side=8.218653, score=0.337731, source=ensemble.csv

Processed all N values


In [4]:
# Overall ensemble score: the sum of the per-N scores
total_score = sum(best_scores[n] for n in best_scores)
print(f"Total Ensemble Score: {total_score:.6f}")

# Gap to the baseline; a positive value means the ensemble total is lower
baseline_score = 70.676102
improvement = baseline_score - total_score
print(f"Baseline Score: {baseline_score:.6f}")
print(f"Improvement: {improvement:.6f}")

# Per-N breakdown for N = 1..20
print("\nPer-N scores (first 20):")
for n in range(1, 21):
    side, source = best_configs[n][:2]
    score = best_scores[n]
    print(f"  N={n:3d}: side={side:.6f}, score={score:.6f}, source={os.path.basename(source)}")

Total Ensemble Score: 70.676102
Baseline Score: 70.676102
Improvement: -0.000000

Per-N scores (first 20):
  N=  1: side=0.813173, score=0.661250, source=ensemble.csv
  N=  2: side=0.949504, score=0.450779, source=ensemble.csv
  N=  3: side=1.142031, score=0.434745, source=ensemble.csv
  N=  4: side=1.290806, score=0.416545, source=ensemble.csv
  N=  5: side=1.443692, score=0.416850, source=ensemble.csv
  N=  6: side=1.548438, score=0.399610, source=ensemble.csv
  N=  7: side=1.673104, score=0.399897, source=ensemble.csv
  N=  8: side=1.755921, score=0.385407, source=ensemble.csv
  N=  9: side=1.867280, score=0.387415, source=ensemble.csv
  N= 10: side=1.940696, score=0.376630, source=ensemble.csv
  N= 11: side=2.033002, score=0.375736, source=ensemble.csv
  N= 12: side=2.114873, score=0.372724, source=ensemble.csv
  N= 13: side=2.200046, score=0.372323, source=ensemble.csv
  N= 14: side=2.277711, score=0.370569, source=ensemble.csv
  N= 15: side=2.384962, score=0.379203, source=ensemb

In [5]:
# Build the ensemble submission: one row per tree, ids of the form NNN_i,
# with every numeric value written back with an 's' prefix.
ensemble_rows = [
    {
        'id': f'{n:03d}_{i}',
        'x': f's{x}',
        'y': f's{y}',
        'deg': f's{deg}',
    }
    for n in range(1, 201)
    for i, (x, y, deg) in enumerate(best_configs[n][2])
]

ensemble_df = pd.DataFrame(ensemble_rows)
print(f"Created ensemble submission with {len(ensemble_df)} rows")
print(ensemble_df.head())

Created ensemble submission with 20100 rows
      id                      x                      y                  deg
0  001_0   s-48.196086194214246    s58.770984615214225                s45.0
1  002_0    s0.1540970696213559  s-0.03854074269479465  s203.62937773065684
2  002_1  s-0.15409706962137285   s-0.5614592573052241  s23.629377730656792
3  003_0     s1.123655816140301     s0.781101815992563    s111.125132292893
4  003_1      s1.23405569584216     s1.275999500663759     s66.370622269343


In [6]:
# Spot-check a handful of N values for overlapping trees
print("Checking for overlaps (sample)...")
sample_ns = [1, 10, 50, 100, 150, 200]
overlap_found = False
for n in sample_ns:
    trees = best_configs[n][2]
    overlapping = has_overlap(trees)
    print(f"  N={n}: OVERLAP DETECTED!" if overlapping else f"  N={n}: OK")
    overlap_found = overlap_found or overlapping

if not overlap_found:
    print("\nNo overlaps detected in sampled configurations.")

Checking for overlaps (sample)...
  N=1: OK
  N=10: OK
  N=50: OK
  N=100: OK
  N=150: OK
  N=200: OK

No overlaps detected in sampled configurations.


In [None]:
# Save the ensemble submission to both output locations
output_paths = [
    '/home/submission/submission.csv',
    '/home/code/experiments/002_full_ensemble/submission.csv',
]
for output_path in output_paths:
    # to_csv does not create missing parent directories, so make sure the
    # target folder exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    ensemble_df.to_csv(output_path, index=False)
print("Saved ensemble submission")

print(f"\n=== ENSEMBLE SCORE: {total_score:.6f} ===")
print(f"=== IMPROVEMENT OVER BASELINE: {improvement:.6f} ===")