# Experiment 002: Ensemble Best-of-N from Multiple Sources

Goal: Create an ensemble by taking the best configuration for each N from all available overlap-free solutions.

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely.strtree import STRtree
from shapely import affinity
import os
import warnings
warnings.filterwarnings('ignore')

# Tree geometry
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]
TREE_VERTICES = list(zip(TX, TY))

def parse_s_value(s):
    if isinstance(s, str) and s.startswith('s'):
        return float(s[1:])
    return float(s)

def format_s_value(v):
    return f's{v}'

def create_tree_polygon(x, y, deg):
    poly = Polygon(TREE_VERTICES)
    poly = affinity.rotate(poly, deg, origin=(0, 0))
    poly = affinity.translate(poly, x, y)
    return poly

def get_bounding_box_side(polygons):
    if not polygons:
        return 0
    all_coords = []
    for poly in polygons:
        all_coords.extend(list(poly.exterior.coords))
    xs = [c[0] for c in all_coords]
    ys = [c[1] for c in all_coords]
    return max(max(xs) - min(xs), max(ys) - min(ys))

def has_overlap(polygons):
    if len(polygons) < 2:
        return False
    tree_index = STRtree(polygons)
    for i, poly in enumerate(polygons):
        indices = tree_index.query(poly)
        for idx in indices:
            if idx != i:
                if poly.intersects(polygons[idx]) and not poly.touches(polygons[idx]):
                    intersection = poly.intersection(polygons[idx])
                    if intersection.area > 1e-10:
                        return True
    return False

print("Helper functions defined")

Helper functions defined


In [2]:
# List all available CSV files
base_path = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized'

csv_files = []
for root, dirs, files in os.walk(base_path):
    for f in files:
        if f.endswith('.csv'):
            csv_files.append(os.path.join(root, f))

print(f"Found {len(csv_files)} CSV files:")
for f in csv_files:
    print(f"  {f}")

Found 30 CSV files:
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/ensemble.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/submission.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa-2025.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/best_ensemble.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/72.49.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/71.97.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/telegram_extracted/72.49.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/telegram_extracted/71.97.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public/submission_JKoT4.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public/New_Tree_144_196.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimi

In [3]:
def calculate_score_per_n(df):
    """Calculate score for each N and return dict of {n: {'side': side, 'score': score}}"""
    scores = {}
    for n in range(1, 201):
        prefix = f'{n:03d}_'
        group = df[df['id'].str.startswith(prefix)]
        if len(group) == 0:
            continue
        
        polygons = []
        for _, row in group.iterrows():
            poly = create_tree_polygon(row['x_val'], row['y_val'], row['deg_val'])
            polygons.append(poly)
        
        side = get_bounding_box_side(polygons)
        scores[n] = {'side': side, 'score': side**2 / n}
    return scores

def check_overlaps_per_n(df):
    """Check overlaps for each N and return list of N values with overlaps"""
    overlaps = []
    for n in range(1, 201):
        prefix = f'{n:03d}_'
        group = df[df['id'].str.startswith(prefix)]
        if len(group) == 0:
            continue
        
        polygons = []
        for _, row in group.iterrows():
            poly = create_tree_polygon(row['x_val'], row['y_val'], row['deg_val'])
            polygons.append(poly)
        
        if has_overlap(polygons):
            overlaps.append(n)
    return overlaps

print("Score and overlap functions defined")

Score and overlap functions defined


In [4]:
# Load and analyze each CSV file
all_solutions = {}

for csv_path in csv_files:
    try:
        df = pd.read_csv(csv_path)
        if 'id' not in df.columns or 'x' not in df.columns:
            print(f"Skipping {csv_path} - missing columns")
            continue
        
        # Parse values
        df['x_val'] = df['x'].apply(parse_s_value)
        df['y_val'] = df['y'].apply(parse_s_value)
        df['deg_val'] = df['deg'].apply(parse_s_value)
        
        # Calculate total score
        scores = calculate_score_per_n(df)
        total_score = sum(s['score'] for s in scores.values())
        
        all_solutions[csv_path] = {
            'df': df,
            'scores': scores,
            'total_score': total_score
        }
        print(f"{os.path.basename(csv_path)}: score={total_score:.6f}")
    except Exception as e:
        print(f"Error loading {csv_path}: {e}")

print(f"\nLoaded {len(all_solutions)} solutions")

ensemble.csv: score=70.676102


submission.csv: score=70.676501


santa-2025.csv: score=70.676102


best_ensemble.csv: score=70.676102


72.49.csv: score=72.495739


71.97.csv: score=71.972027


72.49.csv: score=72.495739


71.97.csv: score=71.972027


submission_JKoT4.csv: score=72.489504


New_Tree_144_196.csv: score=72.927920


submission_JKoT3.csv: score=72.489488


santa2025_ver2_v61.csv: score=72.951925


submission_JKoT2.csv: score=72.489348


santa2025_ver2_v67.csv: score=72.938567


santa2025_ver2_v76.csv: score=72.826444


submission_70_936673758122.csv: score=70.936674


santa2025_ver2_v65.csv: score=72.935294


submission_70_926149550346.csv: score=70.926150


santa2025_ver2_v66.csv: score=72.938599


santa2025_ver2_v63.csv: score=72.947427


santa2025_ver2_v69.csv: score=72.850110


submission_JKoT1.csv: score=72.489483


submission_opt1.csv: score=70.990692


santa2025_ver2_v68.csv: score=72.939233


santa-2025.csv: score=70.676102


submission.csv: score=70.676501


submission (77).csv: score=72.135010


submission.csv: score=72.935294


submission_sa.csv: score=72.935294


submission_best.csv: score=70.926150

Loaded 30 solutions


In [5]:
# Sort solutions by score
sorted_solutions = sorted(all_solutions.items(), key=lambda x: x[1]['total_score'])
print("Solutions sorted by score (best first):")
for path, data in sorted_solutions[:10]:
    print(f"  {os.path.basename(path)}: {data['total_score']:.6f}")

Solutions sorted by score (best first):
  ensemble.csv: 70.676102
  santa-2025.csv: 70.676102
  best_ensemble.csv: 70.676102
  santa-2025.csv: 70.676102
  submission.csv: 70.676501
  submission.csv: 70.676501
  submission_70_926149550346.csv: 70.926150
  submission_best.csv: 70.926150
  submission_70_936673758122.csv: 70.936674
  submission_opt1.csv: 70.990692


In [6]:
# Check overlaps for top solutions
print("Checking overlaps for top 5 solutions...")
for path, data in sorted_solutions[:5]:
    overlaps = check_overlaps_per_n(data['df'])
    if overlaps:
        print(f"  {os.path.basename(path)}: OVERLAPS in N={overlaps}")
    else:
        print(f"  {os.path.basename(path)}: NO OVERLAPS ✓")

Checking overlaps for top 5 solutions...


  ensemble.csv: NO OVERLAPS ✓


  santa-2025.csv: NO OVERLAPS ✓


  best_ensemble.csv: NO OVERLAPS ✓


  santa-2025.csv: NO OVERLAPS ✓


  submission.csv: NO OVERLAPS ✓


In [7]:
# Create ensemble: for each N, take the best configuration from any overlap-free solution
print("Creating ensemble from best configurations...")

# First, identify overlap-free solutions
overlap_free_solutions = []
for path, data in sorted_solutions:
    overlaps = check_overlaps_per_n(data['df'])
    if not overlaps:
        overlap_free_solutions.append((path, data))
        print(f"  Overlap-free: {os.path.basename(path)} (score={data['total_score']:.6f})")

print(f"\nFound {len(overlap_free_solutions)} overlap-free solutions")

Creating ensemble from best configurations...


  Overlap-free: ensemble.csv (score=70.676102)


  Overlap-free: santa-2025.csv (score=70.676102)


  Overlap-free: best_ensemble.csv (score=70.676102)


  Overlap-free: santa-2025.csv (score=70.676102)


  Overlap-free: submission.csv (score=70.676501)


  Overlap-free: submission.csv (score=70.676501)


  Overlap-free: submission_70_926149550346.csv (score=70.926150)


  Overlap-free: submission_best.csv (score=70.926150)


  Overlap-free: submission_70_936673758122.csv (score=70.936674)


  Overlap-free: submission_opt1.csv (score=70.990692)


  Overlap-free: 71.97.csv (score=71.972027)


  Overlap-free: 71.97.csv (score=71.972027)


  Overlap-free: submission (77).csv (score=72.135010)


  Overlap-free: submission_JKoT2.csv (score=72.489348)


  Overlap-free: submission_JKoT1.csv (score=72.489483)


  Overlap-free: submission_JKoT3.csv (score=72.489488)


  Overlap-free: submission_JKoT4.csv (score=72.489504)


  Overlap-free: 72.49.csv (score=72.495739)


  Overlap-free: 72.49.csv (score=72.495739)


  Overlap-free: santa2025_ver2_v76.csv (score=72.826444)


  Overlap-free: santa2025_ver2_v69.csv (score=72.850110)


  Overlap-free: New_Tree_144_196.csv (score=72.927920)


  Overlap-free: santa2025_ver2_v65.csv (score=72.935294)


  Overlap-free: submission.csv (score=72.935294)


  Overlap-free: submission_sa.csv (score=72.935294)


  Overlap-free: santa2025_ver2_v67.csv (score=72.938567)


  Overlap-free: santa2025_ver2_v66.csv (score=72.938599)


  Overlap-free: santa2025_ver2_v68.csv (score=72.939233)


  Overlap-free: santa2025_ver2_v63.csv (score=72.947427)


  Overlap-free: santa2025_ver2_v61.csv (score=72.951925)

Found 30 overlap-free solutions


In [8]:
# For each N, find the best configuration across all overlap-free solutions
best_per_n = {}

for n in range(1, 201):
    best_score = float('inf')
    best_source = None
    best_rows = None
    
    for path, data in overlap_free_solutions:
        if n in data['scores']:
            score = data['scores'][n]['score']
            if score < best_score:
                best_score = score
                best_source = path
                prefix = f'{n:03d}_'
                best_rows = data['df'][data['df']['id'].str.startswith(prefix)].copy()
    
    if best_rows is not None:
        best_per_n[n] = {
            'score': best_score,
            'source': best_source,
            'rows': best_rows
        }

print(f"Found best configurations for {len(best_per_n)} N values")

# Show which sources contribute to the ensemble
source_counts = {}
for n, data in best_per_n.items():
    source = os.path.basename(data['source'])
    source_counts[source] = source_counts.get(source, 0) + 1

print("\nSource contributions:")
for source, count in sorted(source_counts.items(), key=lambda x: -x[1]):
    print(f"  {source}: {count} configurations")

Found best configurations for 200 N values

Source contributions:
  ensemble.csv: 200 configurations


In [9]:
# Create the ensemble submission
ensemble_rows = []
for n in range(1, 201):
    if n in best_per_n:
        ensemble_rows.append(best_per_n[n]['rows'])

ensemble_df = pd.concat(ensemble_rows, ignore_index=True)
print(f"Ensemble has {len(ensemble_df)} rows")

# Calculate ensemble score
ensemble_scores = calculate_score_per_n(ensemble_df)
ensemble_total = sum(s['score'] for s in ensemble_scores.values())
print(f"\nEnsemble total score: {ensemble_total:.6f}")
print(f"Best single solution: {sorted_solutions[0][1]['total_score']:.6f}")
print(f"Improvement: {sorted_solutions[0][1]['total_score'] - ensemble_total:.6f}")

Ensemble has 20100 rows



Ensemble total score: 70.676102
Best single solution: 70.676102
Improvement: 0.000000


In [10]:
# Validate ensemble has no overlaps
print("Validating ensemble for overlaps...")
ensemble_overlaps = check_overlaps_per_n(ensemble_df)
if ensemble_overlaps:
    print(f"WARNING: Ensemble has overlaps in N={ensemble_overlaps}")
else:
    print("Ensemble has NO OVERLAPS ✓")

Validating ensemble for overlaps...


Ensemble has NO OVERLAPS ✓


In [11]:
# Save ensemble submission
def create_submission(df, output_path):
    submission = df[['id']].copy()
    submission['x'] = df['x'].apply(lambda x: x if isinstance(x, str) and x.startswith('s') else format_s_value(df.loc[df['x']==x, 'x_val'].iloc[0]))
    submission['y'] = df['y'].apply(lambda x: x if isinstance(x, str) and x.startswith('s') else format_s_value(df.loc[df['y']==x, 'y_val'].iloc[0]))
    submission['deg'] = df['deg'].apply(lambda x: x if isinstance(x, str) and x.startswith('s') else format_s_value(df.loc[df['deg']==x, 'deg_val'].iloc[0]))
    submission.to_csv(output_path, index=False)
    print(f"Saved submission to {output_path}")
    return submission

# Use original string values to preserve precision
ensemble_submission = ensemble_df[['id', 'x', 'y', 'deg']].copy()
ensemble_submission.to_csv('/home/submission/submission.csv', index=False)
print(f"Saved ensemble to /home/submission/submission.csv")
print(ensemble_submission.head())

Saved ensemble to /home/submission/submission.csv
      id                       x                       y  \
0  001_0    s-48.196086194214246     s58.770984615214225   
1  002_0   s0.154097069621355887  s-0.038540742694794648   
2  002_1  s-0.154097069621372845  s-0.561459257305224058   
3  003_0      s1.123655816140301      s0.781101815992563   
4  003_1       s1.23405569584216      s1.275999500663759   

                       deg  
0                    s45.0  
1  s203.629377730656841550  
2   s23.629377730656791812  
3        s111.125132292893  
4         s66.370622269343  


In [12]:
# Summary
print("="*50)
print("EXPERIMENT 002 SUMMARY")
print("="*50)
print(f"Best single solution score: {sorted_solutions[0][1]['total_score']:.6f}")
print(f"Ensemble score: {ensemble_total:.6f}")
print(f"Improvement: {sorted_solutions[0][1]['total_score'] - ensemble_total:.6f}")
print(f"Target score: 68.919154")
print(f"Gap to target: {ensemble_total - 68.919154:.6f}")
print(f"Overlaps: {len(ensemble_overlaps)}")
print("="*50)

EXPERIMENT 002 SUMMARY
Best single solution score: 70.676102
Ensemble score: 70.676102
Improvement: 0.000000
Target score: 68.919154
Gap to target: 1.756948
Overlaps: 0
