# Ensemble Best Per-N Solutions

Combine the best solution for each N value from all available snapshots.
This is the key technique from the top-scoring kernel.

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
from shapely.ops import unary_union
import os
import json
from collections import defaultdict

# Tree polygon vertices.
# TX[i] / TY[i] are the x / y coordinates of vertex i. The outline starts at the
# topmost point, runs down the positive-x side to the lowest edge (y = -0.2),
# and comes back up the negative-x side.
TX = [
    0,                  # top point
    0.125, 0.0625,      # y = 0.5 step, positive-x side
    0.2, 0.1,           # y = 0.25 step, positive-x side
    0.35, 0.075,        # y = 0 step, positive-x side (widest point)
    0.075, -0.075,      # lowest edge
    -0.075, -0.35,      # y = 0 step, negative-x side
    -0.1, -0.2,         # y = 0.25 step, negative-x side
    -0.0625, -0.125,    # y = 0.5 step, negative-x side
]
TY = [
    0.8,
    0.5, 0.5,
    0.25, 0.25,
    0, 0,
    -0.2, -0.2,
    0, 0,
    0.25, 0.25,
    0.5, 0.5,
]

def create_tree_polygon(x, y, angle):
    """Return the tree outline placed at (x, y).

    The outline built from the TX/TY vertex lists is first rotated by
    `angle` (degrees) about the point (0, 0) and then shifted by (x, y).
    """
    outline = Polygon(zip(TX, TY))
    rotated = affinity.rotate(outline, angle, origin=(0, 0))
    return affinity.translate(rotated, x, y)

def parse_submission(df):
    """Group the rows of a submission frame by their N value.

    Args:
        df: DataFrame with columns 'id', 'x', 'y' and 'deg'. The integer
            before the first '_' in 'id' is taken as N. The other three
            columns hold numbers, optionally written with a leading 's'.

    Returns:
        dict mapping N -> list of (x, y, angle) float tuples, in row order.
    """
    def _without_marker(text):
        # Drop a leading 's' so the remainder can be parsed as a float.
        return text[1:] if text.startswith('s') else text

    grouped = {}
    for _, record in df.iterrows():
        group_n = int(record['id'].split('_')[0])
        raw_fields = [str(record[column]) for column in ('x', 'y', 'deg')]
        placement = tuple(float(_without_marker(field)) for field in raw_fields)
        grouped.setdefault(group_n, []).append(placement)
    return grouped

def calculate_side(trees):
    """Return the longer side of the axis-aligned bounding box around all trees.

    `trees` is an iterable of (x, y, angle) placements; each one is turned
    into a polygon and the bounds of their union are measured.
    """
    merged = unary_union([create_tree_polygon(*placement) for placement in trees])
    box = merged.bounds  # (minx, miny, maxx, maxy)
    width = box[2] - box[0]
    height = box[3] - box[1]
    return max(width, height)

def calculate_score_for_n(trees, n):
    """Return the score term for one group: squared bounding side divided by n."""
    return calculate_side(trees) ** 2 / n

def check_overlaps(trees):
    """Return True if any two trees share an area larger than 1e-15.

    Every unordered pair of placements is tested once. Pairs that merely
    touch (intersection area at or below the threshold) are not reported.
    """
    shapes = [create_tree_polygon(*placement) for placement in trees]
    for position, first in enumerate(shapes):
        for second in shapes[position + 1:]:
            # Cheap predicate first; the intersection is only built when needed.
            if first.intersects(second) and first.intersection(second).area > 1e-15:
                return True
    return False

# Visible confirmation that the helper definitions in this cell have run.
print("Functions defined!")

Functions defined!


In [2]:
# Enumerate the snapshot directories (sorted by name) and set up the per-N table
snapshot_base = '/home/nonroot/snapshots/santa-2025/'
snapshot_dirs = os.listdir(snapshot_base)
snapshot_dirs.sort()
print(f"Found {len(snapshot_dirs)} snapshot directories")

# One record per N in 1..200. The infinite starting score means the first
# candidate seen for an N always replaces the placeholder.
best_per_n = dict(
    (n, {'score': float('inf'), 'trees': None, 'source': None})
    for n in range(1, 201)
)

Found 114 snapshot directories


In [3]:
# Process each snapshot, keeping the lowest-scoring layout seen for every N.
# Candidates are NOT checked for overlaps here; that validation is done by a
# later cell on the selected layouts.
processed = 0
errors = 0
malformed_groups = 0  # groups whose tree count does not equal their N

for snap_dir in snapshot_dirs:
    sub_path = os.path.join(snapshot_base, snap_dir, 'submission', 'submission.csv')
    if not os.path.exists(sub_path):
        continue

    try:
        df = pd.read_csv(sub_path)
        solutions = parse_submission(df)

        # Check each N
        for n in range(1, 201):
            if n not in solutions:
                continue

            trees = solutions[n]

            # side**2 / n is only meaningful for exactly n trees. A group with
            # missing rows would get a spuriously small box and win unfairly.
            if len(trees) != n:
                malformed_groups += 1
                continue

            score = calculate_score_for_n(trees, n)

            # Update if better
            if score < best_per_n[n]['score']:
                best_per_n[n]['score'] = score
                best_per_n[n]['trees'] = trees
                best_per_n[n]['source'] = snap_dir

        processed += 1
        if processed % 20 == 0:
            print(f"Processed {processed} snapshots...")

    except Exception as e:
        # Best-effort scan: an unreadable snapshot is skipped, and only the
        # first few failures are printed to keep the output short.
        errors += 1
        if errors <= 5:
            print(f"Error with {snap_dir}: {e}")

print(f"\nProcessed {processed} snapshots, {errors} errors")
if malformed_groups:
    print(f"Skipped {malformed_groups} groups whose tree count did not match N")

Processed 20 snapshots...


Error with 21145963314: 'deg'


Processed 40 snapshots...


Processed 60 snapshots...


Processed 80 snapshots...



Processed 87 snapshots, 1 errors


In [4]:
# Reference value the ensemble is compared against
baseline_score = 70.572798  # From exp_001

# Total the per-N scores in ascending N and measure the gap to the reference
per_n_scores = [best_per_n[n]['score'] for n in range(1, 201)]
total_score = sum(per_n_scores)
improvement = baseline_score - total_score

print(f"Ensemble total score: {total_score:.6f}")
print(f"Baseline score: {baseline_score:.6f}")
print(f"Improvement: {improvement:.6f}")

Ensemble total score: 70.522682
Baseline score: 70.572798
Improvement: 0.050116


In [5]:
# List the winning snapshot for the first 20 N values
print("\nBest sources per N (first 20):")
for n in range(1, 21):
    entry = best_per_n[n]
    print(f"  N={n}: score={entry['score']:.6f} from {entry['source']}")

# Distinct snapshots that contributed at least one N
sources = {best_per_n[n]['source'] for n in range(1, 201)}
print(f"\nTotal unique sources used: {len(sources)}")


Best sources per N (first 20):
  N=1: score=0.661250 from 21104669204
  N=2: score=0.437328 from 21145966992
  N=3: score=0.434745 from 21322576827
  N=4: score=0.411056 from 21145966992
  N=5: score=0.394109 from 21145966992
  N=6: score=0.399610 from 21191209482
  N=7: score=0.399897 from 21165874980
  N=8: score=0.385407 from 21329069570
  N=9: score=0.387415 from 21331543270
  N=10: score=0.376630 from 21322576451
  N=11: score=0.374924 from 21322576827
  N=12: score=0.372724 from 21191209482
  N=13: score=0.372294 from 21329069570
  N=14: score=0.369543 from 21191209482
  N=15: score=0.376949 from 21331543270
  N=16: score=0.373894 from 21145966992
  N=17: score=0.370040 from 21121943993
  N=18: score=0.368771 from 21116303805
  N=19: score=0.368615 from 21322576827
  N=20: score=0.376057 from 21191209482

Total unique sources used: 14


In [6]:
# Count how many selected layouts fail the overlap check (only the first 10 are listed)
print("Validating ensemble solution for overlaps...")
overlap_count = 0
for n in range(1, 201):
    if not check_overlaps(best_per_n[n]['trees']):
        continue
    overlap_count += 1
    if overlap_count <= 10:
        print(f"  N={n} has overlaps!")

print(f"\nTotal N values with overlaps: {overlap_count}")

Validating ensemble solution for overlaps...
  N=2 has overlaps!
  N=4 has overlaps!
  N=5 has overlaps!
  N=16 has overlaps!
  N=29 has overlaps!
  N=35 has overlaps!
  N=40 has overlaps!
  N=46 has overlaps!


  N=47 has overlaps!


  N=48 has overlaps!



Total N values with overlaps: 73


In [7]:
# Any N whose selected layout overlaps is swapped for the baseline layout.
# NOTE: this path is the same file the final validation cell writes to, so a
# second run of the notebook reads the previous run's output as its baseline.
baseline_path = '/home/submission/submission.csv'
df_baseline = pd.read_csv(baseline_path)
baseline_solutions = parse_submission(df_baseline)

if overlap_count > 0:
    print("Replacing overlapping N values with baseline...")
    for n in range(1, 201):
        entry = best_per_n[n]
        if not check_overlaps(entry['trees']):
            continue
        fallback = baseline_solutions[n]
        entry['trees'] = fallback
        entry['score'] = calculate_score_for_n(fallback, n)
        entry['source'] = 'baseline'

    # The total has to be refreshed because some per-N scores just changed
    total_score = sum([best_per_n[n]['score'] for n in range(1, 201)])
    print(f"New total score after fixing overlaps: {total_score:.6f}")

Replacing overlapping N values with baseline...


New total score after fixing overlaps: 70.615106


In [8]:
# Create submission dataframe
def create_submission_df(best_per_n):
    """Flatten the per-N table into a submission-format DataFrame.

    Args:
        best_per_n: mapping that has, for every N in 1..200, a record whose
            'trees' entry is a sequence of (x, y, angle) tuples.

    Returns:
        DataFrame with columns 'id', 'x', 'y', 'deg'. 'id' is the zero-padded
        N, an underscore and the tree index; the numeric fields are written
        as strings with a leading 's'.
    """
    records = [
        {'id': f'{n:03d}_{idx}', 'x': f's{x}', 'y': f's{y}', 'deg': f's{angle}'}
        for n in range(1, 201)
        for idx, (x, y, angle) in enumerate(best_per_n[n]['trees'])
    ]
    return pd.DataFrame(records)

# Materialise the current per-N selection as a submission-format frame.
df_ensemble = create_submission_df(best_per_n)
print(f"Created submission with {len(df_ensemble)} rows")
# Preview the first rows so the id / 's'-prefixed value formatting can be checked by eye.
print(df_ensemble.head())

Created submission with 20100 rows
      id                     x                      y                  deg
0  001_0    s43.59119209210215    s-31.78326706874178   s44.99999999999998
1  002_0   s0.1540970696213643  s-0.03854074269478543  s203.62937773065684
2  002_1  s-0.1540970696213643   s-0.5614592573052146  s23.629377730656792
3  003_0    s0.254937643697833    s-0.233436061549416  s113.56326044172948
4  003_1    s0.357722754471247     s0.250360566787394     s66.370622269343


In [9]:
# Final validation: re-check every N and write the file only if none overlap
print("\nFinal validation...")
final_overlap_count = 0
for n in range(1, 201):
    if not check_overlaps(best_per_n[n]['trees']):
        continue
    final_overlap_count += 1
    print(f"  N={n} STILL has overlaps!")

if final_overlap_count != 0:
    print(f"❌ {final_overlap_count} N values still have overlaps!")
else:
    print("✅ No overlaps detected!")

    # Save submission
    df_ensemble.to_csv('/home/submission/submission.csv', index=False)
    print("Saved to /home/submission/submission.csv")

    # Verify: read the file back and rescore it from scratch
    df_verify = pd.read_csv('/home/submission/submission.csv')
    sol_verify = parse_submission(df_verify)
    verify_score = sum([calculate_score_for_n(sol_verify[n], n) for n in range(1, 201)])
    print(f"Verified score: {verify_score:.6f}")


Final validation...


✅ No overlaps detected!
Saved to /home/submission/submission.csv


Verified score: 70.615106


In [None]:
# Save metrics
# `improvement` was first computed before the overlap fallback replaced some
# layouts and updated `total_score`. Recompute it here so the saved
# improvement and cv_score describe the same ensemble.
improvement = baseline_score - total_score

metrics = {
    'cv_score': total_score,
    'baseline_score': baseline_score,
    'improvement': improvement,
    'unique_sources': len(sources),
    'overlap_count': final_overlap_count
}

metrics_path = '/home/code/experiments/002_ensemble_best_per_n/metrics.json'
# Create the experiment directory if needed so open() cannot fail on a missing folder.
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=2)

print(f"\nSaved metrics. CV Score: {total_score:.6f}")

In [None]:
# The issue is that many of the best per-N solutions have overlaps
# Let's try a different approach: only accept improvements that pass overlap validation

# Reload baseline
baseline_path = '/home/nonroot/snapshots/santa-2025/21145966992/submission/submission.csv'
df_baseline = pd.read_csv(baseline_path)
baseline_solutions = parse_submission(df_baseline)

# Calculate baseline per-N scores
baseline_per_n = {n: calculate_score_for_n(baseline_solutions[n], n) for n in range(1, 201)}
baseline_total = sum(baseline_per_n.values())
print(f"Baseline total: {baseline_total:.6f}")

# The baseline is not guaranteed to pass check_overlaps itself: the earlier
# validation output suggests some of this snapshot's layouts are flagged.
# An overlapping baseline layout with a low score could never be displaced,
# so such entries start with an infinite score and any valid candidate wins.
baseline_overlap_ns = [n for n in range(1, 201) if check_overlaps(baseline_solutions[n])]
if baseline_overlap_ns:
    print(f"Warning: baseline has overlaps for {len(baseline_overlap_ns)} N values; "
          f"any valid snapshot layout may replace them")
baseline_invalid = set(baseline_overlap_ns)

# Start with baseline and only accept valid improvements.
# 'trees' keeps the baseline layout as a placeholder even when it is invalid;
# an infinite 'score' afterwards means no valid layout was found for that N.
best_valid_per_n = {
    n: {
        'score': float('inf') if n in baseline_invalid else baseline_per_n[n],
        'trees': baseline_solutions[n],
        'source': 'baseline',
    }
    for n in range(1, 201)
}

In [None]:
# Process each snapshot and only accept valid improvements
improvements_found = 0
improvements_rejected = 0
snapshot_errors = 0  # snapshots that could not be read or parsed

for snap_dir in snapshot_dirs:
    sub_path = os.path.join(snapshot_base, snap_dir, 'submission', 'submission.csv')
    if not os.path.exists(sub_path):
        continue

    try:
        df = pd.read_csv(sub_path)
        solutions = parse_submission(df)

        # Check each N
        for n in range(1, 201):
            if n not in solutions:
                continue

            trees = solutions[n]

            # A group with the wrong number of trees is not a layout for this N;
            # scoring it would give a misleadingly small box.
            if len(trees) != n:
                continue

            score = calculate_score_for_n(trees, n)

            # Only accept if better AND no overlaps. The 1e-10 margin ignores
            # ties and float noise; the costly overlap test runs only for
            # candidates that would actually improve the score.
            if score < best_valid_per_n[n]['score'] - 1e-10:
                if not check_overlaps(trees):
                    best_valid_per_n[n]['score'] = score
                    best_valid_per_n[n]['trees'] = trees
                    best_valid_per_n[n]['source'] = snap_dir
                    improvements_found += 1
                else:
                    improvements_rejected += 1

    except Exception as e:
        # Best-effort scan: skip a broken snapshot, but report it instead of
        # hiding the failure (only the first few are printed).
        snapshot_errors += 1
        if snapshot_errors <= 5:
            print(f"Error with {snap_dir}: {e}")

print(f"Valid improvements found: {improvements_found}")
print(f"Improvements rejected (overlaps): {improvements_rejected}")
if snapshot_errors:
    print(f"Snapshots skipped due to errors: {snapshot_errors}")