# Find Best Baseline Submission

Evaluate multiple snapshot submissions and select the one with the lowest total score as the baseline.

In [None]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
from shapely.ops import unary_union
import os
import glob

# Tree polygon vertices in the tree's local frame (before any rotation or
# translation). TX[i] and TY[i] are the x and y coordinates of the i-th
# vertex; the two lists are paired up in order when the polygon is built.
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

def create_tree_polygon(x, y, angle):
    """Return the tree outline rotated and then moved into place.

    The outline is built from the module-level TX/TY vertex lists, rotated
    by ``angle`` (degrees) about the local origin (0, 0), and finally
    shifted by ``x`` along the x axis and ``y`` along the y axis.
    """
    outline = Polygon(zip(TX, TY))
    # Rotate first, about the outline's own origin, so that the subsequent
    # shift positions the already-oriented shape.
    rotated = affinity.rotate(outline, angle, origin=(0, 0))
    return affinity.translate(rotated, xoff=x, yoff=y)

def parse_submission(df):
    """Parse a submission dataframe into a dict of n -> list of (x, y, angle).

    Args:
        df: Dataframe with columns ``id``, ``x``, ``y`` and ``deg``. The part
            of ``id`` before the first underscore is read as the integer n.
            Coordinate and angle cells are normally strings carrying an
            ``'s'`` prefix (e.g. ``"s0.5"``); cells without the prefix, or
            already numeric, are accepted as well.

    Returns:
        Dict mapping each n to its list of ``(x, y, angle)`` float tuples, in
        row order.

    Raises:
        ValueError: If an id prefix or a value cannot be converted to a number.
    """
    def _to_float(value):
        # Drop the 's' prefix only when it is really there: slicing off the
        # first character unconditionally would turn "1.5" into ".5".
        text = str(value).strip()
        if text.startswith('s'):
            text = text[1:]
        return float(text)

    solutions = {}
    # Iterating the columns in lockstep avoids building a Series per row.
    for row_id, x, y, deg in zip(df['id'], df['x'], df['y'], df['deg']):
        n = int(str(row_id).split('_')[0])
        solutions.setdefault(n, []).append(
            (_to_float(x), _to_float(y), _to_float(deg))
        )
    return solutions

def calculate_score(solutions):
    """Score every N from 1 to 200 and return the total with a breakdown.

    For each n present in ``solutions`` the trees are turned into polygons,
    merged, and the larger side of the merged shape's axis-aligned bounding
    box is taken; that n contributes ``side ** 2 / n``. Any n absent from
    ``solutions`` is reported on stdout and contributes nothing.

    Returns:
        Tuple ``(total, per_n_scores)`` where ``per_n_scores`` maps each
        scored n to its contribution.
    """
    per_n_scores = {}
    total = 0
    for n in range(1, 201):
        if n in solutions:
            shapes = [create_tree_polygon(*placement) for placement in solutions[n]]
            extent = unary_union(shapes).bounds  # (minx, miny, maxx, maxy)
            width = extent[2] - extent[0]
            height = extent[3] - extent[1]
            contribution = (max(width, height) ** 2) / n
            per_n_scores[n] = contribution
            total += contribution
        else:
            print(f"Missing N={n}!")
    return total, per_n_scores

# Visible marker that this cell ran through to the end.
print("Functions defined successfully!")

In [None]:
# Locate every entry under the snapshot base path, in sorted-name order
snapshot_base = '/home/nonroot/snapshots/santa-2025/'
snapshot_dirs = os.listdir(snapshot_base)
snapshot_dirs.sort()
print(f"Found {len(snapshot_dirs)} snapshot directories")

# Only the last 10 entries of the sorted listing are evaluated.
# NOTE(review): names are compared as strings, so "last" means "most recent"
# only if the directory names happen to order chronologically - confirm.
test_dirs = snapshot_dirs[-10:]
print(f"Testing: {test_dirs}")

In [None]:
# Evaluate each candidate snapshot; only complete, scorable submissions are
# recorded in `results` as (snapshot name, total score, per-N scores).
results = []
for snap_dir in test_dirs:
    sub_path = os.path.join(snapshot_base, snap_dir, 'submission', 'submission.csv')
    if not os.path.exists(sub_path):
        print(f"{snap_dir}: No submission file")
        continue
    try:
        df = pd.read_csv(sub_path)
        solutions = parse_submission(df)
        score, per_n = calculate_score(solutions)
    except Exception as e:
        # Best-effort scan: one unreadable snapshot must not stop the others.
        print(f"{snap_dir}: ERROR - {e}")
        continue
    # calculate_score skips any N that is missing, which lowers the total.
    # Such a partial total would look "better" than every complete
    # submission, so incomplete snapshots are excluded from the comparison.
    if len(per_n) < 200:
        print(f"{snap_dir}: INCOMPLETE - only {len(per_n)}/200 N values scored, skipping")
        continue
    results.append((snap_dir, score, per_n))
    print(f"{snap_dir}: score = {score:.6f}")

In [None]:
# Pick the snapshot with the lowest total score (this notebook treats the
# minimum as the best baseline).
if results:
    best = min(results, key=lambda x: x[1])
    print(f"\nBest baseline: {best[0]} with score {best[1]:.6f}")
    best_dir, best_score, best_per_n = best
else:
    print("No valid submissions found!")
    # Define neutral fallbacks so that later cells reading these names do not
    # fail with NameError: any real score compares lower than infinity, and
    # best_dir stays None until some snapshot is actually selected.
    best_dir = None
    best_score = float('inf')
    best_per_n = {}

In [None]:
# Additionally score the first snapshot referenced in the seed prompt and
# promote it to baseline if it beats the current best.
first_snap = '20952569566'
sub_path = os.path.join(snapshot_base, first_snap, 'submission', 'submission.csv')
if os.path.exists(sub_path):
    df = pd.read_csv(sub_path)
    solutions = parse_submission(df)
    score, per_n = calculate_score(solutions)
    print(f"First snapshot {first_snap}: score = {score:.6f}")
    if score < best_score:
        # Strictly lower total: replace all three "best" values together.
        best_dir, best_score, best_per_n = first_snap, score, per_n
        print("This is better! Using this as baseline.")

In [None]:
# Install the selected baseline as our working submission file
import shutil

best_path = os.path.join(snapshot_base, best_dir, 'submission', 'submission.csv')
# The destination folder is created on demand; an existing one is left as is.
os.makedirs('/home/submission', exist_ok=True)
shutil.copy(src=best_path, dst='/home/submission/submission.csv')
print(f"Copied best baseline from {best_dir} to /home/submission/submission.csv")
print(f"Baseline score: {best_score:.6f}")

In [None]:
# Save the baseline metrics (total score, chosen snapshot, per-N breakdown)
import json

metrics_path = '/home/code/experiments/000_baseline/metrics.json'

metrics = {
    'cv_score': best_score,
    'best_snapshot': best_dir,
    # JSON object keys must be strings, so the integer N keys are converted.
    'per_n_scores': {str(k): v for k, v in best_per_n.items()}
}

# open(..., 'w') does not create missing parent folders, so make sure the
# experiment directory exists before writing.
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=2)

print("Saved metrics to experiments/000_baseline/metrics.json")
print(f"\nBaseline CV Score: {best_score:.6f}")

In [None]:
# Inspect the per-N breakdown: the largest contributions are the most
# promising places to look for improvements.
print("\nTop 10 N values with highest score contribution:")
sorted_per_n = sorted(
    best_per_n.items(),
    key=lambda item: item[1],  # order by contribution, largest first
    reverse=True,
)
for n, score in sorted_per_n[:10]:
    print(f"  N={n}: {score:.6f}")

# Small N values are listed separately, in ascending order of N.
print("\nSmall N scores (N=1-10):")
for n in range(1, 11):
    print(f"  N={n}: {best_per_n[n]:.6f}")