# Score Analysis and Optimization Attempts

Analyze where improvements might be possible in the pre-optimized submission.

In [None]:
import numpy as np
import pandas as pd
from shapely.geometry import Polygon
from shapely.affinity import rotate, translate
import matplotlib.pyplot as plt

# Christmas tree polygon vertices
# 15 (x, y) points listed from the tip (0.0, 0.8), down the right-hand side,
# across the trunk bottom at y = -0.2, and back up the left-hand side.
# The outline spans x in [-0.35, 0.35] and y in [-0.2, 0.8] and is mirror
# symmetric about the y axis.
TREE_VERTICES = np.array([
    (0.0, 0.8), (0.125, 0.5), (0.0625, 0.5), (0.2, 0.25), (0.1, 0.25),
    (0.35, 0.0), (0.075, 0.0), (0.075, -0.2), (-0.075, -0.2), (-0.075, 0.0),
    (-0.35, 0.0), (-0.1, 0.25), (-0.2, 0.25), (-0.0625, 0.5), (-0.125, 0.5),
])

class ChristmasTree:
    """One tree outline placed at ``(x, y)`` with rotation ``deg``.

    The shapely polygon is built on the first access to ``polygon`` and then
    cached; changing ``x``, ``y`` or ``deg`` afterwards does not rebuild it.
    """

    def __init__(self, x=0, y=0, deg=0):
        self.x = x
        self.y = y
        self.deg = deg
        # Filled in lazily by the ``polygon`` property.
        self._polygon = None

    @property
    def polygon(self):
        """Return the cached polygon, building it on first use.

        The base outline is rotated by ``deg`` about the origin and then
        shifted by ``(x, y)``.
        """
        if self._polygon is not None:
            return self._polygon
        outline = Polygon(TREE_VERTICES)
        turned = rotate(outline, self.deg, origin=(0, 0))
        self._polygon = translate(turned, self.x, self.y)
        return self._polygon

def load_submission(filepath):
    """Read a submission CSV and return it with numeric coordinate columns.

    The ``x``, ``y`` and ``deg`` columns are converted to strings, every
    ``'s'`` character is removed, and the result is parsed as float. All
    other columns are returned as read by ``pd.read_csv``.
    """
    frame = pd.read_csv(filepath)
    numeric_columns = {
        name: frame[name].astype(str).str.replace('s', '', regex=False).astype(float)
        for name in ('x', 'y', 'deg')
    }
    # ``assign`` overwrites the existing columns in place, keeping their order.
    return frame.assign(**numeric_columns)

def get_trees_for_n(df, n):
    """Build a ``ChristmasTree`` for every row whose ``id`` belongs to puzzle ``n``.

    Rows are selected by the zero-padded three-digit prefix ``"NNN_"`` of the
    ``id`` column; the trees are returned in row order.
    """
    id_prefix = f"{n:03d}_"
    belongs_to_n = df['id'].str.startswith(id_prefix)
    rows = df[belongs_to_n]
    return [
        ChristmasTree(x, y, deg)
        for x, y, deg in zip(rows['x'], rows['y'], rows['deg'])
    ]

def get_bounding_box_side(trees):
    """Return the side of the smallest axis-aligned square enclosing all trees.

    Every exterior vertex of every tree polygon is collected, and the larger
    of the per-axis extents is returned. An empty ``trees`` yields ``0``.
    """
    if not trees:
        return 0
    outlines = [np.array(tree.polygon.exterior.coords) for tree in trees]
    points = np.vstack(outlines)
    # Peak-to-peak per column is the extent along each axis.
    extent = np.ptp(points, axis=0)
    return max(extent)

# Emit a marker in the cell output once the definitions above have run.
print("Functions defined.")

In [None]:
# Load the best submission
df = load_submission('/home/code/experiments/002_bbox3_optimization/submission.csv')

# Per-N score: squared bounding-square side divided by the number of trees
scores = []
for n in range(1, 201):
    trees = get_trees_for_n(df, n)
    side = get_bounding_box_side(trees)
    score = side**2 / n
    scores.append(dict(n=n, side=side, score=score))

scores_df = pd.DataFrame(scores)
total_score = scores_df['score'].sum()
print(f"Total score: {total_score:.6f}")
print(f"\nScore breakdown by N range:")

# Inclusive N ranges used for the breakdown
n_ranges = [(1, 10), (11, 50), (51, 100), (101, 150), (151, 200)]
for start, end in n_ranges:
    subset = scores_df[scores_df['n'].between(start, end)]
    range_score = subset['score'].sum()
    print(f"  N={start}-{end}: {range_score:.4f} ({100*range_score/total_score:.1f}%)")

In [None]:
# Rank the N values by their individual score, largest first
scores_df_sorted = scores_df.sort_values(by='score', ascending=False)
print("Top 20 N values by score contribution:")
top_rows = scores_df_sorted.head(20)
print(top_rows.to_string())

In [None]:
# Theoretical lower bound analysis
# For N=1 the layout is a single tree, so the only free parameter is its rotation angle;
# the optimum is expected to be near 45 degrees (minimum bounding box).
# Unrotated, the tree is 0.7 wide and 1.0 high, so side = 1.0.
# For the enclosing 0.7 x 1.0 rectangle rotated by 45 degrees:
# side = (0.7 + 1.0) / sqrt(2) ≈ 1.20 (its diagonal is sqrt(0.49 + 1) = sqrt(1.49) ≈ 1.22).
# The tree does not fill that rectangle, so its actual side at 45 degrees is smaller.

# Let's calculate the theoretical minimum for N=1
import numpy as np
from scipy.optimize import minimize_scalar

def get_single_tree_bbox(angle_deg):
    """Return the bounding-square side of one tree at the origin rotated by ``angle_deg``."""
    outline = np.array(ChristmasTree(0, 0, angle_deg).polygon.exterior.coords)
    lower = outline.min(axis=0)
    upper = outline.max(axis=0)
    # The square side is the larger of the x and y extents.
    return max(upper - lower)

# Find optimal angle for single tree
# The side length is a piecewise function of the angle and may have more than
# one local minimum, while the bounded scalar minimizer only converges to a
# local one. Scan a coarse grid over [0, 90] first, then refine inside the
# neighbourhood of the best grid point.
coarse_step = 0.5
coarse_angles = np.arange(0.0, 90.0 + coarse_step, coarse_step)
coarse_sides = np.array([get_single_tree_bbox(angle) for angle in coarse_angles])
best_idx = int(np.argmin(coarse_sides))
refine_lo = max(0.0, coarse_angles[best_idx] - coarse_step)
refine_hi = min(90.0, coarse_angles[best_idx] + coarse_step)
res = minimize_scalar(get_single_tree_bbox, bounds=(refine_lo, refine_hi), method='bounded')
# The bounded method never evaluates the interval end points, so keep the grid
# point when the refinement did not improve on it.
if coarse_sides[best_idx] < res.fun:
    res.x = float(coarse_angles[best_idx])
    res.fun = float(coarse_sides[best_idx])
print(f"Optimal angle for N=1: {res.x:.2f} degrees")
print(f"Minimum side for N=1: {res.fun:.6f}")
print(f"Current side for N=1: {scores_df[scores_df['n']==1]['side'].values[0]:.6f}")

In [None]:
# Print side and score for the ten smallest puzzles to look for headroom
print("\nSmall N analysis:")
small_n_values = range(1, 11)
for n in small_n_values:
    trees = get_trees_for_n(df, n)
    side = get_bounding_box_side(trees)
    # Same per-N score as above: squared side divided by the tree count
    score = side**2 / n
    print(f"N={n}: side={side:.6f}, score={score:.6f}")

In [None]:
# Compare with other pre-optimized submissions
import os

preopt_dir = "/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized"

# Collect every CSV file found anywhere below the pre-optimized directory
csv_files = []
for root, dirs, files in os.walk(preopt_dir):
    for f in files:
        if f.endswith('.csv'):
            csv_files.append(os.path.join(root, f))

# A complete submission holds n trees for every n in 1..200, i.e. 20100 rows
expected_rows = sum(range(1, 201))

# For each N, find the best side across all submissions
best_per_n = {n: float('inf') for n in range(1, 201)}
best_source_per_n = {n: None for n in range(1, 201)}

for filepath in csv_files:
    try:
        df_sub = load_submission(filepath)
        if len(df_sub) != expected_rows:
            continue
        for n in range(1, 201):
            trees = get_trees_for_n(df_sub, n)
            if len(trees) != n:
                continue
            side = get_bounding_box_side(trees)
            if side < best_per_n[n]:
                best_per_n[n] = side
                best_source_per_n[n] = os.path.basename(filepath)
    except Exception as exc:
        # Best effort: an unreadable or malformed file must not abort the scan,
        # but report it so the skip is visible. KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        print(f"Skipping {filepath}: {type(exc).__name__}: {exc}")

# Calculate potential improvement from ensemble
# (stays at inf for any N that no candidate file covered)
ensemble_score = sum(best_per_n[n]**2 / n for n in range(1, 201))
print(f"Best possible ensemble score: {ensemble_score:.6f}")
print(f"Current best score: {scores_df['score'].sum():.6f}")
print(f"Potential improvement: {scores_df['score'].sum() - ensemble_score:.6f}")

In [None]:
# Check which N values have different best sources
df_best = load_submission('/home/code/experiments/002_bbox3_optimization/submission.csv')

different_sources = []
for n in range(1, 201):
    current_side = get_bounding_box_side(get_trees_for_n(df_best, n))
    candidate_side = best_per_n[n]
    # Require a margin of 1e-8 before counting a candidate as better
    if not (candidate_side < current_side - 1e-8):
        continue
    different_sources.append({
        'n': n,
        'current_side': current_side,
        'best_side': candidate_side,
        'improvement': current_side - candidate_side,
        'source': best_source_per_n[n]
    })

if not different_sources:
    print("No better alternatives found - ensemble.csv is already the best for all N values")
else:
    diff_df = pd.DataFrame(different_sources)
    print(f"Found {len(diff_df)} N values with better alternatives:")
    print(diff_df.to_string())