# Loop 6 Analysis: Understanding the Optimization Time Gap

## Key Insight from Evaluator
We've been running bbox3 for 71-150 seconds while top kernels run for 3-11 HOURS.
Even the shortest of those runs (3 hours) is roughly 70-150x longer than ours.

## Goals
1. Understand what the bbox3 runner kernel does differently
2. Analyze the 3-phase approach
3. Plan a proper long-running optimization strategy

In [None]:
import pandas as pd
import numpy as np
import os

# List every pre-optimized solution CSV in the data directory with its row count.
print("Available solutions:")
csv_names = [entry for entry in os.listdir('/home/code/data') if entry.endswith('.csv')]
for f in csv_names:
    df = pd.read_csv(f'/home/code/data/{f}')
    row_count = len(df)
    print(f"  {f}: {row_count} rows")

In [None]:
# Load the current best solution and analyze per-N scores
from shapely.geometry import Polygon
from shapely import affinity

# Outline of one tree as parallel x / y coordinate lists (15 vertices).
# The walk starts at the apex (0, 0.8), runs down the x >= 0 side to
# y = -0.2, then comes back up the mirrored x <= 0 side.
TX = [
    0,
    0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075,
    -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125,
]
TY = [
    0.8,
    0.5, 0.5, 0.25, 0.25, 0, 0, -0.2,
    -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5,
]
# (x, y) pairs in outline order, ready to hand to a polygon constructor.
TREE_VERTICES = [*zip(TX, TY)]

def parse_s_value(s):
    """Convert a submission cell to ``float``.

    A string that begins with the letter ``s`` has that single leading
    character dropped before conversion; any other value is passed to
    ``float`` unchanged.
    """
    has_marker = isinstance(s, str) and s.startswith('s')
    return float(s[1:] if has_marker else s)

def create_tree_polygon(x, y, deg):
    """Return the tree outline rotated by ``deg`` and then moved to (x, y).

    The rotation is applied first, about the origin (0, 0); the result is
    then translated by ``x`` along the x-axis and ``y`` along the y-axis.
    ``deg`` is forwarded to ``affinity.rotate`` with its default angle unit.
    """
    rotated = affinity.rotate(Polygon(TREE_VERTICES), deg, origin=(0, 0))
    return affinity.translate(rotated, x, y)

def get_bounding_box_side(polygons):
    """Return the longer side of the axis-aligned box around all polygons.

    Every exterior coordinate of every polygon is taken into account.
    An empty (falsy) ``polygons`` argument yields 0.
    """
    if not polygons:
        return 0

    # Flatten the exterior rings into one list of points.
    points = [pt for shape in polygons for pt in shape.exterior.coords]
    xs = [pt[0] for pt in points]
    ys = [pt[1] for pt in points]

    width = max(xs) - min(xs)
    height = max(ys) - min(ys)
    return max(width, height)

def get_per_n_scores(csv_path):
    """Score every configuration n = 1..200 found in a submission CSV.

    The CSV is read with pandas and its ``x``, ``y`` and ``deg`` columns are
    parsed with ``parse_s_value``. Rows belong to configuration ``n`` when
    their ``id`` starts with the zero-padded prefix ``f'{n:03d}_'``.

    Returns a dict mapping ``n`` to ``side**2 / n``, where ``side`` is the
    bounding-box side of that configuration's trees. A configuration with
    no matching rows gets a side of 0 and therefore a score of 0.
    """
    table = pd.read_csv(csv_path)
    for raw, parsed in (('x', 'x_val'), ('y', 'y_val'), ('deg', 'deg_val')):
        table[parsed] = table[raw].apply(parse_s_value)

    def score_for(n):
        # Rows of this configuration, selected by their id prefix.
        members = table[table['id'].str.startswith(f'{n:03d}_')]
        trees = [
            create_tree_polygon(tx, ty, angle)
            for tx, ty, angle in zip(members['x_val'], members['y_val'], members['deg_val'])
        ]
        return get_bounding_box_side(trees) ** 2 / n

    return {n: score_for(n) for n in range(1, 201)}

# Score the current best submission and report its distance from the target.
target_total = 68.919154
best_scores = get_per_n_scores('/home/submission/submission.csv')
current_total = sum(best_scores.values())
print(f"Current best total: {current_total:.6f}")
print(f"Target: {target_total:.6f}")
print(f"Gap: {current_total - target_total:.6f}")

In [None]:
# Analyze which N values have the most room for improvement
# Compare with theoretical minimum (perfect packing efficiency)

# Exact area of the tree outline, taken from the polygon itself. The previous
# hand estimate (0.35 * 0.8 + 0.15 * 0.2 = 0.31) treated the crown as a single
# triangle and overstated the real area (0.245625) by about 26%, which
# inflated every efficiency value printed below.
tree_area = Polygon(TREE_VERTICES).area
print(f"Approximate tree area: {tree_area:.4f}")

# For each N, calculate efficiency = (N * tree_area) / side^2
efficiencies = {}
for n, score in best_scores.items():
    # score is side^2 / n, so multiplying by n recovers the bounding-box area.
    side_sq = score * n
    # A zero area means no trees were found for this N; report 0 efficiency
    # instead of dividing by zero.
    efficiencies[n] = (n * tree_area) / side_sq if side_sq > 0 else 0

# Find N values with lowest efficiency (most room for improvement)
eff_df = pd.DataFrame({
    'N': list(efficiencies),
    'efficiency': list(efficiencies.values()),
    'score_contribution': [best_scores[n] for n in efficiencies],
})
eff_df = eff_df.sort_values('efficiency')
print("\nLowest efficiency N values (most room for improvement):")
print(eff_df.head(20).to_string())

In [None]:
# Key insight: The bbox3 runner uses a 3-phase approach
# Phase A: 2-min runs with n ∈ {1000,1200,1500,1800,2000}, r ∈ {30,60,90} = 15 combinations
# Phase B: 10-min runs on top 3 candidates
# Phase C: 20-min runs on top 2 candidates
#
# The runner's total time budget is 3 hours (10800 seconds); the three phases
# below add up to ~100 minutes of that as actual optimization.

# (phase label, number of runs, what is being run, minutes per run)
phase_plan = [
    ("A", 15, "combinations", 2),
    ("B", 3, "candidates", 10),
    ("C", 2, "candidates", 20),
]
total_minutes = sum(runs * minutes for _, runs, _, minutes in phase_plan)

# Duration of our own bbox3 runs so far, in seconds (shortest, longest).
our_run_seconds = (71, 150)
# How many times longer the 3-phase schedule runs than we do. The low end
# compares against our longest run, the high end against our shortest.
# Derived from the figures above so the printed claim cannot drift from them
# (the old hard-coded "40-60x" understated the upper end: 6000 s / 71 s ≈ 85).
min_ratio = total_minutes * 60 / max(our_run_seconds)
max_ratio = total_minutes * 60 / min(our_run_seconds)

print("bbox3 runner 3-phase approach:")
for label, runs, unit, minutes in phase_plan:
    print(f"Phase {label}: {runs} {unit} × {minutes} min = {runs * minutes} min")
print(f"Total: ~{total_minutes} minutes of optimization")
print("")
print(f"Our current approach: {our_run_seconds[0]}-{our_run_seconds[1]} seconds = 1-2.5 minutes")
print(f"We need to run {min_ratio:.0f}-{max_ratio:.0f}x LONGER!")

In [None]:
# Check if we have the bbox3 binary ready
import subprocess

# Location of the compiled bbox3 optimizer used by the long-run experiment.
bbox3_path = '/home/code/experiments/007_long_bbox3/bbox3'
if os.path.exists(bbox3_path):
    print(f"bbox3 binary exists at {bbox3_path}")
    try:
        # Probe the binary for usage text. The 5-second timeout guards against
        # a binary that ignores '--help' and simply starts running.
        result = subprocess.run([bbox3_path, '--help'], capture_output=True, text=True, timeout=5)
    except subprocess.TimeoutExpired:
        # The file exists but gave no quick answer; that is not fatal here.
        print("bbox3 did not exit within 5 seconds of '--help'; skipping help output")
    except OSError as exc:
        # E.g. the file is not executable or is built for another platform.
        print(f"bbox3 could not be executed: {exc}")
    else:
        print("Help output:")
        # Some tools write their usage text to stderr, so fall back to it.
        print(result.stdout[:500] if result.stdout else result.stderr[:500])
else:
    print("bbox3 binary not found!")

In [None]:
# The key insight from the saspav kernel is the replace_group function.
# It lets us:
# 1. Run aggressive optimization that might create overlaps
# 2. Replace overlapping groups with known-good solutions from a donor file
# 3. Explore more aggressively without worrying about invalid submissions

strategy_lines = (
    "Strategy for next experiment:",
    "1. Run bbox3 for 30+ minutes with systematic parameter search",
    "2. Use replace_group to fix any overlaps",
    "3. Apply fix_direction rotation optimization",
    "4. Validate and submit",
    "",
    "Expected improvement: The gap to target is 1.756 points (2.55%)",
    "With proper optimization time, we should be able to close this gap.",
)
for strategy_line in strategy_lines:
    print(strategy_line)