# Loop 4 Analysis: Understanding the Gap and Next Steps

## Key Observations:
1. All 4 experiments are stuck at the same score: 70.676102
2. Random initialization without optimization produces much worse results
3. Deletion cascade found 0 improvements
4. The baseline is at a very strong local optimum

In [None]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
import os

# Tree geometry
# TX[i] / TY[i] are the x / y coordinates of vertex i of the tree outline
# (the two lists are zipped together to build the polygon). Vertex 0 is the
# top point (0, 0.8); the outline then runs down the x >= 0 side, across the
# bottom edge at y = -0.2, and back up the x <= 0 side.
TX = [
    0,                 # y = 0.8
    0.125, 0.0625,     # y = 0.5
    0.2, 0.1,          # y = 0.25
    0.35, 0.075,       # y = 0
    0.075, -0.075,     # y = -0.2
    -0.075, -0.35,     # y = 0
    -0.1, -0.2,        # y = 0.25
    -0.0625, -0.125,   # y = 0.5
]
TY = [
    0.8,
    0.5, 0.5,
    0.25, 0.25,
    0, 0,
    -0.2, -0.2,
    0, 0,
    0.25, 0.25,
    0.5, 0.5,
]

def get_tree_polygon(x, y, deg):
    """Return the tree outline rotated by ``deg`` and then moved to ``(x, y)``.

    The outline is built from the module-level ``TX`` / ``TY`` vertex lists.
    Rotation is applied first, about the outline's local origin ``(0, 0)``;
    shapely interprets the angle in degrees, positive = counter-clockwise.
    The rotated shape is then shifted by ``x`` along the x axis and ``y``
    along the y axis.
    """
    outline = Polygon(zip(TX, TY))
    return affinity.translate(
        affinity.rotate(outline, angle=deg, origin=(0, 0)),
        xoff=x,
        yoff=y,
    )

def get_bounding_box_side(trees):
    """Return the longer side of the axis-aligned box enclosing all trees.

    ``trees`` is a collection of ``(x, y, deg)`` triples, each turned into a
    polygon with ``get_tree_polygon``. An empty collection yields
    ``float('inf')``.
    """
    if not trees:
        return float('inf')

    # One (minx, miny, maxx, maxy) tuple per placed tree.
    boxes = [get_tree_polygon(x, y, deg).bounds for x, y, deg in trees]

    # Pool every box's low and high edge per axis; the overall extent is
    # simply the spread of each pool.
    xs = [edge for box in boxes for edge in (box[0], box[2])]
    ys = [edge for box in boxes for edge in (box[1], box[3])]

    return max(max(xs) - min(xs), max(ys) - min(ys))

# Marker output confirming that this cell (and its helper definitions) ran.
print('Functions defined')

In [None]:
# Load baseline and analyze per-N scores
# dtype=str makes pandas read every column as text; the numeric conversion
# of x / y / deg happens later, in load_all_configs.
baseline_path = '/home/code/experiments/001_baseline/santa-2025.csv'
df = pd.read_csv(baseline_path, dtype=str)

def load_all_configs(df, max_n=200):
    """Group the rows of a submission table into per-N tree lists.

    Args:
        df: DataFrame with string columns ``id``, ``x``, ``y`` and ``deg``.
            A row belongs to puzzle N when its ``id`` starts with the
            zero-padded prefix ``f'{N:03d}_'`` (e.g. ``'007_'``).
        max_n: Largest N to collect (inclusive). Defaults to 200, the value
            that was previously hard-coded.

    Returns:
        Dict mapping every N in ``1..max_n`` to a list of ``(x, y, deg)``
        float triples, in the row order of ``df``. An N with no matching
        rows maps to an empty list.

    Raises:
        ValueError: If a coordinate cannot be parsed as a float once its
            ``'s'`` characters are removed.
    """
    def _to_float(value):
        # Values carry an 's' marker (presumably a prefix that forces the
        # CSV to keep them as strings -- confirm against the file format);
        # every 's' is removed before parsing.
        return float(str(value).replace('s', ''))

    ids = df['id']
    configs = {}
    for n in range(1, max_n + 1):
        prefix = f'{n:03d}_'
        # na=False: a missing id counts as "no match". Without it the mask
        # can contain NaN, which makes the boolean indexing below raise.
        rows = df[ids.str.startswith(prefix, na=False)]
        # Walk the three columns in lockstep instead of iterrows(), which
        # builds a Series object for every single row.
        configs[n] = [
            (_to_float(x), _to_float(y), _to_float(deg))
            for x, y, deg in zip(rows['x'], rows['y'], rows['deg'])
        ]
    return configs

# Parse the baseline table into {N: [(x, y, deg), ...]} for N = 1..200.
configs = load_all_configs(df)

# Calculate per-N scores
# Each N contributes side**2 / n, where side is the value returned by
# get_bounding_box_side for that N's trees.
scores = {}
for n in range(1, 201):
    side = get_bounding_box_side(configs[n])
    scores[n] = side**2 / n

# The overall score is the sum of the 200 per-N scores; the gap is measured
# against the hard-coded target value 68.919154.
total = sum(scores.values())
print(f'Total score: {total:.6f}')
print(f'Target: 68.919154')
print(f'Gap: {total - 68.919154:.6f}')

In [None]:
# List the 20 N values with the largest per-N score.
# NOTE(review): no theoretical minimum is computed in this cell; it only
# ranks N by its current score and prints the matching bounding-box side.

print('\nPer-N score analysis (top 20 by contribution):')
print('=' * 60)

# (n, score) pairs, highest score first.
sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
for n, score in sorted_scores[:20]:
    # side is recomputed from the configuration rather than cached.
    side = get_bounding_box_side(configs[n])
    print(f'N={n:3d}: score={score:.6f}, side={side:.6f}')

In [None]:
# Analyze score distribution by N ranges
# Each entry is (start, end, label); both bounds are inclusive.
ranges = [
    (1, 10, 'N=1-10'),
    (11, 50, 'N=11-50'),
    (51, 100, 'N=51-100'),
    (101, 150, 'N=101-150'),
    (151, 200, 'N=151-200')
]

print('\nScore distribution by N ranges:')
print('=' * 60)
for start, end, label in ranges:
    # Sum of the per-N scores inside the range, and its share of the total.
    range_score = sum(scores[n] for n in range(start, end+1))
    pct = range_score / total * 100
    print(f'{label}: {range_score:.6f} ({pct:.1f}%)')

In [None]:
# Check what's available in snapshots for better solutions
import glob

print('\nAvailable pre-optimized solutions in snapshots:')
print('=' * 60)

snapshot_csvs = glob.glob('/home/nonroot/snapshots/santa-2025/**/*.csv', recursive=True)
print(f'Total CSV files found: {len(snapshot_csvs)}')

# Check for any solutions with better scores
# NOTE(review): best_scores is never filled in this cell; it is kept in case
# a later cell reads it.
best_scores = {}
for csv_path in snapshot_csvs[:30]:  # Check first 30
    try:
        df_check = pd.read_csv(csv_path, dtype=str)

        # All four columns are read below, so require all of them up front
        # and quietly ignore CSVs that are not submission files.
        if not {'id', 'x', 'y', 'deg'}.issubset(df_check.columns):
            continue

        # Quick score check for N=1
        # na=False: rows with a missing id simply do not match.
        n1_rows = df_check[df_check['id'].str.startswith('001_', na=False)]
        if len(n1_rows) != 1:
            continue

        first = n1_rows.iloc[0]
        x = float(str(first['x']).replace('s', ''))
        y = float(str(first['y']).replace('s', ''))
        deg = float(str(first['deg']).replace('s', ''))

        side = get_bounding_box_side([(x, y, deg)])
        score_n1 = side**2
        if score_n1 < 0.67:  # Good N=1 score
            print(f'{os.path.basename(csv_path)}: N=1 score = {score_n1:.6f}')
    except Exception as exc:
        # Best-effort scan: one unreadable or malformed file must not abort
        # the loop, but the failure is reported instead of being hidden.
        # Unlike a bare `except:`, KeyboardInterrupt / SystemExit still
        # propagate, so the cell can be interrupted.
        print(f'Skipping {os.path.basename(csv_path)}: {type(exc).__name__}: {exc}')

In [None]:
# Key insight: The gap of 1.76 points requires ~2.5% improvement
# This is significant and cannot be achieved by local optimization

# The whole report is emitted with a single print; an empty string in the
# list produces the same blank line a bare print() would.
print('\n'.join([
    '\n=== KEY INSIGHTS ===',
    '=' * 60,
    f'Current score: {total:.6f}',
    'Target score: 68.919154',
    f'Gap: {total - 68.919154:.6f} ({(total - 68.919154) / 68.919154 * 100:.2f}%)',
    '',
    'What has been tried and FAILED:',
    '  1. Baseline from pre-optimized solutions',
    '  2. Full ensemble from 30 CSVs',
    '  3. Deletion cascade (0 improvements)',
    '  4. Random initialization without optimization (much worse)',
    '',
    'What MUST be tried next:',
    '  1. Lattice initialization + SA optimization (egortrushin kernel)',
    '  2. Use bbox3 C++ optimizer with fresh configurations',
    '  3. Parallel SA with fractional translation (seshurajup kernel)',
    '  4. Greedy backtracking (constructive approach)',
    '',
    'CRITICAL: Random placement without optimization is useless.',
    'The baseline is the result of sophisticated optimization.',
]))