# Loop 4 Analysis: C++ Overlap Detection Mismatch

## Problem Identified
The jonathanchan C++ SA optimizer improved the score from 70.68 to 51.66 (lower is better) but introduced overlaps in 176 of the 200 N configurations. When validated with Python/Shapely, these overlaps were detected and repaired, returning the score to 70.68.

## Root Cause
The C++ overlap detection algorithm differs from Python/Shapely's algorithm. The C++ code uses:
1. Point-in-polygon test (ray casting)
2. Segment intersection test

But Shapely uses a more robust algorithm that handles edge cases differently.

In [None]:
import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
from shapely.ops import unary_union
from shapely.strtree import STRtree
import os
import glob

# Use 30 significant digits for Decimal arithmetic in this notebook.
getcontext().prec = 30
# Multiplier applied to every tree coordinate before it is handed to Shapely;
# get_tree_list_side_length divides it back out.
# NOTE(review): presumably chosen to keep more precision in the float
# geometry - confirm.
scale_factor = Decimal('1e15')

class ChristmasTree:
    """A 15-vertex tree outline placed at a centre with a rotation.

    The outline (tip, three tiers and a trunk) is built around the origin
    with every coordinate multiplied by the module-level ``scale_factor``.
    It is then rotated about the origin by ``angle`` (handed to
    ``affinity.rotate``, which interprets it as degrees by default) and
    translated to the scaled centre.

    Attributes:
        center_x: x position of the tree as a ``Decimal`` (unscaled).
        center_y: y position of the tree as a ``Decimal`` (unscaled).
        angle: rotation as a ``Decimal``.
        polygon: resulting Shapely polygon, in scaled coordinates.
    """

    def __init__(self, center_x='0', center_y='0', angle='0'):
        # Go through str() so floats and strings both parse into Decimal.
        self.center_x = Decimal(str(center_x))
        self.center_y = Decimal(str(center_y))
        self.angle = Decimal(str(angle))

        half = Decimal('2')
        quarter = Decimal('4')

        # Widths of the trunk and of the three tiers.
        trunk_width = Decimal('0.15')
        base_width = Decimal('0.7')
        mid_width = Decimal('0.4')
        top_width = Decimal('0.25')

        # Heights of the outline's horizontal levels.
        tip_height = Decimal('0.8')
        upper_tier = Decimal('0.5')
        lower_tier = Decimal('0.25')
        ground = Decimal('0.0')
        trunk_floor = -Decimal('0.2')

        # Right-hand side of the outline, walking from just below the tip
        # down to the bottom-right corner of the trunk.
        right_side = [
            (top_width / half, upper_tier),
            (top_width / quarter, upper_tier),
            (mid_width / half, lower_tier),
            (mid_width / quarter, lower_tier),
            (base_width / half, ground),
            (trunk_width / half, ground),
            (trunk_width / half, trunk_floor),
        ]
        # The left-hand side is the mirror image, walked back up to the tip.
        left_side = [(-x, y) for x, y in reversed(right_side)]
        outline = [(Decimal('0.0'), tip_height)] + right_side + left_side

        base_shape = Polygon(
            [(x * scale_factor, y * scale_factor) for x, y in outline]
        )
        turned = affinity.rotate(base_shape, float(self.angle), origin=(0, 0))
        self.polygon = affinity.translate(
            turned,
            xoff=float(self.center_x * scale_factor),
            yoff=float(self.center_y * scale_factor),
        )

def load_configuration_from_df(n, df):
    """Build the trees of configuration ``n`` from a submission frame.

    Rows are selected by an ``id`` that starts with the zero-padded prefix
    ``f"{n:03d}_"``. The ``x``, ``y`` and ``deg`` cells are stringified and
    one leading ``'s'`` is dropped from each; a row is skipped when any of
    the three values ends up empty.

    Args:
        n: configuration number used to build the id prefix.
        df: DataFrame with ``id``, ``x``, ``y`` and ``deg`` columns.

    Returns:
        A list of ``ChristmasTree`` objects in row order (possibly empty).
    """
    def _unprefixed(cell):
        # Drop a single leading 's' marker when present.
        text = str(cell)
        return text[1:] if text.startswith('s') else text

    prefix = f"{n:03d}_"
    selected = df[df["id"].str.startswith(prefix)]

    trees = []
    for _, row in selected.iterrows():
        fields = [_unprefixed(row[column]) for column in ("x", "y", "deg")]
        if all(fields):
            trees.append(ChristmasTree(*fields))
    return trees

def get_tree_list_side_length(tree_list):
    """Return the longer side of the bounding box around all trees.

    The bounds are taken from the union of every tree's ``polygon`` and the
    larger of the two extents is converted to ``Decimal`` and divided by
    ``scale_factor`` to undo the coordinate scaling.

    Args:
        tree_list: iterable of objects exposing a ``polygon`` attribute.

    Returns:
        The side length as a ``Decimal`` in unscaled units.
    """
    merged = unary_union([tree.polygon for tree in tree_list])
    bounds = merged.bounds  # (minx, miny, maxx, maxy)
    width = bounds[2] - bounds[0]
    height = bounds[3] - bounds[1]
    longest = max(width, height)
    return Decimal(longest) / scale_factor

def get_score(trees, n):
    """Return the squared bounding side length divided by ``n``.

    Args:
        trees: collection of trees; an empty (falsy) collection scores 0.0.
        n: divisor applied to the squared side length.

    Returns:
        The score as a ``float``.
    """
    if trees:
        side_length = get_tree_list_side_length(trees)
        squared = side_length ** 2
        return float(squared / Decimal(n))
    return 0.0

def has_overlap(trees):
    """Report whether any two trees' polygons overlap.

    Two polygons count as overlapping when they intersect without merely
    touching. Candidate partners for each polygon come from an ``STRtree``
    query, whose results are used as indices into the polygon list.

    Args:
        trees: sequence of objects exposing a ``polygon`` attribute.

    Returns:
        ``True`` as soon as one overlapping pair is found, ``False`` when
        there are fewer than two trees or no pair overlaps.
    """
    if len(trees) <= 1:
        return False
    polygons = [t.polygon for t in trees]
    tree_index = STRtree(polygons)
    for i, poly in enumerate(polygons):
        for idx in tree_index.query(poly):
            # intersects/touches are symmetric, so every unordered pair only
            # needs one test. Skipping idx <= i drops the polygon itself and
            # the pairs already examined from the other side.
            if idx <= i:
                continue
            other = polygons[idx]
            if poly.intersects(other) and not poly.touches(other):
                return True
    return False

def score_submission(file_path, max_n=200):
    """Score a submission CSV and list the configurations with overlaps.

    For every ``n`` from 1 to ``max_n`` the matching trees are loaded, their
    score is added to the running total and the configuration is checked
    for overlaps. Configurations with no trees are skipped entirely.

    Args:
        file_path: CSV location passed straight to ``pd.read_csv``.
        max_n: highest configuration number to evaluate.

    Returns:
        A ``(total_score, failed_overlap_n)`` tuple: the summed score as a
        float and the list of ``n`` values whose trees overlap.
    """
    submission = pd.read_csv(file_path)
    running_total = 0.0
    overlapping_ns = []
    for n in range(1, max_n + 1):
        trees = load_configuration_from_df(n, submission)
        if not trees:
            continue
        running_total += get_score(trees, n)
        if has_overlap(trees):
            overlapping_ns.append(n)
    return running_total, overlapping_ns

# Marker output showing that the definitions in this cell ran to completion.
print("Functions loaded")

In [None]:
# Score every CSV that sits directly in the pre-optimized directory and
# report its total score together with the number of overlapping N values.
preopt_dir = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized'
print("Available pre-optimized CSVs:")
for f in sorted(os.listdir(preopt_dir)):
    if not f.endswith('.csv'):
        continue
    path = os.path.join(preopt_dir, f)
    try:
        score, overlaps = score_submission(path)
        print(f"  {f}: score={score:.6f}, overlaps={len(overlaps)}")
    except Exception as e:
        # One unreadable file should not stop the scan of the others.
        print(f"  {f}: ERROR - {e}")

In [None]:
# Score the CSVs in the chistyakov subfolder, if that folder is present.
chistyakov_dir = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/chistyakov'
if os.path.exists(chistyakov_dir):
    print("\nChistyakov CSVs:")
    for f in sorted(os.listdir(chistyakov_dir)):
        if not f.endswith('.csv'):
            continue
        path = os.path.join(chistyakov_dir, f)
        try:
            score, overlaps = score_submission(path)
            print(f"  {f}: score={score:.6f}, overlaps={len(overlaps)}")
        except Exception as e:
            # One unreadable file should not stop the scan of the others.
            print(f"  {f}: ERROR - {e}")

In [None]:
# Score the CSVs in the telegram subfolder, if that folder is present.
telegram_dir = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram'
if os.path.exists(telegram_dir):
    print("\nTelegram CSVs:")
    for f in sorted(os.listdir(telegram_dir)):
        if not f.endswith('.csv'):
            continue
        path = os.path.join(telegram_dir, f)
        try:
            score, overlaps = score_submission(path)
            print(f"  {f}: score={score:.6f}, overlaps={len(overlaps)}")
        except Exception as e:
            # One unreadable file should not stop the scan of the others.
            print(f"  {f}: ERROR - {e}")

In [None]:
# Score the current best submission and show how far it is from the target.
print("\nCurrent submission:")
score, overlaps = score_submission('/home/submission/submission.csv')
print(
    f"  Score: {score:.6f}",
    f"  Overlaps: {overlaps or 'None'}",
    f"  Gap to target (68.92): {score - 68.922808:.6f} points",
    sep="\n",
)

## Key Insight: The Problem is Overlap Detection Mismatch

The C++ optimizer is producing configurations that it thinks are valid (no overlaps), but Python/Shapely detects overlaps. This means:

1. **The optimizer IS working** - it found better configurations (51.66 vs 70.68)
2. **The overlap detection is the bottleneck** - C++ and Python disagree on what's an overlap
3. **We need a PURE PYTHON optimizer** that uses Shapely for overlap detection

## Options:

### Option A: Pure Python Backpacking (crodoc kernel)
- Start from N=200, work backward to N=1
- For each N, try removing each tree from the N+1 configuration
- Uses Shapely for overlap detection (guaranteed correct)
- Slower but produces valid results

### Option B: Fix C++ Overlap Detection
- Add buffer/tolerance to C++ overlap detection
- Make it stricter than Shapely to ensure no false negatives
- Requires modifying C++ code and recompiling

### Option C: Hybrid Approach
- Run C++ optimizer to generate candidates
- Validate each candidate with Python/Shapely
- Only keep configurations that pass Python validation
- This is what we tried, but 176/200 failed

### Option D: Python SA Optimizer with Shapely
- Implement SA optimizer in pure Python
- Use Shapely for overlap detection
- Slower but guaranteed correct

In [None]:
# Break the repaired baseline's score down by N range to see which ranges
# contribute most to the total.

df_baseline = pd.read_csv('/home/code/experiments/003_preoptimized/repaired_baseline.csv')

print("Score breakdown by N range:")
ranges = [(1, 50), (51, 100), (101, 150), (151, 200)]
for start, end in ranges:
    range_score = 0
    for n in range(start, end + 1):
        trees = load_configuration_from_df(n, df_baseline)
        if not trees:
            continue
        range_score += get_score(trees, n)
    print(f"  N={start}-{end}: {range_score:.4f}")

# Grand total over every N, recomputed from the baseline frame.
print(f"\nTotal: {sum(get_score(load_configuration_from_df(n, df_baseline), n) for n in range(1, 201)):.6f}")

In [None]:
# Rank the N values by their individual score and list the 20 largest,
# i.e. the configurations that contribute most to the total.
print("\nWorst-performing N values (highest score per N):")
scores_by_n = []
for n in range(1, 201):
    trees = load_configuration_from_df(n, df_baseline)
    if not trees:
        continue
    scores_by_n.append((n, get_score(trees, n)))

# Largest score first.
scores_by_n.sort(key=lambda pair: pair[1], reverse=True)

worst_20 = scores_by_n[:20]
print("Top 20 worst N values:")
for n, score in worst_20:
    print(f"  N={n}: {score:.6f}")

# Sum the top 20 once and reuse it for both summary lines.
worst_20_total = sum([s for _, s in worst_20])
print(f"\nSum of worst 20: {worst_20_total:.4f}")
print(f"If improved by 10%: saves {worst_20_total * 0.1:.4f} points")