# Loop 5 Analysis: Understanding Overlap Validation Failures

exp_004 was rejected with 'Overlapping trees in group 002' despite our local validation showing 0 overlaps.

Let's investigate:
1. What does Kaggle's validation actually check?
2. Why does our validation miss these overlaps?
3. What's the safest path forward?

In [1]:
import json
from decimal import Decimal, getcontext

import numpy as np
import pandas as pd
from shapely import affinity
from shapely.geometry import Polygon
from shapely.ops import unary_union

# Use 30 significant digits for Decimal arithmetic in this notebook.
getcontext().prec = 30
# Multiplier applied to tree coordinates (in Decimal) before they are converted
# to float and handed to shapely; see ChristmasTree._create_polygon.
scale_factor = Decimal('1e15')

print("Analysis setup complete")

Analysis setup complete


In [2]:
# Read the exp_004 submission, then isolate the rows of the rejected group (N=2),
# whose ids carry the '002_' prefix.
submission_path = '/home/code/experiments/004_fix_overlaps/submission.csv'
df = pd.read_csv(submission_path)
print("Loaded {} rows".format(len(df)))

in_group_002 = df['id'].str.startswith('002_')
n2_rows = df.loc[in_group_002]
print("\nN=2 rows:")
print(n2_rows)

Loaded 20100 rows

N=2 rows:
      id           x           y          deg
1  002_0   s0.154097  s-0.038541  s203.629378
2  002_1  s-0.154097  s-0.561459   s23.629378


In [3]:
# Parse values and create polygons
def parse_val(v):
    """Return ``v`` as a string, dropping one leading 's' marker if present.

    Non-string inputs are converted with ``str``; strings that do not start
    with 's' are returned unchanged.
    """
    if not isinstance(v, str):
        return str(v)
    return v[1:] if v.startswith('s') else str(v)

class ChristmasTree:
    """A tree outline placed at a given centre with a given rotation.

    The outline is a fixed 15-vertex polygon (tip, three tiers, trunk). Its
    vertices are computed in Decimal, multiplied by the module-level
    ``scale_factor`` and converted to float before being passed to shapely.
    """

    def __init__(self, center_x='0', center_y='0', angle='0'):
        """Store the placement as Decimals and build ``self.polygon``.

        Each argument is passed through ``str`` before conversion to Decimal.
        """
        self.center_x, self.center_y, self.angle = (
            Decimal(str(value)) for value in (center_x, center_y, angle)
        )
        self.polygon = self._create_polygon()

    def _create_polygon(self):
        """Return the scaled outline rotated by ``angle`` and moved to the centre.

        Rotation is about the origin (the unplaced tree's own centre) and is
        applied before the translation to the scaled centre coordinates.
        """
        two, four = Decimal('2'), Decimal('4')

        trunk_w = Decimal('0.15')
        trunk_h = Decimal('0.2')
        base_w = Decimal('0.7')
        mid_w = Decimal('0.4')
        top_w = Decimal('0.25')
        tip_y = Decimal('0.8')
        tier_1_y = Decimal('0.5')
        tier_2_y = Decimal('0.25')
        base_y = Decimal('0.0')
        trunk_bottom_y = -trunk_h

        # Right-hand silhouette, walking from just below the tip down to the
        # bottom-right corner of the trunk.
        right_side = [
            (top_w / two, tier_1_y),
            (top_w / four, tier_1_y),
            (mid_w / two, tier_2_y),
            (mid_w / four, tier_2_y),
            (base_w / two, base_y),
            (trunk_w / two, base_y),
            (trunk_w / two, trunk_bottom_y),
        ]
        # The left-hand side is the mirror image, walked back up toward the tip.
        left_side = [(-x, y) for x, y in reversed(right_side)]
        outline = [(Decimal('0.0'), tip_y), *right_side, *left_side]

        def to_scaled_float(value):
            # Scale in Decimal first, then drop to float for shapely.
            return float(value * scale_factor)

        initial_polygon = Polygon(
            [(to_scaled_float(x), to_scaled_float(y)) for x, y in outline]
        )
        rotated = affinity.rotate(initial_polygon, float(self.angle), origin=(0, 0))
        shift_x = to_scaled_float(self.center_x)
        shift_y = to_scaled_float(self.center_y)
        return affinity.translate(rotated, xoff=shift_x, yoff=shift_y)

print("ChristmasTree class defined")

ChristmasTree class defined


In [4]:
# Build one ChristmasTree per N=2 row, echoing the parsed placement values.
trees_n2 = []
for _, row in n2_rows.iterrows():
    x, y, deg = (parse_val(row[column]) for column in ('x', 'y', 'deg'))
    trees_n2.append(ChristmasTree(x, y, deg))
    print(f"Tree: x={x}, y={y}, deg={deg}")

# Pairwise predicates for the two trees, evaluated once and reused below.
p1 = trees_n2[0].polygon
p2 = trees_n2[1].polygon
pair_intersects = p1.intersects(p2)
pair_touches = p1.touches(p2)

print(f"\nPolygon 1 bounds: {p1.bounds}")
print(f"Polygon 2 bounds: {p2.bounds}")
print(f"\nIntersects: {pair_intersects}")
print(f"Touches: {pair_touches}")
print(f"Overlaps (intersects and not touches): {pair_intersects and not pair_touches}")

# Report the shared region (values are in scaled units) when there is one.
if pair_intersects:
    intersection = p1.intersection(p2)
    print(f"\nIntersection type: {intersection.geom_type}")
    print(f"Intersection area: {intersection.area}")
    if hasattr(intersection, 'length'):
        print(f"Intersection length: {intersection.length}")

Tree: x=0.154097, y=-0.038541, deg=203.629378
Tree: x=-0.154097, y=-0.561459, deg=23.629378

Polygon 1 bounds: (-166558066556289.44, -771466866414375.9, 474752070661195.8, 174751879478081.06)
Polygon 2 bounds: (-474752070661195.94, -774751879478081.0, 166558066556289.44, 171466866414375.75)

Intersects: True
Touches: False
Overlaps (intersects and not touches): True

Intersection type: MultiPolygon
Intersection area: 7.019842220764982e+17
Intersection length: 7507614211.861746


In [5]:
# The issue: Kaggle likely uses integer coordinates and checks for ANY intersection
# Let's check what happens with integer-only coordinates

def get_integer_coords(polygon):
    """Return the exterior ring of ``polygon`` with each coordinate rounded to int.

    NOTE(review): this is meant to mimic how Kaggle's validator is presumed to
    treat coordinates; that assumption is not confirmed here.
    """
    snapped = []
    for x, y in list(polygon.exterior.coords):
        snapped.append((int(round(x)), int(round(y))))
    return snapped

int_coords_1, int_coords_2 = get_integer_coords(p1), get_integer_coords(p2)

# Show the first five rounded vertices of each tree under its own heading.
for heading, vertices in (
    ("Integer coords tree 1:", int_coords_1),
    ("\nInteger coords tree 2:", int_coords_2),
):
    print(heading)
    for c in vertices[:5]:
        print(f"  {c}")
    print("...")

Integer coords tree 1:
  (474752070661196, -771466866414376)
  (239986752536001, -546722021299797)
  (297246585849624, -521670843904391)
  (71070242978030, -347744100919791)
  (162685976279827, -307662217087142)
...

Integer coords tree 2:
  (-474752070661196, 171466866414376)
  (-239986752536001, -53277978700203)
  (-297246585849624, -78329156095609)
  (-71070242978030, -252255899080209)
  (-162685976279827, -292337782912858)
...


In [6]:
# Create integer polygons and check overlap.
# Polygon is already imported in the notebook's import cell, so it is not
# re-imported here.
int_poly1 = Polygon(int_coords_1)
int_poly2 = Polygon(int_coords_2)

print(f"Integer polygon 1 valid: {int_poly1.is_valid}")
print(f"Integer polygon 2 valid: {int_poly2.is_valid}")

# Evaluate each predicate once and reuse the result for every report line.
int_intersects = int_poly1.intersects(int_poly2)
int_touches = int_poly1.touches(int_poly2)
print(f"\nInteger intersects: {int_intersects}")
print(f"Integer touches: {int_touches}")
print(f"Integer overlaps: {int_intersects and not int_touches}")

# Describe the shared region (scaled units) only when the polygons intersect.
if int_intersects:
    int_intersection = int_poly1.intersection(int_poly2)
    print(f"\nInteger intersection type: {int_intersection.geom_type}")
    print(f"Integer intersection area: {int_intersection.area}")

Integer polygon 1 valid: True
Integer polygon 2 valid: True

Integer intersects: True
Integer touches: False
Integer overlaps: True

Integer intersection type: MultiPolygon
Integer intersection area: 7.019842204166403e+17


In [7]:
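# KEY INSIGHT: The pre-optimized baseline has trees that are TOUCHING or have
# microscopic overlaps (for N=2 the intersection area is ~7.0e17 in scaled units,
# i.e. ~7.0e-13 unscaled). Kaggle rejects these and exp_004's local validation
# missed them, even though the scaled intersects/touches check above reports them.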
#
# SOLUTION: Use Zaburo's row-based approach which GUARANTEES no overlaps
# because trees are placed on a grid with known spacing.

# Let's verify Zaburo's approach produces valid solutions
def find_best_trees_zaburo(n):
    """Search Zaburo-style row layouts for ``n`` trees and return the best one.

    Rows alternate between upright trees (angle 0) and flipped trees (angle 180,
    shifted by half a tree pitch). Every combination of even-row capacity
    ``n_even`` in 1..n and odd-row capacity ``n_odd`` in {n_even, n_even - 1} is
    tried; rows are filled until ``n`` trees have been placed.

    Returns:
        (score, trees): ``score`` is the squared side of the square bounding all
        exterior vertices (in unscaled units) and ``trees`` is the list of
        ChristmasTree objects of the lowest-scoring layout. For ``n < 1`` no
        layout is tried and ``(inf, None)`` is returned.

    The notebook treats these layouts as overlap-free by construction; this
    function itself performs no overlap check.
    """
    pitch = Decimal('0.7')          # horizontal spacing between trees in a row
    row_step = Decimal('1.0')       # vertical spacing between rows of equal parity
    flipped_base = Decimal('0.8')   # y of the first flipped row
    unit = float(scale_factor)

    best_score = float("inf")
    best_trees = None
    for n_even in range(1, n + 1):
        for n_odd in (n_even, n_even - 1):
            layout = []
            remaining = n
            row_idx = 0
            while remaining > 0:
                if row_idx % 2 == 0:
                    capacity, heading, shift = n_even, 0, Decimal('0')
                    row_y = row_idx // 2 * row_step
                else:
                    capacity, heading, shift = n_odd, 180, pitch / 2
                    row_y = flipped_base + (row_idx - 1) // 2 * row_step

                count = min(remaining, capacity)
                remaining -= count
                for col in range(count):
                    layout.append(ChristmasTree(
                        center_x=str(pitch * col + shift),
                        center_y=str(row_y),
                        angle=str(heading),
                    ))
                row_idx += 1

            # Bounding box over every exterior vertex of every tree.
            xs, ys = [], []
            for tree in layout:
                ring = list(tree.polygon.exterior.coords)
                xs.extend(point[0] for point in ring)
                ys.extend(point[1] for point in ring)

            width = (max(xs) - min(xs)) / unit
            height = (max(ys) - min(ys)) / unit
            score = max(width, height) ** 2

            # Strict comparison: the first layout reaching a score is kept on ties.
            if score < best_score:
                best_score, best_trees = score, layout
    return best_score, best_trees

# Test for N=2
score_n2, trees_n2_zaburo = find_best_trees_zaburo(2)
print(f"Zaburo N=2: score={score_n2:.6f}, per-N score={score_n2/2:.6f}")

# Check every pair of trees for a true overlap (intersecting but not merely
# touching). Dedicated names are used for the pair so the exp_004 polygons held
# in p1/p2 are not overwritten, which would silently change what the earlier
# integer-coordinate cells analyse if they were re-run.
for i, tree_i in enumerate(trees_n2_zaburo):
    for j in range(i + 1, len(trees_n2_zaburo)):
        poly_i = tree_i.polygon
        poly_j = trees_n2_zaburo[j].polygon
        if poly_i.intersects(poly_j) and not poly_i.touches(poly_j):
            print(f"OVERLAP between tree {i} and {j}!")
        else:
            print(f"Trees {i} and {j}: OK (no overlap)")

Zaburo N=2: score=1.440000, per-N score=0.720000
Trees 0 and 1: OK (no overlap)


In [8]:
# Compare Zaburo vs baseline for a few N values
print("Comparison: Zaburo row-based vs pre-optimized baseline")
print("="*60)

# Load baseline per-N scores from metrics (json is imported in the import cell).
with open('/home/code/experiments/004_fix_overlaps/metrics.json', 'r') as f:
    metrics = json.load(f)

baseline_per_n = metrics['per_n_scores']

# Tally the outcomes so the closing note reflects what the rows actually show.
compared_count = 0
worse_count = 0
for n in [2, 5, 10, 20, 30, 50, 100, 200]:
    zaburo_score, _ = find_best_trees_zaburo(n)
    zaburo_per_n = zaburo_score / n
    baseline_score = baseline_per_n.get(str(n))
    if baseline_score is None:
        # A missing entry is reported as such rather than compared against a
        # sentinel value, which would produce a meaningless "BETTER" verdict.
        print(f"N={n:3d}: Zaburo={zaburo_per_n:.4f}, Baseline=n/a (missing from metrics)")
        continue
    diff = zaburo_per_n - baseline_score
    status = "WORSE" if diff > 0 else "BETTER" if diff < 0 else "SAME"
    compared_count += 1
    if diff > 0:
        worse_count += 1
    print(f"N={n:3d}: Zaburo={zaburo_per_n:.4f}, Baseline={baseline_score:.4f}, Diff={diff:+.4f} ({status})")

print(f"\nNote: Zaburo is worse for {worse_count}/{compared_count} sampled N but GUARANTEED valid. "
      "Baseline is better there but has overlaps.")

Comparison: Zaburo row-based vs pre-optimized baseline
N=  2: Zaburo=0.7200, Baseline=0.4508, Diff=+0.2692 (WORSE)
N=  5: Zaburo=0.8000, Baseline=0.4168, Diff=+0.3832 (WORSE)
N= 10: Zaburo=0.4840, Baseline=0.3766, Diff=+0.1074 (WORSE)
N= 20: Zaburo=0.4961, Baseline=0.3761, Diff=+0.1201 (WORSE)


N= 30: Zaburo=0.4941, Baseline=0.8947, Diff=-0.4006 (BETTER)


N= 50: Zaburo=0.4802, Baseline=0.3608, Diff=+0.1194 (WORSE)


N=100: Zaburo=0.3969, Baseline=0.3434, Diff=+0.0535 (WORSE)


N=200: Zaburo=0.4050, Baseline=0.3375, Diff=+0.0675 (WORSE)

Note: Zaburo is worse but GUARANTEED valid. Baseline is better but has overlaps.


In [None]:
# STRATEGY DECISION:
# 
# The pre-optimized baseline (70.6) has subtle overlaps that Kaggle rejects.
# Our overlap detection doesn't match Kaggle's exactly.
# 
# OPTIONS:
# 1. Keep trying to fix overlaps in baseline - RISKY (may never match Kaggle)
# 2. Use Zaburo's row-based approach (88.33) - SAFE but worse score
# 3. Implement simulated annealing FROM SCRATCH starting from Zaburo - BEST
#
# The path forward:
# 1. First, submit Zaburo's approach to verify it's accepted (should be ~88.33)
# 2. Then implement SA to optimize from there
# 3. SA can improve Zaburo's 88.33 toward 70.6 and beyond
#
# This is the SAFEST path because:
# - Zaburo's approach is GUARANTEED valid (no overlaps by construction)
# - SA can be implemented in pure Python (no binaries)
# - We can verify each step is valid before proceeding

# Print the recommended plan: a heading followed by one numbered step per line.
plan_steps = (
    "1. Implement Zaburo's row-based approach (exp_005)",
    "2. Submit to verify Kaggle accepts it (~88.33 expected)",
    "3. Implement simulated annealing from scratch (exp_006+)",
    "4. Start SA from Zaburo's valid solution",
    "5. Gradually improve toward target 68.89",
)
print("RECOMMENDED STRATEGY:")
for step in plan_steps:
    print(step)