# Loop 2 Analysis: Overlap Validation Issue

Both submissions have been rejected by Kaggle due to overlapping trees:
- exp_000: Overlapping trees in group 002
- exp_001: Overlapping trees in group 151

Our local validation says 0 overlaps, but Kaggle disagrees. This is a CRITICAL issue.

**Hypothesis:** Kaggle uses stricter floating-point precision than our Shapely-based validation.

In [None]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
from decimal import Decimal, getcontext
import json

# Raise the decimal module's working precision to 50 significant digits;
# this governs the Decimal multiplication done when coordinates are scaled
# to integers further down in this file.
getcontext().prec = 50  # High precision

# Tree polygon vertices
# TX[i] / TY[i] are the x / y coordinates of the i-th outline vertex
# (15 vertices in total). The two lists are paired with zip() wherever a
# polygon is built, before any rotation or translation is applied.
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

def parse_value(val):
    """Convert a submission cell to float, dropping a leading 's' from strings.

    String values that start with 's' have that first character removed
    before conversion; every other value is handed to float() unchanged.
    """
    has_prefix = isinstance(val, str) and val.startswith('s')
    return float(val[1:] if has_prefix else val)

def get_tree_polygon(x, y, angle):
    """Return the tree outline rotated by angle about (0, 0), then moved by (x, y).

    The outline comes from the module-level TX / TY vertex lists. The angle
    is passed to affinity.rotate as-is (callers in this file pass the 'deg'
    column, and shapely's rotate defaults to degrees).
    """
    outline = Polygon(list(zip(TX, TY)))
    # Rotate first, around the outline's own origin, so the translation
    # afterwards places the already-oriented tree at its final position.
    rotated = affinity.rotate(outline, angle, origin=(0, 0))
    return affinity.translate(rotated, x, y)

# Confirmation that this cell (imports, constants and helper functions) ran
print("Functions loaded")

In [None]:
# Read the submission file and keep only the rows of the N=151 group
# (ids that start with '151_'), then show how many there are and a preview.
df = pd.read_csv('/home/submission/submission.csv')
in_group_151 = df['id'].str.startswith('151_')
df_151 = df[in_group_151]
print(f"N=151 has {len(df_151)} trees")
print(df_151.head())

In [None]:
# Parse every N=151 row into an (x, y, deg) tuple of floats
trees_151 = [
    (parse_value(row['x']), parse_value(row['y']), parse_value(row['deg']))
    for _, row in df_151.iterrows()
]

# Build one polygon per parsed tree
polygons_151 = [get_tree_polygon(*tree) for tree in trees_151]
print(f"Created {len(polygons_151)} polygons")

In [None]:
# Pairwise overlap check driven purely by intersection area
def check_overlap_strict(polygons, tolerance=0):
    """Return (i, j, area) for each pair whose intersection area exceeds tolerance.

    Every unordered pair i < j is examined once. Pairs that do not
    intersect at all are skipped before the intersection is computed.
    """
    found = []
    for i, first in enumerate(polygons):
        for j in range(i + 1, len(polygons)):
            second = polygons[j]
            if not first.intersects(second):
                continue
            # A zero-area intersection (shared point or edge) is only a touch
            shared_area = first.intersection(second).area
            if shared_area > tolerance:
                found.append((i, j, shared_area))
    return found

# Re-run the strict check at progressively smaller area tolerances and
# list the offending pairs whenever there are only a handful of them.
for tol in (1e-12, 1e-15, 1e-18, 0):
    overlaps = check_overlap_strict(polygons_151, tol)
    print(f"Tolerance {tol}: {len(overlaps)} overlaps")
    if 0 < len(overlaps) <= 5:
        for i, j, area in overlaps:
            print(f"  Trees {i} and {j}: area={area:.2e}")

In [None]:
# Let's check if any polygons are EXACTLY touching (which should be OK)
# vs actually overlapping
from shapely.ops import unary_union

def detailed_overlap_check(polygons):
    """Report pairs whose intersection has positive area, with its geometry type.

    Returns a list of (i, j, geom_type, area) tuples for i < j.
    Intersections that are not areal (for example 'Point', 'LineString'
    or 'MultiLineString') count as touches and are not reported. For a
    'GeometryCollection' only the first member that is a polygon with
    positive area is reported, using that member's own type and area.
    """
    areal_types = ('Polygon', 'MultiPolygon')
    found = []
    count = len(polygons)
    for i in range(count):
        for j in range(i + 1, count):
            if not polygons[i].intersects(polygons[j]):
                continue
            shared = polygons[i].intersection(polygons[j])
            kind = shared.geom_type
            if kind in areal_types:
                if shared.area > 0:
                    found.append((i, j, kind, shared.area))
            elif kind == 'GeometryCollection':
                # Mixed result: look for an areal piece among the members
                for part in shared.geoms:
                    if part.geom_type in areal_types and part.area > 0:
                        found.append((i, j, part.geom_type, part.area))
                        break
            # Any other geometry type is a touch and is deliberately ignored
    return found

# Run the detailed check on the N=151 polygons and print at most the first
# ten reported pairs together with the geometry type and area of the overlap.
issues = detailed_overlap_check(polygons_151)
print(f"Found {len(issues)} actual overlaps (not just touches)")
for i, j, gtype, area in issues[:10]:
    print(f"  Trees {i} and {j}: {gtype}, area={area:.2e}")

In [None]:
# The issue might be floating point precision in the submission file itself
# Let's check if Kaggle uses integer-scaled coordinates

# According to the strategy, Kaggle might use SCALE = 10**18 for integer coordinates
# NOTE(review): this factor is an assumption, not a confirmed property of
# Kaggle's checker — TODO confirm against the official metric code.
SCALE = 10**18

def get_tree_polygon_int(x, y, angle):
    """Build the tree polygon from vertices scaled by SCALE and cut to integers.

    Each TX / TY vertex is rotated by angle (converted from degrees with
    math.radians) and shifted by (x, y) in ordinary float arithmetic. The
    result is then turned into a Decimal via its str() form, multiplied by
    SCALE, and truncated toward zero with int().

    NOTE(review): the rotation and translation happen in float before the
    Decimal step, so the scaling cannot restore digits already lost there.
    NOTE(review): shapely presumably stores coordinates as 64-bit floats,
    in which case integers near 1e18 are not all exactly representable
    once they are inside the Polygon — TODO confirm.
    """
    import math

    theta = math.radians(angle)
    cos_t, sin_t = math.cos(theta), math.sin(theta)

    scaled_vertices = []
    for tx, ty in zip(TX, TY):
        # Rotate about the origin, then translate to the tree position
        moved_x = tx * cos_t - ty * sin_t + x
        moved_y = tx * sin_t + ty * cos_t + y
        # Scale to integer (int() truncates toward zero)
        scaled_vertices.append(
            (
                int(Decimal(str(moved_x)) * SCALE),
                int(Decimal(str(moved_y)) * SCALE),
            )
        )

    return Polygon(scaled_vertices)

# Build the integer-scaled counterpart of every N=151 tree
polygons_151_int = [get_tree_polygon_int(*tree) for tree in trees_151]
print(f"Created {len(polygons_151_int)} integer-scaled polygons")

In [None]:
# Check overlaps with integer-scaled polygons
# check_overlap_strict with its default tolerance of 0 performs the same
# pairwise "intersects, then intersection area > 0" scan, so it is reused
# here instead of spelling the double loop out again.
overlaps_int = check_overlap_strict(polygons_151_int)

print(f"Integer-scaled overlaps: {len(overlaps_int)}")
for i, j, area in overlaps_int[:10]:
    # Areas scale with the square of the coordinate factor, so divide by
    # SCALE twice to get back to the original units
    original_area = area / (SCALE * SCALE)
    print(f"  Trees {i} and {j}: area={original_area:.2e} (scaled: {area:.2e})")

In [None]:
# Repeat the investigation for the N=2 group, the first one that was rejected
df_2 = df[df['id'].str.startswith('002_')]
print(f"N=2 has {len(df_2)} trees")

# Parse each row into an (x, y, deg) tuple of floats
trees_2 = [
    (parse_value(row['x']), parse_value(row['y']), parse_value(row['deg']))
    for _, row in df_2.iterrows()
]

print(f"Tree 0: x={trees_2[0][0]}, y={trees_2[0][1]}, deg={trees_2[0][2]}")
print(f"Tree 1: x={trees_2[1][0]}, y={trees_2[1][1]}, deg={trees_2[1][2]}")

# Build both polygons and report how (if at all) they intersect
poly_0 = get_tree_polygon(*trees_2[0])
poly_1 = get_tree_polygon(*trees_2[1])

pair_intersects = poly_0.intersects(poly_1)
print(f"\nIntersects: {pair_intersects}")
if pair_intersects:
    intersection = poly_0.intersection(poly_1)
    print(f"Intersection type: {intersection.geom_type}")
    print(f"Intersection area: {intersection.area:.2e}")

In [None]:
# Let's look at all snapshots and find ones that might be truly valid
import os
import glob

snapshot_dir = '/home/nonroot/snapshots/santa-2025/'
snapshots = sorted(os.listdir(snapshot_dir))
print(f"Found {len(snapshots)} snapshots")

# Check the last five snapshots in sorted-name order for overlaps
# NOTE(review): treating these as the most recent ones assumes snapshot
# names sort chronologically — TODO confirm.
for snap in snapshots[-5:]:
    sub_path = f"{snapshot_dir}{snap}/submission/submission.csv"
    if not os.path.exists(sub_path):
        continue
    df_snap = pd.read_csv(sub_path)
    # Quick check of the two groups that were rejected: N=2 and N=151
    for n in (2, 151):
        df_n = df_snap[df_snap['id'].str.startswith(f'{n:03d}_')]
        if len(df_n) != n:
            # Group missing or incomplete in this snapshot: nothing to report
            continue
        trees = [(parse_value(r['x']), parse_value(r['y']), parse_value(r['deg'])) for _, r in df_n.iterrows()]
        polys = [get_tree_polygon(*t) for t in trees]
        poly_count = len(polys)
        # any() stops at the first pair whose intersection area exceeds 1e-15
        has_overlap = any(
            polys[i].intersects(polys[j])
            and polys[i].intersection(polys[j]).area > 1e-15
            for i in range(poly_count)
            for j in range(i + 1, poly_count)
        )
        status = "OVERLAP" if has_overlap else "OK"
        print(f"{snap} N={n}: {status}")

In [None]:
# Let's check the ORIGINAL competition data - maybe there's a reference solution
# that's guaranteed to be valid
import os

data_dir = '/home/nonroot/data/santa-2025/'
print("Data directory contents:")
for entry in os.listdir(data_dir):
    print(f"  {entry}")

# If a sample submission is present, load it and show its size and first rows
sample_path = f"{data_dir}sample_submission.csv"
sample_exists = os.path.exists(sample_path)
if sample_exists:
    df_sample = pd.read_csv(sample_path)
    print(f"\nSample submission: {len(df_sample)} rows")
    print(df_sample.head())

In [None]:
# The sample submission should be valid - let's check its score
# (loaded unconditionally here; this raises if the file does not exist)
df_sample = pd.read_csv('/home/nonroot/data/santa-2025/sample_submission.csv')

def calculate_score(df):
    """Sum side**2 / n over every group n in 1..200 that has exactly n rows.

    For each such group, side is the longer edge of the axis-aligned
    bounding box around all tree polygon vertices. Groups whose row count
    is not n are skipped and contribute nothing to the total.
    """
    total = 0
    for n in range(1, 201):
        group = df[df['id'].str.startswith(f'{n:03d}_')]
        if len(group) != n:
            continue
        trees = [(parse_value(r['x']), parse_value(r['y']), parse_value(r['deg'])) for _, r in group.iterrows()]
        polys = [get_tree_polygon(*t) for t in trees]
        # Stack every exterior vertex of every tree into one (m, 2) array
        vertices = np.array([pt for poly in polys for pt in poly.exterior.coords])
        lows = vertices.min(axis=0)
        highs = vertices.max(axis=0)
        width = highs[0] - lows[0]
        height = highs[1] - lows[1]
        total += max(width, height) ** 2 / n
    return total

# Score the sample submission and print it with six decimal places
sample_score = calculate_score(df_sample)
print(f"Sample submission score: {sample_score:.6f}")

In [None]:
# The sample submission is likely very bad (trees not optimized)
# Let's verify it has no overlaps at least
def count_overlaps_all_n(df):
    """Return (count, ns): the groups in 1..200 that contain an overlapping pair.

    Only groups with exactly n rows are examined. A group is flagged as
    soon as one pair of its polygons has an intersection area above 1e-15.
    ns lists the flagged n values in increasing order; count is its length.
    """
    flagged = []
    for n in range(1, 201):
        group = df[df['id'].str.startswith(f'{n:03d}_')]
        if len(group) != n:
            continue
        trees = [(parse_value(r['x']), parse_value(r['y']), parse_value(r['deg'])) for _, r in group.iterrows()]
        polys = [get_tree_polygon(*t) for t in trees]
        poly_count = len(polys)
        # any() short-circuits on the first offending pair
        overlapping = any(
            polys[i].intersects(polys[j])
            and polys[i].intersection(polys[j]).area > 1e-15
            for i in range(poly_count)
            for j in range(i + 1, poly_count)
        )
        if overlapping:
            flagged.append(n)
    return len(flagged), flagged

# Count the groups of the sample submission that contain an overlap and,
# if there are any, show up to the first 20 affected N values.
overlap_count, overlap_ns = count_overlaps_all_n(df_sample)
print(f"Sample submission overlaps: {overlap_count}")
if overlap_ns:
    # The trailing "..." is printed regardless of how many values are listed
    print(f"Overlapping N values: {overlap_ns[:20]}...")