# Loop 2 Analysis: Overlap Validation Issue

Both submissions have been rejected by Kaggle due to overlapping trees:
- exp_000: Overlapping trees in group 002
- exp_001: Overlapping trees in group 151

Our local validation says 0 overlaps, but Kaggle disagrees. This is a CRITICAL issue.

**Hypothesis:** Kaggle uses stricter floating-point precision than our Shapely-based validation.

In [11]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
from decimal import Decimal, getcontext
import json

# Raise the precision of the process-wide decimal context used by Decimal arithmetic below.
getcontext().prec = 50  # High precision

# Tree polygon vertices: 15 (x, y) pairs split into parallel lists, given relative
# to the tree's own origin. The outline starts at the tip (0, 0.8), runs down the
# right-hand side to the trunk bottom (y = -0.2) and back up the left-hand side.
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

def parse_value(val):
    """Convert one submission cell to a float.

    Strings that begin with the 's' marker have that single leading character
    removed before conversion; anything else is handed to ``float`` unchanged.
    """
    has_prefix = isinstance(val, str) and val.startswith('s')
    return float(val[1:] if has_prefix else val)

def get_tree_polygon(x, y, angle):
    """Build the tree outline placed at (x, y) with the given rotation.

    The base outline comes from the TX/TY vertex lists. It is rotated about
    the origin by ``angle`` (passed straight to ``affinity.rotate``) and then
    shifted by (x, y).
    """
    outline = Polygon(list(zip(TX, TY)))
    rotated = affinity.rotate(outline, angle, origin=(0, 0))
    return affinity.translate(rotated, x, y)

print("Functions loaded")

Functions loaded


In [12]:
# Load the full submission and keep only the rows whose id starts with '151_'
# (the group Kaggle reported as overlapping for exp_001).
df = pd.read_csv('/home/submission/submission.csv')
df_151 = df[df['id'].str.startswith('151_')]
print(f"N=151 has {len(df_151)} trees")
print(df_151.head())

N=151 has 151 trees
          id                      x                     y                  deg
11325  151_0    s0.1504631577805953   s-2.396892766865921   s248.9868427979508
11326  151_1    s2.4617789867642976  s-3.4481416876556676  s248.38996071043908
11327  151_2  s-0.10076544576112685  s-1.9437379854085977   s68.38959410162349
11328  151_3    s2.4580209657898555   s-2.361663274867279  s248.68713989629498
11329  151_4    s0.1407460033729936  s-1.3173379929270548  s249.49303925448473


In [13]:
# Create polygons for N=151
# First parse each row into an (x, y, deg) float tuple; trees_151 is reused by later cells.
trees_151 = []
for _, row in df_151.iterrows():
    x = parse_value(row['x'])
    y = parse_value(row['y'])
    deg = parse_value(row['deg'])
    trees_151.append((x, y, deg))

# One Shapely polygon per tree, in the same order as trees_151.
polygons_151 = [get_tree_polygon(x, y, deg) for x, y, deg in trees_151]
print(f"Created {len(polygons_151)} polygons")

Created 151 polygons


In [14]:
# Check for overlaps with VERY strict tolerance
def check_overlap_strict(polygons, tolerance=0):
    """Return every pair of polygons whose shared area exceeds ``tolerance``.

    Each result is ``(i, j, area)`` with ``i < j``. Pairs that intersect but
    whose intersection area is not greater than ``tolerance`` (for example
    pure boundary contact) are left out.
    """
    found = []
    for i, first in enumerate(polygons):
        for j in range(i + 1, len(polygons)):
            second = polygons[j]
            if not first.intersects(second):
                continue
            # The intersection is only computed for pairs that intersect.
            shared_area = first.intersection(second).area
            if shared_area > tolerance:
                found.append((i, j, shared_area))
    return found

# Test with different tolerances
# Progressively smaller area thresholds, ending with 0 (any positive area counts).
for tol in [1e-12, 1e-15, 1e-18, 0]:
    overlaps = check_overlap_strict(polygons_151, tol)
    print(f"Tolerance {tol}: {len(overlaps)} overlaps")
    # Only list the individual pairs when there are few enough to read.
    if overlaps and len(overlaps) <= 5:
        for i, j, area in overlaps:
            print(f"  Trees {i} and {j}: area={area:.2e}")

Tolerance 1e-12: 0 overlaps
Tolerance 1e-15: 0 overlaps


Tolerance 1e-18: 0 overlaps


Tolerance 0: 0 overlaps


In [15]:
# Let's check if any polygons are EXACTLY touching (which should be OK)
# vs actually overlapping
from shapely.ops import unary_union

def detailed_overlap_check(polygons):
    """Report pairs whose intersection has a positive-area component.

    Point and line intersections are treated as harmless contact. A Polygon
    or MultiPolygon intersection with positive area is reported. For a
    GeometryCollection only the first positive-area polygonal member is
    reported. Each entry is ``(i, j, geom_type, area)`` with ``i < j``.
    """
    areal = ('Polygon', 'MultiPolygon')
    issues = []
    count = len(polygons)
    for i in range(count):
        for j in range(i + 1, count):
            if not polygons[i].intersects(polygons[j]):
                continue
            shared = polygons[i].intersection(polygons[j])
            kind = shared.geom_type
            if kind in areal:
                candidates = [shared]
            elif kind == 'GeometryCollection':
                candidates = shared.geoms
            else:
                # Point / LineString / MultiLineString (and anything else): a touch.
                continue
            for part in candidates:
                if part.geom_type in areal and part.area > 0:
                    issues.append((i, j, part.geom_type, part.area))
                    break
    return issues

# Run the detailed check on the N=151 polygons and show at most the first ten findings.
issues = detailed_overlap_check(polygons_151)
print(f"Found {len(issues)} actual overlaps (not just touches)")
for i, j, gtype, area in issues[:10]:
    print(f"  Trees {i} and {j}: {gtype}, area={area:.2e}")

Found 0 actual overlaps (not just touches)


In [16]:
# The issue might be floating point precision in the submission file itself
# Let's check if Kaggle uses integer-scaled coordinates

# According to the strategy, Kaggle might use SCALE = 10**18 for integer coordinates
SCALE = 10**18

def get_tree_polygon_int(x, y, angle):
    """Build the tree polygon from integer coordinates scaled by SCALE.

    The TX/TY vertices are rotated by ``angle`` degrees and shifted by (x, y)
    in ordinary float arithmetic. Each coordinate is then converted through
    ``Decimal(str(...))``, multiplied by SCALE and truncated with ``int``.

    NOTE(review): the rotation and translation still happen in floats, and the
    integers are handed to Shapely's ``Polygon``. Presumably Shapely stores
    coordinates as doubles, so this may not add real precision. TODO confirm.
    """
    import math

    theta = math.radians(angle)
    cos_a, sin_a = math.cos(theta), math.sin(theta)

    def to_scaled_int(value):
        # str() yields the shortest float repr; Decimal keeps the product exact.
        return int(Decimal(str(value)) * SCALE)

    scaled = []
    for tx, ty in zip(TX, TY):
        # Rotate about the origin, then translate to the tree's position.
        px = (tx * cos_a - ty * sin_a) + x
        py = (tx * sin_a + ty * cos_a) + y
        scaled.append((to_scaled_int(px), to_scaled_int(py)))

    return Polygon(scaled)

# Create integer-scaled polygons
polygons_151_int = [get_tree_polygon_int(x, y, deg) for x, y, deg in trees_151]
print(f"Created {len(polygons_151_int)} integer-scaled polygons")

Created 151 integer-scaled polygons


In [17]:
# Check overlaps with integer-scaled polygons
# Same pairwise scan as check_overlap_strict with tolerance 0, on the scaled polygons.
overlaps_int = []
for i in range(len(polygons_151_int)):
    for j in range(i+1, len(polygons_151_int)):
        if polygons_151_int[i].intersects(polygons_151_int[j]):
            intersection = polygons_151_int[i].intersection(polygons_151_int[j])
            if intersection.area > 0:
                overlaps_int.append((i, j, intersection.area))

print(f"Integer-scaled overlaps: {len(overlaps_int)}")
for i, j, area in overlaps_int[:10]:
    # Convert area back to original scale
    # (both axes were multiplied by SCALE, so area grew by SCALE squared)
    original_area = area / (SCALE * SCALE)
    print(f"  Trees {i} and {j}: area={original_area:.2e} (scaled: {area:.2e})")

Integer-scaled overlaps: 0


In [18]:
# Let's also check N=2 which was the first rejection
# (exp_000 was rejected for overlapping trees in group 002)
df_2 = df[df['id'].str.startswith('002_')]
print(f"N=2 has {len(df_2)} trees")

# Parse the two rows into (x, y, deg) float tuples.
trees_2 = []
for _, row in df_2.iterrows():
    x = parse_value(row['x'])
    y = parse_value(row['y'])
    deg = parse_value(row['deg'])
    trees_2.append((x, y, deg))

print(f"Tree 0: x={trees_2[0][0]}, y={trees_2[0][1]}, deg={trees_2[0][2]}")
print(f"Tree 1: x={trees_2[1][0]}, y={trees_2[1][1]}, deg={trees_2[1][2]}")

# Check overlap
poly_0 = get_tree_polygon(*trees_2[0])
poly_1 = get_tree_polygon(*trees_2[1])

print(f"\nIntersects: {poly_0.intersects(poly_1)}")
# Only describe the intersection when the two polygons actually intersect.
if poly_0.intersects(poly_1):
    intersection = poly_0.intersection(poly_1)
    print(f"Intersection type: {intersection.geom_type}")
    print(f"Intersection area: {intersection.area:.2e}")

N=2 has 2 trees
Tree 0: x=0.1540970696213559, y=-0.03854074269479465, deg=203.62937773065684
Tree 1: x=-0.15409706962137285, y=-0.5614592573052241, deg=23.629377730656792

Intersects: False


In [None]:
# Let's look at all snapshots and find ones that might be truly valid
import os
import glob

snapshot_dir = '/home/nonroot/snapshots/santa-2025/'
snapshots = sorted(os.listdir(snapshot_dir))
print(f"Found {len(snapshots)} snapshots")

# Spot-check the last five snapshots (by sorted name) for overlaps in N=2 and N=151.
for snap in snapshots[-5:]:
    sub_path = f"{snapshot_dir}{snap}/submission/submission.csv"
    if not os.path.exists(sub_path):
        continue
    df_snap = pd.read_csv(sub_path)
    for n in [2, 151]:
        df_n = df_snap[df_snap['id'].str.startswith(f'{n:03d}_')]
        if len(df_n) != n:
            # Incomplete group: nothing is reported for it.
            continue
        trees = [(parse_value(r['x']), parse_value(r['y']), parse_value(r['deg'])) for _, r in df_n.iterrows()]
        polys = [get_tree_polygon(*t) for t in trees]
        # any() stops at the first pair whose shared area exceeds 1e-15.
        has_overlap = any(
            polys[i].intersects(polys[j])
            and polys[i].intersection(polys[j]).area > 1e-15
            for i in range(len(polys))
            for j in range(i + 1, len(polys))
        )
        status = "OVERLAP" if has_overlap else "OK"
        print(f"{snap} N={n}: {status}")

In [None]:
# Let's check the ORIGINAL competition data - maybe there's a reference solution
# that's guaranteed to be valid
import os

data_dir = '/home/nonroot/data/santa-2025/'
print("Data directory contents:")
for entry in os.listdir(data_dir):
    print(f"  {entry}")

# Load the sample submission when the competition data ships one.
sample_path = f"{data_dir}sample_submission.csv"
if os.path.exists(sample_path):
    df_sample = pd.read_csv(sample_path)
    print(f"\nSample submission: {len(df_sample)} rows")
    print(df_sample.head())

In [None]:
# The sample submission should be valid - let's check its score
df_sample = pd.read_csv('/home/nonroot/data/santa-2025/sample_submission.csv')

def calculate_score(df):
    """Sum side**2 / n over the groups n = 1..200 of a submission frame.

    For each n, the rows whose id starts with the zero-padded 'NNN_' prefix
    are turned into tree polygons. ``side`` is the larger of the width and
    height of the axis-aligned box around all polygon vertices. Groups that
    do not contain exactly n rows contribute nothing.
    """
    total = 0
    for n in range(1, 201):
        group = df[df['id'].str.startswith(f'{n:03d}_')]
        if len(group) != n:
            continue
        vertices = []
        for _, r in group.iterrows():
            tree = get_tree_polygon(parse_value(r['x']), parse_value(r['y']), parse_value(r['deg']))
            vertices.extend(list(tree.exterior.coords))
        pts = np.array(vertices)
        lower = pts.min(axis=0)
        upper = pts.max(axis=0)
        width, height = upper[0] - lower[0], upper[1] - lower[1]
        total += (max(width, height) ** 2) / n
    return total

sample_score = calculate_score(df_sample)
print(f"Sample submission score: {sample_score:.6f}")

In [None]:
# The sample submission is likely very bad (trees not optimized)
# Let's verify it has no overlaps at least
def count_overlaps_all_n(df):
    """Find the groups n = 1..200 that contain at least one overlapping pair.

    A pair counts as overlapping when its intersection area exceeds 1e-15.
    Groups without exactly n rows are skipped. Returns ``(count, ns)`` where
    ``ns`` lists the offending n values in ascending order and ``count`` is
    its length.
    """
    overlap_ns = []
    for n in range(1, 201):
        df_n = df[df['id'].str.startswith(f'{n:03d}_')]
        if len(df_n) != n:
            continue
        trees = [(parse_value(r['x']), parse_value(r['y']), parse_value(r['deg'])) for _, r in df_n.iterrows()]
        polys = [get_tree_polygon(*t) for t in trees]
        # Stop scanning the group as soon as one overlapping pair is found.
        collides = any(
            polys[i].intersects(polys[j])
            and polys[i].intersection(polys[j]).area > 1e-15
            for i in range(len(polys))
            for j in range(i + 1, len(polys))
        )
        if collides:
            overlap_ns.append(n)
    return len(overlap_ns), overlap_ns

overlap_count, overlap_ns = count_overlaps_all_n(df_sample)
print(f"Sample submission overlaps: {overlap_count}")
if overlap_ns:
    print(f"Overlapping N values: {overlap_ns[:20]}...")

In [19]:
# Let's check if the 's' prefix is causing issues
# Maybe Kaggle interprets the values differently

# Check raw values in the CSV
# (first N=151 row exactly as pandas loaded it, 's' prefix included)
print("Raw values from CSV:")
print(df_151.iloc[0])

# Check if there are any parsing issues
# by printing each raw cell next to the float parse_value produces from it.
print("\nParsed values:")
for col in ['x', 'y', 'deg']:
    raw = df_151.iloc[0][col]
    parsed = parse_value(raw)
    print(f"  {col}: raw='{raw}' -> parsed={parsed}")

Raw values from CSV:
id                   151_0
x      s0.1504631577805953
y      s-2.396892766865921
deg     s248.9868427979508
Name: 11325, dtype: object

Parsed values:
  x: raw='s0.1504631577805953' -> parsed=0.1504631577805953
  y: raw='s-2.396892766865921' -> parsed=-2.396892766865921
  deg: raw='s248.9868427979508' -> parsed=248.9868427979508


In [20]:
# Let's check if the issue is with the 's' prefix format
# Maybe Kaggle expects plain numbers without 's' prefix

# Check if any values DON'T have 's' prefix.
# Compare on the string form of every cell. If a whole column lacks the prefix,
# pandas loads it as a numeric dtype, and an isinstance(val, str) test would
# skip exactly the values this check is looking for.
no_s_values = []
for col in ['x', 'y', 'deg']:
    as_text = df[col].astype(str)
    no_s_values.extend(as_text[~as_text.str.startswith('s')].tolist())

# Count every offender, so the total below is a real total rather than a
# figure capped by an early exit from the loop.
no_s_count = len(no_s_values)

# Show only the first few offenders to keep the output readable.
for val in no_s_values[:6]:
    print(f"Found value without 's': {val}")

print(f"\nTotal values without 's' prefix: {no_s_count}")


Total values without 's' prefix: 0


In [21]:
# Let's look at the competition data to understand the expected format
import os
import glob

# Find any reference files
# Candidate locations for competition data; ones that don't exist are skipped.
data_paths = [
    '/home/nonroot/data/',
    '/home/code/data/',
    '/kaggle/input/',
]

for path in data_paths:
    if os.path.exists(path):
        print(f"\n{path}:")
        # Walk the whole tree but print at most ten files per directory.
        for root, dirs, files in os.walk(path):
            for f in files[:10]:
                print(f"  {os.path.join(root, f)}")

In [22]:
# Let's check the research kernels to see how they format submissions
import os

# List the entries of the research-kernel directory (nothing is printed if it is absent).
kernel_dir = '/home/code/research/kernels/'
if os.path.exists(kernel_dir):
    print("Available kernels:")
    for d in os.listdir(kernel_dir):
        print(f"  {d}")

Available kernels:
  jonathanchan_santa25-ensemble-sa-fractional-translation
  saspav_santa-submission
  jazivxt_why-not
  yongsukprasertsuk_santa-2025-best-keeping-bbox3-runner
  smartmanoj_santa-claude
  metadata.json
  inversion_santa-2025-getting-started


In [24]:
# Let's check if there's a difference in how Kaggle validates vs our validation
# The key insight from the getting-started kernel is that it uses scale_factor = 1e15
# and Decimal precision for all calculations

from decimal import Decimal, getcontext
# NOTE: this lowers the process-wide decimal precision, which was set to 50 at the top of the notebook.
getcontext().prec = 25
scale_factor = Decimal('1e15')

# Tree vertices using Decimal
# Dimensions of the tree: full widths of the trunk and the three tiers,
# and the y-levels at which each tier starts.
trunk_w = Decimal('0.15')
trunk_h = Decimal('0.2')
base_w = Decimal('0.7')
mid_w = Decimal('0.4')
top_w = Decimal('0.25')
tip_y = Decimal('0.8')
tier_1_y = Decimal('0.5')
tier_2_y = Decimal('0.25')
base_y = Decimal('0.0')
trunk_bottom_y = -trunk_h

# Unscaled outline, from the tip down the right-hand side and back up the left.
# Each tier contributes its outer corner (width / 2) and inner notch (width / 4).
TREE_VERTICES_DECIMAL = [
    (Decimal('0.0'), tip_y),
    (top_w / Decimal('2'), tier_1_y),
    (top_w / Decimal('4'), tier_1_y),
    (mid_w / Decimal('2'), tier_2_y),
    (mid_w / Decimal('4'), tier_2_y),
    (base_w / Decimal('2'), base_y),
    (trunk_w / Decimal('2'), base_y),
    (trunk_w / Decimal('2'), trunk_bottom_y),
    (-(trunk_w / Decimal('2')), trunk_bottom_y),
    (-(trunk_w / Decimal('2')), base_y),
    (-(base_w / Decimal('2')), base_y),
    (-(mid_w / Decimal('4')), tier_2_y),
    (-(mid_w / Decimal('2')), tier_2_y),
    (-(top_w / Decimal('4')), tier_1_y),
    (-(top_w / Decimal('2')), tier_1_y),
]

# Print the vertices so they can be compared by eye with the float TX/TY lists.
print("Tree vertices (Decimal):")
for i, (x, y) in enumerate(TREE_VERTICES_DECIMAL):
    print(f"  {i}: ({x}, {y})")

Tree vertices (Decimal):
  0: (0.0, 0.8)
  1: (0.125, 0.5)
  2: (0.0625, 0.5)
  3: (0.2, 0.25)
  4: (0.1, 0.25)
  5: (0.35, 0.0)
  6: (0.075, 0.0)
  7: (0.075, -0.2)
  8: (-0.075, -0.2)
  9: (-0.075, 0.0)
  10: (-0.35, 0.0)
  11: (-0.1, 0.25)
  12: (-0.2, 0.25)
  13: (-0.0625, 0.5)
  14: (-0.125, 0.5)


In [25]:
# Let's create a Kaggle-compatible validation function
import math

def get_tree_polygon_kaggle(center_x, center_y, angle):
    """Build a tree polygon in coordinates scaled by 1e15.

    The outline is defined with Decimal values, multiplied by the scale
    factor, turned into a Shapely polygon, rotated about the origin by
    ``angle`` and translated by the scaled centre. The rotation angle and the
    offsets are converted to float before being passed to Shapely.

    Side effect: sets the process-wide decimal precision to 25 on every call.

    NOTE(review): intended to mirror the getting-started kernel's construction;
    that equivalence cannot be confirmed from this notebook alone.
    """
    from decimal import Decimal, getcontext
    getcontext().prec = 25
    scale_factor = Decimal('1e15')

    # Inputs go through str() so the Decimal matches the float's printed value.
    cx, cy, ang = (Decimal(str(v)) for v in (center_x, center_y, angle))

    # Tree dimensions
    trunk_w = Decimal('0.15')
    trunk_h = Decimal('0.2')
    base_w = Decimal('0.7')
    mid_w = Decimal('0.4')
    top_w = Decimal('0.25')
    tip_y = Decimal('0.8')
    tier_1_y = Decimal('0.5')
    tier_2_y = Decimal('0.25')
    base_y = Decimal('0.0')
    trunk_bottom_y = -trunk_h

    two, four = Decimal('2'), Decimal('4')
    outline = [
        (Decimal('0.0'), tip_y),
        (top_w / two, tier_1_y),
        (top_w / four, tier_1_y),
        (mid_w / two, tier_2_y),
        (mid_w / four, tier_2_y),
        (base_w / two, base_y),
        (trunk_w / two, base_y),
        (trunk_w / two, trunk_bottom_y),
        (-(trunk_w / two), trunk_bottom_y),
        (-(trunk_w / two), base_y),
        (-(base_w / two), base_y),
        (-(mid_w / four), tier_2_y),
        (-(mid_w / two), tier_2_y),
        (-(top_w / four), tier_1_y),
        (-(top_w / two), tier_1_y),
    ]
    vertices = [(vx * scale_factor, vy * scale_factor) for vx, vy in outline]

    # Rotate first (about the origin), then move to the scaled centre.
    rotated = affinity.rotate(Polygon(vertices), float(ang), origin=(0, 0))
    return affinity.translate(
        rotated,
        xoff=float(cx * scale_factor),
        yoff=float(cy * scale_factor),
    )

# Test with N=151 trees
print("Testing Kaggle-compatible validation for N=151...")
kaggle_polys_151 = [get_tree_polygon_kaggle(x, y, deg) for x, y, deg in trees_151]
print(f"Created {len(kaggle_polys_151)} Kaggle-compatible polygons")

Testing Kaggle-compatible validation for N=151...
Created 151 Kaggle-compatible polygons


In [26]:
# Check for overlaps using Kaggle's method
def check_overlaps_kaggle(polygons):
    """Return index pairs ``(i, j)``, ``i < j``, that intersect without merely touching.

    A pair is reported when ``intersects`` is true and ``touches`` is false.
    NOTE(review): assumed to match the predicate Kaggle's metric applies;
    verify against the metric source.
    """
    total = len(polygons)
    return [
        (i, j)
        for i in range(total)
        for j in range(i + 1, total)
        if polygons[i].intersects(polygons[j]) and not polygons[i].touches(polygons[j])
    ]

# Apply the intersects-and-not-touches predicate to the scaled N=151 polygons.
kaggle_overlaps_151 = check_overlaps_kaggle(kaggle_polys_151)
print(f"Kaggle-compatible overlaps for N=151: {len(kaggle_overlaps_151)}")
if kaggle_overlaps_151:
    print(f"Overlapping pairs: {kaggle_overlaps_151[:10]}")

Kaggle-compatible overlaps for N=151: 0


In [27]:
# Let's check if the issue is with how we're reading the CSV
# Maybe the 's' prefix parsing is causing precision loss

# Read raw CSV and check exact values
# (plain text read, bypassing pandas, so the digits are exactly as stored on disk)
with open('/home/submission/submission.csv', 'r') as f:
    lines = f.readlines()

# Find N=151 entries
# The id is the first column, so a line-prefix match selects the group.
n151_lines = [l for l in lines if l.startswith('151_')]
print(f"Found {len(n151_lines)} lines for N=151")
print("First 5 lines:")
for l in n151_lines[:5]:
    print(f"  {l.strip()}")

Found 151 lines for N=151
First 5 lines:
  151_0,s0.1504631577805953,s-2.396892766865921,s248.9868427979508
  151_1,s2.4617789867642976,s-3.4481416876556676,s248.38996071043908
  151_2,s-0.10076544576112685,s-1.9437379854085977,s68.38959410162349
  151_3,s2.4580209657898555,s-2.361663274867279,s248.68713989629498
  151_4,s0.1407460033729936,s-1.3173379929270548,s249.49303925448473


In [28]:
# Let's check if there's an issue with the number of decimal places
# Kaggle might require specific precision

# Check precision of values in submission
import re

def count_decimal_places(s):
    """Return the number of characters between the first and second '.' in ``s``.

    For an ordinary decimal string this is the count of digits after the
    decimal point. A string without a '.' yields 0.
    """
    pieces = s.split('.')
    return len(pieces[1]) if len(pieces) > 1 else 0

# Sample some values
# (first ten raw N=151 lines; each is expected to be "id,x,y,deg")
sample_lines = n151_lines[:10]
for line in sample_lines:
    parts = line.strip().split(',')
    if len(parts) == 4:
        id_, x, y, deg = parts
        # Remove 's' prefix
        # so only the numeric text is measured
        x_val = x[1:] if x.startswith('s') else x
        y_val = y[1:] if y.startswith('s') else y
        deg_val = deg[1:] if deg.startswith('s') else deg

        print(f"{id_}: x has {count_decimal_places(x_val)} decimals, y has {count_decimal_places(y_val)} decimals, deg has {count_decimal_places(deg_val)} decimals")

151_0: x has 16 decimals, y has 15 decimals, deg has 13 decimals
151_1: x has 16 decimals, y has 16 decimals, deg has 14 decimals
151_2: x has 17 decimals, y has 16 decimals, deg has 14 decimals
151_3: x has 16 decimals, y has 15 decimals, deg has 14 decimals
151_4: x has 16 decimals, y has 16 decimals, deg has 14 decimals
151_5: x has 17 decimals, y has 16 decimals, deg has 13 decimals
151_6: x has 16 decimals, y has 16 decimals, deg has 13 decimals
151_7: x has 15 decimals, y has 16 decimals, deg has 14 decimals
151_8: x has 16 decimals, y has 16 decimals, deg has 14 decimals
151_9: x has 17 decimals, y has 16 decimals, deg has 13 decimals


In [29]:
# Let's check if there's something wrong with the snapshot we're using
# Maybe it was generated with a buggy optimizer

# Let's try a different approach - find a snapshot that was successfully submitted
# by checking if any snapshots have different characteristics

import os

# Snapshot directory names are sorted as strings before slicing.
snapshot_dir = '/home/nonroot/snapshots/santa-2025/'
snapshots = sorted(os.listdir(snapshot_dir))

# Check the first few and last few snapshots for N=151 overlap
# NOTE: with fewer than six snapshots the two slices overlap and some are checked twice.
print("Checking snapshots for N=151 overlaps...")
for snap in snapshots[:3] + snapshots[-3:]:
    sub_path = f"{snapshot_dir}{snap}/submission/submission.csv"
    if os.path.exists(sub_path):
        df_snap = pd.read_csv(sub_path)
        df_n = df_snap[df_snap['id'].str.startswith('151_')]
        # Snapshots whose N=151 group is incomplete are silently skipped.
        if len(df_n) == 151:
            trees = [(parse_value(r['x']), parse_value(r['y']), parse_value(r['deg'])) for _, r in df_n.iterrows()]
            polys = [get_tree_polygon_kaggle(*t) for t in trees]
            overlaps = check_overlaps_kaggle(polys)
            print(f"{snap}: {len(overlaps)} overlaps")

Checking snapshots for N=151 overlaps...
20952569566: 0 overlaps


20970671503: 0 overlaps


20971964134: 0 overlaps
21337107511: 0 overlaps


21337353543: 0 overlaps


21337353626: 0 overlaps


In [30]:
# The issue might be that Kaggle's validation is done BEFORE the 's' prefix is stripped
# Let's check if the submission format itself is correct

# According to the getting-started kernel, the format should be:
# - id: NNN_T format (e.g., 001_0, 002_0, 002_1)
# - x, y, deg: prefixed with 's' to keep as strings

# Let's verify our submission format matches
df_check = pd.read_csv('/home/submission/submission.csv')
print("Submission format check:")
print(f"  Columns: {list(df_check.columns)}")
print(f"  Total rows: {len(df_check)}")
# One row per tree: group n contributes n rows, for n = 1..200.
print(f"  Expected rows: {sum(range(1, 201))} (1+2+...+200)")

# Check ID format
print(f"\nID format samples:")
print(df_check['id'].head(10).tolist())

# Check if all IDs are present
# Expected ids are the zero-padded group number, an underscore and the tree index.
# NOTE: the set comparison below does not detect duplicated ids.
expected_ids = [f'{n:03d}_{t}' for n in range(1, 201) for t in range(n)]
actual_ids = df_check['id'].tolist()
missing_ids = set(expected_ids) - set(actual_ids)
extra_ids = set(actual_ids) - set(expected_ids)
print(f"\nMissing IDs: {len(missing_ids)}")
print(f"Extra IDs: {len(extra_ids)}")
if missing_ids:
    print(f"  First 5 missing: {list(missing_ids)[:5]}")
if extra_ids:
    print(f"  First 5 extra: {list(extra_ids)[:5]}")

Submission format check:
  Columns: ['id', 'x', 'y', 'deg']
  Total rows: 20100
  Expected rows: 20100 (1+2+...+200)

ID format samples:
['001_0', '002_0', '002_1', '003_0', '003_1', '003_2', '004_0', '004_1', '004_2', '004_3']

Missing IDs: 0
Extra IDs: 0


In [31]:
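# Sensitivity check: buffer the polygons slightly inward and confirm they still don't overlap.
# NOTE: a negative buffer shrinks each polygon, so this test is more lenient than the
# unbuffered one, not stricter; a stricter test would use a positive (outward) buffer.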

def check_overlaps_strict_buffer(polygons, buffer_distance=-1e-10):
    """Buffer every polygon by ``buffer_distance`` and report overlapping pairs.

    A negative distance shrinks the polygons and a positive one grows them.
    A pair ``(i, j)``, ``i < j``, is reported when the buffered shapes
    intersect and do not merely touch.
    """
    shapes = [poly.buffer(buffer_distance) for poly in polygons]
    hits = []
    for i, first in enumerate(shapes):
        for j in range(i + 1, len(shapes)):
            second = shapes[j]
            if not first.intersects(second):
                continue
            if first.touches(second):
                continue
            hits.append((i, j))
    return hits

# Test with different buffer distances
# NOTE(review): all non-zero distances here are negative, i.e. the polygons are shrunk.
# kaggle_polys_151 is built in 1e15-scaled coordinates, so these distances are tiny
# relative to the geometry. TODO confirm the intended units.
for buf in [0, -1e-10, -1e-8, -1e-6]:
    overlaps = check_overlaps_strict_buffer(kaggle_polys_151, buf)
    print(f"Buffer {buf}: {len(overlaps)} overlaps")

Buffer 0: 0 overlaps
Buffer -1e-10: 0 overlaps
Buffer -1e-08: 0 overlaps
Buffer -1e-06: 0 overlaps


In [32]:
# Let's also check if there's any issue with the polygon validity
print("Checking polygon validity for N=151:")
invalid_count = 0
for i, poly in enumerate(kaggle_polys_151):
    if not poly.is_valid:
        # is_valid is necessarily False in this branch, so the message adds no detail.
        print(f"  Polygon {i} is invalid: {poly.is_valid}")
        invalid_count += 1
print(f"Total invalid polygons: {invalid_count}")

Checking polygon validity for N=151:
Total invalid polygons: 0


In [33]:
# Let's check if the issue is with how the snapshot was created
# Maybe it has trees that are EXACTLY touching (which should be OK)
# but Kaggle considers them overlapping due to floating point

# Count how many pairs are touching vs not touching
# Each unordered pair is classified once: touching first, otherwise intersecting.
touching_count = 0
not_touching_count = 0
for i in range(len(kaggle_polys_151)):
    for j in range(i+1, len(kaggle_polys_151)):
        if kaggle_polys_151[i].touches(kaggle_polys_151[j]):
            touching_count += 1
        elif kaggle_polys_151[i].intersects(kaggle_polys_151[j]):
            # Intersecting without touching is the overlap condition used by check_overlaps_kaggle.
            not_touching_count += 1

print(f"Touching pairs: {touching_count}")
print(f"Intersecting (not touching) pairs: {not_touching_count}")

Touching pairs: 0
Intersecting (not touching) pairs: 0


In [None]:
# Let's check if there's any issue with the CSV encoding or format
import hashlib

# Check file encoding and content
# Read the raw bytes so the encoding and line endings are inspected exactly as stored.
with open('/home/submission/submission.csv', 'rb') as f:
    content = f.read()

print(f"File size: {len(content)} bytes")
print(f"MD5 hash: {hashlib.md5(content).hexdigest()}")

# Check for any non-ASCII characters
# (iterating over bytes yields ints; anything above 127 is outside ASCII)
non_ascii = [c for c in content if c > 127]
print(f"Non-ASCII characters: {len(non_ascii)}")

# Check line endings
# The escapes must be single-backslash: b'\\r\\n' is the four literal characters
# backslash, r, backslash, n, which would make both counts come out as 0.
crlf_count = content.count(b'\r\n')
# Every CRLF also contains an LF, so subtract them to get the bare-LF count.
lf_count = content.count(b'\n') - crlf_count
print(f"CRLF line endings: {crlf_count}")
print(f"LF line endings: {lf_count}")

In [None]:
# Let's try creating a fresh submission from scratch using the getting-started approach
# and see if that passes validation

# First, let's create a simple test submission with just N=1 and N=2
# to verify the format is correct

from decimal import Decimal, getcontext
getcontext().prec = 25

# Minimal hand-written submission covering only N=1 and N=2, all trees rotated
# by 45 degrees: one tree at the origin, then two trees at x = +0.5 and x = -0.5.
test_data = [
    {'id': '001_0', 'x': 's0.0', 'y': 's0.0', 'deg': 's45.0'},
    {'id': '002_0', 'x': 's0.5', 'y': 's0.0', 'deg': 's45.0'},
    {'id': '002_1', 'x': 's-0.5', 'y': 's0.0', 'deg': 's45.0'},
]

test_df = pd.DataFrame(test_data)
print("Test submission:")
print(test_df)