# Validate Baseline and Check for Overlaps

The previous submission was rejected with 'Overlapping trees in group 040'.
We need to implement stricter validation matching Kaggle's precision.

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
from shapely.ops import unary_union
from decimal import Decimal, getcontext
import os

# Set high precision for decimal arithmetic
# NOTE(review): none of the cells visible in this notebook performs Decimal
# arithmetic -- parse_submission converts coordinates with float() -- so this
# setting appears to have no effect on the checks below; confirm before relying on it.
getcontext().prec = 50

# Tree polygon vertices
# TX[k] / TY[k] are the x / y coordinates of vertex k of the 15-point tree outline
# in the tree's local frame; create_tree_polygon zips the two lists into a Polygon.
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

def create_tree_polygon(x, y, angle):
    """Build the tree outline placed at (x, y) and rotated by `angle` degrees.

    The base outline is the ring through the (TX, TY) vertex pairs. It is
    rotated about its local origin (0, 0) first and translated second, so
    (x, y) is where the outline's local origin ends up.

    Returns the resulting shapely Polygon.
    """
    outline = Polygon(list(zip(TX, TY)))
    rotated = affinity.rotate(outline, angle, origin=(0, 0))
    return affinity.translate(rotated, x, y)

def parse_submission(df):
    """Group submission rows by puzzle size.

    For every row, the 'id' value is split on '_' and its leading part is read
    as the integer group n. The 'x', 'y' and 'deg' cells are stringified, a
    leading 's' is dropped when present, and the remainder is converted with
    float() -- digits beyond double precision in the file are therefore not
    retained.

    Returns a dict mapping n -> list of (x, y, angle) tuples in row order.
    """
    def _as_float(cell):
        # Values may carry an 's' prefix (e.g. 's0.25'); strip it before converting.
        text = str(cell)
        if text.startswith('s'):
            text = text[1:]
        return float(text)

    solutions = {}
    for _, row in df.iterrows():
        group = int(row['id'].split('_')[0])
        placement = (_as_float(row['x']), _as_float(row['y']), _as_float(row['deg']))
        solutions.setdefault(group, []).append(placement)
    return solutions

# Marker so the cell output confirms that the definitions above were executed.
print("Functions defined!")

Functions defined!


In [2]:
# Load the current submission
# The path is absolute, so this cell only works where that file exists.
df = pd.read_csv('/home/submission/submission.csv')
print(f"Loaded {len(df)} rows")
print(df.head())

# `solutions` maps n -> list of (x, y, angle) and is the input to every check below.
solutions = parse_submission(df)
# Reports only the largest key; it does not verify that every N from 1 upward is present.
print(f"\nParsed solutions for N=1 to {max(solutions.keys())}")

Loaded 20100 rows
      id                          x                         y  \
0  001_0  s-48.19608619421424577922  s58.77098461521422478882   
1  002_0      s0.202513410337269301    s-0.028957664041420434   
2  002_1     s-0.105680728905459279    s-0.551876178651849569   
3  003_0       s1.12365581614030097      s0.78110181599256301   
4  003_1       s1.23405569584216002      s1.27599950066375900   

                        deg  
0  s45.00000000000000000000  
1   s203.629377730656727863  
2    s23.629377730656813128  
3    s111.12513229289299943  
4     s66.37062226934300213  



Parsed solutions for N=1 to 200


In [3]:
# Validate N=40 specifically (the one that failed)
n = 40
trees = solutions[n]
print(f"N={n}: {len(trees)} trees")
# Show the first five placements with 15 decimals; these are the float()-parsed
# values, not the raw strings from the CSV.
for i, (x, y, angle) in enumerate(trees[:5]):
    print(f"  Tree {i}: x={x:.15f}, y={y:.15f}, angle={angle:.15f}")

N=40: 40 trees
  Tree 0: x=-1.689641332825619, y=0.052105250362737, angle=252.060472972647403
  Tree 1: x=0.953897110420389, y=-0.252668814531146, angle=252.151405771973003
  Tree 2: x=1.689657077220469, y=-0.789206884608834, angle=72.151405771973472
  Tree 3: x=0.011430135146958, y=-0.795948282226946, angle=252.219219656190376
  Tree 4: x=-0.821323180824477, y=0.656433257446972, angle=252.217176846484733


In [4]:
# Check for overlaps with standard Shapely
def validate_standard(trees):
    """Find tree pairs whose polygons share a positive area.

    `trees` is a sequence of (x, y, angle) placements. Every unordered pair
    (i < j) is tested; a pair is reported only if the polygons intersect and
    the intersection's area is strictly greater than zero.

    Returns a list of (i, j, intersection_area) tuples.
    """
    polys = [create_tree_polygon(*t) for t in trees]
    overlaps = []
    for i, first in enumerate(polys):
        for j in range(i + 1, len(polys)):
            second = polys[j]
            # Cheap predicate first; the intersection geometry is only built on a hit.
            if not first.intersects(second):
                continue
            shared_area = first.intersection(second).area
            if shared_area > 0:
                overlaps.append((i, j, shared_area))
    return overlaps

# Run the zero-threshold check on the rejected group and list up to five offending pairs.
overlaps_40 = validate_standard(solutions[40])
print(f"N=40 overlaps (standard): {len(overlaps_40)}")
for i, j, area in overlaps_40[:5]:
    print(f"  Trees {i} and {j}: area={area:.2e}")

N=40 overlaps (standard): 0


In [5]:
# Check for overlaps with an explicit area threshold. Note: any threshold > 0 is more permissive than the `area > 0` test in validate_standard, not stricter.
def validate_strict(trees, threshold=1e-15):
    """Find tree pairs whose intersection area exceeds `threshold`.

    `trees` is a sequence of (x, y, angle) placements. Every unordered pair
    (i < j) that intersects is measured, and the pair is reported when the
    intersection area is strictly greater than `threshold`.

    Note that with a positive threshold this accepts intersections that
    validate_standard (which uses `area > 0`) would report, i.e. it is the
    more permissive of the two checks.

    Returns a list of (i, j, intersection_area) tuples.
    """
    polys = [create_tree_polygon(*t) for t in trees]
    count = len(polys)
    overlaps = []
    for i in range(count):
        for j in range(i + 1, count):
            left, right = polys[i], polys[j]
            if left.intersects(right):
                common = left.intersection(right)
                if common.area > threshold:
                    overlaps.append((i, j, common.area))
    return overlaps

# The explicit threshold=1e-20 overrides the function's 1e-15 default.
overlaps_40_strict = validate_strict(solutions[40], threshold=1e-20)
print(f"N=40 overlaps (strict 1e-20): {len(overlaps_40_strict)}")
for i, j, area in overlaps_40_strict[:5]:
    print(f"  Trees {i} and {j}: area={area:.2e}")

N=40 overlaps (strict 1e-20): 0


In [6]:
# Check ALL N values for overlaps
# Runs the thresholded pairwise check on every group N=1..200 and records, per
# group, how many overlapping pairs were found. Groups without overlaps are
# neither stored in `overlap_counts` nor printed.
print("Checking all N values for overlaps...")
overlap_counts = {}
for n in range(1, 201):
    overlaps = validate_strict(solutions[n], threshold=1e-20)
    if overlaps:
        overlap_counts[n] = len(overlaps)
        # The former inner test (`n <= 50 or len(overlaps) > 0`) was always true
        # inside this branch, so every group with overlaps is reported.
        print(f"N={n}: {len(overlaps)} overlaps")

print(f"\nTotal N values with overlaps: {len(overlap_counts)}")

Checking all N values for overlaps...



Total N values with overlaps: 0


In [7]:
# Calculate score for the submission
def calculate_score(solutions):
    """Score a parsed submission over the groups N=1 to 200.

    For each n, the trees of that group are unioned and the larger of the
    union's bounding-box width and height is taken as the side length; the
    group contributes side**2 / n. `solutions` must contain every key in
    1..200 (a missing key raises KeyError).

    Returns (total, per_n_scores) where per_n_scores maps n -> contribution.
    """
    per_n_scores = {}
    total = 0
    for n in range(1, 201):
        polys = [create_tree_polygon(*t) for t in solutions[n]]
        # bounds is (minx, miny, maxx, maxy) of the combined geometry.
        bounds = unary_union(polys).bounds
        width = bounds[2] - bounds[0]
        height = bounds[3] - bounds[1]
        side = max(width, height)
        per_n_scores[n] = (side ** 2) / n
        total += per_n_scores[n]
    return total, per_n_scores

# Score the loaded submission; `total_score` is reused later when the metrics are saved.
total_score, per_n_scores = calculate_score(solutions)
print(f"Total score: {total_score:.6f}")

Total score: 70.615745


In [8]:
# The issue might be with how Kaggle parses coordinates
# Let's check the raw coordinate precision
# Reads the CSV as plain text so the digits are shown exactly as written,
# bypassing the float() conversion done in parse_submission.
print("Checking coordinate precision in submission file...")
with open('/home/submission/submission.csv', 'r') as f:
    lines = f.readlines()

# Check N=40 rows
# Rows of group 40 are recognised by their id prefix '040_'.
n40_lines = [l for l in lines if l.startswith('040_')]
print(f"\nN=40 rows ({len(n40_lines)} trees):")
for line in n40_lines[:5]:
    print(f"  {line.strip()}")

Checking coordinate precision in submission file...

N=40 rows (40 trees):
  040_0,s-1.689641332825619,s0.052105250362737,s252.0604729726474
  040_1,s0.953897110420389,s-0.252668814531146,s252.151405771973
  040_2,s1.689657077220469,s-0.789206884608834,s72.15140577197347
  040_3,s0.011430135146958,s-0.795948282226946,s252.21921965619038
  040_4,s-0.821323180824477,s0.656433257446972,s252.21717684648473


In [9]:
# Let's try a different approach - check if touching polygons are being detected as overlapping
# Kaggle might use a different definition of "overlap" vs "touch"

def validate_with_buffer(trees, buffer_size=-1e-10):
    """Find tree pairs that still intersect after buffering each polygon.

    Each tree polygon is buffered by `buffer_size` (negative by default, i.e.
    shrunk slightly) so that polygons which merely touch are not reported.
    Every unordered pair (i < j) of buffered polygons is then tested with
    `intersects`.

    Returns a list of (i, j) index pairs.
    """
    shrunk = []
    for t in trees:
        shrunk.append(create_tree_polygon(*t).buffer(buffer_size))
    return [
        (i, j)
        for i in range(len(shrunk))
        for j in range(i + 1, len(shrunk))
        if shrunk[i].intersects(shrunk[j])
    ]

# Re-check the rejected group with each polygon buffered by -1e-10.
overlaps_40_buffered = validate_with_buffer(solutions[40], buffer_size=-1e-10)
print(f"N=40 overlaps (with -1e-10 buffer): {len(overlaps_40_buffered)}")

N=40 overlaps (with -1e-10 buffer): 0


In [10]:
# Check if there are any snapshots that have been successfully submitted
# Look for snapshots with different content
# NOTE(review): this cell only groups snapshots by file content; nothing here
# establishes which of them were accepted by Kaggle.
import hashlib

snapshot_base = '/home/nonroot/snapshots/santa-2025/'
snapshot_dirs = sorted(os.listdir(snapshot_base))

# Group by md5 hash
# hash_groups maps md5 hex digest -> list of snapshot directory names whose
# submission.csv has exactly that content. Snapshots without the file are skipped.
hash_groups = {}
for snap_dir in snapshot_dirs:
    sub_path = os.path.join(snapshot_base, snap_dir, 'submission', 'submission.csv')
    if os.path.exists(sub_path):
        with open(sub_path, 'rb') as f:
            h = hashlib.md5(f.read()).hexdigest()
        if h not in hash_groups:
            hash_groups[h] = []
        hash_groups[h].append(snap_dir)

print(f"Found {len(hash_groups)} unique submissions across {len(snapshot_dirs)} snapshots")
# Show the first five groups in insertion order (i.e. sorted directory order).
for h, dirs in list(hash_groups.items())[:5]:
    print(f"  Hash {h[:8]}...: {len(dirs)} snapshots - e.g., {dirs[0]}")

Found 57 unique submissions across 114 snapshots
  Hash f3fc6728...: 1 snapshots - e.g., 20952569566
  Hash 853f5832...: 1 snapshots - e.g., 20970671503
  Hash 21a23889...: 3 snapshots - e.g., 20971964134
  Hash 84487ce7...: 1 snapshots - e.g., 20984924920
  Hash cde01d4f...: 1 snapshots - e.g., 20992150197


In [11]:
# Let's check a few different snapshots to find one with different content
# and see if any have better overlap properties

# Despite the name, `test_snapshots` holds the first five md5 hashes; the
# snapshot directory is looked up from hash_groups inside the loop.
test_snapshots = list(hash_groups.keys())[:5]
for h in test_snapshots:
    snap_dir = hash_groups[h][0]
    sub_path = os.path.join(snapshot_base, snap_dir, 'submission', 'submission.csv')
    df_test = pd.read_csv(sub_path)
    sol_test = parse_submission(df_test)
    
    # Check N=40 overlaps
    # NOTE: this rebinds the global `overlaps` that the all-N overlap check also assigns.
    overlaps = validate_strict(sol_test[40], threshold=1e-20)
    
    # Calculate score
    # calculate_score indexes every N in 1..200, so the snapshot must contain all of them.
    score, _ = calculate_score(sol_test)
    
    print(f"Snapshot {snap_dir}: score={score:.6f}, N=40 overlaps={len(overlaps)}")

Snapshot 20952569566: score=163.194569, N=40 overlaps=0


Snapshot 20970671503: score=164.820045, N=40 overlaps=0


Snapshot 20971964134: score=87.804045, N=40 overlaps=0


Snapshot 20984924920: score=173.688052, N=40 overlaps=0


Snapshot 20992150197: score=217.576225, N=40 overlaps=0


In [12]:
# Save metrics
import json

# `total_score` comes from the scoring cell and `overlap_counts` from the
# all-N overlap check; both must have been run in this kernel first.
metrics = {
    'cv_score': total_score,
    'overlap_counts': overlap_counts,
    'notes': 'Validation analysis of baseline submission'
}

# The target directory must already exist; open() will not create it.
with open('/home/code/experiments/001_valid_baseline/metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)

print(f"Saved metrics. CV Score: {total_score:.6f}")

Saved metrics. CV Score: 70.615745


In [16]:
# The submission was rejected by Kaggle with "Overlapping trees in group 040"
# But local Shapely validation shows NO overlaps
# This is a critical mismatch - Kaggle must use different precision
# NOTE(review): the precision explanation is a hypothesis; nothing in this
# notebook confirms how Kaggle evaluates overlaps.

# Let's check if the issue is with coordinate parsing
# Kaggle might truncate or round coordinates differently

# Check the exact format of coordinates in the submission
print("Analyzing coordinate format...")
with open('/home/submission/submission.csv', 'r') as f:
    lines = f.readlines()

# Find N=40 rows
# Each row is split on ',' into its raw string fields.
n40_rows = [l.strip().split(',') for l in lines if l.startswith('040_')]
print(f"\nN=40 has {len(n40_rows)} trees")

# Check coordinate lengths
# The reported lengths are of the raw field text, so they include the leading
# 's' prefix and any minus sign. Unpacking assumes exactly four fields per row.
for row in n40_rows[:3]:
    id_val, x_val, y_val, deg_val = row
    print(f"  {id_val}: x has {len(x_val)} chars, y has {len(y_val)} chars, deg has {len(deg_val)} chars")
    print(f"    x={x_val}")
    print(f"    y={y_val}")
    print(f"    deg={deg_val}")

Analyzing coordinate format...

N=40 has 40 trees
  040_0: x has 19 chars, y has 18 chars, deg has 18 chars
    x=s-1.689641332825619
    y=s0.052105250362737
    deg=s252.0604729726474
  040_1: x has 18 chars, y has 19 chars, deg has 17 chars
    x=s0.953897110420389
    y=s-0.252668814531146
    deg=s252.151405771973
  040_2: x has 18 chars, y has 19 chars, deg has 18 chars
    x=s1.689657077220469
    y=s-0.789206884608834
    deg=s72.15140577197347


In [17]:
# Let's try to understand Kaggle's validation by checking if there are any 
# "touching" polygons that might be interpreted as overlapping

def find_touching_pairs(trees):
    """Find tree pairs that intersect without sharing any area.

    `trees` is a sequence of (x, y, angle) placements. For every unordered
    pair (i < j) whose polygons intersect, the intersection geometry is built;
    the pair is reported when that geometry has an area of exactly zero.

    Returns a list of (i, j, geom_type) tuples, where geom_type is the
    geometry type name of the intersection.
    """
    polys = [create_tree_polygon(*t) for t in trees]
    count = len(polys)
    touching = []
    for i in range(count):
        for j in range(i + 1, count):
            a, b = polys[i], polys[j]
            if not a.intersects(b):
                continue
            contact = a.intersection(b)
            # Zero area means contact without an areal overlap.
            if contact.area == 0:
                touching.append((i, j, contact.geom_type))
    return touching

# Look for zero-area contacts in the rejected group and list up to ten of them.
touching_40 = find_touching_pairs(solutions[40])
print(f"N=40 touching pairs: {len(touching_40)}")
for i, j, geom_type in touching_40[:10]:
    print(f"  Trees {i} and {j}: {geom_type}")

N=40 touching pairs: 0


In [18]:
# Check all N values for touching pairs
print("Checking all N values for touching pairs...")
# touching_counts maps n -> number of touching pairs; only groups with at
# least one touching pair get an entry.
touching_counts = {}
for n in range(1, 201):
    touching = find_touching_pairs(solutions[n])
    if touching:
        touching_counts[n] = len(touching)

print(f"\nN values with touching pairs: {len(touching_counts)}")
print(f"Total touching pairs across all N: {sum(touching_counts.values())}")

# Show distribution
if touching_counts:
    print("\nTop 10 N values by touching pairs:")
    # Sort groups by pair count, largest first.
    sorted_touching = sorted(touching_counts.items(), key=lambda x: x[1], reverse=True)
    for n, count in sorted_touching[:10]:
        print(f"  N={n}: {count} touching pairs")

Checking all N values for touching pairs...



N values with touching pairs: 3
Total touching pairs across all N: 3

Top 10 N values by touching pairs:
  N=4: 1 touching pairs
  N=18: 1 touching pairs
  N=31: 1 touching pairs


In [None]:
# The mystery deepens - N=40 has no overlaps and no touching pairs locally
# But Kaggle says it has overlapping trees

# Let's check if there's something special about N=40's coordinates
# Maybe Kaggle uses a different coordinate system or precision

# Check the bounding box and tree positions for N=40
n = 40
trees = solutions[n]
polys = [create_tree_polygon(*t) for t in trees]

# Get overall bounds
# bounds is indexed as (minx, miny, maxx, maxy) below.
union = unary_union(polys)
bounds = union.bounds
print(f"N=40 bounding box: {bounds}")
print(f"  Width: {bounds[2] - bounds[0]:.10f}")
print(f"  Height: {bounds[3] - bounds[1]:.10f}")
print(f"  Side: {max(bounds[2] - bounds[0], bounds[3] - bounds[1]):.10f}")

# Check minimum distances between trees
# Despite its name, `min_distances` collects every pair closer than 1e-6,
# not a single minimum.
min_distances = []
for i in range(len(polys)):
    for j in range(i+1, len(polys)):
        dist = polys[i].distance(polys[j])
        if dist < 1e-6:  # Very close
            min_distances.append((i, j, dist))

print(f"\nVery close tree pairs (dist < 1e-6): {len(min_distances)}")
for i, j, dist in min_distances[:10]:
    print(f"  Trees {i} and {j}: distance = {dist:.2e}")