# Loop 1 Analysis: Debugging Overlap Issue

The submission failed with "Overlapping trees in group 118" despite our local validation passing.

## Key Questions:
1. What's different about our overlap detection vs Kaggle's?
2. Is it a precision issue?
3. Can we use the sample submission as a baseline instead?

In [1]:
import math
import numpy as np
import pandas as pd
from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
from shapely.ops import unary_union
from shapely.strtree import STRtree
import warnings
warnings.filterwarnings('ignore')

# Configure the process-wide decimal context used by any Decimal arithmetic in this kernel.
getcontext().prec = 50  # 50 significant digits
# Decimal scaling constant.
# NOTE(review): no cell visible in this notebook reads `scale_factor` — confirm it is still needed.
scale_factor = Decimal('1e18')  # Higher scale factor

print("Libraries loaded")

Libraries loaded


In [2]:
# Load our failed submission (the one rejected with "Overlapping trees in group 118").
# As the printed head shows, x / y / deg are stored as 's'-prefixed strings,
# so they must be parsed before any geometry is built.
failed_df = pd.read_csv('/home/code/experiments/001_baseline/submission.csv')
print("Failed submission shape:", failed_df.shape)
print(failed_df.head())

Failed submission shape: (20100, 4)
      id          x           y          deg
0  001_0  s0.056435  s-0.305079  s314.999998
1  002_0  s0.340626  s-0.261989  s346.663915
2  002_1  s0.002093   s0.119736   s40.667789
3  003_0  s0.340626  s-0.261989  s346.663915
4  003_1  s0.002093   s0.119736   s40.667789


In [3]:
# Load sample submission (known to be valid); used below as the reference
# for both the overlap checks and the score calculation.
sample_df = pd.read_csv('/home/data/sample_submission.csv')
print("Sample submission shape:", sample_df.shape)
print(sample_df.head())

Sample submission shape: (20100, 4)
      id          x           y    deg
0  001_0       s0.0        s0.0  s90.0
1  002_0       s0.0        s0.0  s90.0
2  002_1  s0.202736  s-0.511271  s90.0
3  003_0       s0.0        s0.0  s90.0
4  003_1  s0.202736  s-0.511271  s90.0


In [4]:
# Define the tree shape (built from plain floats; `use_high_precision` is accepted but not used yet)
def create_tree_polygon(cx, cy, angle_deg, use_high_precision=True):
    """Build the tree outline as a polygon placed at (cx, cy).

    The 15-vertex outline is defined around the origin (tip at y=0.8, trunk
    bottom at y=-0.2), rotated about the origin and then shifted.

    Args:
        cx: x offset applied after the rotation.
        cy: y offset applied after the rotation.
        angle_deg: rotation angle handed to ``affinity.rotate``.
        use_high_precision: accepted for interface compatibility; this
            implementation never reads it and always works with plain floats.

    Returns:
        The rotated and translated ``Polygon``.
    """
    top_w, mid_w, base_w, trunk_w = 0.25, 0.4, 0.7, 0.15
    tier_1_y, tier_2_y, base_y = 0.5, 0.25, 0.0
    trunk_bottom_y = -0.2

    # Right-hand half of the outline, walking down from just below the tip.
    right_side = [
        (top_w / 2, tier_1_y),
        (top_w / 4, tier_1_y),
        (mid_w / 2, tier_2_y),
        (mid_w / 4, tier_2_y),
        (base_w / 2, base_y),
        (trunk_w / 2, base_y),
        (trunk_w / 2, trunk_bottom_y),
    ]
    # The left half is the mirror image, walked back up towards the tip.
    left_side = [(-px, py) for px, py in reversed(right_side)]
    outline = [(0.0, 0.8)] + right_side + left_side

    # Rotate about the origin first, then move into place.
    rotated = affinity.rotate(Polygon(outline), angle_deg, origin=(0, 0))
    return affinity.translate(rotated, xoff=cx, yoff=cy)

print("Tree polygon function defined")

Tree polygon function defined


In [5]:
# Check group 118 specifically from our failed submission
def load_group(df, n):
    """Load trees for a specific N from a submission dataframe.

    Args:
        df: submission DataFrame with 'id', 'x', 'y' and 'deg' columns.
            Rows of group N are those whose id starts with the zero-padded
            prefix ``f'{n:03d}_'``.
        n: group number.

    Returns:
        A list with one dict per tree: the parsed floats under 'x', 'y' and
        'deg', and the polygon from ``create_tree_polygon`` under 'poly'.
        Empty when the group has no rows.
    """
    def _to_float(value):
        # Values are normally written as 's<number>'. Strip the marker only
        # when it is really there: slicing off the first character
        # unconditionally would turn an unprefixed '-0.5' into 0.5.
        text = str(value).strip()
        if text.startswith('s'):
            text = text[1:]
        return float(text)

    group_data = df[df['id'].str.startswith(f'{n:03d}_')]
    trees = []
    for raw_x, raw_y, raw_deg in zip(group_data['x'], group_data['y'], group_data['deg']):
        x = _to_float(raw_x)
        y = _to_float(raw_y)
        deg = _to_float(raw_deg)
        poly = create_tree_polygon(x, y, deg)
        trees.append({'x': x, 'y': y, 'deg': deg, 'poly': poly})
    return trees

# Load group 118 from our submission — the group named in the rejection message.
# `group_118` is reused by the overlap-check cells that follow.
group_118 = load_group(failed_df, 118)
print(f"Group 118 has {len(group_118)} trees")

Group 118 has 118 trees


In [6]:
# Check for overlaps in group 118 using strict intersection check
def check_overlaps_strict(trees, min_area=1e-12):
    """Find tree pairs whose polygons overlap by more than ``min_area``.

    Pairs that intersect but only touch are ignored.

    Args:
        trees: sequence of dicts, each holding a geometry under 'poly'.
        min_area: intersection areas at or below this value are not
            reported. The default (1e-12) keeps the historical behaviour;
            pass 0 to report every overlap with positive area.

    Returns:
        List of ``(i, j, area)`` tuples with ``i < j``, in pair order.
    """
    polys = [tree['poly'] for tree in trees]
    overlaps = []
    for i, poly_i in enumerate(polys):
        for j in range(i + 1, len(polys)):
            poly_j = polys[j]
            # Skip disjoint pairs and pairs that merely touch.
            if not poly_i.intersects(poly_j) or poly_i.touches(poly_j):
                continue
            area = poly_i.intersection(poly_j).area
            if area > min_area:  # Non-trivial overlap
                overlaps.append((i, j, area))
    return overlaps

# Run the strict check on group 118. Overlaps with area <= 1e-12 are not
# reported by check_overlaps_strict, so very small overlaps will not show up here.
overlaps_118 = check_overlaps_strict(group_118)
print(f"Found {len(overlaps_118)} overlaps in group 118")
# Show at most the first 10 offending pairs.
for i, j, area in overlaps_118[:10]:
    print(f"  Trees {i} and {j}: overlap area = {area:.2e}")

Found 0 overlaps in group 118


In [7]:
# Let's also check using a more robust method - buffer-based
def check_overlaps_buffer(trees, buffer_dist=-1e-9):
    """Check overlaps after buffering every polygon by ``buffer_dist``.

    Args:
        trees: sequence of dicts, each holding a geometry under 'poly'.
        buffer_dist: distance passed to each polygon's ``buffer()``; the
            small negative default is meant to handle edge cases where
            polygons only share a boundary.

    Returns:
        List of ``(i, j, area)`` tuples with ``i < j`` for every pair of
        buffered polygons that intersects, does not merely touch, and has an
        intersection area greater than 0.
    """
    # Buffer each polygon exactly once. Buffering inside the pair loop would
    # repeat the same buffer computation for every pair a polygon appears in.
    buffered = [tree['poly'].buffer(buffer_dist) for tree in trees]

    overlaps = []
    for i, poly_i in enumerate(buffered):
        for j in range(i + 1, len(buffered)):
            poly_j = buffered[j]
            if poly_i.intersects(poly_j) and not poly_i.touches(poly_j):
                area = poly_i.intersection(poly_j).area
                if area > 0:
                    overlaps.append((i, j, area))
    return overlaps

# Re-check group 118 with the buffer-based variant (default buffer_dist=-1e-9).
# Unlike the strict check, this one reports any intersection area > 0.
overlaps_buffer = check_overlaps_buffer(group_118)
print(f"With buffer: Found {len(overlaps_buffer)} overlaps")

With buffer: Found 1 overlaps


In [8]:
# Check all groups (n = 1..200) of the failed submission with the strict check.
# Only overlaps with area > 1e-12 are reported by check_overlaps_strict.
print("Checking all groups for overlaps...")
all_overlaps = {}  # group number -> list of (i, j, area) tuples; only groups with hits are stored
for n in range(1, 201):
    group = load_group(failed_df, n)
    overlaps = check_overlaps_strict(group)
    if overlaps:
        all_overlaps[n] = overlaps
        
print(f"\nGroups with overlaps: {list(all_overlaps.keys())}")
for n, overlaps in all_overlaps.items():
    print(f"  Group {n}: {len(overlaps)} overlaps")

Checking all groups for overlaps...



Groups with overlaps: []


In [9]:
# The issue is clear - our validation was too lenient
# Let's check the sample submission to confirm it has no overlaps.
# Same scan as for the failed submission: groups 1..200, strict check
# (so only overlaps with area > 1e-12 would be listed).
print("Checking sample submission for overlaps...")
sample_overlaps = {}  # group number -> list of (i, j, area) tuples
for n in range(1, 201):
    group = load_group(sample_df, n)
    overlaps = check_overlaps_strict(group)
    if overlaps:
        sample_overlaps[n] = overlaps
        
print(f"Sample submission groups with overlaps: {list(sample_overlaps.keys())}")

Checking sample submission for overlaps...


Sample submission groups with overlaps: []


In [12]:
# Calculate sample submission score
def calculate_score(df, max_n=200):
    """Calculate total score for a submission.

    For every group n in 1..max_n that has trees, take the axis-aligned
    bounding box around all of the group's polygons, use its longer edge as
    the side length, and add ``side**2 / n`` to the total.

    Args:
        df: submission DataFrame accepted by ``load_group``.
        max_n: largest group number to score (default 200).

    Returns:
        The summed score; 0 when no group has any trees.
    """
    total_score = 0
    for n in range(1, max_n + 1):
        group = load_group(df, n)
        if not group:
            continue

        # The bounding box of the union of the polygons is the envelope of
        # the individual bounding boxes, so the union geometry itself never
        # has to be built.
        min_xs, min_ys, max_xs, max_ys = zip(*(t['poly'].bounds for t in group))
        width = max(max_xs) - min(min_xs)
        height = max(max_ys) - min(min_ys)
        side = max(width, height)

        total_score += side**2 / n

    return total_score

# Score the known-valid sample submission and compare it with the target.
# 68.947559 is the target score quoted in the printed output below.
sample_score = calculate_score(sample_df)
print(f"Sample submission score: {sample_score:.6f}")
print(f"Target score: 68.947559")
print(f"Gap: {sample_score - 68.947559:.6f}")

Sample submission score: 173.652299
Target score: 68.947559
Gap: 104.704740


In [13]:
# The buffer check found 1 overlap - let's identify it
def check_overlaps_detailed(trees):
    """Report every intersecting tree pair that is more than a pure touch.

    A pair is reported when its polygons intersect and either the
    intersection has positive area or the polygons do not merely touch.

    Args:
        trees: sequence of dicts, each holding a geometry under 'poly'.

    Returns:
        List of dicts with keys 'i', 'j' (pair indices, i < j), 'type'
        (geom_type of the intersection), 'area', 'touches', 'tree_i' and
        'tree_j' (the original tree dicts).
    """
    findings = []
    count = len(trees)
    for i in range(count):
        first = trees[i]
        for j in range(i + 1, count):
            second = trees[j]
            shape_a, shape_b = first['poly'], second['poly']

            if not shape_a.intersects(shape_b):
                continue

            shared = shape_a.intersection(shape_b)
            only_touching = shape_a.touches(shape_b)

            # Describe the shared geometry; fall back to 0 if it has no area.
            shared_kind = shared.geom_type
            shared_area = getattr(shared, 'area', 0)

            # The second intersects() call repeats the guard above.
            if shared_area > 0 or (not only_touching and shape_a.intersects(shape_b)):
                findings.append({
                    'i': i,
                    'j': j,
                    'type': shared_kind,
                    'area': shared_area,
                    'touches': only_touching,
                    'tree_i': first,
                    'tree_j': second,
                })
    return findings

# Check group 118 in detail.
# The group is reloaded from failed_df so this cell does not depend on the
# `group_118` left behind by the earlier cell.
group_118 = load_group(failed_df, 118)
detailed_overlaps = check_overlaps_detailed(group_118)
print(f"Found {len(detailed_overlaps)} potential overlaps")
# Print at most the first 20 findings, with intersection type and area.
for ov in detailed_overlaps[:20]:
    print(f"  Trees {ov['i']} and {ov['j']}: type={ov['type']}, area={ov['area']:.2e}, touches={ov['touches']}")

Found 1 potential overlaps
  Trees 101 and 117: type=Polygon, area=5.71e-15, touches=False


In [14]:
# Let's check all groups for tiny overlaps (area > 0).
# Same scan over groups 1..200 as before, but using check_overlaps_detailed,
# which applies no minimum-area threshold.
print("Checking all groups for ANY overlaps (including tiny ones)...")
tiny_overlaps = {}  # group number -> list of finding dicts
for n in range(1, 201):
    group = load_group(failed_df, n)
    overlaps = check_overlaps_detailed(group)
    if overlaps:
        tiny_overlaps[n] = overlaps
        
print(f"\nGroups with any overlaps: {list(tiny_overlaps.keys())}")
for n, overlaps in tiny_overlaps.items():
    print(f"  Group {n}: {len(overlaps)} overlaps")
    # Show at most three example pairs per group.
    for ov in overlaps[:3]:
        print(f"    Trees {ov['i']} and {ov['j']}: area={ov['area']:.2e}")

Checking all groups for ANY overlaps (including tiny ones)...



Groups with any overlaps: [118, 119, 120, 121, 122, 128, 129, 135, 136, 137, 138, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162]
  Group 118: 1 overlaps
    Trees 101 and 117: area=5.71e-15
  Group 119: 1 overlaps
    Trees 101 and 117: area=6.38e-14
  Group 120: 1 overlaps
    Trees 101 and 117: area=6.38e-14
  Group 121: 1 overlaps
    Trees 101 and 117: area=5.81e-14
  Group 122: 1 overlaps
    Trees 101 and 117: area=5.81e-14
  Group 128: 1 overlaps
    Trees 101 and 117: area=1.45e-13
  Group 129: 1 overlaps
    Trees 101 and 117: area=1.45e-13
  Group 135: 1 overlaps
    Trees 101 and 117: area=2.40e-13
  Group 136: 1 overlaps
    Trees 101 and 117: area=2.40e-13
  Group 137: 1 overlaps
    Trees 101 and 117: area=2.40e-13
  Group 138: 1 overlaps
    Trees 101 and 117: area=2.40e-13
  Group 147: 1 overlaps
    Trees 101 and 117: area=1.17e-13
  Group 148: 1 overlaps
    Trees 101 and 117: area=1.17e-13
  Group 149: 1 overlaps
    Trees 101 and 11

In [15]:
# Strategy: Use sample submission as baseline and apply optimization
# First, let's calculate the sample submission score properly
# NOTE(review): this repeats the calculate_score(sample_df) call from the earlier
# scoring cell and rebinds `sample_score` to the same value.

sample_score = calculate_score(sample_df)
print(f"Sample submission score: {sample_score:.6f}")
print(f"Target score: 68.947559")
print(f"Gap to target: {sample_score - 68.947559:.6f}")
print(f"\nThe sample submission is valid but has a high score.")
print("We need to optimize it while maintaining validity.")

Sample submission score: 173.652299
Target score: 68.947559
Gap to target: 104.704740

The sample submission is valid but has a high score.
We need to optimize it while maintaining validity.
