# Loop 1 Analysis - Fix Overlapping Trees Issue

The submission failed with 'Overlapping trees in group 004'. We need to:
1. Validate ALL configurations for overlaps (not just a sample)
2. Find a valid submission from snapshots or sample_submission
3. Understand why our validation missed this

In [1]:
import numpy as np
import pandas as pd
from shapely.geometry import Polygon
from shapely import STRtree
import math

# Tree outline in its own local frame, one (x, y) pair per vertex (15 in total).
# TX / TY are the same vertices split into parallel coordinate lists.
_TREE_OUTLINE = [
    (0, 0.8),          # highest vertex
    (0.125, 0.5),      # positive-x side, going down
    (0.0625, 0.5),
    (0.2, 0.25),
    (0.1, 0.25),
    (0.35, 0),
    (0.075, 0),
    (0.075, -0.2),     # lowest edge
    (-0.075, -0.2),
    (-0.075, 0),       # negative-x side, going back up
    (-0.35, 0),
    (-0.1, 0.25),
    (-0.2, 0.25),
    (-0.0625, 0.5),
    (-0.125, 0.5),
]
TX = [vertex[0] for vertex in _TREE_OUTLINE]
TY = [vertex[1] for vertex in _TREE_OUTLINE]

def get_tree_polygon(x, y, deg):
    """Return the tree outline rotated by ``deg`` degrees and moved to (x, y).

    Every template vertex ``(TX[i], TY[i])`` is first rotated about the
    template origin with the standard 2-D rotation matrix (positive ``deg``
    turns counter-clockwise) and then translated by ``(x, y)``.

    Args:
        x: Horizontal offset applied after rotation.
        y: Vertical offset applied after rotation.
        deg: Rotation angle in degrees.

    Returns:
        A ``Polygon`` built from the transformed vertices, in template order.
    """
    theta = math.radians(deg)
    c, s = math.cos(theta), math.sin(theta)
    return Polygon([
        (px * c - py * s + x, px * s + py * c + y)
        for px, py in zip(TX, TY)
    ])

def parse_value(val):
    """Convert a submission field to ``float``, dropping a leading 's' if present.

    Strings that start with ``'s'`` have that first character removed before
    conversion; anything else is handed to ``float`` unchanged.
    """
    has_prefix = isinstance(val, str) and val.startswith('s')
    return float(val[1:] if has_prefix else val)

def check_overlaps_detailed(polygons, min_area=1e-10):
    """Find every pair of polygons whose interiors overlap.

    A pair (i, j) with i < j is a candidate when ``polygons[i].intersects``
    is true and ``polygons[i].touches`` is false for ``polygons[j]``. The
    candidate is reported only if its intersection area exceeds ``min_area``.

    Args:
        polygons: Sequence of objects providing ``intersects``, ``touches``
            and ``intersection(...).area`` (e.g. shapely polygons).
        min_area: Area tolerance. Candidates whose intersection area is not
            strictly greater than this are ignored. The default ``1e-10``
            keeps the historical behaviour. Pass ``None`` to drop the area
            filter entirely, so every intersects-and-not-touches pair is
            reported. NOTE(review): the default tolerance can hide tiny
            overlaps; the recorded run found none in a submission that was
            rejected for overlapping trees, so strict mode is worth trying
            when validating before upload.

    Returns:
        ``(has_overlap, overlaps)`` where ``overlaps`` is a list of
        ``(i, j, intersection_area)`` tuples and ``has_overlap`` is ``True``
        when that list is non-empty. Fewer than two polygons always yields
        ``(False, [])``.
    """
    overlaps = []
    count = len(polygons)
    for i in range(count):
        first = polygons[i]
        for j in range(i + 1, count):
            second = polygons[j]
            # Boundary-only contact is allowed; only interior overlap counts.
            if not first.intersects(second) or first.touches(second):
                continue
            area = first.intersection(second).area
            if min_area is None or area > min_area:
                overlaps.append((i, j, area))

    return len(overlaps) > 0, overlaps

# Marker that this cell ran to completion, i.e. the helpers above are now defined.
print("Functions defined")

Functions defined


In [2]:
# Load the rejected submission, decode its columns, and inspect group 004.
# Numeric columns go through parse_value; the group size `n` is the part of
# the id that precedes the first underscore.
df_failed = pd.read_csv('/home/code/experiments/001_baseline/snapshot_submission.csv').assign(
    x_val=lambda frame: frame['x'].apply(parse_value),
    y_val=lambda frame: frame['y'].apply(parse_value),
    deg_val=lambda frame: frame['deg'].apply(parse_value),
    n=lambda frame: frame['id'].apply(lambda x: int(x.split('_')[0])),
)

# Rows belonging to the group the submission was rejected for
config_4 = df_failed.loc[df_failed['n'] == 4]
print("Group 004 configuration:")
print(config_4[['id', 'x_val', 'y_val', 'deg_val']])

# Rebuild the four tree polygons and run the pairwise overlap check
polygons_4 = [
    get_tree_polygon(px, py, pdeg)
    for px, py, pdeg in zip(config_4['x_val'], config_4['y_val'], config_4['deg_val'])
]
has_overlap, overlap_details = check_overlaps_detailed(polygons_4)
print(f"\nHas overlaps: {has_overlap}")
print(f"Overlap details: {overlap_details}")

Group 004 configuration:
      id     x_val    y_val     deg_val
6  004_0 -0.324748  0.13211  156.370622
7  004_1  0.315354  0.13211  156.370622
8  004_2  0.324748 -0.73211  336.370622
9  004_3 -0.315354 -0.73211  336.370622

Has overlaps: False
Overlap details: []


In [3]:
# Run the overlap check on every group n = 1..200 of the failed submission.
# A group whose row count differs from n is reported and left unchecked.
print("Checking all 200 configurations for overlaps...")
overlapping_configs = []

for n in range(1, 201):
    config = df_failed.loc[df_failed['n'] == n]
    if len(config) == n:
        polygons = [
            get_tree_polygon(px, py, pdeg)
            for px, py, pdeg in zip(config['x_val'], config['y_val'], config['deg_val'])
        ]
        has_overlap, overlap_details = check_overlaps_detailed(polygons)
        if has_overlap:
            overlapping_configs.append((n, overlap_details))
            print(f"n={n}: OVERLAP! {overlap_details}")
    else:
        print(f"ERROR: n={n} has {len(config)} trees")

print(f"\nTotal overlapping configurations: {len(overlapping_configs)}")
print(f"Overlapping groups: {[x[0] for x in overlapping_configs]}")

Checking all 200 configurations for overlaps...



Total overlapping configurations: 0
Overlapping groups: []


In [4]:
# Repeat the overlap check on sample_submission.csv, which is expected to be valid.
df_sample = pd.read_csv('/home/data/sample_submission.csv').assign(
    x_val=lambda frame: frame['x'].apply(parse_value),
    y_val=lambda frame: frame['y'].apply(parse_value),
    deg_val=lambda frame: frame['deg'].apply(parse_value),
    n=lambda frame: frame['id'].apply(lambda x: int(x.split('_')[0])),
)

print("Checking sample_submission.csv for overlaps...")
sample_overlaps = []

for n in range(1, 201):
    config = df_sample.loc[df_sample['n'] == n]
    if len(config) == n:
        polygons = [
            get_tree_polygon(px, py, pdeg)
            for px, py, pdeg in zip(config['x_val'], config['y_val'], config['deg_val'])
        ]
        has_overlap, overlap_details = check_overlaps_detailed(polygons)
        if has_overlap:
            sample_overlaps.append((n, overlap_details))
            # Keep the log short: details are printed for small groups only
            if n <= 20:
                print(f"n={n}: OVERLAP! {overlap_details}")
    else:
        print(f"ERROR: n={n} has {len(config)} trees")

print(f"\nTotal overlapping configurations in sample: {len(sample_overlaps)}")

Checking sample_submission.csv for overlaps...



Total overlapping configurations in sample: 0


In [5]:
# Calculate score for sample_submission
def get_bounding_box_side(polygons):
    """Return the longer side of the axis-aligned box enclosing all polygons.

    Each element must expose ``bounds`` as ``(min_x, min_y, max_x, max_y)``.
    An empty (falsy) input yields ``0``.
    """
    if not polygons:
        return 0

    boxes = [poly.bounds for poly in polygons]
    # Seed each fold with +/-inf so comparisons run in the same order as a
    # running min/max that starts from those sentinels.
    lowest_x = min([float('inf')] + [box[0] for box in boxes])
    lowest_y = min([float('inf')] + [box[1] for box in boxes])
    highest_x = max([float('-inf')] + [box[2] for box in boxes])
    highest_y = max([float('-inf')] + [box[3] for box in boxes])

    return max(highest_x - lowest_x, highest_y - lowest_y)

# Score the sample submission: each group contributes side^2 / n, where side
# is the longer edge of the bounding box around that group's trees.
scores_sample = []
for n in range(1, 201):
    config = df_sample.loc[df_sample['n'] == n]
    polygons = [
        get_tree_polygon(px, py, pdeg)
        for px, py, pdeg in zip(config['x_val'], config['y_val'], config['deg_val'])
    ]
    side = get_bounding_box_side(polygons)
    scores_sample.append(dict(n=n, side=side, score_contrib=(side ** 2) / n))

scores_sample_df = pd.DataFrame(scores_sample)
sample_total = scores_sample_df['score_contrib'].sum()
print(f"Sample submission score: {sample_total:.6f}")
print(f"Target: 68.894234")
print(f"Gap: {sample_total - 68.894234:.6f}")

Sample submission score: 173.652299
Target: 68.894234
Gap: 104.758065


In [6]:
# Check preoptimized submissions from snapshots.
# For each candidate file that exists: decode the columns, then make ONE pass
# over the groups n = 1..200 that both looks for overlaps and accumulates the
# score (side^2 / n), so each group's polygons are built only once.
import os

preopt_dir = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized'
print(f"Checking preoptimized submissions in {preopt_dir}")

for fname in ['submission.csv', 'ensemble.csv', 'best_ensemble.csv', 'santa-2025.csv']:
    fpath = os.path.join(preopt_dir, fname)
    if not os.path.exists(fpath):
        continue

    df = pd.read_csv(fpath)
    df['x_val'] = df['x'].apply(parse_value)
    df['y_val'] = df['y'].apply(parse_value)
    df['deg_val'] = df['deg'].apply(parse_value)
    df['n'] = df['id'].apply(lambda x: int(x.split('_')[0]))

    has_any_overlap = False
    scores = []
    skipped_groups = []  # groups whose row count does not match n
    for n in range(1, 201):
        config = df[df['n'] == n]
        if len(config) != n:
            skipped_groups.append(n)
            continue
        polygons = [get_tree_polygon(row['x_val'], row['y_val'], row['deg_val'])
                    for _, row in config.iterrows()]

        # Once one overlapping group is known the flag cannot change, so the
        # pairwise check is skipped for the remaining groups.
        if not has_any_overlap:
            has_any_overlap, _ = check_overlaps_detailed(polygons)

        side = get_bounding_box_side(polygons)
        scores.append((side ** 2) / n)

    total = sum(scores)
    print(f"{fname}: score={total:.6f}, has_overlaps={has_any_overlap}")
    if skipped_groups:
        # Skipped groups add nothing to `total` and are never overlap-checked,
        # so both reported numbers are incomplete whenever this fires.
        print(f"  WARNING: {len(skipped_groups)} group(s) skipped due to wrong tree count "
              f"(first few: {skipped_groups[:10]}); score and overlap flag are incomplete")

Checking preoptimized submissions in /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized


submission.csv: score=70.676501, has_overlaps=False


ensemble.csv: score=70.676102, has_overlaps=False


best_ensemble.csv: score=70.676102, has_overlaps=False


santa-2025.csv: score=70.676102, has_overlaps=False


In [None]:
# The issue is likely precision. Let's check the official metric implementation
# and look for submissions that have been successfully submitted to Kaggle

# bucket-of-chump: folder that comes from a top kernel
bucket_dir = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/bucket-of-chump'
# santa25-public: might contain validated submissions
public_dir = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public'

# For each folder print a heading, then up to ten of its entries if it exists
for heading, folder in (
    (f"Checking {bucket_dir}", bucket_dir),
    (f"\nChecking {public_dir}", public_dir),
):
    print(heading)
    if not os.path.exists(folder):
        continue
    for f in os.listdir(folder)[:10]:
        print(f"  {f}")