# Loop 6 Analysis: Understanding Overlap Detection Failures

## Problem
- 006_strict_hybrid FAILED with 'Overlapping trees in group 042'
- We need to understand why our overlap detection doesn't match Kaggle's
- Then find a path to the target score (68.891380)

In [1]:
# Load the Kaggle metric code exactly as they use it
import pandas as pd
from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
from shapely.ops import unary_union
from shapely.strtree import STRtree

# EXACT Kaggle settings
getcontext().prec = 25
scale_factor = Decimal('1e18')

class ChristmasTree:
    def __init__(self, center_x='0', center_y='0', angle='0'):
        self.center_x = Decimal(center_x)
        self.center_y = Decimal(center_y)
        self.angle = Decimal(angle)

        trunk_w = Decimal('0.15')
        trunk_h = Decimal('0.2')
        base_w = Decimal('0.7')
        mid_w = Decimal('0.4')
        top_w = Decimal('0.25')
        tip_y = Decimal('0.8')
        tier_1_y = Decimal('0.5')
        tier_2_y = Decimal('0.25')
        base_y = Decimal('0.0')
        trunk_bottom_y = -trunk_h

        initial_polygon = Polygon([
            (Decimal('0.0') * scale_factor, tip_y * scale_factor),
            (top_w / Decimal('2') * scale_factor, tier_1_y * scale_factor),
            (top_w / Decimal('4') * scale_factor, tier_1_y * scale_factor),
            (mid_w / Decimal('2') * scale_factor, tier_2_y * scale_factor),
            (mid_w / Decimal('4') * scale_factor, tier_2_y * scale_factor),
            (base_w / Decimal('2') * scale_factor, base_y * scale_factor),
            (trunk_w / Decimal('2') * scale_factor, base_y * scale_factor),
            (trunk_w / Decimal('2') * scale_factor, trunk_bottom_y * scale_factor),
            (-(trunk_w / Decimal('2')) * scale_factor, trunk_bottom_y * scale_factor),
            (-(trunk_w / Decimal('2')) * scale_factor, base_y * scale_factor),
            (-(base_w / Decimal('2')) * scale_factor, base_y * scale_factor),
            (-(mid_w / Decimal('4')) * scale_factor, tier_2_y * scale_factor),
            (-(mid_w / Decimal('2')) * scale_factor, tier_2_y * scale_factor),
            (-(top_w / Decimal('4')) * scale_factor, tier_1_y * scale_factor),
            (-(top_w / Decimal('2')) * scale_factor, tier_1_y * scale_factor),
        ])
        rotated = affinity.rotate(initial_polygon, float(self.angle), origin=(0, 0))
        self.polygon = affinity.translate(rotated,
                                          xoff=float(self.center_x * scale_factor),
                                          yoff=float(self.center_y * scale_factor))

def check_overlaps_kaggle_exact(df, n):
    """Check overlaps using EXACT Kaggle logic"""
    group_data = df[df['id'].str.startswith(f'{n:03d}_')]
    placed_trees = []
    for _, row in group_data.iterrows():
        x = str(row['x']).lstrip('s')
        y = str(row['y']).lstrip('s')
        deg = str(row['deg']).lstrip('s')
        placed_trees.append(ChristmasTree(x, y, deg))
    
    all_polygons = [p.polygon for p in placed_trees]
    r_tree = STRtree(all_polygons)
    
    overlaps = []
    for i, poly in enumerate(all_polygons):
        indices = r_tree.query(poly)
        for index in indices:
            if index == i:
                continue
            if poly.intersects(all_polygons[index]) and not poly.touches(all_polygons[index]):
                overlaps.append((i, index))
    return len(overlaps) > 0, overlaps

print('Kaggle metric code loaded successfully')

Kaggle metric code loaded successfully


In [2]:
# Load the current submission and check ALL N values for overlaps
df = pd.read_csv('/home/submission/submission.csv')

print('Checking ALL N values for overlaps using EXACT Kaggle logic...')
overlap_ns = []
for n in range(1, 201):
    has_overlap, pairs = check_overlaps_kaggle_exact(df, n)
    if has_overlap:
        overlap_ns.append(n)
        if len(overlap_ns) <= 10:  # Show first 10
            print(f'  N={n}: OVERLAP detected ({len(pairs)} pairs)')

print(f'\nTotal N values with overlaps: {len(overlap_ns)}')
print(f'Overlap N values: {overlap_ns[:20]}...' if len(overlap_ns) > 20 else f'Overlap N values: {overlap_ns}')

Checking ALL N values for overlaps using EXACT Kaggle logic...



Total N values with overlaps: 0
Overlap N values: []


In [3]:
# Check the validated submission (70.622435) that PASSED Kaggle
validated_path = '/home/code/submission_candidates/candidate_003.csv'
df_validated = pd.read_csv(validated_path)

print('Checking VALIDATED submission (70.622435) for overlaps...')
overlap_ns_validated = []
for n in range(1, 201):
    has_overlap, pairs = check_overlaps_kaggle_exact(df_validated, n)
    if has_overlap:
        overlap_ns_validated.append(n)
        print(f'  N={n}: OVERLAP detected')

print(f'\nTotal overlaps in validated submission: {len(overlap_ns_validated)}')
if len(overlap_ns_validated) == 0:
    print('VALIDATED submission has ZERO overlaps - this is our safe baseline!')

Checking VALIDATED submission (70.622435) for overlaps...



Total overlaps in validated submission: 0
VALIDATED submission has ZERO overlaps - this is our safe baseline!


In [4]:
# Check Shapely version - Kaggle uses 2.1.2
import shapely
print(f'Local Shapely version: {shapely.__version__}')
print('Kaggle uses Shapely 2.1.2')

# The issue might be floating point precision in the rotation/translation
# Let's check N=42 specifically (the one that failed on Kaggle)
print('\n=== Detailed check for N=42 ===')
df_current = pd.read_csv('/home/submission/submission.csv')
group_42 = df_current[df_current['id'].str.startswith('042_')]
print(f'N=42 has {len(group_42)} trees')

# Check each pair for near-overlaps
placed_trees = []
for _, row in group_42.iterrows():
    x = str(row['x']).lstrip('s')
    y = str(row['y']).lstrip('s')
    deg = str(row['deg']).lstrip('s')
    placed_trees.append(ChristmasTree(x, y, deg))

all_polygons = [p.polygon for p in placed_trees]

# Check all pairs
near_overlaps = []
for i in range(len(all_polygons)):
    for j in range(i+1, len(all_polygons)):
        poly_i = all_polygons[i]
        poly_j = all_polygons[j]
        
        intersects = poly_i.intersects(poly_j)
        touches = poly_i.touches(poly_j)
        
        if intersects:
            intersection = poly_i.intersection(poly_j)
            near_overlaps.append((i, j, intersects, touches, intersection.area))
            
if near_overlaps:
    print(f'Found {len(near_overlaps)} intersecting pairs:')
    for i, j, inter, touch, area in near_overlaps[:10]:
        print(f'  Trees {i} and {j}: intersects={inter}, touches={touch}, intersection_area={area}')

Local Shapely version: 2.1.2
Kaggle uses Shapely 2.1.2

=== Detailed check for N=42 ===
N=42 has 42 trees


In [None]:
# Check the submission file format - maybe there's a formatting issue
df_current = pd.read_csv('/home/submission/submission.csv')
print('=== Submission file format check ===')
print(f'Columns: {list(df_current.columns)}')\nprint(f'Total rows: {len(df_current)}')\nprint(f'Expected rows: {sum(range(1, 201))} = {200*201//2}')\n\n# Check first few rows\nprint('\\nFirst 5 rows:')\nprint(df_current.head())\n\n# Check if all values have 's' prefix\nprint('\\nValue format check:')\nprint(f\"x starts with 's': {df_current['x'].str.startswith('s').all()}\")\nprint(f\"y starts with 's': {df_current['y'].str.startswith('s').all()}\")\nprint(f\"deg starts with 's': {df_current['deg'].str.startswith('s').all()}\")\n\n# Check for any NaN or empty values\nprint(f\"\\nNaN values: {df_current.isna().sum().sum()}\")\nprint(f\"Empty strings: {(df_current == '').sum().sum()}\")

In [None]:
# Strategy: Use the VALIDATED submission as base, only replace N values that:
# 1. Have better scores in other sources
# 2. Pass Kaggle overlap detection

# First, let's understand the score breakdown
import numpy as np
import math
from numba import njit

TX = np.array([0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125])
TY = np.array([0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5])

@njit
def score_group(xs, ys, degs, tx, ty):
    n = xs.size
    V = tx.size
    mnx = mny = 1e300
    mxx = mxy = -1e300
    for i in range(n):
        r = degs[i] * math.pi / 180.0
        c, s = math.cos(r), math.sin(r)
        for j in range(V):
            X = c * tx[j] - s * ty[j] + xs[i]
            Y = s * tx[j] + c * ty[j] + ys[i]
            mnx, mxx = min(mnx, X), max(mxx, X)
            mny, mxy = min(mny, Y), max(mxy, Y)
    side = max(mxx - mnx, mxy - mny)
    return side * side / n

def get_n_score(df, n):
    mask = df['id'].str.startswith(f'{n:03d}_')
    group = df[mask]
    if len(group) != n:
        return None
    xs = group['x'].str[1:].astype(float).values
    ys = group['y'].str[1:].astype(float).values
    degs = group['deg'].str[1:].astype(float).values
    return score_group(xs, ys, degs, TX, TY)

# Calculate per-N scores for validated submission
validated_scores = {}
for n in range(1, 201):
    validated_scores[n] = get_n_score(df_validated, n)

total_validated = sum(validated_scores.values())
print(f'Validated submission total score: {total_validated:.6f}')
print(f'Target score: 68.891380')
print(f'Gap: {total_validated - 68.891380:.6f} ({(total_validated - 68.891380)/68.891380*100:.2f}%)')

In [None]:
# Find ALL available source submissions in snapshots
import os
import glob

snapshot_dir = '/home/nonroot/snapshots/santa-2025'
snapshot_dirs = glob.glob(f'{snapshot_dir}/*')

print(f'Found {len(snapshot_dirs)} snapshot directories')

# Collect all submission files
all_submissions = []
for snap_dir in snapshot_dirs:
    # Check for submission.csv
    sub_path = os.path.join(snap_dir, 'code', 'submission.csv')
    if os.path.exists(sub_path):
        all_submissions.append(sub_path)
    # Check for candidates
    cand_dir = os.path.join(snap_dir, 'code', 'submission_candidates')
    if os.path.exists(cand_dir):
        for f in os.listdir(cand_dir):
            if f.endswith('.csv'):
                all_submissions.append(os.path.join(cand_dir, f))

print(f'Found {len(all_submissions)} total submission files')

In [None]:
# For each N, find the BEST score across ALL sources that passes Kaggle overlap detection
# This is the key to improvement!

best_per_n = {}  # {n: (score, source_path, has_overlap)}

print('Scanning all submissions for best per-N scores...')
for i, sub_path in enumerate(all_submissions[:50]):  # Limit to first 50 for speed
    try:
        df_sub = pd.read_csv(sub_path)
        for n in range(1, 201):
            score_n = get_n_score(df_sub, n)
            if score_n is None:
                continue
            
            # Check if this N passes overlap detection
            has_overlap, _ = check_overlaps_kaggle_exact(df_sub, n)
            
            if n not in best_per_n or (not has_overlap and score_n < best_per_n[n][0]):
                if not has_overlap:  # Only consider overlap-free solutions
                    best_per_n[n] = (score_n, sub_path, has_overlap)
    except Exception as e:
        continue
    
    if (i + 1) % 10 == 0:
        print(f'  Processed {i+1}/{min(50, len(all_submissions))} submissions')

print(f'\nFound best overlap-free scores for {len(best_per_n)} N values')

In [None]:
# Compare best available vs validated
improvement_potential = 0
improved_ns = []

for n in range(1, 201):
    if n in best_per_n:
        best_score, best_path, _ = best_per_n[n]
        validated_score = validated_scores[n]
        if best_score < validated_score:
            improvement = validated_score - best_score
            improvement_potential += improvement
            improved_ns.append((n, improvement, best_score, validated_score))

print(f'Total improvement potential: {improvement_potential:.6f}')
print(f'Number of N values that can be improved: {len(improved_ns)}')
print(f'\nTop 10 improvement opportunities:')
for n, imp, best, val in sorted(improved_ns, key=lambda x: -x[1])[:10]:
    print(f'  N={n}: improvement={imp:.6f} (best={best:.6f} vs validated={val:.6f})')

In [None]:
# Create a new hybrid submission using ONLY overlap-free improvements
df_new = df_validated.copy()

for n, imp, best_score, val_score in improved_ns:
    best_path = best_per_n[n][1]
    df_best = pd.read_csv(best_path)
    
    # Replace the N group in df_new with the better one
    mask_new = df_new['id'].str.startswith(f'{n:03d}_')
    mask_best = df_best['id'].str.startswith(f'{n:03d}_')
    
    df_new = df_new[~mask_new]  # Remove old
    df_new = pd.concat([df_new, df_best[mask_best]], ignore_index=True)

# Sort by id
df_new['sort_key'] = df_new['id'].apply(lambda x: (int(x.split('_')[0]), int(x.split('_')[1])))
df_new = df_new.sort_values('sort_key').drop('sort_key', axis=1)

# Calculate new score
new_total = 0
for n in range(1, 201):
    new_total += get_n_score(df_new, n)

print(f'New hybrid score: {new_total:.6f}')
print(f'Validated score: {total_validated:.6f}')
print(f'Improvement: {total_validated - new_total:.6f}')

In [None]:
# CRITICAL: Verify the new hybrid has NO overlaps
print('Verifying new hybrid has NO overlaps...')
overlap_ns_new = []
for n in range(1, 201):
    has_overlap, pairs = check_overlaps_kaggle_exact(df_new, n)
    if has_overlap:
        overlap_ns_new.append(n)
        print(f'  N={n}: OVERLAP detected!')

if len(overlap_ns_new) == 0:
    print('\n✓ New hybrid has ZERO overlaps - safe to submit!')
    # Save the new submission
    df_new.to_csv('/home/submission/submission.csv', index=False)
    df_new.to_csv('/home/code/experiments/007_kaggle_exact_hybrid/submission.csv', index=False)
    print(f'Saved to /home/submission/submission.csv')
    print(f'Score: {new_total:.6f}')
else:
    print(f'\n✗ New hybrid has {len(overlap_ns_new)} overlaps - DO NOT SUBMIT!')