# Evolver Loop 6 Analysis: Fixing Overlap Detection

## Critical Issue
exp_005 failed with "Overlapping trees in group 126"

Our relate()-based check flagged the overlap in N=60 but missed the one in N=126. We need either:
1. More robust overlap detection
2. OR to fall back to the baseline configuration for every N whose configuration differs from the baseline

In [None]:
import sys
sys.path.insert(0, '/home/code')

import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
from shapely.strtree import STRtree

# Read the known-good baseline and the submission that failed validation
baseline_path = '/home/code/experiments/000_baseline/submission.csv'
failed_path = '/home/code/experiments/005_fixed_submission/submission.csv'

baseline_df = pd.read_csv(baseline_path)
failed_df = pd.read_csv(failed_path)

# One print call, one line per frame
print(
    f"Baseline shape: {baseline_df.shape}",
    f"Failed submission shape: {failed_df.shape}",
    sep="\n",
)

In [None]:
# Define the Christmas tree polygon
def get_tree_polygon(x, y, angle):
    """Build the tree outline, rotate it by ``angle`` about its local origin
    and then move that origin to ``(x, y)``.

    The outline is symmetric about the local y-axis: a tip, three tiers of
    decreasing width and a rectangular trunk below ``y = 0``.
    """
    trunk_w, trunk_h = 0.15, 0.2
    base_w, mid_w, top_w = 0.7, 0.4, 0.25
    tip_y, tier_1_y, tier_2_y, base_y = 0.8, 0.5, 0.25, 0.0
    trunk_bottom_y = -trunk_h

    # Right-hand half of the outline, walked from the tip down to the trunk
    right_side = [
        (top_w / 2, tier_1_y),
        (top_w / 4, tier_1_y),
        (mid_w / 2, tier_2_y),
        (mid_w / 4, tier_2_y),
        (base_w / 2, base_y),
        (trunk_w / 2, base_y),
        (trunk_w / 2, trunk_bottom_y),
    ]
    # Left-hand half is the mirror image, walked back up towards the tip
    left_side = [(-vx, vy) for vx, vy in reversed(right_side)]
    outline = [(0.0, tip_y)] + right_side + left_side

    rotated = affinity.rotate(Polygon(outline), angle, origin=(0, 0))
    return affinity.translate(rotated, xoff=x, yoff=y)

def get_trees_for_n(df, n):
    """Return ``(x, y, angle)`` float tuples for the rows belonging to group ``n``.

    A row belongs to the group when its ``id`` starts with the zero-padded
    prefix (``"007_"`` for n=7). Each ``x``, ``y`` and ``deg`` cell is
    stringified, stripped of leading ``'s'`` characters and parsed as a float.
    """
    def parse(cell):
        return float(str(cell).lstrip('s'))

    group = df[df['id'].str.startswith(f"{n:03d}_")]
    return [
        (parse(raw_x), parse(raw_y), parse(raw_deg))
        for raw_x, raw_y, raw_deg in zip(group['x'], group['y'], group['deg'])
    ]

# Progress marker: confirms the helper definitions above were executed
print("Functions defined")

In [None]:
# Check N=126 specifically - this is where Kaggle found the overlap
n = 126
failed_trees = get_trees_for_n(failed_df, n)
baseline_trees = get_trees_for_n(baseline_df, n)

print(f"N={n}: Failed has {len(failed_trees)} trees, Baseline has {len(baseline_trees)} trees")

# Check if they're the same. zip() stops at the shorter list, so a differing
# tree count must count as a difference up front; otherwise extra or missing
# trees would go unnoticed.
same = len(failed_trees) == len(baseline_trees)
for i, (f, b) in enumerate(zip(failed_trees, baseline_trees)):
    if f != b:
        same = False
        # Only the first differing tree is reported
        print(f"Tree {i} differs:")
        print(f"  Failed: {f}")
        print(f"  Baseline: {b}")
        break

print(f"\nN={n} same in both? {same}")

In [None]:
# Check for overlaps in N=126 using multiple methods
def check_overlaps_area(trees, tolerance=0):
    """Return ``(i, j, area)`` for every pair of trees whose intersection area
    is strictly greater than ``tolerance``.

    ``trees`` is a sequence of ``(x, y, angle)`` tuples; ``i < j`` are indices
    into that sequence.
    """
    shapes = [get_tree_polygon(tx, ty, ta) for tx, ty, ta in trees]

    found = []
    for i, first in enumerate(shapes):
        for j, second in enumerate(shapes[i + 1:], start=i + 1):
            # Cheap predicate first; the intersection is only built on a hit
            if not first.intersects(second):
                continue
            shared_area = first.intersection(second).area
            if shared_area > tolerance:
                found.append((i, j, shared_area))
    return found

def check_overlaps_relate(trees):
    """Return ``(i, j, area, matrix)`` for every pair of trees whose interiors
    share a two-dimensional region according to ``relate()``.

    ``trees`` is a sequence of ``(x, y, angle)`` tuples; ``matrix`` is the
    string returned by ``relate()`` for the pair.
    """
    shapes = [get_tree_polygon(tx, ty, ta) for tx, ty, ta in trees]

    found = []
    for i, first in enumerate(shapes):
        for j, second in enumerate(shapes[i + 1:], start=i + 1):
            matrix = first.relate(second)
            # First character '2' means the interiors intersect in 2D
            if matrix[0] != '2':
                continue
            found.append((i, j, first.intersection(second).area, matrix))
    return found

def check_overlaps_buffer(trees, buffer_size=1e-10):
    """Check for overlaps after shrinking every polygon by ``buffer_size``.

    Args:
        trees: sequence of ``(x, y, angle)`` tuples.
        buffer_size: distance by which each polygon is shrunk (applied as a
            negative buffer) before testing.

    Returns:
        List of ``(i, j, area)`` tuples with ``i < j``, one per pair whose
        shrunken polygons intersect without merely touching; ``area`` is the
        intersection area of the shrunken polygons.
    """
    # Shrink each polygon once up front instead of once per pair
    shrunk = [get_tree_polygon(x, y, a).buffer(-buffer_size) for x, y, a in trees]

    overlaps = []
    for i, p1 in enumerate(shrunk):
        for j in range(i + 1, len(shrunk)):
            p2 = shrunk[j]
            if p1.intersects(p2) and not p1.touches(p2):
                overlaps.append((i, j, p1.intersection(p2).area))
    return overlaps

print("Checking N=126 for overlaps...")
print(f"\nArea method (tol=0): {len(check_overlaps_area(failed_trees, 0))} overlaps")
# Run the pairwise relate() scan once and reuse the result for both the
# count and the detail listing below.
relate_overlaps = check_overlaps_relate(failed_trees)
print(f"Relate method: {len(relate_overlaps)} overlaps")
print(f"Buffer method: {len(check_overlaps_buffer(failed_trees))} overlaps")

# Show details (first five pairs only)
if relate_overlaps:
    print(f"\nRelate overlaps:")
    for o in relate_overlaps[:5]:
        print(f"  Trees {o[0]}-{o[1]}: area={o[2]:.2e}, relate={o[3]}")

In [None]:
# Scan every group (N = 1..200) of the failed submission with the relate() check
print("Checking ALL N values for overlaps using relate() method...")

problematic_ns = []
for n in range(1, 201):
    trees = get_trees_for_n(failed_df, n)
    overlaps = check_overlaps_relate(trees)
    if not overlaps:
        continue
    problematic_ns.append((n, overlaps))
    print(f"N={n}: {len(overlaps)} overlaps")
    # List at most three pairs per group to keep the output short
    for o in overlaps[:3]:
        print(f"  Trees {o[0]}-{o[1]}: area={o[2]:.2e}, relate={o[3]}")

print(f"\nTotal problematic N values: {len(problematic_ns)}")

In [None]:
# Check which N values differ from baseline
print("Checking which N values differ from baseline...")

differing_ns = []
for n in range(1, 201):
    failed_trees = get_trees_for_n(failed_df, n)
    baseline_trees = get_trees_for_n(baseline_df, n)

    # Compare the whole lists: unlike a zip() walk, which stops at the
    # shorter list, this also flags groups whose tree counts differ.
    same = failed_trees == baseline_trees

    if not same:
        differing_ns.append(n)

print(f"N values that differ from baseline: {len(differing_ns)}")
print(f"First 20: {differing_ns[:20]}")

In [None]:
# Fallback: submit the baseline unchanged for every N.
# The baseline is the submission known to pass Kaggle validation; any
# replacement configuration has to be both better than the baseline and
# accepted by Kaggle's strict validation. Until such configurations are
# verified, the baseline is copied as-is to confirm it still works.
import shutil

print("Creating a submission using ONLY baseline configurations...")

shutil.copy(baseline_path, '/home/submission/submission.csv')

print(
    "Copied baseline to /home/submission/submission.csv",
    "This should pass Kaggle validation with score 70.676102",
    sep="\n",
)

In [None]:
# Alternative: Create a TRULY safe ensemble
# Only use non-baseline configurations if they:
# 1. Have a better score
# 2. Pass ALL overlap checks (area, relate, buffer)

def is_config_safe(trees):
    """Run the area, relate and buffer overlap checks in that order.

    Returns ``(False, name)`` for the first check that reports any overlap,
    where ``name`` is ``"area"``, ``"relate"`` or ``"buffer"``; later checks
    are not run. Returns ``(True, "safe")`` when all three find nothing.
    """
    # Each check is wrapped in a lambda so it only runs when reached
    checks = (
        ("area", lambda: check_overlaps_area(trees, tolerance=0)),
        ("relate", lambda: check_overlaps_relate(trees)),
        ("buffer", lambda: check_overlaps_buffer(trees, buffer_size=1e-12)),
    )
    for label, run_check in checks:
        if run_check():
            return False, label
    return True, "safe"

print("Checking which differing N values are actually safe...")


def get_score(polys, n):
    """Return the score of a configuration: bounding-square area divided by n.

    Args:
        polys: polygons of the configuration; their exterior vertices define
            the bounding extents.
        n: number to divide the squared side length by.

    Returns:
        ``side ** 2 / n`` where ``side`` is the larger of the x- and
        y-extents over all exterior vertices. Lower is better.
    """
    all_coords = np.array([pt for p in polys for pt in p.exterior.coords])
    x_range = all_coords[:, 0].max() - all_coords[:, 0].min()
    y_range = all_coords[:, 1].max() - all_coords[:, 1].min()
    side = max(x_range, y_range)
    return (side ** 2) / n


# Defined once above rather than re-created on every loop iteration.
safe_improvements = []
for n in differing_ns:
    failed_trees = get_trees_for_n(failed_df, n)
    is_safe, method = is_config_safe(failed_trees)

    if not is_safe:
        print(f"N={n}: UNSAFE ({method})")
        continue

    # Calculate scores for the candidate and for the baseline of the same N
    failed_polys = [get_tree_polygon(x, y, a) for x, y, a in failed_trees]
    baseline_trees = get_trees_for_n(baseline_df, n)
    baseline_polys = [get_tree_polygon(x, y, a) for x, y, a in baseline_trees]

    failed_score = get_score(failed_polys, n)
    baseline_score = get_score(baseline_polys, n)

    # Keep the candidate only when it strictly beats the baseline
    if failed_score < baseline_score:
        improvement = baseline_score - failed_score
        safe_improvements.append((n, improvement, failed_score, baseline_score))
        print(f"N={n}: SAFE improvement of {improvement:.6f}")

print(f"\nTotal safe improvements: {len(safe_improvements)}")

In [None]:
# Start from the baseline and swap in only the verified safe improvements
print("\nCreating truly safe submission...")

safe_df = baseline_df.copy()

# Iterating an empty list is a no-op, so no emptiness guard is required
for n, improvement, _, _ in safe_improvements:
    prefix = f"{n:03d}_"
    # Drop this group's baseline rows, then append the replacement rows
    kept_rows = safe_df[~safe_df['id'].str.startswith(prefix)]
    failed_n_data = failed_df[failed_df['id'].str.startswith(prefix)]
    safe_df = pd.concat([kept_rows, failed_n_data], ignore_index=True)

# Order rows by group number, then by tree index; the helper columns are
# only needed for sorting and are dropped again afterwards
safe_df = (
    safe_df
    .assign(
        n=safe_df['id'].apply(lambda x: int(x.split('_')[0])),
        tree_idx=safe_df['id'].apply(lambda x: int(x.split('_')[1])),
    )
    .sort_values(['n', 'tree_idx'])
    .drop(columns=['n', 'tree_idx'])
)

print(f"Safe submission shape: {safe_df.shape}")

In [None]:
# Score the assembled submission and compare it against the baseline
from utils import score_submission

safe_score, scores_by_n, overlapping_ns = score_submission(safe_df, check_overlaps=True)
baseline_score_check, _, _ = score_submission(baseline_df, check_overlaps=False)

# Single report: scores, improvement, overlap findings and distance to target
print(
    f"Safe submission score: {safe_score:.6f}",
    f"Baseline score: {baseline_score_check:.6f}",
    f"Improvement over baseline: {baseline_score_check - safe_score:.6f}",
    f"Overlapping N values (utils check): {overlapping_ns}",
    f"\nTarget: 68.888293",
    f"Gap to target: {safe_score - 68.888293:.6f}",
    sep="\n",
)

In [None]:
# Re-run all three overlap checks on every group of the assembled submission
print("\nFinal verification of safe submission...")

final_problems = []
for n in range(1, 201):
    trees = get_trees_for_n(safe_df, n)
    is_safe, method = is_config_safe(trees)
    if is_safe:
        continue
    final_problems.append((n, method))
    print(f"N={n}: PROBLEM ({method})")

if final_problems:
    print(f"❌ {len(final_problems)} problematic N values")
else:
    print("✅ All N values pass all overlap checks!")

In [None]:
# Persist the safe submission and its metrics, then publish the CSV
import json
import os
import shutil

work_dir = '/home/code/experiments/006_truly_safe'
os.makedirs(work_dir, exist_ok=True)

safe_df.to_csv(f'{work_dir}/submission.csv', index=False)

# Summary of this experiment; is_valid is True only when the final
# verification recorded no problems
metrics = {
    'cv_score': safe_score,
    'baseline_score': baseline_score_check,
    'improvement_over_baseline': baseline_score_check - safe_score,
    'safe_improvements_count': len(safe_improvements),
    'is_valid': not final_problems,
}

with open(f'{work_dir}/metrics.json', 'w') as metrics_file:
    json.dump(metrics, metrics_file, indent=2)

# Copy to submission folder
shutil.copy(f'{work_dir}/submission.csv', '/home/submission/submission.csv')

print(
    f"Saved to {work_dir}/submission.csv",
    f"Copied to /home/submission/submission.csv",
    f"Metrics: {metrics}",
    sep="\n",
)