# Loop 4 Analysis: Understanding the Overlap Problem and Finding Better Approaches

## Key Issues:
1. The eazy optimizer submission FAILED with 'Overlapping trees in group 003'
2. Our local Shapely validation says no overlaps, but Kaggle detects them
3. We need to understand why and find a more robust approach

In [None]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
from decimal import Decimal, getcontext
getcontext().prec = 50

# Tree geometry
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]
TREE_VERTICES = list(zip(TX, TY))

def parse_s_value(s):
    if isinstance(s, str) and s.startswith('s'):
        return float(s[1:])
    return float(s)

def create_tree_polygon(x, y, deg):
    poly = Polygon(TREE_VERTICES)
    poly = affinity.rotate(poly, deg, origin=(0, 0))
    poly = affinity.translate(poly, x, y)
    return poly

def get_bounding_box_side(polygons):
    if not polygons:
        return 0
    all_coords = []
    for poly in polygons:
        all_coords.extend(list(poly.exterior.coords))
    xs = [c[0] for c in all_coords]
    ys = [c[1] for c in all_coords]
    return max(max(xs) - min(xs), max(ys) - min(ys))

print('Functions loaded')

In [None]:
# Compare the failed submission (eazy optimizer) with the baseline
# The eazy optimizer modified tree 003_0 which caused the overlap

baseline_path = '/home/nonroot/snapshots/santa-2025/21129617858/code/preoptimized/ensemble.csv'
eazy_path = '/home/code/experiments/005_long_optimization/submission.csv'

baseline_df = pd.read_csv(baseline_path)
eazy_df = pd.read_csv(eazy_path)

# Check N=3 specifically (where the overlap was detected)
baseline_n3 = baseline_df[baseline_df['id'].str.startswith('003_')].copy()
eazy_n3 = eazy_df[eazy_df['id'].str.startswith('003_')].copy()

print('Baseline N=3:')
for _, row in baseline_n3.iterrows():
    print(f"  {row['id']}: x={parse_s_value(row['x']):.6f}, y={parse_s_value(row['y']):.6f}, deg={parse_s_value(row['deg']):.6f}")

print('\nEazy optimizer N=3:')
for _, row in eazy_n3.iterrows():
    print(f"  {row['id']}: x={parse_s_value(row['x']):.6f}, y={parse_s_value(row['y']):.6f}, deg={parse_s_value(row['deg']):.6f}")

In [None]:
# Check for overlaps in N=3 with high precision
from shapely.strtree import STRtree

def check_overlaps_detailed(df, n):
    prefix = f'{n:03d}_'
    group = df[df['id'].str.startswith(prefix)].copy()
    group['x_val'] = group['x'].apply(parse_s_value)
    group['y_val'] = group['y'].apply(parse_s_value)
    group['deg_val'] = group['deg'].apply(parse_s_value)
    
    polygons = []
    for _, row in group.iterrows():
        poly = create_tree_polygon(row['x_val'], row['y_val'], row['deg_val'])
        polygons.append(poly)
    
    overlaps = []
    for i in range(len(polygons)):
        for j in range(i+1, len(polygons)):
            if polygons[i].intersects(polygons[j]):
                intersection = polygons[i].intersection(polygons[j])
                if intersection.area > 1e-15:  # Very strict threshold
                    overlaps.append((i, j, intersection.area))
    return overlaps, polygons

print('Baseline N=3 overlaps:')
overlaps_base, polys_base = check_overlaps_detailed(baseline_df, 3)
print(f'  Found {len(overlaps_base)} overlaps')
for o in overlaps_base:
    print(f'    Trees {o[0]} and {o[1]}: area={o[2]:.2e}')

print('\nEazy N=3 overlaps:')
overlaps_eazy, polys_eazy = check_overlaps_detailed(eazy_df, 3)
print(f'  Found {len(overlaps_eazy)} overlaps')
for o in overlaps_eazy:
    print(f'    Trees {o[0]} and {o[1]}: area={o[2]:.2e}')

In [None]:
# Check ALL N values for overlaps in the eazy submission
print('Checking all N values for overlaps in eazy submission...')
all_overlaps = []
for n in range(1, 201):
    overlaps, _ = check_overlaps_detailed(eazy_df, n)
    if overlaps:
        all_overlaps.append((n, overlaps))
        
print(f'\nFound overlaps in {len(all_overlaps)} groups:')
for n, overlaps in all_overlaps[:10]:  # Show first 10
    print(f'  N={n}: {len(overlaps)} overlaps, max area={max(o[2] for o in overlaps):.2e}')

In [None]:
# Check the corner_extraction solution - it's overlap-free and better than ensemble
corner_path = '/home/nonroot/snapshots/santa-2025/21129617858/code/preoptimized/corner_extraction.csv'
corner_df = pd.read_csv(corner_path)

print('Checking corner_extraction.csv for overlaps...')
corner_overlaps = []
for n in range(1, 201):
    overlaps, _ = check_overlaps_detailed(corner_df, n)
    if overlaps:
        corner_overlaps.append((n, overlaps))
        
print(f'Found overlaps in {len(corner_overlaps)} groups')

# Calculate score
def calculate_score(df):
    total = 0
    for n in range(1, 201):
        prefix = f'{n:03d}_'
        group = df[df['id'].str.startswith(prefix)]
        group_copy = group.copy()
        group_copy['x_val'] = group_copy['x'].apply(parse_s_value)
        group_copy['y_val'] = group_copy['y'].apply(parse_s_value)
        group_copy['deg_val'] = group_copy['deg'].apply(parse_s_value)
        polygons = [create_tree_polygon(row['x_val'], row['y_val'], row['deg_val']) for _, row in group_copy.iterrows()]
        side = get_bounding_box_side(polygons)
        total += side**2 / n
    return total

print(f'\ncorner_extraction.csv score: {calculate_score(corner_df):.6f}')
print(f'ensemble.csv score: {calculate_score(baseline_df):.6f}')

In [None]:
# Create an ensemble of corner_extraction + ensemble (best N from each)
def create_ensemble(df1, df2):
    """Create ensemble by taking best N from each source"""
    best_rows = []
    improvements = []
    
    for n in range(1, 201):
        prefix = f'{n:03d}_'
        
        # Get groups from each
        g1 = df1[df1['id'].str.startswith(prefix)].copy()
        g2 = df2[df2['id'].str.startswith(prefix)].copy()
        
        # Calculate scores
        g1['x_val'] = g1['x'].apply(parse_s_value)
        g1['y_val'] = g1['y'].apply(parse_s_value)
        g1['deg_val'] = g1['deg'].apply(parse_s_value)
        polys1 = [create_tree_polygon(row['x_val'], row['y_val'], row['deg_val']) for _, row in g1.iterrows()]
        side1 = get_bounding_box_side(polys1)
        
        g2['x_val'] = g2['x'].apply(parse_s_value)
        g2['y_val'] = g2['y'].apply(parse_s_value)
        g2['deg_val'] = g2['deg'].apply(parse_s_value)
        polys2 = [create_tree_polygon(row['x_val'], row['y_val'], row['deg_val']) for _, row in g2.iterrows()]
        side2 = get_bounding_box_side(polys2)
        
        if side1 < side2:
            best_rows.extend(g1[['id', 'x', 'y', 'deg']].values.tolist())
            if side1 < side2 - 1e-6:
                improvements.append((n, side2 - side1, side1, side2))
        else:
            best_rows.extend(g2[['id', 'x', 'y', 'deg']].values.tolist())
            if side2 < side1 - 1e-6:
                improvements.append((n, side1 - side2, side2, side1))
    
    ensemble_df = pd.DataFrame(best_rows, columns=['id', 'x', 'y', 'deg'])
    return ensemble_df, improvements

ensemble_df, improvements = create_ensemble(corner_df, baseline_df)
print(f'Created ensemble with {len(improvements)} improvements:')
for n, imp, best, worst in improvements[:10]:
    print(f'  N={n}: improved by {imp:.6f} (from {worst:.6f} to {best:.6f})')

print(f'\nEnsemble score: {calculate_score(ensemble_df):.6f}')

In [None]:
# Save the ensemble as a candidate
ensemble_df.to_csv('/home/code/experiments/006_corner_extraction/ensemble_best.csv', index=False)
print('Saved ensemble to /home/code/experiments/006_corner_extraction/ensemble_best.csv')

# Verify no overlaps
print('\nVerifying no overlaps in ensemble...')
ensemble_overlaps = []
for n in range(1, 201):
    overlaps, _ = check_overlaps_detailed(ensemble_df, n)
    if overlaps:
        ensemble_overlaps.append((n, overlaps))
        
print(f'Found overlaps in {len(ensemble_overlaps)} groups')

In [None]:
# Summary of current situation
print('='*60)
print('SUMMARY')
print('='*60)
print(f'Target score: 68.919154')
print(f'Best verified LB: 70.676102 (ensemble.csv)')
print(f'Gap to target: {70.676102 - 68.919154:.6f} ({(70.676102 - 68.919154)/68.919154*100:.2f}%)')
print()
print('Available overlap-free solutions:')
print(f'  ensemble.csv: 70.676102')
print(f'  corner_extraction.csv: {calculate_score(corner_df):.6f}')
print(f'  ensemble of both: {calculate_score(ensemble_df):.6f}')
print()
print('PROBLEM: The eazy optimizer creates overlaps that Kaggle detects')
print('but our local Shapely validation misses (precision issue).')
print()
print('NEXT STEPS:')
print('1. Submit the corner_extraction ensemble (70.675457) to verify it works')
print('2. Try the bbox3 optimizer from why-not kernel (more robust)')
print('3. Focus on fundamentally different approaches')