# Evolver Loop 1 Analysis: Investigating Overlap in Group 040

The submission failed with the error "Overlapping trees in group 040". We need to:
1. Understand why our overlap detection missed this
2. Fix the overlap detection
3. Repair the submission

In [1]:
import numpy as np
import pandas as pd
from shapely.geometry import Polygon
from shapely.affinity import rotate, translate
from shapely.strtree import STRtree
import os

# Christmas tree outline as 15 (x, y) vertices, listed clockwise starting at
# the tip. The outline is mirror-symmetric about x = 0, and the local origin
# (0, 0) sits at the middle of the base line where the trunk meets the lowest
# tier.
TREE_VERTICES = np.array([
    (0.0, 0.8),      # Tip
    (0.125, 0.5),    # Right top tier
    (0.0625, 0.5),   # Right top tier, inner notch
    (0.2, 0.25),     # Right mid tier
    (0.1, 0.25),     # Right mid tier, inner notch
    (0.35, 0.0),     # Right base
    (0.075, 0.0),    # Right trunk
    (0.075, -0.2),   # Right trunk, bottom
    (-0.075, -0.2),  # Left trunk
    (-0.075, 0.0),   # Left trunk, top
    (-0.35, 0.0),    # Left base
    (-0.1, 0.25),    # Left mid tier
    (-0.2, 0.25),    # Left mid tier, outer point
    (-0.0625, 0.5),  # Left top tier
    (-0.125, 0.5),   # Left top tier, outer point
])

class ChristmasTree:
    """A tree placed at ``(x, y)`` and rotated by ``deg`` degrees.

    The shapely outline is built lazily the first time ``polygon`` is read and
    is cached afterwards; changing ``x``, ``y`` or ``deg`` later does not
    refresh the cached outline.
    """

    def __init__(self, x=0, y=0, deg=0):
        self.x, self.y, self.deg = x, y, deg
        self._polygon = None  # populated on first access to ``polygon``

    @property
    def polygon(self):
        """Return the placed outline, constructing it on first use."""
        cached = self._polygon
        if cached is not None:
            return cached
        # Rotate about the local origin first, then shift into position.
        outline = rotate(Polygon(TREE_VERTICES), self.deg, origin=(0, 0))
        self._polygon = translate(outline, self.x, self.y)
        return self._polygon

def load_submission(filepath):
    """Read a submission CSV and return it with numeric coordinates.

    The ``x``, ``y`` and ``deg`` columns are converted to text, every ``'s'``
    character is removed (submission values are written as e.g. ``s0.5``), and
    the result is cast to float. Other columns are returned as read.
    """
    frame = pd.read_csv(filepath)
    numeric_columns = ('x', 'y', 'deg')
    for name in numeric_columns:
        as_text = frame[name].astype(str)
        without_marker = as_text.str.replace('s', '', regex=False)
        frame[name] = without_marker.astype(float)
    return frame

def get_trees_for_n(df, n):
    """Build the ChristmasTree objects belonging to group ``n``.

    Rows are selected by their ``id`` starting with the zero-padded,
    three-digit group number followed by an underscore (for example ``040_``).
    The trees are returned in row order.
    """
    group_rows = df[df['id'].str.startswith(f"{n:03d}_")]
    return [
        ChristmasTree(record['x'], record['y'], record['deg'])
        for _, record in group_rows.iterrows()
    ]

print("Functions defined")

Functions defined


In [2]:
# Load the submitted file (coordinates are parsed to float by load_submission)
submission_path = '/home/submission/submission.csv'
df = load_submission(submission_path)
print(f"Loaded {len(df)} rows")

# Check group 040 specifically: this is the group named in the rejection
# message, so build its trees for the overlap checks in the following cells.
trees_40 = get_trees_for_n(df, 40)
print(f"Group 040 has {len(trees_40)} trees")

Loaded 20100 rows
Group 040 has 40 trees


In [3]:
# More rigorous overlap detection
def check_overlaps_detailed(trees, tolerance=1e-10):
    """Report every pair of trees whose shared area exceeds ``tolerance``.

    Each unordered pair ``(i, j)`` with ``i < j`` is tested once. A pair is
    reported only when the polygons intersect and the area of their
    intersection is strictly greater than ``tolerance``.

    Returns a list of dicts with keys ``tree_i``, ``tree_j``,
    ``intersection_area`` and ``intersection_type``.
    """
    shapes = [tree.polygon for tree in trees]
    found = []
    for a, first in enumerate(shapes):
        for b in range(a + 1, len(shapes)):
            second = shapes[b]
            if not first.intersects(second):
                continue
            common = first.intersection(second)
            # Boundary-only contact (zero area) and slivers at or below the
            # tolerance are not reported.
            if not common.area > tolerance:
                continue
            found.append({
                'tree_i': a,
                'tree_j': b,
                'intersection_area': common.area,
                'intersection_type': common.geom_type
            })
    return found

# Check group 040 with the default area tolerance and list every flagged pair
# together with the area and geometry type of its intersection.
overlaps_40 = check_overlaps_detailed(trees_40)
print(f"Overlaps in group 040: {len(overlaps_40)}")
for o in overlaps_40:
    print(f"  Trees {o['tree_i']} and {o['tree_j']}: area={o['intersection_area']:.10f}, type={o['intersection_type']}")

Overlaps in group 040: 0


In [4]:
# Check all groups (001 through 200) for overlaps with a stricter tolerance:
# 1e-12 here versus the 1e-10 default of check_overlaps_detailed.
print("Checking all groups for overlaps...")
all_overlaps = {}  # group number -> list of overlap records for that group
for n in range(1, 201):
    trees = get_trees_for_n(df, n)
    overlaps = check_overlaps_detailed(trees, tolerance=1e-12)
    if overlaps:
        all_overlaps[n] = overlaps
        print(f"  Group {n:03d}: {len(overlaps)} overlaps")

print(f"\nTotal groups with overlaps: {len(all_overlaps)}")

Checking all groups for overlaps...



Total groups with overlaps: 0


In [5]:
# Load the original ensemble.csv to compare. It goes through the same
# load_submission parsing, so its coordinates are floats as well.
original_path = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/ensemble.csv'
original_df = load_submission(original_path)

# Check group 040 in original with the same default-tolerance overlap check
original_trees_40 = get_trees_for_n(original_df, 40)
overlaps_original_40 = check_overlaps_detailed(original_trees_40)
print(f"Overlaps in original group 040: {len(overlaps_original_40)}")
for o in overlaps_original_40:
    print(f"  Trees {o['tree_i']} and {o['tree_j']}: area={o['intersection_area']:.10f}")

Overlaps in original group 040: 0


In [6]:
# Compare the data for group 040 between original and submitted.
# NOTE(review): both frames hold float-parsed values and are printed at pandas'
# default display precision, so identical printouts here do not show that the
# underlying CSV text is identical.
print("Original group 040:")
original_40_data = original_df[original_df['id'].str.startswith('040_')]
print(original_40_data.head(10))

print("\nSubmitted group 040:")
submitted_40_data = df[df['id'].str.startswith('040_')]
print(submitted_40_data.head(10))

Original group 040:
        id         x         y         deg
780  040_0 -1.689632 -0.247894  252.060473
781  040_1  0.953906 -0.552668  252.151406
782  040_2 -0.821314  0.356434  252.217177
783  040_3 -1.689632  0.489826  252.060473
784  040_4  0.842519 -1.882363  246.370622
785  040_5 -1.690955 -1.834206  242.547378
786  040_6 -0.874184 -1.869729  252.223495
787  040_7 -0.026478 -1.869689  252.111808
788  040_8  0.133073  0.291614  252.217182
789  040_9 -1.689632 -1.030321  252.060473

Submitted group 040:
        id         x         y         deg
780  040_0 -1.689632 -0.247894  252.060473
781  040_1  0.953906 -0.552668  252.151406
782  040_2 -0.821314  0.356434  252.217177
783  040_3 -1.689632  0.489826  252.060473
784  040_4  0.842519 -1.882363  246.370622
785  040_5 -1.690955 -1.834206  242.547378
786  040_6 -0.874184 -1.869729  252.223495
787  040_7 -0.026478 -1.869689  252.111808
788  040_8  0.133073  0.291614  252.217182
789  040_9 -1.689632 -1.030321  252.060473


In [7]:
# The Kaggle metric might use a different overlap detection method
# Let's check if there are any touching polygons that might be considered overlapping

def check_touching_detailed(trees):
    """Collect pairs of trees whose polygons touch.

    Every unordered pair ``(i, j)`` with ``i < j`` is examined. Touching pairs
    are returned as dicts with keys ``tree_i``, ``tree_j`` and ``touch_type``
    (the geometry type of the pair's intersection). As a side effect, any pair
    whose intersection has strictly positive area is printed.
    """
    shapes = [tree.polygon for tree in trees]
    count = len(shapes)
    touching_pairs = []
    for a in range(count):
        first = shapes[a]
        for b in range(a + 1, count):
            second = shapes[b]
            if first.touches(second):
                contact = first.intersection(second)
                touching_pairs.append({
                    'tree_i': a,
                    'tree_j': b,
                    'touch_type': contact.geom_type
                })
            # Independently of touching, report any positive-area intersection,
            # however small.
            if not first.intersects(second):
                continue
            shared = first.intersection(second)
            if shared.area > 0:
                print(f"Trees {a} and {b}: intersection area = {shared.area:.15f}")
    return touching_pairs

# Check group 040 for touching pairs; any positive-area intersections are
# printed by check_touching_detailed itself.
touching_40 = check_touching_detailed(trees_40)
print(f"Touching pairs in group 040: {len(touching_40)}")

Touching pairs in group 040: 0


In [8]:
# Let's check the Kaggle metric notebook for their overlap detection.
# Search the research folder for any file whose name contains "metric"
# (case-insensitive) and print its path. `os` is already imported in the first
# cell, so it is not re-imported here.
research_dir = '/home/code/research'

if not os.path.isdir(research_dir):
    # os.walk yields nothing for a missing directory; say so explicitly so that
    # "directory absent" is distinguishable from "no metric files".
    print(f"Research directory not found: {research_dir}")
else:
    metric_files = [
        os.path.join(root, f)
        for root, dirs, files in os.walk(research_dir)
        for f in files
        if 'metric' in f.lower()
    ]
    for path in metric_files:
        print(path)
    if not metric_files:
        print(f"No files with 'metric' in their name under {research_dir}")

In [9]:
# Re-run the overlap check on slightly shrunk polygons.
# NOTE: shrinking makes the test *less* sensitive than the plain check, not
# more: each shrunk polygon lies inside its original, so contact shallower than
# roughly the buffer size disappears. This can show whether an overlap is deeper
# than the buffer; it cannot surface overlaps the unbuffered check does not see.

def check_overlaps_with_buffer(trees, buffer_size=1e-8):
    """Check for overlaps between polygons buffered by ``-buffer_size``.

    Each tree's polygon is buffered once with distance ``-buffer_size`` (a
    positive ``buffer_size`` therefore shrinks the polygon; a negative one
    expands it). Every unordered pair ``(i, j)`` with ``i < j`` of the buffered
    polygons is then tested, and pairs whose intersection has strictly positive
    area are reported.

    Args:
        trees: sequence of objects exposing a ``polygon`` attribute.
        buffer_size: distance by which each polygon is shrunk before testing.

    Returns:
        A list of dicts with keys ``tree_i``, ``tree_j`` and
        ``intersection_area`` (area of the buffered polygons' intersection).
    """
    # Buffering depends only on the individual polygon, so do it once per tree
    # instead of twice per pair inside the nested loop.
    shrunk = [t.polygon.buffer(-buffer_size) for t in trees]

    overlaps = []
    for i in range(len(shrunk)):
        for j in range(i + 1, len(shrunk)):
            if shrunk[i].intersects(shrunk[j]):
                intersection = shrunk[i].intersection(shrunk[j])
                area = intersection.area
                if area > 0:
                    overlaps.append({
                        'tree_i': i,
                        'tree_j': j,
                        'intersection_area': area
                    })
    return overlaps

# Check group 040 again, with each polygon shrunk by 1e-8 before testing
overlaps_buffer = check_overlaps_with_buffer(trees_40, buffer_size=1e-8)
print(f"Overlaps with buffer in group 040: {len(overlaps_buffer)}")

Overlaps with buffer in group 040: 0


In [10]:
# Let's check the raw submission file format - maybe the 's' prefix is causing issues.
# Read the CSV without load_submission so the values stay as the exact text
# that was written to disk (no float conversion).
raw_df = pd.read_csv('/home/submission/submission.csv')
print("Raw submission format:")
print(raw_df[raw_df['id'].str.startswith('040_')].head(5))

# Check if the 's' prefix is correctly formatted: with the prefix present the
# coordinate columns are read as strings (object dtype) rather than numbers.
print("\nColumn types:")
print(raw_df.dtypes)

Raw submission format:
        id                     x                      y                  deg
780  040_0  s-1.6896321034220907  s-0.24789443118256277  s252.06047297262413
781  040_1    s0.953906339823958   s-0.5526684960764324  s252.15140577197445
782  040_2  s-0.8213139514209225    s0.3564335759017292  s252.21717684648004
783  040_3   s-1.689632103415816    s0.4898261274657638  s252.06047293571422
784  040_4   s0.8425191623179998   s-1.8823633520681906  s246.37062226934367

Column types:
id     object
x      object
y      object
deg    object
dtype: object


In [11]:
# Compare with sample submission format: show the raw (unparsed) text of the
# first group-040 rows as a reference for how values are expected to look.
sample_df = pd.read_csv('/home/data/sample_submission.csv')
print("Sample submission format:")
print(sample_df[sample_df['id'].str.startswith('040_')].head(5))

Sample submission format:
        id           x           y     deg
780  040_0        s0.0        s0.0   s90.0
781  040_1   s0.202736  s-0.511271   s90.0
782  040_2     s0.5206   s0.177413  s180.0
783  040_3  s-0.818657  s-0.228694  s180.0
784  040_4   s0.111852   s0.893022  s180.0


In [12]:
# Let's check the original ensemble.csv format: read it without float parsing
# so the number of digits stored in the file can be compared with the
# submitted file's raw text.
original_raw = pd.read_csv('/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/ensemble.csv')
print("Original ensemble.csv format:")
print(original_raw[original_raw['id'].str.startswith('040_')].head(5))

Original ensemble.csv format:
        id                       x                       y  \
780  040_0  s-1.689632103422090692  s-0.247894431182562769   
781  040_1   s0.953906339823958005  s-0.552668496076432425   
782  040_2  s-0.821313951420922450   s0.356433575901729194   
783  040_3  s-1.689632103415815934   s0.489826127465763816   
784  040_4   s0.842519162317999837  s-1.882363352068190610   

                         deg  
780  s252.060472972624125987  
781  s252.151405771974452819  
782  s252.217176846480043650  
783  s252.060472935714216192  
784  s246.370622269343670041  


In [13]:
# The issue is precision loss. Let's just copy the original ensemble.csv directly.
# (The raw-text cells show the submitted values carry fewer digits than the
# original file's values; presumably that rounding is what the validator
# tripped on - not confirmed by any check in this notebook.)
import shutil

# Copy original ensemble.csv to submission.
# NOTE(review): this overwrites the previous submission file in place; `df`,
# `trees_40` and `raw_df` loaded in earlier cells still hold the old contents.
src = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/ensemble.csv'
dst = '/home/submission/submission.csv'
shutil.copy(src, dst)
print(f"Copied {src} to {dst}")

# Verify: re-read the copied file as raw text and show the first group-040 rows
verify_df = pd.read_csv(dst)
print(f"Submission rows: {len(verify_df)}")
print(verify_df[verify_df['id'].str.startswith('040_')].head(3))

Copied /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/ensemble.csv to /home/submission/submission.csv
Submission rows: 20100
        id                       x                       y  \
780  040_0  s-1.689632103422090692  s-0.247894431182562769   
781  040_1   s0.953906339823958005  s-0.552668496076432425   
782  040_2  s-0.821313951420922450   s0.356433575901729194   

                         deg  
780  s252.060472972624125987  
781  s252.151405771974452819  
782  s252.217176846480043650  


In [None]:
# Check if the data is different.
# NOTE(review): both frames were produced by load_submission, so this compares
# float64 values; differences in the CSV text beyond float precision cannot
# show up here.
print("Checking if data differs between original and submitted...")
for col in ['x', 'y', 'deg']:
    orig_vals = original_40_data[col].values
    sub_vals = submitted_40_data[col].values
    # Largest absolute per-row difference for this column within group 040
    diff = np.abs(orig_vals - sub_vals).max()
    print(f"  {col}: max difference = {diff}")

# Check if the issue is in how we wrote the submission.
# NOTE(review): when the cells are run top to bottom, the submission file has
# already been overwritten with the original by this point, so this re-read
# shows the copied file rather than the submission that was rejected.
print("\nRe-reading submission to check format...")
raw_df = pd.read_csv(submission_path)
print(raw_df[raw_df['id'].str.startswith('040_')].head(5))