# Loop 2 Analysis: Investigating Overlap Error

The submission failed with 'Overlapping trees in group 069'. Let's investigate and fix this.

In [1]:
import numpy as np
import pandas as pd
from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
from shapely.strtree import STRtree

# Use 30 significant digits for Decimal arithmetic in this notebook
# (applies to the Decimal divisions/negations done in ChristmasTree.__init__).
getcontext().prec = 30

class ChristmasTree:
    """A tree-shaped polygon placed at a given position and rotation.

    Attributes:
        center_x, center_y: translation applied to the outline, stored as Decimal.
        angle: rotation passed to ``affinity.rotate`` (about the origin), stored
            as Decimal.
        polygon: the rotated-then-translated geometry.

    Note: all coordinates, the angle and the offsets are converted to ``float``
    before any geometry is built, so Decimal precision is not carried into
    ``polygon``.
    """

    def __init__(self, center_x='0', center_y='0', angle='0'):
        """Build the outline, rotate it about (0, 0), then translate it."""
        self.center_x = Decimal(str(center_x))
        self.center_y = Decimal(str(center_y))
        self.angle = Decimal(str(angle))

        two, four = Decimal('2'), Decimal('4')

        # Widths of the trunk and of the three tiers.
        trunk_w, trunk_h = Decimal('0.15'), Decimal('0.2')
        base_w, mid_w, top_w = Decimal('0.7'), Decimal('0.4'), Decimal('0.25')

        # Heights of the tip, the tier boundaries, the base and the trunk bottom.
        tip_y = Decimal('0.8')
        tier_1_y = Decimal('0.5')
        tier_2_y = Decimal('0.25')
        base_y = Decimal('0.0')
        trunk_bottom_y = -trunk_h

        # Right-hand side of the outline, walking from just below the tip
        # down to the bottom-right corner of the trunk.
        right_side = [
            (top_w / two, tier_1_y),
            (top_w / four, tier_1_y),
            (mid_w / two, tier_2_y),
            (mid_w / four, tier_2_y),
            (base_w / two, base_y),
            (trunk_w / two, base_y),
            (trunk_w / two, trunk_bottom_y),
        ]
        # The left-hand side is the mirror image, walked back up to the tip.
        left_side = [(-x, y) for x, y in reversed(right_side)]
        outline = [(Decimal('0.0'), tip_y)] + right_side + left_side

        initial_polygon = Polygon([(float(x), float(y)) for x, y in outline])
        rotated = affinity.rotate(initial_polygon, float(self.angle), origin=(0, 0))
        self.polygon = affinity.translate(
            rotated,
            xoff=float(self.center_x),
            yoff=float(self.center_y),
        )

# Marker output: reached only after the class definition above has executed.
print('ChristmasTree class defined')

ChristmasTree class defined


In [2]:
def parse_value(val):
    """Return `val` as a string, dropping one leading 's' from string inputs.

    Non-string inputs are simply passed through ``str()``.
    """
    if not isinstance(val, str):
        return str(val)
    return val[1:] if val.startswith('s') else str(val)

def load_trees_for_n(df, n):
    """Build one ChristmasTree per row whose `id` starts with the group prefix.

    The prefix is `n` zero-padded to three digits followed by an underscore
    (e.g. ``069_``). The `x`, `y` and `deg` cells of each matching row are run
    through ``parse_value`` before being handed to ``ChristmasTree``.

    Returns:
        list of ChristmasTree, in the row order of `df`.
    """
    group_prefix = f"{n:03d}_"
    in_group = df['id'].str.startswith(group_prefix)
    return [
        ChristmasTree(
            parse_value(row['x']),
            parse_value(row['y']),
            parse_value(row['deg']),
        )
        for _, row in df[in_group].iterrows()
    ]

def has_overlap(trees):
    """Report pairs of trees whose polygons overlap.

    Candidate partners come from ``STRtree.query``; the returned values are
    used as integer positions into the polygon list. A pair (i, j) with j > i
    is recorded when the polygons intersect, do not merely touch, and the area
    of their intersection exceeds 1e-10.

    Returns:
        (found_any, overlaps) where overlaps is a list of
        (i, j, intersection_area) tuples. With zero or one tree the result is
        (False, []).
    """
    if len(trees) <= 1:
        return False, []

    shapes = [tree.polygon for tree in trees]
    spatial_index = STRtree(shapes)

    found = []
    for i, shape in enumerate(shapes):
        for j in spatial_index.query(shape):
            if j <= i:
                # Each unordered pair is examined once, from its lower index.
                continue
            other = shapes[j]
            if not shape.intersects(other) or shape.touches(other):
                continue
            shared_area = shape.intersection(other).area
            if shared_area > 1e-10:
                found.append((i, j, shared_area))
    return bool(found), found

# Marker output: reached only after the helper definitions above have executed.
print('Helper functions defined')

Helper functions defined


In [3]:
# Load candidate_001.csv and run the local overlap test on group 069 only.
df = pd.read_csv('/home/code/submission_candidates/candidate_001.csv')
print(f'Loaded {len(df)} rows')

# Build the trees of group 069.
trees_69 = load_trees_for_n(df, 69)
print(f'Group 069 has {len(trees_69)} trees')

# Report the verdict, plus at most the first ten offending pairs.
has_ovlp, overlaps = has_overlap(trees_69)
print(f'Has overlap: {has_ovlp}')
print(f'Overlaps found: {overlaps[:10]}' if overlaps else 'No overlaps in group 069')

Loaded 20100 rows
Group 069 has 69 trees
Has overlap: False
No overlaps in group 069


In [4]:
# Run the local overlap test on every group 1..200 of `df` and collect
# (group number, number of overlapping pairs) for the groups that fail.
overlap_groups = []
for n in range(1, 201):
    trees = load_trees_for_n(df, n)
    has_ovlp, overlaps = has_overlap(trees)
    if not has_ovlp:
        continue
    overlap_groups.append((n, len(overlaps)))

# Summary, followed by at most the first twenty failing groups.
print(f'Groups with overlaps: {len(overlap_groups)}')
for n, count in overlap_groups[:20]:
    print(f'  N={n}: {count} overlaps')

Groups with overlaps: 0


In [5]:
# Repeat the all-groups overlap scan on the baseline file (candidate_000.csv).
df_baseline = pd.read_csv('/home/code/submission_candidates/candidate_000.csv')
print('Checking baseline for overlaps...')

overlap_groups_baseline = []
for n in range(1, 201):
    trees = load_trees_for_n(df_baseline, n)
    has_ovlp, overlaps = has_overlap(trees)
    if not has_ovlp:
        continue
    overlap_groups_baseline.append((n, len(overlaps)))

# Summary, followed by either the clean verdict or the first ten failing groups.
print(f'Baseline groups with overlaps: {len(overlap_groups_baseline)}')
if not overlap_groups_baseline:
    print('Baseline has NO overlaps - this is the valid submission')
else:
    for n, count in overlap_groups_baseline[:10]:
        print(f'  N={n}: {count} overlaps')

Checking baseline for overlaps...


Baseline groups with overlaps: 0
Baseline has NO overlaps - this is the valid submission


In [None]:
# NOTE: the scans recorded earlier in this notebook report zero overlapping
# groups for BOTH candidate_001.csv and candidate_000.csv, so the local check
# does not reproduce the reported 'Overlapping trees in group 069' failure.
# Run the same local check on current.csv and submission.csv as well.
import os


def count_overlap_groups(frame, max_n=200):
    """Count the groups 1..max_n of `frame` that the local check flags.

    A group is counted when ``has_overlap`` returns True for the trees that
    ``load_trees_for_n`` builds for it.
    """
    return sum(
        1
        for n in range(1, max_n + 1)
        if has_overlap(load_trees_for_n(frame, n))[0]
    )


# Each file is optional: when it is missing, only the "Checking ..." line prints.
print('Checking current.csv...')
if os.path.exists('/home/code/current.csv'):
    df_current = pd.read_csv('/home/code/current.csv')
    overlap_count = count_overlap_groups(df_current)
    print(f'current.csv has {overlap_count} groups with overlaps')

print('\nChecking /home/submission/submission.csv...')
if os.path.exists('/home/submission/submission.csv'):
    df_sub = pd.read_csv('/home/submission/submission.csv')
    overlap_count = count_overlap_groups(df_sub)
    print(f'/home/submission/submission.csv has {overlap_count} groups with overlaps')

In [6]:
# The issue might be that Kaggle uses a stricter overlap detection
# Let's check with a smaller epsilon threshold

def has_overlap_strict(trees, epsilon=1e-12):
    """Overlap detection without the "touches" exemption and with a tunable threshold.

    Candidate partners come from ``STRtree.query``; the returned values are
    used as integer positions into the polygon list. A pair (i, j) with j > i
    is recorded when the polygons intersect and the area of their intersection
    is strictly greater than `epsilon`.

    Args:
        trees: objects exposing a ``polygon`` attribute.
        epsilon: area threshold; pass 0 to flag any positive-area intersection.

    Returns:
        (found_any, overlaps) where overlaps is a list of
        (i, j, intersection_area) tuples. With zero or one tree the result is
        (False, []).
    """
    if len(trees) <= 1:
        return False, []

    shapes = [tree.polygon for tree in trees]
    spatial_index = STRtree(shapes)

    found = []
    for i, shape in enumerate(shapes):
        # Only look at partners with a higher index so each pair is seen once.
        later_candidates = (j for j in spatial_index.query(shape) if j > i)
        for j in later_candidates:
            other = shapes[j]
            if not shape.intersects(other):
                continue
            shared_area = shape.intersection(other).area
            if shared_area > epsilon:
                found.append((i, j, shared_area))
    return bool(found), found

# Check group 069 with stricter detection
# `df` is whatever frame is currently bound to that name; in the run order
# shown above that is candidate_001.csv -- TODO confirm on a fresh kernel.
trees_69 = load_trees_for_n(df, 69)
# epsilon=1e-15 overrides the function default of 1e-12.
has_ovlp, overlaps = has_overlap_strict(trees_69, epsilon=1e-15)
print(f'Strict check - Has overlap: {has_ovlp}')
# Show at most the first ten offending pairs; prints nothing when there are none.
if overlaps:
    print(f'Overlaps found: {overlaps[:10]}')

Strict check - Has overlap: False


In [7]:
# Let's check the actual submission file that was submitted
# Unlike the earlier existence-guarded check, this read is unconditional and
# raises if the file is missing.
df_submitted = pd.read_csv('/home/submission/submission.csv')
print(f'Submitted file has {len(df_submitted)} rows')

# Check group 069
trees_69_sub = load_trees_for_n(df_submitted, 69)
print(f'Group 069 has {len(trees_69_sub)} trees')

# Check with different epsilon values
# The last value, 0, flags any intersection whose area is strictly positive.
for eps in [1e-10, 1e-12, 1e-15, 0]:
    has_ovlp, overlaps = has_overlap_strict(trees_69_sub, epsilon=eps)
    print(f'epsilon={eps}: Has overlap: {has_ovlp}, count: {len(overlaps)}')

Submitted file has 20100 rows
Group 069 has 69 trees
epsilon=1e-10: Has overlap: False, count: 0
epsilon=1e-12: Has overlap: False, count: 0
epsilon=1e-15: Has overlap: False, count: 0
epsilon=0: Has overlap: False, count: 0


In [8]:
# The submission file looks clean locally. Let's check if the issue is with the file format
# or if there's a difference between what we submitted and what Kaggle sees

# Check if the files are identical
import hashlib


def file_hash(path):
    """Return the hex MD5 digest of the file at `path` (read fully, in binary mode).

    MD5 is used here only to compare file contents, not for security.
    """
    with open(path, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()


print('File hashes:')
print(f'  candidate_000.csv: {file_hash("/home/code/submission_candidates/candidate_000.csv")}')
print(f'  candidate_001.csv: {file_hash("/home/code/submission_candidates/candidate_001.csv")}')
print(f'  submission.csv: {file_hash("/home/submission/submission.csv")}')

# Check if submission.csv matches candidate_001.csv
if file_hash('/home/submission/submission.csv') == file_hash('/home/code/submission_candidates/candidate_001.csv'):
    print('\nsubmission.csv matches candidate_001.csv')
else:
    print('\nsubmission.csv does NOT match candidate_001.csv')

In [None]:
# Let's check the actual content of group 069 in all three files
# (first ten rows whose id starts with '069_' in each).
print('Group 069 in candidate_000.csv:')
df0 = pd.read_csv('/home/code/submission_candidates/candidate_000.csv')
print(df0[df0['id'].str.startswith('069_')].head(10))

print('\nGroup 069 in candidate_001.csv:')
df1 = pd.read_csv('/home/code/submission_candidates/candidate_001.csv')
print(df1[df1['id'].str.startswith('069_')].head(10))

print('\nGroup 069 in submission.csv:')
df_sub = pd.read_csv('/home/submission/submission.csv')
print(df_sub[df_sub['id'].str.startswith('069_')].head(10))