# Loop 1 Analysis - Understanding Validation Failures

The baseline submission failed with "Overlapping trees in group 004". Need to:
1. Understand Kaggle's stricter validation
2. Build a truly valid submission
3. Identify the gap to target

In [1]:
import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
from shapely.strtree import STRtree
import json
import os
import glob

getcontext().prec = 30

# Christmas Tree class
class ChristmasTree:
    def __init__(self, center_x='0', center_y='0', angle='0'):
        self.center_x = Decimal(str(center_x))
        self.center_y = Decimal(str(center_y))
        self.angle = Decimal(str(angle))
        
        initial_polygon = Polygon([
            (0.0, 0.8), (0.125, 0.5), (0.0625, 0.5),
            (0.2, 0.25), (0.1, 0.25), (0.35, 0.0),
            (0.075, 0.0), (0.075, -0.2), (-0.075, -0.2),
            (-0.075, 0.0), (-0.35, 0.0), (-0.1, 0.25),
            (-0.2, 0.25), (-0.0625, 0.5), (-0.125, 0.5),
        ])
        rotated = affinity.rotate(initial_polygon, float(self.angle), origin=(0, 0))
        self.polygon = affinity.translate(rotated, xoff=float(self.center_x), yoff=float(self.center_y))

def parse_value(val):
    if isinstance(val, str) and val.startswith('s'):
        return val[1:]
    return str(val)

def load_trees_for_n(df, n):
    prefix = f"{n:03d}_"
    rows = df[df['id'].str.startswith(prefix)]
    trees = []
    for _, row in rows.iterrows():
        x = parse_value(row['x'])
        y = parse_value(row['y'])
        deg = parse_value(row['deg'])
        trees.append(ChristmasTree(x, y, deg))
    return trees

print('Helper functions defined')

Helper functions defined


In [2]:
# STRICT overlap check - any intersection at all (even point/edge touching)
def has_any_intersection(trees):
    """Check if any trees have ANY intersection (stricter than overlap)"""
    if len(trees) <= 1:
        return False, None
    polygons = [t.polygon for t in trees]
    
    for i in range(len(polygons)):
        for j in range(i+1, len(polygons)):
            if polygons[i].intersects(polygons[j]):
                intersection = polygons[i].intersection(polygons[j])
                if intersection.area > 0 or not intersection.is_empty:
                    return True, (i, j, intersection)
    return False, None

def get_bounding_box_side(trees):
    all_points = []
    for tree in trees:
        coords = np.array(tree.polygon.exterior.coords)
        all_points.append(coords)
    all_points = np.vstack(all_points)
    min_x, min_y = all_points.min(axis=0)
    max_x, max_y = all_points.max(axis=0)
    return max(max_x - min_x, max_y - min_y)

print('Strict validation functions defined')

Strict validation functions defined


In [3]:
# Check the current submission for issues
submission_path = '/home/submission/submission.csv'
df = pd.read_csv(submission_path)
print(f'Submission shape: {df.shape}')
print(f'Expected: 20100 rows')

# Check for inconsistent formatting
has_s_prefix = df['x'].astype(str).str.startswith('s').sum()
no_s_prefix = (~df['x'].astype(str).str.startswith('s')).sum()
print(f'\nRows with s prefix: {has_s_prefix}')
print(f'Rows without s prefix: {no_s_prefix}')

# Check which N values have issues
df['N'] = df['id'].str.split('_').str[0].astype(int)
for n in range(1, 11):
    n_rows = df[df['N'] == n]
    has_s = n_rows['x'].astype(str).str.startswith('s').sum()
    print(f'N={n}: {len(n_rows)} rows, {has_s} with s prefix')

Submission shape: (20100, 4)
Expected: 20100 rows

Rows with s prefix: 20072
Rows without s prefix: 28
N=1: 1 rows, 1 with s prefix
N=2: 2 rows, 0 with s prefix
N=3: 3 rows, 3 with s prefix
N=4: 4 rows, 0 with s prefix
N=5: 5 rows, 5 with s prefix
N=6: 6 rows, 6 with s prefix
N=7: 7 rows, 7 with s prefix
N=8: 8 rows, 8 with s prefix
N=9: 9 rows, 9 with s prefix
N=10: 10 rows, 10 with s prefix


In [4]:
# Check N=4 specifically (the one that failed)
trees_4 = load_trees_for_n(df, 4)
print(f'N=4 has {len(trees_4)} trees')

# Check for any intersection
has_int, details = has_any_intersection(trees_4)
print(f'Has intersection: {has_int}')
if has_int:
    i, j, intersection = details
    print(f'Trees {i} and {j} intersect')
    print(f'Intersection type: {intersection.geom_type}')
    print(f'Intersection area: {intersection.area}')
    if hasattr(intersection, 'length'):
        print(f'Intersection length: {intersection.length}')

N=4 has 4 trees
Has intersection: False


In [5]:
# Find ALL N values with intersections
print('Checking all N values for intersections...')
intersecting_ns = []
for n in range(1, 201):
    trees = load_trees_for_n(df, n)
    if len(trees) != n:
        print(f'N={n}: Wrong number of trees ({len(trees)})')
        continue
    has_int, details = has_any_intersection(trees)
    if has_int:
        i, j, intersection = details
        intersecting_ns.append((n, i, j, intersection.geom_type, intersection.area))

print(f'\nFound {len(intersecting_ns)} N values with intersections:')
for n, i, j, geom_type, area in intersecting_ns[:20]:
    print(f'  N={n}: trees {i},{j} - {geom_type}, area={area:.2e}')

Checking all N values for intersections...



Found 0 N values with intersections:


In [None]:
# Now let's find valid configurations from snapshots
# First, find all CSV files
snapshot_dir = '/home/nonroot/snapshots/santa-2025/'
csv_files = []
for root, dirs, files in os.walk(snapshot_dir):
    for f in files:
        if f.endswith('.csv'):
            csv_files.append(os.path.join(root, f))

print(f'Found {len(csv_files)} CSV files in snapshots')

# Also check preoptimized folder
preopt_dir = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/'
if os.path.exists(preopt_dir):
    for root, dirs, files in os.walk(preopt_dir):
        for f in files:
            if f.endswith('.csv'):
                csv_files.append(os.path.join(root, f))
    print(f'Total with preoptimized: {len(csv_files)} CSV files')

In [None]:
# Build a valid ensemble - for each N, find the best valid configuration
from tqdm import tqdm

def get_score_for_n(trees, n):
    side = get_bounding_box_side(trees)
    return side**2 / n

# Store best valid configuration for each N
best_valid = {n: {'score': float('inf'), 'data': None, 'source': None} for n in range(1, 201)}

print('Scanning CSV files for valid configurations...')
for csv_path in tqdm(csv_files[:100]):  # Start with first 100
    try:
        df_csv = pd.read_csv(csv_path)
        if not {'id', 'x', 'y', 'deg'}.issubset(df_csv.columns):
            continue
        
        # Check each N
        df_csv['N'] = df_csv['id'].astype(str).str.split('_').str[0].astype(int)
        for n in range(1, 201):
            n_rows = df_csv[df_csv['N'] == n]
            if len(n_rows) != n:
                continue
            
            trees = load_trees_for_n(df_csv, n)
            has_int, _ = has_any_intersection(trees)
            
            if not has_int:  # Valid!
                score = get_score_for_n(trees, n)
                if score < best_valid[n]['score']:
                    best_valid[n]['score'] = score
                    best_valid[n]['data'] = n_rows[['id', 'x', 'y', 'deg']].copy()
                    best_valid[n]['source'] = csv_path
    except Exception as e:
        continue

# Count how many N values have valid configurations
valid_count = sum(1 for n in range(1, 201) if best_valid[n]['data'] is not None)
print(f'\nFound valid configurations for {valid_count}/200 N values')

In [None]:
# Check which N values are missing valid configurations
missing_ns = [n for n in range(1, 201) if best_valid[n]['data'] is None]
print(f'Missing valid configurations for N: {missing_ns[:20]}...')

# For missing N, we need to find or create valid configurations
# Let's check more CSV files
if missing_ns:
    print(f'\nScanning more CSV files for missing N values...')
    for csv_path in tqdm(csv_files[100:]):
        try:
            df_csv = pd.read_csv(csv_path)
            if not {'id', 'x', 'y', 'deg'}.issubset(df_csv.columns):
                continue
            
            df_csv['N'] = df_csv['id'].astype(str).str.split('_').str[0].astype(int)
            for n in missing_ns:
                if best_valid[n]['data'] is not None:
                    continue
                n_rows = df_csv[df_csv['N'] == n]
                if len(n_rows) != n:
                    continue
                
                trees = load_trees_for_n(df_csv, n)
                has_int, _ = has_any_intersection(trees)
                
                if not has_int:
                    score = get_score_for_n(trees, n)
                    best_valid[n]['score'] = score
                    best_valid[n]['data'] = n_rows[['id', 'x', 'y', 'deg']].copy()
                    best_valid[n]['source'] = csv_path
        except:
            continue
    
    # Recount
    valid_count = sum(1 for n in range(1, 201) if best_valid[n]['data'] is not None)
    print(f'Now have valid configurations for {valid_count}/200 N values')

In [6]:
# The submission passed our check but failed Kaggle's
# Let's look at the N=4 configuration more carefully
print("N=4 configuration:")
n4_rows = df[df['N'] == 4][['id', 'x', 'y', 'deg']]
print(n4_rows)

# Check if the values without 's' prefix might be causing issues
# Kaggle might expect all values to have 's' prefix
print("\nChecking N=2 (no s prefix):")
n2_rows = df[df['N'] == 2][['id', 'x', 'y', 'deg']]
print(n2_rows)

print("\nChecking N=4 (no s prefix):")
n4_rows = df[df['N'] == 4][['id', 'x', 'y', 'deg']]
print(n4_rows)

N=4 configuration:
      id                    x                    y                 deg
6  004_0  -0.3247477895893721   0.1321099780881853   156.3706221456364
7  004_1   0.3153543462426376   0.1321099780634754   156.3706222692641
8  004_2   0.3247477895923792  -0.7321099780694755    336.370622269264
9  004_3  -0.3153543481348183  -0.7321099780941859  336.37062214563645

Checking N=2 (no s prefix):
      id                    x                    y                 deg
1  002_0   0.1540970696213558  -0.0385407426947946  203.62937773065684
2  002_1  -0.1540970696213728   -0.561459257305224  23.629377730656792

Checking N=4 (no s prefix):
      id                    x                    y                 deg
6  004_0  -0.3247477895893721   0.1321099780881853   156.3706221456364
7  004_1   0.3153543462426376   0.1321099780634754   156.3706222692641
8  004_2   0.3247477895923792  -0.7321099780694755    336.370622269264
9  004_3  -0.3153543481348183  -0.7321099780941859  336.37062214563645


In [8]:
# Use higher precision check for N=4
from shapely.validation import make_valid

trees_4 = load_trees_for_n(df, 4)
print(f"N=4 trees:")
for i, t in enumerate(trees_4):
    print(f"  Tree {i}: ({float(t.center_x):.15f}, {float(t.center_y):.15f}), angle={float(t.angle):.15f}")

# Check all pairs with buffer
print("\nPairwise checks:")
for i in range(len(trees_4)):
    for j in range(i+1, len(trees_4)):
        p1 = trees_4[i].polygon
        p2 = trees_4[j].polygon
        
        # Check various intersection types
        intersects = p1.intersects(p2)
        touches = p1.touches(p2)
        overlaps = p1.overlaps(p2)
        
        if intersects:
            intersection = p1.intersection(p2)
            print(f"  Trees {i},{j}: intersects={intersects}, touches={touches}, overlaps={overlaps}")
            print(f"    Intersection: {intersection.geom_type}, area={intersection.area:.2e}")
            if intersection.geom_type == 'Point':
                print(f"    Point: {list(intersection.coords)}")
            elif intersection.geom_type == 'LineString':
                print(f"    Line length: {intersection.length:.2e}")
        else:
            # Check distance
            dist = p1.distance(p2)
            print(f"  Trees {i},{j}: no intersection, distance={dist:.10f}")

N=4 trees:
  Tree 0: (-0.324747789589372, 0.132109978088185), angle=156.370622145636389
  Tree 1: (0.315354346242638, 0.132109978063475), angle=156.370622269264089
  Tree 2: (0.324747789592379, -0.732109978069476), angle=336.370622269264004
  Tree 3: (-0.315354348134818, -0.732109978094186), angle=336.370622145636446

Pairwise checks:
  Trees 0,1: no intersection, distance=0.0370210066
  Trees 0,2: no intersection, distance=0.4674935974
  Trees 0,3: no intersection, distance=0.0000000000
  Trees 1,2: no intersection, distance=0.0000000000
  Trees 1,3: no intersection, distance=0.0000000000
  Trees 2,3: no intersection, distance=0.0370210078


In [9]:
# Trees with distance=0 are TOUCHING - Kaggle considers this as overlapping!
# We need to find configurations where ALL trees have distance > 0

def has_touching_trees(trees, min_distance=1e-10):
    """Check if any trees are touching (distance = 0)"""
    if len(trees) <= 1:
        return False, None
    polygons = [t.polygon for t in trees]
    
    for i in range(len(polygons)):
        for j in range(i+1, len(polygons)):
            dist = polygons[i].distance(polygons[j])
            if dist < min_distance:
                return True, (i, j, dist)
    return False, None

# Check all N values for touching trees
print("Checking all N values for touching trees...")
touching_ns = []
for n in range(1, 201):
    trees = load_trees_for_n(df, n)
    if len(trees) != n:
        continue
    has_touch, details = has_touching_trees(trees)
    if has_touch:
        i, j, dist = details
        touching_ns.append((n, i, j, dist))

print(f"\nFound {len(touching_ns)} N values with touching trees:")
for n, i, j, dist in touching_ns[:30]:
    print(f"  N={n}: trees {i},{j} - distance={dist:.2e}")

Checking all N values for touching trees...



Found 190 N values with touching trees:
  N=2: trees 0,1 - distance=5.35e-16
  N=3: trees 1,2 - distance=5.78e-17
  N=4: trees 0,3 - distance=2.99e-16
  N=5: trees 0,3 - distance=2.76e-12
  N=6: trees 0,1 - distance=1.79e-15
  N=7: trees 1,5 - distance=5.28e-11
  N=8: trees 0,5 - distance=1.65e-15
  N=9: trees 0,4 - distance=6.82e-12
  N=10: trees 0,7 - distance=6.34e-15
  N=11: trees 0,7 - distance=6.65e-11
  N=12: trees 0,1 - distance=4.84e-12
  N=13: trees 0,7 - distance=3.77e-15
  N=14: trees 0,3 - distance=5.46e-13
  N=15: trees 0,3 - distance=1.01e-14
  N=16: trees 0,3 - distance=1.69e-13
  N=17: trees 0,2 - distance=3.63e-15
  N=18: trees 0,1 - distance=4.39e-14
  N=19: trees 0,3 - distance=7.29e-15
  N=20: trees 0,2 - distance=1.02e-14
  N=21: trees 0,6 - distance=4.90e-15
  N=22: trees 0,2 - distance=9.59e-13
  N=23: trees 0,9 - distance=1.31e-16
  N=24: trees 0,2 - distance=2.56e-12
  N=25: trees 0,12 - distance=1.15e-13
  N=26: trees 0,18 - distance=4.20e-15
  N=27: trees 0

In [10]:
# Strategy: We need to either:
# 1. Find configurations from snapshots that have gaps between trees
# 2. Or slightly shrink/move trees to create gaps

# Let's first check if any snapshot CSVs have non-touching configurations
# We'll look for the sample_submission.csv which should have gaps

sample_path = '/home/data/sample_submission.csv'
sample_df = pd.read_csv(sample_path)
print(f"Sample submission shape: {sample_df.shape}")

# Check N=4 in sample
sample_df['N'] = sample_df['id'].str.split('_').str[0].astype(int)
trees_4_sample = load_trees_for_n(sample_df, 4)
print(f"\nSample N=4 trees:")
for i, t in enumerate(trees_4_sample):
    print(f"  Tree {i}: ({float(t.center_x):.6f}, {float(t.center_y):.6f}), angle={float(t.angle):.1f}")

# Check distances
print("\nSample N=4 pairwise distances:")
for i in range(len(trees_4_sample)):
    for j in range(i+1, len(trees_4_sample)):
        dist = trees_4_sample[i].polygon.distance(trees_4_sample[j].polygon)
        print(f"  Trees {i},{j}: distance={dist:.6f}")

Sample submission shape: (20100, 4)

Sample N=4 trees:
  Tree 0: (0.000000, 0.000000), angle=90.0
  Tree 1: (0.202736, -0.511271), angle=90.0
  Tree 2: (0.520600, 0.177413), angle=180.0
  Tree 3: (-0.818657, -0.228694), angle=180.0

Sample N=4 pairwise distances:
  Trees 0,1: distance=0.006035
  Trees 0,2: distance=0.051628
  Trees 0,3: distance=0.004816
  Trees 1,2: distance=0.037138
  Trees 1,3: distance=0.034444
  Trees 2,3: distance=0.757346
