# Loop 1 Analysis - Understanding Validation Failures

The baseline submission failed validation with the error "Overlapping trees in group 004". Next steps:
1. Understand Kaggle's stricter validation
2. Build a truly valid submission
3. Identify the remaining gap to the target score

In [None]:
import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
from shapely.strtree import STRtree
import json
import os
import glob

getcontext().prec = 30

# Christmas Tree class
class ChristmasTree:
    """A tree outline placed at a given centre with a given rotation.

    Attributes:
        center_x, center_y, angle: the constructor arguments, stored as
            ``Decimal`` values parsed from ``str(argument)``.
        polygon: the 15-vertex outline, rotated about the origin by
            ``angle`` and then shifted by ``(center_x, center_y)``.

    The geometry is built from the ``float`` conversions of the stored
    Decimals, so the polygon itself carries ordinary float precision.
    The angle is handed to ``affinity.rotate`` with its default unit
    (degrees, per the shapely documentation).
    """

    def __init__(self, center_x='0', center_y='0', angle='0'):
        # str() first, so a float argument is parsed from its printed form.
        self.center_x = Decimal(str(center_x))
        self.center_y = Decimal(str(center_y))
        self.angle = Decimal(str(angle))

        # The outline is symmetric about the y-axis: list the right-hand
        # side from the tip downwards, then mirror it back up the left.
        tip = (0.0, 0.8)
        right_side = [
            (0.125, 0.5), (0.0625, 0.5),
            (0.2, 0.25), (0.1, 0.25),
            (0.35, 0.0), (0.075, 0.0),
            (0.075, -0.2),
        ]
        left_side = [(-px, py) for px, py in reversed(right_side)]
        outline = Polygon([tip, *right_side, *left_side])

        turned = affinity.rotate(outline, float(self.angle), origin=(0, 0))
        self.polygon = affinity.translate(
            turned,
            xoff=float(self.center_x),
            yoff=float(self.center_y),
        )

def parse_value(val):
    """Return *val* as a string, without a leading ``'s'`` marker.

    A string that starts with ``'s'`` has exactly that first character
    removed; anything else is returned as ``str(val)``.
    """
    if not isinstance(val, str):
        return str(val)
    return val[1:] if val.startswith('s') else str(val)

def load_trees_for_n(df, n):
    """Build the ``ChristmasTree`` objects for group *n* of a submission.

    Rows are selected by their ``id`` starting with the zero-padded,
    three-digit group number followed by ``_`` (e.g. ``004_`` for n=4).
    The ``x``, ``y`` and ``deg`` cells of each selected row are passed
    through ``parse_value`` before constructing the tree.

    Returns a list of trees in row order; empty when no id matches.
    """
    in_group = df['id'].str.startswith(f"{n:03d}_")
    return [
        ChristmasTree(
            parse_value(record['x']),
            parse_value(record['y']),
            parse_value(record['deg']),
        )
        for _, record in df[in_group].iterrows()
    ]

print('Helper functions defined')

In [None]:
# STRICT overlap check - any intersection at all (even point/edge touching)
def has_any_intersection(trees):
    """Find the first pair of trees whose polygons intersect in any way.

    Unlike an area-overlap test, a pair counts as soon as ``intersects``
    is true and the resulting intersection geometry is non-empty, so
    shared points or edges are reported as well.

    Returns:
        ``(False, None)`` when there are fewer than two trees or no pair
        intersects; otherwise ``(True, (i, j, geometry))`` for the first
        hit in ``i < j`` scan order, where ``geometry`` is the
        intersection of the two polygons.
    """
    count = len(trees)
    if count < 2:
        return False, None

    shapes = [tree.polygon for tree in trees]
    for i, first in enumerate(shapes):
        for j in range(i + 1, count):
            second = shapes[j]
            if not first.intersects(second):
                continue
            overlap = first.intersection(second)
            # Skip only a result that is both empty and without area.
            if overlap.is_empty and not overlap.area > 0:
                continue
            return True, (i, j, overlap)
    return False, None

def get_bounding_box_side(trees):
    """Return the longer side of the axis-aligned box around all trees.

    The box is taken over every exterior vertex of every tree polygon;
    the result is ``max(width, height)`` of that box.
    """
    vertices = np.vstack(
        [np.array(tree.polygon.exterior.coords) for tree in trees]
    )
    low_x, low_y = vertices.min(axis=0)
    high_x, high_y = vertices.max(axis=0)
    width = high_x - low_x
    height = high_y - low_y
    return max(width, height)

print('Strict validation functions defined')

In [None]:
# Load the current submission and report its size and 's'-prefix usage.
submission_path = '/home/submission/submission.csv'
df = pd.read_csv(submission_path)
print(f'Submission shape: {df.shape}')
print('Expected: 20100 rows')

# Formatting consistency: flag each row by whether its x cell, as text,
# starts with 's', then count both sides of that flag.
x_has_s = df['x'].astype(str).str.startswith('s')
has_s_prefix = x_has_s.sum()
no_s_prefix = (~x_has_s).sum()
print(f'\nRows with s prefix: {has_s_prefix}')
print(f'Rows without s prefix: {no_s_prefix}')

# Per-group breakdown for the first ten groups; the group number is the
# integer before the first '_' in the id.
df['N'] = df['id'].str.split('_').str[0].astype(int)
for n in range(1, 11):
    in_group = df['N'] == n
    n_rows = df[in_group]
    has_s = x_has_s[in_group].sum()
    print(f'N={n}: {len(n_rows)} rows, {has_s} with s prefix')

In [None]:
# Inspect group 4, the group named in the failure message.
trees_4 = load_trees_for_n(df, 4)
print(f'N=4 has {len(trees_4)} trees')

# Run the strict check and describe the first intersecting pair, if any.
has_int, details = has_any_intersection(trees_4)
print(f'Has intersection: {has_int}')
if has_int:
    i, j, intersection = details
    report = [
        f'Trees {i} and {j} intersect',
        f'Intersection type: {intersection.geom_type}',
        f'Intersection area: {intersection.area}',
    ]
    if hasattr(intersection, 'length'):
        report.append(f'Intersection length: {intersection.length}')
    for line in report:
        print(line)

In [None]:
# Run the strict intersection check on every group from 1 to 200.
print('Checking all N values for intersections...')
intersecting_ns = []
for n in range(1, 201):
    trees = load_trees_for_n(df, n)
    if len(trees) == n:
        has_int, details = has_any_intersection(trees)
        if has_int:
            i, j, intersection = details
            record = (n, i, j, intersection.geom_type, intersection.area)
            intersecting_ns.append(record)
    else:
        # A group must contain exactly n trees; report and move on.
        print(f'N={n}: Wrong number of trees ({len(trees)})')

# Summary: total count, then details for at most the first 20 groups.
print(f'\nFound {len(intersecting_ns)} N values with intersections:')
for n, i, j, geom_type, area in intersecting_ns[:20]:
    print(f'  N={n}: trees {i},{j} - {geom_type}, area={area:.2e}')

In [None]:
# Now let's find valid configurations from snapshots
# First, find all CSV files under the snapshot directory (recursively).
snapshot_dir = '/home/nonroot/snapshots/santa-2025/'
csv_files = []
for root, _dirs, files in os.walk(snapshot_dir):
    for f in files:
        if f.endswith('.csv'):
            csv_files.append(os.path.join(root, f))

print(f'Found {len(csv_files)} CSV files in snapshots')

# Also check preoptimized folder.
# This folder sits inside snapshot_dir, so the walk above has normally
# listed its files already. Only add paths that are not in the list yet;
# otherwise each file is counted twice and the positions used by later
# slices of csv_files are shifted by the duplicates.
preopt_dir = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/'
if os.path.exists(preopt_dir):
    # Compare on normalised paths so differences such as a trailing
    # separator do not hide a duplicate.
    known_paths = {os.path.normpath(path) for path in csv_files}
    for root, _dirs, files in os.walk(preopt_dir):
        for f in files:
            if not f.endswith('.csv'):
                continue
            candidate = os.path.join(root, f)
            key = os.path.normpath(candidate)
            if key in known_paths:
                continue
            known_paths.add(key)
            csv_files.append(candidate)
    print(f'Total with preoptimized: {len(csv_files)} CSV files')

In [None]:
# Build a valid ensemble - for each N, find the best valid configuration
from tqdm import tqdm

def get_score_for_n(trees, n):
    """Return the squared bounding-box side of *trees* divided by *n*."""
    return get_bounding_box_side(trees) ** 2 / n

# Store best valid configuration for each N
best_valid = {n: {'score': float('inf'), 'data': None, 'source': None} for n in range(1, 201)}

# Files whose processing raised, with the reason, so that failures are
# reported after the scan instead of disappearing silently.
skipped_files = []

print('Scanning CSV files for valid configurations...')
for csv_path in tqdm(csv_files[:100]):  # Start with first 100
    try:
        df_csv = pd.read_csv(csv_path)
        # Only files with the submission columns are candidates.
        if not {'id', 'x', 'y', 'deg'}.issubset(df_csv.columns):
            continue

        # Check each N
        df_csv['N'] = df_csv['id'].astype(str).str.split('_').str[0].astype(int)
        for n in range(1, 201):
            n_rows = df_csv[df_csv['N'] == n]
            if len(n_rows) != n:
                continue

            trees = load_trees_for_n(df_csv, n)
            # n_rows is selected by the parsed group number, the trees by
            # the zero-padded id prefix; ignore the group if they disagree
            # rather than scoring an incomplete set of trees.
            if len(trees) != n:
                continue
            has_int, _ = has_any_intersection(trees)

            if not has_int:  # Valid!
                score = get_score_for_n(trees, n)
                if score < best_valid[n]['score']:
                    best_valid[n]['score'] = score
                    best_valid[n]['data'] = n_rows[['id', 'x', 'y', 'deg']].copy()
                    best_valid[n]['source'] = csv_path
    except Exception as exc:
        # Best-effort scan: one unreadable or malformed file must not stop
        # the loop. Results already stored from this file are kept.
        skipped_files.append((csv_path, repr(exc)))
        continue

if skipped_files:
    print(f'\nSkipped {len(skipped_files)} CSV files because of errors (showing up to 10):')
    for skipped_path, reason in skipped_files[:10]:
        print(f'  {skipped_path}: {reason}')

# Count how many N values have valid configurations
valid_count = sum(1 for n in range(1, 201) if best_valid[n]['data'] is not None)
print(f'\nFound valid configurations for {valid_count}/200 N values')

In [None]:
# Check which N values are missing valid configurations
missing_ns = [n for n in range(1, 201) if best_valid[n]['data'] is None]
# Show at most 20 entries; add an ellipsis only when the list is cut short.
missing_preview = missing_ns[:20]
missing_suffix = '...' if len(missing_ns) > len(missing_preview) else ''
print(f'Missing valid configurations for N: {missing_preview}{missing_suffix}')

# For missing N, we need to find or create valid configurations
# Let's check more CSV files
if missing_ns:
    print(f'\nScanning more CSV files for missing N values...')
    # Files whose processing raised, with the reason, reported after the scan.
    skipped_more = []
    for csv_path in tqdm(csv_files[100:]):
        try:
            df_csv = pd.read_csv(csv_path)
            # Only files with the submission columns are candidates.
            if not {'id', 'x', 'y', 'deg'}.issubset(df_csv.columns):
                continue

            df_csv['N'] = df_csv['id'].astype(str).str.split('_').str[0].astype(int)
            for n in missing_ns:
                # The first valid configuration found for a missing N is
                # kept; later files are not compared against it.
                if best_valid[n]['data'] is not None:
                    continue
                n_rows = df_csv[df_csv['N'] == n]
                if len(n_rows) != n:
                    continue

                trees = load_trees_for_n(df_csv, n)
                # n_rows is selected by the parsed group number, the trees
                # by the zero-padded id prefix; ignore the group if they
                # disagree rather than scoring an incomplete set of trees.
                if len(trees) != n:
                    continue
                has_int, _ = has_any_intersection(trees)

                if not has_int:
                    score = get_score_for_n(trees, n)
                    best_valid[n]['score'] = score
                    best_valid[n]['data'] = n_rows[['id', 'x', 'y', 'deg']].copy()
                    best_valid[n]['source'] = csv_path
        except Exception as exc:
            # Catch Exception rather than everything: a bare except would
            # also swallow KeyboardInterrupt, so interrupting the kernel
            # would merely skip one file instead of stopping the scan.
            skipped_more.append((csv_path, repr(exc)))
            continue

    if skipped_more:
        print(f'Skipped {len(skipped_more)} CSV files because of errors (showing up to 10):')
        for skipped_path, reason in skipped_more[:10]:
            print(f'  {skipped_path}: {reason}')

    # Recount
    valid_count = sum(1 for n in range(1, 201) if best_valid[n]['data'] is not None)
    print(f'Now have valid configurations for {valid_count}/200 N values')