# Loop 1 Analysis: Overlap Issue Investigation

The baseline submission failed with "Overlapping trees in group 002". We need to:
1. Understand why our local validation passed but Kaggle rejected the submission
2. Implement stricter validation
3. Find/create valid solutions

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
from shapely.ops import unary_union
from decimal import Decimal, getcontext
import json

# Precision (in significant digits) of the current Decimal context. Decimal
# arithmetic is used in validate_no_overlap_strict when coordinates are
# multiplied by SCALE.
getcontext().prec = 30
# Multiplier applied to each polygon coordinate before it is truncated to an
# integer in validate_no_overlap_strict; overlap areas reported there are
# divided by SCALE**2 to express them in the original units again.
SCALE = 10**18

# Tree polygon vertices in the tree's local frame: TX[i] and TY[i] are the
# x and y coordinates of the i-th of 15 outline vertices (they are zipped
# together in create_tree_polygon). The list starts at the top vertex
# (0, 0.8) and the x values are mirrored about x = 0.
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

def create_tree_polygon(x, y, deg):
    """Build the tree outline, rotate it about its local origin, then move it.

    Args:
        x: Horizontal offset applied after rotation.
        y: Vertical offset applied after rotation.
        deg: Rotation angle passed to shapely's ``affinity.rotate``
            (interpreted as degrees by shapely's default settings).

    Returns:
        A shapely ``Polygon`` built from the TX/TY vertex lists, rotated
        around (0, 0) and translated by (x, y).
    """
    outline = Polygon(list(zip(TX, TY)))
    # Rotate first, around the local origin, so the (x, y) offset is not
    # itself rotated.
    return affinity.translate(
        affinity.rotate(outline, deg, origin=(0, 0)),
        xoff=x,
        yoff=y,
    )

def parse_value(s):
    """Convert a submission field to ``float``, dropping a leading 's' marker.

    Args:
        s: Either a string (optionally starting with the character 's') or
            any other value accepted by ``float()``.

    Returns:
        The numeric value as a ``float``.

    Raises:
        ValueError: If the remaining text is not a valid float literal.
    """
    has_marker = isinstance(s, str) and s.startswith('s')
    # Only strings can carry the marker; everything else goes to float() as-is.
    numeric_part = s[1:] if has_marker else s
    return float(numeric_part)

def load_submission(path):
    """Read a submission CSV and return it with numeric columns and a group id.

    The 'x', 'y' and 'deg' columns are converted with ``parse_value``. A new
    integer column 'n' is derived from the part of 'id' before the first
    underscore (e.g. '002_1' -> 2).

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded ``DataFrame`` with parsed 'x', 'y', 'deg' and the added 'n'.
    """
    def group_number(tree_id):
        # The id looks like '<group>_<index>'; keep only the group part.
        return int(tree_id.split('_')[0])

    df = pd.read_csv(path)
    for column in ('x', 'y', 'deg'):
        df[column] = df[column].apply(parse_value)
    df['n'] = df['id'].apply(group_number)
    return df

# Visible marker that every definition in this cell executed without error.
print("Functions defined")

Functions defined


In [2]:
# Reload the rejected baseline and isolate group N=2, the group named in the
# "Overlapping trees in group 002" error.
df = load_submission('/home/code/experiments/001_baseline/submission.csv')

is_group_2 = df['n'] == 2
n2_trees = df[is_group_2]
print(f"N=2 has {len(n2_trees)} trees:")
for _, row in n2_trees.iterrows():
    tree_id, tree_x, tree_y, tree_deg = row['id'], row['x'], row['y'], row['deg']
    print(f"  Tree {tree_id}: x={tree_x:.6f}, y={tree_y:.6f}, deg={tree_deg:.6f}")

# Build one shapely polygon per tree, in row order.
polys = []
for _, row in n2_trees.iterrows():
    polys.append(create_tree_polygon(row['x'], row['y'], row['deg']))

poly_a = polys[0]
print(f"\nPolygon 0 bounds: {poly_a.bounds}")
poly_b = polys[1]
print(f"Polygon 1 bounds: {poly_b.bounds}")

# Compare the two trees with each shapely predicate, plus the raw overlap area.
print(f"\nintersects: {poly_a.intersects(poly_b)}")
print(f"touches: {poly_a.touches(poly_b)}")
print(f"overlaps: {poly_a.overlaps(poly_b)}")
print(f"intersection area: {poly_a.intersection(poly_b).area}")

N=2 has 2 trees:
  Tree 002_0: x=0.154097, y=-0.038541, deg=144.272761
  Tree 002_1: x=-0.154097, y=-0.561459, deg=324.272761

Polygon 0 bounds: (-0.3130447065040463, -0.6879855525119221, 0.4382291739163491, 0.16761500127124365)
Polygon 1 bounds: (-0.43822917391636595, -0.7676150012712624, 0.3130447065040294, 0.08798555251190321)

intersects: True
touches: False
overlaps: True
intersection area: 0.14942653303750386


In [3]:
# Implement STRICT validation using integer scaling (Kaggle's method)
def validate_no_overlap_strict(trees_df, n):
    """Check one group of trees for pairwise overlap on integer-scaled polygons.

    Each tree polygon is built with ``create_tree_polygon`` and every vertex
    coordinate is then multiplied by ``SCALE`` (via ``Decimal``) and truncated
    to an integer before the shapely comparison.

    NOTE(review): the vertices are computed in floating point before they are
    scaled, so scaling cannot restore precision lost in that step.

    Args:
        trees_df: DataFrame with columns 'n', 'x', 'y' and 'deg'.
        n: Group number; only rows whose 'n' equals this value are checked.

    Returns:
        ``(True, "No trees")`` when the group has no rows,
        ``(False, message)`` for the first pair (in row order) that intersects
        without merely touching and has a positive intersection area, or
        ``(True, "OK")`` when no such pair exists. Indices in the message are
        positions within the group, not DataFrame index labels.
    """
    group = trees_df[trees_df['n'] == n]
    if not len(group):
        return True, "No trees"

    def scaled_int(value):
        # str() first so Decimal sees the float's repr, not its binary expansion.
        return int(Decimal(str(value)) * SCALE)

    def scaled_polygon(row):
        outline = create_tree_polygon(row['x'], row['y'], row['deg'])
        ring = list(outline.exterior.coords)
        return Polygon([(scaled_int(px), scaled_int(py)) for px, py in ring])

    polygons = [scaled_polygon(row) for _, row in group.iterrows()]

    count = len(polygons)
    for i in range(count):
        first = polygons[i]
        for j in range(i + 1, count):
            second = polygons[j]
            # Disjoint or boundary-only contact is not an overlap.
            if not first.intersects(second) or first.touches(second):
                continue
            intersection = first.intersection(second)
            if intersection.area > 0:
                return False, f"Trees {i} and {j} overlap with area {intersection.area / (SCALE**2)}"
    return True, "OK"

# Run the strict checker on group 2 of the rejected baseline
ok, msg = validate_no_overlap_strict(trees_df=df, n=2)
print(f"N=2 strict validation: {ok} - {msg}")

N=2 strict validation: False - Trees 0 and 1 overlap with area 0.1494265330375039


In [4]:
# Run the strict checker on every group 1..200 of the baseline and collect
# the (n, message) pairs for the groups that fail.
print("Checking all N values for overlaps...")
overlap_issues = []

for n in range(1, 201):
    ok, msg = validate_no_overlap_strict(df, n)
    if ok:
        continue
    overlap_issues.append((n, msg))
    print(f"N={n}: OVERLAP - {msg}")

print(f"\nTotal N values with overlaps: {len(overlap_issues)}")

Checking all N values for overlaps...
N=2: OVERLAP - Trees 0 and 1 overlap with area 0.1494265330375039
N=3: OVERLAP - Trees 1 and 2 overlap with area 8.45824e-31
N=4: OVERLAP - Trees 0 and 1 overlap with area 2.7061725779721976e-07
N=5: OVERLAP - Trees 0 and 1 overlap with area 0.01182451324741868
N=16: OVERLAP - Trees 0 and 3 overlap with area 0.0166598218242168
N=19: OVERLAP - Trees 2 and 10 overlap with area 2.3295999999999998e-32
N=29: OVERLAP - Trees 0 and 2 overlap with area 2.985525248e-27
N=31: OVERLAP - Trees 1 and 7 overlap with area 8.192e-33
N=33: OVERLAP - Trees 0 and 30 overlap with area 2.048e-33


N=40: OVERLAP - Trees 0 and 20 overlap with area 0.018804340539655167


N=46: OVERLAP - Trees 0 and 8 overlap with area 0.008351269517582947
N=47: OVERLAP - Trees 0 and 8 overlap with area 0.010200887081218198
N=48: OVERLAP - Trees 0 and 19 overlap with area 0.0021675304068685414
N=53: OVERLAP - Trees 0 and 1 overlap with area 0.007486471912418944
N=54: OVERLAP - Trees 0 and 1 overlap with area 0.0075245685121575435
N=55: OVERLAP - Trees 0 and 17 overlap with area 0.03048062201873981
N=56: OVERLAP - Trees 0 and 17 overlap with area 0.030487537078008293


N=59: OVERLAP - Trees 0 and 15 overlap with area 0.026554399979911703


N=62: OVERLAP - Trees 0 and 1 overlap with area 0.0003200505701600765
N=66: OVERLAP - Trees 0 and 2 overlap with area 4.56122368e-26
N=69: OVERLAP - Trees 0 and 21 overlap with area 4.4310799514697985e-05
N=70: OVERLAP - Trees 0 and 22 overlap with area 0.00017367374969401743


N=71: OVERLAP - Trees 0 and 1 overlap with area 0.008824836457694085


N=77: OVERLAP - Trees 0 and 7 overlap with area 0.00010822211256944385
N=78: OVERLAP - Trees 0 and 28 overlap with area 9.696666303896247e-05
N=79: OVERLAP - Trees 0 and 35 overlap with area 0.023169097439068965
N=80: OVERLAP - Trees 0 and 36 overlap with area 0.023169097439068965


N=96: OVERLAP - Trees 0 and 75 overlap with area 0.022272141649645406
N=97: OVERLAP - Trees 1 and 14 overlap with area 0.001152138604165777
N=99: OVERLAP - Trees 0 and 29 overlap with area 9.873314661089308e-05
N=102: OVERLAP - Trees 8 and 21 overlap with area 4.43645952e-26
N=103: OVERLAP - Trees 0 and 96 overlap with area 8.811649061764977e-19


N=107: OVERLAP - Trees 0 and 41 overlap with area 0.0008311722215070703
N=108: OVERLAP - Trees 0 and 41 overlap with area 0.0009293675390260703
N=109: OVERLAP - Trees 0 and 41 overlap with area 0.0006201556881314146
N=110: OVERLAP - Trees 0 and 42 overlap with area 0.0006201556881314146


N=118: OVERLAP - Trees 0 and 41 overlap with area 0.027334799247779852
N=119: OVERLAP - Trees 0 and 59 overlap with area 7.3210647098946666e-06
N=120: OVERLAP - Trees 0 and 41 overlap with area 8.900096826226086e-05


N=124: OVERLAP - Trees 0 and 5 overlap with area 0.002375665256089365
N=125: OVERLAP - Trees 0 and 51 overlap with area 0.02922554526497017
N=126: OVERLAP - Trees 0 and 47 overlap with area 0.02693316371975258
N=127: OVERLAP - Trees 2 and 22 overlap with area 2.8660662271999997e-26
N=129: OVERLAP - Trees 0 and 66 overlap with area 0.00011569378318920814
N=130: OVERLAP - Trees 0 and 67 overlap with area 0.00010243063617263705
N=131: OVERLAP - Trees 0 and 64 overlap with area 1.3867758616980686e-05


N=138: OVERLAP - Trees 1 and 98 overlap with area 2.7895933992959997e-24
N=139: OVERLAP - Trees 0 and 95 overlap with area 0.011072642804375494
N=140: OVERLAP - Trees 0 and 66 overlap with area 0.02394755569145441


N=150: OVERLAP - Trees 0 and 24 overlap with area 0.00014913614417790826
N=152: OVERLAP - Trees 0 and 6 overlap with area 0.00038662268812352446
N=153: OVERLAP - Trees 0 and 6 overlap with area 0.0003473379882752738
N=154: OVERLAP - Trees 0 and 6 overlap with area 0.0003473379882752738
N=155: OVERLAP - Trees 0 and 6 overlap with area 0.0003473379882752738


N=156: OVERLAP - Trees 0 and 6 overlap with area 0.0003238986468639109


N=161: OVERLAP - Trees 43 and 90 overlap with area 4.007793664e-26


N=164: OVERLAP - Trees 15 and 48 overlap with area 3.9321599999999998e-31
N=166: OVERLAP - Trees 0 and 30 overlap with area 0.00010879342827523384
N=167: OVERLAP - Trees 0 and 30 overlap with area 0.00010790245349052115
N=168: OVERLAP - Trees 0 and 5 overlap with area 1.882849801031355e-05


N=175: OVERLAP - Trees 0 and 33 overlap with area 0.030596273833730064
N=176: OVERLAP - Trees 1 and 39 overlap with area 1.5143402227548552e-05
N=177: OVERLAP - Trees 1 and 39 overlap with area 1.5143402227548552e-05
N=178: OVERLAP - Trees 1 and 39 overlap with area 4.371797071870715e-06
N=179: OVERLAP - Trees 1 and 39 overlap with area 4.371797071870715e-06


N=185: OVERLAP - Trees 0 and 32 overlap with area 0.007899212158953707


N=190: OVERLAP - Trees 0 and 12 overlap with area 6.07181457121862e-05
N=191: OVERLAP - Trees 0 and 22 overlap with area 0.025882688910106744
N=192: OVERLAP - Trees 0 and 22 overlap with area 0.025894708360000917



Total N values with overlaps: 69


In [5]:
# Let's find a submission that passes strict validation
import glob

# Number of discovered files screened in this quick pass.
# NOTE: glob does not guarantee any ordering, so which files fall inside this
# window can differ between file systems.
MAX_CANDIDATES = 30
# Files with fewer rows than this are skipped as incomplete.
# NOTE(review): a full submission presumably has 20100 rows (1 + 2 + ... + 200)
# - confirm.
MIN_SUBMISSION_ROWS = 20000

print("Searching for valid submissions...")
all_submissions = glob.glob('/home/nonroot/snapshots/santa-2025/**/submission.csv', recursive=True)

# Despite the name, this list only records files whose N=2 group passes; the
# other groups are not checked in this cell.
valid_submissions = []
for f in all_submissions[:MAX_CANDIDATES]:
    try:
        test_df = load_submission(f)
        if len(test_df) < MIN_SUBMISSION_ROWS:
            continue

        # Check N=2 specifically (the failing case)
        ok, msg = validate_no_overlap_strict(test_df, 2)
        if ok:
            valid_submissions.append(f)
            print(f"Valid N=2: {f}")
    except Exception as e:
        # Keep scanning, but say which file was skipped and why instead of
        # silently dropping it from the search.
        print(f"Skipped (could not check) {f}: {type(e).__name__}: {e}")

print(f"\nFound {len(valid_submissions)} submissions with valid N=2")

Searching for valid submissions...
Valid N=2: /home/nonroot/snapshots/santa-2025/21116303805/code/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21116303805/code/experiments/005_backward_propagation/submission.csv


Valid N=2: /home/nonroot/snapshots/santa-2025/21116303805/code/experiments/002_preoptimized/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/bucket-of-chump/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa-2025-try3/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21116303805/submission/submission.csv


Valid N=2: /home/nonroot/snapshots/santa-2025/21328309254/code/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21328309254/code/experiments/001_baseline/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21328309254/code/experiments/002_snapshot_ensemble/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21328309254/code/experiments/004_precision_fix/submission.csv


Valid N=2: /home/nonroot/snapshots/santa-2025/21328309254/code/preoptimized/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21328309254/submission/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21121776553/code/experiments/001_baseline/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21121776553/code/experiments/002_hybrid_fix/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21121776553/submission/submission.csv


Valid N=2: /home/nonroot/snapshots/santa-2025/21165872902/code/experiments/007_better_ensemble/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21165872902/submission/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21198893057/code/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21198893057/code/experiments/005_sa_optimizer/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21198893057/code/experiments/004_bbox3_optimization/submission.csv


Valid N=2: /home/nonroot/snapshots/santa-2025/21198893057/code/experiments/010_saspav_best/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21198893057/code/exploration/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21198893057/code/exploration/jazivxt_bucket/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21198893057/code/exploration/datasets/submission.csv
Valid N=2: /home/nonroot/snapshots/santa-2025/21198893057/code/exploration/datasets/seowoohyeon/submission.csv

Found 26 submissions with valid N=2


In [6]:
# Find the best FULLY VALID submission
def calculate_score(df):
    """Sum side**2 / n over every group n in 1..200 present in ``df``.

    For each group, ``side`` is the longer edge of the axis-aligned bounding
    box around the union of that group's tree polygons. Groups with no rows
    are skipped and contribute nothing.

    Args:
        df: DataFrame with columns 'n', 'x', 'y' and 'deg'.

    Returns:
        A tuple ``(total_score, per_n_scores)`` where ``per_n_scores`` maps
        each scored n to ``{'side': ..., 'score': ...}``.
    """
    per_n_scores = {}
    total_score = 0

    for n in range(1, 201):
        group = df[df['n'] == n]
        if len(group) == 0:
            continue

        shapes = []
        for _, row in group.iterrows():
            shapes.append(create_tree_polygon(row['x'], row['y'], row['deg']))

        # Bounding box of all trees in the group taken together.
        minx, miny, maxx, maxy = unary_union(shapes).bounds
        width, height = maxx - minx, maxy - miny
        side = max(width, height)

        score_n = (side ** 2) / n
        per_n_scores[n] = {'side': side, 'score': score_n}
        total_score += score_n

    return total_score, per_n_scores

def validate_all_n(df):
    """Run ``validate_no_overlap_strict`` on groups 1..200, stopping at the first failure.

    Args:
        df: DataFrame accepted by ``validate_no_overlap_strict``.

    Returns:
        ``(False, n, message)`` for the first failing group n, otherwise
        ``(True, None, "All valid")``.
    """
    for group_n in range(1, 201):
        passed, detail = validate_no_overlap_strict(df, group_n)
        if passed:
            continue
        return False, group_n, detail
    return True, None, "All valid"

# Among the files that passed the N=2 screen, keep the lowest-scoring one
# that passes the overlap check for every group.
print("Checking valid submissions for full validity and best score...")
best_valid_score = float('inf')
best_valid_file = None

for f in valid_submissions:
    try:
        test_df = load_submission(f)
        if len(test_df) >= 20000:
            # Full validation across all groups
            valid, fail_n, msg = validate_all_n(test_df)
            if not valid:
                print(f"Invalid at N={fail_n}: {f}")
            else:
                score, _ = calculate_score(test_df)
                # Lower is better; ties keep the earlier file.
                if score < best_valid_score:
                    best_valid_score, best_valid_file = score, f
                    print(f"New best valid: {score:.6f} from {f}")
    except Exception as e:
        print(f"Error: {e}")

print(f"\nBest valid score: {best_valid_score:.6f}")
print(f"Best valid file: {best_valid_file}")

Checking valid submissions for full validity and best score...


New best valid: 70.676102 from /home/nonroot/snapshots/santa-2025/21116303805/code/submission.csv


Invalid at N=5: /home/nonroot/snapshots/santa-2025/21116303805/code/experiments/002_preoptimized/submission.csv
Invalid at N=12: /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/submission.csv
Invalid at N=12: /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/bucket-of-chump/submission.csv


Invalid at N=16: /home/nonroot/snapshots/santa-2025/21328309254/code/submission.csv


New best valid: 70.647327 from /home/nonroot/snapshots/santa-2025/21328309254/code/experiments/001_baseline/submission.csv
Invalid at N=33: /home/nonroot/snapshots/santa-2025/21328309254/code/experiments/002_snapshot_ensemble/submission.csv


Invalid at N=33: /home/nonroot/snapshots/santa-2025/21165872902/code/experiments/007_better_ensemble/submission.csv
Invalid at N=33: /home/nonroot/snapshots/santa-2025/21165872902/submission/submission.csv


New best valid: 70.627582 from /home/nonroot/snapshots/santa-2025/21198893057/code/submission.csv


Invalid at N=33: /home/nonroot/snapshots/santa-2025/21198893057/code/experiments/010_saspav_best/submission.csv



Best valid score: 70.627582
Best valid file: /home/nonroot/snapshots/santa-2025/21198893057/code/submission.csv


In [7]:
# Check more submissions from the standard submission folder.
# This continues from the running best (best_valid_score / best_valid_file)
# and only replaces it when a fully valid file scores strictly lower.
MIN_SUBMISSION_ROWS = 20000  # files with fewer rows are skipped as incomplete

submission_folder_files = glob.glob('/home/nonroot/snapshots/santa-2025/*/submission/submission.csv')
print(f"Checking {len(submission_folder_files)} files from submission folders...")

for f in submission_folder_files:
    try:
        test_df = load_submission(f)
        if len(test_df) < MIN_SUBMISSION_ROWS:
            continue

        # Full validation
        valid, fail_n, msg = validate_all_n(test_df)
        if valid:
            score, _ = calculate_score(test_df)
            if score < best_valid_score:
                best_valid_score = score
                best_valid_file = f
                print(f"New best valid: {score:.6f} from {f}")
    except Exception as e:
        # Keep going, but report the failure: a silently dropped file could
        # have been the best candidate.
        print(f"Error checking {f}: {type(e).__name__}: {e}")

print(f"\nFinal best valid score: {best_valid_score:.6f}")
print(f"Final best valid file: {best_valid_file}")

Checking 90 files from submission folders...


New best valid: 70.626088 from /home/nonroot/snapshots/santa-2025/21222392487/submission/submission.csv


New best valid: 70.615102 from /home/nonroot/snapshots/santa-2025/21345558927/submission/submission.csv



Final best valid score: 70.615102
Final best valid file: /home/nonroot/snapshots/santa-2025/21345558927/submission/submission.csv


In [8]:
# Copy the best valid submission and analyze it
import shutil
import os

# Score the result is compared against.
# NOTE(review): the origin of this target value is not shown in this notebook.
TARGET_SCORE = 68.882921
EXPERIMENT_DIR = '/home/code/experiments/002_valid_baseline'
SUBMISSION_DIR = '/home/submission'

# Fail with a clear message instead of an opaque TypeError from shutil.copy
# when the searches above found no fully valid file.
if best_valid_file is None:
    raise RuntimeError("No fully valid submission was found; nothing to copy")

# Copy to experiment folder and submission folder, creating each folder first
# so the copy cannot fail on a missing directory.
for target_dir in (EXPERIMENT_DIR, SUBMISSION_DIR):
    os.makedirs(target_dir, exist_ok=True)
    shutil.copy(best_valid_file, os.path.join(target_dir, 'submission.csv'))

# Load and analyze
best_df = load_submission(best_valid_file)
total_score, per_n_scores = calculate_score(best_df)

print(f"Best valid submission: {best_valid_file}")
print(f"Total score: {total_score:.6f}")
print(f"Target: {TARGET_SCORE:.6f}")
print(f"Gap to target: {total_score - TARGET_SCORE:.6f}")

# Show score contribution by N range (bounds are inclusive)
print("\nScore contribution by N range:")
for start, end in [(1, 10), (11, 50), (51, 100), (101, 150), (151, 200)]:
    range_score = sum(per_n_scores[n]['score'] for n in range(start, end+1) if n in per_n_scores)
    print(f"  N={start}-{end}: {range_score:.4f}")

# Show top 10 N values by score contribution, largest first
print("\nTop 10 N values by score contribution:")
sorted_n = sorted(per_n_scores.items(), key=lambda x: x[1]['score'], reverse=True)[:10]
for n, data in sorted_n:
    print(f"  N={n}: side={data['side']:.4f}, score={data['score']:.4f}")

Best valid submission: /home/nonroot/snapshots/santa-2025/21345558927/submission/submission.csv
Total score: 70.615102
Target: 68.882921
Gap to target: 1.732181

Score contribution by N range:
  N=1-10: 4.3291
  N=11-50: 14.7036
  N=51-100: 17.6063
  N=101-150: 17.1341
  N=151-200: 16.8421

Top 10 N values by score contribution:
  N=1: side=0.8132, score=0.6612
  N=2: side=0.9495, score=0.4508
  N=3: side=1.1420, score=0.4347
  N=5: side=1.4437, score=0.4168
  N=4: side=1.2908, score=0.4165
  N=7: side=1.6731, score=0.3999
  N=6: side=1.5484, score=0.3996
  N=9: side=1.8673, score=0.3874
  N=8: side=1.7559, score=0.3854
  N=15: side=2.3779, score=0.3769
