# Loop 1 Analysis: Fix Overlapping Trees Issue

The baseline submission was rejected with the error 'Overlapping trees in group 070'. Next steps:
1. Verify overlap detection
2. Find and fix overlapping trees
3. Create a valid submission

In [1]:
import numpy as np
import pandas as pd
from shapely.geometry import Polygon
from shapely import STRtree
import os

os.chdir('/home/code')

# Tree vertices: 15 (TX[i], TY[i]) pairs giving the outline of a single tree in
# its own local frame, i.e. before any rotation or translation is applied.
# get_tree_polygon() below rotates these about the local origin (0, 0) and then
# shifts them by the tree's (x, y).
# The outline spans x in [-0.35, 0.35] and y in [-0.2, 0.8].
TX = np.array([0,0.125,0.0625,0.2,0.1,0.35,0.075,0.075,-0.075,-0.075,-0.35,-0.1,-0.2,-0.0625,-0.125])
TY = np.array([0.8,0.5,0.5,0.25,0.25,0,0,-0.2,-0.2,0,0,0.25,0.25,0.5,0.5])

def get_tree_polygon(x, y, deg):
    """Return the tree outline as a shapely Polygon placed at (x, y).

    Each template vertex (TX[i], TY[i]) is rotated counter-clockwise by `deg`
    degrees about the template origin and then translated by (x, y).
    """
    theta = np.radians(deg)
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    outline = [
        (tx * cos_t - ty * sin_t + x, tx * sin_t + ty * cos_t + y)
        for tx, ty in zip(TX, TY)
    ]
    return Polygon(outline)

def check_overlaps(df, n, min_area=1e-10):
    """Find pairs of trees in group `n` whose polygons overlap.

    Parameters
    ----------
    df : DataFrame with numeric columns 'x_val', 'y_val', 'deg_val' and an
        integer group column 'n'.
    n : group to check; the group is expected to contain exactly `n` rows.
    min_area : intersection areas less than or equal to this value are ignored.
        The default (1e-10) keeps the original tolerance; pass 0 to report any
        intersection with positive area.

    Returns
    -------
    list of (i, j, area) tuples, where i < j are positions within the group,
    or -- when the group does not contain exactly `n` rows -- a message string
    describing the mismatch. Callers must distinguish the two cases.
    """
    group = df[df['n'] == n]
    if len(group) != n:
        return f"Wrong number of trees: {len(group)} vs {n}"

    polygons = [
        get_tree_polygon(x, y, deg)
        for x, y, deg in zip(group['x_val'], group['y_val'], group['deg_val'])
    ]

    # Pairwise comparison; intersects() is a cheap pre-filter so the more
    # expensive intersection() is only computed for candidates.
    overlaps = []
    for i, first in enumerate(polygons):
        for j in range(i + 1, len(polygons)):
            second = polygons[j]
            if not first.intersects(second):
                continue
            shared_area = first.intersection(second).area
            # Boundary-only contact has zero area and is filtered out here.
            if shared_area > min_area:
                overlaps.append((i, j, shared_area))

    return overlaps

print("Loading baseline submission...")
df = pd.read_csv('experiments/001_baseline/baseline.csv')
# Each of x / y / deg is stored as text with a one-character prefix;
# drop that first character and parse the remainder as a float.
for col in ('x', 'y', 'deg'):
    df[f'{col}_val'] = df[col].str[1:].astype(float)
# The part of the id before the underscore is the group size N.
df['n'] = df['id'].str.split('_').str[0].astype(int)
print(f"Loaded {len(df)} rows")

Loading baseline submission...
Loaded 20100 rows


In [2]:
# Check N=70 for overlaps
print("Checking N=70 for overlaps...")
overlaps_70 = check_overlaps(df, 70)
if isinstance(overlaps_70, str):
    # check_overlaps returns a message string (not a list) when the group
    # does not contain the expected number of rows; report it instead of
    # trying to unpack it as overlap tuples.
    print(f"N=70 could not be checked: {overlaps_70}")
elif overlaps_70:
    print(f"Found {len(overlaps_70)} overlapping pairs in N=70:")
    for i, j, area in overlaps_70[:10]:
        print(f"  Trees {i} and {j}: overlap area = {area:.10f}")
else:
    print("No overlaps found in N=70")

Checking N=70 for overlaps...
No overlaps found in N=70


In [3]:
# Run the overlap check on every group N = 1..200 of the baseline.
print("\nChecking all N values for overlaps...")
all_overlaps = {}
for n in range(1, 201):
    overlaps = check_overlaps(df, n)
    # Only non-empty lists count; a string result (row-count mismatch) is skipped.
    if isinstance(overlaps, list) and len(overlaps) > 0:
        all_overlaps[n] = overlaps

print(f"\nFound overlaps in {len(all_overlaps)} groups:")
for n in sorted(all_overlaps):
    print(f"  N={n}: {len(all_overlaps[n])} overlapping pairs")


Checking all N values for overlaps...



Found overlaps in 0 groups:


In [4]:
# List the available snapshots (sorted by name) to look for one without overlaps.
import os

snapshot_dir = '/home/nonroot/snapshots/santa-2025'
snapshots = os.listdir(snapshot_dir)
snapshots.sort()
print(f"Found {len(snapshots)} snapshots")
print(f"Latest 5: {snapshots[-5:]}")

Found 100 snapshots
Latest 5: ['21328308881', '21328309254', '21328309666', '21328310048', '21328310479']


In [5]:
# Check a few snapshots for valid submissions
def check_submission_validity(filepath, ns=(70, 100, 150, 200)):
    """Spot-check a submission CSV for problems in a few groups.

    This is a quick screen, NOT a full validation: only the groups listed in
    `ns` are examined, so a (True, None) result does not guarantee that the
    remaining groups are overlap-free.

    Parameters
    ----------
    filepath : path to a submission CSV with 'id', 'x', 'y', 'deg' columns.
    ns : group sizes to check (defaults to the original four groups).

    Returns
    -------
    (True, None) if every checked group passed;
    (False, n) for the first group that has overlaps or the wrong number of
    trees; (False, message) if the file could not be read or parsed.
    """
    try:
        df = pd.read_csv(filepath)
        for col in ('x', 'y', 'deg'):
            df[f'{col}_val'] = df[col].str[1:].astype(float)
        df['n'] = df['id'].str.split('_').str[0].astype(int)

        for n in ns:
            result = check_overlaps(df, n)
            # check_overlaps returns a non-empty list for overlaps and a
            # non-empty string for a row-count mismatch; both mean the group
            # is invalid. (Previously the string case was silently accepted.)
            if result:
                return False, n
        return True, None
    except Exception as e:
        return False, str(e)

# Spot-check the ten most recent snapshots.
for snapshot in snapshots[-10:]:
    filepath = f"{snapshot_dir}/{snapshot}/submission/submission.csv"
    if not os.path.exists(filepath):
        print(f"{snapshot}: No submission.csv")
        continue
    valid, issue = check_submission_validity(filepath)
    status = 'VALID' if valid else f'INVALID (N={issue})'
    print(f"{snapshot}: {status}")

21322576451: VALID


21322576827: VALID


21322577324: VALID
21322577814: No submission.csv


21322578388: VALID


21328308881: VALID


21328309254: VALID


21328309666: VALID


21328310048: VALID
21328310479: INVALID (N=70)


In [7]:
# The issue might be precision-related. Let's check the sample submission format
# and look at how the kernels handle overlap detection

# First, let's find a valid snapshot and score it
def score_submission(filepath):
    """Compute the total score of a submission CSV.

    For each group n = 1..200 the score contribution is side**2 / n, where
    `side` is the larger of the x-extent and y-extent of all tree vertices in
    the group (i.e. the side of the axis-aligned bounding square). The score
    does not check for overlaps.

    Parameters
    ----------
    filepath : path to a submission CSV with 'id', 'x', 'y', 'deg' columns.

    Returns
    -------
    Sum of the per-group contributions.

    Raises
    ------
    ValueError
        If any group does not contain exactly n rows. Previously extra rows
        were silently ignored and missing rows surfaced as an IndexError.
    """
    df = pd.read_csv(filepath)
    for col in ('x', 'y', 'deg'):
        df[f'{col}_val'] = df[col].str[1:].astype(float)
    df['n'] = df['id'].str.split('_').str[0].astype(int)

    total_score = 0.0
    for n in range(1, 201):
        group = df[df['n'] == n]
        if len(group) != n:
            raise ValueError(f"Group {n} has {len(group)} trees, expected {n}")

        # Column vectors of shape (n, 1) broadcast against the template
        # vertex arrays so every tree's rotated vertices are computed at once.
        rad = np.radians(group['deg_val'].to_numpy())
        c = np.cos(rad)[:, None]
        s = np.sin(rad)[:, None]
        all_x = TX * c - TY * s + group['x_val'].to_numpy()[:, None]
        all_y = TX * s + TY * c + group['y_val'].to_numpy()[:, None]

        side = max(all_x.max() - all_x.min(), all_y.max() - all_y.min())
        total_score += side * side / n

    return total_score

# Score every snapshot that has a submission file.
# NOTE: despite the name, `valid_snapshots` only means "could be scored";
# nothing in this cell checks the submissions for overlaps.
print("Scoring valid snapshots...")
valid_snapshots = []
unscorable = []  # (snapshot, error) pairs, reported below instead of being dropped silently
for snapshot in snapshots:
    filepath = f"{snapshot_dir}/{snapshot}/submission/submission.csv"
    if not os.path.exists(filepath):
        continue
    try:
        score = score_submission(filepath)
    except Exception as e:
        unscorable.append((snapshot, e))
        continue
    valid_snapshots.append((snapshot, score, filepath))

if unscorable:
    print(f"Skipped {len(unscorable)} snapshots that could not be scored:")
    for snapshot, err in unscorable:
        print(f"  {snapshot}: {err}")

valid_snapshots.sort(key=lambda x: x[1])
print(f"\nTop 10 best scoring snapshots:")
for snapshot, score, filepath in valid_snapshots[:10]:
    print(f"  {snapshot}: {score:.6f}")

Scoring valid snapshots...



Top 10 best scoring snapshots:
  21145965159: 38.909987
  21328310479: 70.523320
  21145966992: 70.572798
  21322576827: 70.616145
  21322576451: 70.619825
  21328309666: 70.619825
  21198925328: 70.624381
  21198927060: 70.624381
  21222373488: 70.624381
  21222375510: 70.624381


In [8]:
# Inspect one snapshot's raw CSV: shape, first rows, and raw column strings.
best_snapshot = '21328310479'
best_filepath = f"{snapshot_dir}/{best_snapshot}/submission/submission.csv"

# Load and check
df_best = pd.read_csv(best_filepath)
print(f"Shape: {df_best.shape}")
print("Expected: 20100 rows (1+2+...+200)")
print(df_best.head())

# Show the unparsed text of each coordinate column to verify the format.
first_x = df_best['x'].head().tolist()
first_y = df_best['y'].head().tolist()
first_deg = df_best['deg'].head().tolist()
print(f"\nFirst few x values: {first_x}")
print(f"First few y values: {first_y}")
print(f"First few deg values: {first_deg}")

Shape: (20100, 4)
Expected: 20100 rows (1+2+...+200)
      id                       x                        y  \
0  001_0  s43.591192092102147626  s-31.783267068741778871   
1  002_0   s0.154097069621355887   s-0.038540742694794648   
2  002_1  s-0.154097069621372845   s-0.561459257305224058   
3  003_0      s0.254937643697833      s-0.233436061549416   
4  003_1      s0.357722754471247       s0.250360566787394   

                       deg  
0   s44.999999999999978684  
1  s144.272760863123583164  
2  s324.272760863123533426  
3      s113.56326044172948  
4         s66.370622269343  

First few x values: ['s43.591192092102147626', 's0.154097069621355887', 's-0.154097069621372845', 's0.254937643697833', 's0.357722754471247']
First few y values: ['s-31.783267068741778871', 's-0.038540742694794648', 's-0.561459257305224058', 's-0.233436061549416', 's0.250360566787394']
First few deg values: ['s44.999999999999978684', 's144.272760863123583164', 's324.272760863123533426', 's113.563260441

In [9]:
# Full overlap check (every group, not a spot check) on the snapshot loaded above.
for col in ('x', 'y', 'deg'):
    df_best[f'{col}_val'] = df_best[col].str[1:].astype(float)
df_best['n'] = df_best['id'].str.split('_').str[0].astype(int)

print("Checking ALL N values for overlaps in best snapshot...")
all_overlaps_best = {}
for n in range(1, 201):
    overlaps = check_overlaps(df_best, n)
    # Only non-empty lists count; a string result (row-count mismatch) is skipped.
    if isinstance(overlaps, list) and len(overlaps) > 0:
        all_overlaps_best[n] = overlaps

if not all_overlaps_best:
    print("\nNo overlaps found! This submission should be valid.")
else:
    print(f"\nFound overlaps in {len(all_overlaps_best)} groups:")
    for n in sorted(all_overlaps_best):
        print(f"  N={n}: {len(all_overlaps_best[n])} overlapping pairs")

Checking ALL N values for overlaps in best snapshot...



Found overlaps in 57 groups:
  N=2: 1 overlapping pairs
  N=4: 4 overlapping pairs
  N=5: 10 overlapping pairs
  N=16: 21 overlapping pairs
  N=40: 37 overlapping pairs
  N=46: 39 overlapping pairs
  N=47: 40 overlapping pairs
  N=48: 48 overlapping pairs
  N=53: 86 overlapping pairs
  N=54: 89 overlapping pairs
  N=55: 91 overlapping pairs
  N=56: 93 overlapping pairs
  N=59: 55 overlapping pairs
  N=62: 95 overlapping pairs
  N=69: 62 overlapping pairs
  N=70: 63 overlapping pairs
  N=71: 120 overlapping pairs
  N=77: 101 overlapping pairs
  N=78: 95 overlapping pairs
  N=79: 94 overlapping pairs
  N=80: 95 overlapping pairs
  N=96: 88 overlapping pairs
  N=97: 163 overlapping pairs
  N=99: 146 overlapping pairs
  N=107: 192 overlapping pairs
  N=108: 194 overlapping pairs
  N=109: 196 overlapping pairs
  N=110: 198 overlapping pairs
  N=118: 183 overlapping pairs
  N=119: 163 overlapping pairs
  N=120: 167 overlapping pairs
  N=124: 121 overlapping pairs
  N=125: 119 overlapping pa

In [10]:
# List the contents of the downloaded research kernels to look for submissions.
import os

kernel_dirs = [
    '/home/code/research/kernels/saspav_santa-submission',
    '/home/code/research/kernels/smartmanoj_santa-claude',
    '/home/code/research/kernels/jonathanchan_santa25-ensemble-sa-fractional-translation',
    '/home/code/research/kernels/yongsukprasertsuk_santa-2025-best-keeping-bbox3-runner',
    '/home/code/research/kernels/jazivxt_why-not',
]

for kdir in kernel_dirs:
    print(f"\n{kdir}:")
    if not os.path.exists(kdir):
        print("  NOT FOUND")
        continue
    for f in os.listdir(kdir):
        print(f"  {f}")


/home/code/research/kernels/saspav_santa-submission:
  santa-submission.ipynb
  kernel-metadata.json

/home/code/research/kernels/smartmanoj_santa-claude:
  santa-claude.ipynb
  kernel-metadata.json

/home/code/research/kernels/jonathanchan_santa25-ensemble-sa-fractional-translation:
  santa25-ensemble-sa-fractional-translation.ipynb
  kernel-metadata.json

/home/code/research/kernels/yongsukprasertsuk_santa-2025-best-keeping-bbox3-runner:
  santa-2025-best-keeping-bbox3-runner.ipynb
  kernel-metadata.json

/home/code/research/kernels/jazivxt_why-not:
  why-not.ipynb
  kernel-metadata.json


In [11]:
# Hypothesis: our local overlap check ignores intersections with area <= 1e-10,
# while Kaggle might use a stricter check (any intersection at all).
# Test this by re-running the check with a zero-area threshold.

from decimal import Decimal, getcontext
getcontext().prec = 50

def check_overlaps_strict(df, n):
    """Zero-tolerance overlap check for group n.

    Any pair whose intersection has strictly positive area is reported.
    Invalid polygons are repaired with buffer(0) before comparison.

    Returns a list of (i, j, area) tuples with i < j, or a message string when
    the group does not contain exactly n rows.
    """
    trees = df[df['n'] == n].copy()
    tree_count = len(trees)
    if tree_count != n:
        return f"Wrong number of trees: {tree_count} vs {n}"

    shapes = []
    placements = trees[['x_val', 'y_val', 'deg_val']].itertuples(index=False, name=None)
    for x, y, deg in placements:
        shape = get_tree_polygon(x, y, deg)
        # buffer(0) is the usual shapely trick to repair an invalid polygon.
        shapes.append(shape if shape.is_valid else shape.buffer(0))

    found = []
    for i, first in enumerate(shapes):
        for j in range(i + 1, len(shapes)):
            second = shapes[j]
            if not first.intersects(second):
                continue
            shared_area = first.intersection(second).area
            # Contact along a point or line has zero area and is not reported.
            if shared_area > 0:
                found.append((i, j, shared_area))

    return found

# Check N=70 with strict check
print("Checking N=70 with strict overlap detection...")
overlaps_70_strict = check_overlaps_strict(df, 70)
if isinstance(overlaps_70_strict, str):
    # check_overlaps_strict returns a message string (not a list) when the
    # group has the wrong number of rows; report it rather than unpacking it.
    print(f"N=70 could not be checked: {overlaps_70_strict}")
elif overlaps_70_strict:
    print(f"Found {len(overlaps_70_strict)} overlapping pairs:")
    for i, j, area in overlaps_70_strict[:5]:
        print(f"  Trees {i} and {j}: overlap area = {area:.15e}")
else:
    print("No overlaps found")

Checking N=70 with strict overlap detection...
No overlaps found


In [12]:
# Inspect the raw and parsed columns of the baseline side by side, in case the
# problem lies in the prefix parsing or in lost precision.

# First ten rows, printed without column truncation.
print("Baseline submission format:")
baseline_preview = df.head(10).to_string()
print(baseline_preview)

# Look at the N=70 group specifically.
group_70 = df.loc[df['n'] == 70]
print(f"\nN=70 group has {len(group_70)} trees")
print(group_70.head())

Baseline submission format:
      id                       x                        y                      deg      x_val      y_val     deg_val  n
0  001_0  s40.752900903586450454  s-32.002948532171380691   s45.000000000000035527  40.752901 -32.002949   45.000000  1
1  002_0   s0.202513410337269301   s-0.028957664041420434  s203.629377730656727863   0.202513  -0.028958  203.629378  2
2  002_1  s-0.105680728905459279   s-0.551876178651849569   s23.629377730656813128  -0.105681  -0.551876   23.629378  2
3  003_0   s1.127378112162989332    s0.792211449857787242  s112.222533627590607352   1.127378   0.792211  112.222534  3
4  003_1   s1.234055695842160016    s1.275999500663759001   s66.370622269343002131   1.234056   1.276000   66.370622  3
5  003_2   s0.641714640229074984    s1.180458566613381111  s155.134051937100821306   0.641715   1.180459  155.134052  3
6  004_0  s-0.324747789590875469    s0.132109978091185853  s156.370622145636389178  -0.324748   0.132110  156.370622  4
7  004_1   s

In [13]:
# Inspect the single-tree (N=1) group, whose coordinates are far from the origin.

print("N=1 tree position:")
n1 = df[df['n'] == 1]
print(n1)

# The tree sits at roughly (40, -32). score_submission() above only uses the
# max-min extents of the vertices, so a pure translation does not change the
# local score. Whether Kaggle objects to such coordinates is not established
# here -- TODO confirm.
# For comparison, print a placement at the origin with the same 45 degree rotation.

n1_x = n1['x_val'].values[0]
n1_y = n1['y_val'].values[0]
n1_deg = n1['deg_val'].values[0]
print("\nExpected N=1 (optimal): x=0, y=0, deg=45")
print(f"Current N=1: x={n1_x}, y={n1_y}, deg={n1_deg}")

N=1 tree position:
      id                       x                        y  \
0  001_0  s40.752900903586450454  s-32.002948532171380691   

                      deg      x_val      y_val  deg_val  n  
0  s45.000000000000035527  40.752901 -32.002949     45.0  1  

Expected N=1 (optimal): x=0, y=0, deg=45
Current N=1: x=40.75290090358645, y=-32.00294853217138, deg=45.000000000000036


In [14]:
# Load the official sample submission to compare its format with ours.
sample = pd.read_csv('/home/data/sample_submission.csv')
print("Sample submission format:")
print(sample.head(20))

# Rows of the N=70 group are those whose id starts with '070_'.
is_group_70 = sample['id'].str.startswith('070_')
sample_70 = sample[is_group_70]
print(f"\nSample N=70 has {len(sample_70)} trees")
print(sample_70.head())

Sample submission format:
       id           x           y     deg
0   001_0        s0.0        s0.0   s90.0
1   002_0        s0.0        s0.0   s90.0
2   002_1   s0.202736  s-0.511271   s90.0
3   003_0        s0.0        s0.0   s90.0
4   003_1   s0.202736  s-0.511271   s90.0
5   003_2     s0.5206   s0.177413  s180.0
6   004_0        s0.0        s0.0   s90.0
7   004_1   s0.202736  s-0.511271   s90.0
8   004_2     s0.5206   s0.177413  s180.0
9   004_3  s-0.818657  s-0.228694  s180.0
10  005_0        s0.0        s0.0   s90.0
11  005_1   s0.202736  s-0.511271   s90.0
12  005_2     s0.5206   s0.177413  s180.0
13  005_3  s-0.818657  s-0.228694  s180.0
14  005_4   s0.111852   s0.893022  s180.0
15  006_0        s0.0        s0.0   s90.0
16  006_1   s0.202736  s-0.511271   s90.0
17  006_2     s0.5206   s0.177413  s180.0
18  006_3  s-0.818657  s-0.228694  s180.0
19  006_4   s0.111852   s0.893022  s180.0

Sample N=70 has 70 trees
         id           x           y     deg
2415  070_0        s0.

In [15]:
# Let me find a valid snapshot that doesn't have overlaps
# We need to check ALL N values, not just a few

def check_all_overlaps_fast(filepath):
    """Check every group N = 1..200 of a submission CSV.

    Parameters
    ----------
    filepath : path to a submission CSV with 'id', 'x', 'y', 'deg' columns.

    Returns
    -------
    List of the N values that are invalid, i.e. groups that contain
    overlapping trees OR do not contain exactly N rows. An empty list means
    every group passed. If the file cannot be read or parsed, a one-element
    list holding the error message is returned, so the result is still
    non-empty.
    """
    try:
        df = pd.read_csv(filepath)
        for col in ('x', 'y', 'deg'):
            df[f'{col}_val'] = df[col].str[1:].astype(float)
        df['n'] = df['id'].str.split('_').str[0].astype(int)

        # check_overlaps returns a non-empty list for overlaps and a non-empty
        # string for a row-count mismatch; either one marks the group invalid.
        # (Previously the string case was silently treated as "no overlaps".)
        return [n for n in range(1, 201) if check_overlaps(df, n)]
    except Exception as e:
        return [str(e)]

# Fully validate every snapshot and score the ones that pass.
print("Checking all snapshots for complete validity...")
valid_submissions = []
for snapshot in snapshots:
    filepath = f"{snapshot_dir}/{snapshot}/submission/submission.csv"
    if not os.path.exists(filepath):
        continue
    overlapping = check_all_overlaps_fast(filepath)
    if len(overlapping) != 0:
        print(f"  {snapshot}: INVALID ({len(overlapping)} N values with overlaps)")
        continue
    score = score_submission(filepath)
    valid_submissions.append((snapshot, score, filepath))
    print(f"  {snapshot}: VALID, score={score:.6f}")

print(f"\nFound {len(valid_submissions)} completely valid submissions")

Checking all snapshots for complete validity...


  20952569566: VALID, score=163.194569


  20970671503: VALID, score=164.820045


  20971964134: VALID, score=87.804045


  20984924920: VALID, score=173.688052


  20991308120: VALID, score=87.804045


  20992150197: VALID, score=217.576225


  20992536951: VALID, score=87.804045


  21016257921: INVALID (3 N values with overlaps)


  21086827828: VALID, score=114.587809


  21090949260: VALID, score=84.711359


  21104669204: VALID, score=70.734327


  21105319338: VALID, score=70.734327


  21108486172: VALID, score=70.734327


  21116303805: VALID, score=70.676102


  21117525284: VALID, score=70.676104


  21117626902: VALID, score=70.676145


  21121776553: VALID, score=70.936674


  21121942239: VALID, score=70.676102


  21121943993: VALID, score=70.676102


  21122904233: VALID, score=118.230882


  21123763369: VALID, score=70.743774


  21123768399: VALID, score=70.676102


  21129617858: VALID, score=70.676764


  21129619422: VALID, score=170.909275


  21129620891: VALID, score=88.329998


  21129622493: VALID, score=129.272924


  21129625840: INVALID (198 N values with overlaps)


  21139436611: VALID, score=170.867211


  21139436684: VALID, score=148.177124


  21139436695: VALID, score=151.174322


  21139436707: VALID, score=162.204811


  21139436732: VALID, score=164.924007


  21145961371: VALID, score=70.676102
  21145963314: INVALID (1 N values with overlaps)


  21145965159: INVALID (196 N values with overlaps)


  21145966992: INVALID (72 N values with overlaps)


  21145968755: VALID, score=70.659959


  21156850282: VALID, score=70.659437


  21156851249: VALID, score=70.659437


  21156852373: VALID, score=70.676102


  21156853393: VALID, score=70.676102


  21165870845: VALID, score=70.676102


  21165872902: VALID, score=70.647306


  21165874980: VALID, score=70.630478


  21165876936: VALID, score=70.647306


  21165878844: VALID, score=70.659436


  21179744881: VALID, score=70.676102


  21180219583: VALID, score=70.630478


  21180221700: VALID, score=70.630478


  21180223864: VALID, score=70.630429


  21190222820: VALID, score=70.630455


  21190224310: VALID, score=70.630465


  21191206469: VALID, score=70.630455


  21191207951: VALID, score=70.627608


  21191209482: VALID, score=70.625918


  21191211160: VALID, score=70.627582


  21191212682: VALID, score=70.630455


  21198790429: VALID, score=70.627582


  21198891805: VALID, score=70.627582


  21198893057: VALID, score=70.625918


  21198925328: VALID, score=70.624381


  21198927060: VALID, score=70.624381


  21198928571: VALID, score=70.625918


  21222373488: VALID, score=70.624381


  21222375510: VALID, score=70.624381


  21222377956: VALID, score=70.624381


  21222390477: VALID, score=70.624381


  21222392487: VALID, score=70.626088


  21322576451: VALID, score=70.619825


  21322576827: VALID, score=70.616145


  21322577324: VALID, score=70.625376


  21322578388: VALID, score=70.926150


  21328308881: VALID, score=70.676102


  21328309254: VALID, score=70.624381
  21328309666: INVALID (1 N values with overlaps)


  21328310048: VALID, score=70.625918


  21328310479: INVALID (57 N values with overlaps)

Found 70 completely valid submissions


In [None]:
# Sort valid submissions by score (ascending, so the first entry is the best)
valid_submissions.sort(key=lambda x: x[1])
print("Top 10 valid submissions:")
for snapshot, score, filepath in valid_submissions[:10]:
    print(f"  {snapshot}: {score:.6f}")

# Use the best valid submission.
# Fail with a descriptive message instead of a bare IndexError when no
# snapshot passed the full validity check.
if not valid_submissions:
    raise RuntimeError("No completely valid submissions found; cannot select a best one")
best_valid = valid_submissions[0]
print(f"\nBest valid submission: {best_valid[0]} with score {best_valid[1]:.6f}")