# Loop 2 Analysis: Fix Overlapping Trees Issue

The previous submission was rejected with the error 'Overlapping trees in group 040'. This notebook needs to:
1. Check which source files have overlaps
2. Create a valid ensemble with overlap checking
3. Understand why best_snapshot.csv has overlaps

In [1]:
import pandas as pd
import numpy as np
import math
from numba import njit
from shapely.geometry import Polygon
from shapely import affinity
import json
import os

# Tree outline as 15 (x, y) vertices: TX holds the x coordinates and TY the
# matching y coordinates, listed from the top point, down the positive-x side,
# across the bottom rectangle, and back up the negative-x side.
TX = np.array([
    0,
    0.125, 0.0625,
    0.2, 0.1,
    0.35, 0.075,
    0.075, -0.075,
    -0.075, -0.35,
    -0.1, -0.2,
    -0.0625, -0.125,
])
TY = np.array([
    0.8,
    0.5, 0.5,
    0.25, 0.25,
    0, 0,
    -0.2, -0.2,
    0, 0,
    0.25, 0.25,
    0.5, 0.5,
])

print('Tree vertices loaded')

Tree vertices loaded


In [2]:
def make_tree_polygon(x, y, deg):
    """Return the tree outline rotated by ``deg`` about the origin, then moved to (x, y).

    The outline is built from the module-level ``TX``/``TY`` vertex arrays.
    Rotation is applied before translation, so ``(x, y)`` is where the
    outline's local origin ends up.
    """
    outline = Polygon(list(zip(TX, TY)))
    rotated = affinity.rotate(outline, deg, origin=(0, 0))
    return affinity.translate(rotated, xoff=x, yoff=y)

def check_overlaps(xs, ys, degs, min_area=1e-10):
    """Find pairs of trees whose polygons overlap by more than ``min_area``.

    Parameters
    ----------
    xs, ys, degs : sequences of equal length
        Position and rotation of each tree, passed to ``make_tree_polygon``.
    min_area : float, optional
        Intersections whose area is not strictly greater than this value are
        ignored. The default (1e-10) keeps the original tolerance; pass ``0``
        to report every intersection with positive area.
        NOTE(review): an external validator may apply a stricter rule than an
        area tolerance - confirm before relying on this check alone.

    Returns
    -------
    list of (i, j, area)
        One entry per overlapping pair with ``i < j``, in ascending (i, j)
        order. Empty when nothing overlaps.
    """
    polygons = [make_tree_polygon(x, y, deg) for x, y, deg in zip(xs, ys, degs)]

    overlaps = []
    for i, first in enumerate(polygons):
        for j in range(i + 1, len(polygons)):
            second = polygons[j]
            # Cheap predicate first; only build the intersection geometry
            # for pairs that actually meet.
            if not first.intersects(second):
                continue
            area = first.intersection(second).area
            if area > min_area:
                overlaps.append((i, j, area))
    return overlaps

print('Overlap detection functions defined')

Overlap detection functions defined


In [3]:
def parse_submission(filepath):
    """Parse a submission CSV into per-N coordinate arrays.

    The file must have the columns ``id``, ``x``, ``y`` and ``deg``. Ids look
    like ``NNN_idx``; the part before the underscore is the group size N.
    Values may carry a leading ``s`` marker (e.g. ``s0.25``); files without
    the marker are accepted as well.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file.

    Returns
    -------
    configs : dict
        Maps N (int) to a tuple ``(xs, ys, degs)`` of float arrays, in file
        order within each group.
    df : pandas.DataFrame
        The parsed frame with the added columns ``x_val``, ``y_val``,
        ``deg_val`` and ``N``.
    """
    value_cols = ('x', 'y', 'deg')

    # Read the relevant columns as text: otherwise a file without the 's'
    # marker is parsed as numeric and the .str accessor below would raise.
    df = pd.read_csv(filepath, dtype={col: str for col in ('id',) + value_cols})

    # Strip only the leading marker instead of deleting every 's' character.
    for col in value_cols:
        df[f'{col}_val'] = df[col].str.strip().str.lstrip('s').astype(float)

    # Extract N from id (format: NNN_idx)
    df['N'] = df['id'].str.split('_').str[0].astype(int)

    configs = {}
    for n, group in df.groupby('N'):
        configs[int(n)] = (
            group['x_val'].values,
            group['y_val'].values,
            group['deg_val'].values,
        )

    return configs, df

print('Parsing function defined')

Parsing function defined


In [4]:
# Scan every group present in best_snapshot.csv for overlapping trees
print('Checking best_snapshot.csv for overlaps...')
best_snapshot_configs, _ = parse_submission('/home/code/preoptimized/best_snapshot.csv')

best_snapshot_overlaps = {}
for n in (k for k in range(1, 201) if k in best_snapshot_configs):
    found = check_overlaps(*best_snapshot_configs[n])
    if not found:
        continue
    # Keep the offending pairs so they can be inspected afterwards
    best_snapshot_overlaps[n] = found
    print(f'N={n}: {len(found)} overlaps found')

print(f'\nTotal N values with overlaps in best_snapshot: {len(best_snapshot_overlaps)}')
if best_snapshot_overlaps:
    print('Overlapping N values:', list(best_snapshot_overlaps.keys()))

Checking best_snapshot.csv for overlaps...



Total N values with overlaps in best_snapshot: 0


In [5]:
# Run the same overlap scan on the baseline submission.csv
print('Checking baseline submission.csv for overlaps...')
baseline_configs, _ = parse_submission('/home/code/preoptimized/submission.csv')

baseline_overlaps = {}
for n in range(1, 201):
    config = baseline_configs.get(n)
    if config is None:
        # Group missing from the file: nothing to check
        continue
    found = check_overlaps(*config)
    if found:
        baseline_overlaps[n] = found
        print(f'N={n}: {len(found)} overlaps found')

print(f'\nTotal N values with overlaps in baseline: {len(baseline_overlaps)}')
if baseline_overlaps:
    print('Overlapping N values:', list(baseline_overlaps.keys()))

Checking baseline submission.csv for overlaps...



Total N values with overlaps in baseline: 0


In [6]:
@njit
def score_group(xs, ys, degs, tx, ty):
    """Score one configuration: squared bounding-square side divided by tree count.

    Every outline vertex ``(tx[v], ty[v])`` is rotated by ``degs[t]`` degrees
    and shifted by ``(xs[t], ys[t])``. The side length is the larger of the
    x-extent and y-extent over all transformed vertices, and the returned
    value is ``side * side / xs.size``.
    """
    n_trees = xs.size
    n_verts = tx.size

    # Running axis-aligned bounds over every transformed vertex
    lo_x = 1e300
    lo_y = 1e300
    hi_x = -1e300
    hi_y = -1e300

    for t in range(n_trees):
        theta = degs[t] * math.pi / 180.0
        cos_t = math.cos(theta)
        sin_t = math.sin(theta)
        for v in range(n_verts):
            px = cos_t * tx[v] - sin_t * ty[v] + xs[t]
            py = sin_t * tx[v] + cos_t * ty[v] + ys[t]
            lo_x = min(lo_x, px)
            hi_x = max(hi_x, px)
            lo_y = min(lo_y, py)
            hi_y = max(hi_y, py)

    side = max(hi_x - lo_x, hi_y - lo_y)
    return side * side / n_trees

print('Scoring function defined')

Scoring function defined


In [7]:
# Create a VALID ensemble - only use configurations without overlaps
print('Creating valid ensemble with overlap checking...')

# Candidate submissions, keyed by a short source label
files = {
    'baseline': '/home/code/preoptimized/submission.csv',
    'best_snapshot': '/home/code/preoptimized/best_snapshot.csv',
    'saspav_best': '/home/code/preoptimized/saspav_best.csv',
    'smartmanoj': '/home/code/preoptimized/smartmanoj_submission.csv',
}

all_configs = {}
all_scores_by_n = {}

for name, path in files.items():
    if not os.path.exists(path):
        print(f'{name}: FILE NOT FOUND')
        continue

    configs, _ = parse_submission(path)
    all_configs[name] = configs

    # Per-N score for this source, built in ascending N
    scores_by_n = {
        n: score_group(*configs[n], TX, TY)
        for n in range(1, 201)
        if n in configs
    }
    all_scores_by_n[name] = scores_by_n

    total = sum(scores_by_n.values())
    print(f'{name}: {total:.6f}')

Creating valid ensemble with overlap checking...


baseline: 70.647327
best_snapshot: 70.624381
saspav_best: 70.630478
smartmanoj: 70.743774


In [8]:
# Create valid ensemble - for each N, pick the best configuration WITHOUT overlaps
print('\nCreating valid ensemble (checking overlaps for each N)...')

best_configs = {}
best_scores_by_n = {}
source_counts = {}

for n in range(1, 201):
    # Rank the sources that provide this N by score (lower is better).
    # Tuples compare element-wise, so equal scores are ordered by source name.
    ranked = sorted(
        (per_n[n], source)
        for source, per_n in all_scores_by_n.items()
        if n in per_n
    )

    for score, source in ranked:
        candidate = all_configs[source][n]
        found = check_overlaps(*candidate)
        if found:
            print(f'N={n}: Skipping {source} due to {len(found)} overlaps')
            continue
        # First overlap-free candidate wins
        best_score, best_source, best_config = score, source, candidate
        break
    else:
        # No candidate passed the overlap check (or none existed):
        # fall back to baseline even if it has overlaps (shouldn't happen)
        print(f'ERROR: No valid configuration found for N={n}!')
        best_config = all_configs['baseline'][n]
        best_source = 'baseline_fallback'
        best_score = all_scores_by_n['baseline'][n]

    best_configs[n] = best_config
    best_scores_by_n[n] = best_score
    source_counts[best_source] = source_counts.get(best_source, 0) + 1

ensemble_total = sum(best_scores_by_n.values())
print(f'\nValid ensemble score: {ensemble_total:.6f}')
print(f'Source breakdown: {source_counts}')


Creating valid ensemble (checking overlaps for each N)...



Valid ensemble score: 70.624381
Source breakdown: {'best_snapshot': 197, 'baseline': 3}


In [9]:
# Generate submission CSV from valid ensemble
def generate_submission(configs, output_path):
    """Write the configurations for N = 1..200 to a submission CSV.

    ``configs[n]`` must be an ``(xs, ys, degs)`` triple for every n in
    1..200. Each tree becomes one row with id ``NNN_idx`` and its x, y and
    deg values rendered as text with an ``s`` prefix. The frame is written
    to ``output_path`` without the index and also returned.
    """
    records = []
    for n in range(1, 201):
        xs, ys, degs = configs[n]
        records.extend(
            {
                'id': f'{n:03d}_{idx}',
                'x': f's{xs[idx]}',
                'y': f's{ys[idx]}',
                'deg': f's{degs[idx]}',
            }
            for idx in range(len(xs))
        )

    frame = pd.DataFrame(records)
    frame.to_csv(output_path, index=False)
    return frame

# Write the ensemble into the submission directory, creating it if needed
submission_dir = '/home/submission'
os.makedirs(submission_dir, exist_ok=True)
submission_df = generate_submission(best_configs, '/home/submission/submission.csv')
print('Saved valid ensemble submission to /home/submission/submission.csv')
print(f'Total rows: {len(submission_df)}')

Saved valid ensemble submission to /home/submission/submission.csv
Total rows: 20100


In [10]:
# Final verification - re-read the file that was just written and check it again
print('\nFinal verification of saved submission...')
verify_configs, _ = parse_submission('/home/submission/submission.csv')

final_overlaps = {}
for n in range(1, 201):
    config = verify_configs.get(n)
    if config is None:
        continue
    found = check_overlaps(*config)
    if found:
        final_overlaps[n] = found
        print(f'N={n}: {len(found)} overlaps found!')

if not final_overlaps:
    print('\nSUCCESS: No overlaps found in final submission!')
else:
    print(f'\nERROR: {len(final_overlaps)} N values still have overlaps!')

# Calculate final score (sum of per-N scores, ascending N)
final_total = sum(score_group(*verify_configs[n], TX, TY) for n in range(1, 201))

print(f'\nFinal verified score: {final_total:.6f}')


Final verification of saved submission...



SUCCESS: No overlaps found in final submission!

Final verified score: 70.624381


In [14]:
# Investigate N=40 specifically - the group that failed
# (the rejection message named group 040).
print('Investigating N=40 configuration...')

# Check N=40 in best_snapshot
# NOTE: xs, ys, degs are rebound here at notebook level; the near-overlap and
# strict-overlap checks that follow read these same names.
xs, ys, degs = best_snapshot_configs[40]
print(f'N=40 from best_snapshot: {len(xs)} trees')

# Check for very close trees (potential near-overlaps)
from itertools import combinations

def check_near_overlaps(xs, ys, degs, threshold=1e-6):
    """List tree pairs whose polygons are closer than ``threshold``.

    Returns ``(i, j, distance)`` tuples with ``i < j`` for every pair whose
    polygon-to-polygon distance is strictly below ``threshold``, in the
    order produced by ``combinations``.
    """
    trees = [make_tree_polygon(xs[k], ys[k], degs[k]) for k in range(len(xs))]

    # Distance for every unordered pair, evaluated lazily
    gaps = (
        (a, b, trees[a].distance(trees[b]))
        for a, b in combinations(range(len(trees)), 2)
    )
    return [entry for entry in gaps if entry[2] < threshold]

# Count the N=40 pairs that sit within 1e-6 of each other and show the first ten.
# In the recorded run the listed gaps are on the order of 1e-15, i.e. these
# trees are separated by roughly floating-point rounding error.
near = check_near_overlaps(xs, ys, degs, threshold=1e-6)
print(f'Near-overlaps (dist < 1e-6): {len(near)}')
for i, j, d in near[:10]:
    print(f'  Trees {i} and {j}: distance = {d:.2e}')

Investigating N=40 configuration...
N=40 from best_snapshot: 40 trees
Near-overlaps (dist < 1e-6): 55
  Trees 0 and 5: distance = 3.35e-15
  Trees 0 and 18: distance = 2.99e-14
  Trees 1 and 2: distance = 2.83e-15
  Trees 1 and 7: distance = 5.08e-15
  Trees 1 and 23: distance = 1.03e-15
  Trees 2 and 17: distance = 6.54e-15
  Trees 3 and 15: distance = 1.61e-15
  Trees 3 and 20: distance = 3.74e-15
  Trees 3 and 25: distance = 6.64e-15
  Trees 3 and 37: distance = 9.78e-15


In [15]:
# Check if Kaggle uses a different overlap definition
# Let's check for ANY intersection (including touching)
def check_strict_overlaps(xs, ys, degs):
    """Report every intersecting pair of trees, touching contacts included.

    Unlike an area-thresholded check, any pair for which ``intersects`` is
    true is reported. Returns ``(i, j, geom_type, area)`` tuples with
    ``i < j``, where ``geom_type`` and ``area`` describe the intersection
    geometry.
    """
    trees = [make_tree_polygon(xs[k], ys[k], degs[k]) for k in range(len(xs))]

    hits = []
    for a, first in enumerate(trees):
        for b in range(a + 1, len(trees)):
            second = trees[b]
            if not first.intersects(second):
                continue
            shared = first.intersection(second)
            area = shared.area if hasattr(shared, 'area') else 0
            hits.append((a, b, shared.geom_type, area))
    return hits

# Run the strict check on the N=40 arrays. xs, ys, degs are not defined in this
# cell: they are the notebook-level names bound from best_snapshot_configs[40]
# in the preceding investigation cell, so that cell must have run first.
print('Checking N=40 with strict overlap detection...')
strict_overlaps = check_strict_overlaps(xs, ys, degs)
print(f'Strict overlaps found: {len(strict_overlaps)}')
# Show at most the first ten intersecting pairs
for i, j, geom_type, area in strict_overlaps[:10]:
    print(f'  Trees {i} and {j}: {geom_type}, area={area:.2e}')

Checking N=40 with strict overlap detection...
Strict overlaps found: 0


In [16]:
# Repeat the strict (touching-inclusive) check for every group in best_snapshot
print('Checking ALL N values with strict overlap detection...')

strict_overlaps_by_n = {}
for n in range(1, 201):
    found = check_strict_overlaps(*best_snapshot_configs[n])
    if not found:
        continue
    strict_overlaps_by_n[n] = found
    print(f'N={n}: {len(found)} strict overlaps')

print(f'\nTotal N values with strict overlaps: {len(strict_overlaps_by_n)}')

Checking ALL N values with strict overlap detection...



Total N values with strict overlaps: 0


In [17]:
# The baseline submission.csv was accepted by Kaggle (LB 70.647327)
# Let's use ONLY the baseline for now - it's guaranteed valid
print('Using baseline submission (known to be valid)...')

# Copy baseline to submission.
# NOTE: this overwrites the ensemble file written to the same path earlier,
# and the lines below rebind verify_configs / final_total to the baseline.
import shutil
shutil.copy('/home/code/preoptimized/submission.csv', '/home/submission/submission.csv')

# Verify by re-reading the copied file and re-scoring it
verify_configs, _ = parse_submission('/home/submission/submission.csv')
final_total = sum(score_group(*verify_configs[n], TX, TY) for n in range(1, 201))

print(f'Baseline score: {final_total:.6f}')
print('This submission is known to be valid (LB 70.647327)')

Using baseline submission (known to be valid)...
Baseline score: 70.647327
This submission is known to be valid (LB 70.647327)


In [18]:
# Compare the N=40 group as stored in baseline and in best_snapshot
print('Comparing N=40 between baseline and best_snapshot...')

baseline_n40 = baseline_configs[40]
snapshot_n40 = best_snapshot_configs[40]

baseline_score = score_group(*baseline_n40, TX, TY)
snapshot_score = score_group(*snapshot_n40, TX, TY)

print(f'N=40 baseline score: {baseline_score:.6f}')
print(f'N=40 best_snapshot score: {snapshot_score:.6f}')
print(f'Difference: {baseline_score - snapshot_score:.6f}')

# Same configuration only if x, y and rotation arrays all match within
# np.allclose's default tolerances (checked in that order, stopping early)
same = all(np.allclose(a, b) for a, b in zip(baseline_n40, snapshot_n40))
print(f'Same configuration: {same}')

Comparing N=40 between baseline and best_snapshot...
N=40 baseline score: 0.362148
N=40 best_snapshot score: 0.362148
Difference: 0.000000
Same configuration: False


In [None]:
# Save metrics
#
# Earlier cells first wrote the ensemble to /home/submission/submission.csv and
# later replaced that file with a copy of the baseline. As a result the
# notebook-level `final_total` (baseline score) and `final_overlaps` (computed
# on the since-overwritten ensemble file) describe two different files.
# Re-derive score and validity from the file that is actually on disk so the
# saved metrics are self-consistent.
submission_path = '/home/submission/submission.csv'
metrics_path = '/home/code/experiments/002_snapshot_ensemble/metrics_fixed.json'

submitted_configs, _ = parse_submission(submission_path)
submitted_total = 0.0
submitted_overlaps = {}
for n in sorted(submitted_configs):
    xs, ys, degs = submitted_configs[n]
    submitted_total += float(score_group(xs, ys, degs, TX, TY))
    found = check_overlaps(xs, ys, degs)
    if found:
        submitted_overlaps[n] = found

baseline_total = sum(all_scores_by_n['baseline'].values())

metrics = {
    'cv_score': submitted_total,
    'baseline_score': baseline_total,
    'improvement': baseline_total - submitted_total,
    # Breakdown of the ensemble assembled earlier in the notebook; this is not
    # necessarily the file that is currently saved as the submission.
    'source_counts': source_counts,
    'overlaps_found_in_best_snapshot': list(best_snapshot_overlaps),
    # Result of the local check_overlaps test only; the external validator
    # rejected a file that passed this check, so treat it as a lower bar.
    'valid_submission': len(submitted_overlaps) == 0,
}

# Make sure the target directory exists before writing
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=2)

print(f'\nSaved metrics to experiments/002_snapshot_ensemble/metrics_fixed.json')
print(f'CV Score: {submitted_total:.6f}')
print(f'Valid submission: {len(submitted_overlaps) == 0}')