# Loop 10 Analysis: Precision Fix and Ensemble Strategy

## Key Issues to Address:
1. Experiment 009 achieved 70.659944 but failed on Kaggle due to overlapping trees in group 046
2. The evaluator identified precision loss as the root cause
3. We need to fix how precision is preserved when building the ensemble, then resubmit

In [None]:
import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
# Use 25 significant digits for any Decimal arithmetic done in this notebook.
getcontext().prec = 25

# Load both submissions with every column kept as a string, so the
# coordinate text is shown exactly as it is stored in the files.
baseline = pd.read_csv(
    '/home/code/experiments/001_baseline/santa-2025.csv',
    dtype=str,
)
ensemble = pd.read_csv(
    '/home/code/experiments/009_full_ensemble_v2/submission.csv',
    dtype=str,
)

# Preview the first ten rows of each file.
print('Baseline sample:', baseline.head(10), sep='\n')
print('\nEnsemble sample:', ensemble.head(10), sep='\n')

In [None]:
# Find differences between baseline and ensemble.
#
# Rows are matched by id through dictionaries built once per file, so each
# lookup is O(1) instead of re-filtering a whole DataFrame for every id.
COMPARED_COLUMNS = ('x', 'y', 'deg')


def _first_row_per_id(frame):
    """Map each id in *frame* to its (x, y, deg) cell values.

    When an id occurs more than once, the first occurrence wins.
    """
    rows = {}
    columns = (frame[name] for name in COMPARED_COLUMNS)
    for row_id, *values in zip(frame['id'], *columns):
        rows.setdefault(row_id, tuple(values))
    return rows


baseline_rows = _first_row_per_id(baseline)
ensemble_rows = _first_row_per_id(ensemble)

diffs = []
missing_ids = []  # ids present in baseline but absent from ensemble
for idx in baseline['id']:
    e_vals = ensemble_rows.get(idx)
    if e_vals is None:
        # Record the gap instead of crashing on an empty lookup.
        missing_ids.append(idx)
        continue
    b_vals = baseline_rows[idx]

    # Element-wise != so that a missing (NaN) cell never compares equal.
    if any(b != e for b, e in zip(b_vals, e_vals)):
        b_x, b_y, b_deg = b_vals
        e_x, e_y, e_deg = e_vals
        diffs.append({
            'id': idx,
            'baseline_x': b_x,
            'ensemble_x': e_x,
            'baseline_y': b_y,
            'ensemble_y': e_y,
            'baseline_deg': b_deg,
            'ensemble_deg': e_deg,
        })

print(f'Found {len(diffs)} differences between baseline and ensemble')
if missing_ids:
    print(f'Warning: {len(missing_ids)} baseline ids are missing from ensemble '
          f'(first: {missing_ids[0]})')
if diffs:
    print('\nFirst 10 differences:')
    for d in diffs[:10]:
        print(f"  {d['id']}: x={d['baseline_x']} -> {d['ensemble_x']}")
        print(f"           y={d['baseline_y']} -> {d['ensemble_y']}")
        print(f"         deg={d['baseline_deg']} -> {d['ensemble_deg']}")
        print()

In [None]:
# Show every row of group 046 (the one that failed) from both submissions.
baseline_046 = baseline.loc[baseline['id'].str.startswith('046_')]
ensemble_046 = ensemble.loc[ensemble['id'].str.startswith('046_')]

print('Group 046 in baseline:', baseline_046, sep='\n')
print('\nGroup 046 in ensemble:', ensemble_046, sep='\n')

In [None]:
# Check precision of values
def check_precision(val):
    """Count the characters after the first '.' in the text form of *val*.

    Every 's' character is removed from ``str(val)`` before counting.
    Returns 0 when the remaining text contains no '.'.
    """
    pieces = str(val).replace('s', '').split('.')
    # pieces[1] is the text between the first and second '.', if any.
    return len(pieces[1]) if len(pieces) > 1 else 0

# Summarise the number of decimal places per coordinate column, file by file.
for heading, frame in (
    ('Precision analysis for baseline:', baseline),
    ('\nPrecision analysis for ensemble:', ensemble),
):
    print(heading)
    for col in ['x', 'y', 'deg']:
        digit_counts = frame[col].apply(check_precision)
        print(f'  {col}: min={digit_counts.min()}, max={digit_counts.max()}, mean={digit_counts.mean():.1f}')

In [None]:
# Check if there are any files with better scores that we haven't used
import glob
import os

# Find all CSV files
csv_files = glob.glob('/home/nonroot/snapshots/**/*.csv', recursive=True)
print(f'Found {len(csv_files)} CSV files in snapshots')

# Keep the files whose header exposes both the 'id' and 'x' columns.
valid_files = []
unreadable_files = []  # (path, exception) for files pandas could not read
for f in csv_files:
    try:
        df = pd.read_csv(f, dtype=str, nrows=5)
    except Exception as exc:
        # Best-effort scan: skip files that cannot be read, but remember them.
        # Catching Exception (not a bare except) lets Ctrl-C still stop the scan.
        unreadable_files.append((f, exc))
        continue
    if {'id', 'x'}.issubset(df.columns):
        valid_files.append(f)

print(f'Found {len(valid_files)} valid CSV files')
if unreadable_files:
    print(f'Skipped {len(unreadable_files)} CSV files that could not be read')

In [None]:
# The key insight: we need to preserve EXACT string values when creating ensemble
# Let's create a proper ensemble that preserves precision

from shapely.geometry import Polygon
from shapely import affinity
from shapely.strtree import STRtree

# Tree geometry: 15 outline vertices in the tree's local frame.
# Vertex 0 is the tip, at (0, 0.8).
TX = np.array([
    0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075,
    -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125,
])
TY = np.array([
    0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2,
    -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5,
])

def get_tree_polygon(x, y, deg):
    """Return the tree outline rotated by *deg* and moved to (*x*, *y*).

    The outline is built from the module-level TX/TY vertex arrays, rotated
    about the local origin (0, 0) with shapely's ``affinity.rotate`` (which
    takes degrees unless told otherwise), and then translated by (x, y).
    """
    outline = Polygon(list(zip(TX, TY)))
    turned = affinity.rotate(outline, angle=deg, origin=(0, 0))
    return affinity.translate(turned, xoff=x, yoff=y)

def get_bounding_box_side(trees):
    """Return the longer side of the axis-aligned box enclosing all trees.

    Args:
        trees: iterable of (x, y, deg) placements.

    Returns:
        ``float('inf')`` when *trees* is empty; otherwise the larger of the
        combined x-extent and y-extent of every tree polygon's bounds.
    """
    if not trees:
        return float('inf')

    # Each entry is shapely's (minx, miny, maxx, maxy) tuple for one tree.
    boxes = [get_tree_polygon(x, y, deg).bounds for x, y, deg in trees]
    xs = [edge for box in boxes for edge in (box[0], box[2])]
    ys = [edge for box in boxes for edge in (box[1], box[3])]

    width = max(xs) - min(xs)
    height = max(ys) - min(ys)
    return max(width, height)

def has_overlap(trees):
    """Report whether any two trees overlap.

    Args:
        trees: sequence of (x, y, deg) placements.

    Returns:
        False when there are fewer than two trees or no pair overlaps.
        True when some pair of polygons intersects without merely touching,
        or when building or testing the polygons raises an error (a layout
        that cannot be checked is treated as overlapping).
    """
    if len(trees) <= 1:
        return False
    try:
        polygons = [get_tree_polygon(x, y, deg) for x, y, deg in trees]
        # Compare each polygon only with the ones after it, so every
        # unordered pair is tested exactly once.
        for i, first in enumerate(polygons):
            for second in polygons[i + 1:]:
                if first.intersects(second) and not first.touches(second):
                    return True
        return False
    except Exception:
        # Conservative fallback: an uncheckable layout counts as overlapping.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt/SystemExit propagate instead of being reported
        # as an overlap.
        return True

# Progress marker printed once the definitions above have been executed.
print('Functions defined')

In [None]:
# Check group 046 for overlaps
def parse_val(v):
    """Convert a submission cell to ``float``.

    Every 's' character is dropped from ``str(v)`` before the conversion.
    """
    cleaned = ''.join(ch for ch in str(v) if ch != 's')
    return float(cleaned)

def _group_rows_and_trees(frame, prefix):
    """Select rows whose id starts with *prefix* and parse them into trees.

    Returns the filtered rows and a list of (x, y, deg) float tuples in
    row order.
    """
    rows = frame[frame['id'].str.startswith(prefix)]
    trees = [
        (parse_val(x), parse_val(y), parse_val(deg))
        for x, y, deg in zip(rows['x'], rows['y'], rows['deg'])
    ]
    return rows, trees


baseline_046, trees_046 = _group_rows_and_trees(baseline, '046_')

print(f'Group 046 has {len(trees_046)} trees')
print(f'Has overlap (baseline): {has_overlap(trees_046)}')
side_046 = get_bounding_box_side(trees_046)
print(f'Bounding box side: {side_046:.6f}')
print(f'Score: {side_046**2 / 46:.6f}')

# Check ensemble 046
ensemble_046, trees_046_ens = _group_rows_and_trees(ensemble, '046_')

print(f'\nEnsemble 046 has {len(trees_046_ens)} trees')
print(f'Has overlap (ensemble): {has_overlap(trees_046_ens)}')