# Experiment 006: Validated Ensemble with High-Precision Overlap Detection

The previous ensemble (exp_005) FAILED on Kaggle with 'Overlapping trees in group 002'.

Fix: Implement high-precision overlap detection (1e18 scaling) and only accept a candidate per-N solution from a source if it passes strict validation.

In [1]:
import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
from shapely.geometry import Polygon
from shapely import affinity
from numba import njit
import os
import json
import time
import warnings
# Suppress every Python warning for the rest of the session. Note that this
# hides warnings from all libraries, not only the noisy ones.
warnings.filterwarnings('ignore')

# Set the Decimal context to 30 significant digits; the Decimal arithmetic in
# the validation cell below runs under this context.
getcontext().prec = 30  # High precision for Decimal

print("Setup complete")

Setup complete


In [2]:
# High-precision overlap detection using Decimal and 1e18 scaling
# This matches Kaggle's strict validation

# Every coordinate is multiplied by this factor (as a Decimal) before being
# converted to float and handed to shapely.
SCALE_FACTOR = Decimal('1e18')

# Tree dimensions (unscaled units). Widths are full widths; the outline uses
# +/- half (outer corners) and +/- quarter (inner notches) of each tier width.
TRUNK_W = Decimal('0.15')   # trunk width
TRUNK_H = Decimal('0.2')    # trunk height (trunk extends below y = BASE_Y)
BASE_W = Decimal('0.7')     # width of the tier whose corners sit at y = BASE_Y
MID_W = Decimal('0.4')      # width of the tier whose corners sit at y = TIER_2_Y
TOP_W = Decimal('0.25')     # width of the tier whose corners sit at y = TIER_1_Y
TIP_Y = Decimal('0.8')      # y of the apex vertex (x = 0)
TIER_1_Y = Decimal('0.5')   # y level of the top tier's corners
TIER_2_Y = Decimal('0.25')  # y level of the middle tier's corners
BASE_Y = Decimal('0.0')     # y level of the bottom tier's corners / top of trunk
TRUNK_BOTTOM_Y = -TRUNK_H   # y level of the trunk's bottom edge

def create_high_precision_tree(x, y, angle):
    """Build the scaled shapely polygon for one tree.

    The 15-vertex outline is computed with Decimal arithmetic, multiplied by
    SCALE_FACTOR, converted to float, rotated about the origin and finally
    translated to the (scaled) tree position.

    Args:
        x, y: Tree position; anything whose ``str()`` is a valid Decimal literal.
        angle: Rotation passed to ``affinity.rotate`` (same conversion rule).

    Returns:
        shapely ``Polygon`` expressed in SCALE_FACTOR-scaled coordinates.
    """
    def scaled(value):
        # Decimal multiply first, float conversion last.
        return float(value * SCALE_FACTOR)

    pos_x = Decimal(str(x))
    pos_y = Decimal(str(y))
    rotation = Decimal(str(angle))

    half = Decimal('2')
    quarter = Decimal('4')

    # Right-hand silhouette, walking from just below the tip down to the
    # bottom-right corner of the trunk.
    right_side = [
        (TOP_W / half, TIER_1_Y),
        (TOP_W / quarter, TIER_1_Y),
        (MID_W / half, TIER_2_Y),
        (MID_W / quarter, TIER_2_Y),
        (BASE_W / half, BASE_Y),
        (TRUNK_W / half, BASE_Y),
        (TRUNK_W / half, TRUNK_BOTTOM_Y),
    ]

    # Tip, then the right side going down, then its mirror image going back up.
    outline = [(Decimal('0.0'), TIP_Y)]
    outline.extend(right_side)
    outline.extend((-dx, level) for dx, level in reversed(right_side))

    vertices = [(scaled(px), scaled(py)) for px, py in outline]

    tree = Polygon(vertices)
    tree = affinity.rotate(tree, float(rotation), origin=(0, 0))
    return affinity.translate(tree, xoff=scaled(pos_x), yoff=scaled(pos_y))

def validate_no_overlap_strict(trees_data):
    """Pairwise overlap check on high-precision tree polygons.

    Args:
        trees_data: Sequence of dicts with 'x', 'y' and 'deg' keys.

    Returns:
        ``(True, "OK")`` when no pair of polygons intersects other than by
        merely touching; otherwise ``(False, message)`` naming the first
        offending pair found (indices into ``trees_data``).
    """
    # Zero or one tree can never overlap anything.
    if len(trees_data) < 2:
        return True, "OK"

    polygons = [
        create_high_precision_tree(tree['x'], tree['y'], tree['deg'])
        for tree in trees_data
    ]

    total = len(polygons)
    for i, first in enumerate(polygons):
        for j in range(i + 1, total):
            second = polygons[j]
            # Touching along a boundary is allowed; any other intersection is not.
            if first.intersects(second) and not first.touches(second):
                return False, f"Trees {i} and {j} overlap"

    return True, "OK"

print("High-precision validation functions defined")

High-precision validation functions defined


In [3]:
# Fast scoring using numba (same as exp_005)
@njit
def make_polygon_template():
    """Return ``(x, y)`` float64 arrays with the 15 outline vertices of an
    unrotated tree placed at the origin (tip first, then down the right side
    and back up the left side)."""
    trunk_w = 0.15
    trunk_h = 0.2
    base_w = 0.7
    mid_w = 0.4
    top_w = 0.25

    tip_y = 0.8
    tier1_y = 0.5
    tier2_y = 0.25
    base_y = 0.0
    trunk_bottom_y = -trunk_h

    x = np.array([
        0.0,
        top_w / 2, top_w / 4,
        mid_w / 2, mid_w / 4,
        base_w / 2,
        trunk_w / 2, trunk_w / 2,
        -trunk_w / 2, -trunk_w / 2,
        -base_w / 2,
        -mid_w / 4, -mid_w / 2,
        -top_w / 4, -top_w / 2,
    ], np.float64)
    y = np.array([
        tip_y,
        tier1_y, tier1_y,
        tier2_y, tier2_y,
        base_y,
        base_y, trunk_bottom_y,
        trunk_bottom_y, base_y,
        base_y,
        tier2_y, tier2_y,
        tier1_y, tier1_y,
    ], np.float64)
    return x, y

@njit
def score_group(xs, ys, degs, tx, ty):
    """Score one group of trees.

    Each tree's template vertices (``tx``, ``ty``) are rotated by ``degs[i]``
    degrees and shifted by ``(xs[i], ys[i])``. The score is the squared longer
    side of the axis-aligned bounding box of all resulting vertices, divided
    by the number of trees.
    """
    tree_count = xs.size
    vertex_count = tx.size

    min_x = 1e300
    min_y = 1e300
    max_x = -1e300
    max_y = -1e300

    for t in range(tree_count):
        theta = degs[t] * np.pi / 180.0
        cos_t = np.cos(theta)
        sin_t = np.sin(theta)
        cx = xs[t]
        cy = ys[t]
        for v in range(vertex_count):
            px = cos_t * tx[v] - sin_t * ty[v] + cx
            py = sin_t * tx[v] + cos_t * ty[v] + cy
            if px < min_x:
                min_x = px
            if px > max_x:
                max_x = px
            if py < min_y:
                min_y = py
            if py > max_y:
                max_y = py

    width = max_x - min_x
    height = max_y - min_y
    side = max(width, height)
    return side * side / tree_count

def strip(a):
    """Convert an iterable of values to a float64 array, removing every 's'
    character from each value's string form before parsing it as a float."""
    parsed = []
    for value in a:
        text = str(value).replace("s", "")
        parsed.append(float(text))
    return np.array(parsed, np.float64)

# Build the template vertex arrays once; later cells pass tx/ty to score_group.
# Only make_polygon_template() is called here; score_group is first called
# in a later cell.
tx, ty = make_polygon_template()
print("Scoring functions compiled")

Scoring functions compiled


In [4]:
# Load baseline (which passed Kaggle validation)
baseline_path = '/home/code/experiments/001_fix_overlaps/submission.csv'
baseline_df = pd.read_csv(baseline_path)
# The group size N is the part of the id before the first underscore.
baseline_df["N"] = baseline_df["id"].astype(str).str.split("_").str[0].astype(int)

# Parse baseline into structured format:
#   baseline_trees[N]  -> list of {'x', 'y', 'deg'} strings with every 's' removed
#   baseline_scores[N] -> score_group() result for that group
baseline_trees = {}
baseline_scores = {}

for n, g in baseline_df.groupby("N"):
    x_raw = g["x"].to_numpy()
    y_raw = g["y"].to_numpy()
    deg_raw = g["deg"].to_numpy()

    # Keep the coordinates as strings so the submitted values are not
    # round-tripped through float64.
    baseline_trees[n] = [
        {
            'x': str(x).replace('s', ''),
            'y': str(y).replace('s', ''),
            'deg': str(deg).replace('s', ''),
        }
        for x, y, deg in zip(x_raw, y_raw, deg_raw)
    ]
    baseline_scores[n] = score_group(strip(x_raw), strip(y_raw), strip(deg_raw), tx, ty)

# Later cells index baseline_trees[n] for every n in 1..200 and emit one row
# per tree, so fail fast here if the baseline is incomplete or mis-sized.
bad_groups = [n for n in range(1, 201) if len(baseline_trees.get(n, ())) != n]
assert not bad_groups, (
    f"Baseline is missing or has a wrong tree count for N values: {bad_groups[:10]}"
)

baseline_total = sum(baseline_scores.values())
print(f"Baseline total score: {baseline_total:.6f}")

Baseline total score: 70.622435


In [5]:
# Verify baseline passes strict validation
print("Verifying baseline passes strict validation...")

# Collect (N, message) for every group the strict checker rejects.
baseline_overlaps = []
for n in range(1, 201):
    verdict = validate_no_overlap_strict(baseline_trees[n])
    if verdict[0]:
        continue
    baseline_overlaps.append((n, verdict[1]))

if not baseline_overlaps:
    print("Baseline passes strict validation for all N values!")
else:
    print(f"WARNING: Baseline has {len(baseline_overlaps)} N values with overlaps!")
    # Only the first few offenders are listed.
    for n, msg in baseline_overlaps[:5]:
        print(f"  N={n}: {msg}")

Verifying baseline passes strict validation...


Baseline passes strict validation for all N values!


In [6]:
# Find all snapshot submissions
print("\n" + "=" * 60)
print("STEP 1: COLLECTING AND VALIDATING SNAPSHOT SUBMISSIONS")
print("=" * 60)

snapshot_dir = '/home/nonroot/snapshots/santa-2025/'

# One candidate per snapshot sub-directory, in sorted order; keep only the
# ones whose submission/submission.csv actually exists.
candidate_paths = (
    os.path.join(snapshot_dir, subdir, 'submission', 'submission.csv')
    for subdir in sorted(os.listdir(snapshot_dir))
)
sources = [csv_path for csv_path in candidate_paths if os.path.exists(csv_path)]

print(f"Found {len(sources)} snapshot submissions")


STEP 1: COLLECTING AND VALIDATING SNAPSHOT SUBMISSIONS
Found 88 snapshot submissions


In [7]:
# For each N, track best VALID solution across all sources
print("\n" + "=" * 60)
print("STEP 2: ENSEMBLE WITH STRICT VALIDATION")
print("=" * 60)

# Initialize with baseline (known to be valid)
best = {}
for n in range(1, 201):
    best[n] = {
        "score": baseline_scores[n],
        "trees": baseline_trees[n],
        "src": "baseline"
    }

start_time = time.time()
processed = 0
errors = 0
improvements_found = 0

for source_path in sources:
    try:
        df = pd.read_csv(source_path)
        if not {"id","x","y","deg"}.issubset(df.columns):
            errors += 1
            print(f"  Skipping {source_path}: missing required columns")
            continue

        # The group size N is the part of the id before the first underscore.
        df["N"] = df["id"].astype(str).str.split("_").str[0].astype(int)

        for n, g in df.groupby("N"):
            # Ignore groups outside the competition range or with a wrong tree count.
            if n < 1 or n > 200:
                continue
            if len(g) != n:
                continue

            x_raw = g["x"].to_numpy()
            y_raw = g["y"].to_numpy()
            deg_raw = g["deg"].to_numpy()

            # Calculate score
            xs = strip(x_raw)
            ys = strip(y_raw)
            ds = strip(deg_raw)

            # Every comparison in score_group is False for NaN, so non-finite
            # coordinates could produce a misleadingly small bounding box.
            # Never accept such a group.
            if not (np.isfinite(xs).all() and np.isfinite(ys).all() and np.isfinite(ds).all()):
                continue

            sc = score_group(xs, ys, ds, tx, ty)

            # Only consider if better than current best
            if sc < best[n]["score"] - 1e-9:
                # Build the string records only for promising candidates;
                # most groups never get this far.
                trees = [
                    {
                        'x': str(x).replace('s', ''),
                        'y': str(y).replace('s', ''),
                        'deg': str(deg).replace('s', ''),
                    }
                    for x, y, deg in zip(x_raw, y_raw, deg_raw)
                ]

                # Validate with strict precision
                is_valid, msg = validate_no_overlap_strict(trees)

                if is_valid:
                    best[n]["score"] = float(sc)
                    best[n]["trees"] = trees
                    best[n]["src"] = source_path
                    improvements_found += 1

        processed += 1
        if processed % 20 == 0:
            print(f"  Processed {processed}/{len(sources)} sources, {improvements_found} improvements found...")

    except Exception as e:
        # Best-effort: one unreadable source must not abort the ensemble, but
        # report why it was dropped instead of hiding the failure. Improvements
        # accepted from this source before the error are kept.
        errors += 1
        print(f"  Error processing {source_path}: {type(e).__name__}: {e}")
        continue

print(f"\nProcessed {processed} sources, {errors} errors")
print(f"Improvements found: {improvements_found}")
print(f"Time: {time.time() - start_time:.1f}s")


STEP 2: ENSEMBLE WITH STRICT VALIDATION


  Processed 20/88 sources, 1 improvements found...


  Processed 40/88 sources, 4 improvements found...


  Processed 60/88 sources, 60 improvements found...


  Processed 80/88 sources, 111 improvements found...



Processed 87 sources, 1 errors
Improvements found: 112
Time: 49.7s


In [None]:
# Calculate ensemble score
# Total is the sum of the best per-N scores for N = 1..200.
ensemble_scores = [best[n]["score"] for n in range(1, 201)]
ensemble_total = sum(ensemble_scores)

print(f"\nEnsemble total score: {ensemble_total:.6f}")
print(f"Baseline total score: {baseline_total:.6f}")
print(f"Improvement: {baseline_total - ensemble_total:.6f}")

In [None]:
# Analyze sources
from collections import Counter

# Count how many N values each source contributed to the final ensemble.
source_counts = Counter()
for n in range(1, 201):
    source_counts[best[n]["src"]] += 1

print(f"\nUnique sources used: {len(source_counts)}")
print("\nSource distribution:")
for src, count in source_counts.most_common():
    # For path-like sources show the third path component from the end;
    # non-path labels such as "baseline" are shown as-is.
    if '/' in src:
        src_name = src.split('/')[-3]
    else:
        src_name = src
    print(f"  {src_name}: {count} N values")

In [None]:
# Final validation of ensemble
print("\n" + "=" * 60)
print("STEP 3: FINAL VALIDATION")
print("=" * 60)

# Re-run the strict overlap check on the solution selected for every N and
# record (N, message) for each group that fails.
final_overlaps = []
for n in range(1, 201):
    is_valid, msg = validate_no_overlap_strict(best[n]["trees"])
    if not is_valid:
        final_overlaps.append((n, msg))

if final_overlaps:
    print(f"\nWARNING: {len(final_overlaps)} N values have overlaps!")
    # Only the first ten offenders are listed.
    for n, msg in final_overlaps[:10]:
        print(f"  N={n}: {msg}")
else:
    # The check-mark was previously mis-encoded (UTF-8 bytes read as cp1252).
    print("\n✅ All N values pass strict validation!")

In [None]:
# Create submission
print("\n" + "=" * 60)
print("STEP 4: CREATE SUBMISSION")
print("=" * 60)

# One row per tree: id is "<N zero-padded to 3 digits>_<tree index>", and each
# coordinate string is written back with an 's' prefix.
rows = []
for n in range(1, 201):
    trees = best[n]["trees"]
    for i, t in enumerate(trees):
        rows.append({
            'id': f"{n:03d}_{i}",
            'x': f"s{t['x']}",
            'y': f"s{t['y']}",
            'deg': f"s{t['deg']}"
        })

submission_df = pd.DataFrame(rows)
print(f"Submission shape: {submission_df.shape}")

submission_paths = [
    '/home/code/experiments/006_validated_ensemble/submission.csv',
    '/home/submission/submission.csv',
]
for submission_path in submission_paths:
    # to_csv does not create missing parent directories, so make sure they exist.
    os.makedirs(os.path.dirname(submission_path), exist_ok=True)
    submission_df.to_csv(submission_path, index=False)
print("Submission saved!")

In [None]:
# Save metrics
# Score this experiment is trying to reach; defined once so the JSON and the
# printed summary cannot drift apart.
TARGET_SCORE = 68.888293

metrics = {
    'cv_score': ensemble_total,
    'baseline_score': baseline_total,
    'improvement': baseline_total - ensemble_total,
    'sources_used': len(source_counts),
    'improvements_found': improvements_found,
    'final_overlaps': len(final_overlaps),
    'target': TARGET_SCORE,
    'gap': ensemble_total - TARGET_SCORE
}

metrics_path = '/home/code/experiments/006_validated_ensemble/metrics.json'
# open(..., 'w') does not create missing parent directories.
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=2)

print("\nMetrics saved!")
print("\n" + "=" * 60)
print("FINAL RESULTS")
print("=" * 60)
print(f"Baseline score: {baseline_total:.6f}")
print(f"Ensemble score: {ensemble_total:.6f}")
print(f"Improvement: {baseline_total - ensemble_total:.6f}")
print(f"Overlaps in final: {len(final_overlaps)}")
print(f"Target: {TARGET_SCORE:.6f}")
print(f"Gap to target: {ensemble_total - TARGET_SCORE:.6f}")