# Loop 9 Analysis: Verify santa-2025.csv and Find Path Forward

Key finding from data_findings: santa-2025.csv scores 70.376 with ZERO overlaps, which is about 0.24 points lower (lower is better) than our current best of 70.615.

Let's verify this and understand why we haven't used it.

In [None]:
import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
from shapely.geometry import Polygon
from shapely import affinity
from shapely.ops import unary_union
import json
import os

# Decimal arithmetic in this notebook runs with 30 significant digits.
getcontext().prec = 30
# Multiplier that create_high_precision_tree applies to every coordinate.
SCALE_FACTOR = Decimal('1e18')

# Tree shape: outline vertices of the unrotated tree as (x, y) pairs,
# split below into the parallel coordinate lists TX and TY.
_TREE_OUTLINE = (
    (0, 0.8),
    (0.125, 0.5),
    (0.0625, 0.5),
    (0.2, 0.25),
    (0.1, 0.25),
    (0.35, 0),
    (0.075, 0),
    (0.075, -0.2),
    (-0.075, -0.2),
    (-0.075, 0),
    (-0.35, 0),
    (-0.1, 0.25),
    (-0.2, 0.25),
    (-0.0625, 0.5),
    (-0.125, 0.5),
)
TX = [vx for vx, _ in _TREE_OUTLINE]
TY = [vy for _, vy in _TREE_OUTLINE]

def create_tree_polygon(x, y, angle):
    """Return the tree outline rotated about the origin and shifted to (x, y).

    Args:
        x: Horizontal offset; anything ``float()`` accepts (numbers or
            numeric strings).
        y: Vertical offset; same accepted types as ``x``.
        angle: Rotation handed to ``shapely.affinity.rotate`` (which
            interprets it as degrees by default).

    Returns:
        A shapely ``Polygon`` built from the ``TX``/``TY`` outline.
    """
    dx, dy, rotation = float(x), float(y), float(angle)
    outline = Polygon(list(zip(TX, TY)))
    # Rotate around (0, 0) first so the translation is not rotated with it.
    turned = affinity.rotate(outline, rotation, origin=(0, 0))
    return affinity.translate(turned, dx, dy)

def create_high_precision_tree(x, y, angle):
    """Return the tree polygon with every coordinate multiplied by SCALE_FACTOR.

    The inputs are parsed through ``Decimal(str(...))`` and the outline and
    offsets are scaled in Decimal arithmetic before being converted to the
    floats that shapely works with.

    Args:
        x: Horizontal position (number or numeric string).
        y: Vertical position (number or numeric string).
        angle: Rotation handed to ``shapely.affinity.rotate`` (which
            interprets it as degrees by default).

    Returns:
        A shapely ``Polygon`` in scaled coordinates, rotated about the
        origin and then translated by the scaled (x, y).
    """
    cx, cy, rotation = (Decimal(str(value)) for value in (x, y, angle))
    scale = SCALE_FACTOR
    # Outline kept as decimal strings so each vertex is scaled exactly.
    outline = (
        ('0.0', '0.8'),
        ('0.125', '0.5'),
        ('0.0625', '0.5'),
        ('0.2', '0.25'),
        ('0.1', '0.25'),
        ('0.35', '0.0'),
        ('0.075', '0.0'),
        ('0.075', '-0.2'),
        ('-0.075', '-0.2'),
        ('-0.075', '0.0'),
        ('-0.35', '0.0'),
        ('-0.1', '0.25'),
        ('-0.2', '0.25'),
        ('-0.0625', '0.5'),
        ('-0.125', '0.5'),
    )
    scaled_vertices = [
        (float(Decimal(px) * scale), float(Decimal(py) * scale))
        for px, py in outline
    ]
    shape = affinity.rotate(Polygon(scaled_vertices), float(rotation), origin=(0, 0))
    return affinity.translate(shape, xoff=float(cx * scale), yoff=float(cy * scale))

def validate_no_overlap_strict(trees_data):
    """Report whether no two trees overlap (boundary contact is allowed).

    Args:
        trees_data: Sequence of dicts with ``'x'``, ``'y'`` and ``'deg'``
            entries, one per tree.

    Returns:
        ``True`` when there are at most one tree or when every pair of
        trees either does not intersect or merely touches; ``False`` as soon
        as one pair intersects without just touching.
    """
    if len(trees_data) <= 1:
        return True

    shapes = [
        create_high_precision_tree(tree['x'], tree['y'], tree['deg'])
        for tree in trees_data
    ]
    # Visit each unordered pair once: every shape against those after it.
    for position, first in enumerate(shapes):
        for second in shapes[position + 1:]:
            if not first.intersects(second):
                continue
            if not first.touches(second):
                return False
    return True

def get_bbox_side(trees):
    """Return the longer side of the axis-aligned bounding box of all trees.

    Args:
        trees: Sequence of dicts with ``'x'``, ``'y'`` and ``'deg'`` entries.

    Returns:
        ``0`` for an empty sequence, otherwise the larger of the width and
        height of the bounds of the union of the tree polygons.
    """
    if len(trees) == 0:
        return 0

    merged = unary_union(
        [create_tree_polygon(tree['x'], tree['y'], tree['deg']) for tree in trees]
    )
    min_x, min_y, max_x, max_y = merged.bounds
    width = max_x - min_x
    height = max_y - min_y
    return max(width, height)

def get_score(trees, n):
    """Return the squared bounding-box side of ``trees`` divided by ``n``.

    Args:
        trees: Sequence of tree dicts accepted by ``get_bbox_side``.
        n: Divisor for the squared side (the callers pass the puzzle size N).

    Returns:
        ``get_bbox_side(trees) ** 2 / n``.
    """
    return get_bbox_side(trees) ** 2 / n

# Confirmation message printed once the definitions above have executed.
print("Functions defined")

In [None]:
# Load santa-2025.csv from kaggle_datasets and score every N group.
print("Loading santa-2025.csv...")
df_santa = pd.read_csv('/home/code/kaggle_datasets/santa-2025.csv')
# The text before the first "_" in each id is taken as the group size N.
df_santa["N"] = df_santa["id"].astype(str).str.split("_").str[0].astype(int)

santa_trees = {}
santa_scores = {}

for n, group in df_santa.groupby("N"):
    # Drop every 's' character from the stringified values so only the
    # numeric text remains (the save step in this notebook writes an 's' prefix).
    placements = [
        {field: str(row[field]).replace('s', '') for field in ('x', 'y', 'deg')}
        for _, row in group.iterrows()
    ]
    santa_trees[n] = placements
    santa_scores[n] = get_score(placements, n)

santa_total = sum(santa_scores.values())
print(f"santa-2025.csv total score: {santa_total:.6f}")

In [None]:
# Load our current best (exp_008) and score every N group.
print("\nLoading exp_008 (our current best)...")
df_best = pd.read_csv('/home/code/experiments/008_fractional_translation/submission.csv')
# The text before the first "_" in each id is taken as the group size N.
df_best["N"] = df_best["id"].astype(str).str.split("_").str[0].astype(int)

best_trees = {}
best_scores = {}

for n, group in df_best.groupby("N"):
    # Drop every 's' character from the stringified values so only the
    # numeric text remains (the save step in this notebook writes an 's' prefix).
    placements = [
        {field: str(row[field]).replace('s', '') for field in ('x', 'y', 'deg')}
        for _, row in group.iterrows()
    ]
    best_trees[n] = placements
    best_scores[n] = get_score(placements, n)

best_total = sum(best_scores.values())
print(f"exp_008 total score: {best_total:.6f}")

In [None]:
# Compare per-N scores between the two sources.
print("\n" + "="*60)
print("PER-N COMPARISON: santa-2025.csv vs exp_008")
print("="*60)

santa_better = []
best_better = []

for n in range(1, 201):
    # Positive margin: santa-2025.csv has the lower (better) score for this N.
    margin = best_scores[n] - santa_scores[n]
    # Differences within 1e-9 are treated as a tie and recorded in neither list.
    if margin > 1e-9:
        santa_better.append((n, margin))
    elif margin < -1e-9:
        best_better.append((n, -margin))

print(f"\nsanta-2025.csv is better on {len(santa_better)} N values")
print(f"exp_008 is better on {len(best_better)} N values")

# Same report for both sides: the ten largest margins, biggest first.
for heading, winners in (
    ("\nTop 10 N values where santa-2025.csv is better:", santa_better),
    ("\nTop 10 N values where exp_008 is better:", best_better),
):
    if not winners:
        continue
    print(heading)
    ranked = sorted(winners, key=lambda entry: -entry[1])
    for n, margin in ranked[:10]:
        print(f"  N={n:3d}: +{margin:.6f}")

In [None]:
# Validate santa-2025.csv for overlaps, one N group at a time.
print("\n" + "="*60)
print("VALIDATING santa-2025.csv FOR OVERLAPS")
print("="*60)

santa_overlaps = []
for n in range(1, 201):
    if not validate_no_overlap_strict(santa_trees[n]):
        santa_overlaps.append(n)
    # Progress line every 50 groups; the pairwise check can be slow.
    if n % 50 == 0:
        print(f"  Validated N=1-{n}...")

print(f"\nN values with overlaps: {len(santa_overlaps)}")
if santa_overlaps:
    shown = santa_overlaps[:20]
    # Only mark the list as truncated when entries were actually left out.
    suffix = "..." if len(santa_overlaps) > len(shown) else ""
    print(f"Overlap N values: {shown}{suffix}")

In [None]:
# If santa-2025.csv has overlaps, check which N values are valid and better.
print("\n" + "="*60)
print("ENSEMBLE POTENTIAL")
print("="*60)

# Keep only the santa-2025.csv wins whose N group passed the overlap check.
valid_improvements = [
    (n, gain) for n, gain in santa_better if n not in santa_overlaps
]

print(f"\nN values where santa-2025.csv is BOTH better AND valid: {len(valid_improvements)}")
if valid_improvements:
    total_potential = sum(gain for _, gain in valid_improvements)
    print(f"Total potential improvement: {total_potential:.6f}")
    print(f"\nTop 10 valid improvements:")
    largest_first = sorted(valid_improvements, key=lambda entry: -entry[1])
    for n, gain in largest_first[:10]:
        print(f"  N={n:3d}: +{gain:.6f}")

In [None]:
# Create ensemble: best per-N from both sources (only valid solutions)
from collections import Counter

print("\n" + "="*60)
print("CREATING ENSEMBLE")
print("="*60)

ensemble_trees = {}
ensemble_scores = {}
ensemble_source = {}

# N values where santa-2025.csv is both better and overlap-free. Built once
# so the per-N membership test below does not rebuild a list every iteration.
santa_wins = {n for n, _ in valid_improvements}

for n in range(1, 201):
    if n in santa_wins:
        ensemble_trees[n] = santa_trees[n]
        ensemble_scores[n] = santa_scores[n]
        ensemble_source[n] = 'santa-2025.csv'
    else:
        # Everything else falls back to the current best submission.
        ensemble_trees[n] = best_trees[n]
        ensemble_scores[n] = best_scores[n]
        ensemble_source[n] = 'exp_008'

ensemble_total = sum(ensemble_scores.values())
print(f"\nEnsemble score: {ensemble_total:.6f}")
print(f"Improvement over exp_008: {best_total - ensemble_total:.6f}")
print(f"Gap to target (68.888293): {ensemble_total - 68.888293:.6f}")

# Count how many N values came from each source.
source_counts = Counter(ensemble_source.values())
print(f"\nSource breakdown:")
for source, count in source_counts.items():
    print(f"  {source}: {count} N values")

In [None]:
# Final validation of ensemble: re-run the strict overlap check on every N
# group that was actually selected.
print("\n" + "="*60)
print("FINAL VALIDATION OF ENSEMBLE")
print("="*60)

ensemble_overlaps = [
    n for n in range(1, 201)
    if not validate_no_overlap_strict(ensemble_trees[n])
]

if ensemble_overlaps:
    print(f"WARNING: {len(ensemble_overlaps)} N values have overlaps!")
    print(f"Overlap N values: {ensemble_overlaps}")
else:
    # The original literal was mojibake (UTF-8 check mark decoded as cp1252).
    print("✅ All N values pass strict validation!")

In [None]:
# Save ensemble if valid: write the submission to the experiment folder and
# to the submission folder, plus a metrics.json summary.
if not ensemble_overlaps:
    print("\n" + "="*60)
    print("SAVING ENSEMBLE SUBMISSION")
    print("="*60)

    target_score = 68.888293
    experiment_dir = '/home/code/experiments/009_santa_ensemble'
    submission_dir = '/home/submission'
    # Create both output folders; previously only the experiment folder was
    # ensured, so the second to_csv could fail on a missing directory.
    for directory in (experiment_dir, submission_dir):
        os.makedirs(directory, exist_ok=True)

    # One row per tree; values are written back with the 's' prefix that the
    # loading cells strip.
    rows = [
        {
            'id': f"{n:03d}_{i}",
            'x': f"s{t['x']}",
            'y': f"s{t['y']}",
            'deg': f"s{t['deg']}",
        }
        for n in range(1, 201)
        for i, t in enumerate(ensemble_trees[n])
    ]

    submission_df = pd.DataFrame(rows)
    submission_df.to_csv(os.path.join(experiment_dir, 'submission.csv'), index=False)
    submission_df.to_csv(os.path.join(submission_dir, 'submission.csv'), index=False)
    print(f"Submission saved! Shape: {submission_df.shape}")

    # Save metrics
    improvement = best_total - ensemble_total
    gap = ensemble_total - target_score
    metrics = {
        'cv_score': ensemble_total,
        'baseline_score': best_total,
        'improvement': improvement,
        'n_from_santa': source_counts.get('santa-2025.csv', 0),
        'n_from_exp008': source_counts.get('exp_008', 0),
        'final_overlaps': len(ensemble_overlaps),
        'target': target_score,
        'gap': gap,
    }

    with open(os.path.join(experiment_dir, 'metrics.json'), 'w') as f:
        json.dump(metrics, f, indent=2)

    print(f"\nFINAL RESULTS:")
    print(f"  Ensemble score: {ensemble_total:.6f}")
    print(f"  Improvement: {improvement:.6f}")
    print(f"  Gap to target: {gap:.6f}")
else:
    print("Cannot save - ensemble has overlaps!")