# Safe Ensemble with Kaggle-Compatible Validation

The previous ensemble (exp_002) failed with "Overlapping trees in group 002".
We need to use Kaggle's integer-scaling (1e18) validation method.

In [1]:
import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
from shapely.geometry import Polygon
from shapely import affinity
from shapely.ops import unary_union
from shapely.strtree import STRtree
import os
import json
import math

# Set high precision for decimal arithmetic: the decimal context is switched to
# 25 significant digits for every Decimal operation that follows.
getcontext().prec = 25
# Factor that ChristmasTree multiplies into every Decimal coordinate before the
# value is converted to float and handed to shapely.
SCALE = Decimal("1e18")

print("Imports done!")

Imports done!


In [2]:
# Kaggle-compatible ChristmasTree class with integer scaling
class ChristmasTree:
    """One tree outline expressed in SCALE-multiplied coordinates.

    The outline is assembled from Decimal dimensions, each coordinate is
    multiplied by SCALE and converted to float, the polygon is rotated by
    ``angle`` degrees about the origin and then shifted by
    ``(center_x * SCALE, center_y * SCALE)``. The result is stored on
    ``self.polygon``; the parsed Decimal inputs are kept on ``center_x``,
    ``center_y`` and ``angle``.
    """

    def __init__(self, center_x="0", center_y="0", angle="0"):
        # Route every input through str() so numbers are parsed from their
        # printed form rather than from a binary float.
        self.center_x = Decimal(str(center_x))
        self.center_y = Decimal(str(center_y))
        self.angle = Decimal(str(angle))

        two, four = Decimal("2"), Decimal("4")

        # Full widths of the trunk and of the three foliage tiers.
        trunk_w = Decimal("0.15")
        base_w = Decimal("0.7")
        mid_w = Decimal("0.4")
        top_w = Decimal("0.25")

        # Heights of the tip, the tier boundaries, the base and the trunk bottom.
        tip_y = Decimal("0.8")
        tier_1_y = Decimal("0.5")
        tier_2_y = Decimal("0.25")
        base_y = Decimal("0.0")
        trunk_bottom_y = -Decimal("0.2")

        # Right-hand half of the outline, walking from just below the tip down
        # to the bottom-right corner of the trunk.
        right_side = [
            (top_w / two, tier_1_y),
            (top_w / four, tier_1_y),
            (mid_w / two, tier_2_y),
            (mid_w / four, tier_2_y),
            (base_w / two, base_y),
            (trunk_w / two, base_y),
            (trunk_w / two, trunk_bottom_y),
        ]
        # The left-hand half is the mirror image, walked back up towards the tip.
        left_side = [(-x, y) for x, y in reversed(right_side)]
        outline = [(Decimal("0.0"), tip_y)] + right_side + left_side

        # Scale in Decimal first, then drop to float for shapely.
        scaled_outline = [(float(x * SCALE), float(y * SCALE)) for x, y in outline]
        upright = Polygon(scaled_outline)

        # Rotate about the local origin, then move to the requested centre.
        turned = affinity.rotate(upright, float(self.angle), origin=(0, 0))
        shift_x = float(self.center_x * SCALE)
        shift_y = float(self.center_y * SCALE)
        self.polygon = affinity.translate(turned, xoff=shift_x, yoff=shift_y)

def has_overlap_kaggle(trees):
    """Check overlaps using Kaggle's method.

    Each element of ``trees`` must expose a shapely geometry as ``.polygon``.
    Candidate neighbours are looked up through an STRtree; a pair is reported
    when the two polygons intersect without merely touching.

    Returns:
        ``(found, pairs)`` where ``found`` is True when at least one pair was
        reported and ``pairs`` lists ``(i, j)`` index pairs with ``i < j``.
        With zero or one tree the result is ``(False, [])``.
    """
    if len(trees) <= 1:
        return False, []

    shapes = [tree.polygon for tree in trees]
    spatial_index = STRtree(shapes)

    collisions = []
    for first, shape in enumerate(shapes):
        for second in spatial_index.query(shape):
            # Visit each unordered pair once and never compare a shape with itself.
            if second > first:
                other = shapes[second]
                # Contact along boundaries only ("touches") is not an overlap.
                if shape.intersects(other) and not shape.touches(other):
                    collisions.append((first, second))
    return bool(collisions), collisions

print("ChristmasTree class defined!")

ChristmasTree class defined!


In [3]:
# Standard tree polygon for score calculation
# Outline vertices in unscaled units: TX holds the x coordinates and TY the
# matching y coordinates. The walk starts at the tip (0, 0.8), goes down the
# right-hand side, across the trunk bottom (y = -0.2) and back up the left side.
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

def create_tree_polygon(x, y, angle):
    """Build the unscaled tree outline, rotated and moved into place.

    The outline given by TX/TY is rotated by ``angle`` degrees about the
    origin and then translated by ``(x, y)``. Returns the shapely polygon.
    """
    outline = Polygon(list(zip(TX, TY)))
    turned = affinity.rotate(outline, angle, origin=(0, 0))
    return affinity.translate(turned, xoff=x, yoff=y)

def parse_submission(df):
    """Group the rows of a submission frame by tree count.

    The group number n is the integer before the first ``_`` in the ``id``
    column. The ``x``, ``y`` and ``deg`` cells are converted to text, stripped
    of one leading ``s`` when present, and parsed as floats.

    Returns:
        dict mapping n to a list of ``(x, y, angle)`` float tuples, in the
        order the rows appear in ``df``.
    """
    def as_number(cell):
        # Cells may arrive as 's'-prefixed text or as plain numbers.
        text = str(cell)
        if text.startswith('s'):
            text = text[1:]
        return float(text)

    solutions = {}
    for record in df.to_dict('records'):
        n = int(record['id'].split('_')[0])
        placement = (
            as_number(record['x']),
            as_number(record['y']),
            as_number(record['deg']),
        )
        solutions.setdefault(n, []).append(placement)
    return solutions

def calculate_side(trees):
    """Return the longer edge of the axis-aligned box enclosing all trees.

    ``trees`` is an iterable of ``(x, y, angle)`` tuples; each is turned into
    a polygon with create_tree_polygon and the bounds of their union are used.
    """
    merged = unary_union([create_tree_polygon(*placement) for placement in trees])
    min_x, min_y, max_x, max_y = merged.bounds
    width = max_x - min_x
    height = max_y - min_y
    return max(width, height)

def calculate_score_for_n(trees, n):
    """Return the score term for one group: bounding side squared, divided by n."""
    return calculate_side(trees) ** 2 / n

print("Helper functions defined!")

Helper functions defined!


In [4]:
def validate_n_kaggle(trees_tuples):
    """Validate N trees using Kaggle's method.

    Every ``(x, y, angle)`` tuple is turned into a ChristmasTree (values are
    passed as their ``str()`` form) and the set is checked with
    has_overlap_kaggle.

    Returns:
        ``(is_valid, pairs)`` where ``is_valid`` is True when no overlap was
        reported and ``pairs`` is the list of overlapping index pairs.
    """
    kaggle_trees = []
    for x, y, angle in trees_tuples:
        kaggle_trees.append(ChristmasTree(str(x), str(y), str(angle)))
    overlap_found, pairs = has_overlap_kaggle(kaggle_trees)
    return (not overlap_found), pairs

# Test the validation
# Smoke test: a single tree has nothing to collide with, so has_overlap_kaggle
# short-circuits and the expected result is valid=True with an empty pair list.
test_trees = [(0, 0, 45)]
valid, pairs = validate_n_kaggle(test_trees)
print(f"Test validation: valid={valid}, pairs={pairs}")

Test validation: valid=True, pairs=[]


In [5]:
# Load the baseline that passed Kaggle (exp_001 with LB=70.615107)
# This is snapshot 21145966992
baseline_path = '/home/nonroot/snapshots/santa-2025/21145966992/submission/submission.csv'
df_baseline = pd.read_csv(baseline_path)
# n -> list of (x, y, angle) float tuples, as produced by parse_submission.
baseline_solutions = parse_submission(df_baseline)

# Calculate baseline scores
# Each entry is side**2 / n for that group. Indexing baseline_solutions[n]
# raises KeyError if the file lacks any group from 1 to 200.
baseline_per_n = {n: calculate_score_for_n(baseline_solutions[n], n) for n in range(1, 201)}
baseline_total = sum(baseline_per_n.values())
print(f"Baseline total score: {baseline_total:.6f}")

Baseline total score: 70.572798


In [6]:
# Validate baseline with Kaggle method
print("Validating baseline with Kaggle method...")
# Collects every N (1..200) whose baseline layout is reported as overlapping.
baseline_invalid_n = []
for n in range(1, 201):
    valid, pairs = validate_n_kaggle(baseline_solutions[n])
    if not valid:
        baseline_invalid_n.append(n)
        # Print details only for the first five failures, at most three pairs each.
        if len(baseline_invalid_n) <= 5:
            print(f"  N={n} has overlaps: {pairs[:3]}...")

print(f"\nBaseline has {len(baseline_invalid_n)} N values with Kaggle overlaps")
if baseline_invalid_n:
    print(f"First 10: {baseline_invalid_n[:10]}")

Validating baseline with Kaggle method...
  N=2 has overlaps: [(0, 1)]...
  N=4 has overlaps: [(0, 3), (0, 1), (1, 2)]...
  N=5 has overlaps: [(0, 2), (0, 1), (0, 3)]...
  N=16 has overlaps: [(0, 3), (0, 4), (0, 12)]...
  N=35 has overlaps: [(0, 22), (0, 7), (1, 3)]...



Baseline has 78 N values with Kaggle overlaps
First 10: [2, 4, 5, 16, 35, 36, 40, 46, 47, 48]


In [7]:
# Load all snapshots
# Only the directory names are listed here (sorted); the submission files
# themselves are read later.
snapshot_base = '/home/nonroot/snapshots/santa-2025/'
snapshot_dirs = sorted(os.listdir(snapshot_base))
print(f"Found {len(snapshot_dirs)} snapshot directories")

# Start with the baseline as the current best for each N.
# NOTE: every N from 1 to 200 is seeded from the baseline here regardless of
# whether that baseline layout passed validate_n_kaggle.
best_valid_per_n = {
    n: {
        'score': baseline_per_n[n],
        'trees': baseline_solutions[n],
        'source': 'baseline'
    } for n in range(1, 201)
}

Found 114 snapshot directories


In [None]:
# Process each snapshot and only accept candidates that pass Kaggle validation.
#
# A candidate layout for N is adopted when it validates and either
#   * its score beats the current best by more than 1e-10, or
#   * the current best is still the seeded baseline AND that baseline was
#     flagged as overlapping in baseline_invalid_n (any valid layout is
#     preferable to an invalid one, even at a worse score).
improvements_found = 0
improvements_rejected = 0
snapshots_processed = 0
# (snapshot dir, error description) for snapshots that could not be processed.
snapshots_failed = []

# Set form of the N values whose baseline layout failed validation.
baseline_invalid_set = set(baseline_invalid_n)

for snap_dir in snapshot_dirs:
    sub_path = os.path.join(snapshot_base, snap_dir, 'submission', 'submission.csv')
    if not os.path.exists(sub_path):
        continue

    try:
        df = pd.read_csv(sub_path)
        solutions = parse_submission(df)

        # Check each N
        for n in range(1, 201):
            if n not in solutions:
                continue

            trees = solutions[n]
            # A group that does not hold exactly n trees is malformed; fewer
            # trees would give a misleadingly small bounding box.
            if len(trees) != n:
                continue

            best = best_valid_per_n[n]
            score = calculate_score_for_n(trees, n)

            is_better = score < best['score'] - 1e-10
            replaces_invalid_baseline = (
                best['source'] == 'baseline' and n in baseline_invalid_set
            )
            if not (is_better or replaces_invalid_baseline):
                continue

            # Validate with Kaggle method
            valid, pairs = validate_n_kaggle(trees)
            if valid:
                best['score'] = score
                best['trees'] = trees
                best['source'] = snap_dir
                improvements_found += 1
            elif is_better:
                # Only score improvements count towards the rejection tally.
                improvements_rejected += 1

        snapshots_processed += 1
        if snapshots_processed % 20 == 0:
            print(f"Processed {snapshots_processed} snapshots, {improvements_found} valid improvements...")

    except Exception as e:
        # Keep going (best effort), but record and report the failure instead
        # of hiding it; updates already made for this snapshot are kept.
        snapshots_failed.append((snap_dir, f"{type(e).__name__}: {e}"))
        print(f"  Skipping remainder of {snap_dir}: {type(e).__name__}: {e}")

print(f"\nTotal: {snapshots_processed} snapshots processed")
print(f"Valid improvements found: {improvements_found}")
print(f"Improvements rejected (Kaggle overlaps): {improvements_rejected}")
if snapshots_failed:
    print(f"Snapshots that raised errors: {len(snapshots_failed)}")

In [None]:
# Calculate new total score
# Sum of the per-N scores currently stored in best_valid_per_n.
new_total = sum(best_valid_per_n[n]['score'] for n in range(1, 201))
print(f"New total score: {new_total:.6f}")
print(f"Baseline total: {baseline_total:.6f}")
print(f"Improvement: {baseline_total - new_total:.6f}")

# Count unique sources
# Each 'source' is either 'baseline' or the snapshot directory name that
# supplied the selected layout.
sources = set(best_valid_per_n[n]['source'] for n in range(1, 201))
print(f"\nUnique sources used: {len(sources)}")

In [None]:
# Show improvements
print("\nN values with valid improvements:")
# One (n, score delta, source) tuple per N whose selected layout no longer
# comes from the baseline; the delta is baseline score minus selected score.
improved_n = []
for n in range(1, 201):
    if best_valid_per_n[n]['source'] != 'baseline':
        old_score = baseline_per_n[n]
        new_score = best_valid_per_n[n]['score']
        improved_n.append((n, old_score - new_score, best_valid_per_n[n]['source']))

# Largest delta first; only the top 20 are printed.
for n, improvement, source in sorted(improved_n, key=lambda x: -x[1])[:20]:
    print(f"  N={n}: improved by {improvement:.6f} from {source}")

print(f"\nTotal N values improved: {len(improved_n)}")

In [None]:
# Final validation of the entire ensemble with Kaggle method
print("\nFinal validation with Kaggle method...")
# Every selected layout is re-checked, including those still taken from the
# baseline; N values that fail are collected here.
final_invalid_n = []
for n in range(1, 201):
    valid, pairs = validate_n_kaggle(best_valid_per_n[n]['trees'])
    if not valid:
        final_invalid_n.append(n)
        print(f"  N={n} INVALID!")

if len(final_invalid_n) == 0:
    print("✅ All 200 N values pass Kaggle validation!")
else:
    print(f"\n❌ {len(final_invalid_n)} N values fail Kaggle validation: {final_invalid_n}")

In [None]:
# Create submission dataframe
def create_submission_df(best_per_n):
    """Flatten the per-N selections into a submission-shaped DataFrame.

    For every n from 1 to 200 the tuples stored under
    ``best_per_n[n]['trees']`` become rows whose ``id`` is ``NNN_index``
    (n zero-padded to three digits, index counted from 0) and whose
    ``x``/``y``/``deg`` cells are the values rendered as text with an ``s``
    prefix.
    """
    records = [
        {
            'id': f'{n:03d}_{position}',
            'x': f's{x}',
            'y': f's{y}',
            'deg': f's{angle}',
        }
        for n in range(1, 201)
        for position, (x, y, angle) in enumerate(best_per_n[n]['trees'])
    ]
    return pd.DataFrame(records)

# The submission file is written only when the final validation reported no
# invalid N; otherwise nothing is saved.
if len(final_invalid_n) == 0:
    df_ensemble = create_submission_df(best_valid_per_n)
    df_ensemble.to_csv('/home/submission/submission.csv', index=False)
    print(f"Saved submission with {len(df_ensemble)} rows")
    
    # Verify
    # Round-trip check: re-read the written file and recompute the total score.
    # Overlaps are not re-checked at this point.
    df_verify = pd.read_csv('/home/submission/submission.csv')
    sol_verify = parse_submission(df_verify)
    verify_score = sum(calculate_score_for_n(sol_verify[n], n) for n in range(1, 201))
    print(f"Verified score: {verify_score:.6f}")
else:
    print("Cannot save - some N values are invalid!")

In [None]:
# Save metrics
# Summary of this run, built from values computed in the earlier cells.
metrics = {
    'cv_score': new_total,
    'baseline_score': baseline_total,
    'improvement': baseline_total - new_total,
    'valid_improvements_found': improvements_found,
    'improvements_rejected_kaggle': improvements_rejected,
    'unique_sources': len(sources),
    'n_values_improved': len(improved_n),
    'final_invalid_n': final_invalid_n
}

# Written as indented JSON; the target directory must already exist.
with open('/home/code/experiments/003_safe_ensemble/metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)

print("Metrics saved.")
print(f"CV Score: {new_total:.6f}")