# Experiment 003: Safe Ensemble

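The previous ensemble submissions failed Kaggle validation because some trees overlapped by tiny amounts (intersection areas of about 1e-31 to 1e-22).
Kaggle's validation is extremely strict, so even overlaps this small invalidate a submission.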

Strategy:
1. Start with the baseline (known to pass Kaggle validation)
2. For each N, check if the ensemble has a better score AND no overlap
3. Only replace if both conditions are met
4. Use ultra-strict overlap detection (any intersection area > 0 is rejected)

In [None]:
import sys
sys.path.insert(0, '/home/code')

import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
import json

# Decimal arithmetic is carried out with 50 significant digits
getcontext().prec = 50

# Coordinates are multiplied by this factor before polygons are built
scale_factor = Decimal('1e18')

# Tree geometry: horizontal extents (full widths) of trunk and foliage tiers
trunk_w, base_w, mid_w, top_w = map(Decimal, ('0.15', '0.7', '0.4', '0.25'))

# Tree geometry: vertical levels, from the tip down to the foliage base
tip_y, tier_1_y, tier_2_y, base_y = map(Decimal, ('0.8', '0.5', '0.25', '0.0'))

# The trunk hangs below the foliage base by its own height
trunk_h = Decimal('0.2')
trunk_bottom_y = -trunk_h

print('Utilities loaded')

In [None]:
def get_tree_polygon(cx, cy, angle):
    """Build the scaled Shapely polygon of one tree at a given pose.

    The 15-vertex outline is assembled from the module-level geometry
    constants, multiplied by ``scale_factor``, rotated about the template
    origin (0, 0) and finally translated to the requested centre.

    Args:
        cx: Decimal x position of the tree (unscaled).
        cy: Decimal y position of the tree (unscaled).
        angle: Rotation passed to ``affinity.rotate`` (degrees by default).

    Returns:
        A ``shapely.geometry.Polygon`` in scaled coordinates.
    """
    two = Decimal('2')
    four = Decimal('4')

    # Right-hand half of the outline, walking from the tip down to the
    # bottom of the trunk.
    right_half = [
        (Decimal('0.0'), tip_y),
        (top_w / two, tier_1_y),
        (top_w / four, tier_1_y),
        (mid_w / two, tier_2_y),
        (mid_w / four, tier_2_y),
        (base_w / two, base_y),
        (trunk_w / two, base_y),
        (trunk_w / two, trunk_bottom_y),
    ]
    # The left-hand half mirrors the right one (tip excluded), walking back up.
    left_half = [(-x, y) for x, y in reversed(right_half[1:])]

    template = Polygon([
        (float(x * scale_factor), float(y * scale_factor))
        for x, y in right_half + left_half
    ])
    turned = affinity.rotate(template, float(angle), origin=(0, 0))
    return affinity.translate(
        turned,
        xoff=float(cx * scale_factor),
        yoff=float(cy * scale_factor),
    )

def check_n_for_overlaps_strict(df, n):
    """Check whether any two trees of configuration ``n`` overlap.

    Rows are selected by the ``f'{n:03d}_'`` prefix of the ``id`` column and
    every pair of tree polygons is intersected; any intersection with
    area > 0 counts as an overlap.

    Args:
        df: DataFrame with ``id``, ``x``, ``y`` and ``deg`` columns.
        n: Configuration number whose rows are checked.

    Returns:
        ``(True, 0)`` when ``df`` has no rows for ``n`` (missing data is
        treated as an overlap), ``(True, area)`` for the first overlapping
        pair found, with ``area`` converted back to unscaled units, and
        ``(False, 0)`` otherwise.
    """
    prefix = f'{n:03d}_'
    n_rows = df[df['id'].str.startswith(prefix)]
    if len(n_rows) == 0:
        return True, 0  # No data = overlap

    def to_decimal(raw):
        # NOTE(review): values appear to carry a one-character marker in
        # front of the number (presumably 's') - confirm against the
        # submission format. Drop it only when the first character cannot
        # start a number, so unmarked values keep their sign / first digit.
        text = str(raw)
        if text[:1] not in '+-.0123456789':
            text = text[1:]
        return Decimal(text)

    trees = [
        get_tree_polygon(to_decimal(x), to_decimal(y), to_decimal(deg))
        for x, y, deg in zip(n_rows['x'], n_rows['y'], n_rows['deg'])
    ]

    # Polygons live in scaled coordinates, so areas carry scale_factor**2.
    area_scale = float(scale_factor**2)
    for i, first in enumerate(trees):
        for second in trees[i + 1:]:
            overlap = first.intersection(second)
            if overlap.area > 0:
                return True, overlap.area / area_scale

    return False, 0

def get_score_for_n(df, n):
    """Calculate the score of configuration ``n``.

    The score is the squared side of the smallest axis-aligned square that
    covers every vertex of the configuration's trees, divided by ``n``.

    Args:
        df: DataFrame with ``id``, ``x``, ``y`` and ``deg`` columns.
        n: Configuration number; rows are selected by the
            ``f'{n:03d}_'`` prefix of ``id``.

    Returns:
        ``side**2 / n`` in unscaled units, or ``float('inf')`` when ``df``
        has no rows for ``n``.
    """
    prefix = f'{n:03d}_'
    n_rows = df[df['id'].str.startswith(prefix)]
    if len(n_rows) == 0:
        return float('inf')

    def to_decimal(raw):
        # NOTE(review): values appear to carry a one-character marker in
        # front of the number (presumably 's') - confirm against the
        # submission format. Drop it only when the first character cannot
        # start a number, so unmarked values keep their sign / first digit.
        text = str(raw)
        if text[:1] not in '+-.0123456789':
            text = text[1:]
        return Decimal(text)

    trees = [
        get_tree_polygon(to_decimal(x), to_decimal(y), to_decimal(deg))
        for x, y, deg in zip(n_rows['x'], n_rows['y'], n_rows['deg'])
    ]

    # Stack every exterior vertex and undo the coordinate scaling.
    all_coords = np.vstack(
        [np.array(t.exterior.coords) for t in trees]
    ) / float(scale_factor)

    # Bounding box of the whole configuration
    min_x, min_y = all_coords.min(axis=0)
    max_x, max_y = all_coords.max(axis=0)
    side = max(max_x - min_x, max_y - min_y)

    return side**2 / n

# Progress marker: shows in the cell output that the helper functions above were defined
print('Functions defined')

In [None]:
# Read both candidate submissions: the baseline and the ensemble
baseline_df, ensemble_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/home/code/experiments/000_baseline/submission.csv',
        '/home/code/experiments/001_ensemble/submission.csv',
    )
)

print(f'Baseline shape: {baseline_df.shape}')
print(f'Ensemble shape: {ensemble_df.shape}')

In [None]:
# Build the safe ensemble: keep the baseline rows for every N unless the
# ensemble scores strictly better (beyond a 1e-12 tolerance) AND is overlap-free
safe_df = baseline_df.copy()
improvements = []  # (n, score gain) for every accepted replacement
rejected = []      # (n, score gain, overlap area) for every refused one

for n in range(1, 201):
    baseline_score = get_score_for_n(baseline_df, n)
    ensemble_score = get_score_for_n(ensemble_df, n)
    ensemble_is_better = ensemble_score < baseline_score - 1e-12

    if ensemble_is_better:
        gain = baseline_score - ensemble_score
        has_overlap, area = check_n_for_overlaps_strict(ensemble_df, n)

        if has_overlap:
            # Better on paper, but it would fail validation
            rejected.append((n, gain, area))
        else:
            # Swap this N's baseline rows for the ensemble's rows
            prefix = f'{n:03d}_'
            kept_rows = safe_df[~safe_df['id'].str.startswith(prefix)]
            ensemble_n = ensemble_df[ensemble_df['id'].str.startswith(prefix)]
            safe_df = pd.concat([kept_rows, ensemble_n], ignore_index=True)
            improvements.append((n, gain))

    # Progress report every 50 configurations
    if n % 50 == 0:
        print(f'Processed N=1-{n}')

top_improvements = sorted(improvements, key=lambda entry: -entry[1])[:10]
rejected_ns = [entry[0] for entry in rejected]
total_gain = sum(entry[1] for entry in improvements)

print(f'\nImprovements accepted: {len(improvements)}')
print(f'Improvements rejected (overlap): {len(rejected)}')
print(f'\nTop 10 improvements: {top_improvements}')
print(f'\nRejected N values: {rejected_ns}')
print(f'Total improvement: {total_gain:.6f}')

In [None]:
# Restore a canonical row order: by N first, then by tree index within N.
# The ids look like '<n>_<tree_idx>', so both sort keys are parsed from them
# into temporary columns that are dropped again after sorting.
id_parts = safe_df['id'].map(lambda tree_id: tree_id.split('_'))
safe_df = (
    safe_df
    .assign(
        n=id_parts.map(lambda parts: int(parts[0])),
        tree_idx=id_parts.map(lambda parts: int(parts[1])),
    )
    .sort_values(['n', 'tree_idx'])
    .drop(columns=['n', 'tree_idx'])
    .reset_index(drop=True)
)

print(f'Safe ensemble shape: {safe_df.shape}')

In [None]:
# Re-run the strict overlap check on the merged result for every N
print('Verifying safe ensemble...')
overlap_checks = (
    (n, check_n_for_overlaps_strict(safe_df, n)) for n in range(1, 201)
)
# Keep (n, area) for each configuration that still reports an overlap
overlapping_ns = [
    (n, area) for n, (has_overlap, area) in overlap_checks if has_overlap
]

print(f'Overlapping N values: {len(overlapping_ns)}')
if not overlapping_ns:
    print('NO OVERLAPS - Safe to submit!')
else:
    print(f'Details: {overlapping_ns[:10]}')

In [None]:
# Total score of the safe ensemble: the sum of the per-N scores for N = 1..200
total_score = 0
for n in range(1, 201):
    total_score += get_score_for_n(safe_df, n)

# Compare against the recorded baseline score and the target score
improvement_vs_baseline = 70.676102 - total_score
gap_to_target = total_score - 68.888293

print(f'\nSafe ensemble score: {total_score:.6f}')
print(f'Baseline score: 70.676102')
print(f'Improvement: {improvement_vs_baseline:.6f}')
print(f'\nTarget: 68.888293')
print(f'Gap to target: {gap_to_target:.6f}')

In [None]:
# Save the submission and its metrics, but only when verification found no
# overlapping configuration; otherwise nothing is written.
import os
import shutil

if len(overlapping_ns) == 0:
    experiment_csv = '/home/code/experiments/003_safe_ensemble/submission.csv'
    submission_csv = '/home/submission/submission.csv'
    metrics_json = '/home/code/experiments/003_safe_ensemble/metrics.json'

    # Create the output folders up front: to_csv, shutil.copy and open all
    # fail when the target directory does not exist yet.
    for target in (experiment_csv, submission_csv, metrics_json):
        os.makedirs(os.path.dirname(target), exist_ok=True)

    # Save to experiment folder
    safe_df.to_csv(experiment_csv, index=False)
    print('Saved to experiments/003_safe_ensemble/submission.csv')

    # Copy to submission folder
    shutil.copy(experiment_csv, submission_csv)
    print('Copied to /home/submission/submission.csv')

    # Save metrics next to the experiment's submission
    metrics = {
        'cv_score': total_score,
        'baseline_score': 70.676102,
        'improvements_accepted': len(improvements),
        'improvements_rejected': len(rejected),
        'is_valid': True
    }
    with open(metrics_json, 'w') as f:
        json.dump(metrics, f, indent=2)
    print(f'Metrics: {metrics}')
else:
    print('ERROR: Safe ensemble still has overlaps!')