# Evolver Loop 2 Analysis

## Issue: Submission failed with 'Overlapping trees in group 004'

The `fix_direction` function may have introduced overlaps. We need to:
1. Investigate the overlap issue
2. Create a proper ensemble from all available submissions
3. Implement robust validation

In [1]:
import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
from shapely.strtree import STRtree
import os

getcontext().prec = 30

# Define the ChristmasTree class
class ChristmasTree:
    """A tree-shaped polygon placed at ``(center_x, center_y)`` and rotated by ``angle``.

    The three placement values are kept as ``Decimal`` attributes exactly as
    given. The outline is built from floats, rotated about the origin with
    shapely's ``affinity.rotate`` (degrees by default), translated to the
    centre, and stored as ``self.polygon``.
    """

    def __init__(self, center_x='0', center_y='0', angle='0'):
        self.center_x = Decimal(center_x)
        self.center_y = Decimal(center_y)
        self.angle = Decimal(angle)

        # Widths of the trunk and of the three foliage tiers.
        trunk_w, base_w, mid_w, top_w = (
            Decimal('0.15'), Decimal('0.7'), Decimal('0.4'), Decimal('0.25'),
        )
        # y-levels of the outline, from the tip down to the bottom of the trunk.
        tip_y, tier_1_y, tier_2_y, base_y = (
            Decimal('0.8'), Decimal('0.5'), Decimal('0.25'), Decimal('0.0'),
        )
        trunk_bottom_y = -Decimal('0.2')

        # Outline vertices, starting at the tip and walking down the right-hand
        # side, across the trunk, and back up the left-hand side.
        outline = (
            (0, tip_y),
            (top_w / 2, tier_1_y),
            (top_w / 4, tier_1_y),
            (mid_w / 2, tier_2_y),
            (mid_w / 4, tier_2_y),
            (base_w / 2, base_y),
            (trunk_w / 2, base_y),
            (trunk_w / 2, trunk_bottom_y),
            (-trunk_w / 2, trunk_bottom_y),
            (-trunk_w / 2, base_y),
            (-base_w / 2, base_y),
            (-mid_w / 4, tier_2_y),
            (-mid_w / 2, tier_2_y),
            (-top_w / 4, tier_1_y),
            (-top_w / 2, tier_1_y),
        )
        initial_polygon = Polygon([(float(vx), float(vy)) for vx, vy in outline])

        # Rotate about the tree's local origin first, then move it into place.
        rotated = affinity.rotate(initial_polygon, float(self.angle), origin=(0, 0))
        self.polygon = affinity.translate(
            rotated,
            xoff=float(self.center_x),
            yoff=float(self.center_y),
        )

print("ChristmasTree class defined")

ChristmasTree class defined


In [2]:
def load_trees_for_n(df, n):
    """Build the ``ChristmasTree`` objects for the ``n``-tree group of ``df``.

    Rows are selected by the zero-padded id prefix (``"004_"`` for ``n=4``).
    Each of the ``x``, ``y`` and ``deg`` values is converted to ``str`` and has
    any leading ``s`` characters removed before being passed to ``ChristmasTree``.

    Returns a list of trees in row order (empty if the group is absent).
    """
    in_group = df['id'].str.startswith(f"{n:03d}_")
    group_rows = df[in_group]
    return [
        ChristmasTree(*(str(raw).lstrip('s') for raw in (raw_x, raw_y, raw_deg)))
        for raw_x, raw_y, raw_deg in zip(
            group_rows['x'], group_rows['y'], group_rows['deg']
        )
    ]

def has_overlap(trees, area_tol=1e-12):
    """Return True if any two trees overlap by more than ``area_tol`` in area.

    Parameters
    ----------
    trees : sequence of objects exposing a shapely polygon as ``.polygon``.
    area_tol : float, default 1e-12
        Intersection area above which a pair counts as overlapping. The default
        keeps the original lenient behaviour; pass ``0.0`` for a stricter check
        in which any non-touching intersection with positive area is reported.

    Returns
    -------
    bool
        False for zero or one tree, otherwise whether an overlapping pair exists.
    """
    if len(trees) <= 1:
        return False
    polygons = [t.polygon for t in trees]
    tree_index = STRtree(polygons)

    for i, poly in enumerate(polygons):
        # NOTE(review): assumes STRtree.query returns integer positions into
        # ``polygons`` (shapely 2.x behaviour) - confirm the installed version.
        for idx in tree_index.query(poly):
            # The pair test is symmetric, so each unordered pair is examined
            # once (idx > i) instead of twice; this also skips idx == i.
            if idx <= i:
                continue
            other = polygons[idx]
            if poly.intersects(other) and not poly.touches(other):
                if poly.intersection(other).area > area_tol:
                    return True
    return False

def get_bounding_box_side(trees):
    """Return the longer side of the axis-aligned box enclosing all trees.

    The box is taken over every exterior vertex of every tree polygon.
    Returns 0 when ``trees`` is empty.
    """
    if not trees:
        return 0
    # Stack every polygon's exterior vertices into one (num_vertices, dims) array.
    vertices = np.vstack([np.array(t.polygon.exterior.coords) for t in trees])
    spans = vertices.max(axis=0) - vertices.min(axis=0)
    return max(spans[0], spans[1])

def score_submission(df, max_n=200):
    """Score a submission frame and list the groups that contain overlaps.

    For every ``n`` from 1 to ``max_n`` the group's trees are loaded and the
    group contributes ``side ** 2 / n`` to the total, where ``side`` is the
    bounding-box side from ``get_bounding_box_side``. A group whose tree count
    is not ``n`` is reported with a warning and contributes nothing.

    Returns ``(total_score, overlaps)`` where ``overlaps`` is the list of ``n``
    values for which ``has_overlap`` reported an overlap.
    """
    running_total = 0
    flagged_groups = []
    for n in range(1, max_n + 1):
        trees = load_trees_for_n(df, n)
        if len(trees) == n:
            if has_overlap(trees):
                flagged_groups.append(n)
            edge = get_bounding_box_side(trees)
            running_total += (edge ** 2) / n
        else:
            # Incomplete group: warn and leave it out of the total.
            print(f"Warning: n={n} has {len(trees)} trees instead of {n}")
    return running_total, flagged_groups

print("Scoring functions defined")

Scoring functions defined


In [3]:
# Check the failed submission for overlaps
# Re-run the local checker on the submission that the Kaggle error message
# ("Overlapping trees in group 004") refers to.
print("Checking the failed submission...")
df_failed = pd.read_csv('/home/submission/submission.csv')
print(f"Shape: {df_failed.shape}")

# Check group 004 specifically
# This is the group named in the error message.
trees_4 = load_trees_for_n(df_failed, 4)
print(f"\nGroup 004 has {len(trees_4)} trees")
print(f"Has overlap: {has_overlap(trees_4)}")

# Check all groups for overlaps
# score_submission returns the total score and the list of n with overlaps.
print("\nChecking all groups for overlaps...")
failed_score, failed_overlaps = score_submission(df_failed)
print(f"Score: {failed_score:.6f}")
print(f"Overlapping groups: {failed_overlaps}")

Checking the failed submission...
Shape: (20100, 4)

Group 004 has 4 trees
Has overlap: False

Checking all groups for overlaps...


Score: 70.676099
Overlapping groups: []


In [4]:
# Check the original santa-2025.csv for overlaps
# Same local check as for the failed submission, applied to the unmodified
# pre-optimized file so the two results can be compared.
print("Checking original santa-2025.csv...")
df_original = pd.read_csv('/home/code/preoptimized/santa-2025.csv')
orig_score, orig_overlaps = score_submission(df_original)
print(f"Score: {orig_score:.6f}")
print(f"Overlapping groups: {orig_overlaps}")

Checking original santa-2025.csv...


Score: 70.676102
Overlapping groups: []


In [5]:
# The issue is likely precision - Kaggle uses stricter overlap detection
# Let's check the coordinates of group 004 in detail
print("Checking group 004 coordinates in failed submission:")
df_failed = pd.read_csv('/home/submission/submission.csv')
group_004 = df_failed[df_failed['id'].str.startswith('004_')]
print(group_004)

print("\nChecking group 004 coordinates in original santa-2025.csv:")
df_original = pd.read_csv('/home/code/preoptimized/santa-2025.csv')
group_004_orig = df_original[df_original['id'].str.startswith('004_')]
print(group_004_orig)

# Compare the raw column values of the two files. A row-count mismatch is
# treated as "different" up front, because an element-wise comparison of
# unequal-length arrays does not yield a per-row result. Identical columns are
# reported too, so the question printed below always gets an answer.
print("\nAre they different?")
for col in ['x', 'y', 'deg']:
    failed_vals = group_004[col].values
    orig_vals = group_004_orig[col].values
    if len(failed_vals) != len(orig_vals) or any(failed_vals != orig_vals):
        print(f"  {col}: DIFFERENT")
        print(f"    Failed: {failed_vals}")
        print(f"    Original: {orig_vals}")
    else:
        print(f"  {col}: identical")

Checking group 004 coordinates in failed submission:
      id                    x                    y                  deg
6  004_0  s-0.324747789589372   s0.132109978088185   s156.3706221456364
7  004_1   s0.315354346242638   s0.132109978063475   s156.3706222692641
8  004_2   s0.324747789592379  s-0.732109978069476    s336.370622269264
9  004_3  s-0.315354348134818  s-0.732109978094186  s336.37062214563645

Checking group 004 coordinates in original santa-2025.csv:
      id                       x                       y  \
6  004_0  s-0.324747789589372171   s0.132109978088185392   
7  004_1   s0.315354346242637695   s0.132109978063475492   
8  004_2   s0.324747789592379210  s-0.732109978069475531   
9  004_3  s-0.315354348134818330  s-0.732109978094185987   

                       deg  
6  s156.370622145636389178  
7  s156.370622269264089255  
8  s336.370622269264003990  
9  s336.370622145636446021  

Are they different?
  x: DIFFERENT
    Failed: ['s-0.324747789589372' 's0.315354

In [6]:
# The failed submission carries fewer digits than the original file (see the
# comparison above), so fall back to the original santa-2025.csv byte-for-byte.
import shutil

original_csv = '/home/code/preoptimized/santa-2025.csv'
submission_csv = '/home/submission/submission.csv'

print("Copying original santa-2025.csv to submission...")
shutil.copy(original_csv, submission_csv)

# Re-read what was just written and score it with the local checker.
df_verify = pd.read_csv(submission_csv)
verify_score, verify_overlaps = score_submission(df_verify)
print(f"Final verification:")
print(f"  Score: {verify_score:.6f}")
print(f"  Overlaps: {verify_overlaps}")
print(f"  Shape: {df_verify.shape}")
print(f"  Target: 68.922808")
print(f"  Gap: {verify_score - 68.922808:.6f}")

# Show group 004 again to confirm the full-precision strings survived the copy.
print("\nPrecision check for group 004:")
is_group_004 = df_verify['id'].str.startswith('004_')
group_004 = df_verify[is_group_004]
print(group_004)

Copying original santa-2025.csv to submission...


Final verification:
  Score: 70.676102
  Overlaps: []
  Shape: (20100, 4)
  Target: 68.922808
  Gap: 1.753294

Precision check for group 004:
      id                       x                       y  \
6  004_0  s-0.324747789589372171   s0.132109978088185392   
7  004_1   s0.315354346242637695   s0.132109978063475492   
8  004_2   s0.324747789592379210  s-0.732109978069475531   
9  004_3  s-0.315354348134818330  s-0.732109978094185987   

                       deg  
6  s156.370622145636389178  
7  s156.370622269264089255  
8  s336.370622269264003990  
9  s336.370622145636446021  


In [None]:
# Score every pre-optimized submission that exists on disk and remember its
# frame, total score and overlapping groups under a short name.
submissions = {
    'santa-2025.csv': '/home/code/preoptimized/santa-2025.csv',
    'bucket-of-chump': '/home/code/preoptimized/bucket-of-chump/submission.csv',
    'chistyakov': '/home/code/preoptimized/chistyakov/submission_best.csv',
    'telegram-71.97': '/home/code/preoptimized/telegram/71.97.csv',
    'telegram-72.49': '/home/code/preoptimized/telegram/72.49.csv',
}

results = {}
for name, path in submissions.items():
    # Missing files are reported and skipped rather than treated as errors.
    if not os.path.exists(path):
        print(f"{name}: FILE NOT FOUND")
        continue
    submission_df = pd.read_csv(path)
    score, overlaps = score_submission(submission_df)
    results[name] = {'score': score, 'overlaps': overlaps, 'df': submission_df}
    print(f"{name}: score={score:.6f}, overlaps={overlaps}")

In [None]:
# Create ensemble - for each N, pick the best configuration from all submissions
def create_ensemble(submissions_dict, max_n=200):
    """Create an ensemble by picking, for each N, the best valid configuration.

    Parameters
    ----------
    submissions_dict : dict
        Maps a source name to a dict; entries without a ``'df'`` key are
        ignored. ``data['df']`` is the submission frame for that source.
    max_n : int, default 200
        Largest group size to assemble (groups 1..max_n).

    Returns
    -------
    pandas.DataFrame
        One row per tree with string columns ``id``, ``x``, ``y``, ``deg``
        (values prefixed with ``s``). Groups for which no source offers a
        complete, overlap-free configuration are reported and left out.

    Notes
    -----
    Coordinates are written from the trees' ``Decimal`` attributes in
    fixed-point form, so every digit that was read from the source file is
    written back. Going through ``float`` would truncate them to about 17
    significant digits, which is the precision loss this notebook identified
    in the rejected submission.
    """
    ensemble_rows = []

    for n in range(1, max_n + 1):
        best_side = float('inf')
        best_trees = None

        for name, data in submissions_dict.items():
            if 'df' not in data:
                continue
            trees = load_trees_for_n(data['df'], n)
            if len(trees) != n:
                continue
            if has_overlap(trees):
                continue  # Skip overlapping configs
            side = get_bounding_box_side(trees)
            # Strict '<' keeps the first source on ties.
            if side < best_side:
                best_side = side
                best_trees = trees

        if best_trees is None:
            print(f"Warning: No valid config for n={n}")
            continue

        # Add to ensemble; the 'f' format avoids exponent notation for small values.
        for i, tree in enumerate(best_trees):
            ensemble_rows.append({
                'id': f"{n:03d}_{i}",
                'x': f"s{tree.center_x:f}",
                'y': f"s{tree.center_y:f}",
                'deg': f"s{tree.angle:f}"
            })

    return pd.DataFrame(ensemble_rows)

# Build the ensemble from every submission loaded into `results`.
print("Creating ensemble from all valid submissions...")
df_ensemble = create_ensemble(results)
print(f"Ensemble shape: {df_ensemble.shape}")

# Score the ensemble
# Uses the same local scorer as the individual submissions, so the numbers
# are directly comparable.
ensemble_score, ensemble_overlaps = score_submission(df_ensemble)
print(f"Ensemble score: {ensemble_score:.6f}")
print(f"Ensemble overlaps: {ensemble_overlaps}")

In [None]:
# Analyze per-N scores to find worst performers
def analyze_per_n_scores(df, max_n=200):
    """Analyze score contribution per N.

    Parameters
    ----------
    df : pandas.DataFrame
        Submission frame, as accepted by ``load_trees_for_n``.
    max_n : int, default 200
        Largest group size to analyze (groups 1..max_n), matching the
        ``max_n`` parameter of ``score_submission``.

    Returns
    -------
    pandas.DataFrame
        One row per complete group with columns ``n``, ``side`` (bounding-box
        side), ``score`` (``side ** 2 / n``) and ``efficiency``
        (``side / sqrt(n)``). Groups whose tree count is not ``n`` are skipped.
    """
    scores = []
    for n in range(1, max_n + 1):
        trees = load_trees_for_n(df, n)
        if len(trees) != n:
            continue
        side = get_bounding_box_side(trees)
        score_n = (side ** 2) / n
        scores.append({'n': n, 'side': side, 'score': score_n, 'efficiency': side / np.sqrt(n)})
    return pd.DataFrame(scores)

# Per-N breakdown for the santa-2025.csv entry of `results`; this lookup
# raises KeyError if that file was not found when the submissions were loaded.
print("Analyzing per-N scores for best submission (santa-2025.csv)...")
df_analysis = analyze_per_n_scores(results['santa-2025.csv']['df'])
print(f"\nTop 10 worst N values by score contribution:")
# nlargest on 'score' lists the groups that add the most to the total.
print(df_analysis.nlargest(10, 'score')[['n', 'side', 'score', 'efficiency']])

In [None]:
# Save the ensemble as the new submission, but only when it is complete and
# overlap-free. Otherwise the submission file already on disk is left untouched
# so an invalid ensemble cannot replace a verified one.
expected_rows = sum(range(1, 201))  # one row per tree for n = 1..200 -> 20100
if len(df_ensemble) == expected_rows and not ensemble_overlaps:
    print("\nSaving ensemble submission...")
    os.makedirs('/home/submission', exist_ok=True)
    df_ensemble.to_csv('/home/submission/submission.csv', index=False)
else:
    print(
        f"\nNOT saving ensemble: rows={len(df_ensemble)} "
        f"(expected {expected_rows}), overlaps={ensemble_overlaps}"
    )

# Final verification
# Always re-read the file on disk, so the numbers below describe what would
# actually be submitted (the ensemble if saved, the previous file otherwise).
df_verify = pd.read_csv('/home/submission/submission.csv')
verify_score, verify_overlaps = score_submission(df_verify)
print(f"Final verification:")
print(f"  Score: {verify_score:.6f}")
print(f"  Overlaps: {verify_overlaps}")
print(f"  Shape: {df_verify.shape}")
print(f"  Target: 68.922808")
print(f"  Gap: {verify_score - 68.922808:.6f}")