# Evolver Loop 8 Analysis: Comprehensive External Ensemble

We now have access to multiple external datasets via the Kaggle API:
1. bucket-of-chump (jazivxt)
2. telegram solutions (72.49.csv, 71.97.csv)
3. saspav/santa-2025.csv
4. chistyakov/submission_best.csv
5. SmartManoj GitHub

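This notebook creates a comprehensive ensemble from the sources loaded below (the SmartManoj GitHub source is not among the files this notebook loads): for each N it keeps the lowest-scoring configuration that has no overlapping trees.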

In [1]:
import math
import numpy as np
import pandas as pd
from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
from shapely.ops import unary_union
import glob
import os

# Decimal arithmetic performed after this point is rounded to 30 significant digits.
getcontext().prec = 30
# Factor applied to tree coordinates before polygons are built; side lengths
# are divided by it again when they are reported.
scale_factor = Decimal('1e15')

print('Libraries loaded')

Libraries loaded


In [2]:
class ChristmasTree:
    """One tree placed in the plane as a scaled shapely polygon.

    The outline is a fixed 15-vertex shape (tip, three tiers, trunk) defined
    around the origin. It is rotated about the origin by ``angle`` and then
    shifted to ``(center_x, center_y)``. All coordinates are multiplied by the
    module-level ``scale_factor`` before the polygon is built.

    Attributes:
        center_x, center_y: Position of the tree's origin, stored as Decimal.
        angle: Rotation passed to ``affinity.rotate`` (degrees, which is
            shapely's default unit), stored as Decimal.
        polygon: The rotated and translated outline, in scaled coordinates.
    """

    def __init__(self, center_x='0', center_y='0', angle='0'):
        # Inputs go through str() first so floats and strings are parsed alike.
        self.center_x = Decimal(str(center_x))
        self.center_y = Decimal(str(center_y))
        self.angle = Decimal(str(angle))

        half = Decimal('2')
        quarter = Decimal('4')

        # Widths of the trunk and of the three tiers.
        trunk_w, trunk_h = Decimal('0.15'), Decimal('0.2')
        base_w, mid_w, top_w = Decimal('0.7'), Decimal('0.4'), Decimal('0.25')

        # Heights at which the outline changes width.
        tip_y = Decimal('0.8')
        tier_1_y = Decimal('0.5')
        tier_2_y = Decimal('0.25')
        base_y = Decimal('0.0')
        trunk_bottom_y = -trunk_h

        # Right-hand half of the outline, walking down from just below the tip.
        right_side = [
            (top_w / half, tier_1_y),
            (top_w / quarter, tier_1_y),
            (mid_w / half, tier_2_y),
            (mid_w / quarter, tier_2_y),
            (base_w / half, base_y),
            (trunk_w / half, base_y),
            (trunk_w / half, trunk_bottom_y),
        ]
        # The left-hand half is the mirror image, walked back up to the tip.
        left_side = [(-x, y) for x, y in reversed(right_side)]
        outline = [(Decimal('0.0'), tip_y)] + right_side + left_side

        initial_polygon = Polygon(
            [(x * scale_factor, y * scale_factor) for x, y in outline]
        )

        # Rotate about the origin first, then move the tree into position.
        rotated = affinity.rotate(initial_polygon, float(self.angle), origin=(0, 0))
        shift_x = float(self.center_x * scale_factor)
        shift_y = float(self.center_y * scale_factor)
        self.polygon = affinity.translate(rotated, xoff=shift_x, yoff=shift_y)

print('ChristmasTree class defined')

ChristmasTree class defined


In [3]:
def get_side_length(trees):
    """Return the side of the smallest axis-aligned square enclosing all trees.

    The bounding box of a set of polygons is the envelope of their individual
    bounding boxes, so it is computed directly from each polygon's ``bounds``
    instead of building the union geometry just to read its extent.

    Args:
        trees: Iterable of objects exposing ``polygon.bounds`` as
            ``(minx, miny, maxx, maxy)`` in scaled coordinates.

    Returns:
        The larger of the bounding-box width and height, divided by
        ``scale_factor``. Returns ``nan`` when ``trees`` is empty.
    """
    boxes = [t.polygon.bounds for t in trees]
    if not boxes:
        # Nothing to measure; report "no size" rather than failing inside min()/max().
        return float('nan')
    width = max(b[2] for b in boxes) - min(b[0] for b in boxes)
    height = max(b[3] for b in boxes) - min(b[1] for b in boxes)
    return max(width, height) / float(scale_factor)

def get_score(trees, n):
    """Score one configuration: squared enclosing-square side divided by ``n``.

    Args:
        trees: Trees of the configuration; an empty or missing collection
            scores as infinity.
        n: Divisor applied to the squared side length.

    Returns:
        ``side ** 2 / n`` as a float, or ``inf`` when there are no trees.
    """
    if trees:
        edge = get_side_length(trees)
        return edge ** 2 / n
    return float('inf')

def has_collision(trees):
    """Tell whether any two trees overlap.

    Two trees collide when their polygons intersect and are not merely
    touching. Every unordered pair is tested once, and the search stops at the
    first colliding pair.

    Args:
        trees: Sequence of objects exposing a ``polygon`` with ``intersects``
            and ``touches`` methods.

    Returns:
        True if at least one pair overlaps, otherwise False. Zero or one tree
        never collides.
    """
    count = len(trees)
    if count <= 1:
        return False
    for first in range(count - 1):
        shape_a = trees[first].polygon
        for second in range(first + 1, count):
            shape_b = trees[second].polygon
            if not shape_a.intersects(shape_b):
                continue
            if shape_a.touches(shape_b):
                # Contact along a boundary only is allowed.
                continue
            return True
    return False

def load_configuration_from_df(n, df):
    """Build the trees of group ``n`` from a submission-style DataFrame.

    Rows belong to group ``n`` when their ``id`` starts with the zero-padded
    three-digit group number followed by an underscore. A leading ``'s'`` on
    the ``x``, ``y`` and ``deg`` values is removed before they are handed to
    ``ChristmasTree``. Rows where any of the three values is empty after that
    are skipped.

    Args:
        n: Group number to select.
        df: DataFrame with string ``id`` plus ``x``, ``y`` and ``deg`` columns.

    Returns:
        List of ``ChristmasTree`` objects in row order.
    """
    def without_prefix(value):
        text = str(value)
        return text[1:] if text.startswith('s') else text

    group_prefix = f'{n:03d}_'
    selected = df[df['id'].str.startswith(group_prefix)]

    loaded = []
    for _, record in selected.iterrows():
        x, y, deg = (without_prefix(record[column]) for column in ('x', 'y', 'deg'))
        if x and y and deg:
            loaded.append(ChristmasTree(x, y, deg))
    return loaded

print('Helper functions defined')

Helper functions defined


In [4]:
# Candidate submissions to compare, keyed by a short label used in all reports.
csv_files = {
    'current_best': '/home/submission/submission.csv',
    'bucket_of_chump': '/home/code/external_data/bucket-of-chump/submission.csv',
    'telegram_72.49': '/home/code/external_data/telegram/72.49.csv',
    'telegram_71.97': '/home/code/external_data/telegram/71.97.csv',
    'saspav': '/home/code/external_data/saspav/santa-2025.csv',
    'chistyakov': '/home/code/external_data/chistyakov/submission_best.csv',
}

# Read every candidate; a file that cannot be read is reported and left out
# of `dataframes` so the remaining sources can still be compared.
dataframes = {}
for name, path in csv_files.items():
    try:
        frame = pd.read_csv(path)
    except Exception as err:
        print(f'Failed to load {name}: {err}')
    else:
        dataframes[name] = frame
        print(f'Loaded {name}: {len(frame)} rows')

print(f'\nTotal sources loaded: {len(dataframes)}')

Loaded current_best: 20100 rows
Loaded bucket_of_chump: 20100 rows
Loaded telegram_72.49: 20100 rows
Loaded telegram_71.97: 20100 rows
Loaded saspav: 20100 rows
Loaded chistyakov: 20100 rows

Total sources loaded: 6


In [5]:
# Score and validate every group N of every loaded source.
print('Calculating scores for all sources and all N values...')
print('This may take a few minutes...\n')

all_scores = {}  # {source_name: {n: score}}
all_configs = {}  # {source_name: {n: trees}}
all_valid = {}  # {source_name: {n: is_valid}}

n_values = range(1, 201)

for name, df in dataframes.items():
    # Register the per-source dictionaries first, then fill them in place.
    scores = all_scores[name] = {}
    configs = all_configs[name] = {}
    valid = all_valid[name] = {}

    for n in n_values:
        trees = load_configuration_from_df(n, df)
        if len(trees) != n:
            # Group N must contain exactly N trees; otherwise it is unusable.
            scores[n] = float('inf')
            configs[n] = None
            valid[n] = False
            continue
        scores[n] = get_score(trees, n)
        configs[n] = trees
        valid[n] = not has_collision(trees)

    # The total only covers groups that passed validation, so it is not
    # comparable between sources with different numbers of invalid groups.
    valid_total = sum(scores[n] for n in n_values if valid[n])
    invalid_count = sum(1 for n in n_values if not valid[n])
    print(f'{name}: total={valid_total:.6f}, invalid={invalid_count}')

print('\nDone calculating scores')

Calculating scores for all sources and all N values...
This may take a few minutes...



current_best: total=70.676104, invalid=0


bucket_of_chump: total=66.737056, invalid=11


telegram_72.49: total=72.495739, invalid=0


telegram_71.97: total=71.972027, invalid=0


saspav: total=66.470080, invalid=12


chistyakov: total=70.926150, invalid=0

Done calculating scores


In [6]:
# Create ensemble: for each N, pick the best valid configuration
from collections import Counter

# Total score the printed "gap" is measured against.
TARGET_SCORE = 68.922808

print('Creating ensemble from all sources...')

ensemble_configs = {}
ensemble_scores = {}
ensemble_sources = {}

for n in range(1, 201):
    best_score = float('inf')
    best_config = None
    best_source = None

    for name in dataframes.keys():
        # Strict '<' means the earliest-listed source wins ties.
        if all_valid[name].get(n, False) and all_scores[name][n] < best_score:
            best_score = all_scores[name][n]
            best_config = all_configs[name][n]
            best_source = name

    ensemble_configs[n] = best_config
    ensemble_scores[n] = best_score
    ensemble_sources[n] = best_source

# An N with no valid configuration in any source keeps a score of inf, which
# makes the whole total inf. Say which N values are responsible.
uncovered = [n for n in range(1, 201) if ensemble_configs[n] is None]
if uncovered:
    print(f'\nWARNING: no valid configuration in any source for N={uncovered}; '
          f'the ensemble total below is inf')

# Calculate ensemble total
ensemble_total = sum(ensemble_scores.values())
print(f'\nEnsemble total score: {ensemble_total:.6f}')
print(f'Gap to target: {ensemble_total - TARGET_SCORE:.6f}')

# Compare to current best. This sums every N of current_best, including any
# that failed validation.
current_total = sum(all_scores['current_best'].values())
print(f'\nCurrent best total: {current_total:.6f}')
print(f'Improvement: {current_total - ensemble_total:.6f}')

# Count sources used
source_counts = Counter(ensemble_sources.values())
print('\nSources used:')
for source, count in source_counts.most_common():
    print(f'  {source}: {count} N values')

Creating ensemble from all sources...

Ensemble total score: 70.676104
Gap to target: 1.753296

Current best total: 70.676104
Improvement: 0.000000

Sources used:
  current_best: 200 N values


In [None]:
# Investigate why saspav and bucket_of_chump aren't being used.
# Their totals are lower, but a source's total only sums the N values that
# passed validation, so a lower total does not by itself mean better
# configurations. List the N values that were excluded and why.
for position, source in enumerate(('saspav', 'bucket_of_chump')):
    lead = '\n' if position else ''
    if source not in all_valid:
        # The loading cell skips files it cannot read.
        print(f"{lead}Investigating {source}: source was not loaded")
        continue

    valid_total = sum(
        all_scores[source][n] for n in range(1, 201) if all_valid[source].get(n, False)
    )
    print(f"{lead}Investigating {source} (score {valid_total:.2f}):")
    print("N values that failed validation:")
    for n in range(1, 201):
        if all_valid[source].get(n, False):
            continue
        # A config of None means the group did not contain exactly n rows;
        # anything else was rejected by the collision check.
        if all_configs[source].get(n) is None:
            print(f"  N={n}: wrong number of trees")
        else:
            print(f"  N={n}: overlap")

In [None]:
# List every N where the ensemble beats the current best submission.
print('\nN values where ensemble is better than current_best:')
improvements = []
baseline_scores = all_scores['current_best']
for n in range(1, 201):
    before = baseline_scores.get(n, float('inf'))
    after = ensemble_scores.get(n, float('inf'))
    if not after < before:
        continue
    gain = before - after
    origin = ensemble_sources[n]
    # Record layout: (n, old score, new score, gain, winning source).
    improvements.append((n, before, after, gain, origin))
    print(f'  N={n}: {before:.6f} -> {after:.6f} (improvement: {gain:.6f}) from {origin}')

print(f'\nTotal improvements: {len(improvements)}')
if improvements:
    total_improvement = sum(entry[3] for entry in improvements)
    print(f'Total score improvement: {total_improvement:.6f}')

In [None]:
# Save the ensemble submission
print('\nSaving ensemble submission...')

# The second path is also the file the 'current_best' source was read from,
# so this cell overwrites the baseline.
output_paths = [
    '/home/code/experiments/009_comprehensive_ensemble/submission.csv',
    '/home/submission/submission.csv',
]

rows = []
for n in range(1, 201):
    trees = ensemble_configs[n]
    if trees is None:
        # Fallback to current_best. That entry was not selected as valid, so
        # make the substitution visible instead of doing it silently.
        print(f'  N={n}: no valid ensemble entry, falling back to current_best')
        trees = all_configs['current_best'][n]
    if trees is None:
        # All rows are built before anything is written, so failing here
        # leaves the existing files untouched.
        raise ValueError(
            f'No configuration available for N={n}; refusing to write an incomplete submission'
        )

    for i, tree in enumerate(trees):
        rows.append({
            'id': f'{n:03d}_{i}',
            'x': f's{tree.center_x}',
            'y': f's{tree.center_y}',
            'deg': f's{tree.angle}'
        })

ensemble_df = pd.DataFrame(rows)
for path in output_paths:
    # The experiment directory may not exist yet on a fresh run.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    ensemble_df.to_csv(path, index=False)

print(f'Saved ensemble with score {ensemble_total:.6f}')
print('Sample rows:')
print(ensemble_df.head())

In [None]:
# Verify the saved submission by reloading it from disk and re-scoring it.
print('\nVerifying saved submission...')
verify_df = pd.read_csv('/home/submission/submission.csv')

verify_total = 0
overlap_count = 0
count_mismatches = 0
for n in range(1, 201):
    trees = load_configuration_from_df(n, verify_df)
    if len(trees) != n:
        # A group with missing or extra rows would otherwise be scored as if
        # it were complete.
        count_mismatches += 1
        print(f'  N={n}: expected {n} trees, found {len(trees)}')
    score = get_score(trees, n)
    verify_total += score
    if has_collision(trees):
        overlap_count += 1
        print(f'  N={n}: OVERLAP!')

print(f'\nVerified total score: {verify_total:.6f}')
print(f'Overlaps: {overlap_count}')
print(f'Tree-count mismatches: {count_mismatches}')
print(f'Gap to target: {verify_total - 68.922808:.6f}')