# Loop 4 Analysis: Expanding the Ensemble

## Key Insight from Evaluator
The jonathanchan kernel combines 15+ solution sources, but we've only tried 3 (boc, saspav, smartmanoj).
The C++ SA optimizer found 0 improvements because the ensemble is too narrow.

## New Sources Downloaded:
1. telegram-public-shared-solution: 71.97.csv, 72.49.csv
2. chistyakov: submission_best.csv
3. bucket-of-chump: submission.csv (not new — we already have it; this is the current best, `boc`)

Let's combine ALL sources and, for each N, keep the configuration with the lowest score.

In [None]:
import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
from shapely.ops import unary_union
import json
import os

# Set the precision of the current Decimal context to 30 significant digits.
getcontext().prec = 30
# Multiplier applied to every tree coordinate when the polygons are built;
# a value of 1 leaves the coordinates unscaled.
scale_factor = 1

class ChristmasTree:
    """A tree-shaped polygon placed at a given centre with a given rotation.

    The 15-vertex outline (tip, three tiers, trunk) is built around the
    origin, rotated about ``(0, 0)`` by ``angle`` (degrees, shapely's default
    unit for ``affinity.rotate``) and then translated by
    ``(center_x, center_y)``. All coordinates are multiplied by the
    module-level ``scale_factor``.

    Attributes set by the constructor:
        center_x, center_y, angle: the inputs, converted via ``Decimal(str(...))``.
        polygon: the final shapely geometry.
    """

    def __init__(self, center_x='0', center_y='0', angle='0'):
        self.center_x = Decimal(str(center_x))
        self.center_y = Decimal(str(center_y))
        self.angle = Decimal(str(angle))

        two = Decimal('2')
        four = Decimal('4')

        # Widths of the trunk and of the three tiers.
        trunk_w = Decimal('0.15')
        base_w = Decimal('0.7')
        mid_w = Decimal('0.4')
        top_w = Decimal('0.25')

        # Heights at which the outline changes direction.
        tip_y = Decimal('0.8')
        tier_1_y = Decimal('0.5')
        tier_2_y = Decimal('0.25')
        base_y = Decimal('0.0')
        trunk_bottom_y = -Decimal('0.2')

        # Right-hand half of the silhouette, from the tip down to the
        # bottom-right corner of the trunk.
        right_side = [
            (Decimal('0.0'), tip_y),
            (top_w / two, tier_1_y),
            (top_w / four, tier_1_y),
            (mid_w / two, tier_2_y),
            (mid_w / four, tier_2_y),
            (base_w / two, base_y),
            (trunk_w / two, base_y),
            (trunk_w / two, trunk_bottom_y),
        ]
        # The left-hand half is the mirror image walked back up; the tip lies
        # on the axis of symmetry and is therefore not repeated.
        left_side = [(-x, y) for x, y in reversed(right_side[1:])]

        outline = [
            (float(x * scale_factor), float(y * scale_factor))
            for x, y in right_side + left_side
        ]

        turned = affinity.rotate(Polygon(outline), float(self.angle), origin=(0, 0))
        shift_x = float(self.center_x * scale_factor)
        shift_y = float(self.center_y * scale_factor)
        self.polygon = affinity.translate(turned, xoff=shift_x, yoff=shift_y)

def get_tree_list_side_length(tree_list):
    """Return the longer side of the axis-aligned bounding box of the trees.

    Each element of ``tree_list`` must expose a ``polygon`` attribute. The
    polygons are merged with ``unary_union`` and the larger of the merged
    geometry's width and height is returned.
    """
    merged = unary_union([tree.polygon for tree in tree_list])
    extent = merged.bounds  # (min_x, min_y, max_x, max_y)
    width = extent[2] - extent[0]
    height = extent[3] - extent[1]
    return max(width, height)

def load_solution(csv_path):
    """Read a submission CSV and group its tree placements by puzzle size.

    The file must have the columns ``id``, ``x``, ``y`` and ``deg``. Each
    ``id`` has the form ``'<group>_<item>'``; the part before the first
    underscore is converted to ``int`` and used as the dictionary key.
    Surrounding whitespace and any leading ``'s'`` characters are stripped
    from ``x``, ``y`` and ``deg``.

    Args:
        csv_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        A dict mapping each group number to a list of ``(x, y, deg)`` string
        tuples, in the order the rows appear in the file.
    """
    # Read every cell as text. Letting pandas infer dtypes would parse
    # unprefixed numbers to float64 and drop digits before they ever reach
    # Decimal.
    df = pd.read_csv(csv_path, dtype=str)
    for column in ('x', 'y', 'deg'):
        df[column] = df[column].astype(str).str.strip().str.lstrip('s')

    # Split only on the first underscore so that an id containing further
    # underscores still yields a single group key.
    group_ids = df['id'].str.split('_', n=1).str[0].rename('group_id')

    solution = {}
    for group_id, group_data in df.groupby(group_ids):
        solution[int(group_id)] = list(
            zip(group_data['x'], group_data['y'], group_data['deg'])
        )

    return solution

def score_config(trees_data):
    """Return the score of one configuration: bounding side squared over tree count.

    ``trees_data`` is a sequence of ``(x, y, deg)`` triples. One
    ``ChristmasTree`` is built per triple, the side length of the square
    enclosing all of them is measured, and ``side ** 2 / len(trees_data)`` is
    returned.
    """
    placed = []
    for x, y, deg in trees_data:
        placed.append(ChristmasTree(x, y, deg))
    return get_tree_list_side_length(placed) ** 2 / len(trees_data)

# Progress message: reaching this line means the definitions above executed.
print('Functions defined.')

In [None]:
# Load all available solutions
solutions = {}

# Sources that are expected to exist, as (name, path) pairs. They are loaded
# in this order, which is also the iteration order of `solutions`.
required_sources = [
    # 1. Current best (bucket-of-chump)
    ('boc', '/home/code/datasets/bucket-of-chump/submission.csv'),
    # 2. Telegram 71.97
    ('telegram_71.97', '/home/code/datasets/telegram/71.97.csv'),
    # 3. Telegram 72.49
    ('telegram_72.49', '/home/code/datasets/telegram/72.49.csv'),
    # 4. Chistyakov best
    ('chistyakov', '/home/code/datasets/chistyakov/submission_best.csv'),
]
for name, path in required_sources:
    solutions[name] = load_solution(path)
    print(f'Loaded {name}: {len(solutions[name])} N values')

# 5. saspav (if available)
# This source is optional, so a failure to load it must not stop the run.
# Only ordinary errors are caught: a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit and would report every problem as "not found".
try:
    saspav_solution = load_solution('/home/code/exploration/preoptimized/submission.csv')
except FileNotFoundError:
    print('saspav not found')
except Exception as exc:
    # The file exists but could not be read or parsed; say so instead of
    # pretending it is missing.
    print(f'saspav skipped ({type(exc).__name__}: {exc})')
else:
    solutions['saspav'] = saspav_solution
    print(f'Loaded saspav: {len(solutions["saspav"])} N values')

In [None]:
# Calculate scores for each solution
print('Calculating scores for each solution...')
scores = {}
for name, sol in solutions.items():
    # Score only the N values this source actually contains. Indexing sol[n]
    # unconditionally would raise KeyError for a partial source, even though
    # the per-N selection later checks `n in sol` before using a source.
    sol_scores = {n: score_config(sol[n]) for n in range(1, 201) if n in sol}
    total = sum(sol_scores.values())
    scores[name] = {'per_n': sol_scores, 'total': total}
    print(f'{name}: {total:.6f}')

    # A partial total is not comparable with the others; flag it explicitly.
    missing = 200 - len(sol_scores)
    if missing:
        print(f'  WARNING: {name} is missing {missing} of 200 N values; total is partial')

print(f'\nTarget: 68.919154')

In [None]:
# Find best per-N from all solutions
from collections import Counter

print('\nFinding best per-N configuration...')
best_ensemble = {}    # n -> winning list of (x, y, deg)
best_sources = {}     # n -> name of the source that supplied it
ensemble_scores = {}  # n -> score of the winning configuration

for n in range(1, 201):
    best_score = float('inf')
    best_source = None
    best_config = None

    for name, sol in solutions.items():
        if n in sol:
            score = scores[name]['per_n'][n]
            # Strict '<' means that on a tie the source iterated first wins.
            if score < best_score:
                best_score = score
                best_source = name
                best_config = sol[n]

    if best_config is None:
        # Without this check the failure would surface later as an opaque
        # TypeError when the missing configuration is iterated.
        raise ValueError(f'No source provides a usable configuration for N={n}')

    best_ensemble[n] = best_config
    best_sources[n] = best_source
    # The winner's score is already known, so keep it rather than running
    # score_config again on the same configuration.
    ensemble_scores[n] = best_score

# Count wins per source
wins = Counter(best_sources.values())
print('\nWins per source:')
for name, count in wins.most_common():
    print(f'  {name}: {count}')

# Calculate ensemble total score
ensemble_total = sum(ensemble_scores.values())
print(f'\nEnsemble total score: {ensemble_total:.6f}')
print(f'Target: 68.919154')
print(f'Gap: {ensemble_total - 68.919154:.6f}')

In [None]:
# Show which N values improved from the new sources
print('\nN values where new sources beat boc:')

# One entry per improved N: (n, winning source, boc score, ensemble score, gain).
improvements = []
boc_per_n = scores['boc']['per_n']
for n in range(1, 201):
    baseline = boc_per_n[n]
    blended = ensemble_scores[n]
    # Differences of 1e-9 or less are treated as no improvement.
    if blended < baseline - 1e-9:
        winner = best_sources[n]
        gain = baseline - blended
        improvements.append((n, winner, baseline, blended, gain))
        print(f'  N={n:3d}: {winner:15s} {baseline:.6f} -> {blended:.6f} (improvement: {gain:.9f})')

print(f'\nTotal improvements: {len(improvements)}')
if improvements:
    total_improvement = sum(gain for *_, gain in improvements)
    print(f'Total score improvement: {total_improvement:.9f}')

In [None]:
# Save the best ensemble as submission
# One row per tree: the id is '<N zero-padded to 3 digits>_<tree index>' and
# every coordinate is written with an 's' prefix.
rows = [
    {'id': f'{n:03d}_{i}', 'x': f's{x}', 'y': f's{y}', 'deg': f's{deg}'}
    for n in range(1, 201)
    for i, (x, y, deg) in enumerate(best_ensemble[n])
]

submission_df = pd.DataFrame(rows)
submission_df.to_csv('/home/submission/submission.csv', index=False)
print(f'Saved submission with {len(submission_df)} rows')
print(f'Final score: {ensemble_total:.6f}')

In [None]:
# Save metrics
# Summary of this run: the ensemble total, the sources that were loaded, how
# many N values each source won, and how much was gained over boc.
metrics = dict(
    cv_score=ensemble_total,
    sources_used=[source_name for source_name in solutions],
    wins_per_source=dict(wins),
    num_improvements_over_boc=len(improvements),
    # The fifth field of each entry is the per-N gain; sum() of nothing is 0.
    total_improvement_over_boc=sum(entry[4] for entry in improvements),
)

with open('/home/code/exploration/loop4_metrics.json', 'w') as f:
    f.write(json.dumps(metrics, indent=2))

print(f'Metrics saved: {metrics}')