# Evolver Loop 15 Analysis

## Current State
- Best CV: 70.316708 (exp_018)
- Target: 68.876781
- Gap: 1.44 points (2.05% of the current best CV)

## Key Questions
1. Which sources haven't been ensembled yet?
2. Is the "rebuild from corners" bug real?
3. What's the theoretical lower bound?

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
from shapely.ops import unary_union
import glob
import os

# Tree shape: outline vertices of a single tree in its local frame, i.e.
# before create_tree_polygon rotates and translates it. TX[i] and TY[i] are
# the x and y coordinates of vertex i (15 vertices in total).
TX = np.array([0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125])
TY = np.array([0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5])

def create_tree_polygon(x, y, angle):
    """Build the tree outline rotated by `angle` and placed at (x, y).

    The outline is taken from the module-level TX/TY vertex arrays. It is
    rotated about the local origin (0, 0) first and translated afterwards.
    All three arguments go through float(), so numeric strings are accepted.
    The angle is handed to shapely's affinity.rotate as-is (degrees by
    default in shapely).

    Returns:
        The placed shapely Polygon.
    """
    outline = Polygon(list(zip(TX, TY)))
    rotated = affinity.rotate(outline, float(angle), origin=(0, 0))
    return affinity.translate(rotated, float(x), float(y))

def get_bbox_side(trees):
    """Return the longer side of the axis-aligned bounding box of `trees`.

    Args:
        trees: sequence of dicts with 'x', 'y' and 'deg' entries, each
            accepted by create_tree_polygon.

    Returns:
        max(width, height) of the box enclosing every tree polygon, or 0 for
        an empty sequence.
    """
    if len(trees) == 0:
        return 0
    # The bounding box of a union is the envelope of the individual bounding
    # boxes, so the (comparatively expensive) unary_union is not needed just
    # to read the overall bounds.
    boxes = [create_tree_polygon(t['x'], t['y'], t['deg']).bounds for t in trees]
    min_x = min(b[0] for b in boxes)
    min_y = min(b[1] for b in boxes)
    max_x = max(b[2] for b in boxes)
    max_y = max(b[3] for b in boxes)
    return max(max_x - min_x, max_y - min_y)

def get_score(trees, n):
    """Score a layout as squared bounding-box side divided by `n`.

    The side comes from get_bbox_side(trees). `n` is used only as the
    divisor; it is not checked against len(trees).
    """
    bbox_side = get_bbox_side(trees)
    squared_side = bbox_side ** 2
    return squared_side / n


print('Functions defined')

Functions defined


In [2]:
# Load the current best submission and score every N group in it.
df = pd.read_csv('/home/code/experiments/018_rebuild_corners_fixed/submission.csv')
# The part of `id` before the first underscore is used as the group size N.
df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)

current_scores = {}
for n, group in df.groupby('N'):
    # Each value is stringified and has every 's' character removed; the
    # resulting text is converted to float later, inside create_tree_polygon.
    trees = [
        {
            'x': str(row['x']).replace('s', ''),
            'y': str(row['y']).replace('s', ''),
            'deg': str(row['deg']).replace('s', ''),
        }
        for _, row in group.iterrows()
    ]
    current_scores[n] = get_score(trees, n)

current_total = sum(current_scores.values())
print(f'Current total: {current_total:.6f}')
print(f'Target: 68.876781')
print(f'Gap: {current_total - 68.876781:.6f}')

Current total: 70.316708
Target: 68.876781
Gap: 1.439927


In [3]:
# List every submission*.csv found exactly one directory below kaggle_datasets.
sources = glob.glob('/home/code/kaggle_datasets/*/submission*.csv')
print(f'Total sources: {len(sources)}')
for path in sorted(sources):
    # Show only "<dataset directory>/<file name>".
    parts = path.split("/")
    print(f'  {parts[-2]}/{parts[-1]}')

Total sources: 46
  aikhmelnytskyy_sa/submission.csv
  bucket-of-chump/submission.csv
  chistyakov/submission_best.csv
  chistyakov_best_public/submission_best.csv
  chistyakov_best_public_new/submission_best.csv
  chistyakov_packed/submission_best.csv
  chistyakov_slow/submission.csv
  crodoc_backpacking/submission.csv
  datafad_boxes_shrunk/submission.csv
  datafad_bronze/submission.csv
  datafad_bronze_new/submission.csv
  datafad_newyear/submission.csv
  egortrushin_improved_sa/submission.csv
  egortrushin_improved_sa_new/submission.csv
  egortrushin_sa_translations/submission.csv
  eyestrain_blending/submission.csv
  eyestrain_blending_new/submission.csv
  hvanphucs_ensemble/submission.csv
  hvanphucs_ensemble/submission_ensemble.csv
  jazivxt_team_blend/submission.csv
  jazivxt_team_blend/submission_ensemble.csv
  jazivxt_why_not/submission.csv
  kumarandatascientist_ensemble/submission.csv
  kumarandatascientist_ensemble/submission_ensembled.csv
  nctuan_challenge/submission.csv

In [4]:
# Check new sources that might not have been ensembled
new_sources = [
    '/home/code/kaggle_datasets/kumarandatascientist_ensemble/submission.csv',
    '/home/code/kaggle_datasets/kumarandatascientist_ensemble/submission_ensembled.csv',
    '/home/code/kaggle_datasets/crodoc_backpacking/submission.csv',
    '/home/code/kaggle_datasets/chistyakov_slow/submission.csv',
    '/home/code/kaggle_datasets/zaburo_aligned/submission.csv',
    '/home/code/kaggle_datasets/datafad_newyear/submission.csv',
]

for src in new_sources:
    dataset = src.split("/")[-2]
    if not os.path.exists(src):
        # Report absent files explicitly; silently skipping them would make a
        # missing download look the same as a source that was checked.
        print(f'{dataset}: MISSING - {src}')
        continue
    try:
        df_src = pd.read_csv(src)
        print(f'{dataset}: {len(df_src)} rows')
    except Exception as e:
        # Best-effort probe: show the reason and carry on with the next source.
        print(f'{dataset}: ERROR - {e}')

kumarandatascientist_ensemble: 20100 rows
kumarandatascientist_ensemble: 20100 rows
crodoc_backpacking: 20100 rows
chistyakov_slow: 20100 rows
zaburo_aligned: 20100 rows
datafad_newyear: 20100 rows


In [5]:
# Break the total score down by ranges of N (both ends inclusive).
ranges = [(1, 1), (2, 5), (6, 10), (11, 50), (51, 100), (101, 200)]
grand_total = sum(current_scores.values())
for start, end in ranges:
    range_score = sum([current_scores[k] for k in range(start, end + 1)])
    pct = range_score / grand_total * 100
    print(f'N={start:3d}-{end:3d}: {range_score:8.4f} ({pct:5.1f}%)')

N=  1-  1:   0.6612 (  0.9%)
N=  2-  5:   1.7189 (  2.4%)
N=  6- 10:   1.9445 (  2.8%)
N= 11- 50:  14.6279 ( 20.8%)
N= 51-100:  17.4772 ( 24.9%)
N=101-200:  33.8869 ( 48.2%)


In [6]:
# Test rebuild from corners with POLYGON BOUNDS (the fix)
def rebuild_from_corners_fixed(large_layout, target_n, current_best_score):
    """Try to carve a better `target_n`-tree layout out of a larger layout.

    For each of the four corners of the large layout's bounding box, trees are
    ranked by the Chebyshev distance from that corner to the farthest edge of
    the tree's own polygon bounds (polygon bounds, not the tree centre). The
    `target_n` nearest trees form a candidate subset, scored with get_score.

    Args:
        large_layout: sequence of tree dicts with 'x', 'y' and 'deg' entries.
        target_n: number of trees to keep.
        current_best_score: score a candidate has to beat.

    Returns:
        (subset, score) for the best candidate when it beats
        `current_best_score` by more than 1e-9; otherwise None. None is also
        returned when `target_n` is not positive or the layout does not hold
        more than `target_n` trees.
    """
    # A non-positive target would divide by zero in get_score (or slice from
    # the wrong end), so treat it like "nothing to extract".
    if target_n <= 0 or len(large_layout) <= target_n:
        return None

    # Build each polygon once and keep only its bounds; they are reused for
    # the overall bounding box and for all four corner rankings.
    tree_bounds = [
        create_tree_polygon(t['x'], t['y'], t['deg']).bounds  # (minx, miny, maxx, maxy)
        for t in large_layout
    ]

    # Bounds of the entire layout = envelope of the per-tree bounds.
    minx = min(b[0] for b in tree_bounds)
    miny = min(b[1] for b in tree_bounds)
    maxx = max(b[2] for b in tree_bounds)
    maxy = max(b[3] for b in tree_bounds)

    corners = [(minx, miny), (minx, maxy), (maxx, miny), (maxx, maxy)]

    best_subset = None
    best_score = current_best_score

    for corner_x, corner_y in corners:
        # Chebyshev distance from the corner to the farthest polygon edge.
        distances = [
            max(
                abs(b[0] - corner_x),  # polygon minx
                abs(b[2] - corner_x),  # polygon maxx
                abs(b[1] - corner_y),  # polygon miny
                abs(b[3] - corner_y),  # polygon maxy
            )
            for b in tree_bounds
        ]

        # sorted() is stable, so equally distant trees keep their layout order.
        nearest = sorted(range(len(large_layout)), key=distances.__getitem__)[:target_n]
        subset = [large_layout[i] for i in nearest]

        score = get_score(subset, target_n)
        if score < best_score - 1e-9:
            best_score = score
            best_subset = subset

    return (best_subset, best_score) if best_subset is not None else None

print('Rebuild from corners function defined (with polygon bounds fix)')

Rebuild from corners function defined (with polygon bounds fix)


In [7]:
# Try the corner rebuild on a few N values, using the N=200 layout as source.
print('Testing rebuild from corners with polygon bounds fix...')
print('Loading N=200 layout as source...')

df_200 = df[df['N'] == 200]
trees_200 = [
    {
        'x': str(row['x']).replace('s', ''),
        'y': str(row['y']).replace('s', ''),
        'deg': str(row['deg']).replace('s', ''),
    }
    for _, row in df_200.iterrows()
]

print(f'N=200 has {len(trees_200)} trees')
print(f'N=200 score: {current_scores[200]:.6f}')

# Test on N=50, 100, 150
for target_n in [50, 100, 150]:
    baseline = current_scores[target_n]
    result = rebuild_from_corners_fixed(trees_200, target_n, baseline)
    if not result:
        print(f'N={target_n}: No improvement found')
        continue
    subset, score = result
    improvement = baseline - score
    print(f'N={target_n}: IMPROVED by {improvement:.6f} (from {baseline:.6f} to {score:.6f})')

Testing rebuild from corners with polygon bounds fix...
Loading N=200 layout as source...
N=200 has 200 trees
N=200 score: 0.337548


N=50: No improvement found


N=100: No improvement found


N=150: No improvement found


In [None]:
# Helpers for the strict overlap validation and per-N CSV loading that the
# new-source improvement check relies on.
from decimal import Decimal, getcontext
# Decimal arithmetic (including the products with SCALE_FACTOR) is carried out
# with 30 significant digits.
getcontext().prec = 30
# Every coordinate is multiplied by this factor before the polygons used for
# the strict overlap check are built.
SCALE_FACTOR = Decimal('1e18')

def create_high_precision_tree(x, y, angle):
    """Build the tree polygon in coordinates scaled by SCALE_FACTOR.

    `x`, `y` and `angle` are parsed as Decimal from their str() form. The
    outline vertices are scaled with Decimal arithmetic and converted to float,
    the polygon is rotated by `angle` about the origin, then translated by the
    scaled (x, y) offset.

    Returns:
        The scaled, rotated and translated shapely Polygon.
    """
    x_dec = Decimal(str(x))
    y_dec = Decimal(str(y))
    angle_dec = Decimal(str(angle))
    scale = SCALE_FACTOR

    # Outline vertices as decimal literals, in drawing order.
    outline = (
        ('0.0', '0.8'),
        ('0.125', '0.5'),
        ('0.0625', '0.5'),
        ('0.2', '0.25'),
        ('0.1', '0.25'),
        ('0.35', '0.0'),
        ('0.075', '0.0'),
        ('0.075', '-0.2'),
        ('-0.075', '-0.2'),
        ('-0.075', '0.0'),
        ('-0.35', '0.0'),
        ('-0.1', '0.25'),
        ('-0.2', '0.25'),
        ('-0.0625', '0.5'),
        ('-0.125', '0.5'),
    )
    vertices = [
        (float(Decimal(vx) * scale), float(Decimal(vy) * scale))
        for vx, vy in outline
    ]

    scaled = Polygon(vertices)
    rotated = affinity.rotate(scaled, float(angle_dec), origin=(0, 0))
    return affinity.translate(rotated, xoff=float(x_dec * scale), yoff=float(y_dec * scale))

def validate_no_overlap_strict(trees_data):
    """Return True when no two trees overlap.

    Polygons come from create_high_precision_tree. A pair counts as
    overlapping when the polygons intersect without merely touching. Layouts
    with zero or one tree are valid by definition.
    """
    count = len(trees_data)
    if count <= 1:
        return True

    shapes = [create_high_precision_tree(t['x'], t['y'], t['deg']) for t in trees_data]
    for a in range(count):
        first = shapes[a]
        for b in range(a + 1, count):
            second = shapes[b]
            # Contact along boundaries is allowed; anything beyond that is not.
            if first.intersects(second) and not first.touches(second):
                return False
    return True

# Parsed source files keyed by (path, modification time): a file is read once
# no matter how many N values are requested from it, and read again only if it
# changes on disk.
_SOURCE_FRAME_CACHE = {}


def _read_source_frame(csv_path):
    """Return the CSV at `csv_path` with an 'N' column, or None if it has neither 'id' nor 'n'."""
    cache_key = (csv_path, os.path.getmtime(csv_path))
    if cache_key not in _SOURCE_FRAME_CACHE:
        frame = pd.read_csv(csv_path)
        if 'id' in frame.columns:
            # The part of `id` before the first underscore is the group size.
            frame['N'] = frame['id'].astype(str).str.split('_').str[0].astype(int)
        elif 'n' in frame.columns:
            frame['N'] = frame['n']
        else:
            frame = None
        _SOURCE_FRAME_CACHE[cache_key] = frame
    return _SOURCE_FRAME_CACHE[cache_key]


def load_trees_from_csv(csv_path, n):
    """Load the `n`-tree layout stored in a submission CSV.

    Args:
        csv_path: path of a CSV with 'x', 'y', 'deg' columns plus either an
            'id' column (group size before the first underscore) or an 'n'
            column.
        n: group to extract; the group must contain exactly `n` rows.

    Returns:
        A list of {'x', 'y', 'deg'} dicts whose values are strings with every
        's' character removed, or None when the file is missing or unreadable,
        lacks the expected columns, or does not hold exactly `n` rows for `n`.
    """
    try:
        frame = _read_source_frame(csv_path)
        if frame is None:
            return None

        group = frame[frame['N'] == n]
        if len(group) != n:
            return None

        return [
            {
                'x': str(x).replace('s', ''),
                'y': str(y).replace('s', ''),
                'deg': str(deg).replace('s', ''),
            }
            for x, y, deg in zip(group['x'], group['y'], group['deg'])
        ]
    except (OSError, ValueError, KeyError):
        # Missing/unreadable file, unparsable content or ids, or absent
        # columns: the source has no usable layout for this n. Anything else
        # is a programming error and is allowed to surface.
        return None

print('Validation functions defined')

In [None]:
# Check new sources for improvements
new_sources_to_check = [
    '/home/code/kaggle_datasets/kumarandatascientist_ensemble/submission.csv',
    '/home/code/kaggle_datasets/kumarandatascientist_ensemble/submission_ensembled.csv',
    '/home/code/kaggle_datasets/crodoc_backpacking/submission.csv',
    '/home/code/kaggle_datasets/chistyakov_slow/submission.csv',
    '/home/code/kaggle_datasets/zaburo_aligned/submission.csv',
    '/home/code/kaggle_datasets/datafad_newyear/submission.csv',
]

print('Checking new sources for improvements...')
# One (n, score gain, source directory name) entry per improved layout.
total_improvements = []

for csv_file in new_sources_to_check:
    source_name = csv_file.split('/')[-2]
    source_improvements = 0

    for n in range(1, 201):
        trees = load_trees_from_csv(csv_file, n)
        if trees is None:
            continue

        score = get_score(trees, n)

        # The pairwise overlap validation is the expensive step, so it only
        # runs for layouts that actually beat the current best score.
        if score < current_scores[n] - 1e-9 and validate_no_overlap_strict(trees):
            improvement = current_scores[n] - score
            total_improvements.append((n, improvement, source_name))
            source_improvements += 1

    print(f'{source_name}: {source_improvements} improvements')

# '\n' (not '\\n') so that a blank line is printed rather than a literal
# backslash followed by "n".
print(f'\nTotal improvements found: {len(total_improvements)}')