# Evolver Loop 10 - LB Feedback Analysis

## Key Results
- exp_009 submitted: CV 70.3733 = LB 70.3733 (EXACT MATCH!)
- Gap to target: 1.49 points (68.879 target)
- CV-LB alignment is PERFECT

## Strategy: Ensemble ALL Available Sources

We have many untapped sources:
1. kaggle_datasets/71.97.csv
2. kaggle_datasets/72.49.csv  
3. kaggle_datasets/submission.csv
4. kaggle_datasets/chistyakov/*.csv
5. kaggle_datasets/santa25-public/*.csv (17 files!)

In [2]:
import json
import os
from collections import Counter
from decimal import Decimal, getcontext
from pathlib import Path

import numpy as np
import pandas as pd
from shapely import affinity
from shapely.geometry import Polygon

getcontext().prec = 30
SCALE = Decimal('1e18')

# Define tree shape (from getting-started kernel)
def get_tree_polygon_scaled(center_x, center_y, angle):
    """Build the tree outline for one placement, in SCALE-multiplied coordinates.

    The 15-vertex silhouette (tip, three tiers, trunk) is assembled with
    Decimal arithmetic, multiplied by SCALE, converted to floats, rotated
    about the tree's own origin and finally shifted to the requested centre.

    Args:
        center_x: x position of the tree origin (unscaled).
        center_y: y position of the tree origin (unscaled).
        angle: rotation handed to shapely's ``affinity.rotate`` with its
            default angle units.

    Returns:
        A shapely Polygon whose coordinates are the real coordinates
        multiplied by SCALE.
    """
    two = Decimal('2')
    four = Decimal('4')

    # Horizontal extents of each feature, measured from the centre line.
    trunk_half = Decimal('0.15') / two
    base_half = Decimal('0.7') / two
    mid_half = Decimal('0.4') / two
    mid_quarter = Decimal('0.4') / four
    top_half = Decimal('0.25') / two
    top_quarter = Decimal('0.25') / four

    # Heights of each feature relative to the tree origin.
    y_tip = Decimal('0.8')
    y_tier_1 = Decimal('0.5')
    y_tier_2 = Decimal('0.25')
    y_base = Decimal('0.0')
    y_trunk_bottom = -Decimal('0.2')

    # Right-hand outline, walking from just below the tip down to the
    # bottom-right corner of the trunk.
    right_side = [
        (top_half, y_tier_1),
        (top_quarter, y_tier_1),
        (mid_half, y_tier_2),
        (mid_quarter, y_tier_2),
        (base_half, y_base),
        (trunk_half, y_base),
        (trunk_half, y_trunk_bottom),
    ]

    # The left-hand outline is the mirror image, walked back up to the tip.
    outline = [(Decimal('0.0'), y_tip)]
    outline.extend(right_side)
    outline.extend((-half_w, level) for half_w, level in reversed(right_side))

    shape = Polygon([(float(px * SCALE), float(py * SCALE)) for px, py in outline])
    shape = affinity.rotate(shape, float(angle), origin=(0, 0))

    # Offsets go through str() -> Decimal so the scaling happens in Decimal
    # before the single conversion to float.
    shift_x = float(Decimal(str(center_x)) * SCALE)
    shift_y = float(Decimal(str(center_y)) * SCALE)
    return affinity.translate(shape, xoff=shift_x, yoff=shift_y)

def compute_bbox_score(trees):
    """Return the longer side of the axis-aligned box enclosing all trees.

    Args:
        trees: iterable of dicts with 'x', 'y' and 'deg' entries.

    Returns:
        max(width, height) of the joint bounding box in unscaled units, or
        float('inf') when no coordinates were produced (e.g. empty input).
    """
    xs = []
    ys = []
    for tree in trees:
        outline = get_tree_polygon_scaled(tree['x'], tree['y'], tree['deg'])
        for point in list(outline.exterior.coords):
            xs.append(point[0])
            ys.append(point[1])

    if not xs:
        return float('inf')

    # Polygons live in SCALE-multiplied space; divide to get real units.
    width = (max(xs) - min(xs)) / float(SCALE)
    height = (max(ys) - min(ys)) / float(SCALE)
    return max(width, height)

def validate_no_overlap_strict(trees):
    """Check that no two trees overlap, using the SCALE-multiplied polygons.

    Two polygons that merely touch (shared boundary, no shared interior)
    are accepted; any other intersection is a failure.

    Args:
        trees: iterable of dicts with 'x', 'y' and 'deg' entries.

    Returns:
        True when every pair is disjoint or only touching, else False.
    """
    polygons = [
        get_tree_polygon_scaled(tree['x'], tree['y'], tree['deg'])
        for tree in trees
    ]

    # Visit each unordered pair once, in the same (i, j > i) order.
    for idx, first in enumerate(polygons):
        for second in polygons[idx + 1:]:
            if not first.intersects(second):
                continue
            if first.touches(second):
                continue
            return False
    return True

print('Functions loaded')

Functions loaded


In [3]:
def parse_submission(filepath):
    """Read a submission CSV and group its trees by puzzle size N.

    N is the integer before the first underscore of the 'id' column. The
    'x', 'y' and 'deg' values are converted to float after removing every
    's' character (submissions prefix numbers with 's').

    Args:
        filepath: path to a CSV with 'id', 'x', 'y' and 'deg' columns.

    Returns:
        dict mapping N -> list of {'x', 'y', 'deg'} dicts in file order.
    """
    def _as_float(raw):
        return float(str(raw).replace('s', ''))

    frame = pd.read_csv(filepath)
    by_n = {}
    for record in frame.to_dict('records'):
        n = int(record['id'].split('_')[0])
        x = _as_float(record['x'])
        y = _as_float(record['y'])
        deg = _as_float(record['deg'])
        by_n.setdefault(n, []).append({'x': x, 'y': y, 'deg': deg})
    return by_n

def compute_total_score(solutions):
    """Add up compute_bbox_score over every N from 1 to 200 that is present.

    The result is a plain sum of bounding-box side lengths.
    NOTE(review): the leaderboard figures quoted in this notebook (~70) are
    on a different scale than this sum (~1.1e3) - presumably side**2 / N;
    confirm before comparing the two.

    Args:
        solutions: dict mapping N -> list of tree dicts.

    Returns:
        The accumulated total (0 when no N in 1..200 is present).
    """
    running = 0
    for n in range(1, 201):
        if n not in solutions:
            continue
        running += compute_bbox_score(solutions[n])
    return running

def compute_per_n_scores(solutions):
    """Map each N in 1..200 that has a solution to its compute_bbox_score.

    Args:
        solutions: dict mapping N -> list of tree dicts.

    Returns:
        dict N -> bounding-box side length, in increasing N order; keys
        outside 1..200 are ignored.
    """
    return {
        n: compute_bbox_score(solutions[n])
        for n in range(1, 201)
        if n in solutions
    }

print('Parsing functions loaded')

Parsing functions loaded


In [4]:
# Load current best (exp_009)
# This submission is the baseline that every other source is compared against.
best_path = '/home/code/experiments/009_santa_ensemble/submission.csv'
best_solutions = parse_submission(best_path)  # {N: [tree dicts]}
best_per_n = compute_per_n_scores(best_solutions)  # {N: compute_bbox_score value}
# Sum of the per-N bounding-box side lengths.
# NOTE(review): this is not on the same scale as the ~70 leaderboard numbers
# quoted in the notebook header - confirm before comparing them.
best_total = sum(best_per_n.values())
print(f'Current best (exp_009): {best_total:.6f}')

Current best (exp_009): 1111.679106


In [5]:
# Collect ALL available CSV sources
datasets_root = '/home/code/kaggle_datasets'
sources = []

# Main kaggle_datasets
for f in ['71.97.csv', '72.49.csv', 'submission.csv']:
    path = f'{datasets_root}/{f}'
    if os.path.exists(path):
        sources.append(path)

# Chistyakov and Santa25-public folders.
# A missing folder is skipped (the same tolerance the top-level files get)
# instead of aborting the cell, and entries are sorted because os.listdir
# returns them in arbitrary order: the ensemble keeps the first source that
# reaches a given score, so the order has to be reproducible.
for subdir in ['chistyakov', 'santa25-public']:
    folder = f'{datasets_root}/{subdir}'
    if not os.path.isdir(folder):
        print(f'Skipping missing folder: {folder}')
        continue
    for f in sorted(os.listdir(folder)):
        if f.endswith('.csv'):
            sources.append(f'{folder}/{f}')

print(f'Found {len(sources)} additional sources:')
for s in sources:
    print(f'  {s}')

Found 21 additional sources:
  /home/code/kaggle_datasets/71.97.csv
  /home/code/kaggle_datasets/72.49.csv
  /home/code/kaggle_datasets/submission.csv
  /home/code/kaggle_datasets/chistyakov/submission_best.csv
  /home/code/kaggle_datasets/chistyakov/70.378875862989_20260126_045659.csv
  /home/code/kaggle_datasets/santa25-public/submission_JKoT4.csv
  /home/code/kaggle_datasets/santa25-public/New_Tree_144_196.csv
  /home/code/kaggle_datasets/santa25-public/submission_JKoT3.csv
  /home/code/kaggle_datasets/santa25-public/santa2025_ver2_v61.csv
  /home/code/kaggle_datasets/santa25-public/submission_JKoT2.csv
  /home/code/kaggle_datasets/santa25-public/santa2025_ver2_v67.csv
  /home/code/kaggle_datasets/santa25-public/santa2025_ver2_v76.csv
  /home/code/kaggle_datasets/santa25-public/submission_70_936673758122.csv
  /home/code/kaggle_datasets/santa25-public/santa2025_ver2_v65.csv
  /home/code/kaggle_datasets/santa25-public/submission_70_926149550346.csv
  /home/code/kaggle_datasets/santa2

In [6]:
# Evaluate each source
# A file that cannot be parsed or scored is reported and left out of both
# dictionaries, so later cells only ever see fully evaluated sources.
source_scores = {}
source_solutions = {}

for source_path in sources:
    label = os.path.basename(source_path)
    try:
        parsed = parse_submission(source_path)
        source_total = compute_total_score(parsed)
        source_scores[source_path] = source_total
        source_solutions[source_path] = parsed
        print(f'{label}: {source_total:.6f}')
    except Exception as e:
        print(f'{label}: ERROR - {e}')

71.97.csv: 1125.859260


72.49.csv: 1130.321650


submission.csv: 1113.668219


submission_best.csv: 1116.142291


70.378875862989_20260126_045659.csv: 1111.750202


submission_JKoT4.csv: 1130.261224


New_Tree_144_196.csv: 1134.510516


submission_JKoT3.csv: 1130.261121


santa2025_ver2_v61.csv: 1134.771009


submission_JKoT2.csv: 1130.259807


santa2025_ver2_v67.csv: 1134.625115


santa2025_ver2_v76.csv: 1133.430070


submission_70_936673758122.csv: 1116.257766


santa2025_ver2_v65.csv: 1134.591565


submission_70_926149550346.csv: 1116.142291


santa2025_ver2_v66.csv: 1134.625509


santa2025_ver2_v63.csv: 1134.721577


santa2025_ver2_v69.csv: 1133.671167


submission_JKoT1.csv: 1130.261042


submission_opt1.csv: 1116.821766


santa2025_ver2_v68.csv: 1134.632788


In [None]:
# Sort sources by score
# Ascending order: the lowest (best) total comes first.
sorted_sources = sorted(source_scores.items(), key=lambda item: item[1])
print('\nSources ranked by total score:')
for src_path, src_total in sorted_sources:
    print(f'  {src_total:.4f}: {os.path.basename(src_path)}')

In [None]:
# Now ensemble: for each N, find the BEST solution across ALL sources
# that passes strict validation

all_sources = {'exp_009': best_solutions}
for path, solutions in source_solutions.items():
    # Basenames can repeat across dataset folders (e.g. several
    # "submission.csv"). Qualify a clashing name with its parent folder, and
    # fall back to the full path, so no source silently replaces another.
    source_name = os.path.basename(path)
    if source_name in all_sources:
        source_name = os.path.join(os.path.basename(os.path.dirname(path)), source_name)
    if source_name in all_sources:
        source_name = path
    all_sources[source_name] = solutions

print(f'Total sources for ensemble: {len(all_sources)}')

# Track best per-N
ensemble_solutions = {}
ensemble_sources = {}
improvements = []
skipped_wrong_size = 0  # candidate groups whose tree count is not N

for n in range(1, 201):
    best_score = best_per_n.get(n, float('inf'))
    best_source = 'exp_009'
    best_trees = best_solutions.get(n, [])

    for source_name, solutions in all_sources.items():
        if source_name == 'exp_009':
            continue
        if n not in solutions:
            continue

        trees = solutions[n]

        # Puzzle N must contain exactly N trees. A truncated group has a
        # smaller bounding box and no overlaps, so without this check it
        # would win the comparison and produce an invalid submission.
        if len(trees) != n:
            skipped_wrong_size += 1
            continue

        score = compute_bbox_score(trees)

        if score < best_score - 1e-9:  # Found improvement
            # Validate no overlaps (only for real candidates; it is O(N^2))
            if validate_no_overlap_strict(trees):
                best_score = score
                best_source = source_name
                best_trees = trees

    ensemble_solutions[n] = best_trees
    ensemble_sources[n] = best_source

    if best_source != 'exp_009':
        improvement = best_per_n.get(n, 0) - best_score
        improvements.append((n, improvement, best_source))

if skipped_wrong_size:
    print(f'Skipped {skipped_wrong_size} candidate groups with a wrong tree count')

print(f'\nFound {len(improvements)} improvements over exp_009')
if improvements:
    improvements.sort(key=lambda x: -x[1])
    print('\nTop 20 improvements:')
    for n, imp, src in improvements[:20]:
        print(f'  N={n}: +{imp:.6f} from {src}')

In [None]:
# Compute ensemble total score
ensemble_per_n = compute_per_n_scores(ensemble_solutions)
ensemble_total = sum(ensemble_per_n.values())

# compute_bbox_score() returns the bounding-square side s for each N, so the
# two totals above are sums of side lengths (~1.1e3). The target below is a
# leaderboard score, which sums s**2 / N over all puzzles, so both totals are
# converted to that scale before the gap is measured.
# Sanity check: the exp_009 value should reproduce the ~70.37 quoted for it.
best_lb_score = sum(side ** 2 / n for n, side in best_per_n.items())
ensemble_lb_score = sum(side ** 2 / n for n, side in ensemble_per_n.items())

print(f'\nExp_009 total: {best_total:.6f}')
print(f'Ensemble total: {ensemble_total:.6f}')
print(f'Improvement: {best_total - ensemble_total:.6f}')
print(f'\nExp_009 leaderboard-scale score: {best_lb_score:.6f}')
print(f'Ensemble leaderboard-scale score: {ensemble_lb_score:.6f}')
print(f'\nTarget: 68.879467')
print(f'Gap: {ensemble_lb_score - 68.879467:.6f}')

In [None]:
# Count sources used
# Counter is imported with the other imports in the notebook's first code cell.
source_counts = Counter(ensemble_sources.values())
print('\nSources used in ensemble:')
for src, count in source_counts.most_common():
    print(f'  {src}: {count} N values')

In [None]:
# Save ensemble if it's better
# The comparison uses the side-length totals; since the ensemble only ever
# replaces a puzzle with a strictly smaller box, a lower side-length total
# also means a lower leaderboard score.
if ensemble_total < best_total - 0.0001:
    print('\n=== SAVING IMPROVED ENSEMBLE ===')

    # Create submission CSV
    rows = []
    for n in range(1, 201):
        trees = ensemble_solutions[n]
        for i, tree in enumerate(trees):
            rows.append({
                'id': f'{n:03d}_{i}',
                'x': f"s{tree['x']}",
                'y': f"s{tree['y']}",
                'deg': f"s{tree['deg']}"
            })

    submission_df = pd.DataFrame(rows)

    # Save to experiment folder
    exp_folder = '/home/code/experiments/010_full_ensemble'
    os.makedirs(exp_folder, exist_ok=True)
    submission_df.to_csv(f'{exp_folder}/submission.csv', index=False)

    # The target is a leaderboard score (sum of side**2 / N), whereas
    # ensemble_total / best_total are sums of side lengths. Convert before
    # recording, so cv_score and gap are comparable with the target and with
    # the leaderboard; the raw side-length totals are kept under their own keys.
    target_score = 68.879467
    best_lb_score = sum(side ** 2 / n for n, side in best_per_n.items())
    ensemble_lb_score = sum(side ** 2 / n for n, side in ensemble_per_n.items())

    # Save metrics
    metrics = {
        'cv_score': ensemble_lb_score,
        'baseline_score': best_lb_score,
        'improvement': best_lb_score - ensemble_lb_score,
        'n_improved': len(improvements),
        'sources_used': dict(source_counts),
        'target': target_score,
        'gap': ensemble_lb_score - target_score,
        'side_sum': ensemble_total,
        'baseline_side_sum': best_total
    }
    with open(f'{exp_folder}/metrics.json', 'w') as f:
        json.dump(metrics, f, indent=2)

    # Copy to submission folder (create it first; only the experiment folder
    # was guaranteed to exist before)
    os.makedirs('/home/submission', exist_ok=True)
    submission_df.to_csv('/home/submission/submission.csv', index=False)

    print(f'Saved to {exp_folder}')
    print(f'Score: {ensemble_total:.6f}')
    print(f'Improvement: {best_total - ensemble_total:.6f}')
    print(f'Leaderboard-scale score: {ensemble_lb_score:.6f}')
else:
    print('\nNo improvement found from additional sources')