# Evolver Loop 4 Analysis: Comprehensive Source Exploration

## Key Insight from Evaluator
The evaluator correctly identified that we've only used 4 sources, but there are 15+ available pre-optimized CSVs. The Jonathan Chan kernel achieves better scores by ensembling from many more sources.

## Goal
1. Scan ALL available pre-optimized CSVs
2. Score each one
3. Build a comprehensive ensemble by picking the best-scoring source for each N
4. Identify which sources win for which N values

In [None]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
import glob
import os
from tqdm import tqdm

# Tree geometry: TX[i], TY[i] are the x/y coordinates of outline vertex i
# (15 vertices) before create_tree_polygon rotates and translates them.
TX = [
    0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075,
    -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125,
]
TY = [
    0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2,
    -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5,
]

def parse_value(s):
    """Convert a CSV cell to float, dropping a leading 's' from string values.

    Non-string inputs, and strings without the 's' marker, are passed to
    float() unchanged.
    """
    has_marker = isinstance(s, str) and s.startswith('s')
    return float(s[1:] if has_marker else s)

def create_tree_polygon(x, y, deg):
    """Return the tree outline as a Polygon placed at (x, y).

    Each (TX, TY) vertex is rotated by `deg` degrees about the local origin
    and then shifted by (x, y).
    """
    theta = np.radians(deg)
    c = np.cos(theta)
    s = np.sin(theta)
    outline = []
    for tx, ty in zip(TX, TY):
        px = tx * c - ty * s + x
        py = tx * s + ty * c + y
        outline.append((px, py))
    return Polygon(outline)

def compute_bounding_side(polygons):
    """Return the longer side of the axis-aligned box around all polygons.

    The exterior coordinates of every polygon are pooled into one array and
    the larger of the per-axis extents (max - min) is returned. An empty
    input yields 0.
    """
    if not polygons:
        return 0
    coords = np.array([pt for shape in polygons for pt in shape.exterior.coords])
    extents = coords.max(axis=0) - coords.min(axis=0)
    return max(extents)

def compute_score_for_n(df, n):
    """Score the n-tree configuration stored in `df`.

    Rows belong to the configuration when their 'id' starts with the
    zero-padded prefix "NNN_". If the number of such rows is not exactly n
    the configuration is treated as invalid and scores infinity; otherwise
    the score is the squared bounding side divided by n.
    """
    group = df[df['id'].str.startswith(f"{n:03d}_")]
    if len(group) != n:
        return float('inf')
    polygons = []
    for raw_x, raw_y, raw_deg in zip(group['x'], group['y'], group['deg']):
        polygons.append(
            create_tree_polygon(parse_value(raw_x), parse_value(raw_y), parse_value(raw_deg))
        )
    return compute_bounding_side(polygons) ** 2 / n

def compute_total_score(df):
    """Return the sum of compute_score_for_n over N = 1..200."""
    total = 0
    for n in range(1, 201):
        total += compute_score_for_n(df, n)
    return total

print("Functions defined")

In [None]:
# Step 1: Find ALL CSV files in preoptimized directories

# Root directories that are searched recursively for candidate CSVs.
base_paths = [
    '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/',
    '/home/code/external_data/',
]

# Flatten the per-root glob results into one list, keeping root order.
all_csvs = [
    found
    for base in base_paths
    for found in glob.glob(os.path.join(base, '**/*.csv'), recursive=True)
]

print(f"Found {len(all_csvs)} CSV files:")
for csv_file in sorted(all_csvs):
    print(f"  {csv_file}")

In [None]:
# Step 2: Score each CSV file

print("\nScoring all CSV files...")
required_columns = ('id', 'x', 'y', 'deg')
scores = {}      # csv path -> total score over N = 1..200
valid_dfs = {}   # csv path -> DataFrame, only for files that passed the checks

for csv_path in tqdm(all_csvs):
    # Best-effort scan: one unreadable or malformed file is reported and
    # skipped so it cannot abort the whole survey.
    try:
        # Read every column as text. parse_value() converts to float when
        # scoring, and keeping the raw text means coordinates copied into the
        # ensemble are exactly what the source file contained instead of
        # values that went through pandas' float parser and back.
        df = pd.read_csv(csv_path, dtype=str)
        # Check if it has the right columns
        if any(col not in df.columns for col in required_columns):
            print(f"  Skipping {csv_path} - missing columns")
            continue
        # Check if it has 200 configurations (N=1 to 200); the first three
        # characters of the id are the zero-padded N.
        n_configs = len(df['id'].str[:3].unique())
        if n_configs < 200:
            print(f"  Skipping {csv_path} - only {n_configs} configurations")
            continue

        score = compute_total_score(df)
        scores[csv_path] = score
        valid_dfs[csv_path] = df
        print(f"  {os.path.basename(csv_path)}: {score:.6f}")
    except Exception as e:
        print(f"  Error with {csv_path}: {e}")

print(f"\nSuccessfully scored {len(scores)} CSV files")

In [None]:
# Step 3: Rank all sources by total score

print("\nRanking all sources by total score:")
print("="*80)

# Ascending by total score, so ranked[0] is the best single source.
ranked = sorted(scores.items(), key=lambda entry: entry[1])
for position, (path, total) in enumerate(ranked[:20], start=1):
    print(f"{position:2d}. {total:.6f} - {os.path.basename(path)}")

top_path, top_total = ranked[0]
print(f"\nBest source: {os.path.basename(top_path)} with score {top_total:.6f}")
print("Current baseline: 70.659959")
print("Target: 68.919154")

In [None]:
# Step 4: Build comprehensive ensemble - pick best per-N from ALL sources

print("\nBuilding comprehensive ensemble (best per-N from all sources)...")

# For each N, find the best source. The strict '<' keeps the first source
# (in valid_dfs order) on ties; the source stays None when every candidate
# scores infinity for that N.
best_per_n = {}  # n -> (score, source_path)
for n in range(1, 201):
    best_score = float('inf')
    best_source = None
    for path, df in valid_dfs.items():
        score_n = compute_score_for_n(df, n)
        if score_n < best_score:
            best_score = score_n
            best_source = path
    best_per_n[n] = (best_score, best_source)

# Count wins per source. Keys are full paths rather than basenames: the
# recursive scan can return several files with the same name from different
# directories, and keying by basename would merge their counts.
wins_per_source = {}
unresolved_n = []  # N values for which no source had a valid configuration
for n, (_, source) in best_per_n.items():
    if source is None:
        unresolved_n.append(n)
        continue
    wins_per_source[source] = wins_per_source.get(source, 0) + 1

if unresolved_n:
    print(f"\nWARNING: no source has a valid configuration for N in {unresolved_n}")

print("\nWins per source:")
for source, wins in sorted(wins_per_source.items(), key=lambda x: -x[1]):
    print(f"  {source}: {wins} N values")

# Calculate ensemble score
ensemble_score = sum(score for score, _ in best_per_n.values())
print(f"\nEnsemble score (best per-N from all sources): {ensemble_score:.6f}")
print(f"Best single source score: {ranked[0][1]:.6f}")
print(f"Improvement from ensemble: {ranked[0][1] - ensemble_score:.6f}")

In [None]:
# Step 5: Create the ensemble submission

print("\nCreating ensemble submission...")

# Only the columns the scan validated and the scorer reads are carried over,
# so an extra column in one source cannot leak into the submission as a
# mostly-NaN column.
submission_columns = ['id', 'x', 'y', 'deg']
ensemble_parts = []

for n in range(1, 201):
    score_n, source_path = best_per_n[n]
    if source_path is None:
        # Without a source for this N the submission would be incomplete.
        raise ValueError(f"No valid source for N={n}; cannot build a complete submission")
    df = valid_dfs[source_path]
    prefix = f"{n:03d}_"
    ensemble_parts.append(df.loc[df['id'].str.startswith(prefix), submission_columns])

# Stack the per-N slices in N order; cell values are copied as-is.
ensemble_df = pd.concat(ensemble_parts, ignore_index=True)
print(f"Ensemble has {len(ensemble_df)} rows")

# Verify score
verify_score = compute_total_score(ensemble_df)
print(f"Verified ensemble score: {verify_score:.6f}")

# Save (create the target directory first so the write cannot fail on a
# missing folder after all the scoring work is done)
output_path = '/home/submission/submission.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
ensemble_df.to_csv(output_path, index=False)
print("Saved to /home/submission/submission.csv")

In [None]:
# Step 6: Analyze per-N improvements

print("\nPer-N analysis: Where did ensemble improve over best single source?")
print("="*80)

best_single_path = ranked[0][0]
best_single_df = valid_dfs[best_single_path]

# One tuple per N where the ensemble beats the best single source by more
# than a small tolerance: (n, gain, single score, ensemble score, winner name).
improvements = []
for n in range(1, 201):
    ensemble_score_n, winner = best_per_n[n]
    single_score = compute_score_for_n(best_single_df, n)
    diff = single_score - ensemble_score_n
    if not diff > 1e-9:
        continue
    improvements.append((n, diff, single_score, ensemble_score_n, os.path.basename(winner)))

print(f"\nEnsemble improved {len(improvements)} N values over best single source:")
largest_gains = sorted(improvements, key=lambda item: -item[1])[:20]
for n, diff, single, ens, source in largest_gains:
    print(f"  N={n:3d}: {single:.6f} -> {ens:.6f} (improved by {diff:.6f}) from {source}")

In [None]:
# Step 7: Summary

banner = "="*80
target_score = 68.919154
best_single_path, best_single_total = ranked[0]

print(banner)
print("COMPREHENSIVE ENSEMBLE SUMMARY")
print(banner)
print(f"\nTotal sources scanned: {len(all_csvs)}")
print(f"Valid sources: {len(valid_dfs)}")
print(f"\nBest single source: {os.path.basename(best_single_path)}")
print(f"Best single source score: {best_single_total:.6f}")
print(f"\nEnsemble score: {verify_score:.6f}")
print(f"Improvement: {best_single_total - verify_score:.6f}")
print(f"\nTarget: {target_score:.6f}")
print(f"Gap to target: {verify_score - target_score:.6f}")
print("\nSources contributing to ensemble:")
# Most frequent winners first.
for source, wins in sorted(wins_per_source.items(), key=lambda pair: -pair[1]):
    print(f"  {source}: {wins} N values")