# Evolver Loop 2 Analysis

## Key Findings from Research:
1. **Ensemble approach is critical** - Top kernels combine best solutions from 15+ sources per N
2. **Lattice/crystallization patterns** - Blue Phase (0° ± 90°) + Pink Phase (180° ± 90°) interlock efficiently
3. **Long SA runs needed** - sa_v1_parallel with -n 15000+ iterations, -r 80+ rounds
4. **Fractional translation** - Fine-tuning with steps [0.001, 0.0005, 0.0002, 0.0001, 0.00005, 0.00002, 0.00001]

## Current Status:
- Best CV: 70.673023 (exp_001)
- Best LB: 70.676102 (exp_000)
- Target: 68.894234
- Gap (best LB − target): 1.78 points (≈2.5% of the best LB score)

In [None]:
import pandas as pd
import numpy as np
import os
import glob
from shapely.geometry import Polygon
from shapely.affinity import rotate, translate
import math
from tqdm import tqdm

# Tree geometry
# Outline of a single tree as 15 (x, y) vertices in the tree's own local frame.
# The list starts at the tip (0, 0.8), runs down the negative-x side to the
# trunk base at y = -0.2, then back up the positive-x side; the x-coordinates
# mirror about x = 0. create_tree_polygon() builds a shapely Polygon from these
# points and rotates it about (0, 0) before translating it.
TREE_VERTICES = [
    (0, 0.8), (-0.125, 0.5), (-0.05, 0.5), (-0.2, 0.25), (-0.1, 0.25),
    (-0.35, 0), (-0.075, 0), (-0.075, -0.2), (0.075, -0.2), (0.075, 0),
    (0.35, 0), (0.1, 0.25), (0.2, 0.25), (0.05, 0.5), (0.125, 0.5),
]

def create_tree_polygon(x, y, angle_deg):
    """Build the tree outline rotated by ``angle_deg`` and moved to ``(x, y)``.

    The polygon is created from ``TREE_VERTICES``, rotated about the local
    origin ``(0, 0)`` first, and only then shifted by ``x`` / ``y``.

    Args:
        x: Offset applied along the x axis after rotation.
        y: Offset applied along the y axis after rotation.
        angle_deg: Rotation angle, passed straight to ``shapely.affinity.rotate``.

    Returns:
        The transformed shapely ``Polygon``.
    """
    outline = Polygon(TREE_VERTICES)
    turned = rotate(outline, angle_deg, origin=(0, 0))
    return translate(turned, xoff=x, yoff=y)

def get_bounding_box_side(polygons):
    """Return the longer side of the axis-aligned box enclosing all polygons.

    Args:
        polygons: Sequence of objects exposing ``exterior.coords`` (an iterable
            of points whose first two items are x and y).

    Returns:
        ``max(width, height)`` over every exterior point of every polygon, or
        ``0`` when ``polygons`` is empty.
    """
    if not polygons:
        return 0

    # Flatten every exterior ring into one list of points.
    points = [point for shape in polygons for point in shape.exterior.coords]
    x_values = [point[0] for point in points]
    y_values = [point[1] for point in points]

    x_extent = max(x_values) - min(x_values)
    y_extent = max(y_values) - min(y_values)
    return x_extent if x_extent >= y_extent else y_extent

def parse_value(val):
    """Convert a CSV field to ``float``, dropping a leading ``'s'`` on strings.

    Args:
        val: Either a number, a numeric string, or a string of the form
            ``'s<number>'``.

    Returns:
        The numeric value as a ``float``.

    Raises:
        ValueError: If the remaining text is not a valid float literal.
    """
    has_marker = isinstance(val, str) and val.startswith('s')
    return float(val[1:] if has_marker else val)

# Progress marker: printed once the imports and helper definitions above have run.
print("Functions loaded")

In [None]:
# Find all available CSV files for ensemble
# glob gives no ordering guarantee, so each result set is sorted. That keeps the
# sample listing below, and any later slice of csv_sources, reproducible
# from one run to the next.

# Snapshots
snapshot_csvs = sorted(
    glob.glob('/home/nonroot/snapshots/santa-2025/**/*.csv', recursive=True)
)

# Local preoptimized
local_csvs = sorted(glob.glob('/home/code/preoptimized/**/*.csv', recursive=True))

# Snapshot files first, then the local pre-optimized ones.
csv_sources = [*snapshot_csvs, *local_csvs]

print(f"Found {len(csv_sources)} CSV files")
print("\nSample files:")
for f in csv_sources[:10]:
    print(f"  {f}")

In [None]:
# Calculate per-N scores for all CSVs and find best per N
def calculate_per_n_scores(csv_path):
    """Score each complete N-tree group found in one submission CSV.

    The leading integer of every ``id`` (the text before the first ``_``) is
    taken as the group size N. For N = 1..200, a group is scored only when it
    holds exactly N rows; its score is ``side**2 / N`` where ``side`` comes
    from ``get_bounding_box_side`` over the group's tree polygons.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A dict mapping N to its score (possibly missing some N), or ``None``
        when the file lacks any of the ``id``/``x``/``y``/``deg`` columns or
        when reading or parsing fails for any reason.
    """
    required_columns = {'id', 'x', 'y', 'deg'}
    try:
        frame = pd.read_csv(csv_path)
        if not required_columns <= set(frame.columns):
            return None

        # Same parsing order as the raw columns: x, then y, then deg.
        for raw_name, parsed_name in (('x', 'x_val'), ('y', 'y_val'), ('deg', 'deg_val')):
            frame[parsed_name] = frame[raw_name].apply(parse_value)
        frame['n'] = frame['id'].apply(lambda tree_id: int(str(tree_id).split('_')[0]))

        per_n = {}
        for n in range(1, 201):
            group = frame[frame['n'] == n]
            if len(group) == n:
                trees = [
                    create_tree_polygon(tx, ty, tdeg)
                    for tx, ty, tdeg in zip(group['x_val'], group['y_val'], group['deg_val'])
                ]
                per_n[n] = get_bounding_box_side(trees) ** 2 / n
        return per_n
    except Exception:
        # Deliberate best-effort: any unreadable or malformed file is skipped.
        return None

# Process all CSVs (this may take a while)
all_scores = {}   # display name -> {N: score} for every complete submission
valid_csvs = []   # paths of the submissions that were kept

print("Processing CSVs...")
for csv_path in tqdm(csv_sources[:50]):  # Limit to first 50 for speed
    scores = calculate_per_n_scores(csv_path)
    # calculate_per_n_scores covers N = 1..200, so 200 entries means every N
    # was present and complete; anything else is not comparable.
    if not scores or len(scores) != 200:
        continue

    # Files in different directories can share a basename. Keying only by
    # basename would let a later file silently overwrite an earlier one, so a
    # colliding name falls back to the full path.
    name = os.path.basename(csv_path)
    if name in all_scores:
        name = csv_path

    all_scores[name] = scores
    valid_csvs.append(csv_path)
    total = sum(scores.values())
    print(f"  {name}: {total:.6f}")

In [None]:
# Find best per-N across all sources
if not all_scores:
    print("No valid CSVs found")
else:
    best_per_n = {}
    best_source_per_n = {}

    for n in range(1, 201):
        # Strict "<" keeps the first source seen when several tie.
        best_score, best_source = float('inf'), None
        for name, scores in all_scores.items():
            if n not in scores:
                continue
            if scores[n] < best_score:
                best_score, best_source = scores[n], name
        best_per_n[n] = best_score
        best_source_per_n[n] = best_source

    ensemble_score = sum(best_per_n.values())
    print(f"\nEnsemble score (best per N): {ensemble_score:.6f}")
    print("Current best: 70.676102")
    print(f"Improvement: {70.676102 - ensemble_score:.6f}")

    # Show which sources contribute most
    source_counts = {}
    for source in best_source_per_n.values():
        source_counts.setdefault(source, 0)
        source_counts[source] += 1

    print("\nSource contributions:")
    # Descending by count; the stable sort keeps first-seen order among ties.
    ranked_sources = sorted(source_counts.items(), key=lambda item: item[1], reverse=True)
    for source, count in ranked_sources:
        print(f"  {source}: {count} N values")

In [None]:
# Analyze the baseline solution to understand the lattice pattern
baseline_path = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa-2025.csv'
df = pd.read_csv(baseline_path)
# Parsed numeric copies of the raw columns, added in the order x, y, deg.
for raw_col, parsed_col in (('x', 'x_val'), ('y', 'y_val'), ('deg', 'deg_val')):
    df[parsed_col] = df[raw_col].map(parse_value)
# Group size N is the integer before the first "_" in the id.
df['n'] = df['id'].map(lambda tree_id: int(str(tree_id).split('_')[0]))

# Analyze angle distribution for large N
print("Angle distribution analysis for large N:")
for n in [50, 100, 150, 200]:
    n_data = df[df['n'] == n]
    angles = n_data['deg_val'].values % 360

    # Blue phase: angles in [0, 90] or (270, 360)
    # Pink phase: angles in (90, 270]
    is_blue = (angles <= 90) | (angles > 270)
    is_pink = (angles > 90) & (angles <= 270)
    blue_count = int(is_blue.sum())
    pink_count = int(is_pink.sum())

    blue_pct = 100*blue_count/n
    pink_pct = 100*pink_count/n

    print(f"\nN={n}:")
    print(f"  Blue phase (up): {blue_count} ({blue_pct:.1f}%)")
    print(f"  Pink phase (down): {pink_count} ({pink_pct:.1f}%)")
    print(f"  Angle mean: {angles.mean():.1f}°, std: {angles.std():.1f}°")

In [None]:
# Analyze spacing patterns for large N
import matplotlib.pyplot as plt
# Imported once here rather than on every loop iteration.
from scipy.spatial import cKDTree
from scipy.spatial.distance import pdist

for n in [100, 200]:
    n_data = df[df['n'] == n]
    x = n_data['x_val'].values
    y = n_data['y_val'].values
    centers = np.column_stack([x, y])

    # Pairwise and nearest-neighbour statistics need at least two points;
    # fewer would make distances.min() and the k=2 query fail.
    if len(centers) < 2:
        print(f"\nN={n} spacing analysis: skipped ({len(centers)} trees found, need at least 2)")
        continue

    # Calculate pairwise distances between tree positions
    distances = pdist(centers)

    print(f"\nN={n} spacing analysis:")
    print(f"  Min distance: {distances.min():.4f}")
    print(f"  Mean distance: {distances.mean():.4f}")
    # This statistic is the median, so it is labelled as such (not "most common").
    print(f"  Median distance: {np.median(distances):.4f}")

    # Find nearest neighbor distances
    tree = cKDTree(centers)
    nn_dists, _ = tree.query(centers, k=2)
    nn_dists = nn_dists[:, 1]  # Second nearest (first is self)

    print(f"  Nearest neighbor mean: {nn_dists.mean():.4f}")
    print(f"  Nearest neighbor std: {nn_dists.std():.4f}")

In [None]:
# Check if there are better CSVs in the bucket-of-chump dataset
bucket_path = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/bucket-of-chump'
if not os.path.exists(bucket_path):
    print("bucket-of-chump not found")
else:
    bucket_csvs = glob.glob(bucket_path + '/**/*.csv', recursive=True)
    print(f"Found {len(bucket_csvs)} CSVs in bucket-of-chump")

    # Only the first five files are scored; unreadable or empty results are skipped.
    for csv_path in bucket_csvs[:5]:
        scores = calculate_per_n_scores(csv_path)
        if not scores:
            continue
        total = sum(scores.values())
        print(f"  {os.path.basename(csv_path)}: {total:.6f}")

In [None]:
# Summary of findings
# Each entry is printed on its own print() call, in order; the figures are the
# fixed values recorded for this loop, not recomputed here.
summary_rule = "=" * 60
summary_lines = (
    summary_rule,
    "ANALYSIS SUMMARY",
    summary_rule,
    "\n1. Current best score: 70.676102",
    "2. Target score: 68.894234",
    "3. Gap to close: 1.78 points (2.5%)",
    "\n4. Key insights:",
    "   - Baseline is at a VERY tight local optimum",
    "   - Standard optimization (bbox3, SA) cannot improve it",
    "   - Need ENSEMBLE approach: combine best per-N from multiple sources",
    "   - Need LONG optimization runs (hours, not minutes)",
    "   - Lattice patterns: Blue (up) + Pink (down) phases interlock",
    "\n5. Next steps:",
    "   - Implement ensemble from all available CSVs",
    "   - Run sa_v1_parallel with -n 15000 -r 80 for hours",
    "   - Try lattice-based generation for large N",
    "   - Use fractional translation for fine-tuning",
)
for summary_line in summary_lines:
    print(summary_line)