# Loop 6 Analysis: Finding Improvement Opportunities

## Goal: Identify which N values have room for improvement

After 6 experiments with zero improvement, we need to understand:
1. What is the theoretical minimum for each N?
2. Which N values are furthest from optimal?
3. What approaches haven't been tried?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
from shapely.ops import unary_union

# Use 25 significant digits for arithmetic done through the decimal module.
getcontext().prec = 25
# Decimal constant 1e15. Not referenced by the cells visible here; presumably
# a fixed-point scaling factor used elsewhere (TODO confirm).
scale_factor = Decimal('1e15')

# Load baseline
# The scoring cell below reads the 'id', 'x', 'y' and 'deg' columns of this
# frame.
baseline_path = '/home/code/experiments/001_baseline/santa-2025.csv'
df = pd.read_csv(baseline_path)
print(f'Loaded {len(df)} rows')

In [None]:
# Calculate per-N scores
def get_per_n_scores(df, max_n=200):
    """Calculate the score contribution for each N.

    For every N from 1 to ``max_n`` the rows whose ``id`` starts with the
    zero-padded prefix ``f'{N:03d}_'`` form one configuration. Each row places
    one tree outline, rotated by ``deg`` degrees about its origin and then
    translated to ``(x, y)``. The score of a configuration is the squared
    side of the axis-aligned square spanning all tree vertices, divided by N.

    Args:
        df: DataFrame with string columns ``id``, ``x``, ``y`` and ``deg``.
            The first character of each ``x``/``y``/``deg`` value is dropped
            before the rest is parsed as a float (presumably a one-letter
            prefix such as ``s`` -- confirm against the submission format).
        max_n: Largest N to evaluate. Defaults to 200, the range that was
            previously hard-coded.

    Returns:
        dict mapping N to ``{'side': side, 'score': side**2 / N}``. An N whose
        group does not contain exactly N rows is reported with a printed
        warning and left out of the result.
    """
    # Tree outline vertices in the unrotated orientation. They never change,
    # so build the arrays once instead of once per N.
    tree_x = np.array([0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125])
    tree_y = np.array([0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5])

    scores = {}
    for n in range(1, max_n + 1):
        group_data = df[df['id'].str.startswith(f'{n:03d}_')]
        if len(group_data) != n:
            print(f'Warning: N={n} has {len(group_data)} trees')
            continue

        # Strip the one-character prefix and parse the numeric part.
        xs = group_data['x'].str[1:].astype(float).values
        ys = group_data['y'].str[1:].astype(float).values
        degs = group_data['deg'].str[1:].astype(float).values

        # Rotate and translate every tree at once: column vectors of shape
        # (n, 1) broadcast against the (15,) outline to give (n, 15) arrays.
        angles = np.radians(degs)
        cos_a = np.cos(angles)[:, None]
        sin_a = np.sin(angles)[:, None]
        rot_x = tree_x * cos_a - tree_y * sin_a + xs[:, None]
        rot_y = tree_x * sin_a + tree_y * cos_a + ys[:, None]

        # Side of the enclosing axis-aligned square is the larger extent.
        side = max(rot_x.max() - rot_x.min(), rot_y.max() - rot_y.min())
        scores[n] = {'side': side, 'score': side**2 / n}

    return scores

# Score the baseline once. `scores` maps N -> {'side', 'score'} and is read
# again by the analysis cells further down.
scores = get_per_n_scores(df)
print(f'Calculated scores for {len(scores)} N values')

In [None]:
# Analyze score distribution
# One table row per N, carrying the bounding-square side and the score.
df_scores = pd.DataFrame([
    dict(n=n, side=entry['side'], score=entry['score'])
    for n, entry in scores.items()
])

# Sum once and reuse it for both the total and the gap lines.
total_score = df_scores["score"].sum()

print('Per-N Score Analysis:')
print(f'Total score: {total_score:.6f}')
print(f'Target: 68.919154')
print(f'Gap: {total_score - 68.919154:.6f}')
print()
print('Top 10 highest score contributions:')
top_contributors = df_scores.nlargest(10, 'score')
print(top_contributors[['n', 'side', 'score']])

In [None]:
# Calculate theoretical minimum for each N
# For N trees, the minimum bounding box is achieved when trees are packed optimally
# A single tree has bounding box 0.7 x 1.0 (at 0 degrees) or ~0.813 x ~0.813 (at 45 degrees)

# Tree dimensions at different angles
def get_tree_bbox(angle_deg):
    """Return (width, height) of the axis-aligned box around one rotated tree.

    The tree outline is rotated about the origin by ``angle_deg`` degrees
    using the standard 2-D rotation (x' = x*cos - y*sin, y' = x*sin + y*cos);
    the extents of the rotated vertices along x and y are returned.
    """
    outline_x = np.array([0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125])
    outline_y = np.array([0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5])

    theta = np.radians(angle_deg)
    cos_t = np.cos(theta)
    sin_t = np.sin(theta)

    turned_x = outline_x * cos_t - outline_y * sin_t
    turned_y = outline_x * sin_t + outline_y * cos_t

    return turned_x.max() - turned_x.min(), turned_y.max() - turned_y.min()

# Find minimum bounding box for single tree
# Scan whole-degree rotations for the one whose enclosing square is smallest.
# min() returns the first angle that attains the minimum, which is the same
# winner a strict "<" scan from 0 upwards would keep.
best_angle = min(range(0, 360, 1), key=lambda deg: max(get_tree_bbox(deg)))
min_side = max(get_tree_bbox(best_angle))

print(f'Single tree minimum bounding box: {min_side:.6f} at {best_angle} degrees')
print(f'N=1 baseline score: {scores[1]["score"]:.6f}')
print(f'N=1 theoretical minimum: {min_side**2:.6f}')

In [None]:
# Calculate efficiency ratio: actual_score / theoretical_minimum
# For N trees, theoretical minimum is approximately: (tree_area * N) / packing_efficiency
# But for bounding box, it's more complex

# Let's compute the "efficiency" as: sqrt(N * single_tree_area / packing_efficiency) / side
# Where single_tree_area is the area of one tree

# Tree area (approximate from polygon)
# Outline vertices of one unrotated tree, listed in order around the boundary
# starting at the tip (0, 0.8); the trunk bottom sits at y = -0.2.
TX = np.array([0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125])
TY = np.array([0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5])

# Shoelace formula for polygon area
def polygon_area(x, y):
    """Return the unsigned area of the polygon with vertices (x[i], y[i]).

    Uses the shoelace formula over consecutive vertex pairs, with the last
    vertex joined back to the first. Vertex winding direction does not
    matter because the absolute value is taken.
    """
    vertex_count = len(x)
    twice_signed_area = 0
    for cur in range(vertex_count):
        # The edge leaving the last vertex closes the polygon at index 0.
        nxt = 0 if cur == vertex_count - 1 else cur + 1
        twice_signed_area += x[cur] * y[nxt]
        twice_signed_area -= x[nxt] * y[cur]
    return abs(twice_signed_area) / 2

tree_area = polygon_area(TX, TY)
print(f'Single tree area: {tree_area:.6f}')

# Area argument: N trees cover N * tree_area, so a square that holds them at
# a given packing efficiency needs
#     side >= sqrt(N * tree_area / packing_efficiency).
# The notebook assumes irregular shapes pack at roughly 0.6-0.8 and uses 0.7.
assumed_min_side = np.sqrt(df_scores['n'] * tree_area / 0.7)  # Assume 70% packing efficiency
df_scores['theoretical_min_side'] = assumed_min_side
df_scores['efficiency'] = assumed_min_side / df_scores['side']

print('\nEfficiency analysis (higher = better packed):')
# Lowest efficiency first: these are the N values packed worst against the
# area estimate.
least_efficient = df_scores.nsmallest(10, 'efficiency')
print(least_efficient[['n', 'side', 'theoretical_min_side', 'efficiency', 'score']])

In [None]:
# Plot score vs N
# 2x2 dashboard: per-N score, side length, efficiency, cumulative score.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(score_ax, side_ax), (efficiency_ax, cumulative_ax) = axes

# Score per N, with the average per-N score the target would imply.
score_ax.bar(df_scores['n'], df_scores['score'], alpha=0.7)
score_ax.set(xlabel='N', ylabel='Score contribution', title='Score contribution by N')
score_ax.axhline(y=68.919154/200, color='r', linestyle='--', label='Target avg')
score_ax.legend()

# Side length vs N
side_ax.scatter(df_scores['n'], df_scores['side'], alpha=0.5, s=10)
side_ax.set(xlabel='N', ylabel='Side length', title='Bounding box side length by N')

# Efficiency vs N
efficiency_ax.scatter(df_scores['n'], df_scores['efficiency'], alpha=0.5, s=10)
efficiency_ax.set(xlabel='N', ylabel='Efficiency', title='Packing efficiency by N (higher = better)')

# Cumulative score, accumulated in increasing N.
df_scores_sorted = df_scores.sort_values('n')
running_total = df_scores_sorted['score'].cumsum()
cumulative_ax.plot(df_scores_sorted['n'], running_total)
cumulative_ax.axhline(y=68.919154, color='r', linestyle='--', label='Target')
cumulative_ax.set(xlabel='N', ylabel='Cumulative score', title='Cumulative score by N')
cumulative_ax.legend()

plt.tight_layout()
plt.savefig('/home/code/exploration/loop6_score_analysis.png', dpi=100)
plt.show()

In [None]:
# Identify N values with most room for improvement
# Scaling every per-N score by target / current lands exactly on the target.
# The current total and the quoted percentage are derived from the data
# rather than hard-coded, so they stay right if the baseline changes.
# Note the uniform reduction is 1 - target / current (about 2.49% for a total
# of 70.676102), which is slightly smaller than gap / target (about 2.55%).
# Some N values may have more room than others; this is only the uniform case.
TARGET_SCORE = 68.919154
current_total = df_scores['score'].sum()
uniform_scale = TARGET_SCORE / current_total
uniform_reduction_pct = (1 - uniform_scale) * 100

df_scores['target_score'] = df_scores['score'] * uniform_scale
df_scores['improvement_needed'] = df_scores['score'] - df_scores['target_score']

print(f'Improvement needed per N (if uniform {uniform_reduction_pct:.2f}% improvement):')
print(df_scores.nlargest(20, 'improvement_needed')[['n', 'score', 'target_score', 'improvement_needed']])

In [None]:
# Check if there are any N values where the baseline might be suboptimal
# by comparing to known good configurations from other sources

import os
import glob

# Load all available CSV files
snapshot_dir = '/home/nonroot/snapshots/santa-2025/21117626902/code/data'
csv_files = glob.glob(f'{snapshot_dir}/**/*.csv', recursive=True)
print(f'Found {len(csv_files)} CSV files in snapshots')

# Also check telegram and other sources
for pattern in ['/home/nonroot/snapshots/**/*.csv']:
    csv_files.extend(glob.glob(pattern, recursive=True))

# De-duplicate and sort. A bare list(set(...)) has arbitrary order, so the
# "first 10" shown here and any later slice of this list would pick a
# different subset of files from run to run.
csv_files = sorted(set(csv_files))
print(f'Total unique CSV files: {len(csv_files)}')

# Show first few
for f in csv_files[:10]:
    print(f'  {f}')

In [None]:
# Compare per-N scores across all sources
def get_per_n_scores_from_file(filepath):
    """Get per-N scores from a CSV file.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The dict produced by ``get_per_n_scores`` for the file, or ``None``
        when the file lacks any of the ``id``/``x``/``y``/``deg`` columns the
        scorer reads, or when reading or scoring it raises. Failures are
        reported on stdout instead of being dropped silently.
    """
    required_columns = {'id', 'x', 'y', 'deg'}
    try:
        df = pd.read_csv(filepath)
        # Not a placement file at all: skip quietly, as before.
        if not required_columns.issubset(df.columns):
            return None
        return get_per_n_scores(df)
    except Exception as e:
        # Deliberately broad: this is a best-effort scan over arbitrary CSV
        # files, and one unreadable or oddly formatted file must not abort
        # the whole comparison. The reason is printed so it is not lost.
        print(f'Skipping {filepath}: {type(e).__name__}: {e}')
        return None

# Collect scores from all sources
all_scores = {}
for filepath in csv_files[:20]:  # Limit to first 20 for speed
    scores_dict = get_per_n_scores_from_file(filepath)
    if not scores_dict:
        continue

    # Files in different directories can share a basename. Keep the short
    # name for the first one seen and fall back to the full path on a clash,
    # so a later file no longer overwrites an earlier source.
    name = os.path.basename(filepath)
    if name in all_scores:
        name = filepath

    all_scores[name] = scores_dict
    total = sum(s['score'] for s in scores_dict.values())
    print(f'{name}: {total:.6f}')

print(f'\nLoaded scores from {len(all_scores)} files')

In [None]:
# Find N values where different sources have different scores
if len(all_scores) > 1:
    # Prefer the file named like the baseline; otherwise use the first
    # source that was loaded as the reference.
    baseline_name = 'santa-2025.csv' if 'santa-2025.csv' in all_scores else next(iter(all_scores))

    print(f'\nComparing to baseline: {baseline_name}')
    print('\nN values where other sources are better:')

    reference = all_scores[baseline_name]
    rivals = [(source, per_n) for source, per_n in all_scores.items() if source != baseline_name]
    # A source with no entry for some N is treated as infinitely bad there.
    missing = float('inf')

    improvements = []
    for n in range(1, 201):
        baseline_score = reference.get(n, {}).get('score', missing)
        for source, per_n in rivals:
            other_score = per_n.get(n, {}).get('score', missing)
            # Only count wins larger than the 1e-4 threshold.
            if other_score < baseline_score - 0.0001:  # Significant improvement
                improvements.append(dict(
                    n=n,
                    baseline_score=baseline_score,
                    other_score=other_score,
                    improvement=baseline_score - other_score,
                    source=source,
                ))

    if not improvements:
        print('No improvements found - all sources have same scores')
    else:
        df_imp = pd.DataFrame(improvements)
        ranked = df_imp.sort_values('improvement', ascending=False)
        print(ranked.head(20))

In [None]:
# Summary: What we've learned
# Compute the headline numbers once, then print the report.
banner = '=' * 60
current_total = df_scores["score"].sum()
gap_to_target = current_total - 68.919154

print(banner)
print('SUMMARY: Loop 6 Analysis')
print(banner)
print()
print(f'Current total score: {current_total:.6f}')
print(f'Target score: 68.919154')
print(f'Gap: {gap_to_target:.6f} ({gap_to_target/68.919154*100:.2f}%)')
print()

print('Key findings:')
for finding in (
    '1. All 30 pre-optimized sources converge to the same local optimum',
    '2. Local search (SA, bbox3, deletion cascade) cannot escape this optimum',
    '3. The gap to target (1.76 points) requires fundamentally different solutions',
):
    print(finding)
print()

print('Approaches NOT yet tried:')
for approach in (
    '1. Egortrushin lattice + SA (optimizes grid translation parameters)',
    '2. Basin hopping (random large perturbations + local optimization)',
    '3. Genetic algorithm with crossover between different solutions',
    '4. Symmetric packing patterns (42-vote discussion mentions this)',
    '5. Mathematical analysis of optimal packing for specific N values',
):
    print(approach)