# Loop 3 Analysis: Understanding the Score Gap

**Target:** 68.919154
**Current best:** 70.659437
**Gap:** 1.74 points (2.5%)

Key questions:
1. Which N values contribute most to the gap?
2. What are the theoretical limits for each N?
3. Where can we find better solutions?

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from shapely.geometry import Polygon
from shapely import affinity
import matplotlib.pyplot as plt

# Outline of a single tree, given as parallel x / y vertex lists.
# The vertices are listed in order around the outline; the shape is
# symmetric about x = 0 and spans y = -0.2 .. 0.8.
TX = [
    0,
    0.125, 0.0625,
    0.2, 0.1,
    0.35, 0.075,
    0.075, -0.075,
    -0.075, -0.35,
    -0.1, -0.2,
    -0.0625, -0.125,
]
TY = [
    0.8,
    0.5, 0.5,
    0.25, 0.25,
    0, 0,
    -0.2, -0.2,
    0, 0,
    0.25, 0.25,
    0.5, 0.5,
]

# Un-rotated, un-translated tree; every placed tree is derived from it.
BASE_TREE = Polygon(zip(TX, TY))
TREE_AREA = BASE_TREE.area
print(f'Tree area: {TREE_AREA:.6f}')

# With 100% packing each N would score exactly TREE_AREA (side^2 / N),
# so the sum over the 200 puzzle sizes is 200 * TREE_AREA.
perfect_packing_total = 200 * TREE_AREA
print(f'Theoretical minimum score (if 100% packing): {perfect_packing_total:.6f}')

In [None]:
# Read the current best submission into a DataFrame.
# Later cells read its 'id', 'x', 'y' and 'deg' columns.
submission_csv = '/home/submission/submission.csv'
df = pd.read_csv(submission_csv)

def parse_value(val):
    """Convert a submission cell to ``float``.

    String cells may carry a leading ``'s'`` marker (e.g. ``'s0.25'``);
    the marker is dropped before conversion. Any other value is passed
    straight to ``float``.
    """
    has_marker = isinstance(val, str) and val.startswith('s')
    return float(val[1:] if has_marker else val)

def create_tree(x, y, deg):
    """Return a copy of ``BASE_TREE`` rotated by ``deg`` and moved to ``(x, y)``.

    The rotation is applied first, about the origin ``(0, 0)`` of the base
    polygon; the rotated shape is then translated by ``x`` and ``y``.
    """
    rotated = affinity.rotate(BASE_TREE, deg, origin=(0, 0))
    return affinity.translate(rotated, xoff=x, yoff=y)

def get_n_score(df, n):
    """Score the ``n``-tree configuration stored in ``df``.

    Rows belonging to the configuration are those whose ``id`` starts with
    the zero-padded prefix ``f"{n:03d}_"``.

    Returns a ``(score, efficiency)`` tuple where

    * ``score`` is ``side**2 / n`` with ``side`` the longer edge of the
      axis-aligned bounding box around all placed trees, and
    * ``efficiency`` is the fraction of the ``side x side`` square covered
      by tree area, ``n * TREE_AREA / side**2``.

    If the number of matching rows is not exactly ``n`` the configuration is
    treated as missing and ``(inf, 0)`` is returned.
    """
    group = df[df['id'].str.startswith(f"{n:03d}_")]
    if len(group) != n:
        return float('inf'), 0

    # One (minx, miny, maxx, maxy) tuple per placed tree.
    boxes = [
        create_tree(
            parse_value(rec['x']),
            parse_value(rec['y']),
            parse_value(rec['deg']),
        ).bounds
        for _, rec in group.iterrows()
    ]

    # Seed each reduction with +/-inf so the result matches a running
    # min/max that starts from those sentinels.
    inf = float('inf')
    lo_x = min([inf, *(box[0] for box in boxes)])
    lo_y = min([inf, *(box[1] for box in boxes)])
    hi_x = max([-inf, *(box[2] for box in boxes)])
    hi_y = max([-inf, *(box[3] for box in boxes)])

    # The enclosing square is as large as the longer bounding-box edge.
    side = max(hi_x - lo_x, hi_y - lo_y)
    square_area = side**2
    return square_area / n, (n * TREE_AREA) / square_area

# Build one record per puzzle size N = 1..200 holding its score and
# packing efficiency, then collect the records into a DataFrame.
scores = [
    dict(zip(('N', 'score', 'efficiency'), (n, *get_n_score(df, n))))
    for n in range(1, 201)
]

scores_df = pd.DataFrame(scores)

total_now = scores_df["score"].sum()
mean_efficiency = scores_df["efficiency"].mean()
print(f'Total score: {total_now:.6f}')
print(f'Average efficiency: {mean_efficiency:.4f}')

In [None]:
# Break the total score down by bands of N (both ends inclusive).
ranges = [(1, 10), (11, 50), (51, 100), (101, 150), (151, 200)]
total_score = scores_df['score'].sum()

print('Score contribution by N range:')
print('-' * 50)
for start, end in ranges:
    band = scores_df[scores_df['N'].between(start, end)]
    range_score = band['score'].sum()
    range_eff = band['efficiency'].mean()
    # Share of the overall score that this band is responsible for.
    pct = range_score / total_score * 100
    print(f'N={start:3d}-{end:3d}: score={range_score:7.3f} ({pct:5.1f}%), avg_eff={range_eff:.4f}')

# Distance between the current total and the target total, plus the
# same distance spread evenly over the 200 values of N.
target = 68.919154
gap = total_score - target
print(f'\nTotal gap to target: {gap:.6f}')
print(f'Gap per N (average): {gap/200:.6f}')

In [None]:
# Rank every N by packing efficiency, lowest first, and show the 20
# least efficient configurations.
scores_df_sorted = scores_df.sort_values(by='efficiency')
least_efficient = scores_df_sorted.iloc[:20]
print('N values with WORST efficiency (most room for improvement):')
print(least_efficient.to_string(index=False))

In [None]:
# Lower bound per N under perfect (100%) packing: the square must hold at
# least N tree areas, so its side is at least sqrt(N * TREE_AREA).
ideal_side = np.sqrt(scores_df['N'] * TREE_AREA)
ideal_score = ideal_side**2 / scores_df['N']

scores_df['theoretical_min'] = ideal_side
scores_df['theoretical_score'] = ideal_score
scores_df['gap_to_theoretical'] = scores_df['score'] - ideal_score

report_columns = ['N', 'score', 'theoretical_score', 'gap_to_theoretical', 'efficiency']
furthest_from_bound = scores_df.nlargest(20, 'gap_to_theoretical')
print('N values with largest gap to theoretical minimum:')
print(furthest_from_bound[report_columns].to_string(index=False))

In [None]:
# Plot score vs N
# 2x2 overview figure: per-N score, per-N efficiency, cumulative score and
# per-N gap to the perfect-packing bound.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Score per N
ax = axes[0, 0]
ax.plot(scores_df['N'], scores_df['score'], 'b-', alpha=0.7)
# Reference line: the per-N average the target total corresponds to.
ax.axhline(y=target/200, color='r', linestyle='--', label=f'Target avg: {target/200:.4f}')
ax.set_xlabel('N')
# The superscript two is written as an escape so the label survives any
# re-encoding of this file (it previously rendered as mojibake).
ax.set_ylabel('Score (side\u00b2/N)')
ax.set_title('Score per N')
ax.legend()

# Efficiency per N
ax = axes[0, 1]
ax.plot(scores_df['N'], scores_df['efficiency'], 'g-', alpha=0.7)
ax.set_xlabel('N')
ax.set_ylabel('Packing Efficiency')
ax.set_title('Packing Efficiency per N')

# Cumulative score
ax = axes[1, 0]
cumsum = scores_df['score'].cumsum()
ax.plot(scores_df['N'], cumsum, 'b-')
# The running total has to end at or below this line to hit the target.
ax.axhline(y=target, color='r', linestyle='--', label=f'Target: {target}')
ax.set_xlabel('N')
ax.set_ylabel('Cumulative Score')
ax.set_title('Cumulative Score')
ax.legend()

# Gap to theoretical
ax = axes[1, 1]
ax.bar(scores_df['N'], scores_df['gap_to_theoretical'], alpha=0.7)
ax.set_xlabel('N')
ax.set_ylabel('Gap to Theoretical Min')
ax.set_title('Gap to Theoretical Minimum per N')

plt.tight_layout()
# Save before show() so the written file contains the rendered figure.
plt.savefig('/home/code/experiments/004_mega_ensemble/score_analysis.png', dpi=100)
plt.show()

In [None]:
# How much does each N have to give, on average, to close the gap?
needed_improvement = gap / 200
print(f'Average improvement needed per N: {needed_improvement:.6f}')

# What-if table: shrink every per-N score by the same percentage and
# report the resulting total and its distance to the target.
for improvement_pct in (1, 2, 3, 5, 10):
    keep_fraction = 1 - improvement_pct/100
    scaled_scores = scores_df['score'] * keep_fraction
    new_total = scaled_scores.sum()
    print(f'{improvement_pct}% improvement: {new_total:.4f} (gap: {new_total - target:.4f})')

In [None]:
# Final recap of the current total, the target and the remaining gap.
banner = '=' * 60
current_total = scores_df["score"].sum()
gap_vs_target_pct = gap/target*100
gap_vs_current_pct = gap/current_total*100

summary_lines = [
    banner,
    'SUMMARY',
    banner,
    f'Current score: {current_total:.6f}',
    f'Target score: {target}',
    f'Gap: {gap:.6f} ({gap_vs_target_pct:.2f}%)',
    f'\nTo reach target, need ~{gap_vs_current_pct:.2f}% improvement overall',
    # Fixed conclusion text; it is not derived from the numbers above.
    '\nKey insight: Small N values (1-10) have worst efficiency',
    'Focus on improving small N configurations!',
]
for line in summary_lines:
    print(line)