# Loop 3 Analysis: Understanding the Gap to Target

## Key Questions:
1. What is the per-N breakdown of our current solution vs target?
2. Which N values have the most room for improvement?
3. What techniques haven't been tried yet?

In [1]:
import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
from shapely.ops import unary_union
import warnings
warnings.filterwarnings('ignore')

# Use 25 significant digits for any Decimal arithmetic done in this session
getcontext().prec = 25

# Read the current best submission. The x / y / deg columns are read as
# strings; leading/trailing 's' characters are stripped before float parsing.
df = pd.read_csv('/home/submission/submission.csv')
for coord_col in ('x', 'y', 'deg'):
    df[coord_col] = df[coord_col].str.strip('s').astype(float)

# The group size N is the integer in front of the first underscore of the id
df['n'] = df['id'].str.split('_').str[0].astype(int)

n_lo, n_hi = df['n'].min(), df['n'].max()
print(f"Loaded {len(df)} rows")
print(f"N values: {n_lo} to {n_hi}")

Loaded 20100 rows
N values: 1 to 200


In [2]:
# Tree outline as (x, y) vertex pairs, in the order the polygon is traced.
# x spans [-0.35, 0.35] and y spans [-0.2, 0.8] in the unrotated frame.
_TREE_OUTLINE = [
    (0, 0.8),
    (0.125, 0.5), (0.0625, 0.5),
    (0.2, 0.25), (0.1, 0.25),
    (0.35, 0), (0.075, 0),
    (0.075, -0.2), (-0.075, -0.2),
    (-0.075, 0), (-0.35, 0),
    (-0.1, 0.25), (-0.2, 0.25),
    (-0.0625, 0.5), (-0.125, 0.5),
]

# Parallel coordinate lists consumed by the polygon builder
TX = [vertex_x for vertex_x, _ in _TREE_OUTLINE]
TY = [vertex_y for _, vertex_y in _TREE_OUTLINE]

def get_tree_polygon(cx, cy, angle_deg):
    """Build the tree outline rotated by ``angle_deg`` and moved to (cx, cy).

    Each template vertex (TX[i], TY[i]) is rotated about the origin by
    ``angle_deg`` degrees (counter-clockwise for positive angles) and then
    translated by (cx, cy).

    Args:
        cx: x offset added to every rotated vertex.
        cy: y offset added to every rotated vertex.
        angle_deg: rotation angle in degrees.

    Returns:
        A shapely ``Polygon`` through the transformed vertices, in template order.
    """
    theta = np.radians(angle_deg)
    cos_t, sin_t = np.cos(theta), np.sin(theta)

    # Standard 2-D rotation followed by a translation, one vertex at a time
    return Polygon([
        (px * cos_t - py * sin_t + cx, px * sin_t + py * cos_t + cy)
        for px, py in zip(TX, TY)
    ])

def get_side_length(trees_df):
    """Side of the smallest axis-aligned square enclosing a group of trees.

    The bounding box of a set of polygons is the envelope of their individual
    bounding boxes, so the per-tree bounds are combined directly instead of
    building a geometric union first.

    Args:
        trees_df: DataFrame with numeric columns 'x', 'y' and 'deg', one row
            per tree.

    Returns:
        max(width, height) of the combined bounding box as a float, or NaN
        when ``trees_df`` has no rows.
    """
    tree_bounds = [
        get_tree_polygon(x, y, deg).bounds
        for x, y, deg in zip(trees_df['x'], trees_df['y'], trees_df['deg'])
    ]
    if not tree_bounds:
        # No trees, no extent: make the "undefined" result explicit
        return float('nan')

    # Each bounds tuple is (minx, miny, maxx, maxy); transpose to per-edge columns
    min_xs, min_ys, max_xs, max_ys = zip(*tree_bounds)
    width = max(max_xs) - min(min_xs)
    height = max(max_ys) - min(min_ys)
    return max(width, height)

# Score every group size N as side^2 / N, where side is the bounding-square
# side of that group's trees. A group is only scored when it has exactly N rows.
per_n_scores = []
skipped_n = []  # group sizes whose row count is not exactly N
for n in range(1, 201):
    trees = df[df['n'] == n]
    if len(trees) == n:
        side = get_side_length(trees)
        score = side**2 / n
        per_n_scores.append({'n': n, 'side': side, 'score': score})
    else:
        skipped_n.append(n)

per_n_df = pd.DataFrame(per_n_scores)
if skipped_n:
    # Without this notice the total below would be silently understated
    print(f"WARNING: skipped {len(skipped_n)} N value(s) with an unexpected row count: {skipped_n}")
print(f"Total score: {per_n_df['score'].sum():.6f}")

Total score: 70.659437


In [3]:
# Rank the group sizes by how much each one adds to the total score
per_n_df_sorted = per_n_df.sort_values(by='score', ascending=False)
top20 = per_n_df_sorted.head(20)
top50 = per_n_df_sorted.head(50)

print("Top 20 N values by score contribution:")
print(top20.to_string(index=False))

# Cumulative weight of the largest contributors next to the overall total
print(f"\nSum of top 20: {top20['score'].sum():.6f}")
print(f"Sum of top 50: {top50['score'].sum():.6f}")
print(f"Total: {per_n_df['score'].sum():.6f}")

Top 20 N values by score contribution:
 n     side    score
 1 0.813173 0.661250
 2 0.949504 0.450779
 3 1.142031 0.434745
 5 1.443692 0.416850
 4 1.290806 0.416545
 7 1.673104 0.399897
 6 1.548438 0.399610
 9 1.867280 0.387415
 8 1.755921 0.385407
15 2.384962 0.379203
10 1.940696 0.376630
21 2.811667 0.376451
20 2.742469 0.376057
11 2.033002 0.375736
22 2.873270 0.375258
16 2.446640 0.374128
26 3.118320 0.373997
12 2.114873 0.372724
13 2.199960 0.372294
25 3.050182 0.372144

Sum of top 20: 8.077120
Sum of top 50: 19.063857
Total: 70.659437


In [4]:
# Quantify the distance between the current total score and the target
current_total = per_n_df['score'].sum()
target_total = 68.919154
gap = current_total - target_total

print(f"Current total: {current_total:.6f}")
print(f"Target total: {target_total:.6f}")
print(f"Gap: {gap:.6f}")
print(f"Gap as % of current: {100*gap/current_total:.2f}%")

# If the gap were split evenly across the scored N values, how much per N?
# Divide by the number of groups actually scored rather than a fixed 200, so
# the average stays correct if any group was left out of per_n_df.
avg_improvement_per_n = gap / len(per_n_df)
print(f"\nAverage improvement needed per N: {avg_improvement_per_n:.6f}")

Current total: 70.659437
Target total: 68.919154
Gap: 1.740283
Gap as % of current: 2.46%

Average improvement needed per N: 0.008701


In [5]:
# Focus on N = 1..20, the range holding the largest individual per-N scores
small_n = per_n_df.loc[per_n_df['n'] <= 20]
small_n_sum = small_n['score'].sum()
small_n_share = 100 * small_n_sum / current_total

print("Small N values (1-20):")
print(small_n.to_string(index=False))
print(f"\nSum of N=1-20: {small_n_sum:.6f}")
print(f"This is {small_n_share:.1f}% of total score")

Small N values (1-20):
 n     side    score
 1 0.813173 0.661250
 2 0.949504 0.450779
 3 1.142031 0.434745
 4 1.290806 0.416545
 5 1.443692 0.416850
 6 1.548438 0.399610
 7 1.673104 0.399897
 8 1.755921 0.385407
 9 1.867280 0.387415
10 1.940696 0.376630
11 2.033002 0.375736
12 2.114873 0.372724
13 2.199960 0.372294
14 2.277357 0.370454
15 2.384962 0.379203
16 2.446640 0.374128
17 2.508124 0.370040
18 2.576409 0.368771
19 2.646449 0.368615
20 2.742469 0.376057

Sum of N=1-20: 8.057151
This is 11.4% of total score


In [6]:
# Sanity check for N=1 against a hand-built reference.
# Working assumption (not verified in this cell): a single tree rotated by
# 45 degrees gives the smallest bounding square.
# Unrotated tree extent: height 1.0 (y from -0.2 to 0.8), widest span 0.7.

import math

# Reference: one tree at the origin, rotated by 45 degrees
angle = 45
polygon = get_tree_polygon(0, 0, angle)
bounds = polygon.bounds
min_x, min_y, max_x, max_y = bounds
side_45 = max(max_x - min_x, max_y - min_y)
print(f"N=1 at 45 degrees: side={side_45:.6f}, score={side_45**2:.6f}")

# What the submission currently uses for N=1
n1_trees = df[df['n'] == 1]
n1_x, n1_y, n1_deg = (n1_trees[col].values[0] for col in ('x', 'y', 'deg'))
print(f"\nCurrent N=1: x={n1_x:.6f}, y={n1_y:.6f}, deg={n1_deg:.6f}")
current_n1_side = get_side_length(n1_trees)
print(f"Current N=1 side: {current_n1_side:.6f}, score: {current_n1_side**2:.6f}")

N=1 at 45 degrees: side=0.813173, score=0.661250

Current N=1: x=44.252353, y=-32.525885, deg=45.000000
Current N=1 side: 0.813173, score: 0.661250


In [7]:
# Inspect the two-tree configuration currently submitted for N=2
n2_trees = df[df['n'] == 2]
print("Current N=2:")
for x_pos, y_pos, rot_deg in zip(n2_trees['x'], n2_trees['y'], n2_trees['deg']):
    print(f"  x={x_pos:.6f}, y={y_pos:.6f}, deg={rot_deg:.6f}")

# Score for a group is side^2 divided by the number of trees (2 here)
current_n2_side = get_side_length(n2_trees)
n2_score = current_n2_side**2 / 2
print(f"Current N=2 side: {current_n2_side:.6f}, score: {n2_score:.6f}")

Current N=2:
  x=0.154097, y=-0.038541, deg=203.629378
  x=-0.154097, y=-0.561459, deg=23.629378
Current N=2 side: 0.949504, score: 0.450779


In [8]:
# Packing efficiency per N: the share of the bounding square taken up by
# tree area, i.e. (N * tree_area) / side^2. Larger values mean tighter packing.

# Area of one tree polygon, measured on the unrotated outline at the origin
single_tree = get_tree_polygon(0, 0, 0)
tree_area = single_tree.area
print(f"Single tree area: {tree_area:.6f}")

total_tree_area = per_n_df['n'] * tree_area
square_area = per_n_df['side'] ** 2
per_n_df['efficiency'] = total_tree_area / square_area

efficiency_cols = ['n', 'side', 'score', 'efficiency']
print("\nPacking efficiency by N:")
print(per_n_df[efficiency_cols].head(30).to_string(index=False))

Single tree area: 0.245625

Packing efficiency by N:
 n     side    score  efficiency
 1 0.813173 0.661250    0.371456
 2 0.949504 0.450779    0.544890
 3 1.142031 0.434745    0.564986
 4 1.290806 0.416545    0.589672
 5 1.443692 0.416850    0.589241
 6 1.548438 0.399610    0.614661
 7 1.673104 0.399897    0.614221
 8 1.755921 0.385407    0.637313
 9 1.867280 0.387415    0.634010
10 1.940696 0.376630    0.652165
11 2.033002 0.375736    0.653717
12 2.114873 0.372724    0.659000
13 2.199960 0.372294    0.659761
14 2.277357 0.370454    0.663038
15 2.384962 0.379203    0.647740
16 2.446640 0.374128    0.656527
17 2.508124 0.370040    0.663779
18 2.576409 0.368771    0.666063
19 2.646449 0.368615    0.666345
20 2.742469 0.376057    0.653159
21 2.811667 0.376451    0.652476
22 2.873270 0.375258    0.654549
23 2.912260 0.368750    0.666101
24 2.961779 0.365506    0.672014
25 3.050182 0.372144    0.660026
26 3.118320 0.373997    0.656757
27 3.129442 0.362719    0.677177
28 3.201710 0.366105   

In [9]:
# Order the group sizes from least to most efficient packing; the least
# efficient ones are treated here as the candidates with the most headroom
per_n_df_eff = per_n_df.sort_values(by='efficiency', ascending=True)
least_efficient = per_n_df_eff.head(20)

print("N values with LOWEST efficiency (most room for improvement):")
print(least_efficient.to_string(index=False))

N values with LOWEST efficiency (most room for improvement):
 n     side    score  efficiency
 1 0.813173 0.661250    0.371456
 2 0.949504 0.450779    0.544890
 3 1.142031 0.434745    0.564986
 5 1.443692 0.416850    0.589241
 4 1.290806 0.416545    0.589672
 7 1.673104 0.399897    0.614221
 6 1.548438 0.399610    0.614661
 9 1.867280 0.387415    0.634010
 8 1.755921 0.385407    0.637313
15 2.384962 0.379203    0.647740
10 1.940696 0.376630    0.652165
21 2.811667 0.376451    0.652476
20 2.742469 0.376057    0.653159
11 2.033002 0.375736    0.653717
22 2.873270 0.375258    0.654549
16 2.446640 0.374128    0.656527
26 3.118320 0.373997    0.656757
12 2.114873 0.372724    0.659000
13 2.199960 0.372294    0.659761
25 3.050182 0.372144    0.660026


In [10]:
# Summary statistics, derived entirely from values computed in earlier cells
gap_pct = 100 * gap / current_total
small_n_total = small_n['score'].sum()

print("\n=== SUMMARY ===")
print(f"Current score: {current_total:.6f}")
print(f"Target score: {target_total:.6f}")
print(f"Gap: {gap:.6f} ({gap_pct:.2f}%)")
print(f"\nSmall N (1-20) contribution: {small_n_total:.6f} ({100*small_n_total/current_total:.1f}%)")
print("\nLowest efficiency N values (most improvable):")
for _, row in per_n_df_eff.head(10).iterrows():
    # iterrows yields float-typed rows here, so cast N back to int for display
    print(f"  N={int(row['n'])}: efficiency={row['efficiency']:.4f}, score={row['score']:.6f}")

print("\n=== KEY INSIGHT ===")
# The percentage is computed rather than typed in, so it tracks the current score
print(f"The gap of {gap:.6f} points is {gap_pct:.1f}% of the total score.")
print("This is a HUGE gap that cannot be closed by local optimization.")
print("The target solution likely has fundamentally different packing patterns.")


=== SUMMARY ===
Current score: 70.659437
Target score: 68.919154
Gap: 1.740283 (2.46%)

Small N (1-20) contribution: 8.057151 (11.4%)

Lowest efficiency N values (most improvable):
  N=1: efficiency=0.3715, score=0.661250
  N=2: efficiency=0.5449, score=0.450779
  N=3: efficiency=0.5650, score=0.434745
  N=5: efficiency=0.5892, score=0.416850
  N=4: efficiency=0.5897, score=0.416545
  N=7: efficiency=0.6142, score=0.399897
  N=6: efficiency=0.6147, score=0.399610
  N=9: efficiency=0.6340, score=0.387415
  N=8: efficiency=0.6373, score=0.385407
  N=15: efficiency=0.6477, score=0.379203

=== KEY INSIGHT ===
The gap of 1.740283 points is 2.5% of the total score.
This is a HUGE gap that cannot be closed by local optimization.
The target solution likely has fundamentally different packing patterns.
