# Loop 9 Analysis: Understanding the Gap to Target

## Key Questions:
1. What is the theoretical minimum score?
2. What techniques have NOT been tried?
3. What do top kernels do differently?

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely.ops import unary_union
import matplotlib.pyplot as plt

# Tree geometry
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

def parse_value(s):
    if isinstance(s, str) and s.startswith('s'):
        return float(s[1:])
    return float(s)

def create_tree_polygon(x, y, deg):
    angle_rad = np.radians(deg)
    cos_a, sin_a = np.cos(angle_rad), np.sin(angle_rad)
    vertices = [(tx * cos_a - ty * sin_a + x, tx * sin_a + ty * cos_a + y) for tx, ty in zip(TX, TY)]
    return Polygon(vertices)

def compute_score_for_n(df, n):
    prefix = f'{n:03d}_'
    trees = df[df['id'].str.startswith(prefix)]
    if len(trees) != n:
        return float('inf')
    
    all_points = []
    for _, row in trees.iterrows():
        x = parse_value(row['x'])
        y = parse_value(row['y'])
        deg = parse_value(row['deg'])
        angle_rad = np.radians(deg)
        cos_a, sin_a = np.cos(angle_rad), np.sin(angle_rad)
        for tx, ty in zip(TX, TY):
            px = tx * cos_a - ty * sin_a + x
            py = tx * sin_a + ty * cos_a + y
            all_points.append([px, py])
    
    all_points = np.array(all_points)
    side = max(all_points.max(axis=0) - all_points.min(axis=0))
    return side**2 / n

print('Functions defined')

Functions defined


In [2]:
# Load baseline and analyze per-N scores
df_baseline = pd.read_csv('/home/code/external_data/saspav/santa-2025.csv')

per_n_scores = []
for n in range(1, 201):
    score = compute_score_for_n(df_baseline, n)
    per_n_scores.append({'n': n, 'score': score})

df_scores = pd.DataFrame(per_n_scores)
df_scores['theoretical'] = df_scores['n'].apply(lambda n: 0.7 * 1.0 / n)  # Rough theoretical minimum
df_scores['efficiency'] = df_scores['score'] / df_scores['theoretical']

print('Current total score:', df_scores['score'].sum())
print('Target score:', 68.919154)
print('Gap:', df_scores['score'].sum() - 68.919154)
print()
print('Top 10 worst efficiency (most room for improvement):')
print(df_scores.nlargest(10, 'efficiency')[['n', 'score', 'efficiency']])

Current total score: 70.65995922534738
Target score: 68.919154
Gap: 1.7408052253473727

Top 10 worst efficiency (most room for improvement):
       n     score  efficiency
199  200  0.337564   96.446908
198  199  0.338269   96.165171
197  198  0.337316   95.412331
196  197  0.335990   94.557291
195  196  0.333268   93.315094
194  195  0.332617   92.657575
193  194  0.332999   92.288433
192  193  0.333764   92.023519
191  192  0.335301   91.968211
190  191  0.336758   91.886876


In [3]:
# Calculate how much improvement is needed per N to reach target
target = 68.919154
current = df_scores['score'].sum()
gap = current - target

print(f'Total gap to close: {gap:.6f}')
print(f'Average improvement needed per N: {gap/200:.6f}')
print()

# If we could improve each N proportionally
df_scores['target_score'] = df_scores['score'] * (target / current)
df_scores['improvement_needed'] = df_scores['score'] - df_scores['target_score']

print('Top 10 N values with most improvement potential (absolute):')
print(df_scores.nlargest(10, 'improvement_needed')[['n', 'score', 'target_score', 'improvement_needed']])

Total gap to close: 1.740805
Average improvement needed per N: 0.008704

Top 10 N values with most improvement potential (absolute):
     n     score  target_score  improvement_needed
0    1  0.661250      0.644959            0.016291
1    2  0.450779      0.439674            0.011106
2    3  0.434745      0.424035            0.010711
4    5  0.416850      0.406580            0.010270
3    4  0.416545      0.406283            0.010262
6    7  0.399897      0.390045            0.009852
5    6  0.399610      0.389765            0.009845
8    9  0.387415      0.377871            0.009545
7    8  0.385407      0.375912            0.009495
14  15  0.379203      0.369861            0.009342


In [4]:
# Analyze the structure of the baseline solution
# What patterns are used? What angles are common?

df_baseline['x_val'] = df_baseline['x'].apply(parse_value)
df_baseline['y_val'] = df_baseline['y'].apply(parse_value)
df_baseline['deg_val'] = df_baseline['deg'].apply(parse_value)

print('Angle distribution:')
print(df_baseline['deg_val'].describe())
print()
print('Most common angles:')
print(df_baseline['deg_val'].round(0).value_counts().head(10))

Angle distribution:
count    20100.000000
mean       211.308982
std       1051.453600
min     -28310.942136
25%         76.774005
50%        206.085693
75%        257.903314
max      41477.841146
Name: deg_val, dtype: float64

Most common angles:
deg_val
68.0     1756
248.0    1754
75.0      539
255.0     533
338.0     531
158.0     492
257.0     492
260.0     486
77.0      476
170.0     473
Name: count, dtype: int64


In [5]:
# Check if there are any patterns in the best configurations
# Look at N=181-200 which have best efficiency

for n in [181, 190, 200]:
    prefix = f'{n:03d}_'
    trees = df_baseline[df_baseline['id'].str.startswith(prefix)]
    print(f'\nN={n}:')
    print(f'  X range: {trees["x_val"].min():.4f} to {trees["x_val"].max():.4f}')
    print(f'  Y range: {trees["y_val"].min():.4f} to {trees["y_val"].max():.4f}')
    print(f'  Angle range: {trees["deg_val"].min():.2f} to {trees["deg_val"].max():.2f}')
    print(f'  Unique angles: {trees["deg_val"].round(0).nunique()}')
    print(f'  Score: {compute_score_for_n(df_baseline, n):.6f}')


N=181:
  X range: -3.5994 to 3.5994
  Y range: -3.9168 to 3.3365
  Angle range: 67.74 to 13566.85
  Unique angles: 10
  Score: 0.329946

N=190:
  X range: -3.8003 to 3.8003
  Y range: -3.9783 to 3.3783
  Angle range: -1903.43 to 266.93
  Unique angles: 17
  Score: 0.338231

N=200:
  X range: -3.8969 to 3.8979
  Y range: -4.0772 to 3.4772
  Angle range: 76.81 to 293.63
  Unique angles: 16
  Score: 0.337564


In [6]:
# Key insight: The gap is 1.74 points (2.5%)
# This is a SIGNIFICANT gap that cannot be closed by local optimization
# 
# What approaches have NOT been tried:
# 1. Longer optimization runs (hours, not minutes)
# 2. Different construction approaches (rebuild from corners)
# 3. Constraint programming / exact solvers
# 4. Genetic algorithms with crossover
# 5. Multi-start from different initial configurations

print('SUMMARY OF SITUATION:')
print('='*60)
print(f'Current best: 70.659959')
print(f'Target: 68.919154')
print(f'Gap: 1.740805 (2.5%)')
print()
print('WHAT HAS BEEN TRIED:')
print('- Local SA optimization: 0 improvement')
print('- Lattice construction: Much worse (85-88)')
print('- Rotation optimization: 0 improvement')
print('- bbox3 optimizer (3.5 min): 0.0000006 improvement')
print('- Eazy optimizer: 0.000015 improvement (but failed on Kaggle)')
print()
print('WHAT HAS NOT BEEN TRIED:')
print('- MUCH LONGER optimization runs (hours, not minutes)')
print('- Rebuild from corners approach')
print('- Different starting configurations')
print('- Genetic algorithms with crossover')
print('- Constraint programming / exact solvers')

SUMMARY OF SITUATION:
Current best: 70.659959
Target: 68.919154
Gap: 1.740805 (2.5%)

WHAT HAS BEEN TRIED:
- Local SA optimization: 0 improvement
- Lattice construction: Much worse (85-88)
- Rotation optimization: 0 improvement
- bbox3 optimizer (3.5 min): 0.0000006 improvement
- Eazy optimizer: 0.000015 improvement (but failed on Kaggle)

WHAT HAS NOT BEEN TRIED:
- MUCH LONGER optimization runs (hours, not minutes)
- Rebuild from corners approach
- Different starting configurations
- Genetic algorithms with crossover
- Constraint programming / exact solvers
