# Loop 5 Analysis: Strategy Assessment

## Key Questions:
1. What is the gap between our best score and the target?
2. What approaches have NOT been tried?
3. What can we learn from the snapshot whose submission contains overlapping trees?
4. Can we run the C++ optimizer on our current best?

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
import os
import glob

# Tree geometry
# TX and TY hold the x and y coordinates of the 15 outline vertices of one
# tree, in matching order; zipping them yields the (x, y) vertex sequence
# handed to Polygon.
TX = np.array([0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125])
TY = np.array([0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5])
# Template polygon in its own coordinate frame; create_tree() rotates it about
# (0, 0) and then translates it to place an individual tree.
BASE_TREE = Polygon(zip(TX, TY))

def create_tree(x, y, deg):
    """Place one tree: rotate BASE_TREE about (0, 0), then shift it by (x, y).

    Args:
        x: Offset applied along the x axis after rotation.
        y: Offset applied along the y axis after rotation.
        deg: Rotation angle handed to ``affinity.rotate`` (which interprets it
            as degrees unless told otherwise).

    Returns:
        The transformed polygon; BASE_TREE itself is not modified.
    """
    rotated = affinity.rotate(BASE_TREE, deg, origin=(0, 0))
    return affinity.translate(rotated, x, y)

def get_side(trees):
    """Return the side of the smallest axis-aligned square covering all trees.

    Args:
        trees: Collection of objects exposing ``bounds`` as
            (min_x, min_y, max_x, max_y).

    Returns:
        The larger of the combined bounding box's width and height, or 0 when
        ``trees`` is empty.
    """
    if not trees:
        return 0

    lo_x = lo_y = float('inf')
    hi_x = hi_y = float('-inf')
    for shape in trees:
        left, bottom, right, top = shape.bounds
        # Grow the running bounding box only when this shape sticks out of it.
        if left < lo_x:
            lo_x = left
        if bottom < lo_y:
            lo_y = bottom
        if right > hi_x:
            hi_x = right
        if top > hi_y:
            hi_y = top

    width = hi_x - lo_x
    height = hi_y - lo_y
    return max(width, height)

def check_overlap(trees, tolerance=1e-9):
    """Report whether any two trees share more than ``tolerance`` of area.

    Args:
        trees: Indexable sequence of geometries providing ``intersects`` and
            ``intersection``; the intersection result must expose ``area``.
        tolerance: Intersection areas at or below this value are ignored.

    Returns:
        True as soon as one pair exceeds the tolerance, otherwise False.
    """
    count = len(trees)
    for first in range(count):
        shape_a = trees[first]
        for second in range(first + 1, count):
            shape_b = trees[second]
            # Cheap predicate first; only build the intersection when needed.
            if not shape_a.intersects(shape_b):
                continue
            if shape_a.intersection(shape_b).area > tolerance:
                return True
    return False

# Marker line so the cell output shows the helper definitions above ran.
print('Utilities loaded')

Utilities loaded


In [2]:
# Load current best submission
df_current = pd.read_csv('/home/submission/submission.csv')

# Score we are trying to reach. Named once so the printed target and the gap
# arithmetic below cannot drift apart.
TARGET_SCORE = 68.919154

# Calculate per-N scores
# per_n_scores maps n -> side**2 / n, where side comes from get_side() over
# the trees whose id starts with the zero-padded prefix for n.
per_n_scores = {}
for n in range(1, 201):
    prefix = f'{n:03d}_'
    n_rows = df_current[df_current['id'].str.startswith(prefix)]
    # A group with missing or extra rows would otherwise be scored silently
    # (an empty group scores 0 and deflates the total), so fail loudly instead.
    if len(n_rows) != n:
        raise ValueError(
            f'Submission has {len(n_rows)} rows for N={n}; expected {n}'
        )
    trees = []
    for _, row in n_rows.iterrows():
        # Values are stripped of the letter 's' before being parsed as floats.
        x = float(str(row['x']).replace('s', ''))
        y = float(str(row['y']).replace('s', ''))
        deg = float(str(row['deg']).replace('s', ''))
        trees.append(create_tree(x, y, deg))
    side = get_side(trees)
    per_n_scores[n] = side**2 / n

total_score = sum(per_n_scores.values())
score_gap = total_score - TARGET_SCORE
print(f'Current total score: {total_score:.6f}')
print(f'Target: {TARGET_SCORE:.6f}')
print(f'Gap: {score_gap:.6f} ({score_gap / TARGET_SCORE * 100:.2f}%)')

Current total score: 70.659437
Target: 68.919154
Gap: 1.740283 (2.53%)


In [3]:
# Analyze per-N efficiency
print('\nPer-N Efficiency Analysis (worst 20):')
print('=' * 60)

# The area of a single tree is used as the theoretical minimum per-tree score.
tree_area = BASE_TREE.area
print(f'Tree area: {tree_area:.6f}')

# Rows of (n, score, efficiency %), where efficiency = tree_area / score * 100,
# ordered from the least efficient N to the most efficient.
efficiencies = sorted(
    ((size, value, tree_area / value * 100) for size, value in per_n_scores.items()),
    key=lambda entry: entry[2],
)

print(f'\n{"N":>4} {"Score":>12} {"Efficiency":>10} {"Contribution":>12}')
# The "Contribution" column repeats the score: the total is the plain sum of
# the per-N scores, so each score is its own contribution.
for size, value, pct in efficiencies[:20]:
    print(f'{size:4d} {value:12.6f} {pct:9.2f}% {value:12.6f}')


Per-N Efficiency Analysis (worst 20):
Tree area: 0.245625

   N        Score Efficiency Contribution
   1     0.661250     37.15%     0.661250
   2     0.450779     54.49%     0.450779
   3     0.434745     56.50%     0.434745
   5     0.416850     58.92%     0.416850
   4     0.416545     58.97%     0.416545
   7     0.399897     61.42%     0.399897
   6     0.399610     61.47%     0.399610
   9     0.387415     63.40%     0.387415
   8     0.385407     63.73%     0.385407
  15     0.379203     64.77%     0.379203
  10     0.376630     65.22%     0.376630
  21     0.376451     65.25%     0.376451
  20     0.376057     65.32%     0.376057
  11     0.375736     65.37%     0.375736
  22     0.375258     65.45%     0.375258
  16     0.374128     65.65%     0.374128
  26     0.373997     65.68%     0.373997
  12     0.372724     65.90%     0.372724
  13     0.372294     65.98%     0.372294
  25     0.372144     66.00%     0.372144


In [4]:
# Load the overlapping snapshot and compare
overlap_snapshot = '/home/nonroot/snapshots/santa-2025/21145966992/submission/submission.csv'
if os.path.exists(overlap_snapshot):
    df_overlap = pd.read_csv(overlap_snapshot)

    print('Comparing overlapping snapshot with current best:')
    print('=' * 70)

    # Rows of (n, current score, snapshot score, improvement, has_overlap).
    # Kept under its own name so it is not confused with the differently
    # shaped `improvements` list built by the all-snapshots search.
    overlap_improvements = []
    for n in range(1, 201):
        prefix = f'{n:03d}_'

        # Current best: already computed when the current submission was scored.
        score_current = per_n_scores[n]

        # Overlapping snapshot
        n_rows_overlap = df_overlap[df_overlap['id'].str.startswith(prefix)]
        # An incomplete group would score artificially low (an empty one
        # scores 0) and be reported as a clean improvement, so skip it.
        if len(n_rows_overlap) != n:
            continue
        trees_overlap = [
            create_tree(
                float(str(row['x']).replace('s', '')),
                float(str(row['y']).replace('s', '')),
                float(str(row['deg']).replace('s', '')),
            )
            for _, row in n_rows_overlap.iterrows()
        ]
        score_overlap = get_side(trees_overlap)**2 / n

        if score_overlap < score_current:
            # The pairwise overlap test is the expensive part; only run it for
            # groups that would actually be reported.
            has_overlap = check_overlap(trees_overlap)
            overlap_improvements.append(
                (n, score_current, score_overlap, score_current - score_overlap, has_overlap)
            )

    print(f'\nFound {len(overlap_improvements)} N values where overlapping snapshot is better:')
    print(f'{"N":>4} {"Current":>12} {"Overlap":>12} {"Improve":>10} {"Has Overlap":>12}')

    ranked = sorted(overlap_improvements, key=lambda rec: -rec[3])
    # Only the 30 largest improvements are listed ...
    for n, curr, ovl, imp, has_ovl in ranked[:30]:
        status = 'OVERLAP' if has_ovl else 'CLEAN'
        print(f'{n:4d} {curr:12.6f} {ovl:12.6f} {imp:10.6f} {status:>12}')

    # ... but the totals cover every improvement found, not just the printed rows.
    total_potential = sum(rec[3] for rec in ranked)
    clean_improvements = sum(rec[3] for rec in ranked if not rec[4])

    print(f'\nTotal potential improvement: {total_potential:.6f}')
    print(f'Clean (no overlap) improvement: {clean_improvements:.6f}')
else:
    print('Overlapping snapshot not found')

Comparing overlapping snapshot with current best:



Found 67 N values where overlapping snapshot is better:
   N      Current      Overlap    Improve  Has Overlap
   5     0.416850     0.394109   0.022740      OVERLAP
   2     0.450779     0.437328   0.013452      OVERLAP
  56     0.352281     0.340953   0.011327      OVERLAP
  55     0.355023     0.346789   0.008234      OVERLAP
  54     0.359200     0.352169   0.007031      OVERLAP
   4     0.416545     0.411056   0.005489      OVERLAP
  71     0.352232     0.348328   0.003905      OVERLAP
  53     0.361855     0.358787   0.003069      OVERLAP
  80     0.344880     0.343654   0.001227      OVERLAP
  94     0.352271     0.351150   0.001121      OVERLAP
  88     0.350671     0.349550   0.001121      OVERLAP
  47     0.357493     0.356418   0.001075      OVERLAP
 140     0.340098     0.339163   0.000935      OVERLAP
 108     0.343559     0.342627   0.000932      OVERLAP
 168     0.332475     0.331548   0.000927      OVERLAP
  77     0.351113     0.350211   0.000903      OVERLAP
  69    

In [5]:
# Check all snapshots for clean improvements
print('\nSearching ALL snapshots for clean improvements...')
print('=' * 70)

snapshot_dir = '/home/nonroot/snapshots/santa-2025/'
# Sorted so that the scan order (and therefore which snapshot is credited when
# several offer the same improvement) is the same on every run. A missing
# directory is treated as "no snapshots" rather than an error.
snapshots = sorted(
    d
    for d in (os.listdir(snapshot_dir) if os.path.isdir(snapshot_dir) else [])
    if os.path.isdir(os.path.join(snapshot_dir, d)) and d != '.git'
)

# Best overlap-free score seen so far for each n, seeded with the current submission.
best_per_n = {n: {'score': per_n_scores[n], 'source': 'current', 'has_overlap': False} for n in range(1, 201)}

# (snapshot name, exception) for every snapshot that could not be fully processed.
failed_snapshots = []

for snap in snapshots:
    csv_path = os.path.join(snapshot_dir, snap, 'submission', 'submission.csv')
    if not os.path.exists(csv_path):
        continue

    try:
        df_snap = pd.read_csv(csv_path)

        for n in range(1, 201):
            prefix = f'{n:03d}_'
            n_rows = df_snap[df_snap['id'].str.startswith(prefix)]
            if len(n_rows) != n:
                continue

            trees = []
            for _, row in n_rows.iterrows():
                x = float(str(row['x']).replace('s', ''))
                y = float(str(row['y']).replace('s', ''))
                deg = float(str(row['deg']).replace('s', ''))
                trees.append(create_tree(x, y, deg))

            score = get_side(trees)**2 / n

            # Cheap score comparison first; the pairwise overlap check only
            # runs for candidates that would actually replace the current best.
            if score < best_per_n[n]['score'] - 1e-9 and not check_overlap(trees):
                best_per_n[n] = {'score': score, 'source': snap, 'has_overlap': False}
    except Exception as exc:
        # Keep scanning the remaining snapshots, but make the failure visible:
        # otherwise "0 improvements" cannot be told apart from "nothing parsed".
        failed_snapshots.append((snap, exc))
        print(f'  ! Skipping rest of snapshot {snap}: {type(exc).__name__}: {exc}')
        continue

if failed_snapshots:
    print(f'{len(failed_snapshots)} snapshot(s) could not be fully processed')

# Count improvements
improvements = [(n, per_n_scores[n], best_per_n[n]['score'], best_per_n[n]['source'])
                for n in range(1, 201)
                if best_per_n[n]['source'] != 'current']

# Always defined (0 when nothing improved) so later cells can read it safely.
total_improvement = sum(curr - better for _, curr, better, _ in improvements)

print(f'Found {len(improvements)} N values with clean improvements from snapshots:')
if improvements:
    print(f'Total potential improvement: {total_improvement:.6f}')
    print(f'\n{"N":>4} {"Current":>12} {"Better":>12} {"Improve":>10} {"Source":>15}')
    for n, curr, better, source in sorted(improvements, key=lambda x: -(x[1] - x[2]))[:20]:
        print(f'{n:4d} {curr:12.6f} {better:12.6f} {curr-better:10.6f} {source:>15}')


Searching ALL snapshots for clean improvements...


Found 0 N values with clean improvements from snapshots:


In [6]:
# Check if we can run the C++ optimizer
import subprocess

print('Testing C++ optimizer...')
print('=' * 50)

optimizer_path = '/home/code/sa_v1_parallel'

# Check if optimizer exists
if os.path.exists(optimizer_path):
    print('✓ sa_v1_parallel exists')

    # Try running it on a small test
    # NOTE(review): the recorded output of this cell suggests the binary does
    # not treat --help specially and starts a normal run instead - confirm.
    try:
        result = subprocess.run([optimizer_path, '--help'],
                                capture_output=True, text=True, timeout=5)
    except subprocess.TimeoutExpired:
        # subprocess.run kills the child when the timeout elapses and then raises.
        print('Help output: <no exit within 5 seconds; process was killed>')
    except OSError as exc:
        # The file exists but could not be launched (e.g. it is not executable).
        print(f'Could not launch sa_v1_parallel: {exc}')
    else:
        print(f'Help output: {result.stdout[:200] if result.stdout else result.stderr[:200]}')
else:
    print('✗ sa_v1_parallel not found')

Testing C++ optimizer...
✓ sa_v1_parallel exists
Help output: Using 26 threads
Input: submission.csv, Output: submission_opt.csv
SA iterations: 15000, Restarts: 5



In [7]:
# Summary of findings
banner = '=' * 70
print('\n' + banner)
print('SUMMARY OF FINDINGS')
print(banner)

# Distance between the current total and the target score, computed once.
gap = total_score - 68.919154
print(f'\n1. Current Score: {total_score:.6f}')
print(f'   Target: 68.919154')
print(f'   Gap: {gap:.6f} ({gap / 68.919154 * 100:.2f}%)')

# `efficiencies` is sorted worst-first, so its head holds the least efficient N values.
print(f'\n2. Worst efficiency N values (where most improvement is possible):')
for size, value, pct in efficiencies[:5]:
    print(f'   N={size}: efficiency={pct:.1f}%, score={value:.6f}')

# `improvements` / `total_improvement` come from the all-snapshots search.
print(f'\n3. Clean improvements from snapshots: {len(improvements)} N values')
if improvements:
    print(f'   Total potential: {total_improvement:.6f}')

# Hand-written conclusion; not derived from any value computed in this notebook.
print(f'\n4. Key insight: Top kernels use 19+ sources, we have ~7 unique sources')
print(f'   The gap is likely due to missing data sources, not algorithm quality')


SUMMARY OF FINDINGS

1. Current Score: 70.659437
   Target: 68.919154
   Gap: 1.740283 (2.53%)

2. Worst efficiency N values (where most improvement is possible):
   N=1: efficiency=37.1%, score=0.661250
   N=2: efficiency=54.5%, score=0.450779
   N=3: efficiency=56.5%, score=0.434745
   N=5: efficiency=58.9%, score=0.416850
   N=4: efficiency=59.0%, score=0.416545

3. Clean improvements from snapshots: 0 N values

4. Key insight: Top kernels use 19+ sources, we have ~7 unique sources
   The gap is likely due to missing data sources, not algorithm quality
