# Loop 37 Analysis: Critical Assessment

## Key Questions:
1. What is the actual gap to target?
2. What approaches have been tried vs what's left?
3. What do top solutions do differently?

In [1]:
import pandas as pd
import numpy as np
import json

# Load session state
with open('/home/code/session_state.json', 'r') as f:
    state = json.load(f)

# Analyze experiments
experiments = state['experiments']
print(f"Total experiments: {len(experiments)}")
print(f"\nScore progression:")
for exp in experiments[-15:]:
    print(f"  {exp['name']}: CV={exp['cv_score']:.6f}, LB={exp.get('lb_score', 'N/A')}")

# Best scores
cv_scores = [e['cv_score'] for e in experiments]
best_cv = min(cv_scores)
print(f"\nBest CV: {best_cv:.6f}")
print(f"Target: 68.866853")
print(f"Gap: {best_cv - 68.866853:.6f} ({(best_cv - 68.866853)/68.866853*100:.2f}%)")


Total experiments: 38

Score progression:
  023_branch_and_bound_small_n: CV=70.316492, LB=None
  024_lattice_packing: CV=70.316492, LB=None
  025_interlock_pattern: CV=70.316492, LB=None
  026_jostle_algorithm: CV=70.316492, LB=None
  027_blf_constructive: CV=70.316492, LB=None
  028_final_ensemble: CV=70.315653, LB=None
  029_final_ensemble_v2: CV=70.315537, LB=None
  030_extended_cpp_ensemble: CV=70.315393, LB=None
  031_safe_ensemble: CV=70.315389, LB=None
  032_extended_bbox3: CV=70.315389, LB=None
  033_extended_bbox3_revert: CV=70.315537, LB=None
  034_extended_bbox3_final: CV=70.315537, LB=None
  035_lattice_constructive: CV=70.315537, LB=None
  036_shake_algorithm: CV=70.315537, LB=None
  037_genetic_algorithm: CV=70.315537, LB=None

Best CV: 70.265730
Target: 68.866853
Gap: 1.398877 (2.03%)


In [2]:
# Categorize experiments by approach type
approach_types = {
    'baseline': [],
    'local_search': [],  # SA, exhaustive, NFP
    'ensemble': [],
    'constructive': [],  # lattice, BLF, interlock
    'population': [],  # GA
    'extended_opt': []  # long bbox3 runs
}

for exp in experiments:
    name = exp['name'].lower()
    notes = exp.get('notes', '').lower()
    
    if 'baseline' in name:
        approach_types['baseline'].append(exp)
    elif any(x in name for x in ['sa', 'annealing', 'exhaustive', 'nfp', 'shake', 'numba']):
        approach_types['local_search'].append(exp)
    elif 'ensemble' in name:
        approach_types['ensemble'].append(exp)
    elif any(x in name for x in ['lattice', 'blf', 'interlock', 'jostle', 'constructive']):
        approach_types['constructive'].append(exp)
    elif 'genetic' in name or 'ga' in name:
        approach_types['population'].append(exp)
    elif '8hr' in name or 'extended' in name:
        approach_types['extended_opt'].append(exp)
    else:
        approach_types['local_search'].append(exp)  # default

print("Experiments by approach type:")
for approach, exps in approach_types.items():
    if exps:
        best = min(e['cv_score'] for e in exps)
        print(f"  {approach}: {len(exps)} experiments, best={best:.6f}")


Experiments by approach type:
  baseline: 2 experiments, best=70.523320
  local_search: 11 experiments, best=70.315389
  ensemble: 13 experiments, best=70.265730
  constructive: 5 experiments, best=70.315537
  population: 3 experiments, best=70.315537
  extended_opt: 4 experiments, best=70.315389


In [3]:
# Analyze per-N scores from best submission
baseline_path = '/home/code/experiments/029_final_ensemble_v2/submission.csv'
df = pd.read_csv(baseline_path)

# Parse coordinates
def parse_coord(val):
    if isinstance(val, str) and val.startswith('s'):
        return float(val[1:])
    return float(val)

df['n'] = df['id'].apply(lambda x: int(str(x).split('_')[0]))
df['i'] = df['id'].apply(lambda x: int(str(x).split('_')[1]))
for col in ['x', 'y', 'deg']:
    df[col] = df[col].apply(parse_coord)

# Calculate per-N scores
per_n_scores = {}
for n in range(1, 201):
    n_df = df[df['n'] == n]
    if len(n_df) > 0:
        min_x = n_df['x'].min()
        max_x = n_df['x'].max()
        min_y = n_df['y'].min()
        max_y = n_df['y'].max()
        # This is approximate - need to account for tree geometry
        side = max(max_x - min_x, max_y - min_y) + 1.0  # rough tree size
        per_n_scores[n] = side**2 / n

print("Top 10 highest score contributors (worst N values):")
sorted_scores = sorted(per_n_scores.items(), key=lambda x: x[1], reverse=True)
for n, score in sorted_scores[:10]:
    print(f"  N={n}: {score:.4f}")

print("\nTotal score:", sum(per_n_scores.values()))


Top 10 highest score contributors (worst N values):
  N=2: 1.1596
  N=1: 1.0000
  N=4: 0.8688
  N=3: 0.8452
  N=5: 0.7382
  N=8: 0.6782
  N=6: 0.6762
  N=7: 0.6445
  N=12: 0.6027
  N=9: 0.5957

Total score: 89.35940420926349


In [4]:
# What's the theoretical minimum?
# For N trees, the minimum bounding box is limited by the tree geometry
# Tree dimensions: width ~0.7, height ~1.0

# Theoretical analysis:
# - N=1: Single tree, min side = max(0.7, 1.0) = 1.0, score = 1.0
# - But with rotation, we can get smaller bounding box
# - Optimal N=1 rotation gives ~0.813 (from baseline)

# The gap to target is 1.45 points (2.1%)
# This is distributed across all 200 N values
# Average improvement needed per N: 1.45/200 = 0.00725

print("Gap analysis:")
print(f"  Current best: 70.315537")
print(f"  Target: 68.866853")
print(f"  Gap: 1.448684 points")
print(f"  Average improvement needed per N: {1.448684/200:.6f}")
print(f"")
print("  If we improve 50 N values by 0.03 each: 1.5 points")
print("  If we improve 100 N values by 0.015 each: 1.5 points")
print("  If we improve 200 N values by 0.0075 each: 1.5 points")


Gap analysis:
  Current best: 70.315537
  Target: 68.866853
  Gap: 1.448684 points
  Average improvement needed per N: 0.007243

  If we improve 50 N values by 0.03 each: 1.5 points
  If we improve 100 N values by 0.015 each: 1.5 points
  If we improve 200 N values by 0.0075 each: 1.5 points


In [5]:
# Check what external data sources have been tried
print("External data sources mentioned in experiments:")
for exp in experiments:
    notes = exp.get('notes', '')
    if 'external' in notes.lower() or 'snapshot' in notes.lower() or 'csv' in notes.lower():
        print(f"  {exp['name']}: {notes[:200]}...")


External data sources mentioned in experiments:
  000_baseline: Baseline from best pre-optimized snapshot ensemble (21328309254/003_valid_ensemble). Score 70.523320 vs target 68.882921, gap of 1.64 points. N=1 is already optimal at 0.6612. Top score contributors a...
  001_valid_baseline: Valid baseline from snapshot 21337107511 that PASSED Kaggle validation with LB score 70.615106516706. This submission has high precision coordinates (20+ decimal places) which is required to pass Kagg...
  007_ensemble_fractional: BREAKTHROUGH: Ensemble from all snapshots achieved 70.265730 vs baseline 70.615102 - improvement of 0.349 points! Key insight: N=24 alone contributed 0.348 improvement (99% of total gain). Found 43 N ...
  008_snapshot_ensemble: Ensemble of best per-N solutions from 3512 snapshot files. Fractional translation found no improvements (baseline is at strong local optimum). Ensemble approach found 167/200 N values with improvement...
  009_highprec_ensemble: High-precision ensemb

In [6]:
# Key insight from research:
# Top teams run bbox3 for 24-72 HOURS with 24+ CPUs
# Our longest run was 53 minutes on 1 CPU
# That's 1/648th to 1/1944th of top competitor compute

print("COMPUTE TIME ANALYSIS:")
print("")
print("Top competitors:")
print("  - Run time: 24-72 hours")
print("  - CPUs: 24+")
print("  - Total compute: 576-1728 CPU-hours")
print("")
print("Our best attempt:")
print("  - Run time: 53 minutes")
print("  - CPUs: 1")
print("  - Total compute: 0.88 CPU-hours")
print("")
print("Ratio: Our compute is 1/655 to 1/1964 of top competitors")
print("")
print("CONCLUSION: We have NOT tried extended optimization at scale.")
print("This is the ONLY approach that top teams use that we haven't tried.")


COMPUTE TIME ANALYSIS:

Top competitors:
  - Run time: 24-72 hours
  - CPUs: 24+
  - Total compute: 576-1728 CPU-hours

Our best attempt:
  - Run time: 53 minutes
  - CPUs: 1
  - Total compute: 0.88 CPU-hours

Ratio: Our compute is 1/655 to 1/1964 of top competitors

CONCLUSION: We have NOT tried extended optimization at scale.
This is the ONLY approach that top teams use that we haven't tried.
