# Loop 3 Analysis: Understanding the Overlap Issue

The submission failed with 'Overlapping trees in group 008'. Let's analyze:
1. Why did bbox3 introduce overlaps?
2. What's the difference between validated and rejected submissions?
3. What approaches can actually improve the score?

In [None]:
import pandas as pd
import numpy as np
import math
import os
import json

# Tree geometry: one (x, y) pair per vertex; TX and TY hold the same
# vertices split per axis, paired by index.
_TREE_VERTICES = [
    (0, 0.8),
    (0.125, 0.5),
    (0.0625, 0.5),
    (0.2, 0.25),
    (0.1, 0.25),
    (0.35, 0),
    (0.075, 0),
    (0.075, -0.2),
    (-0.075, -0.2),
    (-0.075, 0),
    (-0.35, 0),
    (-0.1, 0.25),
    (-0.2, 0.25),
    (-0.0625, 0.5),
    (-0.125, 0.5),
]
TX = np.array([vx for vx, _ in _TREE_VERTICES])
TY = np.array([vy for _, vy in _TREE_VERTICES])

print('Loaded')

In [None]:
# Check the validation status of different snapshots
def format_cv_score(value):
    """Render a CV score for display.

    Numbers are shown with six decimals, a missing score (``None``) becomes
    ``'N/A'``, and any other value is shown via ``str`` so it is never hidden.
    Applying ``:.6f`` directly to a non-number would raise instead.
    """
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return f'{value:.6f}'
    if value is None:
        return 'N/A'
    return str(value)


def describe_submission(sub):
    """Return the one-line CV / LB / error summary for one submission dict."""
    cv_text = format_cv_score(sub.get('cv_score'))
    lb = sub.get('lb_score', 'N/A')
    err = sub.get('error', None)
    return f"  CV: {cv_text}, LB: {lb}, Error: {err}"


def print_recent_submissions(snapshot_id, state, count=5):
    """Print the last ``count`` entries of ``state['submissions']``.

    ``state`` is the dict loaded from a snapshot's session_state.json; a
    missing ``'submissions'`` key is treated as an empty list.
    """
    submissions = state.get('submissions', [])
    print(f'\n{snapshot_id} submissions (last {count}):')
    # Explicit start index so count=0 prints nothing (a [-0:] slice would
    # return the whole list).
    for sub in submissions[max(len(submissions) - count, 0):]:
        print(describe_submission(sub))


print('=== Checking snapshot validation status ===')

# Check 21198893057 (validated)
path1 = '/home/nonroot/snapshots/santa-2025/21198893057/code/session_state.json'
if os.path.exists(path1):
    with open(path1) as f:
        state1 = json.load(f)
    print_recent_submissions('21198893057', state1)

# Check 21198927060 (source of rejected)
path2 = '/home/nonroot/snapshots/santa-2025/21198927060/code/session_state.json'
if os.path.exists(path2):
    with open(path2) as f:
        state2 = json.load(f)
    print_recent_submissions('21198927060', state2)

In [None]:
# KEY INSIGHT: Check if 21198927060's submission.csv was ever validated
print('=== Checking if 21198927060 submission was validated ===')

# The submission.csv in 21198927060 scores 70.624424.
# Look through the recorded submissions for one whose CV score is within
# 0.001 of that value and show its LB score / error, if any.

if os.path.exists(path2):
    with open(path2) as state_file:
        state2 = json.load(state_file)

    for entry in state2.get('submissions', []):
        entry_cv = entry.get('cv_score', 0)
        if not (abs(entry_cv - 70.624424) < 0.001):
            continue
        print(
            "Found matching submission:",
            f"  CV: {entry_cv}",
            f"  LB: {entry.get('lb_score', 'N/A')}",
            f"  Error: {entry.get('error', None)}",
            sep='\n',
        )

In [None]:
# Check all submissions in 21198927060 to see which ones passed
print('=== All submissions in 21198927060 ===')
if os.path.exists(path2):
    with open(path2) as state_file:
        state2 = json.load(state_file)

    # A submission with an error counts as failed; otherwise it counts as
    # passed only when it has a truthy LB score. Entries with neither are
    # left out of both lists.
    passed, failed = [], []
    for entry in state2.get('submissions', []):
        entry_cv = entry.get('cv_score', 0)
        entry_lb = entry.get('lb_score', None)
        entry_err = entry.get('error', None)

        if entry_err:
            failed.append((entry_cv, entry_err))
        elif entry_lb:
            passed.append((entry_cv, entry_lb))

    # Only the five most recent entries of each list are shown.
    print(f'\nPassed submissions: {len(passed)}')
    for shown_cv, shown_lb in passed[-5:]:
        print(f'  CV: {shown_cv:.6f}, LB: {shown_lb}')

    print(f'\nFailed submissions: {len(failed)}')
    for shown_cv, shown_err in failed[-5:]:
        print(f'  CV: {shown_cv:.6f}, Error: {shown_err}')

In [None]:
# CRITICAL FINDING: The submission.csv in 21198927060 (70.624424) was NEVER validated!
# It was the result of further optimization AFTER the last validated submission.
# The last validated submission in that snapshot was 70.626088.

# Let's find the best validated submission across all snapshots
def find_best_validated(snapshot_dir):
    """Return the lowest-LB error-free submission found under ``snapshot_dir``.

    Every ``<snapshot_dir>/<snap>/code/session_state.json`` that exists is
    loaded and its ``'submissions'`` list scanned. A submission qualifies when
    its ``lb_score`` is a truthy int/float and its ``error`` is falsy.

    Returns a dict with keys ``snapshot``, ``cv_score``, ``lb_score`` and
    ``model``, or ``None`` when the directory is missing or nothing qualifies.
    """
    if not os.path.isdir(snapshot_dir):
        return None

    best = None
    lowest_lb = float('inf')
    # Sorted so ties on lb_score resolve the same way on every run
    # (os.listdir order is arbitrary).
    for snap in sorted(os.listdir(snapshot_dir)):
        state_path = os.path.join(snapshot_dir, snap, 'code', 'session_state.json')
        if not os.path.exists(state_path):
            continue
        with open(state_path) as f:
            state = json.load(f)

        for sub in state.get('submissions', []):
            lb = sub.get('lb_score', None)
            err = sub.get('error', None)
            if not (lb and not err and isinstance(lb, (int, float))):
                continue
            if lb < lowest_lb:
                lowest_lb = lb
                best = {
                    'snapshot': snap,
                    'cv_score': sub.get('cv_score'),
                    'lb_score': lb,
                    'model': sub.get('model_name'),
                }
    return best


print('=== Finding best validated submission ===')

snapshot_dir = '/home/nonroot/snapshots/santa-2025'
best_validated = find_best_validated(snapshot_dir)
best_lb = best_validated['lb_score'] if best_validated is not None else float('inf')

print('\nBest validated submission:')
if best_validated is None:
    # Nothing qualified (or the snapshot directory is missing); say so rather
    # than failing on a None lookup.
    print('  None found')
else:
    print(f'  Snapshot: {best_validated["snapshot"]}')
    print(f'  CV: {best_validated["cv_score"]}')
    print(f'  LB: {best_validated["lb_score"]}')
    print(f'  Model: {best_validated["model"]}')

In [None]:
# Now let's understand the gap and what approaches might work
print('=== Gap Analysis ===')
target = 68.892266
best_lb = 70.626088  # Best validated (hard-coded here)

# Absolute gap to the target, and the same gap as a percentage of the target.
gap = best_lb - target
gap_pct = (gap / target) * 100

print(f'Target: {target}')
print(f'Best validated LB: {best_lb}')
print(f'Gap: {gap:.6f} ({gap_pct:.2f}%)')

print('\n=== What this means ===')
# Derived from the computed gap so the message cannot drift from the numbers
# above if target or best_lb are edited.
improvement_needed = f'1. We need to improve by {gap:.2f} points ({gap_pct:.1f}%)'
print(improvement_needed)
print('2. bbox3 optimizer showed ZERO improvement on pre-optimized solution')
print('3. The solution is at a local optimum')
print('4. We need fundamentally different approaches')

In [None]:
# Check what approaches have been tried in the snapshots
print('=== Approaches tried in snapshots ===')

for snap in ('21198927060', '21198893057'):
    state_path = os.path.join(snapshot_dir, snap, 'code', 'session_state.json')
    if not os.path.exists(state_path):
        continue
    with open(state_path) as state_file:
        state = json.load(state_file)

    print(f'\nSnapshot {snap}:')
    experiments = state.get('experiments', [])
    print(f'  Total experiments: {len(experiments)}')

    # Distinct model types; experiments without one are counted as 'unknown'
    model_types = {record.get('model_type', 'unknown') for record in experiments}
    print(f'  Model types: {model_types}')

    # Best / worst CV; an experiment without a cv_score contributes +inf
    cv_scores = [record.get('cv_score', float('inf')) for record in experiments]
    if cv_scores:
        lowest, highest = min(cv_scores), max(cv_scores)
        print(f'  Best CV: {lowest:.6f}')
        print(f'  Worst CV: {highest:.6f}')

In [None]:
# STRATEGIC CONCLUSION
# The banner and the summary text are built first, then printed in order:
# rule, title, rule, body.
banner_rule = '=' * 60
conclusion_text = '''
1. ROOT CAUSE OF FAILURE:
   - bbox3 started from snapshot 21198927060's submission.csv
   - This submission (70.624424) was NEVER validated by Kaggle
   - It already had overlaps that Kaggle detected
   - bbox3 did NOT introduce the overlaps - they were already there

2. CORRECT BASELINE:
   - Use snapshot 21198893057's submission.csv (LB: 70.627582)
   - This is the best VALIDATED submission
   - Or use snapshot 21198927060's candidate_024.csv (LB: 70.626088)

3. WHY OPTIMIZATION DOESN'T WORK:
   - The pre-optimized solution is at a local optimum
   - bbox3 ran 8 parameter combinations with ZERO improvement
   - More iterations won't help

4. WHAT MIGHT WORK:
   a. Try sa_fast_v2 (different algorithm - SA with fractional translation)
   b. Tessellation for large N (fundamentally different representation)
   c. Focus on small N values (1-20) with exhaustive rotation search
   d. External sources (GitHub, Telegram) - but we found they're worse

5. IMMEDIATE NEXT STEP:
   - Run sa_fast_v2 on the VALIDATED submission (70.627582)
   - If it also shows no improvement, pivot to tessellation
'''

for chunk in (banner_rule, 'STRATEGIC CONCLUSION', banner_rule, conclusion_text):
    print(chunk)

In [None]:
# Find the validated candidate file
print('=== Finding validated candidate file ===')

# The scorer is defined up front so that both sections below can use it even
# when candidate_024.csv is absent.
try:
    from numba import njit
except ImportError:
    # Without numba the scorer simply runs as ordinary Python.
    def njit(func):
        return func


@njit
def score_group(xs, ys, degs, tx, ty):
    """Score one group: squared bounding-square side divided by tree count.

    Each tree i is the vertex set (tx, ty) rotated by ``degs[i]`` degrees and
    translated by ``(xs[i], ys[i])``. The bounding box is taken over all
    vertices of all trees; its longer side is squared and divided by the
    number of trees.

    Args:
        xs, ys: per-tree translations (1-D arrays of equal length).
        degs: per-tree rotation angles in degrees.
        tx, ty: vertex coordinates of a single tree, paired by index.
    """
    n = xs.size
    V = tx.size
    mnx = mny = 1e300
    mxx = mxy = -1e300
    for i in range(n):
        r = degs[i] * math.pi / 180.0
        c, s = math.cos(r), math.sin(r)
        for j in range(V):
            X = c * tx[j] - s * ty[j] + xs[i]
            Y = s * tx[j] + c * ty[j] + ys[i]
            mnx, mxx = min(mnx, X), max(mxx, X)
            mny, mxy = min(mny, Y), max(mxy, Y)
    side = max(mxx - mnx, mxy - mny)
    return side * side / n


def score_submission(df, tx, ty):
    """Sum ``score_group`` over groups 1..200 of a submission frame.

    Group ``n`` is the set of rows whose ``id`` starts with ``f'{n:03d}_'``.
    The ``x``, ``y`` and ``deg`` columns are parsed as floats after dropping
    the first character of each value (presumably a one-letter prefix --
    confirm against the submission format).

    A group is scored only when it has exactly ``n`` rows.

    Returns:
        ``(total, skipped)``: the summed score and the list of group numbers
        that were not scored because their row count was not ``n``.
    """
    total = 0
    skipped = []
    for n in range(1, 201):
        group = df[df['id'].str.startswith(f'{n:03d}_')]
        if len(group) != n:
            skipped.append(n)
            continue
        xs = group['x'].str[1:].astype(float).values
        ys = group['y'].str[1:].astype(float).values
        degs = group['deg'].str[1:].astype(float).values
        total += score_group(xs, ys, degs, tx, ty)
    return total, skipped


def report_score(df, tx, ty):
    """Print the submission score and flag any groups left out of it."""
    total, skipped = score_submission(df, tx, ty)
    print(f'Score: {total:.6f}')
    if skipped:
        # A partial total understates the score, so make the omission visible.
        print(f'  WARNING: {len(skipped)} group(s) skipped (row count != n), first: {skipped[0]:03d}')
    return total


# Check for candidate_024.csv in 21198927060
candidate_path = '/home/nonroot/snapshots/santa-2025/21198927060/code/submission_candidates/candidate_024.csv'
if os.path.exists(candidate_path):
    df = pd.read_csv(candidate_path)
    print(f'Found candidate_024.csv: {len(df)} rows')
    total = report_score(df, TX, TY)
else:
    print('candidate_024.csv not found')

# Also check the validated submission
validated_path = '/home/nonroot/snapshots/santa-2025/21198893057/code/submission.csv'
if os.path.exists(validated_path):
    df = pd.read_csv(validated_path)
    print(f'\nValidated submission.csv: {len(df)} rows')
    total = report_score(df, TX, TY)