# Loop 16 Analysis: Full CSV Scan

## Goal: Find ALL CSV files and check for better per-N solutions

The evaluator pointed out that the snapshots contain 3476 CSV files, but only ~109 main submission files have been checked so far.
Let's do a comprehensive scan of ALL CSVs, including those in the preoptimized folders.

In [None]:
import glob
import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
import os
from collections import defaultdict

# Set the Decimal arithmetic context to 30 significant digits.
# NOTE(review): no Decimal arithmetic appears in the cells visible here — confirm this is still needed.
getcontext().prec = 30

# Tree polygon vertices
# TX[i] / TY[i] are the x / y coordinates of the i-th vertex (15 vertices) of the
# tree outline in its own local frame; compute_score_for_n rotates these and then
# shifts them by each row's (x, y) position.
TX = np.array([0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125], dtype=np.float64)
TY = np.array([0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5], dtype=np.float64)

def rotate_vertices(tx, ty, angle_deg):
    """Rotate points counter-clockwise about the origin.

    Args:
        tx: x coordinates of the points (NumPy array or scalar).
        ty: y coordinates of the points, matching ``tx``.
        angle_deg: rotation angle in degrees.

    Returns:
        Tuple ``(rx, ry)`` holding the rotated x and y coordinates.
    """
    theta = angle_deg * np.pi / 180.0
    c, s = np.cos(theta), np.sin(theta)
    # Standard 2-D rotation matrix, applied component-wise.
    return tx * c - ty * s, tx * s + ty * c

def compute_score_for_n(df, n):
    """Score the group of rows belonging to one N.

    Rows are selected by an ``id`` that starts with the zero-padded prefix
    ``f'{n:03d}_'``. Each selected row places one tree polygon: the ``TX``/``TY``
    outline is rotated by ``deg`` and shifted by ``(x, y)``. Any ``'s'``
    characters in those three fields are stripped before parsing.

    Args:
        df: DataFrame with string ``id`` plus ``x``, ``y`` and ``deg`` columns.
        n: group to score.

    Returns:
        ``side ** 2 / n`` where ``side`` is the longer edge of the axis-aligned
        bounding box around all placed polygons, or ``None`` when the group
        does not contain exactly ``n`` rows.
    """
    prefix = f'{n:03d}_'
    group = df[df['id'].str.startswith(prefix)]
    if len(group) != n:
        return None

    def parse(value):
        # Fields may be stored as text containing 's'; drop it before converting.
        return float(str(value).replace('s', ''))

    left, right = np.inf, -np.inf
    bottom, top = np.inf, -np.inf

    for _, tree in group.iterrows():
        cx = parse(tree['x'])
        cy = parse(tree['y'])
        rot_x, rot_y = rotate_vertices(TX, TY, parse(tree['deg']))
        xs = rot_x + cx
        ys = rot_y + cy

        # Grow the running bounding box to include this polygon.
        left = min(left, xs.min())
        right = max(right, xs.max())
        bottom = min(bottom, ys.min())
        top = max(top, ys.max())

    width = right - left
    height = top - bottom
    return max(width, height) ** 2 / n

# Status output for this cell: confirm the helpers above are defined, then report
# how many CSV files a recursive glob finds under the snapshots root.
print("Functions defined. Ready to scan.")
print(f"Total CSV files: {len(glob.glob('/home/nonroot/snapshots/santa-2025/**/*.csv', recursive=True))}")


In [None]:
# Find all CSV files
all_csvs = glob.glob('/home/nonroot/snapshots/santa-2025/**/*.csv', recursive=True)
print(f"Found {len(all_csvs)} CSV files")

# Categorize by folder type. The substring tests run against the full path, so a
# file counts as "preoptimized" if any path component contains that word, and
# "preoptimized" takes precedence over "submission".
preoptimized_csvs = [f for f in all_csvs if 'preoptimized' in f]
submission_csvs = [f for f in all_csvs if 'submission' in f.lower() and 'preoptimized' not in f]

# Use a set for the "already categorized" test: list membership inside the
# comprehension would rescan both lists for every one of the thousands of paths.
_categorized = set(preoptimized_csvs) | set(submission_csvs)
other_csvs = [f for f in all_csvs if f not in _categorized]

print(f"\nBreakdown:")
print(f"  Preoptimized folder CSVs: {len(preoptimized_csvs)}")
print(f"  Submission CSVs: {len(submission_csvs)}")
print(f"  Other CSVs: {len(other_csvs)}")

# Show some examples of preoptimized CSVs
print(f"\nSample preoptimized CSVs:")
for sample_path in preoptimized_csvs[:10]:  # not named `csv`: that would shadow the stdlib module name
    print(f"  {sample_path}")


In [None]:
# Load baseline for comparison
baseline_path = "/home/nonroot/snapshots/santa-2025/21337353543/submission/submission.csv"
baseline_df = pd.read_csv(baseline_path)

# Compute baseline per-N scores
baseline_scores = {}
for n in range(1, 201):
    score = compute_score_for_n(baseline_df, n)
    # compute_score_for_n signals "no valid group" with None; test for that
    # explicitly (as scan_csv_for_improvements does) rather than by truthiness.
    if score is not None:
        baseline_scores[n] = score

# Later cells index baseline_scores[n] for every n in 1..200, so surface any gap
# here instead of as a KeyError further down.
missing_ns = [n for n in range(1, 201) if n not in baseline_scores]
if missing_ns:
    print(f"WARNING: baseline has no valid group for N values: {missing_ns}")

baseline_total = sum(baseline_scores.values())
print(f"Baseline total score: {baseline_total:.6f}")
for n in (1, 10, 100):
    if n in baseline_scores:
        print(f"Baseline N={n}: {baseline_scores[n]:.6f}")


In [None]:
# Scan ALL CSV files for better per-N solutions
import time

def scan_csv_for_improvements(csv_path, baseline_scores):
    """Scan a CSV file and return improvements over baseline.

    Args:
        csv_path: path of the CSV file to read.
        baseline_scores: mapping ``n -> baseline score`` to compare against.

    Returns:
        ``(total_score, improvements)``. ``total_score`` is the sum of the
        per-N scores when all 200 groups (N = 1..200) are valid, otherwise
        ``None``. ``improvements`` maps each ``n`` that beats the baseline by
        more than ``1e-10`` to a dict with keys ``'baseline'``, ``'new'`` and
        ``'improvement'``. A file that cannot be read or scored, or that lacks
        any of the ``id``/``x``/``y``/``deg`` columns, yields ``(None, {})``.
    """
    required_columns = ('id', 'x', 'y', 'deg')
    try:
        df = pd.read_csv(csv_path)
        # Reject files with the wrong schema up front instead of relying on the
        # blanket handler below to swallow a KeyError from the scoring code.
        if any(column not in df.columns for column in required_columns):
            return None, {}

        improvements = {}
        total_score = 0
        valid_n_count = 0

        for n in range(1, 201):
            score = compute_score_for_n(df, n)
            if score is None:
                continue
            total_score += score
            valid_n_count += 1

            # The 1e-10 margin ignores differences at floating-point noise level.
            if n in baseline_scores and score < baseline_scores[n] - 1e-10:
                improvements[n] = {
                    'baseline': baseline_scores[n],
                    'new': score,
                    'improvement': baseline_scores[n] - score
                }
    except Exception:
        # Deliberately best-effort: this runs over thousands of arbitrary CSVs,
        # and one unreadable or malformed file must not abort the whole scan.
        return None, {}

    if valid_n_count == 200:
        return total_score, improvements
    return None, improvements

# Preoptimized CSVs go first: they are the most likely to hold different solutions.
print("Scanning preoptimized CSVs...")
start_time = time.time()

# Running best per N, seeded with the baseline; `source` records where it came from.
best_per_n = {n: {'score': baseline_scores[n], 'source': 'baseline'} for n in range(1, 201)}
# One (path, n, gain-over-baseline) entry each time a file beats the running best.
all_improvements = []

for i, csv_path in enumerate(preoptimized_csvs):
    total_score, improvements = scan_csv_for_improvements(csv_path, baseline_scores)

    # An empty dict simply yields no iterations.
    for n, data in improvements.items():
        candidate = data['new']
        if not (candidate < best_per_n[n]['score']):
            continue
        best_per_n[n] = {'score': candidate, 'source': csv_path}
        all_improvements.append((csv_path, n, data['improvement']))

    # Progress line every 20 files.
    if (i + 1) % 20 == 0:
        print(f"  Scanned {i+1}/{len(preoptimized_csvs)} preoptimized CSVs...")

print(f"\nScanned {len(preoptimized_csvs)} preoptimized CSVs in {time.time() - start_time:.1f}s")
print(f"Found {len(all_improvements)} improvements")


In [None]:
# Now scan ALL other CSVs
print("Scanning all other CSVs...")
start_time = time.time()

# Set lookup for the skip test: checking membership in the preoptimized list for
# every path would make this loop quadratic in the number of files.
already_scanned = set(preoptimized_csvs)

for i, csv_path in enumerate(all_csvs):
    if csv_path in already_scanned:
        continue

    total_score, improvements = scan_csv_for_improvements(csv_path, baseline_scores)

    # Keep whichever source currently holds the lowest score for each N.
    for n, data in improvements.items():
        if data['new'] < best_per_n[n]['score']:
            best_per_n[n] = {'score': data['new'], 'source': csv_path}
            all_improvements.append((csv_path, n, data['improvement']))

    # Progress is counted over all_csvs positions, including skipped files.
    if (i + 1) % 500 == 0:
        print(f"  Scanned {i+1}/{len(all_csvs)} CSVs...")

print(f"\nScanned all CSVs in {time.time() - start_time:.1f}s")
print(f"Total improvements found: {len(all_improvements)}")


In [None]:
# Analyze improvements
print("=" * 60)
print("IMPROVEMENT ANALYSIS")
print("=" * 60)

# Count N values with improvements
improved_ns = [n for n in range(1, 201) if best_per_n[n]['source'] != 'baseline']
print(f"\nN values with improvements: {len(improved_ns)}")

# Calculate total potential improvement: sum of the best score found for each N
new_total = sum(best_per_n[n]['score'] for n in range(1, 201))
print(f"\nBaseline total: {baseline_total:.6f}")
print(f"Best ensemble total: {new_total:.6f}")
print(f"Potential improvement: {baseline_total - new_total:.6f}")

# Show top improvements. all_improvements can hold several entries for the same
# N (one per file that beat the running best at scan time), each measured
# against the baseline.
if all_improvements:
    print(f"\nTop 20 improvements:")
    sorted_improvements = sorted(all_improvements, key=lambda item: item[2], reverse=True)
    for csv_path, n, improvement in sorted_improvements[:20]:
        source_name = os.path.basename(csv_path)
        print(f"  N={n}: +{improvement:.8f} from {source_name}")

# Show unique sources
unique_sources = {best_per_n[n]['source'] for n in improved_ns}
print(f"\nUnique sources with improvements: {len(unique_sources)}")
# Sort before slicing: set iteration order for strings varies between runs, so
# an unsorted "first 10" would list a different subset each time.
for source in sorted(unique_sources)[:10]:
    print(f"  {source}")


In [None]:
# Group the winning sources by snapshot, to see whether they come from the
# snapshot that had overlaps (exp_011 found that snapshot 21145966992 had overlaps).

print("=" * 60)
print("CHECKING IMPROVEMENT SOURCES")
print("=" * 60)

# snapshot id -> list of N values whose best score comes from that snapshot
snapshot_improvements = defaultdict(list)
for n in range(1, 201):
    source = best_per_n[n]['source']
    if source == 'baseline':
        continue
    # The snapshot id is taken to be the first all-digit path component longer
    # than 10 characters; sources without one are not attributed to a snapshot.
    snapshot_id = next(
        (piece for piece in source.split('/') if piece.isdigit() and len(piece) > 10),
        None,
    )
    if snapshot_id is not None:
        snapshot_improvements[snapshot_id].append(n)

print(f"\nSnapshots with improvements:")
ranked = sorted(snapshot_improvements.items(), key=lambda x: -len(x[1]))
for snapshot_id, ns in ranked:
    print(f"  {snapshot_id}: {len(ns)} N values improved")

# Flag the snapshot previously found to contain overlaps
if '21145966992' in snapshot_improvements:
    print(f"\n⚠️ WARNING: Snapshot 21145966992 (known to have overlaps) is in the improvements!")
    print(f"   N values from this snapshot: {snapshot_improvements['21145966992']}")


In [None]:
# Let's check the actual scores of the external data sources
print("=" * 60)
print("EXTERNAL DATA SOURCE SCORES")
print("=" * 60)

external_sources = [
    '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/71.97.csv',
    '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/72.49.csv',
    '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/bucket-of-chump/submission.csv',
    '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public/submission_70_926149550346.csv',
    '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public/submission_70_936673758122.csv',
]

for source in external_sources:
    name = os.path.basename(source)
    if not os.path.exists(source):
        print(f"{name}: NOT FOUND")
        continue
    try:
        df = pd.read_csv(source)
        total = 0
        valid = 0
        for n in range(1, 201):
            score = compute_score_for_n(df, n)
            # None means "no valid group for this N"; compare explicitly rather
            # than by truthiness, matching scan_csv_for_improvements.
            if score is not None:
                total += score
                valid += 1
        # `total` only covers the `valid` groups, so it is comparable to the
        # baseline total only when valid == 200.
        print(f"{name}: {total:.6f} ({valid} valid N)")
    except Exception as e:
        print(f"{name}: ERROR - {e}")

print(f"\nBaseline: {baseline_total:.6f}")


In [None]:
# Let's check the santa25-public files more carefully - they have scores in their names
print("=" * 60)
print("SANTA25-PUBLIC FILES ANALYSIS")
print("=" * 60)

santa25_public_dir = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public/'
if os.path.exists(santa25_public_dir):
    files = os.listdir(santa25_public_dir)
    # Counts every directory entry, not just the CSVs scored below.
    print(f"Files in santa25-public: {len(files)}")

    for f in sorted(files):
        if not f.endswith('.csv'):
            continue
        path = os.path.join(santa25_public_dir, f)
        try:
            df = pd.read_csv(path)
            total = 0
            valid = 0
            for n in range(1, 201):
                score = compute_score_for_n(df, n)
                # None means "no valid group for this N"; compare explicitly
                # rather than by truthiness, matching scan_csv_for_improvements.
                if score is not None:
                    total += score
                    valid += 1
            print(f"  {f}: {total:.6f} ({valid} valid N)")
        except Exception as e:
            print(f"  {f}: ERROR - {e}")


In [None]:
# CRITICAL: Check if ANY CSV has a total score better than baseline
print("=" * 60)
print("FINDING BEST TOTAL SCORES ACROSS ALL CSVS")
print("=" * 60)

# (total score, path) for every CSV in which all 200 N groups are valid
best_total_scores = []

for csv_path in all_csvs:
    try:
        df = pd.read_csv(csv_path)
        if 'id' not in df.columns or 'x' not in df.columns:
            continue

        total = 0
        valid = 0
        for n in range(1, 201):
            score = compute_score_for_n(df, n)
            # None means "no valid group for this N"; compare explicitly rather
            # than by truthiness, matching scan_csv_for_improvements.
            if score is not None:
                total += score
                valid += 1

        if valid == 200:
            best_total_scores.append((total, csv_path))
    except Exception:
        # Best-effort: skip files that cannot be read or scored. Catching
        # Exception rather than using a bare `except:` lets KeyboardInterrupt
        # still stop this long-running scan.
        continue

# Sort by score (ascending: lower is better)
best_total_scores.sort()

print(f"\nTop 20 CSVs by total score:")
for score, path in best_total_scores[:20]:
    rel_path = path.replace('/home/nonroot/snapshots/santa-2025/', '')
    print(f"  {score:.6f}: {rel_path}")

print(f"\nBaseline: {baseline_total:.6f}")
if best_total_scores:
    print(f"Best found: {best_total_scores[0][0]:.6f}")
    print(f"Gap: {baseline_total - best_total_scores[0][0]:.6f}")
else:
    # Avoid an IndexError when no file had all 200 groups valid.
    print("Best found: none (no CSV had all 200 N groups valid)")
