# Experiment 005: Comprehensive Ensemble from ALL Sources

The evaluator identified that we haven't explored all pre-optimized sources.
There are 692 CSV files in the snapshots directory!

This experiment will:
1. Scan ALL CSV files
2. Score each file separately for every N (1–200)
3. Build an ensemble that picks the best-scoring configuration for each N across ALL sources
4. Compare the ensemble to the current best submission

In [None]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
import os
import glob
from tqdm import tqdm
import shutil

# Tree geometry: outline of one tree in its local frame, stored as two
# parallel coordinate lists (15 vertices, mirror-symmetric about the y-axis).
# create_tree_polygon zips them into (x, y) vertices in exactly this order.
TX = [
    0,                  # apex, y = 0.8
    0.125, 0.0625,      # right side, y = 0.5
    0.2, 0.1,           # right side, y = 0.25
    0.35, 0.075,        # right side, y = 0
    0.075, -0.075,      # bottom edge, y = -0.2
    -0.075, -0.35,      # left side, y = 0
    -0.1, -0.2,         # left side, y = 0.25
    -0.0625, -0.125,    # left side, y = 0.5
]
TY = [
    0.8,
    0.5, 0.5,
    0.25, 0.25,
    0, 0,
    -0.2, -0.2,
    0, 0,
    0.25, 0.25,
    0.5, 0.5,
]

def parse_value(s):
    """Convert a submission cell to a float.

    A string that begins with 's' has that first character removed before
    conversion; every other input is handed to float() unchanged.
    """
    has_marker = isinstance(s, str) and s.startswith('s')
    return float(s[1:] if has_marker else s)

def create_tree_polygon(x, y, deg):
    """Build the shapely Polygon of one tree placed at (x, y) and rotated by deg.

    Each (TX, TY) outline vertex is rotated about the local origin by `deg`
    degrees and then translated by (x, y).
    """
    theta = np.radians(deg)
    c = np.cos(theta)
    s = np.sin(theta)
    vertices = []
    for tx, ty in zip(TX, TY):
        # Standard 2-D rotation followed by the translation to (x, y)
        vertices.append((tx * c - ty * s + x, tx * s + ty * c + y))
    return Polygon(vertices)

def compute_bounding_side(polygons):
    """Return the longer side of the axis-aligned box enclosing all polygons.

    The box is taken over every exterior coordinate of every polygon.
    An empty input yields 0.
    """
    if polygons:
        coords = np.array([pt for poly in polygons for pt in poly.exterior.coords])
        # Per-axis extent (max - min); the result is the larger of the extents
        extents = coords.max(axis=0) - coords.min(axis=0)
        return max(extents)
    return 0

def compute_score_for_n(df, n):
    """Score the n-tree configuration stored in a submission dataframe.

    Rows belonging to configuration n are those whose 'id' starts with the
    zero-padded prefix "NNN_". Returns (side**2 / n, rows) where side is the
    bounding-square side of the placed trees, or (inf, None) when the number
    of matching rows is not exactly n. Only the bounding box is measured;
    this function does not check whether trees overlap.
    """
    mask = df['id'].str.startswith(f"{n:03d}_")
    trees = df[mask]
    if len(trees) == n:
        polygons = []
        for _, row in trees.iterrows():
            px = parse_value(row['x'])
            py = parse_value(row['y'])
            angle = parse_value(row['deg'])
            polygons.append(create_tree_polygon(px, py, angle))
        side = compute_bounding_side(polygons)
        return side**2 / n, trees
    # Wrong row count: treat the configuration as unusable
    return float('inf'), None

def compute_total_score(df):
    """Add up the per-N scores of a submission for N = 1..200.

    Any N that compute_score_for_n rejects contributes inf, which makes the
    whole total inf.
    """
    total = 0
    for n in range(1, 201):
        score, _ = compute_score_for_n(df, n)
        total += score
    return total

print("Functions defined")

In [None]:
# Gather candidate CSVs from the snapshots tree; '**' combined with
# recursive=True descends into every subdirectory.
all_csvs = glob.glob('/home/nonroot/snapshots/santa-2025/**/*.csv', recursive=True)
print(f"Found {len(all_csvs)} CSV files")

# Append the external-data CSVs to the same list so later cells scan both
external_csvs = glob.glob('/home/code/external_data/**/*.csv', recursive=True)
all_csvs += external_csvs
print(f"Total with external data: {len(all_csvs)} CSV files")

In [None]:
# Load each CSV and keep only the ones that look like complete submissions.
# A file qualifies when it has the id/x/y/deg columns and at least 20000 rows
# (a complete submission has 20100 rows; the slack is deliberate).
valid_sources = []
unreadable_sources = []  # (path, reason) for files pandas could not parse

for csv_path in tqdm(all_csvs, desc="Scanning CSVs"):
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        # The scan is best-effort, so keep going - but remember the failure
        # instead of hiding it.
        unreadable_sources.append((csv_path, f"{type(e).__name__}: {e}"))
        continue
    if {'id', 'x', 'y', 'deg'}.issubset(df.columns) and len(df) >= 20000:
        valid_sources.append(csv_path)

print(f"\nFound {len(valid_sources)} valid submission files")
if unreadable_sources:
    print(f"Skipped {len(unreadable_sources)} unreadable CSV files (first few shown):")
    for bad_path, reason in unreadable_sources[:5]:
        print(f"  {bad_path}: {reason}")

In [None]:
# Compute the total score of every valid source and rank the sources.
print("Scoring each source...")
source_scores = []
failed_sources = []  # (path, reason) for files that raised while being scored

for csv_path in tqdm(valid_sources, desc="Scoring sources"):
    try:
        df = pd.read_csv(csv_path)
        total_score = compute_total_score(df)
    except Exception as e:
        # Best-effort: one malformed file must not stop the scan, but it is
        # recorded so the skip is visible in the output below.
        failed_sources.append((csv_path, f"{type(e).__name__}: {e}"))
        continue
    # Sanity check - valid scores should be < 200. A source with any N whose
    # row count is wrong totals inf and is therefore dropped entirely.
    if total_score < 200:
        source_scores.append((csv_path, total_score))

# Sort by score (lowest total first)
source_scores.sort(key=lambda x: x[1])

if failed_sources:
    print(f"\nCould not score {len(failed_sources)} sources (first few shown):")
    for bad_path, reason in failed_sources[:5]:
        print(f"  {bad_path}: {reason}")

print(f"\nTop 20 sources by total score:")
for path, score in source_scores[:20]:
    print(f"  {score:.6f}: {path.split('/')[-1]}")

In [None]:
# Build comprehensive ensemble: for each N, find the best source
print("\nBuilding comprehensive ensemble...")

# Load every scored source into memory, keyed by its path, so the per-N
# search can reuse the dataframes.
all_dfs = {}
load_failures = []  # (path, reason) for sources that could not be re-read
for path, _score in tqdm(source_scores, desc="Loading dataframes"):
    try:
        all_dfs[path] = pd.read_csv(path)
    except Exception as e:
        # Deliberately not a bare `except:` so KeyboardInterrupt / SystemExit
        # still stop the run; ordinary read errors are recorded and skipped.
        load_failures.append((path, f"{type(e).__name__}: {e}"))

print(f"Loaded {len(all_dfs)} dataframes")
if load_failures:
    print(f"Failed to load {len(load_failures)} sources (first few shown):")
    for bad_path, reason in load_failures[:5]:
        print(f"  {bad_path}: {reason}")

In [None]:
# Pick, for every N, the source whose configuration has the lowest score.
# best_per_n maps n -> (score, source_path, trees_df); when no source yields
# a finite score the entry stays (inf, None, None).
best_per_n = {}

for n in tqdm(range(1, 201), desc="Finding best per N"):
    winner = (float('inf'), None, None)
    for path, df in all_dfs.items():
        score, trees = compute_score_for_n(df, n)
        # Strict '<' keeps the first source encountered when scores tie
        if score < winner[0]:
            winner = (score, path, trees)
    best_per_n[n] = winner

# Total of the per-N winners, accumulated in N = 1..200 order
ensemble_total = sum(best_per_n[n][0] for n in range(1, 201))
print(f"\nEnsemble total score: {ensemble_total:.6f}")

In [None]:
# Count how many N values each source file wins (keyed by file name only)
source_wins = {}
for n in range(1, 201):
    winning_path = best_per_n[n][1]
    if winning_path:
        label = winning_path.split('/')[-1]
    else:
        # No source produced a usable configuration for this N
        label = 'None'
    source_wins.setdefault(label, 0)
    source_wins[label] += 1

print("Source wins distribution:")
# Most wins first; negating the count keeps ties in insertion order
ranked_wins = sorted(source_wins.items(), key=lambda item: -item[1])
for source, wins in ranked_wins:
    print(f"  {source}: {wins} N values")

In [None]:
# Score the current best submission and compare it with the ensemble
df_baseline = pd.read_csv('/home/code/external_data/saspav/santa-2025.csv')
baseline_total = compute_total_score(df_baseline)

print(f"\nComparison:")
print(f"Baseline (saspav): {baseline_total:.6f}")
print(f"Comprehensive ensemble: {ensemble_total:.6f}")
print(f"Improvement: {baseline_total - ensemble_total:.9f}")

# List every N whose ensemble score is not within 1e-9 of the baseline score
print("\nN values where ensemble differs from baseline:")
for n in range(1, 201):
    baseline_score = compute_score_for_n(df_baseline, n)[0]
    ensemble_score, winner_path = best_per_n[n][:2]
    # Written as `not (... > tol)` so a NaN difference (inf - inf) is skipped,
    # exactly as a plain `> tol` test would skip it.
    if not abs(ensemble_score - baseline_score) > 1e-9:
        continue
    source_name = winner_path.split('/')[-1] if winner_path else 'None'
    print(f"  N={n}: baseline={baseline_score:.6f}, ensemble={ensemble_score:.6f}, diff={ensemble_score-baseline_score:.9f}, source={source_name}")

In [None]:
# Save the ensemble submission: stack the winning rows for N = 1..200 in
# order and write them to the submission path.
selected_frames = []
missing_n = []  # N values for which no source supplied a usable configuration
for n in range(1, 201):
    trees = best_per_n[n][2]
    if trees is None:
        missing_n.append(n)
    else:
        selected_frames.append(trees)

if missing_n:
    # Do not fail, but make it obvious that the file written below is incomplete
    print(f"WARNING: no valid source for {len(missing_n)} N values: {missing_n}")

# pd.concat raises on an empty list, so fall back to an empty frame
if selected_frames:
    ensemble_df = pd.concat(selected_frames, ignore_index=True)
else:
    ensemble_df = pd.DataFrame()

# Create the output directory if it does not exist yet
os.makedirs(os.path.dirname('/home/submission/submission.csv'), exist_ok=True)
ensemble_df.to_csv('/home/submission/submission.csv', index=False)
print(f"Saved ensemble with {len(ensemble_df)} rows")

# Verify: re-read the written file and score it from scratch
df_verify = pd.read_csv('/home/submission/submission.csv')
verify_score = compute_total_score(df_verify)
print(f"Verified ensemble score: {verify_score:.6f}")

In [None]:
# Summary: counts from each stage and the best single source versus the
# verified ensemble. source_scores is sorted ascending, so index 0 is the
# best single source.
banner = "="*60
print(banner)
print("EXPERIMENT 005 SUMMARY: Comprehensive Ensemble")
print(banner)
print(f"Total CSV files scanned: {len(all_csvs)}")
print(f"Valid submission files: {len(valid_sources)}")
print(f"Sources with valid scores: {len(source_scores)}")
print(f"\nBest single source: {source_scores[0][1]:.6f} ({source_scores[0][0].split('/')[-1]})")
print(f"Comprehensive ensemble: {verify_score:.6f}")
print(f"Improvement over best single: {source_scores[0][1] - verify_score:.9f}")
print(banner)

In [None]:
# Model wrapper for submission
class ComprehensiveEnsemble:
    """Thin wrapper that exposes the saved ensemble CSV through load/save methods."""

    def __init__(self, data='single'):
        # Stored as given; none of the methods in this class read it.
        self.data = data

    def load_best(self):
        """Read /home/submission/submission.csv and return it as a DataFrame."""
        submission = pd.read_csv('/home/submission/submission.csv')
        return submission

    def save_submission(self, path):
        """Write the loaded ensemble to `path` without an index column and return it."""
        best = self.load_best()
        best.to_csv(path, index=False)
        return best

model = ComprehensiveEnsemble(data='single')
print("Model wrapper defined")