# Experiment 033: Comprehensive Ensemble from ALL Sources

The ONLY approach that has yielded improvements is ensemble methods combining solutions from different sources.

This experiment will:
1. Collect ALL CSV files from all directories
2. For each N=1-200, load solutions from ALL sources
3. Validate each solution (no overlaps)
4. Pick the best valid solution for each N
5. Create ensemble submission

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely.affinity import rotate, translate
import glob
import os

# Tree shape
TREE_VERTICES = np.array([
    [0.0, 0.8], [0.125, 0.5], [0.0625, 0.5], [0.2, 0.25], [0.1, 0.25],
    [0.35, 0.0], [0.075, 0.0], [0.075, -0.2], [-0.075, -0.2], [-0.075, 0.0],
    [-0.35, 0.0], [-0.1, 0.25], [-0.2, 0.25], [-0.0625, 0.5], [-0.125, 0.5],
], dtype=np.float64)

def create_tree_polygon(x, y, deg):
    tree = Polygon(TREE_VERTICES)
    tree = rotate(tree, deg, origin=(0, 0))
    tree = translate(tree, x, y)
    return tree

def check_overlap(trees):
    n = len(trees)
    for i in range(n):
        for j in range(i + 1, n):
            if trees[i].overlaps(trees[j]) or trees[i].contains(trees[j]) or trees[j].contains(trees[i]):
                return True
    return False

def calculate_score(trees):
    all_bounds = [t.bounds for t in trees]
    min_x = min(b[0] for b in all_bounds)
    min_y = min(b[1] for b in all_bounds)
    max_x = max(b[2] for b in all_bounds)
    max_y = max(b[3] for b in all_bounds)
    side = max(max_x - min_x, max_y - min_y)
    return side * side / len(trees)

def parse_value(v):
    if isinstance(v, str) and v.startswith('s'):
        return float(v[1:])
    return float(v)

print("Functions defined")

Functions defined


In [2]:
# Find ALL CSV files
csv_files = []

# Main exploration datasets
csv_files.extend(glob.glob('/home/code/exploration/datasets/**/*.csv', recursive=True))
csv_files.extend(glob.glob('/home/code/exploration/*.csv'))

# Snapshot submissions
csv_files.extend(glob.glob('/home/nonroot/snapshots/santa-2025/*/submission/submission.csv'))

# Remove duplicates
csv_files = list(set(csv_files))
print(f"Found {len(csv_files)} CSV files")

# Show some examples
for f in csv_files[:10]:
    print(f"  {f}")

Found 107 CSV files
  /home/nonroot/snapshots/santa-2025/21165872902/submission/submission.csv
  /home/code/exploration/datasets/ensemble_best_v2.csv
  /home/nonroot/snapshots/santa-2025/21129620891/submission/submission.csv
  /home/nonroot/snapshots/santa-2025/21165878844/submission/submission.csv
  /home/nonroot/snapshots/santa-2025/21156853393/submission/submission.csv
  /home/code/exploration/datasets/santa25_public/santa2025_ver2_v66.csv
  /home/nonroot/snapshots/santa-2025/21222375510/submission/submission.csv
  /home/code/exploration/datasets/santa25_public/submission_JKoT4.csv
  /home/code/exploration/submission.csv
  /home/nonroot/snapshots/santa-2025/21122904233/submission/submission.csv


In [3]:
# Load baseline
baseline_df = pd.read_csv('/home/submission/submission.csv')

baseline_configs = {}
baseline_scores = {}

for n in range(1, 201):
    prefix = f"{n:03d}_"
    group = baseline_df[baseline_df["id"].str.startswith(prefix)].sort_values("id")
    configs = []
    for _, row in group.iterrows():
        x = parse_value(row["x"])
        y = parse_value(row["y"])
        deg = parse_value(row["deg"])
        configs.append((x, y, deg))
    baseline_configs[n] = configs
    trees = [create_tree_polygon(x, y, deg) for x, y, deg in configs]
    baseline_scores[n] = calculate_score(trees)

print(f"Baseline total score: {sum(baseline_scores.values()):.6f}")

Baseline total score: 70.624381


In [None]:
# Load all sources and find best per N
best_configs = dict(baseline_configs)
best_scores = dict(baseline_scores)
best_sources = {n: 'baseline' for n in range(1, 201)}

improvements_found = 0
total_improvement = 0.0

for filepath in csv_files:
    try:
        df = pd.read_csv(filepath)
        if 'id' not in df.columns or 'x' not in df.columns:
            continue
        
        for n in range(1, 201):
            prefix = f"{n:03d}_"
            group = df[df["id"].str.startswith(prefix)].sort_values("id")
            if len(group) != n:
                continue
            
            configs = []
            for _, row in group.iterrows():
                x = parse_value(row["x"])
                y = parse_value(row["y"])
                deg = parse_value(row["deg"])
                configs.append((x, y, deg))
            
            trees = [create_tree_polygon(x, y, deg) for x, y, deg in configs]
            
            # Check for overlaps
            if check_overlap(trees):
                continue
            
            score = calculate_score(trees)
            if score < best_scores[n]:
                improvement = best_scores[n] - score
                if improvement > 1e-6:
                    print(f"N={n}: {best_scores[n]:.6f} -> {score:.6f} (improvement: {improvement:.6f}) from {os.path.basename(filepath)}")
                    improvements_found += 1
                    total_improvement += improvement
                best_scores[n] = score
                best_configs[n] = configs
                best_sources[n] = filepath
    except Exception as e:
        continue

print(f"\nTotal improvements found: {improvements_found}")
print(f"Total improvement: {total_improvement:.6f}")

In [None]:
# Summary
print("\n" + "="*60)
print("ENSEMBLE SUMMARY")
print("="*60)

old_total = sum(baseline_scores.values())
new_total = sum(best_scores.values())

print(f"\nBaseline total: {old_total:.6f}")
print(f"Ensemble total: {new_total:.6f}")
print(f"Improvement: {old_total - new_total:.6f}")

# Count sources
source_counts = {}
for n, source in best_sources.items():
    source_name = os.path.basename(source) if source != 'baseline' else 'baseline'
    source_counts[source_name] = source_counts.get(source_name, 0) + 1

print(f"\nSources used:")
for source, count in sorted(source_counts.items(), key=lambda x: -x[1]):
    print(f"  {source}: {count} N values")

In [None]:
# Validate all groups
print("\nValidating all groups...")
overlap_count = 0

for n in range(1, 201):
    configs = best_configs[n]
    trees = [create_tree_polygon(x, y, deg) for x, y, deg in configs]
    if check_overlap(trees):
        overlap_count += 1
        print(f"  Group {n:03d} has overlaps!")

if overlap_count == 0:
    print("All groups valid - no overlaps!")
else:
    print(f"\nWARNING: {overlap_count} groups have overlaps!")

In [None]:
# Save submission if improved and valid
final_score = sum(best_scores.values())
baseline_total = sum(baseline_scores.values())

if final_score < baseline_total and overlap_count == 0:
    print(f"\nSaving improved submission...")
    
    rows = []
    for n in range(1, 201):
        for i, (x, y, deg) in enumerate(best_configs[n]):
            rows.append({
                "id": f"{n:03d}_{i}",
                "x": f"s{x}",
                "y": f"s{y}",
                "deg": f"s{deg}",
            })
    
    new_df = pd.DataFrame(rows)
    new_df.to_csv("/home/submission/submission.csv", index=False)
    print(f"Saved to /home/submission/submission.csv")
    print(f"New total score: {final_score:.9f}")
else:
    print(f"\nNo improvement or invalid - keeping baseline")
    print(f"Baseline score: {baseline_total:.9f}")
    final_score = baseline_total

In [None]:
# Save metrics
import json

metrics = {
    'cv_score': final_score,
    'baseline_score': baseline_total,
    'improvement': baseline_total - final_score,
    'improvements_found': improvements_found,
    'total_sources_checked': len(csv_files),
    'overlap_count': overlap_count,
    'approach': 'Comprehensive ensemble from ALL CSV sources'
}

with open('/home/code/experiments/033_comprehensive_ensemble/metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)

print("\nMetrics saved:")
for k, v in metrics.items():
    print(f"  {k}: {v}")