# Experiment 003: Ensemble Approach with Snapshot Solutions

This experiment implements the ENSEMBLE approach - the PRIMARY technique used by top kernels:
1. Collect all available snapshot solutions
2. For each N (1-200), pick the BEST configuration across all sources
3. Apply optimization pipeline to this ensemble baseline

**Critical Discovery:** Snapshot solutions exist with scores as low as 87.36!

In [1]:
import numpy as np
import pandas as pd
from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
from shapely.strtree import STRtree
from shapely.ops import unary_union
from scipy.spatial import ConvexHull
from scipy.optimize import minimize_scalar
import os
import subprocess
import shutil
import time
import glob

getcontext().prec = 25
scale_factor = Decimal("1e18")

print("Libraries loaded")

Libraries loaded


In [2]:
# Fast scoring using numpy (no Shapely for speed)
TX = np.array([0,0.125,0.0625,0.2,0.1,0.35,0.075,0.075,-0.075,-0.075,-0.35,-0.1,-0.2,-0.0625,-0.125])
TY = np.array([0.8,0.5,0.5,0.25,0.25,0,0,-0.2,-0.2,0,0,0.25,0.25,0.5,0.5])

def score_group_fast(xs, ys, degs):
    """Fast scoring for a single N configuration."""
    n = len(xs)
    if n == 0:
        return float('inf')
    
    all_x = []
    all_y = []
    
    for i in range(n):
        rad = np.radians(degs[i])
        c, s = np.cos(rad), np.sin(rad)
        px = TX * c - TY * s + xs[i]
        py = TX * s + TY * c + ys[i]
        all_x.extend(px)
        all_y.extend(py)
    
    all_x = np.array(all_x)
    all_y = np.array(all_y)
    
    side = max(all_x.max() - all_x.min(), all_y.max() - all_y.min())
    return side * side / n

def strip_s(val):
    """Remove 's' prefix from value."""
    s = str(val)
    return float(s[1:] if s.startswith('s') else s)

def score_csv_fast(filepath):
    """Score a CSV file quickly."""
    try:
        df = pd.read_csv(filepath)
        if not {'id', 'x', 'y', 'deg'}.issubset(df.columns):
            return float('inf'), {}
        
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
        df['x_val'] = df['x'].apply(strip_s)
        df['y_val'] = df['y'].apply(strip_s)
        df['deg_val'] = df['deg'].apply(strip_s)
        
        total_score = 0.0
        scores_per_n = {}
        
        for n in range(1, 201):
            group = df[df['N'] == n]
            if len(group) == n:
                xs = group['x_val'].values
                ys = group['y_val'].values
                degs = group['deg_val'].values
                score = score_group_fast(xs, ys, degs)
                scores_per_n[n] = score
                total_score += score
            else:
                scores_per_n[n] = float('inf')
                total_score = float('inf')
                break
        
        return total_score, scores_per_n
    except Exception as e:
        return float('inf'), {}

print("Fast scoring functions defined")

Fast scoring functions defined


In [3]:
# Find all CSV files in snapshots
snapshot_csvs = glob.glob('/home/nonroot/snapshots/santa-2025/**/*.csv', recursive=True)
print(f"Found {len(snapshot_csvs)} CSV files in snapshots")

# Also include our own candidates
our_csvs = glob.glob('/home/code/submission_candidates/*.csv')
our_csvs += glob.glob('/home/code/experiments/*/submission*.csv')
our_csvs += glob.glob('/home/code/experiments/*/*.csv')
print(f"Found {len(our_csvs)} CSV files from our experiments")

all_csvs = list(set(snapshot_csvs + our_csvs))
print(f"Total unique CSV files: {len(all_csvs)}")

Found 113 CSV files in snapshots
Found 19 CSV files from our experiments
Total unique CSV files: 125


In [4]:
# Score all CSV files and find the best ones
print("Scoring all CSV files...")

csv_scores = []
for i, csv_path in enumerate(all_csvs):
    if i % 20 == 0:
        print(f"  Processing {i+1}/{len(all_csvs)}...")
    
    total_score, scores_per_n = score_csv_fast(csv_path)
    if total_score < float('inf'):
        csv_scores.append({
            'path': csv_path,
            'total_score': total_score,
            'scores_per_n': scores_per_n
        })

print(f"\nSuccessfully scored {len(csv_scores)} valid CSV files")

# Sort by total score
csv_scores.sort(key=lambda x: x['total_score'])

# Show top 10
print("\nTop 10 CSV files by total score:")
for i, entry in enumerate(csv_scores[:10]):
    print(f"  {i+1}. Score: {entry['total_score']:.6f} - {entry['path'].split('/')[-1]}")

Scoring all CSV files...
  Processing 1/125...


  Processing 21/125...


  Processing 41/125...


  Processing 61/125...


  Processing 81/125...


  Processing 101/125...


  Processing 121/125...



Successfully scored 125 valid CSV files

Top 10 CSV files by total score:
  1. Score: 85.087800 - submission_85.087800.csv
  2. Score: 85.094752 - submission_85.094752.csv
  3. Score: 85.104411 - submission_85.104411.csv
  4. Score: 85.108229 - submission_85.108229.csv
  5. Score: 85.122311 - submission_85.122311.csv
  6. Score: 85.135823 - submission_85.135823.csv
  7. Score: 85.150818 - submission_85.150818.csv
  8. Score: 85.210990 - submission_85.210990.csv
  9. Score: 85.374712 - submission_85.374712.csv
  10. Score: 85.384617 - submission_85.384617.csv


In [5]:
# Build ensemble: for each N, pick the best configuration across all sources
print("\nBuilding ensemble baseline...")

ensemble_configs = {}  # n -> (best_score, best_source, config_data)

for n in range(1, 201):
    best_score = float('inf')
    best_source = None
    best_data = None
    
    for entry in csv_scores:
        if n in entry['scores_per_n'] and entry['scores_per_n'][n] < best_score:
            best_score = entry['scores_per_n'][n]
            best_source = entry['path']
    
    ensemble_configs[n] = {
        'score': best_score,
        'source': best_source
    }

# Calculate ensemble total score
ensemble_total = sum(ensemble_configs[n]['score'] for n in range(1, 201))
print(f"Ensemble baseline total score: {ensemble_total:.6f}")

# Show improvement breakdown
print(f"\nComparison:")
print(f"  Best single CSV: {csv_scores[0]['total_score']:.6f}")
print(f"  Ensemble baseline: {ensemble_total:.6f}")
print(f"  Improvement: {csv_scores[0]['total_score'] - ensemble_total:.6f}")


Building ensemble baseline...
Ensemble baseline total score: 85.034311

Comparison:
  Best single CSV: 85.087800
  Ensemble baseline: 85.034311
  Improvement: 0.053489


In [6]:
# Create the ensemble CSV by extracting best configs from each source
print("\nCreating ensemble CSV...")

ensemble_rows = []
source_counts = {}

for n in range(1, 201):
    source_path = ensemble_configs[n]['source']
    if source_path is None:
        print(f"  WARNING: No valid source for N={n}")
        continue
    
    # Track source usage
    source_name = source_path.split('/')[-1]
    source_counts[source_name] = source_counts.get(source_name, 0) + 1
    
    # Load the configuration for this N
    df = pd.read_csv(source_path)
    df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
    group = df[df['N'] == n]
    
    for _, row in group.iterrows():
        ensemble_rows.append({
            'id': row['id'],
            'x': row['x'],
            'y': row['y'],
            'deg': row['deg']
        })

ensemble_df = pd.DataFrame(ensemble_rows)
ensemble_df.to_csv('/home/code/experiments/003_ensemble/ensemble_baseline.csv', index=False)

print(f"Ensemble CSV created with {len(ensemble_rows)} rows")
print(f"\nSource distribution (top 10):")
for source, count in sorted(source_counts.items(), key=lambda x: -x[1])[:10]:
    print(f"  {source}: {count} configurations")


Creating ensemble CSV...


Ensemble CSV created with 20100 rows

Source distribution (top 10):
  submission_85.087800.csv: 196 configurations
  candidate_001.csv: 3 configurations
  submission.csv: 1 configurations


In [7]:
# Verify ensemble score
verify_score, _ = score_csv_fast('/home/code/experiments/003_ensemble/ensemble_baseline.csv')
print(f"Verified ensemble score: {verify_score:.6f}")

Verified ensemble score: 85.034311


In [8]:
# Now apply our optimization pipeline to the ensemble baseline
# First, copy the C++ optimizer from experiment 002
shutil.copy('/home/code/experiments/002_multiphase/tree_packer_v2.cpp',
            '/home/code/experiments/003_ensemble/tree_packer_v2.cpp')

# Compile
os.chdir('/home/code/experiments/003_ensemble')
result = subprocess.run(
    ['g++', '-O3', '-march=native', '-std=c++17', '-fopenmp', '-o', 'tree_packer_v2', 'tree_packer_v2.cpp'],
    capture_output=True, text=True
)
if result.returncode == 0:
    print("C++ optimizer compiled successfully")
else:
    print(f"Compilation failed: {result.stderr}")

C++ optimizer compiled successfully


In [9]:
# Copy ensemble baseline to working file
shutil.copy('/home/code/experiments/003_ensemble/ensemble_baseline.csv',
            '/home/code/experiments/003_ensemble/submission.csv')

print(f"Starting optimization from ensemble baseline (score: {verify_score:.6f})")

Starting optimization from ensemble baseline (score: 85.034311)


In [10]:
# Run optimization with higher iterations
print("\n" + "=" * 60)
print("PHASE 1: C++ Optimizer on Ensemble Baseline")
print("=" * 60)

best_score = verify_score
best_file = '/home/code/experiments/003_ensemble/submission.csv'

for seed in range(3):
    print(f"\n--- Seed {seed} ---")
    
    if seed > 0:
        shutil.copy(best_file, '/home/code/experiments/003_ensemble/submission.csv')
    
    start_time = time.time()
    result = subprocess.run(
        ['./tree_packer_v2', '-n', '10000', '-r', '96', '-s', str(seed)],
        capture_output=True, text=True,
        cwd='/home/code/experiments/003_ensemble'
    )
    elapsed = time.time() - start_time
    
    print(f"Completed in {elapsed:.1f}s")
    print(result.stdout)
    
    if os.path.exists('/home/code/experiments/003_ensemble/submission_optimized.csv'):
        new_score, _ = score_csv_fast('/home/code/experiments/003_ensemble/submission_optimized.csv')
        print(f"Score: {new_score:.6f}")
        
        if new_score < best_score:
            best_score = new_score
            shutil.copy('/home/code/experiments/003_ensemble/submission_optimized.csv',
                       f'/home/code/experiments/003_ensemble/best_seed{seed}.csv')
            best_file = f'/home/code/experiments/003_ensemble/best_seed{seed}.csv'
            shutil.copy(best_file, '/home/code/experiments/003_ensemble/submission.csv')
            print(f"NEW BEST: {best_score:.6f}")

print(f"\nPhase 1 best score: {best_score:.6f}")


PHASE 1: C++ Optimizer on Ensemble Baseline

--- Seed 0 ---


Completed in 96.1s
Enhanced Tree Packer with Fractional Translation
Iterations: 10000, Rounds: 96, Seed: 0
Initial: 85.034311
Final:   85.023427



Score: 85.023422
NEW BEST: 85.023422

--- Seed 1 ---


Completed in 96.0s
Enhanced Tree Packer with Fractional Translation
Iterations: 10000, Rounds: 96, Seed: 1
Initial: 85.023422
Final:   84.981348



Score: 84.981346
NEW BEST: 84.981346

--- Seed 2 ---


Completed in 96.3s
Enhanced Tree Packer with Fractional Translation
Iterations: 10000, Rounds: 96, Seed: 2
Initial: 84.981346
Final:   84.897367

Score: 84.897367


NEW BEST: 84.897367

Phase 1 best score: 84.897367


In [None]:
# Phase 2: Backward Propagation
print("\n" + "=" * 60)
print("PHASE 2: Backward Propagation")
print("=" * 60)

# ChristmasTree class for validation
class ChristmasTree:
    def __init__(self, center_x='0', center_y='0', angle='0'):
        self.center_x = Decimal(str(center_x))
        self.center_y = Decimal(str(center_y))
        self.angle = Decimal(str(angle))

        trunk_w = Decimal('0.15')
        trunk_h = Decimal('0.2')
        base_w = Decimal('0.7')
        mid_w = Decimal('0.4')
        top_w = Decimal('0.25')
        tip_y = Decimal('0.8')
        tier_1_y = Decimal('0.5')
        tier_2_y = Decimal('0.25')
        base_y = Decimal('0.0')
        trunk_bottom_y = -trunk_h

        initial_polygon = Polygon([
            (Decimal('0.0') * scale_factor, tip_y * scale_factor),
            (top_w / Decimal('2') * scale_factor, tier_1_y * scale_factor),
            (top_w / Decimal('4') * scale_factor, tier_1_y * scale_factor),
            (mid_w / Decimal('2') * scale_factor, tier_2_y * scale_factor),
            (mid_w / Decimal('4') * scale_factor, tier_2_y * scale_factor),
            (base_w / Decimal('2') * scale_factor, base_y * scale_factor),
            (trunk_w / Decimal('2') * scale_factor, base_y * scale_factor),
            (trunk_w / Decimal('2') * scale_factor, trunk_bottom_y * scale_factor),
            (-(trunk_w / Decimal('2')) * scale_factor, trunk_bottom_y * scale_factor),
            (-(trunk_w / Decimal('2')) * scale_factor, base_y * scale_factor),
            (-(base_w / Decimal('2')) * scale_factor, base_y * scale_factor),
            (-(mid_w / Decimal('4')) * scale_factor, tier_2_y * scale_factor),
            (-(mid_w / Decimal('2')) * scale_factor, tier_2_y * scale_factor),
            (-(top_w / Decimal('4')) * scale_factor, tier_1_y * scale_factor),
            (-(top_w / Decimal('2')) * scale_factor, tier_1_y * scale_factor),
        ])
        rotated = affinity.rotate(initial_polygon, float(self.angle), origin=(0, 0))
        self.polygon = affinity.translate(rotated,
                                          xoff=float(self.center_x * scale_factor),
                                          yoff=float(self.center_y * scale_factor))

def load_trees(n, df):
    group_data = df[df["id"].str.startswith(f"{n:03d}_")]
    trees = []
    for _, row in group_data.iterrows():
        x = str(row["x"])[1:] if str(row["x"]).startswith('s') else str(row["x"])
        y = str(row["y"])[1:] if str(row["y"]).startswith('s') else str(row["y"])
        deg = str(row["deg"])[1:] if str(row["deg"]).startswith('s') else str(row["deg"])
        if x and y and deg:
            trees.append(ChristmasTree(x, y, deg))
    return trees

def get_side(trees):
    if not trees:
        return 0.0
    xys = np.concatenate([np.asarray(t.polygon.exterior.xy).T / float(scale_factor) for t in trees])
    return max(xys.max(axis=0) - xys.min(axis=0))

def has_overlap(trees):
    if len(trees) <= 1:
        return False
    polygons = [t.polygon for t in trees]
    tree_index = STRtree(polygons)
    for i, poly in enumerate(polygons):
        for idx in tree_index.query(poly):
            if idx != i and poly.intersects(polygons[idx]) and not poly.touches(polygons[idx]):
                return True
    return False

def backward_propagation(input_file, output_file):
    df = pd.read_csv(input_file)
    configs = {}
    sides = {}
    
    for n in range(1, 201):
        trees = load_trees(n, df)
        if trees:
            configs[n] = trees
            sides[n] = get_side(trees)
    
    print(f"Initial score: {sum(s**2/n for n, s in sides.items()):.6f}")
    
    improvements = 0
    for n in range(200, 1, -1):
        if n not in configs or (n-1) not in configs:
            continue
        
        current_side = sides[n-1]
        best_side = current_side
        best_idx = None
        
        for tree_idx in range(n):
            candidate = [t for i, t in enumerate(configs[n]) if i != tree_idx]
            if len(candidate) != n - 1:
                continue
            cand_side = get_side(candidate)
            if cand_side < best_side and not has_overlap(candidate):
                best_side = cand_side
                best_idx = tree_idx
        
        if best_idx is not None:
            configs[n-1] = [t for i, t in enumerate(configs[n]) if i != best_idx]
            sides[n-1] = best_side
            improvements += 1
    
    print(f"Improvements: {improvements}")
    
    rows = []
    for n in range(1, 201):
        if n in configs:
            for i, tree in enumerate(configs[n]):
                rows.append({
                    'id': f"{n:03d}_{i}",
                    'x': f"s{float(tree.center_x)}",
                    'y': f"s{float(tree.center_y)}",
                    'deg': f"s{float(tree.angle)}"
                })
    
    pd.DataFrame(rows).to_csv(output_file, index=False)
    final_score = sum(s**2/n for n, s in sides.items())
    print(f"Final score: {final_score:.6f}")
    return final_score

bp_score = backward_propagation(best_file, '/home/code/experiments/003_ensemble/submission_bp.csv')
if bp_score < best_score:
    best_score = bp_score
    best_file = '/home/code/experiments/003_ensemble/submission_bp.csv'
    print(f"Backward propagation improved to: {best_score:.6f}")

In [None]:
# Phase 3: Fix Direction
print("\n" + "=" * 60)
print("PHASE 3: Fix Direction")
print("=" * 60)

def calculate_bbox_at_angle(angle_deg, points):
    rad = np.radians(angle_deg)
    c, s = np.cos(rad), np.sin(rad)
    rot = points.dot(np.array([[c, s], [-s, c]]))
    return max(rot.max(axis=0) - rot.min(axis=0))

def optimize_rotation(trees):
    pts = []
    for t in trees:
        pts.extend(list(t.polygon.exterior.coords))
    pts = np.array(pts) / float(scale_factor)
    try:
        hull = pts[ConvexHull(pts).vertices]
    except:
        return get_side(trees), 0.0
    
    init = calculate_bbox_at_angle(0, hull)
    res = minimize_scalar(lambda a: calculate_bbox_at_angle(a, hull), bounds=(0.001, 89.999), method='bounded')
    if res.fun < init - 1e-8:
        return res.fun, res.x
    return init, 0.0

def apply_rotation(trees, angle):
    if not trees or abs(angle) < 1e-9:
        return trees
    
    bounds = [t.polygon.bounds for t in trees]
    center = np.array([(min(b[0] for b in bounds) + max(b[2] for b in bounds)) / 2,
                       (min(b[1] for b in bounds) + max(b[3] for b in bounds)) / 2]) / float(scale_factor)
    
    rad = np.radians(angle)
    c, s = np.cos(rad), np.sin(rad)
    rot = np.array([[c, -s], [s, c]])
    
    pts = np.array([[float(t.center_x), float(t.center_y)] for t in trees])
    rotated = (pts - center).dot(rot.T) + center
    
    return [ChristmasTree(str(rotated[i, 0]), str(rotated[i, 1]), str(float(trees[i].angle) + angle)) 
            for i in range(len(trees))]

def fix_direction(input_path, output_path):
    df = pd.read_csv(input_path)
    configs = {}
    sides = {}
    
    for n in range(1, 201):
        trees = load_trees(n, df)
        if trees:
            configs[n] = trees
            sides[n] = get_side(trees)
    
    print(f"Initial: {sum(s**2/n for n, s in sides.items()):.6f}")
    
    improved = 0
    for n in range(1, 201):
        if n not in configs or len(configs[n]) < 2:
            continue
        try:
            best_side, best_angle = optimize_rotation(configs[n])
            if abs(best_angle) > 0.001 and best_side < sides[n] - 1e-8:
                rotated = apply_rotation(configs[n], best_angle)
                if not has_overlap(rotated):
                    configs[n] = rotated
                    sides[n] = best_side
                    improved += 1
        except:
            pass
    
    print(f"Improved {improved} groups")
    
    rows = []
    for n in range(1, 201):
        if n in configs:
            for i, tree in enumerate(configs[n]):
                rows.append({
                    'id': f"{n:03d}_{i}",
                    'x': f"s{float(tree.center_x)}",
                    'y': f"s{float(tree.center_y)}",
                    'deg': f"s{float(tree.angle)}"
                })
    
    pd.DataFrame(rows).to_csv(output_path, index=False)
    final = sum(s**2/n for n, s in sides.items())
    print(f"Final: {final:.6f}")
    return final

fd_score = fix_direction(best_file, '/home/code/experiments/003_ensemble/submission_fd.csv')
if fd_score < best_score:
    best_score = fd_score
    best_file = '/home/code/experiments/003_ensemble/submission_fd.csv'
    print(f"Fix direction improved to: {best_score:.6f}")

In [None]:
# Final validation and copy to submission
print("\n" + "=" * 60)
print("FINAL RESULTS")
print("=" * 60)

final_score, _ = score_csv_fast(best_file)
print(f"Final score: {final_score:.6f}")

# Copy to submission folder
shutil.copy(best_file, '/home/submission/submission.csv')
print(f"Copied to /home/submission/submission.csv")

print(f"\n=== SUMMARY ===")
print(f"Ensemble baseline: {verify_score:.6f}")
print(f"Final optimized: {final_score:.6f}")
print(f"Improvement: {verify_score - final_score:.6f}")
print(f"Target: 68.931058")
print(f"Gap to target: {final_score - 68.931058:.6f}")