# Crodoc Backpacking Ensemble Approach

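This notebook implements the crodoc kernel's ensemble approach:
1. Load all candidate CSV files (699 found in the run recorded below)
2. For each N from 1 to 200, pick the best-scoring configuration across all files, keeping only configurations whose trees do not overlap
3. Apply backward iteration to propagate good configurations from larger N to smaller N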

In [1]:
import math
import numpy as np
import pandas as pd
import glob
import os
import time
from numba import njit

# Announce the start of the run (plain literal: there is nothing to interpolate).
print("Starting crodoc ensemble approach...")

Starting crodoc ensemble approach...


In [2]:
# Tree outline dimensions in the tree's local frame (before rotation/translation).

# Heights (y) of the horizontal levels of the outline, from the tip downwards.
TIP_Y = 0.8
TIER_1_Y = 0.5
TIER_2_Y = 0.25
BASE_Y = 0.0

# Full widths of the outline at TIER_1_Y, TIER_2_Y and BASE_Y respectively.
TOP_W = 0.25
MID_W = 0.4
BASE_W = 0.7

# Trunk: full width and height; its bottom edge sits TRUNK_H below y = 0.
TRUNK_W = 0.15
TRUNK_H = 0.2
TRUNK_BOTTOM_Y = -TRUNK_H

@njit(cache=True)
def rotate_point(x, y, cos_a, sin_a):
    """Rotate (x, y) about the origin, given the cosine and sine of the angle.

    Returns the rotated coordinates as an (x, y) tuple.
    """
    x_rot = x * cos_a - y * sin_a
    y_rot = x * sin_a + y * cos_a
    return x_rot, y_rot

@njit(cache=True)
def get_tree_vertices(cx, cy, angle_deg):
    """Return the 15 outline vertices of a tree placed at (cx, cy).

    The local outline starts at the tip, runs down the positive-x side through
    the three tiers, across the trunk, and back up the negative-x side. Each
    vertex is rotated by ``angle_deg`` degrees about the local origin and then
    shifted by (cx, cy).

    Returns:
        (15, 2) float64 array of x, y coordinates.
    """
    theta = angle_deg * math.pi / 180.0
    cos_a = math.cos(theta)
    sin_a = math.sin(theta)

    # Horizontal offsets used by the outline table below.
    half_top = TOP_W / 2.0
    quarter_top = TOP_W / 4.0
    half_mid = MID_W / 2.0
    quarter_mid = MID_W / 4.0
    half_base = BASE_W / 2.0
    half_trunk = TRUNK_W / 2.0

    outline = np.array([
        [0.0, TIP_Y],
        [half_top, TIER_1_Y],
        [quarter_top, TIER_1_Y],
        [half_mid, TIER_2_Y],
        [quarter_mid, TIER_2_Y],
        [half_base, BASE_Y],
        [half_trunk, BASE_Y],
        [half_trunk, TRUNK_BOTTOM_Y],
        [-half_trunk, TRUNK_BOTTOM_Y],
        [-half_trunk, BASE_Y],
        [-half_base, BASE_Y],
        [-quarter_mid, TIER_2_Y],
        [-half_mid, TIER_2_Y],
        [-quarter_top, TIER_1_Y],
        [-half_top, TIER_1_Y],
    ], dtype=np.float64)

    vertices = np.empty((15, 2), dtype=np.float64)
    for k in range(15):
        rx, ry = rotate_point(outline[k, 0], outline[k, 1], cos_a, sin_a)
        vertices[k, 0] = rx + cx
        vertices[k, 1] = ry + cy
    return vertices

@njit(cache=True)
def compute_bounding_box(all_vertices):
    """Return (min_x, min_y, max_x, max_y) over every vertex of every polygon.

    ``all_vertices`` is a sequence of 2-column vertex arrays. With no vertices
    at all the minima stay at +inf and the maxima at -inf.
    """
    lo_x = math.inf
    lo_y = math.inf
    hi_x = -math.inf
    hi_y = -math.inf
    for poly in all_vertices:
        for k in range(poly.shape[0]):
            px = poly[k, 0]
            py = poly[k, 1]
            if px < lo_x:
                lo_x = px
            if px > hi_x:
                hi_x = px
            if py < lo_y:
                lo_y = py
            if py > hi_y:
                hi_y = py
    return lo_x, lo_y, hi_x, hi_y

@njit(cache=True)
def get_side_length(all_vertices):
    """Return the side of the smallest axis-aligned square covering all polygons.

    This is the larger of the bounding box's width and height.
    """
    min_x, min_y, max_x, max_y = compute_bounding_box(all_vertices)
    width = max_x - min_x
    height = max_y - min_y
    return max(width, height)

@njit(cache=True)
def calculate_score_numba(all_vertices):
    """Return the score: bounding-square side squared, divided by the polygon count."""
    n_polygons = len(all_vertices)
    side = get_side_length(all_vertices)
    return side * side / n_polygons

In [3]:
def load_csv_solution(filepath):
    """Load one solution CSV and split it into per-N tree configurations.

    The file needs an ``id`` column whose values start with a zero-padded
    ``NNN_`` group prefix, ``x`` and ``y`` columns, and an angle column named
    either ``deg`` or ``angle``. Coordinate values may be plain numbers or
    strings carrying an ``s`` marker (e.g. ``s0.25``), which is stripped.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        dict mapping n (1..200) to ``{'xs', 'ys', 'degs'}`` float64 arrays,
        ordered by ``id``, for every group that has exactly n rows and only
        finite values. ``None`` when the file cannot be read or parsed, lacks
        the required columns, or yields no usable group.
    """
    try:
        df = pd.read_csv(filepath)

        # Handle different column names
        if 'deg' in df.columns:
            angle_col = 'deg'
        elif 'angle' in df.columns:
            angle_col = 'angle'
        else:
            return None

        # Check if we have the expected format
        if 'id' not in df.columns:
            return None

        # Strip the 's' marker from any non-numeric column. Testing for
        # "not numeric" rather than "dtype == object" also covers pandas'
        # dedicated string dtypes, which would otherwise be left unparsed.
        for col in ('x', 'y', angle_col):
            if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
                df[col] = (
                    df[col].astype(str).str.replace('s', '', regex=False).astype(float)
                )

        # Bucket rows by their 4-character "NNN_" prefix in a single pass
        # instead of scanning the whole id column once per N.
        groups = {key: rows for key, rows in df.groupby(df['id'].str[:4])}

        result = {}
        for n in range(1, 201):
            group = groups.get(f"{n:03d}_")
            if group is None or len(group) != n:
                continue
            group = group.sort_values('id')
            xs = group['x'].values.astype(np.float64)
            ys = group['y'].values.astype(np.float64)
            degs = group[angle_col].values.astype(np.float64)
            # NaN/inf placements are unusable: the bounding-box comparisons
            # ignore NaN, which would make such a group look better than it is.
            if not (np.isfinite(xs).all() and np.isfinite(ys).all()
                    and np.isfinite(degs).all()):
                continue
            result[n] = {'xs': xs, 'ys': ys, 'degs': degs}

        return result if result else None
    except Exception:
        # Deliberate best-effort: any unreadable or malformed file is skipped.
        return None

def calculate_score_for_n(xs, ys, degs):
    """Score one configuration of trees.

    Args:
        xs, ys: Per-tree positions.
        degs: Per-tree rotation angles in degrees.

    Returns:
        (score, side): ``side`` is the bounding-square side length of all tree
        polygons and ``score`` is ``side * side / n`` for the n trees given.
    """
    n = len(xs)
    vertices = [get_tree_vertices(xs[i], ys[i], degs[i]) for i in range(n)]
    # Derive the score from a single bounding-box pass; scoring and measuring
    # the side separately walked every vertex twice for the same numbers.
    side = get_side_length(vertices)
    return side * side / n, side

In [4]:
# Find all CSV files
print("Finding all CSV files...")
csv_files = []
for pattern in ['/home/nonroot/snapshots/**/*.csv', '/home/code/**/*.csv']:
    csv_files.extend(glob.glob(pattern, recursive=True))

# Filter to likely solution files. Sort the result: glob returns paths in
# arbitrary order, and the ensemble keeps the first file seen when scores tie,
# so a fixed order makes the chosen sources reproducible between runs.
csv_files = sorted(
    f for f in csv_files if os.path.isfile(f) and 'sample_submission' not in f
)
print(f"Found {len(csv_files)} CSV files")

Finding all CSV files...
Found 699 CSV files


In [5]:
# Scan every candidate file and keep, for each N, the lowest-scoring configuration.
# Nothing here checks whether the trees of a configuration overlap.
print("\nLoading all CSV files and building ensemble...")
t0 = time.time()

# n -> {'xs', 'ys', 'degs', 'score', 'side', 'source'} of the best candidate so far
best_configs = {}

loaded_count = 0
total_files = len(csv_files)
for file_idx in range(total_files):
    csv_path = csv_files[file_idx]
    if file_idx % 100 == 0:
        print(f"  Processing file {file_idx}/{total_files}...")

    solutions = load_csv_solution(csv_path)
    if solutions is not None:
        loaded_count += 1

        for n in solutions:
            config = solutions[n]
            score, side = calculate_score_for_n(config['xs'], config['ys'], config['degs'])

            # Keep the incumbent unless this candidate is strictly better.
            incumbent = best_configs.get(n)
            if incumbent is not None and not (score < incumbent['score']):
                continue

            best_configs[n] = {
                'xs': config['xs'],
                'ys': config['ys'],
                'degs': config['degs'],
                'score': score,
                'side': side,
                'source': csv_path,
            }

elapsed = time.time() - t0
print(f"\nLoaded {loaded_count} valid CSV files in {elapsed:.1f}s")
print(f"Found configurations for {len(best_configs)} different N values")


Loading all CSV files and building ensemble...
  Processing file 0/699...


  Processing file 100/699...


  Processing file 200/699...


  Processing file 300/699...


  Processing file 400/699...


  Processing file 500/699...


  Processing file 600/699...



Loaded 694 valid CSV files in 518.1s
Found configurations for 200 different N values


In [6]:
# Total the per-N scores of the ensemble as it stands before backward iteration
ensemble_per_n = [best_configs[n]['score'] for n in range(1, 201) if n in best_configs]
ensemble_score = sum(ensemble_per_n)
print(f"\nEnsemble score (before backward iteration): {ensemble_score:.6f}")

# Reference point: the saspav_latest file, scored with the same functions
baseline_path = '/home/code/external_data/saspav_latest/santa-2025.csv'
baseline_solutions = load_csv_solution(baseline_path)
baseline_per_n = [
    calculate_score_for_n(cfg['xs'], cfg['ys'], cfg['degs'])[0]
    for cfg in (baseline_solutions[n] for n in range(1, 201))
]
baseline_score = sum(baseline_per_n)
print(f"Baseline score (saspav_latest): {baseline_score:.6f}")
print(f"Difference: {ensemble_score - baseline_score:+.6f}")


Ensemble score (before backward iteration): 51.384395


Baseline score (saspav_latest): 70.659958
Difference: -19.275563


In [7]:
# Tally how many N values each source file contributes to the ensemble
print("\nAnalyzing ensemble sources...")
source_counts = {}
for n in range(1, 201):
    if n not in best_configs:
        continue
    source = best_configs[n]['source']
    if source in source_counts:
        source_counts[source] += 1
    else:
        source_counts[source] = 1

print("\nTop 10 sources by number of best configurations:")
# Descending by count; equal counts keep their first-seen order.
ranked_sources = sorted(source_counts.items(), key=lambda item: item[1], reverse=True)
for source, count in ranked_sources[:10]:
    print(f"  {count} configs: {source}")


Analyzing ensemble sources...

Top 10 sources by number of best configurations:
  145 configs: /home/nonroot/snapshots/santa-2025/21145963314/code/experiments/011_comprehensive_ensemble/ensemble.csv
  31 configs: /home/nonroot/snapshots/santa-2025/21145968755/code/experiments/007_eazy_optimizer/submission.csv
  13 configs: /home/nonroot/snapshots/santa-2025/21145968755/code/submission_candidates/candidate_004.csv
  5 configs: /home/nonroot/snapshots/santa-2025/21145966992/code/experiments/001_baseline/submission.csv
  3 configs: /home/nonroot/snapshots/santa-2025/21108486172/code/experiments/submission_v21.csv
  2 configs: /home/nonroot/snapshots/santa-2025/21108486172/code/experiments/submission.csv
  1 configs: /home/nonroot/snapshots/santa-2025/21116303805/code/submission.csv


In [9]:
# Debug: spot-check per-N ensemble scores against the baseline
print("Debugging ensemble scores for specific N values:")
for n in (1, 2, 5, 10, 20, 50, 100, 150, 200):
    if n not in best_configs:
        continue
    config = best_configs[n]
    baseline_config = baseline_solutions[n]

    ensemble_score_n = config['score']
    baseline_score_n = calculate_score_for_n(
        baseline_config['xs'], baseline_config['ys'], baseline_config['degs']
    )[0]
    diff_n = ensemble_score_n - baseline_score_n

    print(f"N={n}: ensemble={ensemble_score_n:.6f}, baseline={baseline_score_n:.6f}, diff={diff_n:+.6f}")
    print(f"       source: {config['source'][:80]}...")

Debugging ensemble scores for specific N values:
N=1: ensemble=0.661250, baseline=0.661250, diff=+0.000000
       source: /home/nonroot/snapshots/santa-2025/21116303805/code/submission.csv...
N=2: ensemble=0.437328, baseline=0.450779, diff=-0.013452
       source: /home/nonroot/snapshots/santa-2025/21145966992/code/experiments/001_baseline/sub...
N=5: ensemble=0.212694, baseline=0.416850, diff=-0.204155
       source: /home/nonroot/snapshots/santa-2025/21108486172/code/experiments/submission.csv...
N=10: ensemble=0.154792, baseline=0.376630, diff=-0.221838
       source: /home/nonroot/snapshots/santa-2025/21145968755/code/submission_candidates/candid...
N=20: ensemble=0.061771, baseline=0.376057, diff=-0.314286
       source: /home/nonroot/snapshots/santa-2025/21145968755/code/submission_candidates/candid...
N=50: ensemble=0.034288, baseline=0.360753, diff=-0.326466
       source: /home/nonroot/snapshots/santa-2025/21145963314/code/experiments/011_comprehensiv...
N=100: ensemble=0.3376

In [10]:
# Inspect the suspicious N=5 ensemble entry
n = 5
config = best_configs[n]
print(f"N={n} configuration from {config['source']}:")
for field in ('xs', 'ys', 'degs'):
    print(f"  {field}: {config[field]}")
for field in ('score', 'side'):
    print(f"  {field}: {config[field]:.6f}")

# Recompute side and score from scratch to confirm the stored values
vertices = []
for i in range(n):
    vertices.append(get_tree_vertices(config['xs'][i], config['ys'][i], config['degs'][i]))
side = get_side_length(vertices)
score = side * side / n
print(f"\nVerified: side={side:.6f}, score={score:.6f}")

N=5 configuration from /home/nonroot/snapshots/santa-2025/21108486172/code/experiments/submission.csv:
  xs: [-0.23810728  0.10801893  0.2641822  -0.19595589 -0.30662785]
  ys: [1.15667829 0.92658514 1.05882798 1.51535109 1.31740825]
  degs: [245.09087297 336.00203216  84.89213506 173.59810744 299.3586456 ]
  score: 0.212694
  side: 1.031248

Verified: side=1.031248, score=0.212694


In [11]:
# Check for overlaps in the N=5 configuration
from shapely.geometry import Polygon
from shapely.ops import unary_union

def check_overlaps(xs, ys, degs):
    """Return (i, j, area) for each pair of trees that overlap.

    A pair counts when the polygons intersect without merely touching and the
    intersection area exceeds 1e-10.
    """
    polygons = [
        Polygon(get_tree_vertices(xs[k], ys[k], degs[k]))
        for k in range(len(xs))
    ]

    overlaps = []
    for i, poly_i in enumerate(polygons):
        for j in range(i + 1, len(polygons)):
            poly_j = polygons[j]
            if not poly_i.intersects(poly_j):
                continue
            if poly_i.touches(poly_j):
                continue
            overlap_area = poly_i.intersection(poly_j).area
            if overlap_area > 1e-10:
                overlaps.append((i, j, overlap_area))
    return overlaps

# Run the overlap check on the N=5 ensemble entry
n = 5
config = best_configs[n]
overlaps = check_overlaps(config['xs'], config['ys'], config['degs'])
print(f"N={n}: Found {len(overlaps)} overlaps")
for pair in overlaps:
    i, j, area = pair
    print(f"  Trees {i} and {j} overlap with area {area:.6f}")

N=5: Found 10 overlaps
  Trees 0 and 1 overlap with area 0.099947
  Trees 0 and 2 overlap with area 0.144939
  Trees 0 and 3 overlap with area 0.096061
  Trees 0 and 4 overlap with area 0.052814
  Trees 1 and 2 overlap with area 0.131988
  Trees 1 and 3 overlap with area 0.001699
  Trees 1 and 4 overlap with area 0.002621
  Trees 2 and 3 overlap with area 0.045500
  Trees 2 and 4 overlap with area 0.015281
  Trees 3 and 4 overlap with area 0.138024


In [13]:
# Check all configurations for overlaps
print("Checking all configurations for overlaps...")

def check_overlaps_safe(xs, ys, degs, min_area=1e-10):
    """Find overlapping tree pairs, reporting geometry failures as "unknown".

    Args:
        xs, ys: Per-tree positions.
        degs: Per-tree rotation angles in degrees.
        min_area: Intersection areas at or below this value are not reported
            as overlaps.

    Returns:
        List of (i, j, overlap_area), one entry per overlapping pair (empty
        when none overlap), or None when a polygon cannot be built or a pair
        cannot be evaluated, so that an unverified configuration is never
        mistaken for a valid one.
    """
    n = len(xs)
    polygons = []
    boxes = []  # (min_x, min_y, max_x, max_y) of each tree's vertices
    for i in range(n):
        verts = get_tree_vertices(xs[i], ys[i], degs[i])
        # Close the polygon by adding first point at end
        verts_closed = np.vstack([verts, verts[0]])
        try:
            poly = Polygon(verts_closed)
            if not poly.is_valid:
                poly = poly.buffer(0)  # Fix invalid polygons
        except Exception:
            # `Exception`, not a bare except, so Ctrl-C still stops a long scan.
            return None  # Can't check this config
        polygons.append(poly)
        boxes.append((verts[:, 0].min(), verts[:, 1].min(),
                      verts[:, 0].max(), verts[:, 1].max()))

    # Check pairwise overlaps
    overlaps = []
    for i in range(n):
        ax0, ay0, ax1, ay1 = boxes[i]
        for j in range(i + 1, n):
            bx0, by0, bx1, by1 = boxes[j]
            # Strictly separated vertex bounding boxes cannot intersect, so
            # skip the geometry calls for such pairs.
            if ax1 < bx0 or bx1 < ax0 or ay1 < by0 or by1 < ay0:
                continue
            try:
                if polygons[i].intersects(polygons[j]) and not polygons[i].touches(polygons[j]):
                    overlap_area = polygons[i].intersection(polygons[j]).area
                    if overlap_area > min_area:
                        overlaps.append((i, j, overlap_area))
            except Exception:
                # A pair that cannot be evaluated leaves the result unknown;
                # ignoring it would report the configuration as overlap-free.
                return None
    return overlaps

# Sort every ensemble entry into "has overlaps" or "clean"; entries whose
# check returned None are left out of both lists.
invalid_configs = []
valid_configs = []
for n in range(1, 201):
    if n not in best_configs:
        continue
    config = best_configs[n]
    overlaps = check_overlaps_safe(config['xs'], config['ys'], config['degs'])
    if overlaps is None:
        continue
    if len(overlaps) > 0:
        invalid_configs.append((n, len(overlaps), config['source']))
    else:
        valid_configs.append(n)

print(f"\nFound {len(invalid_configs)} configurations with overlaps")
print(f"Found {len(valid_configs)} valid configurations")
print("\nFirst 20 invalid configurations:")
for entry in invalid_configs[:20]:
    n, num_overlaps, source = entry
    print(f"  N={n}: {num_overlaps} overlaps, source: {source[:60]}...")

Checking all configurations for overlaps...



Found 167 configurations with overlaps
Found 32 valid configurations

First 20 invalid configurations:
  N=2: 1 overlaps, source: /home/nonroot/snapshots/santa-2025/21145966992/code/experime...
  N=3: 3 overlaps, source: /home/nonroot/snapshots/santa-2025/21108486172/code/experime...
  N=4: 6 overlaps, source: /home/nonroot/snapshots/santa-2025/21108486172/code/experime...
  N=5: 10 overlaps, source: /home/nonroot/snapshots/santa-2025/21108486172/code/experime...
  N=6: 15 overlaps, source: /home/nonroot/snapshots/santa-2025/21108486172/code/experime...
  N=7: 21 overlaps, source: /home/nonroot/snapshots/santa-2025/21108486172/code/experime...
  N=8: 21 overlaps, source: /home/nonroot/snapshots/santa-2025/21145968755/code/submissi...
  N=9: 28 overlaps, source: /home/nonroot/snapshots/santa-2025/21145968755/code/submissi...
  N=10: 36 overlaps, source: /home/nonroot/snapshots/santa-2025/21145968755/code/submissi...
  N=11: 45 overlaps, source: /home/nonroot/snapshots/santa-2025/211459

In [14]:
# Rebuild ensemble with overlap checking - only keep valid configurations
print("Rebuilding ensemble with overlap checking...")
t0 = time.time()

# Store best VALID configuration for each N
best_valid_configs = {}

for i, csv_path in enumerate(csv_files):
    if i % 100 == 0:
        print(f"  Processing file {i}/{len(csv_files)}...")

    solutions = load_csv_solution(csv_path)
    if solutions is None:
        continue

    for n, config in solutions.items():
        score, side = calculate_score_for_n(config['xs'], config['ys'], config['degs'])

        # Scoring is cheap and the pairwise overlap check is not, so only run
        # the overlap check for candidates that would replace the incumbent.
        # The selected configurations are the same as when checking everything.
        incumbent = best_valid_configs.get(n)
        if incumbent is not None and not (score < incumbent['score']):
            continue

        overlaps = check_overlaps_safe(config['xs'], config['ys'], config['degs'])
        if overlaps is None or overlaps:  # Skip if can't check or has overlaps
            continue

        best_valid_configs[n] = {
            'xs': config['xs'],
            'ys': config['ys'],
            'degs': config['degs'],
            'score': score,
            'side': side,
            'source': csv_path
        }

print(f"\nRebuilt ensemble in {time.time() - t0:.1f}s")
print(f"Found valid configurations for {len(best_valid_configs)} different N values")

Rebuilding ensemble with overlap checking...
  Processing file 0/699...


  Processing file 100/699...


  Processing file 200/699...


  Processing file 300/699...


  Processing file 400/699...


  Processing file 500/699...


  Processing file 600/699...



Rebuilt ensemble in 5618.4s
Found valid configurations for 200 different N values


In [15]:
# Total the per-N scores of the overlap-free ensemble and compare with the baseline
valid_per_n = [
    best_valid_configs[n]['score'] for n in range(1, 201) if n in best_valid_configs
]
valid_ensemble_score = sum(valid_per_n)
score_gap = valid_ensemble_score - baseline_score
print(f"Valid ensemble score: {valid_ensemble_score:.6f}")
print(f"Baseline score (saspav_latest): {baseline_score:.6f}")
print(f"Difference: {score_gap:+.6f}")

Valid ensemble score: 70.659944
Baseline score (saspav_latest): 70.659958
Difference: -0.000015


In [16]:
# List every N where the valid ensemble beats the baseline by more than 1e-9
print("Checking for improvements over baseline...")
improvements = []
for n in range(1, 201):
    if n not in best_valid_configs:
        continue
    config = best_valid_configs[n]
    baseline_config = baseline_solutions[n]
    baseline_score_n = calculate_score_for_n(
        baseline_config['xs'], baseline_config['ys'], baseline_config['degs']
    )[0]

    if config['score'] < baseline_score_n - 1e-9:
        improvements.append((n, baseline_score_n, config['score'], config['source']))

print(f"Found {len(improvements)} N values with improvements:")
for entry in improvements:
    n, baseline_n, ensemble_n, source = entry
    print(f"  N={n}: baseline={baseline_n:.9f}, ensemble={ensemble_n:.9f}, diff={ensemble_n - baseline_n:+.9f}")
    print(f"         source: {source}")

Checking for improvements over baseline...


Found 67 N values with improvements:
  N=2: baseline=0.450779183, ensemble=0.450779057, diff=-0.000000126
         source: /home/nonroot/snapshots/santa-2025/21117525284/code/experiments/003_preoptimized/submission_test.csv
  N=38: baseline=0.366522024, ensemble=0.366522022, diff=-0.000000002
         source: /home/nonroot/snapshots/santa-2025/21145968755/code/experiments/007_eazy_optimizer/submission.csv
  N=51: baseline=0.365574161, ensemble=0.365574047, diff=-0.000000114
         source: /home/nonroot/snapshots/santa-2025/21145968755/code/experiments/007_eazy_optimizer/submission.csv
  N=73: baseline=0.353203556, ensemble=0.353201196, diff=-0.000002360
         source: /home/nonroot/snapshots/santa-2025/21145968755/code/experiments/007_eazy_optimizer/submission.csv
  N=83: baseline=0.352425567, ensemble=0.352425470, diff=-0.000000097
         source: /home/nonroot/snapshots/santa-2025/21145968755/code/experiments/007_eazy_optimizer/submission.csv
  N=86: baseline=0.350087939, ensemb

In [None]:
# Save the valid ensemble as submission
print("Saving valid ensemble as submission...")
os.makedirs('/home/submission', exist_ok=True)

# One row per tree, grouped by N in ascending order.
# NOTE(review): values are written as plain floats under an 'angle' column,
# while the loader in this notebook also accepts 'deg' and 's'-prefixed
# values - confirm which format the consumer of this file expects.
rows = []
for n in range(1, 201):
    if n in best_valid_configs:
        config = best_valid_configs[n]
        for i in range(n):
            rows.append({
                'id': f'{n:03d}_{i:03d}',
                'x': config['xs'][i],
                'y': config['ys'][i],
                'angle': config['degs'][i]
            })

df = pd.DataFrame(rows)
df.to_csv('/home/submission/submission.csv', index=False)
print(f"Saved to /home/submission/submission.csv")
print(f"Total rows: {len(df)}")
# The last line of this cell had been stored as one JSON-escaped fragment
# and did not parse; these are the two statements it contained.
print(f"\nFinal score: {valid_ensemble_score:.9f}")
print(f"Improvement over baseline: {baseline_score - valid_ensemble_score:.9f}")

In [None]:
# Apply backward iteration (crodoc backpacking)
print("\nApplying backward iteration (backpacking)...")

# Start from the overlap-checked ensemble. The original `best_configs` was
# selected on score alone, so saving it would overwrite the valid submission
# with overlapping placements. Rebinding the name here (shallow copy, so
# `best_valid_configs` itself is untouched) keeps the cells below, which read
# `best_configs`, working on valid data. Dropping trees from an overlap-free
# configuration cannot introduce an overlap, so the adapted configurations
# built below need no further check.
best_configs = dict(best_valid_configs)

# Track best side length seen so far
best_side = float('inf')
best_n = None
best_config = None
improvements = []

# Iterate from 200 down to 1
for n in range(200, 0, -1):
    if n not in best_configs:
        continue

    current_config = best_configs[n]
    current_side = current_config['side']

    if current_side < best_side:
        # Current has better (lower) side - use it
        best_side = current_side
        best_n = n
        best_config = current_config
    else:
        # Current side is worse - try adapting from best by dropping trees
        if best_config is not None and len(best_config['xs']) >= n:
            # Take first n trees from best config
            adapted_xs = best_config['xs'][:n]
            adapted_ys = best_config['ys'][:n]
            adapted_degs = best_config['degs'][:n]

            adapted_score, adapted_side = calculate_score_for_n(adapted_xs, adapted_ys, adapted_degs)

            if adapted_side < current_side:
                improvement_pct = ((current_side - adapted_side) / current_side) * 100
                improvements.append({
                    'n': n,
                    'original_side': current_side,
                    'adapted_side': adapted_side,
                    'improvement_pct': improvement_pct
                })

                # Update best_configs with adapted configuration
                best_configs[n] = {
                    'xs': adapted_xs,
                    'ys': adapted_ys,
                    'degs': adapted_degs,
                    'score': adapted_score,
                    'side': adapted_side,
                    'source': f'adapted_from_n={best_n}'
                }

print(f"Found {len(improvements)} improvements through backward iteration")
if improvements:
    print("\nTop 10 improvements:")
    for imp in sorted(improvements, key=lambda x: -x['improvement_pct'])[:10]:
        print(f"  N={imp['n']}: {imp['original_side']:.6f} -> {imp['adapted_side']:.6f} ({imp['improvement_pct']:.2f}% reduction)")

In [None]:
# Total the per-N scores of best_configs and compare with the baseline
final_per_n = [best_configs[n]['score'] for n in range(1, 201) if n in best_configs]
final_score = sum(final_per_n)
print(f"\nFinal ensemble score (after backward iteration): {final_score:.6f}")
print(f"Baseline score (saspav_latest): {baseline_score:.6f}")
print(f"Improvement over baseline: {baseline_score - final_score:+.6f}")

In [None]:
# Write best_configs to the submission file, one row per tree
print("\nSaving submission...")
os.makedirs('/home/submission', exist_ok=True)

rows = []
for n in range(1, 201):
    if n not in best_configs:
        continue
    config = best_configs[n]
    rows.extend(
        {
            'id': f'{n:03d}_{i:03d}',
            'x': config['xs'][i],
            'y': config['ys'][i],
            'angle': config['degs'][i],
        }
        for i in range(n)
    )

df = pd.DataFrame(rows)
df.to_csv('/home/submission/submission.csv', index=False)
print("Saved to /home/submission/submission.csv")
print(f"Total rows: {len(df)}")
print(f"\nFinal score: {final_score:.9f}")