# Crodoc Backpacking Ensemble Approach

This notebook implements the crodoc kernel's ensemble approach:
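1. Load all candidate CSV solution files (the discovery cell below prints how many were found)
2. For each N from 1 to 200, keep the lowest-scoring configuration across all files
3. Apply backward iteration ("backpacking"): walking N from 200 down to 1, reuse the first N trees of a larger configuration whenever that gives a smaller bounding square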

In [1]:
import math
import numpy as np
import pandas as pd
import glob
import os
import time
from numba import njit

# Plain string: the message has no placeholders, so no f-string is needed.
print("Starting crodoc ensemble approach...")

Starting crodoc ensemble approach...


In [2]:
# Tree shape constants.
# Coordinates are in the tree's local frame: y = 0 is the bottom edge of the
# lowest foliage tier, and the outline is mirrored about the local y-axis.

# Full widths of the trunk and of the three foliage tiers.
TRUNK_W, BASE_W, MID_W, TOP_W = 0.15, 0.7, 0.4, 0.25

# Heights (local y) of the tip, the two tier boundaries and the foliage base.
TIP_Y, TIER_1_Y, TIER_2_Y, BASE_Y = 0.8, 0.5, 0.25, 0.0

# The trunk hangs below the foliage base.
TRUNK_H = 0.2
TRUNK_BOTTOM_Y = -TRUNK_H

@njit(cache=True)
def rotate_point(x, y, cos_a, sin_a):
    """Rotate (x, y) about the origin given the cosine and sine of the angle.

    Returns the rotated coordinates as an (x, y) tuple.
    """
    rotated_x = cos_a * x - sin_a * y
    rotated_y = sin_a * x + cos_a * y
    return rotated_x, rotated_y

@njit(cache=True)
def get_tree_vertices(cx, cy, angle_deg):
    """Return the tree outline as a (15, 2) float64 array of world coordinates.

    The local outline is rotated by ``angle_deg`` degrees about the local
    origin and then translated by (cx, cy). Vertices start at the tip, run
    down the right-hand side, across the trunk bottom, and back up the left.
    """
    theta = angle_deg * math.pi / 180.0
    cos_a = math.cos(theta)
    sin_a = math.sin(theta)

    # Half / quarter widths used by the mirrored right and left sides.
    half_top = TOP_W / 2.0
    quarter_top = TOP_W / 4.0
    half_mid = MID_W / 2.0
    quarter_mid = MID_W / 4.0
    half_base = BASE_W / 2.0
    half_trunk = TRUNK_W / 2.0

    outline = np.array([
        [0.0, TIP_Y],
        [half_top, TIER_1_Y],
        [quarter_top, TIER_1_Y],
        [half_mid, TIER_2_Y],
        [quarter_mid, TIER_2_Y],
        [half_base, BASE_Y],
        [half_trunk, BASE_Y],
        [half_trunk, TRUNK_BOTTOM_Y],
        [-half_trunk, TRUNK_BOTTOM_Y],
        [-half_trunk, BASE_Y],
        [-half_base, BASE_Y],
        [-quarter_mid, TIER_2_Y],
        [-half_mid, TIER_2_Y],
        [-quarter_top, TIER_1_Y],
        [-half_top, TIER_1_Y],
    ], dtype=np.float64)

    vertices = np.empty((15, 2), dtype=np.float64)
    for k in range(15):
        px = outline[k, 0]
        py = outline[k, 1]
        # Rotate about the local origin, then shift to the tree position.
        vertices[k, 0] = (px * cos_a - py * sin_a) + cx
        vertices[k, 1] = (px * sin_a + py * cos_a) + cy
    return vertices

@njit(cache=True)
def compute_bounding_box(all_vertices):
    """Return (min_x, min_y, max_x, max_y) over every vertex of every polygon.

    ``all_vertices`` is an iterable of 2-D arrays whose columns are x and y.
    With no vertices at all the result is (inf, inf, -inf, -inf).
    """
    lo_x = math.inf
    lo_y = math.inf
    hi_x = -math.inf
    hi_y = -math.inf
    for polygon in all_vertices:
        n_points = polygon.shape[0]
        for k in range(n_points):
            px = polygon[k, 0]
            py = polygon[k, 1]
            if px < lo_x:
                lo_x = px
            if px > hi_x:
                hi_x = px
            if py < lo_y:
                lo_y = py
            if py > hi_y:
                hi_y = py
    return lo_x, lo_y, hi_x, hi_y

@njit(cache=True)
def get_side_length(all_vertices):
    """Return the side of the smallest axis-aligned square covering all polygons.

    This is the larger of the width and height of the overall bounding box.
    """
    min_x, min_y, max_x, max_y = compute_bounding_box(all_vertices)
    width = max_x - min_x
    height = max_y - min_y
    return max(width, height)

@njit(cache=True)
def calculate_score_numba(all_vertices):
    """Return the per-configuration score: bounding-square side squared over tree count."""
    n_trees = len(all_vertices)
    side = get_side_length(all_vertices)
    return (side * side) / n_trees

In [3]:
def load_csv_solution(filepath):
    """Load one solution CSV and split it into per-N tree configurations.

    The file must have ``id``, ``x`` and ``y`` columns plus an angle column
    named either ``deg`` or ``angle``. Ids are expected to start with a
    zero-padded three-digit tree count and an underscore (``"007_"``).
    Numeric values may be stored as strings containing an ``s`` marker
    (e.g. ``"s0.25"``); the marker is stripped before conversion.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A dict mapping N (1..200) to ``{'xs', 'ys', 'degs'}`` float64 arrays,
        with rows ordered by their ``id`` string. Only groups that contain
        exactly N rows are kept. Returns ``None`` when the file cannot be
        read, lacks the required columns, or contains no complete group.
    """
    try:
        df = pd.read_csv(filepath)

        # Handle different column names
        if 'deg' in df.columns:
            angle_col = 'deg'
        elif 'angle' in df.columns:
            angle_col = 'angle'
        else:
            return None

        # Check if we have the expected format
        if not {'id', 'x', 'y'}.issubset(df.columns):
            return None

        # Strip the 's' marker from any non-numeric column. Testing for
        # "not numeric" (rather than dtype == object) also covers pandas
        # string dtypes, which would otherwise fail the float conversion.
        for col in ('x', 'y', angle_col):
            if not pd.api.types.is_numeric_dtype(df[col]):
                df[col] = (
                    df[col].astype(str).str.replace('s', '', regex=False).astype(float)
                )

        # Group rows once by the 4-character "NNN_" id prefix instead of
        # scanning the whole id column for each of the 200 tree counts.
        df['id'] = df['id'].astype(str)
        rows_by_prefix = {
            prefix: rows
            for prefix, rows in df.groupby(df['id'].str[:4], sort=False)
        }

        # Parse into per-N arrays
        result = {}
        for n in range(1, 201):
            group = rows_by_prefix.get(f"{n:03d}_")
            if group is None or len(group) != n:
                continue
            group = group.sort_values("id")
            result[n] = {
                'xs': group['x'].to_numpy(dtype=np.float64, copy=True),
                'ys': group['y'].to_numpy(dtype=np.float64, copy=True),
                'degs': group[angle_col].to_numpy(dtype=np.float64, copy=True),
            }

        return result if result else None
    except Exception:
        # Deliberate best-effort: this is called on arbitrary CSVs found on
        # disk, so anything unreadable or malformed is skipped, not fatal.
        return None

def calculate_score_for_n(xs, ys, degs):
    """Score one configuration of trees.

    Args:
        xs, ys: Per-tree positions (same length, indexable).
        degs: Per-tree rotation angles in degrees.

    Returns:
        ``(score, side)`` where ``side`` is the bounding-square side length of
        all tree polygons and ``score`` is ``side * side / n``.
    """
    n = len(xs)
    vertices = [get_tree_vertices(xs[i], ys[i], degs[i]) for i in range(n)]
    # Compute the bounding square once and derive the score from it, rather
    # than walking every vertex a second time just to get the side again.
    side = get_side_length(vertices)
    return side * side / n, side

In [4]:
# Find all CSV files
print("Finding all CSV files...")
search_patterns = ['/home/nonroot/snapshots/**/*.csv', '/home/code/**/*.csv']
found_paths = set()  # a set, so a file matched by two patterns is only listed once
for pattern in search_patterns:
    found_paths.update(glob.glob(pattern, recursive=True))

# Filter to likely solution files.
# glob returns paths in arbitrary order and the ensemble breaks score ties by
# file order, so sort to make the chosen sources reproducible between runs.
csv_files = sorted(
    f for f in found_paths
    if os.path.isfile(f) and 'sample_submission' not in f
)
print(f"Found {len(csv_files)} CSV files")

Finding all CSV files...
Found 699 CSV files


In [None]:
# Load all solutions and build ensemble
print("\nLoading all CSV files and building ensemble...")
t0 = time.time()

# Store best configuration for each N
best_configs = {}  # n -> {'xs': ..., 'ys': ..., 'degs': ..., 'score': ..., 'side': ..., 'source': ...}

loaded_count = 0
for i, csv_path in enumerate(csv_files):
    if i % 100 == 0:
        print(f"  Processing file {i}/{len(csv_files)}...")

    solutions = load_csv_solution(csv_path)
    if solutions is None:
        continue

    loaded_count += 1

    for n, config in solutions.items():
        score, side = calculate_score_for_n(config['xs'], config['ys'], config['degs'])

        # Skip NaN/inf scores (e.g. from missing coordinates). A NaN stored as
        # the first entry for an N could never be displaced, because
        # `score < nan` is always False.
        if not math.isfinite(score):
            continue

        # NOTE: selection is by score only; overlap between trees is not checked here.
        incumbent = best_configs.get(n)
        if incumbent is None or score < incumbent['score']:
            best_configs[n] = {
                'xs': config['xs'],
                'ys': config['ys'],
                'degs': config['degs'],
                'score': score,
                'side': side,
                'source': csv_path
            }

print(f"\nLoaded {loaded_count} valid CSV files in {time.time() - t0:.1f}s")
print(f"Found configurations for {len(best_configs)} different N values")

In [None]:
# Calculate ensemble score before backward iteration
unfilled_ns = [n for n in range(1, 201) if n not in best_configs]
if unfilled_ns:
    # The sum below silently skips these, so flag that it is a partial score.
    print(f"WARNING: no configuration for {len(unfilled_ns)} N values; ensemble score is partial")
ensemble_score = sum(best_configs[n]['score'] for n in range(1, 201) if n in best_configs)
print(f"\nEnsemble score (before backward iteration): {ensemble_score:.6f}")

# Compare with saspav_latest baseline
baseline_path = '/home/code/external_data/saspav_latest/santa-2025.csv'
# load_csv_solution returns None for an unreadable file; treat that as "no data".
baseline_solutions = load_csv_solution(baseline_path) or {}
baseline_missing = [n for n in range(1, 201) if n not in baseline_solutions]
if baseline_missing:
    # A partial baseline is not comparable; use NaN so later prints still format.
    baseline_score = float('nan')
    print(f"WARNING: baseline {baseline_path} is missing {len(baseline_missing)} N values; comparison unavailable")
else:
    baseline_score = sum(
        calculate_score_for_n(
            baseline_solutions[n]['xs'],
            baseline_solutions[n]['ys'],
            baseline_solutions[n]['degs'],
        )[0]
        for n in range(1, 201)
    )
print(f"Baseline score (saspav_latest): {baseline_score:.6f}")
print(f"Difference: {ensemble_score - baseline_score:+.6f}")

In [None]:
# Count how many N values each source file contributes to the ensemble
print("\nAnalyzing ensemble sources...")
source_counts = {}
for tree_count in range(1, 201):
    entry = best_configs.get(tree_count)
    if entry is None:
        continue
    origin = entry['source']
    if origin in source_counts:
        source_counts[origin] += 1
    else:
        source_counts[origin] = 1

# Stable descending sort: sources with equal counts keep first-seen order.
ranked_sources = sorted(source_counts.items(), key=lambda item: item[1], reverse=True)

print("\nTop 10 sources by number of best configurations:")
for origin, hits in ranked_sources[:10]:
    print(f"  {hits} configs: {origin}")

In [None]:
# Apply backward iteration (crodoc backpacking)
print("\nApplying backward iteration (backpacking)...")

# "Donor" state carried from larger N to smaller N:
#   best_config - the configuration whose leading trees are currently reused
#   best_n      - the N that configuration was stored under
#   best_side   - side length of the configuration chosen at the latest N
best_side = float('inf')
best_n = None
best_config = None
improvements = []

# Iterate from 200 down to 1
for n in range(200, 0, -1):
    if n not in best_configs:
        continue

    current_config = best_configs[n]
    current_side = current_config['side']

    # Evaluate the donor's first n trees at every N. Comparing only against
    # the donor's full-size side is too coarse: dropping trees can shrink the
    # donor below a stored configuration that passes that check.
    adapted_side = float('inf')
    if best_config is not None:
        # Take first n trees from the donor (it was stored under a larger N)
        adapted_xs = best_config['xs'][:n]
        adapted_ys = best_config['ys'][:n]
        adapted_degs = best_config['degs'][:n]
        adapted_score, adapted_side = calculate_score_for_n(adapted_xs, adapted_ys, adapted_degs)

    if adapted_side < current_side:
        improvement_pct = ((current_side - adapted_side) / current_side) * 100
        improvements.append({
            'n': n,
            'original_side': current_side,
            'adapted_side': adapted_side,
            'improvement_pct': improvement_pct
        })

        # Update best_configs with adapted configuration
        best_configs[n] = {
            'xs': adapted_xs,
            'ys': adapted_ys,
            'degs': adapted_degs,
            'score': adapted_score,
            'side': adapted_side,
            'source': f'adapted_from_n={best_n}'
        }
        # The donor stays the same; only the side it achieves at this N changes.
        best_side = adapted_side
    else:
        # The stored configuration is at least as good as the donor prefix.
        # On a strict win it becomes the new donor; on a tie the donor is kept.
        if current_side < adapted_side:
            best_n = n
            best_config = current_config
        best_side = current_side

print(f"Found {len(improvements)} improvements through backward iteration")
if improvements:
    print("\nTop 10 improvements:")
    for imp in sorted(improvements, key=lambda x: -x['improvement_pct'])[:10]:
        print(f"  N={imp['n']}: {imp['original_side']:.6f} -> {imp['adapted_side']:.6f} ({imp['improvement_pct']:.2f}% reduction)")

In [None]:
# Total the per-N scores now that backward iteration has updated best_configs
per_n_scores = [best_configs[n]['score'] for n in range(1, 201) if n in best_configs]
final_score = sum(per_n_scores)
baseline_gain = baseline_score - final_score

print(f"\nFinal ensemble score (after backward iteration): {final_score:.6f}")
print(f"Baseline score (saspav_latest): {baseline_score:.6f}")
print(f"Improvement over baseline: {baseline_gain:+.6f}")

In [None]:
# Save submission
print("\nSaving submission...")
os.makedirs('/home/submission', exist_ok=True)

# Write the layout the loader in this notebook is built to read back:
# ids "NNN_i", an angle column named 'deg', and values carrying an 's' prefix.
# NOTE(review): this is believed to be the official competition format
# (string values prefixed with 's' to avoid precision loss) - confirm against
# the competition's submission rules before uploading.
rows = []
missing_ns = []
for n in range(1, 201):
    if n not in best_configs:
        missing_ns.append(n)
        continue
    config = best_configs[n]
    for i in range(n):
        rows.append({
            'id': f'{n:03d}_{i}',
            # float(...) first so repr() yields the shortest round-trip
            # decimal and not a numpy scalar repr.
            'x': f"s{float(config['xs'][i])!r}",
            'y': f"s{float(config['ys'][i])!r}",
            'deg': f"s{float(config['degs'][i])!r}"
        })

if missing_ns:
    # An incomplete file is still written, but make the gap visible.
    print(f"WARNING: submission is missing {len(missing_ns)} N values: {missing_ns}")

# Explicit columns keep the header correct even if no rows were produced.
df = pd.DataFrame(rows, columns=['id', 'x', 'y', 'deg'])
df.to_csv('/home/submission/submission.csv', index=False)
print("Saved to /home/submission/submission.csv")
print(f"Total rows: {len(df)}")
print(f"\nFinal score: {final_score:.9f}")