# Experiment 002: Ensemble Strategy

Combine the best configurations from multiple pre-optimized solutions.
For each N from 1 to 200, pick the source configuration whose bounding square (the larger of the bounding box's width and height) has the smallest side length.

In [None]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely.affinity import rotate, translate
import warnings
import os
warnings.filterwarnings('ignore')

# Outline of one tree in its local frame, stored as parallel coordinate lists:
# vertex i is (TX[i], TY[i]).  There are 15 vertices; the outline spans
# x in [-0.35, 0.35] and y in [-0.2, 0.8].
TX = [
    0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075,
    -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125,
]
TY = [
    0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2,
    -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5,
]

def create_tree_polygon(x, y, deg):
    """Return the tree outline rotated by ``deg`` and then moved to (x, y).

    The outline is built from the module-level ``TX``/``TY`` vertex lists.
    It is first rotated about the local origin (0, 0) and only afterwards
    translated, so ``(x, y)` is where the outline's local origin ends up.

    Args:
        x: Horizontal offset applied after rotation.
        y: Vertical offset applied after rotation.
        deg: Rotation angle handed to ``shapely.affinity.rotate`` (which
            interprets it in degrees unless told otherwise).

    Returns:
        A shapely ``Polygon`` for the placed tree.
    """
    outline = Polygon(list(zip(TX, TY)))
    turned = rotate(outline, deg, origin=(0, 0))
    return translate(turned, x, y)

def calculate_bounding_box_side(trees):
    """Return the side of the smallest axis-aligned square covering all trees.

    Args:
        trees: Iterable of ``(x, y, deg)`` triples, one per tree, in the form
            accepted by ``create_tree_polygon``.

    Returns:
        The larger of the overall width and overall height of the combined
        axis-aligned bounds of every tree polygon.

    Raises:
        ValueError: If ``trees`` is empty (there are no bounds to compare).
    """
    xs = []
    ys = []
    for x, y, deg in trees:
        # shapely bounds are ordered (minx, miny, maxx, maxy).
        min_x, min_y, max_x, max_y = create_tree_polygon(x, y, deg).bounds
        xs += [min_x, max_x]
        ys += [min_y, max_y]

    span_x = max(xs) - min(xs)
    span_y = max(ys) - min(ys)
    return max(span_x, span_y)

def parse_submission(filepath):
    """Read a submission CSV and group its trees by configuration size N.

    The file must provide the columns ``id``, ``x``, ``y`` and ``deg``.
    N is taken from the part of ``id`` before the first underscore.  The
    numeric columns may hold plain numbers or strings carrying a leading
    ``'s'`` (for example ``"s1.5"``); both forms are converted to ``float``.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A dict mapping each N to a list of ``(x, y, deg)`` tuples, kept in
        the order the rows appear in the file.
    """
    def to_number(raw):
        # Drop the leading 's' marker when the value arrives as a string.
        if isinstance(raw, str) and raw.startswith('s'):
            raw = raw[1:]
        return float(raw)

    table = pd.read_csv(filepath)
    for column in ('x', 'y', 'deg'):
        table[f'{column}_val'] = table[column].apply(to_number)
    table['N'] = table['id'].apply(lambda row_id: int(row_id.split('_')[0]))

    return {
        n: list(zip(rows['x_val'], rows['y_val'], rows['deg_val']))
        for n, rows in table.groupby('N')
    }

def calculate_score(side_lengths):
    """Sum ``side**2 / n`` over every entry of ``side_lengths``.

    Args:
        side_lengths: Mapping from N to the bounding-square side for that N.

    Returns:
        The total of the per-N terms; ``0`` for an empty mapping.
    """
    per_n_terms = [side ** 2 / n for n, side in side_lengths.items()]
    return sum(per_n_terms)

print("Functions defined successfully")

In [None]:
# Read every candidate submission that is present on disk.  Missing or
# unreadable files are reported and skipped, so `solutions` may end up with
# fewer entries than `csv_files`.
datasets_dir = '/home/code/exploration/datasets'

csv_files = [
    'submission.csv',       # jazivxt ~70.647
    'santa-2025.csv',       # saspav ~70.659
    'submission_best.csv',  # unknown
    '71.97.csv',            # ~71.97
    '72.49.csv',            # ~72.49
    'smartmanoj.csv',       # SmartManoj GitHub
]

# file name -> {N: [(x, y, deg), ...]}
solutions = {}
for csv_file in csv_files:
    filepath = os.path.join(datasets_dir, csv_file)
    if not os.path.exists(filepath):
        print(f"File not found: {csv_file}")
        continue

    try:
        trees_by_n = parse_submission(filepath)
    except Exception as e:
        print(f"Error loading {csv_file}: {e}")
    else:
        solutions[csv_file] = trees_by_n
        print(f"Loaded {csv_file}: {len(trees_by_n)} configurations")

print(f"\nTotal solutions loaded: {len(solutions)}")

In [None]:
# Measure the bounding-square side of every loaded solution for each N in
# 1..200 and report each solution's total score.
print("Calculating side lengths for all solutions...")

# source file name -> {N: side length}; N values a source lacks are omitted.
side_lengths_by_source = {}
for source_name, trees_by_n in solutions.items():
    side_lengths = {
        n: calculate_bounding_box_side(trees_by_n[n])
        for n in range(1, 201)
        if n in trees_by_n
    }
    side_lengths_by_source[source_name] = side_lengths
    total_score = calculate_score(side_lengths)
    print(f"{source_name}: Total score = {total_score:.6f}")

In [None]:
# Pick, for every N in 1..200, the source with the strictly smallest side.
# On a tie the source encountered first keeps the win.
print("\nFinding best configuration for each N...")

best_source_by_n = {}
best_side_by_n = {}

for n in range(1, 201):
    winner = None
    winning_side = float('inf')

    for source_name, side_lengths in side_lengths_by_source.items():
        if n not in side_lengths:
            continue
        candidate = side_lengths[n]
        if candidate < winning_side:
            winner, winning_side = source_name, candidate

    # No source covers this N: leave it out of both result dicts.
    if winner is None:
        continue
    best_source_by_n[n] = winner
    best_side_by_n[n] = winning_side

# Tally the number of N values won by each source.
source_wins = {}
for source in best_source_by_n.values():
    source_wins.setdefault(source, 0)
    source_wins[source] += 1

print("\nSource wins (number of N values where each source is best):")
for source, wins in sorted(source_wins.items(), key=lambda entry: entry[1], reverse=True):
    print(f"  {source}: {wins} wins")

In [None]:
# Score the per-N winners and compare against the fixed reference numbers.
ensemble_score = calculate_score(best_side_by_n)
baseline_score = 70.647327  # hard-coded reference value, not recomputed here

score_gain = baseline_score - ensemble_score
gap_to_target = ensemble_score - 68.919

print(f"\n=== ENSEMBLE RESULTS ===")
print(f"Baseline score: {baseline_score:.6f}")
print(f"Ensemble score: {ensemble_score:.6f}")
print(f"Improvement: {score_gain:.6f}")
print(f"Target: 68.919")
print(f"Gap to target: {gap_to_target:.3f} points")

In [None]:
# Show which N values improved
print("\nN values where ensemble improved over baseline (submission.csv):")
# The loading step skips files that are missing or fail to parse, so the
# baseline may be absent.  Fall back to an empty mapping instead of raising
# KeyError; the comparison below then simply finds no improvements.
baseline_sides = side_lengths_by_source.get('submission.csv', {})
if not baseline_sides:
    print("  (baseline submission.csv was not loaded; nothing to compare against)")

# Each entry: (N, baseline side, ensemble side, score improvement, winning source)
improvements = []
for n in range(1, 201):
    if n in baseline_sides and n in best_side_by_n:
        baseline_side = baseline_sides[n]
        ensemble_side = best_side_by_n[n]
        if ensemble_side < baseline_side:
            # Difference of the two per-N score terms side**2 / n.
            improvement = (baseline_side**2 / n) - (ensemble_side**2 / n)
            improvements.append((n, baseline_side, ensemble_side, improvement, best_source_by_n[n]))

print(f"\nTotal N values improved: {len(improvements)}")
print("\nTop 20 improvements:")
for n, base, ens, imp, src in sorted(improvements, key=lambda x: -x[3])[:20]:
    print(f"  N={n}: {base:.6f} -> {ens:.6f} (improvement: {imp:.6f}) from {src}")

In [None]:
# Create the ensemble submission
print("\nCreating ensemble submission...")

ensemble_rows = []
missing_n = []  # N values for which no loaded source had a configuration
for n in range(1, 201):
    # The selection step omits N values that no source covers; report them
    # instead of failing with a bare KeyError.
    if n not in best_source_by_n:
        missing_n.append(n)
        continue

    best_source = best_source_by_n[n]
    trees = solutions[best_source][n]

    for i, (x, y, deg) in enumerate(trees):
        # Values are written with a leading 's', the same form that
        # parse_submission strips when reading a file back.
        row = {
            'id': f"{n:03d}_{i}",
            'x': f"s{x}",
            'y': f"s{y}",
            'deg': f"s{deg}"
        }
        ensemble_rows.append(row)

if missing_n:
    print(f"WARNING: no configuration available for N in {missing_n}; "
          "the ensemble submission is incomplete")

# Fix the column set explicitly so the frame has the expected header even
# when no rows were produced.
ensemble_df = pd.DataFrame(ensemble_rows, columns=['id', 'x', 'y', 'deg'])
print(f"Ensemble submission has {len(ensemble_df)} rows")
print(f"\nFirst 10 rows:")
print(ensemble_df.head(10))

In [None]:
# Write the ensemble to disk, then read the file back and rescore it as a
# round-trip check of what was actually saved.
import os
os.makedirs('/home/submission', exist_ok=True)
ensemble_df.to_csv('/home/submission/submission.csv', index=False)
print("Saved ensemble submission to /home/submission/submission.csv")

print("\nVerifying ensemble submission...")
verify_trees = parse_submission('/home/submission/submission.csv')
# Only N values present in the written file contribute to the rescoring.
verify_sides = {
    n: calculate_bounding_box_side(verify_trees[n])
    for n in range(1, 201)
    if n in verify_trees
}

verify_score = calculate_score(verify_sides)
print(f"Verified ensemble score: {verify_score:.6f}")

In [None]:
# Save metrics
import json
import os

# Summary of this experiment, written as JSON next to the experiment code.
metrics = {
    'cv_score': ensemble_score,
    'baseline_score': baseline_score,
    'improvement': baseline_score - ensemble_score,
    'source_wins': source_wins,
    'n_improved': len(improvements)
}

metrics_path = '/home/code/experiments/002_ensemble/metrics.json'
# Create the experiment directory first (as is done for the submission
# directory); otherwise open() raises FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=2)

print(f"Saved metrics to experiments/002_ensemble/metrics.json")
print(f"\nFinal Ensemble Score: {ensemble_score:.6f}")