# Fresh Ensemble from Latest Datasets

Load the latest downloaded datasets and build an ensemble that keeps, for each N, the best solution found across all of them.

In [None]:
import pandas as pd
import numpy as np
from shapely import affinity
from shapely.geometry import Polygon
from shapely.ops import unary_union
import os
import glob

# Tree geometry: the outline is stored once as (x, y) vertex pairs, in the
# order the polygon is traced, and then split into the two coordinate arrays
# that get_tree_polygon() consumes.
_TREE_OUTLINE = [
    (0, 0.8),
    (0.125, 0.5),
    (0.0625, 0.5),
    (0.2, 0.25),
    (0.1, 0.25),
    (0.35, 0),
    (0.075, 0),
    (0.075, -0.2),
    (-0.075, -0.2),
    (-0.075, 0),
    (-0.35, 0),
    (-0.1, 0.25),
    (-0.2, 0.25),
    (-0.0625, 0.5),
    (-0.125, 0.5),
]
TX, TY = (np.array(axis) for axis in zip(*_TREE_OUTLINE))

def get_tree_polygon(cx, cy, angle_deg):
    """Build the tree outline rotated by ``angle_deg`` and moved to ``(cx, cy)``.

    The template vertices ``(TX, TY)`` are rotated counter-clockwise about the
    origin by ``angle_deg`` degrees and then translated by ``(cx, cy)``.

    Returns:
        A ``shapely.geometry.Polygon`` with one vertex per template vertex.
    """
    theta = np.radians(angle_deg)
    c = np.cos(theta)
    s = np.sin(theta)
    # Standard 2-D rotation followed by the translation to the tree's position.
    xs = TX * c - TY * s + cx
    ys = TX * s + TY * c + cy
    return Polygon(list(zip(xs, ys)))

def get_bounding_box_side(trees):
    """Return the side of the smallest axis-aligned square enclosing ``trees``.

    The envelope of a set of geometries is the min/max of their individual
    ``bounds``, so no geometric union is needed to compute it.

    Args:
        trees: An iterable of geometries exposing a
            ``bounds == (minx, miny, maxx, maxy)`` attribute, or a single
            such geometry.

    Returns:
        The larger of the overall width and height, or ``0.0`` when ``trees``
        is empty.
    """
    # A lone geometry carries ``bounds`` itself; treat it as a one-item layout.
    if hasattr(trees, "bounds"):
        trees = [trees]

    boxes = [tree.bounds for tree in trees]
    if not boxes:
        return 0.0

    min_xs, min_ys, max_xs, max_ys = zip(*boxes)
    width = max(max_xs) - min(min_xs)
    height = max(max_ys) - min(min_ys)
    return max(width, height)

def parse_csv(path):
    """Parse a submission CSV and return dict of (n -> (side, df_rows)).

    Args:
        path: Location of a CSV with ``id``, ``x``, ``y`` and ``deg`` columns.
            The numeric columns may be plain numbers or strings carrying an
            ``s`` prefix; the first three characters of ``id`` are read as N.

    Returns:
        A dict mapping each N in 1..200 that has exactly N rows to a tuple
        ``(side, rows)``: the bounding-square side of that layout and the
        DataFrame rows it was built from. Returns ``{}`` when the file cannot
        be read or parsed (the error is printed, not raised).
    """
    try:
        df = pd.read_csv(path)
        # Handle 's' prefix. Test for "not numeric" rather than
        # ``dtype == object``: newer pandas may infer a dedicated string dtype
        # for text columns, which would otherwise skip the conversion.
        for col in ['x', 'y', 'deg']:
            if not pd.api.types.is_numeric_dtype(df[col]):
                df[col] = df[col].astype(str).str.lstrip('s').astype(float)
        df['n'] = df['id'].str[:3].astype(int)

        results = {}
        # One pass over the groups (sorted by N) instead of 200 boolean masks.
        for n, group in df.groupby('n'):
            n = int(n)  # plain int keys, matching lookups by range(1, 201)
            if not 1 <= n <= 200 or len(group) != n:
                continue
            trees = [
                get_tree_polygon(x, y, deg)
                for x, y, deg in zip(group['x'], group['y'], group['deg'])
            ]
            results[n] = (get_bounding_box_side(trees), group)
        return results
    except Exception as e:
        # Best-effort by design: a bad file is reported and skipped so the
        # remaining sources can still be combined.
        print(f"Error parsing {path}: {e}")
        return {}

# Progress marker: confirms that the helper definitions above were executed.
print("Functions defined.")

In [None]:
# Find all CSV files.
# List order matters: when two files tie on a given N, the selection step
# keeps the one that appears first here.
csv_files = [
    '/home/code/experiments/006_fresh_ensemble/datasets/santa-2025.csv',
    '/home/code/experiments/006_fresh_ensemble/datasets/submission.csv',
    '/home/code/experiments/006_fresh_ensemble/datasets/submission_best.csv',
    '/home/submission/submission.csv',  # Our current best
]

# Also check exploration datasets. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to keep runs reproducible.
for f in sorted(glob.glob('/home/code/exploration/datasets/*.csv')):
    if f not in csv_files:
        csv_files.append(f)

print(f"Found {len(csv_files)} CSV files:")
for f in csv_files:
    print(f"  {f}")

In [None]:
# Load every candidate file; keep only those that produced at least one
# valid N, keyed by the file's path.
all_solutions = {}
for csv_path in csv_files:
    print(f"\nParsing {os.path.basename(csv_path)}...")
    parsed = parse_csv(csv_path)
    if not parsed:
        # Unreadable file or no complete group: nothing to record.
        continue

    # Score of this file alone: sum of side^2 / n over the N values it covers.
    total_score = sum(side**2/n for n, (side, _) in parsed.items())
    print(f"  Total score: {total_score:.6f}, N values: {len(parsed)}")
    all_solutions[csv_path] = parsed

print(f"\nParsed {len(all_solutions)} valid CSV files.")

In [None]:
# Find best solution for each N across all sources
best_per_n = {}         # n -> (side, rows) of the winning layout
best_source_per_n = {}  # n -> path of the file that layout came from

for n in range(1, 201):
    best_side = float('inf')
    best_source = None
    best_rows = None

    for path, solutions in all_solutions.items():
        if n in solutions:
            side, rows = solutions[n]
            # Strict '<' keeps the earliest source on ties.
            if side < best_side:
                best_side = side
                best_source = path
                best_rows = rows

    if best_rows is not None:
        best_per_n[n] = (best_side, best_rows)
        best_source_per_n[n] = best_source

# A partial ensemble yields a misleadingly low total, so make gaps visible.
missing_ns = [n for n in range(1, 201) if n not in best_per_n]
if missing_ns:
    print(f"WARNING: no valid solution for {len(missing_ns)} N values: {missing_ns}")

print("Best source for each N:")
print("="*60)

# Count sources. Key by full path rather than basename: several inputs share
# the same file name (e.g. 'submission.csv' in different directories) and
# would otherwise be merged into a single entry.
source_counts = {}
for n, source in best_source_per_n.items():
    source_counts[source] = source_counts.get(source, 0) + 1

for source, count in sorted(source_counts.items(), key=lambda x: -x[1]):
    print(f"  {source}: {count} N values")

# Calculate ensemble score
ensemble_score = sum(side**2/n for n, (side, _) in best_per_n.items())
print(f"\nEnsemble total score: {ensemble_score:.6f}")

In [None]:
# Compare with our current best
current_best_score = 70.659437
print(f"Current best score: {current_best_score:.6f}")
print(f"Ensemble score: {ensemble_score:.6f}")
print(f"Improvement: {current_best_score - ensemble_score:.6f}")

# The score is only comparable when every N in 1..200 is covered; a partial
# sum looks like an improvement and would overwrite the current best
# submission with an incomplete file.
missing_ns = [n for n in range(1, 201) if n not in best_per_n]

if missing_ns:
    print(f"\nEnsemble is incomplete ({len(missing_ns)} N values missing); not saving.")
elif ensemble_score < current_best_score - 1e-6:
    print("\n*** IMPROVEMENT FOUND! ***")

    # Create ensemble submission: stack the winning rows in N order, then
    # write each numeric column back in the 's'-prefixed text format.
    ensemble_df = pd.concat(
        [best_per_n[n][1][['id', 'x', 'y', 'deg']] for n in range(1, 201)],
        ignore_index=True,
    )
    for col in ['x', 'y', 'deg']:
        ensemble_df[col] = ensemble_df[col].map(lambda v: f"s{v:.18f}")

    ensemble_df.to_csv('/home/submission/submission.csv', index=False)
    print(f"Saved ensemble to /home/submission/submission.csv")
else:
    print("\nNo improvement from ensemble.")

In [None]:
# Persist a summary of this run as JSON next to the experiment.
import json

metrics_path = '/home/code/experiments/006_fresh_ensemble/metrics.json'

metrics = dict(
    cv_score=ensemble_score,
    baseline_score=current_best_score,
    # Positive when the ensemble beats the recorded baseline.
    improvement=current_best_score - ensemble_score,
    num_sources=len(all_solutions),
    source_counts=source_counts,
    notes='Fresh ensemble from latest Kaggle datasets',
)

with open(metrics_path, 'w') as fh:
    fh.write(json.dumps(metrics, indent=2))

print(f"Final score: {ensemble_score:.6f}")