# Precision-Preserving Ensemble

exp_002 failed because coordinates lost precision when they were parsed to floats and written back out.
This experiment keeps each row's original strings when combining solutions; floats are parsed only to compare scores.

In [None]:
import os
import json
from shapely.geometry import Polygon
from shapely import affinity
from shapely.ops import unary_union

# Tree polygon vertices for score calculation
# TX and TY are parallel lists of 15 coordinates each; they are paired by
# index (zip(TX, TY)) to form the outline of one unrotated tree.
# The outline starts at the tip (0, 0.8) and its lowest points are at y = -0.2.
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

def create_tree_polygon(x, y, angle):
    """Build the tree outline, rotate it, then move it into place.

    Args:
        x: Horizontal offset applied after rotation.
        y: Vertical offset applied after rotation.
        angle: Rotation in degrees about the outline's local origin (0, 0).

    Returns:
        The rotated and translated shapely Polygon.
    """
    outline = Polygon(zip(TX, TY))
    # Rotation must happen before translation so it pivots on the tree's
    # own origin rather than on the world origin.
    rotated = affinity.rotate(outline, angle, origin=(0, 0))
    return affinity.translate(rotated, x, y)

def calculate_side(trees):
    """Return the longer side of the axis-aligned box enclosing all trees.

    Args:
        trees: Iterable of (x, y, angle) tuples, one per tree.

    Returns:
        The larger of the width and height of the bounds of the merged
        tree polygons.
    """
    merged = unary_union([create_tree_polygon(*tree) for tree in trees])
    box = merged.bounds  # (min_x, min_y, max_x, max_y)
    width = box[2] - box[0]
    height = box[3] - box[1]
    return max(width, height)

def calculate_score_for_n(trees, n):
    """Return the score term for one group: squared bounding side over n.

    Args:
        trees: Iterable of (x, y, angle) tuples for the group.
        n: Divisor applied to the squared side length.
    """
    return calculate_side(trees) ** 2 / n

# Confirmation message shown once the geometry/score helpers above have been defined.
print("Functions defined!")

In [None]:
def load_snapshot_raw(path):
    """Load a submission CSV, keeping every field as its original string.

    The first line is treated as a header and discarded. Each remaining
    line is stripped and split on commas; lines that do not yield exactly
    four fields are ignored. Rows are grouped by the integer prefix of the
    id field (the part before the first underscore).

    Args:
        path: Path of the CSV file to read.

    Returns:
        Dict mapping the integer id prefix to a list of rows, where each
        row is the list of four field strings exactly as they appear in
        the file. An empty file yields an empty dict.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a four-field row has a non-integer id prefix.
    """
    rows_by_n = {}
    with open(path, 'r') as f:
        # Skip the header. The default stops an empty file from raising
        # StopIteration out of this function.
        next(f, None)
        for line in f:
            parts = line.strip().split(',')
            if len(parts) != 4:
                continue
            n = int(parts[0].split('_')[0])
            # Store the untouched strings; nothing is converted to float here.
            rows_by_n.setdefault(n, []).append(parts)
    return rows_by_n

def parse_row_to_tuple(row):
    """Convert a raw string row into numeric (x, y, angle) for scoring.

    Fields 1, 2 and 3 of the row are read; a single leading 's' on a
    field is dropped before it is converted with float().

    Args:
        row: Sequence of field strings with x, y and angle at indexes 1-3.

    Returns:
        Tuple (x, y, angle) of floats.
    """
    def _to_float(text):
        # Drop the one-character 's' marker when present.
        if text.startswith('s'):
            text = text[1:]
        return float(text)

    fields = (row[1], row[2], row[3])
    x, y, angle = (_to_float(field) for field in fields)
    return (x, y, angle)

def calculate_score_from_rows(rows, n):
    """Score a group given its raw string rows.

    The rows are converted to floats only for this calculation; the
    input rows themselves are not modified.

    Args:
        rows: List of raw string rows for the group.
        n: Divisor passed on to calculate_score_for_n.
    """
    parsed = list(map(parse_row_to_tuple, rows))
    return calculate_score_for_n(parsed, n)

# Confirmation message shown once the raw-string loading helpers above have been defined.
print("Raw loading functions defined!")

In [None]:
# Baseline = the exp_001 submission (which passed Kaggle), loaded as raw strings
baseline_path = '/home/nonroot/snapshots/santa-2025/21145966992/submission/submission.csv'
baseline_raw = load_snapshot_raw(baseline_path)

# Per-group baseline scores for groups 1..200, plus their sum
baseline_scores = {
    count: calculate_score_from_rows(baseline_raw[count], count)
    for count in range(1, 201)
}
baseline_total = sum(baseline_scores.values())
print(f"Baseline total score: {baseline_total:.6f}")

# Show the untouched strings for group 2 as a visual precision check
print(f"\nBaseline N=2 raw rows:")
for row in baseline_raw[2]:
    print(f"  {row}")

In [None]:
# Seed the per-group best table with the baseline's score and raw rows
best_per_n = {
    count: dict(
        score=baseline_scores[count],
        rows=baseline_raw[count],
        source='baseline',
    )
    for count in range(1, 201)
}

print("Initialized with baseline")

In [None]:
# Load all snapshots and find improvements.
# For every snapshot directory that contains submission/submission.csv, score
# each group 1..200 and keep the snapshot's raw rows wherever they beat the
# current best score for that group.
snapshot_base = '/home/nonroot/snapshots/santa-2025/'
snapshot_dirs = sorted(os.listdir(snapshot_base))
print(f"Found {len(snapshot_dirs)} snapshot directories")

improvements_found = 0
snapshots_processed = 0
snapshots_failed = 0   # snapshots abandoned because loading/scoring raised
groups_rejected = 0    # groups ignored because their row count was not n

for snap_dir in snapshot_dirs:
    sub_path = os.path.join(snapshot_base, snap_dir, 'submission', 'submission.csv')
    if not os.path.exists(sub_path):
        continue

    try:
        snap_raw = load_snapshot_raw(sub_path)

        # Check each N
        for n in range(1, 201):
            if n not in snap_raw:
                continue

            rows = snap_raw[n]

            # A group with missing (or extra) rows would be scored on the
            # wrong set of trees; fewer trees give a smaller bounding box
            # and would look like a spurious improvement.
            if len(rows) != n:
                groups_rejected += 1
                continue

            score = calculate_score_from_rows(rows, n)

            # Only accept if better score (margin avoids churn on float noise)
            if score < best_per_n[n]['score'] - 1e-10:
                best_per_n[n]['score'] = score
                best_per_n[n]['rows'] = rows  # Keep original string rows!
                best_per_n[n]['source'] = snap_dir
                improvements_found += 1

        snapshots_processed += 1
        if snapshots_processed % 20 == 0:
            print(f"Processed {snapshots_processed} snapshots, {improvements_found} improvements...")

    except Exception as e:
        # Best-effort scan: one unreadable snapshot must not stop the run,
        # but report it so failures are visible instead of silently dropped.
        # Improvements already accepted from this snapshot are kept.
        snapshots_failed += 1
        print(f"Skipping snapshot {snap_dir}: {type(e).__name__}: {e}")

# NOTE(review): candidates are compared on bounding-box score only; nothing
# here checks whether trees within a group overlap - confirm that is acceptable.
print(f"\nTotal: {snapshots_processed} snapshots processed")
print(f"Improvements found: {improvements_found}")
if snapshots_failed or groups_rejected:
    print(f"Snapshots skipped due to errors: {snapshots_failed}")
    print(f"Groups rejected for wrong row count: {groups_rejected}")

In [None]:
# Total of the best score found for every group 1..200
per_group_best = [best_per_n[n]['score'] for n in range(1, 201)]
new_total = sum(per_group_best)
print(f"New total score: {new_total:.6f}")
print(f"Baseline total: {baseline_total:.6f}")
print(f"Improvement: {baseline_total - new_total:.6f}")

# Distinct origins (baseline or snapshot directory) contributing a group
sources = {best_per_n[n]['source'] for n in range(1, 201)}
print(f"\nUnique sources used: {len(sources)}")

In [None]:
# List the 20 groups with the largest score gain over the baseline
print("\nTop 20 improvements:")
improved_n = []
for n in range(1, 201):
    best = best_per_n[n]
    if best['source'] == 'baseline':
        continue  # group still uses the baseline rows
    gain = baseline_scores[n] - best['score']
    improved_n.append((n, gain, best['source']))

# Largest gain first; the negated key keeps ties in ascending-n order
ranked = sorted(improved_n, key=lambda entry: -entry[1])
for n, improvement, source in ranked[:20]:
    print(f"  N={n}: improved by {improvement:.6f} from {source}")

print(f"\nTotal N values improved: {len(improved_n)}")

In [None]:
# Write ensemble preserving original string precision
def write_ensemble(best_per_n, output_path):
    """Write the chosen rows for groups 1..200 to a CSV file.

    Each row's fields are joined with commas exactly as stored, so no
    numeric value is re-formatted.

    Args:
        best_per_n: Dict keyed by group number 1..200; each value holds a
            'rows' list of field-string lists.
        output_path: Destination file path; the file is overwritten.
    """
    with open(output_path, 'w') as out:
        out.write('id,x,y,deg\n')
        for n in range(1, 201):
            out.writelines(
                ','.join(fields) + '\n' for fields in best_per_n[n]['rows']
            )

write_ensemble(best_per_n, '/home/submission/submission.csv')
print("Saved ensemble to /home/submission/submission.csv")

# Read the file back to confirm the strings were written unchanged
print("\nVerifying precision preservation...")
with open('/home/submission/submission.csv', 'r') as f:
    lines = list(f)
print(f"Total lines: {len(lines)}")

# Group 2 is the one that failed in exp_002, so print its rows for inspection
print("\nN=2 rows in output:")
for line in lines:
    if not line.startswith('002_'):
        continue
    print(f"  {line.strip()}")

In [None]:
# Re-load the written file and re-score it to confirm it matches new_total
verify_raw = load_snapshot_raw('/home/submission/submission.csv')
verify_scores = [calculate_score_from_rows(verify_raw[n], n) for n in range(1, 201)]
verify_total = sum(verify_scores)
print(f"Verified score: {verify_total:.6f}")
print(f"Expected score: {new_total:.6f}")
scores_match = abs(verify_total - new_total) < 1e-10
print(f"Match: {scores_match}")

In [None]:
# Persist the summary numbers for this experiment as JSON
metrics = dict(
    cv_score=new_total,
    baseline_score=baseline_total,
    improvement=baseline_total - new_total,
    improvements_found=improvements_found,
    unique_sources=len(sources),
    n_values_improved=len(improved_n),
    notes='Precision-preserving ensemble. Uses original string rows instead of re-serialized floats.',
)

with open('/home/code/experiments/004_precision_ensemble/metrics.json', 'w') as metrics_file:
    json.dump(metrics, metrics_file, indent=2)

print("Metrics saved.")
print(f"CV Score: {new_total:.6f}")