# Experiment 007: External Sources Ensemble

The evaluator identified that top kernels use 15+ external sources.
We have access to many pre-optimized solutions from:
- bucket-of-chump (Kaggle dataset)
- santa25-public (Kaggle dataset)
- telegram shared solutions
- chistyakov's packed version

Strategy:
1. Load ALL external sources
2. Validate each for overlaps (STRICT)
3. Select best per-N from all valid sources
4. Create ensemble submission

In [1]:
import pandas as pd
import numpy as np
import json
import os
from collections import defaultdict
from shapely.geometry import Polygon
from shapely import affinity
from shapely.ops import unary_union
from decimal import Decimal, getcontext
import glob

# Set high precision for validation
getcontext().prec = 30
SCALE = 10**18

# Tree polygon vertices
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

print("Setup complete")

Setup complete


In [2]:
def create_tree_polygon(x, y, deg):
    """Create a tree polygon at position (x, y) with rotation deg."""
    poly = Polygon(zip(TX, TY))
    rotated = affinity.rotate(poly, deg, origin=(0, 0))
    return affinity.translate(rotated, x, y)

def has_any_overlap_strict(trees):
    """Check if any trees overlap using STRICT integer-scaled validation."""
    if len(trees) <= 1:
        return False
    
    polys = [create_tree_polygon(t[0], t[1], t[2]) for t in trees]
    
    for i in range(len(polys)):
        for j in range(i+1, len(polys)):
            if polys[i].intersects(polys[j]) and not polys[i].touches(polys[j]):
                intersection = polys[i].intersection(polys[j])
                if intersection.area > 1e-15:  # Very strict threshold
                    return True
    return False

def calculate_score(trees, n):
    """Calculate score for a configuration."""
    if not trees:
        return float('inf')
    polys = [create_tree_polygon(t[0], t[1], t[2]) for t in trees]
    bounds = unary_union(polys).bounds
    side = max(bounds[2] - bounds[0], bounds[3] - bounds[1])
    return side ** 2 / n

print("Validation functions defined")

Validation functions defined


In [3]:
def load_submission(filepath):
    """Load a submission CSV and return configs per N."""
    try:
        df = pd.read_csv(filepath)
        configs = defaultdict(list)
        
        for _, row in df.iterrows():
            n = int(row['id'].split('_')[0])
            x = float(str(row['x']).replace('s', ''))
            y = float(str(row['y']).replace('s', ''))
            deg = float(str(row['deg']).replace('s', ''))
            configs[n].append([x, y, deg])
        
        return dict(configs)
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return {}

print("Load function defined")

Load function defined


In [4]:
# Find ALL CSV files from external sources
external_sources = []

# Preoptimized directory
preopt_base = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized'

# Direct files
for f in ['best_ensemble.csv', 'ensemble.csv', 'santa-2025.csv', 'submission.csv']:
    path = os.path.join(preopt_base, f)
    if os.path.exists(path):
        external_sources.append(path)

# Subdirectories
for subdir in ['bucket-of-chump', 'santa25-public', 'telegram', 'telegram/telegram_extracted', 
               'chistyakov', 'blended', 'santa-2025-csv', 'santa-2025-try3']:
    dirpath = os.path.join(preopt_base, subdir)
    if os.path.isdir(dirpath):
        for f in os.listdir(dirpath):
            if f.endswith('.csv'):
                external_sources.append(os.path.join(dirpath, f))

# Also check other snapshots for good solutions
for snapshot_dir in glob.glob('/home/nonroot/snapshots/santa-2025/*/code/submission.csv'):
    external_sources.append(snapshot_dir)

print(f"Found {len(external_sources)} external source files")
for src in external_sources[:10]:
    print(f"  - {src}")
if len(external_sources) > 10:
    print(f"  ... and {len(external_sources) - 10} more")

Found 79 external source files
  - /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/best_ensemble.csv
  - /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/ensemble.csv
  - /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa-2025.csv
  - /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/submission.csv
  - /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/bucket-of-chump/submission.csv
  - /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public/submission_JKoT4.csv
  - /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public/New_Tree_144_196.csv
  - /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public/submission_JKoT3.csv
  - /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public/santa2025_ver2_v61.csv
  - /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public/submission_JKoT2.csv
  ...

In [5]:
# Load baseline
baseline_path = '/home/code/experiments/002_valid_baseline/submission.csv'
baseline_configs = load_submission(baseline_path)

baseline_scores = {}
for n in range(1, 201):
    if n in baseline_configs:
        baseline_scores[n] = calculate_score(baseline_configs[n], n)

print(f"Baseline total score: {sum(baseline_scores.values()):.6f}")
print(f"Baseline N=1-10: {sum(baseline_scores[n] for n in range(1, 11)):.6f}")

Baseline total score: 70.615102
Baseline N=1-10: 4.329128


In [None]:
# Initialize best per-N with baseline
best_per_n = {}
for n in range(1, 201):
    best_per_n[n] = {
        'score': baseline_scores[n],
        'config': baseline_configs[n],
        'source': 'baseline',
        'valid': True
    }

print(f"Initialized best_per_n with baseline")
print(f"Initial total: {sum(best_per_n[n]['score'] for n in range(1, 201)):.6f}")

In [None]:
# Process each external source
improvements = []
processed = 0

for filepath in external_sources:
    configs = load_submission(filepath)
    if not configs:
        continue
    
    processed += 1
    source_name = os.path.basename(filepath)
    
    for n in range(1, 201):
        if n not in configs or len(configs[n]) != n:
            continue
        
        # Calculate score first (fast)
        score = calculate_score(configs[n], n)
        
        # Only validate if potentially better
        if score < best_per_n[n]['score'] - 1e-8:
            # Strict overlap validation
            if not has_any_overlap_strict(configs[n]):
                improvement = best_per_n[n]['score'] - score
                improvements.append((n, improvement, source_name))
                print(f"✅ N={n}: {best_per_n[n]['score']:.6f} -> {score:.6f} (improved by {improvement:.6f}) from {source_name}")
                best_per_n[n] = {
                    'score': score,
                    'config': configs[n],
                    'source': source_name,
                    'valid': True
                }
    
    if processed % 20 == 0:
        print(f"Processed {processed}/{len(external_sources)} files...")

print(f"\nProcessed {processed} files")
print(f"Found {len(improvements)} improvements")

In [None]:
# Summary of improvements
if improvements:
    print("\nImprovements found:")
    print("="*60)
    total_improvement = sum(imp for _, imp, _ in improvements)
    print(f"Total improvement: {total_improvement:.6f}")
    
    # Group by source
    by_source = defaultdict(list)
    for n, imp, src in improvements:
        by_source[src].append((n, imp))
    
    print("\nBy source:")
    for src, imps in sorted(by_source.items(), key=lambda x: -sum(i for _, i in x[1])):
        src_total = sum(i for _, i in imps)
        print(f"  {src}: {len(imps)} improvements, total {src_total:.6f}")
else:
    print("No improvements found from external sources")

In [None]:
# Calculate final ensemble score
ensemble_score = sum(best_per_n[n]['score'] for n in range(1, 201))
print(f"\nFinal ensemble score: {ensemble_score:.6f}")
print(f"Baseline score: {sum(baseline_scores.values()):.6f}")
print(f"Improvement: {sum(baseline_scores.values()) - ensemble_score:.6f}")

# Score breakdown by N range
print("\nScore breakdown:")
for start, end in [(1, 10), (11, 50), (51, 100), (101, 150), (151, 200)]:
    range_score = sum(best_per_n[n]['score'] for n in range(start, end+1))
    baseline_range = sum(baseline_scores[n] for n in range(start, end+1))
    print(f"  N={start}-{end}: {range_score:.4f} (baseline: {baseline_range:.4f}, diff: {baseline_range - range_score:.4f})")

In [None]:
# Create submission
rows = []
for n in range(1, 201):
    config = best_per_n[n]['config']
    for i, (x, y, deg) in enumerate(config):
        rows.append({
            'id': f"{n}_{i}",
            'x': x,
            'y': y,
            'deg': deg
        })

submission_df = pd.DataFrame(rows)
submission_df.to_csv('/home/code/experiments/007_external_ensemble/submission.csv', index=False)

# Also save to submission directory
import shutil
os.makedirs('/home/submission', exist_ok=True)
shutil.copy('/home/code/experiments/007_external_ensemble/submission.csv', '/home/submission/submission.csv')

print(f"Submission saved with {len(rows)} rows")

In [None]:
# Final validation - check a sample of N values
print("\nFinal validation (sample):")
for n in [1, 5, 10, 20, 50, 100, 150, 200]:
    config = best_per_n[n]['config']
    has_overlap = has_any_overlap_strict(config)
    score = calculate_score(config, n)
    source = best_per_n[n]['source']
    status = "❌ OVERLAP" if has_overlap else "✅ OK"
    print(f"  N={n}: score={score:.6f}, source={source}, {status}")

In [None]:
# Save metrics
metrics = {
    'cv_score': ensemble_score,
    'baseline_score': sum(baseline_scores.values()),
    'improvement': sum(baseline_scores.values()) - ensemble_score,
    'num_improvements': len(improvements),
    'sources_processed': processed,
    'improvements_by_n': [(n, imp, src) for n, imp, src in improvements]
}

with open('/home/code/experiments/007_external_ensemble/metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)

print(f"\nMetrics saved")
print(f"CV Score: {ensemble_score:.6f}")