# Experiment 011: Mega Ensemble from ALL Sources

Ensemble from ALL available CSV files:
- 204 files from kaggle_datasets (including 78 from bbox_sub)
- 3767 files from snapshots
- Current best (exp_010)

For each N, take the BEST solution across ALL sources that passes strict 1e18 validation.

In [1]:
import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
from shapely.geometry import Polygon
from shapely import affinity
from shapely.ops import unary_union
import glob
import json
import time
import warnings
warnings.filterwarnings('ignore')

getcontext().prec = 30
SCALE_FACTOR = Decimal('1e18')

print("Setup complete")

Setup complete


In [2]:
# Tree shape vertices
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

def create_tree_polygon(x, y, angle):
    """Create tree polygon for scoring."""
    x, y, angle = float(x), float(y), float(angle)
    coords = list(zip(TX, TY))
    poly = Polygon(coords)
    poly = affinity.rotate(poly, angle, origin=(0, 0))
    poly = affinity.translate(poly, x, y)
    return poly

def create_high_precision_tree(x, y, angle):
    """Create tree polygon with 1e18 scaling for validation."""
    x = Decimal(str(x))
    y = Decimal(str(y))
    angle = Decimal(str(angle))
    sf = SCALE_FACTOR
    vertices = [
        (float(Decimal('0.0') * sf), float(Decimal('0.8') * sf)),
        (float(Decimal('0.125') * sf), float(Decimal('0.5') * sf)),
        (float(Decimal('0.0625') * sf), float(Decimal('0.5') * sf)),
        (float(Decimal('0.2') * sf), float(Decimal('0.25') * sf)),
        (float(Decimal('0.1') * sf), float(Decimal('0.25') * sf)),
        (float(Decimal('0.35') * sf), float(Decimal('0.0') * sf)),
        (float(Decimal('0.075') * sf), float(Decimal('0.0') * sf)),
        (float(Decimal('0.075') * sf), float(Decimal('-0.2') * sf)),
        (float(Decimal('-0.075') * sf), float(Decimal('-0.2') * sf)),
        (float(Decimal('-0.075') * sf), float(Decimal('0.0') * sf)),
        (float(Decimal('-0.35') * sf), float(Decimal('0.0') * sf)),
        (float(Decimal('-0.1') * sf), float(Decimal('0.25') * sf)),
        (float(Decimal('-0.2') * sf), float(Decimal('0.25') * sf)),
        (float(Decimal('-0.0625') * sf), float(Decimal('0.5') * sf)),
        (float(Decimal('-0.125') * sf), float(Decimal('0.5') * sf)),
    ]
    poly = Polygon(vertices)
    poly = affinity.rotate(poly, float(angle), origin=(0, 0))
    poly = affinity.translate(poly, xoff=float(x * sf), yoff=float(y * sf))
    return poly

def validate_no_overlap_strict(trees_data):
    """Check for overlaps using 1e18 scaling."""
    if len(trees_data) <= 1:
        return True
    polygons = [create_high_precision_tree(t['x'], t['y'], t['deg']) for t in trees_data]
    for i in range(len(polygons)):
        for j in range(i+1, len(polygons)):
            if polygons[i].intersects(polygons[j]) and not polygons[i].touches(polygons[j]):
                return False
    return True

def get_bbox_side(trees):
    """Get bounding box side length."""
    if len(trees) == 0:
        return 0
    polygons = [create_tree_polygon(t['x'], t['y'], t['deg']) for t in trees]
    union = unary_union(polygons)
    bounds = union.bounds
    return max(bounds[2] - bounds[0], bounds[3] - bounds[1])

def get_score(trees, n):
    """Get score contribution for N trees."""
    side = get_bbox_side(trees)
    return (side ** 2) / n

print("Core functions defined")

Core functions defined


In [3]:
# Collect ALL CSV files
print("Collecting all CSV files...")

csv_files = []

# kaggle_datasets (including bbox_sub)
kaggle_files = glob.glob('/home/code/kaggle_datasets/**/*.csv', recursive=True)
csv_files.extend(kaggle_files)
print(f"  kaggle_datasets: {len(kaggle_files)} files")

# snapshots - sample to avoid too many (3767 is a lot)
snapshot_files = glob.glob('/home/nonroot/snapshots/santa-2025/**/*.csv', recursive=True)
print(f"  snapshots: {len(snapshot_files)} files")

# Sample snapshots to keep it manageable (take every 10th)
sampled_snapshots = snapshot_files[::10]  # Every 10th file
csv_files.extend(sampled_snapshots)
print(f"  sampled snapshots: {len(sampled_snapshots)} files")

# Current best
csv_files.append('/home/code/experiments/010_full_ensemble/submission.csv')

print(f"\nTotal sources: {len(csv_files)} files")

Collecting all CSV files...
  kaggle_datasets: 204 files
  snapshots: 3767 files
  sampled snapshots: 377 files

Total sources: 582 files


In [4]:
def load_trees_from_csv(csv_path, n):
    """Load trees for a specific N from a CSV file."""
    try:
        df = pd.read_csv(csv_path)
        # Handle different column formats
        if 'id' in df.columns:
            df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
        elif 'n' in df.columns:
            df['N'] = df['n']
        else:
            return None
        
        g = df[df['N'] == n]
        if len(g) != n:
            return None
        
        trees = []
        for _, row in g.iterrows():
            x = str(row['x']).replace('s', '')
            y = str(row['y']).replace('s', '')
            deg = str(row['deg']).replace('s', '')
            trees.append({'x': x, 'y': y, 'deg': deg})
        return trees
    except Exception as e:
        return None

print("Load function defined")

Load function defined


In [5]:
# Load current best (exp_010) as baseline
print("Loading exp_010 as baseline...")
baseline_df = pd.read_csv('/home/code/experiments/010_full_ensemble/submission.csv')
baseline_df['N'] = baseline_df['id'].astype(str).str.split('_').str[0].astype(int)

best_trees = {}
best_scores = {}
best_sources = {}

for n, g in baseline_df.groupby('N'):
    trees = []
    for _, row in g.iterrows():
        x = str(row['x']).replace('s', '')
        y = str(row['y']).replace('s', '')
        deg = str(row['deg']).replace('s', '')
        trees.append({'x': x, 'y': y, 'deg': deg})
    best_trees[n] = trees
    best_scores[n] = get_score(trees, n)
    best_sources[n] = 'exp_010'

baseline_total = sum(best_scores.values())
print(f"Baseline score: {baseline_total:.6f}")

Loading exp_010 as baseline...


Baseline score: 70.364392


In [6]:
# Process all CSV files and find best per-N
print("\n" + "=" * 60)
print("PROCESSING ALL SOURCES")
print("=" * 60)

improvements = []
files_processed = 0
start_time = time.time()

for csv_file in csv_files:
    files_processed += 1
    if files_processed % 50 == 0:
        elapsed = time.time() - start_time
        print(f"  Processed {files_processed}/{len(csv_files)} files ({elapsed:.1f}s)")
    
    for n in range(1, 201):
        trees = load_trees_from_csv(csv_file, n)
        if trees is None:
            continue
        
        # Calculate score first (fast)
        score = get_score(trees, n)
        
        # Only validate if potentially better
        if score < best_scores[n] - 1e-9:
            # Validate with strict 1e18 precision
            if validate_no_overlap_strict(trees):
                improvement = best_scores[n] - score
                improvements.append((n, improvement, csv_file))
                best_trees[n] = trees
                best_scores[n] = score
                best_sources[n] = csv_file.split('/')[-1]

print(f"\nTotal time: {time.time() - start_time:.1f}s")
print(f"Files processed: {files_processed}")
print(f"N values improved: {len(improvements)}")


PROCESSING ALL SOURCES


  Processed 50/582 files (481.0s)


  Processed 100/582 files (987.7s)


  Processed 150/582 files (1496.0s)


  Processed 200/582 files (1998.1s)


  Processed 250/582 files (2482.0s)


  Processed 300/582 files (2963.1s)


  Processed 350/582 files (3445.3s)


  Processed 400/582 files (3924.2s)


  Processed 450/582 files (4411.0s)


  Processed 500/582 files (4897.5s)


  Processed 550/582 files (5383.0s)



Total time: 5694.3s
Files processed: 582
N values improved: 80


In [7]:
# Show improvements
if improvements:
    print("\nImprovements found:")
    total_improvement = 0
    for n, imp, source in sorted(improvements, key=lambda x: -x[1])[:20]:
        print(f"  N={n:3d}: +{imp:.6f} from {source.split('/')[-1]}")
        total_improvement += imp
    print(f"\nTotal improvement: {total_improvement:.6f}")
else:
    print("\nNo improvements found!")


Improvements found:
  N= 53: +0.007569 from santa-2025.csv
  N= 85: +0.002132 from santa-2025.csv
  N= 54: +0.001801 from santa-2025.csv
  N= 26: +0.001497 from santa-2025.csv
  N=125: +0.001315 from santa-2025.csv
  N=124: +0.001115 from santa-2025.csv
  N= 94: +0.000838 from santa-2025.csv
  N= 46: +0.000786 from santa-2025.csv
  N= 31: +0.000780 from santa-2025.csv
  N=114: +0.000656 from santa-2025.csv
  N= 51: +0.000617 from santa-2025.csv
  N=133: +0.000577 from santa-2025.csv
  N=116: +0.000484 from santa-2025.csv
  N= 82: +0.000467 from santa-2025.csv
  N=187: +0.000385 from santa-2025.csv
  N=134: +0.000332 from santa-2025.csv
  N=186: +0.000294 from santa-2025.csv
  N= 77: +0.000189 from santa-2025.csv
  N=150: +0.000188 from santa-2025.csv
  N= 49: +0.000171 from santa-2025.csv

Total improvement: 0.022195


In [8]:
# Calculate final score
final_total = sum(best_scores.values())
print(f"\n" + "=" * 60)
print("RESULTS")
print("=" * 60)
print(f"Baseline score (exp_010): {baseline_total:.6f}")
print(f"Final score: {final_total:.6f}")
print(f"Improvement: {baseline_total - final_total:.6f}")
print(f"Target: 68.879467")
print(f"Gap to target: {final_total - 68.879467:.6f}")


RESULTS
Baseline score (exp_010): 70.364392
Final score: 70.340960
Improvement: 0.023432
Target: 68.879467
Gap to target: 1.461493


In [9]:
# Final validation
print("\n" + "=" * 60)
print("FINAL VALIDATION")
print("=" * 60)

final_overlaps = []
for n in range(1, 201):
    if not validate_no_overlap_strict(best_trees[n]):
        final_overlaps.append(n)

if final_overlaps:
    print(f"WARNING: {len(final_overlaps)} N values have overlaps: {final_overlaps[:10]}...")
else:
    print("✅ All N values pass strict validation!")


FINAL VALIDATION




In [10]:
# Create submission
print("\n" + "=" * 60)
print("CREATE SUBMISSION")
print("=" * 60)

rows = []
for n in range(1, 201):
    trees = best_trees[n]
    for i, t in enumerate(trees):
        # Ensure proper formatting
        x_val = str(t['x']).replace('s', '')
        y_val = str(t['y']).replace('s', '')
        deg_val = str(t['deg']).replace('s', '')
        rows.append({
            'id': f"{n:03d}_{i}",
            'x': f"s{x_val}",
            'y': f"s{y_val}",
            'deg': f"s{deg_val}"
        })

submission_df = pd.DataFrame(rows)
print(f"Submission shape: {submission_df.shape}")

submission_df.to_csv('/home/code/experiments/011_mega_ensemble_all/submission.csv', index=False)
submission_df.to_csv('/home/submission/submission.csv', index=False)
print("Submission saved!")


CREATE SUBMISSION
Submission shape: (20100, 4)
Submission saved!


In [11]:
# Save metrics
metrics = {
    'cv_score': final_total,
    'baseline_score': baseline_total,
    'improvement': baseline_total - final_total,
    'n_improved': len(improvements),
    'total_sources': len(csv_files),
    'final_overlaps': len(final_overlaps),
    'target': 68.879467,
    'gap': final_total - 68.879467
}

with open('/home/code/experiments/011_mega_ensemble_all/metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)

print("\nMetrics saved!")
print(json.dumps(metrics, indent=2))


Metrics saved!
{
  "cv_score": 70.34095966509364,
  "baseline_score": 70.36439162315997,
  "improvement": 0.023431958066339575,
  "n_improved": 80,
  "total_sources": 582,
  "final_overlaps": 7,
  "target": 68.879467,
  "gap": 1.4614926650936297
}


In [12]:
# Fix overlaps by reverting to baseline
print("Fixing overlaps by reverting to baseline...")

# Reload baseline for the overlapping N values
for n in final_overlaps:
    g = baseline_df[baseline_df['N'] == n]
    trees = []
    for _, row in g.iterrows():
        x = str(row['x']).replace('s', '')
        y = str(row['y']).replace('s', '')
        deg = str(row['deg']).replace('s', '')
        trees.append({'x': x, 'y': y, 'deg': deg})
    best_trees[n] = trees
    best_scores[n] = get_score(trees, n)
    best_sources[n] = 'exp_010 (reverted)'
    print(f"  Reverted N={n}")

# Recalculate final score
final_total = sum(best_scores.values())
print(f"\nFinal score after fixing overlaps: {final_total:.6f}")

# Re-validate
final_overlaps_2 = []
for n in range(1, 201):
    if not validate_no_overlap_strict(best_trees[n]):
        final_overlaps_2.append(n)

if final_overlaps_2:
    print(f"WARNING: Still {len(final_overlaps_2)} N values have overlaps!")
else:
    print("✅ All N values pass strict validation!")

Fixing overlaps by reverting to baseline...
  Reverted N=63
  Reverted N=123
  Reverted N=137
  Reverted N=143
  Reverted N=162
  Reverted N=184
  Reverted N=185

Final score after fixing overlaps: 70.340960




In [13]:
# Create fixed submission
print("\n" + "=" * 60)
print("CREATE FIXED SUBMISSION")
print("=" * 60)

rows = []
for n in range(1, 201):
    trees = best_trees[n]
    for i, t in enumerate(trees):
        x_val = str(t['x']).replace('s', '')
        y_val = str(t['y']).replace('s', '')
        deg_val = str(t['deg']).replace('s', '')
        rows.append({
            'id': f"{n:03d}_{i}",
            'x': f"s{x_val}",
            'y': f"s{y_val}",
            'deg': f"s{deg_val}"
        })

submission_df = pd.DataFrame(rows)
print(f"Submission shape: {submission_df.shape}")

submission_df.to_csv('/home/code/experiments/011_mega_ensemble_all/submission.csv', index=False)
submission_df.to_csv('/home/submission/submission.csv', index=False)
print("Fixed submission saved!")

# Update metrics
metrics = {
    'cv_score': final_total,
    'baseline_score': baseline_total,
    'improvement': baseline_total - final_total,
    'n_improved': len([x for x in improvements if x[0] not in final_overlaps]),
    'total_sources': len(csv_files),
    'final_overlaps': len(final_overlaps_2),
    'target': 68.879467,
    'gap': final_total - 68.879467
}

with open('/home/code/experiments/011_mega_ensemble_all/metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)

print("\nFinal metrics:")
print(json.dumps(metrics, indent=2))


CREATE FIXED SUBMISSION
Submission shape: (20100, 4)
Fixed submission saved!

Final metrics:
{
  "cv_score": 70.34095966509364,
  "baseline_score": 70.36439162315997,
  "improvement": 0.023431958066339575,
  "n_improved": 80,
  "total_sources": 582,
  "final_overlaps": 7,
  "target": 68.879467,
  "gap": 1.4614926650936297
}


In [14]:
# Check which N values have overlaps in exp_010 baseline
print("Checking exp_010 baseline for overlaps...")
baseline_overlaps = []
for n in range(1, 201):
    g = baseline_df[baseline_df['N'] == n]
    trees = []
    for _, row in g.iterrows():
        x = str(row['x']).replace('s', '')
        y = str(row['y']).replace('s', '')
        deg = str(row['deg']).replace('s', '')
        trees.append({'x': x, 'y': y, 'deg': deg})
    if not validate_no_overlap_strict(trees):
        baseline_overlaps.append(n)

print(f"exp_010 has {len(baseline_overlaps)} N values with overlaps: {baseline_overlaps}")

Checking exp_010 baseline for overlaps...


exp_010 has 7 N values with overlaps: [63, 123, 137, 143, 162, 184, 185]


In [16]:
# Use exp_009 as fallback for overlapping N values (it has 0 overlaps)
print("Loading exp_009 as fallback for overlapping N values...")
exp009_df = pd.read_csv('/home/code/experiments/009_santa_ensemble/submission.csv')
exp009_df['N'] = exp009_df['id'].astype(str).str.split('_').str[0].astype(int)

for n in final_overlaps:
    g = exp009_df[exp009_df['N'] == n]
    trees = []
    for _, row in g.iterrows():
        x = str(row['x']).replace('s', '')
        y = str(row['y']).replace('s', '')
        deg = str(row['deg']).replace('s', '')
        trees.append({'x': x, 'y': y, 'deg': deg})
    
    # Validate exp_009 solution
    if validate_no_overlap_strict(trees):
        best_trees[n] = trees
        best_scores[n] = get_score(trees, n)
        best_sources[n] = 'exp_009 (fallback)'
        print(f"  N={n}: Using exp_009 (valid)")
    else:
        print(f"  N={n}: exp_009 also has overlaps!")

# Recalculate final score
final_total = sum(best_scores.values())
print(f"\nFinal score after exp_009 fallback: {final_total:.6f}")

# Re-validate
final_overlaps_3 = []
for n in range(1, 201):
    if not validate_no_overlap_strict(best_trees[n]):
        final_overlaps_3.append(n)

if final_overlaps_3:
    print(f"WARNING: Still {len(final_overlaps_3)} N values have overlaps: {final_overlaps_3}")
else:
    print("All N values pass strict validation!")

Loading exp_009 as fallback for overlapping N values...
  N=63: Using exp_009 (valid)
  N=123: Using exp_009 (valid)


  N=137: Using exp_009 (valid)
  N=143: Using exp_009 (valid)
  N=162: Using exp_009 (valid)


  N=184: Using exp_009 (valid)
  N=185: Using exp_009 (valid)

Final score after exp_009 fallback: 70.340960


All N values pass strict validation!
