# Experiment 003: Validated Ensemble

Fix the ensemble approach with proper validation:
1. Check rotation angles are plausible (nominal range 0-360; any value with |deg| > 1000 is treated as corrupted)
2. Verify no overlapping trees using Shapely
3. Use submission.csv as primary, only take from other sources if validated

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely.affinity import rotate, translate
from shapely.strtree import STRtree
import warnings
import os
warnings.filterwarnings('ignore')

# Tree shape coordinates
# TX[i] and TY[i] together give the i-th vertex of the tree outline (15
# vertices); create_tree_polygon zips the two lists into (x, y) pairs in
# this order. The outline is expressed around a local origin (0, 0), which
# is also the pivot used for rotation before the shape is moved into place.
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

def create_tree_polygon(x, y, deg):
    """Build the tree outline rotated by ``deg`` degrees and moved to (x, y).

    The base outline comes from the module-level TX/TY vertex lists. It is
    rotated about its local origin (0, 0) first and only then shifted, so
    (x, y) is where that local origin ends up.
    """
    outline = Polygon([(vx, vy) for vx, vy in zip(TX, TY)])
    turned = rotate(outline, deg, origin=(0, 0))
    return translate(turned, xoff=x, yoff=y)

def calculate_bounding_box_side(trees):
    """Return the side of the axis-aligned square that encloses every tree.

    ``trees`` is an iterable of (x, y, deg) tuples. Each tree's polygon
    bounds (minx, miny, maxx, maxy) are gathered, and the larger of the
    overall x-extent and y-extent is returned.

    Raises ValueError when ``trees`` is empty, because there are no bounds
    to take a maximum or minimum of.
    """
    boxes = [create_tree_polygon(tx, ty, angle).bounds for tx, ty, angle in trees]
    x_extremes = [edge for box in boxes for edge in (box[0], box[2])]
    y_extremes = [edge for box in boxes for edge in (box[1], box[3])]

    span_x = max(x_extremes) - min(x_extremes)
    span_y = max(y_extremes) - min(y_extremes)
    return max(span_x, span_y)

def has_overlap(trees):
    """Report whether any two trees in the configuration overlap.

    ``trees`` is a sequence of (x, y, deg) tuples. Fewer than two trees can
    never overlap, so that case returns False without building any geometry.
    Otherwise an STRtree narrows down candidate neighbours for each polygon;
    a pair counts as overlapping when the polygons intersect but do not
    merely touch.

    NOTE(review): the query results are used as integer positions into the
    polygon list, which matches Shapely 2.x - confirm the installed version.
    """
    if len(trees) < 2:
        return False

    shapes = [create_tree_polygon(tx, ty, angle) for tx, ty, angle in trees]
    spatial_index = STRtree(shapes)

    for position, shape in enumerate(shapes):
        for candidate in spatial_index.query(shape):
            # A polygon always shows up in its own query result; ignore it.
            if candidate == position:
                continue
            other = shapes[candidate]
            if shape.intersects(other) and not shape.touches(other):
                return True
    return False

def validate_rotation_angles(trees, max_abs_deg=1000):
    """Check that no rotation angle is wildly out of range (i.e. corrupted).

    Angles are not required to lie in [0, 360): any finite value whose
    magnitude does not exceed ``max_abs_deg`` is accepted, since a rotation
    is periodic. Only clearly corrupted values are rejected.

    Args:
        trees: iterable of (x, y, deg) tuples.
        max_abs_deg: largest accepted absolute angle in degrees. Defaults to
            1000, the threshold this notebook uses for "clearly corrupted".

    Returns:
        True when every angle passes (also for an empty iterable), False as
        soon as one angle exceeds the threshold or is NaN.
    """
    for _x, _y, deg in trees:
        # Written as "not <=" rather than ">" on purpose: every comparison
        # with NaN is False, so this form rejects NaN angles as well.
        if not (abs(deg) <= max_abs_deg):
            return False
    return True

def parse_submission(filepath):
    """Read a submission CSV into a dict mapping N to its list of trees.

    The file must provide the columns ``id``, ``x``, ``y`` and ``deg``.
    Values that are strings starting with 's' have that first character
    stripped before conversion to float; anything else is passed to
    ``float`` directly. N is the integer before the first underscore in
    ``id``.

    Returns:
        dict of N -> list of (x, y, deg) float tuples, with the trees of each
        N kept in file order.
    """
    def _to_float(raw):
        prefixed = isinstance(raw, str) and raw.startswith('s')
        return float(raw[1:]) if prefixed else float(raw)

    frame = pd.read_csv(filepath)
    for source_col, parsed_col in (('x', 'x_val'), ('y', 'y_val'), ('deg', 'deg_val')):
        frame[parsed_col] = frame[source_col].apply(_to_float)
    frame['N'] = frame['id'].apply(lambda tree_id: int(tree_id.split('_')[0]))

    return {
        count: list(zip(rows['x_val'], rows['y_val'], rows['deg_val']))
        for count, rows in frame.groupby('N')
    }

def calculate_score(side_lengths):
    """Sum side**2 / N over every entry of an N -> side length mapping.

    An empty mapping yields 0.
    """
    per_group_terms = (
        side ** 2 / tree_count for tree_count, side in side_lengths.items()
    )
    return sum(per_group_terms)

# Confirmation line so the cell output shows the helper definitions above ran.
print("Functions defined successfully")

Functions defined successfully


In [2]:
# Scan santa-2025.csv and list every N whose rotation angles look corrupted.
print("Checking for corrupted data in santa-2025.csv...")
santa_trees = parse_submission('/home/code/exploration/datasets/santa-2025.csv')

corrupted_n = []
for n in range(1, 201):
    if n not in santa_trees:
        continue
    trees = santa_trees[n]
    if validate_rotation_angles(trees):
        continue

    corrupted_n.append(n)
    # Print each offending angle so the extent of the corruption is visible.
    for i, (x, y, deg) in enumerate(trees):
        if abs(deg) > 1000:
            print(f"  N={n}, tree {i}: deg={deg}")

print(f"\nCorrupted N values in santa-2025.csv: {corrupted_n}")
print(f"Total corrupted: {len(corrupted_n)}")

Checking for corrupted data in santa-2025.csv...
  N=5, tree 4: deg=12810.004582885203
  N=9, tree 2: deg=4344.502311354251
  N=15, tree 12: deg=9648.500795790587
  N=18, tree 9: deg=-4709.593368712057
  N=20, tree 8: deg=-3126.7991993215137
  N=21, tree 9: deg=3524.3517159193584
  N=21, tree 10: deg=-23521.963111527
  N=25, tree 13: deg=7319.8412869784515
  N=29, tree 5: deg=1597.7143707719092
  N=29, tree 12: deg=4298.615863616461
  N=29, tree 15: deg=40385.35351327787
  N=29, tree 16: deg=-2226.37062350373
  N=29, tree 23: deg=3572.7253326376385
  N=29, tree 25: deg=10972.411300999707
  N=30, tree 20: deg=2723.6132137498485
  N=32, tree 22: deg=-8755.82563212256
  N=34, tree 1: deg=-2540.5178715655984
  N=34, tree 27: deg=-1192.7479214024142
  N=36, tree 19: deg=-8303.47370020926
  N=36, tree 34: deg=-11003.16556386343
  N=37, tree 12: deg=-5826.867487000423
  N=39, tree 17: deg=9608.373145260997
  N=41, tree 17: deg=6417.544083358243
  N=42, tree 4: deg=16623.662758960036
  N=43, t

In [3]:
# Load all datasets and validate each configuration
# Each loaded file becomes solutions[csv_file] = {N: [(x, y, deg), ...]}.
datasets_dir = '/home/code/exploration/datasets'

csv_files = [
    'submission.csv',               # jazivxt - primary source
    'santa-2025.csv',               # saspav - has corrupted data
    'submission_best.csv',
    'smartmanoj.csv',
]

solutions = {}
for csv_file in csv_files:
    filepath = os.path.join(datasets_dir, csv_file)
    if not os.path.exists(filepath):
        # Report missing inputs explicitly instead of skipping them quietly:
        # a later cell indexes solutions['submission.csv'] directly, and an
        # absent source would otherwise only surface there as a KeyError.
        print(f"WARNING: {csv_file} not found at {filepath}, skipping")
        continue
    try:
        trees_by_n = parse_submission(filepath)
        solutions[csv_file] = trees_by_n
        print(f"Loaded {csv_file}: {len(trees_by_n)} configurations")
    except Exception as e:
        # Best effort: one unreadable file must not stop the other sources.
        print(f"Error loading {csv_file}: {e}")

print(f"\nTotal solutions loaded: {len(solutions)}")

Loaded submission.csv: 200 configurations


Loaded santa-2025.csv: 200 configurations
Loaded submission_best.csv: 200 configurations


Loaded smartmanoj.csv: 200 configurations

Total solutions loaded: 4


In [4]:
# Validate every configuration of every source; measure only the valid ones.
print("Calculating side lengths and validating configurations...")

side_lengths_by_source = {}
valid_configs = {}  # source -> {n: True/False}

for source_name, trees_by_n in solutions.items():
    side_lengths = {}
    validity = {}
    valid_configs[source_name] = validity

    for n in range(1, 201):
        if n not in trees_by_n:
            continue
        trees = trees_by_n[n]

        # Angles are checked first; the overlap test runs only if they pass.
        is_valid = validate_rotation_angles(trees) and not has_overlap(trees)
        validity[n] = is_valid
        if is_valid:
            side_lengths[n] = calculate_bounding_box_side(trees)

    side_lengths_by_source[source_name] = side_lengths
    valid_count = sum(1 for flag in validity.values() if flag)
    invalid_count = len(validity) - valid_count

    # The printed score covers valid configurations only, so it is a partial
    # sum whenever invalid_count is non-zero.
    if side_lengths:
        total_score = calculate_score(side_lengths)
        print(f"{source_name}: {valid_count} valid, {invalid_count} invalid, score={total_score:.6f}")
    else:
        print(f"{source_name}: {valid_count} valid, {invalid_count} invalid, no valid configs")

Calculating side lengths and validating configurations...


submission.csv: 149 valid, 51 invalid, score=52.620250


santa-2025.csv: 70 valid, 130 invalid, score=25.485783


submission_best.csv: 200 valid, 0 invalid, score=70.926150


smartmanoj.csv: 200 valid, 0 invalid, score=70.743774


In [5]:
# Pick, for every N, the valid source with the smallest bounding-box side.
print("\nFinding best valid configuration for each N...")

best_source_by_n = {}
best_side_by_n = {}

for n in range(1, 201):
    best_source, best_side = None, float('inf')

    # Strict "<" means the earliest source in iteration order wins a tie.
    for source_name, side_lengths in side_lengths_by_source.items():
        usable = n in side_lengths and valid_configs[source_name].get(n, False)
        if usable and side_lengths[n] < best_side:
            best_source, best_side = source_name, side_lengths[n]

    if best_source is None:
        print(f"WARNING: No valid configuration for N={n}!")
        continue
    best_source_by_n[n] = best_source
    best_side_by_n[n] = best_side

# Tally how many N values each source contributes.
source_wins = {}
for source in best_source_by_n.values():
    source_wins[source] = source_wins.get(source, 0) + 1

print("\nSource wins (valid configurations only):")
for source, wins in sorted(source_wins.items(), key=lambda item: item[1], reverse=True):
    print(f"  {source}: {wins} wins")


Finding best valid configuration for each N...

Source wins (valid configurations only):
  submission.csv: 120 wins
  smartmanoj.csv: 51 wins
  santa-2025.csv: 29 wins


In [6]:
# Score the ensemble and compare it with the baseline and the target.
# Positive "Improvement" means the ensemble scores lower than the baseline.
ensemble_score = calculate_score(best_side_by_n)
baseline_score = 70.647327

improvement = baseline_score - ensemble_score
gap_to_target = ensemble_score - 68.919

print(f"\n=== VALIDATED ENSEMBLE RESULTS ===")
print(f"Baseline score: {baseline_score:.6f}")
print(f"Ensemble score: {ensemble_score:.6f}")
print(f"Improvement: {improvement:.6f}")
print(f"Target: 68.919")
print(f"Gap to target: {gap_to_target:.3f} points")


=== VALIDATED ENSEMBLE RESULTS ===
Baseline score: 70.647327
Ensemble score: 70.671581
Improvement: -0.024254
Target: 68.919
Gap to target: 1.753 points


In [7]:
# Assemble the submission rows from the winning source of each N.
print("\nCreating validated ensemble submission...")

ensemble_rows = []
for n in range(1, 201):
    if n in best_source_by_n:
        best_source = best_source_by_n[n]
        trees = solutions[best_source][n]
        # NOTE(review): values are written from the parsed floats, so the
        # text may not match the digits of the source CSV - confirm that is
        # acceptable for the scorer.
        ensemble_rows.extend(
            {'id': f"{n:03d}_{i}", 'x': f"s{x}", 'y': f"s{y}", 'deg': f"s{deg}"}
            for i, (x, y, deg) in enumerate(trees)
        )
    else:
        print(f"ERROR: Missing N={n}")

ensemble_df = pd.DataFrame(ensemble_rows)
print(f"Ensemble submission has {len(ensemble_df)} rows")


Creating validated ensemble submission...
Ensemble submission has 20100 rows


In [8]:
# Re-check every selected configuration; an N fails if it is missing or overlaps.
print("\nFinal validation: Checking all configurations for overlaps...")

failed_n = []
for n in range(1, 201):
    if n in best_source_by_n:
        best_source = best_source_by_n[n]
        trees = solutions[best_source][n]
        if not has_overlap(trees):
            continue
        print(f"  N={n}: OVERLAP DETECTED (source: {best_source})")
    # Reached for a missing N and for an overlapping configuration alike.
    failed_n.append(n)

if failed_n:
    print(f"\nFailed N values: {failed_n}")
else:
    print("All configurations passed overlap check!")


Final validation: Checking all configurations for overlaps...


All configurations passed overlap check!


In [9]:
# Save the submission if validation passed, otherwise fall back to the
# baseline submission.csv. Either way verify_score is defined afterwards.
import os
import shutil

# Create the output directory before branching: both the normal save and the
# fallback copy write into it, and the copy fails if the directory is missing.
os.makedirs('/home/submission', exist_ok=True)

if len(failed_n) == 0:
    ensemble_df.to_csv('/home/submission/submission.csv', index=False)
    print("Saved validated ensemble submission to /home/submission/submission.csv")

    # Verify by re-reading the written file and recalculating the score.
    print("\nVerifying ensemble submission...")
    verify_trees = parse_submission('/home/submission/submission.csv')
    verify_sides = {}
    for n in range(1, 201):
        if n in verify_trees:
            verify_sides[n] = calculate_bounding_box_side(verify_trees[n])

    verify_score = calculate_score(verify_sides)
    print(f"Verified ensemble score: {verify_score:.6f}")
else:
    print("\nSubmission NOT saved due to validation failures!")
    print("Falling back to baseline submission.csv")
    shutil.copy('/home/code/exploration/datasets/submission.csv', '/home/submission/submission.csv')
    # The baseline file is not re-scored here; its known score is reused.
    verify_score = baseline_score
    print(f"Using baseline score: {verify_score:.6f}")

Saved validated ensemble submission to /home/submission/submission.csv

Verifying ensemble submission...


Verified ensemble score: 70.671581


In [None]:
# Persist the headline numbers of this experiment as JSON.
import json

metrics = dict(
    cv_score=verify_score,
    baseline_score=baseline_score,
    improvement=baseline_score - verify_score,
    source_wins=source_wins,
    validation_passed=(len(failed_n) == 0),
)

metrics_path = '/home/code/experiments/003_validated_ensemble/metrics.json'
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=2)

print(f"Saved metrics to experiments/003_validated_ensemble/metrics.json")
print(f"\nFinal Score: {verify_score:.6f}")

In [None]:
# Investigate: why does submission.csv show invalid configurations?
# It was accepted on the leaderboard, so it should not contain overlaps.

print("Investigating submission.csv 'invalid' configurations...")

# Every N that is either flagged invalid or absent from the validity map.
invalid_n = [n for n in range(1, 201) if not valid_configs['submission.csv'].get(n, False)]
print(f"Invalid N values in submission.csv: {invalid_n[:20]}... (total: {len(invalid_n)})")

# Re-run both checks by hand on the first flagged N (or N=1 if none is flagged).
if invalid_n:
    test_n = invalid_n[0]
else:
    test_n = 1
trees = solutions['submission.csv'][test_n]
print(f"\nChecking N={test_n}:")
print(f"  Number of trees: {len(trees)}")
print(f"  Rotation angles valid: {validate_rotation_angles(trees)}")
print(f"  Has overlap: {has_overlap(trees)}")

# Show the first few trees to eyeball positions and rotation angles.
for i, (x, y, deg) in enumerate(trees[:5]):
    print(f"  Tree {i}: x={x:.4f}, y={y:.4f}, deg={deg:.4f}")