# Experiment 006: Valid Ensemble with Overlap Checking

The previous experiment (005) had a CRITICAL BUG - it selected configurations with overlapping trees.
This experiment will:
1. Include proper overlap validation
2. Only consider configurations with 0 overlaps
3. Build a valid ensemble

In [None]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
import glob
from tqdm import tqdm
import shutil

# Tree geometry: x (TX) and y (TY) coordinates of the 15 outline vertices of a
# single tree, paired index-by-index as (TX[i], TY[i]).
# create_tree_polygon() rotates and translates these points to place a tree.
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

def parse_value(s):
    """Convert a CSV cell to ``float``, dropping a leading ``'s'`` marker.

    String cells that start with ``'s'`` (for example ``'s0.25'``) are parsed
    from the text after that first character; every other value is handed to
    ``float`` unchanged.
    """
    has_marker = isinstance(s, str) and s.startswith('s')
    return float(s[1:] if has_marker else s)

def create_tree_polygon(x, y, deg):
    """Build the polygon of one tree rotated by ``deg`` degrees and moved to ``(x, y)``.

    Every template vertex ``(TX[i], TY[i])`` is rotated about the origin by
    ``deg`` degrees and then translated by ``(x, y)``.
    """
    theta = np.radians(deg)
    c = np.cos(theta)
    s = np.sin(theta)
    placed = []
    for px, py in zip(TX, TY):
        placed.append((px * c - py * s + x, px * s + py * c + y))
    return Polygon(placed)

def compute_bounding_side(polygons):
    """Return the longer side of the axis-aligned box enclosing ``polygons``.

    The box is taken over the exterior coordinates of every polygon.
    Returns ``0`` when ``polygons`` is empty.
    """
    if not polygons:
        return 0
    coords = np.array([pt for poly in polygons for pt in poly.exterior.coords])
    # Per-axis extent (width, height); the larger one is the square's side.
    extent = coords.max(axis=0) - coords.min(axis=0)
    return max(extent)

def check_overlaps_for_n(df, n):
    """Report whether the ``n``-tree configuration in ``df`` must be rejected.

    Rows are selected by the id prefix ``f"{n:03d}_"``. Returns a
    ``(has_overlap, overlap_area)`` tuple:

    * ``(True, inf)`` when the number of selected rows is not ``n``;
    * ``(True, area)`` for the first pair of trees whose intersection area
      exceeds ``1e-10``;
    * ``(False, 0)`` otherwise.
    """
    group = df[df['id'].str.startswith(f"{n:03d}_")]
    if len(group) != n:
        return True, float('inf')

    shapes = [
        create_tree_polygon(parse_value(rec['x']), parse_value(rec['y']), parse_value(rec['deg']))
        for _, rec in group.iterrows()
    ]

    # Pairwise scan; contact with an intersection area <= 1e-10 is tolerated.
    for idx, first in enumerate(shapes):
        for second in shapes[idx + 1:]:
            if not first.intersects(second):
                continue
            common = first.intersection(second)
            if common.area > 1e-10:
                return True, common.area
    return False, 0

def compute_score_for_n_with_validation(df, n):
    """Score the ``n``-tree configuration in ``df``, rejecting flagged ones.

    Returns ``(inf, None)`` when ``check_overlaps_for_n`` flags the
    configuration. Otherwise returns ``(side**2 / n, trees)``, where ``side``
    is the bounding side of the placed trees and ``trees`` is the slice of
    ``df`` whose ids start with ``f"{n:03d}_"``.
    """
    flagged, _ = check_overlaps_for_n(df, n)
    if flagged:
        return float('inf'), None

    trees = df[df['id'].str.startswith(f"{n:03d}_")]
    polygons = []
    for _, row in trees.iterrows():
        cx = parse_value(row['x'])
        cy = parse_value(row['y'])
        angle = parse_value(row['deg'])
        polygons.append(create_tree_polygon(cx, cy, angle))
    side = compute_bounding_side(polygons)
    return side**2 / n, trees

def compute_total_score(df):
    """Add up the per-N scores of ``df`` for N = 1..200.

    A rejected N contributes ``inf`` (see
    ``compute_score_for_n_with_validation``), so the total is ``inf`` as soon
    as one N is rejected.
    """
    running = 0
    for tree_count in range(1, 201):
        # Plain left-to-right accumulation, one term per N.
        running = running + compute_score_for_n_with_validation(df, tree_count)[0]
    return running

def validate_submission(df):
    """Check every N in 1..200 of ``df`` with ``check_overlaps_for_n``.

    Returns ``(is_valid, flagged_count)``: ``flagged_count`` is the number of
    N values that were flagged, and ``is_valid`` is ``True`` only when that
    number is zero.
    """
    flagged = sum(1 for n in range(1, 201) if check_overlaps_for_n(df, n)[0])
    return flagged == 0, flagged

# Progress marker printed once this cell's helper definitions have executed.
print("Functions defined with overlap validation")

In [None]:
# First, verify the saspav baseline has no overlaps
df_baseline = pd.read_csv('/home/code/external_data/saspav/santa-2025.csv')
is_valid, overlap_count = validate_submission(df_baseline)
print(f"Saspav baseline validation: valid={is_valid}, overlaps={overlap_count}")

if is_valid:
    baseline_score = compute_total_score(df_baseline)
    print(f"Saspav baseline score: {baseline_score:.6f}")
else:
    # Keep baseline_score defined so the later cells that print it do not fail
    # with NameError. inf is what compute_total_score yields for a submission
    # with any rejected N, so comparisons against it stay meaningful.
    baseline_score = float('inf')
    print("ERROR: Baseline has overlaps!")

In [None]:
# Scan ALL CSV files and check for validity
all_csvs = glob.glob('/home/nonroot/snapshots/santa-2025/**/*.csv', recursive=True)
external_csvs = glob.glob('/home/code/external_data/**/*.csv', recursive=True)
all_csvs.extend(external_csvs)
print(f"Total CSV files to scan: {len(all_csvs)}")

# First pass: identify valid submission files (correct format):
# all four submission columns present and at least 20000 rows.
required_columns = {'id', 'x', 'y', 'deg'}
valid_format_sources = []
unreadable_csvs = []  # (path, error text) for files pandas could not parse
for csv_path in tqdm(all_csvs, desc="Checking format"):
    try:
        df = pd.read_csv(csv_path)
    except Exception as exc:
        # Best-effort scan: an unreadable file is skipped, but it is recorded
        # instead of vanishing silently. Catching Exception (not a bare
        # except) keeps Ctrl-C able to interrupt the scan.
        unreadable_csvs.append((csv_path, repr(exc)))
        continue
    if required_columns.issubset(df.columns) and len(df) >= 20000:
        valid_format_sources.append(csv_path)

print(f"\nFound {len(valid_format_sources)} files with valid format")
if unreadable_csvs:
    print(f"Skipped {len(unreadable_csvs)} unreadable CSV files")

In [None]:
# Second pass: check each source for overlaps and score valid ones
print("Checking for overlaps and scoring valid sources...")
valid_sources = []  # (path, score, overlap_count)
invalid_sources = []  # (path, overlap_count) for sources with at least one flagged N
failed_sources = []  # (path, error text) for sources that raised while being checked

scan_limit = 100  # Limit to first 100 for speed
if len(valid_format_sources) > scan_limit:
    # Make the truncation visible so unchecked files are not mistaken for rejected ones.
    print(f"Note: validating only the first {scan_limit} of {len(valid_format_sources)} sources")

for csv_path in tqdm(valid_format_sources[:scan_limit], desc="Validating sources"):
    try:
        df = pd.read_csv(csv_path)
        is_valid, overlap_count = validate_submission(df)
        if is_valid:
            score = compute_total_score(df)
            valid_sources.append((csv_path, score, overlap_count))
        else:
            # Still track invalid sources for analysis
            invalid_sources.append((csv_path, overlap_count))
    except Exception as exc:
        # Best-effort: one broken file must not abort the scan, but keep the
        # reason so the failure can be inspected afterwards.
        failed_sources.append((csv_path, repr(exc)))

print(f"\nFound {len(valid_sources)} sources with 0 overlaps")
print(f"Rejected {len(invalid_sources)} sources with overlaps; {len(failed_sources)} could not be validated")

# Sort by score
valid_sources.sort(key=lambda x: x[1])

print("\nTop 10 valid sources:")
for path, score, overlaps in valid_sources[:10]:
    print(f"  {score:.6f}: {path.split('/')[-1]} (overlaps={overlaps})")

In [None]:
# Check more sources - focus on preoptimized directory
preopt_csvs = glob.glob('/home/nonroot/snapshots/santa-2025/*/code/preoptimized/**/*.csv', recursive=True)
print(f"Preoptimized CSVs: {len(preopt_csvs)}")

# Same format requirement as the first pass: all four columns, >= 20000 rows.
required_columns = {'id', 'x', 'y', 'deg'}
# Paths already accepted. Built once as a set so the membership test is O(1)
# instead of rebuilding a list for every file.
known_paths = {entry[0] for entry in valid_sources}
preopt_failures = []  # (path, error text) for files that raised while being checked

for csv_path in tqdm(preopt_csvs, desc="Checking preoptimized"):
    if csv_path in known_paths:
        continue
    try:
        df = pd.read_csv(csv_path)
        if not required_columns.issubset(df.columns) or len(df) < 20000:
            continue
        is_valid, overlap_count = validate_submission(df)
        if is_valid:
            score = compute_total_score(df)
            valid_sources.append((csv_path, score, overlap_count))
            known_paths.add(csv_path)
    except Exception as exc:
        # Best-effort scan: skip the file but remember why it failed.
        preopt_failures.append((csv_path, repr(exc)))

if preopt_failures:
    print(f"Skipped {len(preopt_failures)} preoptimized CSV files that could not be checked")

valid_sources.sort(key=lambda x: x[1])
print(f"\nTotal valid sources: {len(valid_sources)}")
print("\nTop 10 valid sources:")
for path, score, overlaps in valid_sources[:10]:
    print(f"  {score:.6f}: {path.split('/')[-1]}")

In [None]:
# Build ensemble from valid sources only
print("\nBuilding ensemble from valid sources...")

# Load all valid dataframes
all_valid_dfs = {}
load_failures = []  # (path, error text) for sources that could not be re-read
for path, _score, _ in valid_sources:
    try:
        all_valid_dfs[path] = pd.read_csv(path)
    except Exception as exc:
        # Best-effort: a source that disappeared or broke is left out of the
        # ensemble, but the reason is kept rather than silently dropped.
        load_failures.append((path, repr(exc)))

print(f"Loaded {len(all_valid_dfs)} valid dataframes")
if load_failures:
    print(f"WARNING: {len(load_failures)} valid sources could not be reloaded")

# For each N, find the best VALID configuration
best_per_n = {}  # n -> (score, source_path, trees_df)

for n in tqdm(range(1, 201), desc="Finding best per N"):
    best_score = float('inf')
    best_source = None
    best_trees = None

    for path, df in all_valid_dfs.items():
        score, trees = compute_score_for_n_with_validation(df, n)
        # Strict '<' keeps the first source seen when scores tie.
        if score < best_score:
            best_score = score
            best_source = path
            best_trees = trees

    best_per_n[n] = (best_score, best_source, best_trees)

# An N without any accepted configuration keeps score inf and no trees.
uncovered_ns = [n for n in range(1, 201) if best_per_n[n][1] is None]
if uncovered_ns:
    print(f"WARNING: no valid configuration found for {len(uncovered_ns)} N values")

# Compute ensemble total
ensemble_total = sum(best_per_n[n][0] for n in range(1, 201))
print(f"\nEnsemble total score: {ensemble_total:.6f}")
print(f"Baseline score: {baseline_score:.6f}")
print(f"Improvement: {baseline_score - ensemble_total:.9f}")

In [None]:
# Analyze which sources win: count, per source file name, how many N values
# it supplies to the ensemble ('None' collects N values with no source).
source_wins = {}
for n in range(1, 201):
    winner_path = best_per_n[n][1]
    if winner_path:
        label = winner_path.rsplit('/', 1)[-1]
    else:
        label = 'None'
    source_wins.setdefault(label, 0)
    source_wins[label] += 1

print("Source wins distribution:")
ranking = sorted(source_wins.items(), key=lambda item: -item[1])
for label, count in ranking:
    print(f"  {label}: {count} N values")

In [None]:
# Save the valid ensemble
selected = {n: best_per_n[n][2] for n in range(1, 201)}

# An N with no accepted configuration has no trees; report it instead of
# silently writing a submission with missing rows.
missing_ns = [n for n, trees in selected.items() if trees is None]
if missing_ns:
    print(f"WARNING: no configuration available for {len(missing_ns)} N values (first: {missing_ns[:10]})")

# Stack the per-N slices in N order; this replaces a row-by-row rebuild.
frames = [trees for trees in selected.values() if trees is not None]
if frames:
    ensemble_df = pd.concat(frames, ignore_index=True)
else:
    # Keep the header so the verification read below still parses the file.
    ensemble_df = pd.DataFrame(columns=['id', 'x', 'y', 'deg'])
ensemble_df.to_csv('/home/submission/submission.csv', index=False)
print(f"Saved ensemble with {len(ensemble_df)} rows")

# Final validation: re-read the written file so the check covers exactly
# what was saved.
df_verify = pd.read_csv('/home/submission/submission.csv')
is_valid, overlap_count = validate_submission(df_verify)
print(f"\nFinal validation: valid={is_valid}, overlaps={overlap_count}")

if is_valid:
    verify_score = compute_total_score(df_verify)
    print(f"Verified ensemble score: {verify_score:.6f}")
else:
    # Keep verify_score defined so the summary cell does not raise NameError;
    # inf is what compute_total_score yields when any N is rejected.
    verify_score = float('inf')
    print("ERROR: Ensemble has overlaps!")

In [None]:
# Summary: print source counts and the baseline vs. verified ensemble scores.
banner = "=" * 60
print(banner)
print("EXPERIMENT 006 SUMMARY: Valid Ensemble")
print(banner)
scanned_total = len(all_csvs)
print(f"Total sources scanned: {scanned_total}")
valid_total = len(valid_sources)
print(f"Valid sources (0 overlaps): {valid_total}")
print(f"\nBaseline score: {baseline_score:.6f}")
print(f"Ensemble score: {verify_score:.6f}")
# Positive when the ensemble scores lower than the baseline.
score_gain = baseline_score - verify_score
print(f"Improvement: {score_gain:.9f}")
print(banner)

In [None]:
# Model wrapper for submission
class ValidEnsemble:
    """Thin wrapper that reloads a saved ensemble submission CSV.

    Args:
        data: Free-form tag stored on the instance as ``self.data``; this
            class does not read it anywhere else.
        submission_path: CSV file to load. Defaults to
            ``DEFAULT_SUBMISSION_PATH`` when omitted or ``None``.
    """

    # Location the ensemble submission is loaded from by default.
    DEFAULT_SUBMISSION_PATH = '/home/submission/submission.csv'

    def __init__(self, data='single', submission_path=None):
        self.data = data
        if submission_path is None:
            submission_path = self.DEFAULT_SUBMISSION_PATH
        self.submission_path = submission_path

    def load_best(self):
        """Read the submission CSV and return it as a DataFrame."""
        return pd.read_csv(self.submission_path)

    def save_submission(self, path):
        """Write the loaded submission to ``path`` (no index column) and return it."""
        df = self.load_best()
        df.to_csv(path, index=False)
        return df


model = ValidEnsemble(data='single')
print("Model wrapper defined")