# Create Valid Submission (Non-Touching Trees)

Kaggle requires trees to have distance > 0 (not just non-overlapping).
This notebook will:
1. Load the best ensemble
2. Check for touching trees
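3. Create gaps between trees: for each N, select the best snapshot configuration whose trees are already separated (falling back to the sample submission) instead of perturbing touching layouts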
4. Verify validity and calculate score

In [1]:
import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
from shapely.strtree import STRtree
from itertools import combinations
import json
import os

getcontext().prec = 30

# Christmas Tree class
class ChristmasTree:
    """Tree-shaped polygon placed at a centre point with a rotation.

    Attributes:
        center_x, center_y, angle: the constructor arguments, each stored as
            a ``Decimal`` parsed from ``str(argument)``.
        polygon: the 15-vertex tree outline rotated by ``angle`` about the
            origin (as interpreted by ``affinity.rotate``) and then shifted
            by ``(center_x, center_y)``.
    """

    def __init__(self, center_x='0', center_y='0', angle='0'):
        """Record the placement and build the placed outline polygon."""
        self.center_x = Decimal(str(center_x))
        self.center_y = Decimal(str(center_y))
        self.angle = Decimal(str(angle))

        # Outline vertices of the unplaced tree, listed in the order they are
        # walked: tip, down the right side, across the trunk, up the left side.
        tip = [(0.0, 0.8)]
        right_side = [
            (0.125, 0.5),    # right top tier
            (0.0625, 0.5),
            (0.2, 0.25),     # right mid tier
            (0.1, 0.25),
            (0.35, 0.0),     # right base
        ]
        trunk = [
            (0.075, 0.0),    # right trunk
            (0.075, -0.2),
            (-0.075, -0.2),  # left trunk
            (-0.075, 0.0),
        ]
        left_side = [
            (-0.35, 0.0),    # left base
            (-0.1, 0.25),    # left mid tier
            (-0.2, 0.25),
            (-0.0625, 0.5),  # left top tier
            (-0.125, 0.5),
        ]
        outline = Polygon(tip + right_side + trunk + left_side)

        # Rotate about the origin first, then move to the requested centre.
        turned = affinity.rotate(outline, float(self.angle), origin=(0, 0))
        self.polygon = affinity.translate(
            turned,
            xoff=float(self.center_x),
            yoff=float(self.center_y),
        )

def parse_value(val):
    """Return ``val`` as a string, without a leading ``'s'`` marker.

    Strings that start with ``'s'`` have that single first character removed;
    every other value is returned as ``str(val)``.
    """
    has_marker = isinstance(val, str) and val.startswith('s')
    if not has_marker:
        return str(val)
    return val[1:]

def load_trees_for_n(df, n):
    """Build the trees of configuration ``n`` from a submission frame.

    Rows are selected by an ``id`` starting with the zero-padded prefix
    ``f"{n:03d}_"``; each selected row's ``x``, ``y`` and ``deg`` values are
    passed through ``parse_value`` and handed to ``ChristmasTree``.

    Returns:
        list of ``ChristmasTree``, in the row order of ``df`` (empty when no
        row matches).
    """
    id_prefix = f"{n:03d}_"
    selected = df[df['id'].str.startswith(id_prefix)]
    return [
        ChristmasTree(
            parse_value(record['x']),
            parse_value(record['y']),
            parse_value(record['deg']),
        )
        for _, record in selected.iterrows()
    ]

print("Classes and functions defined")

Classes and functions defined


In [2]:
def get_min_distance(trees):
    """Return the smallest pairwise polygon distance among ``trees``.

    With fewer than two trees there is no pair to measure and the result is
    ``float('inf')``.
    """
    if len(trees) <= 1:
        return float('inf')

    polygons = [tree.polygon for tree in trees]
    smallest = float('inf')
    for first, second in combinations(polygons, 2):
        gap = first.distance(second)
        # Only replace the running value on a strictly smaller distance.
        if gap < smallest:
            smallest = gap
    return smallest

def get_touching_pairs(trees, threshold=1e-9):
    """List the tree pairs whose polygon distance is below ``threshold``.

    Returns:
        list of ``(i, j, distance)`` tuples with ``i < j``, in the order
        produced by ``combinations`` over the tree indices.
    """
    index_pairs = combinations(range(len(trees)), 2)
    measured = (
        (a, b, trees[a].polygon.distance(trees[b].polygon))
        for a, b in index_pairs
    )
    return [entry for entry in measured if entry[2] < threshold]

def get_bounding_box_side(trees):
    """Return the side of the smallest axis-aligned square covering all trees.

    The exterior coordinates of every tree polygon are stacked into one
    array; the result is the larger of the x extent and the y extent.
    """
    vertices = np.vstack(
        [np.array(tree.polygon.exterior.coords) for tree in trees]
    )
    lower_corner = vertices.min(axis=0)
    upper_corner = vertices.max(axis=0)

    # Unpacking into two names keeps the requirement of 2-D coordinates.
    width, height = upper_corner - lower_corner
    return max(width, height)

print("Helper functions defined")

Helper functions defined


In [3]:
# Read the current best ensemble from disk
df = pd.read_csv('/home/code/experiments/002_valid_ensemble/submission.csv')
print(f"Loaded submission with {len(df)} rows")

# For every N from 1 to 200, record (n, number of touching pairs) whenever
# at least one pair of trees is touching
touching_counts = []
for n in range(1, 201):
    trees = load_trees_for_n(df, n)
    touching = get_touching_pairs(trees)
    if not touching:
        continue
    touching_counts.append((n, len(touching)))

print(f"\nN values with touching trees: {len(touching_counts)}/200")
print(f"First 10 with touching: {touching_counts[:10]}")

Loaded submission with 20100 rows



N values with touching trees: 199/200
First 10 with touching: [(2, 1), (3, 1), (4, 3), (5, 3), (6, 6), (7, 9), (8, 10), (9, 10), (10, 15), (11, 14)]


In [4]:
# Score the loaded ensemble and compare it against the target
def calculate_score(df):
    """Return the submission score: the sum of ``side**2 / n`` for N = 1..200.

    For each ``n`` the trees are loaded from ``df`` and ``side`` is the
    bounding-square side returned by ``get_bounding_box_side``.
    """
    running_total = 0
    for tree_count in range(1, 201):
        side_length = get_bounding_box_side(load_trees_for_n(df, tree_count))
        # Accumulate one term at a time, in increasing N.
        running_total += (side_length ** 2) / tree_count
    return running_total

current_score = calculate_score(df)
print(f"Current score: {current_score:.6f}")
print(f"Target: 68.919154")
print(f"Gap: {current_score - 68.919154:.6f}")

Current score: 70.646824
Target: 68.919154
Gap: 1.727670


In [5]:
# Let's look at all snapshot CSVs and find ones with non-touching trees
import glob

# glob returns matches in arbitrary (filesystem-dependent) order. Sorting
# makes every later step that depends on list order reproducible: which
# files get sampled, and which file wins when two configurations tie.
snapshot_dirs = sorted(glob.glob('/home/nonroot/snapshots/santa-2025/*/'))
print(f"Found {len(snapshot_dirs)} snapshot directories")

# Find all CSV files, searching each snapshot directory recursively
all_csvs = sorted(
    csv_path
    for snapshot_dir in snapshot_dirs
    for csv_path in glob.glob(f"{snapshot_dir}**/*.csv", recursive=True)
)

print(f"Found {len(all_csvs)} CSV files total")

Found 62 snapshot directories
Found 1027 CSV files total


In [None]:
# For each N, find the best configuration that has NO touching trees (every pairwise distance >= 1e-9)
# This is a more thorough search

def is_valid_configuration(trees, min_gap=1e-9):
    """Return True when no pair of trees is closer than ``min_gap``.

    Zero or one tree is always valid. Otherwise the pairs are checked in
    ``combinations`` order and the scan stops at the first pair whose polygon
    distance is below ``min_gap``.
    """
    if len(trees) <= 1:
        return True

    too_close = (
        trees[a].polygon.distance(trees[b].polygon) < min_gap
        for a, b in combinations(range(len(trees)), 2)
    )
    return not any(too_close)

# Sample a few CSVs to understand the data
sample_csvs = all_csvs[:5]
for csv_path in sample_csvs:
    try:
        df_sample = pd.read_csv(csv_path)
    except Exception as exc:
        # Snapshot folders may contain files that are not parseable CSVs.
        # Report and skip them; a bare ``except`` would hide the reason and
        # would also swallow KeyboardInterrupt.
        print(f"Skipping unreadable CSV {csv_path}: {exc!r}")
        continue

    # A full submission has an 'id' column and 20100 rows
    if 'id' in df_sample.columns and len(df_sample) == 20100:
        print(f"Valid submission format: {csv_path}")

In [None]:
# Build the best valid ensemble by checking all CSVs
# For each N, keep track of the best (smallest bbox) valid configuration

best_configs = {}  # n -> (score_contribution, csv_path, rows)

print("Scanning all CSVs for valid configurations...")
processed = 0
valid_csvs = 0
failed_csvs = []  # (csv_path, repr(error)) for files that raised while being processed

for csv_path in all_csvs:
    try:
        df_csv = pd.read_csv(csv_path)
        # Only full submissions (an 'id' column and 20100 rows) are considered
        if 'id' not in df_csv.columns or len(df_csv) != 20100:
            continue
        valid_csvs += 1

        # Check each N
        for n in range(1, 201):
            trees = load_trees_for_n(df_csv, n)
            if len(trees) != n:
                continue

            # Calculate score contribution first: it is cheap, and lets us
            # skip the pairwise validity check for candidates that cannot
            # replace the current best anyway.
            side = get_bounding_box_side(trees)
            contribution = (side ** 2) / n

            if n in best_configs and not contribution < best_configs[n][0]:
                continue

            # Check if valid (non-touching)
            if not is_valid_configuration(trees, min_gap=1e-9):
                continue

            prefix = f"{n:03d}_"
            rows = df_csv[df_csv['id'].str.startswith(prefix)].copy()
            best_configs[n] = (contribution, csv_path, rows)

        processed += 1
        if processed % 100 == 0:
            print(f"Processed {processed} CSVs, found valid configs for {len(best_configs)} N values")

    except Exception as exc:
        # Keep scanning (best effort), but remember what failed so that a
        # systematic problem is visible instead of silently shrinking the
        # candidate pool.
        failed_csvs.append((csv_path, repr(exc)))
        continue

print(f"\nProcessed {valid_csvs} valid CSVs")
print(f"Found valid configurations for {len(best_configs)}/200 N values")
if failed_csvs:
    print(f"WARNING: {len(failed_csvs)} CSVs raised errors and were skipped; first 5: {failed_csvs[:5]}")

In [None]:
# Determine which N in 1..200 still have no valid configuration
missing_n = sorted(set(range(1, 201)) - set(best_configs))

# Long lists are truncated to their first 20 entries when printed
if len(missing_n) > 20:
    print(f"Missing valid configurations for N values: {missing_n[:20]}...")
else:
    print(f"Missing: {missing_n}")
print(f"Total missing: {len(missing_n)}")

In [None]:
# For missing N values, we need to create valid configurations
# Option 1: Take the touching configuration and apply small separation
# Option 2: Use sample_submission.csv as fallback

# Inspect what the sample submission offers for the first missing N values
sample_df = pd.read_csv('/home/data/sample_submission.csv')

# Report, per N, either the score contribution (valid) or the minimum
# pairwise distance (touching)
print("Checking sample submission for valid configurations...")
for n in missing_n[:10]:
    trees = load_trees_for_n(sample_df, n)
    if not is_valid_configuration(trees, min_gap=1e-9):
        min_dist = get_min_distance(trees)
        print(f"N={n}: Sample submission has touching trees, min_dist={min_dist:.2e}")
        continue
    side = get_bounding_box_side(trees)
    contribution = (side ** 2) / n
    print(f"N={n}: Sample submission is valid, contribution={contribution:.4f}")

In [None]:
# Use the sample submission as the fallback for N values that have no valid
# snapshot configuration. Its configurations are verified rather than assumed
# to be non-touching, so best_configs only ever holds valid entries.

rejected_fallbacks = []  # (n, reason) for sample configurations that cannot be used
for n in missing_n:
    trees = load_trees_for_n(sample_df, n)
    if len(trees) != n:
        rejected_fallbacks.append((n, f"expected {n} trees, found {len(trees)}"))
        continue
    if not is_valid_configuration(trees, min_gap=1e-9):
        rejected_fallbacks.append((n, f"touching trees, min_dist={get_min_distance(trees):.2e}"))
        continue

    side = get_bounding_box_side(trees)
    contribution = (side ** 2) / n
    prefix = f"{n:03d}_"
    rows = sample_df[sample_df['id'].str.startswith(prefix)].copy()
    best_configs[n] = (contribution, 'sample_submission.csv', rows)

print(f"After adding sample submission fallbacks: {len(best_configs)}/200 N values")
if rejected_fallbacks:
    print(f"WARNING: {len(rejected_fallbacks)} sample fallbacks rejected; first 10: {rejected_fallbacks[:10]}")

In [None]:
# Assemble the final ensemble from the per-N best rows, in increasing N
all_rows = []
for n in range(1, 201):
    if n not in best_configs:
        print(f"WARNING: No configuration for N={n}")
        continue
    # Third element of each entry is the DataFrame slice for that N
    all_rows.append(best_configs[n][2])

valid_ensemble_df = pd.concat(all_rows, ignore_index=True)
print(f"Valid ensemble has {len(valid_ensemble_df)} rows")

# Total score is the sum of the stored per-N contributions
valid_score = sum(best_configs[n][0] for n in range(1, 201) if n in best_configs)
print(f"\nValid ensemble score: {valid_score:.6f}")
print(f"Target: 68.919154")
print(f"Gap: {valid_score - 68.919154:.6f} ({(valid_score - 68.919154) / 68.919154 * 100:.2f}%)")

In [None]:
# Verify the ensemble is truly valid: every N has exactly N trees and no
# pair of trees is touching
print("Verifying all configurations are valid (non-touching)...")
invalid_n = []      # (n, min_dist) for configurations with touching trees
wrong_count_n = []  # (n, trees found) for missing or incomplete configurations
for n in range(1, 201):
    trees = load_trees_for_n(valid_ensemble_df, n)
    # is_valid_configuration() accepts 0 or 1 trees, so a missing or short
    # configuration would otherwise be reported as valid.
    if len(trees) != n:
        wrong_count_n.append((n, len(trees)))
        continue
    if not is_valid_configuration(trees, min_gap=1e-9):
        min_dist = get_min_distance(trees)
        invalid_n.append((n, min_dist))

if wrong_count_n:
    print(f"WARNING: {len(wrong_count_n)} configurations have the wrong number of trees!")
    print(f"First 10 (n, trees found): {wrong_count_n[:10]}")
if invalid_n:
    print(f"WARNING: {len(invalid_n)} configurations still have touching trees!")
    print(f"First 10: {invalid_n[:10]}")
if not invalid_n and not wrong_count_n:
    print("All configurations are valid (non-touching)!")

In [None]:
# Save the valid ensemble
import shutil

experiment_dir = '/home/code/experiments/002_valid_submission'
submission_dir = '/home/submission'

# Create the output directories first: to_csv/open/shutil.copy do not create
# missing parent directories and would fail on a fresh checkout.
for out_dir in (experiment_dir, submission_dir):
    os.makedirs(out_dir, exist_ok=True)

experiment_csv = f'{experiment_dir}/submission.csv'
valid_ensemble_df.to_csv(experiment_csv, index=False)
print(f"Saved to {experiment_csv}")

# Also copy to /home/submission/
submission_csv = f'{submission_dir}/submission.csv'
shutil.copy(experiment_csv, submission_csv)
print(f"Copied to {submission_csv}")

# Save metrics
metrics = {'cv_score': valid_score}
with open(f'{experiment_dir}/metrics.json', 'w') as f:
    json.dump(metrics, f)
print(f"Metrics saved: {metrics}")

In [None]:
# Report how much each band of N contributes to the total score
print("\nScore breakdown by N range:")
n_bands = ((1, 10), (11, 50), (51, 100), (101, 150), (151, 200))
for start, end in n_bands:
    # Both bounds are inclusive; N values without a configuration are skipped
    range_score = sum(best_configs[n][0] for n in range(start, end+1) if n in best_configs)
    print(f"  N={start}-{end}: {range_score:.4f}")

print(f"\nTotal: {valid_score:.6f}")