# Experiment 001: Baseline with Pre-optimized CSV

This experiment establishes the baseline score using a pre-optimized submission CSV downloaded from a public Kaggle dataset, falling back to the sample submission if that file is not available.

In [None]:
import os
import subprocess
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
from shapely.strtree import STRtree
from decimal import Decimal

# Tree geometry: (TX[i], TY[i]) is the i-th of 15 outline vertices.
# The walk starts at the tip (0, 0.8), descends the right-hand side,
# crosses the bottom of the trunk at y = -0.2, and climbs the left-hand side.
TX = [
    0,
    0.125, 0.0625,
    0.2, 0.1,
    0.35, 0.075,
    0.075, -0.075,
    -0.075, -0.35,
    -0.1, -0.2,
    -0.0625, -0.125,
]
TY = [
    0.8,
    0.5, 0.5,
    0.25, 0.25,
    0, 0,
    -0.2, -0.2,
    0, 0,
    0.25, 0.25,
    0.5, 0.5,
]

def get_tree_polygon(x, y, deg):
    """Build the tree outline placed at (x, y) and turned by ``deg``.

    The base outline comes from pairing the module-level ``TX`` and ``TY``
    vertex lists. It is rotated about the origin (0, 0) first and shifted by
    (x, y) afterwards, so the rotation pivot ends up at (x, y).

    Args:
        x: Horizontal offset applied after rotation.
        y: Vertical offset applied after rotation.
        deg: Rotation angle handed to ``affinity.rotate`` (degrees under
            shapely's default ``use_radians=False``).

    Returns:
        The rotated and translated Shapely polygon.
    """
    outline = Polygon(zip(TX, TY))
    return affinity.translate(
        affinity.rotate(outline, deg, origin=(0, 0)),
        x,
        y,
    )

def load_trees_for_n(df, n):
    """Return the (x, y, deg) placements of the N-tree group in a submission.

    Rows belong to the group when their ``id`` starts with the group number
    zero-padded to three digits and followed by ``_`` (for example ``007_``).
    Each of ``x``, ``y`` and ``deg`` is converted to text, stripped of every
    ``s`` character, and parsed as a float.

    Args:
        df: Submission dataframe with ``id``, ``x``, ``y`` and ``deg`` columns.
        n: Group number to select.

    Returns:
        A list of ``(x, y, deg)`` float tuples in dataframe row order; empty
        when no row matches.
    """
    def _parse(value):
        return float(str(value).replace('s', ''))

    group = df[df['id'].str.startswith(f'{n:03d}_')]
    return [
        (_parse(raw_x), _parse(raw_y), _parse(raw_deg))
        for raw_x, raw_y, raw_deg in zip(group['x'], group['y'], group['deg'])
    ]

def get_bounding_box_side(trees):
    """Return the side of the smallest axis-aligned square enclosing all trees.

    Args:
        trees: Sequence of ``(x, y, deg)`` placements.

    Returns:
        The larger of the combined width and height of every tree polygon's
        bounds, or ``0`` when ``trees`` is empty.
    """
    if not trees:
        return 0

    # Each entry is a (minx, miny, maxx, maxy) tuple for one placed tree.
    boxes = [get_tree_polygon(cx, cy, angle).bounds for cx, cy, angle in trees]

    x_edges = [edge for box in boxes for edge in (box[0], box[2])]
    y_edges = [edge for box in boxes for edge in (box[1], box[3])]

    return max(
        max(x_edges) - min(x_edges),
        max(y_edges) - min(y_edges),
    )

def has_overlap(trees):
    """Report whether any two trees overlap; merely touching does not count.

    Args:
        trees: Sequence of ``(x, y, deg)`` placements.

    Returns:
        ``True`` as soon as one pair of distinct polygons intersects without
        only touching, otherwise ``False``. Fewer than two trees can never
        overlap.
    """
    if len(trees) <= 1:
        return False

    shapes = [get_tree_polygon(cx, cy, angle) for cx, cy, angle in trees]
    spatial_index = STRtree(shapes)

    for idx, shape in enumerate(shapes):
        # Query results are used as positions in ``shapes``.
        for other_idx in spatial_index.query(shape):
            if other_idx == idx:
                continue
            other = shapes[other_idx]
            if shape.intersects(other) and not shape.touches(other):
                return True
    return False

def calculate_total_score(df):
    """Score a submission across the groups N = 1 .. 200.

    Each group contributes ``side**2 / N``, where ``side`` is the bounding
    square side returned by ``get_bounding_box_side`` for that group's trees.

    Args:
        df: Submission dataframe accepted by ``load_trees_for_n``.

    Returns:
        A ``(total, breakdown)`` pair: the sum of all group scores, and a
        list of ``(n, side, score)`` tuples in increasing ``n``.
    """
    breakdown = []
    running_total = 0

    for group_size in range(1, 201):
        side_length = get_bounding_box_side(load_trees_for_n(df, group_size))
        group_score = side_length**2 / group_size
        breakdown.append((group_size, side_length, group_score))
        # Accumulate in order, one group at a time.
        running_total += group_score

    return running_total, breakdown

# Progress marker printed once every definition in this cell has executed.
print("Functions defined successfully!")

In [None]:
# Download pre-optimized CSV from Kaggle dataset
os.chdir('/home/code/experiments/001_baseline')

# Best-effort download: a failure is reported but never fatal, because the
# loading step falls back to the sample submission when the CSV is missing.
try:
    result = subprocess.run(
        ['kaggle', 'datasets', 'download', 'smartmanoj/santa-2025-csv', '-f', 'santa-2025.csv', '--force'],
        capture_output=True, text=True, timeout=120
    )
except (OSError, subprocess.SubprocessError) as e:
    # OSError: the kaggle CLI is missing or cannot be executed.
    # SubprocessError: includes TimeoutExpired once the 120 s limit is hit.
    print(f"Download error: {e}")
else:
    print("Download stdout:", result.stdout)
    print("Download stderr:", result.stderr)
    if result.returncode != 0:
        # subprocess.run does not raise on a non-zero exit unless check=True,
        # so surface the failure explicitly.
        print(f"Download failed with exit code {result.returncode}")

# Check what files we have
print("\nFiles in directory:")
for f in os.listdir('.'):
    print(f"  {f}")

In [None]:
# Extract the downloaded archive when present, then load a submission
import zipfile

if os.path.exists('santa-2025.csv.zip'):
    with zipfile.ZipFile('santa-2025.csv.zip', 'r') as archive:
        archive.extractall('.')
    print("Unzipped santa-2025.csv.zip")

# List every CSV in the working directory
csv_files = [name for name in os.listdir('.') if name.endswith('.csv')]
print(f"CSV files found: {csv_files}")

if 'santa-2025.csv' not in csv_files:
    # No pre-optimized file available: use the sample submission instead
    print("Pre-optimized CSV not found, using sample submission")
    df = pd.read_csv('/home/data/sample_submission.csv')
    print(f"Loaded sample_submission.csv with {len(df)} rows")
else:
    # Pre-optimized submission is available
    df = pd.read_csv('santa-2025.csv')
    print(f"Loaded santa-2025.csv with {len(df)} rows")
    print(df.head())

In [None]:
# Score the loaded submission across every N
print("Calculating total score...")
total_score, per_n_scores = calculate_total_score(df)
print(f"\nTotal Score: {total_score:.6f}")

# Print the first 20 and the last 10 per-N entries under their headings
for heading, subset in (
    ("\nPer-N scores (first 20):", per_n_scores[:20]),
    ("\nPer-N scores (last 10):", per_n_scores[-10:]),
):
    print(heading)
    for n, side, score in subset:
        print(f"  N={n:3d}: side={side:.6f}, score={score:.6f}")

In [None]:
# Spot-check a handful of N values for overlapping trees
print("Checking for overlaps (sample)...")
overlap_found = False
for n in (1, 10, 50, 100, 150, 200):
    trees = load_trees_for_n(df, n)
    if not has_overlap(trees):
        print(f"  N={n}: OK")
        continue
    print(f"  N={n}: OVERLAP DETECTED!")
    overlap_found = True

# Only the sampled groups were examined, so the summary says so
if not overlap_found:
    print("\nNo overlaps detected in sampled configurations.")

In [None]:
# Save the submission file
submission_path = '/home/submission/submission.csv'
# Create the target directory first: to_csv raises OSError when the parent
# directory does not exist, which would discard the run at the final step.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
df.to_csv(submission_path, index=False)
print(f"Saved submission to {submission_path}")

# Also save a copy in the experiment folder. The path is relative, so it
# lands in whatever the current working directory is.
df.to_csv('submission.csv', index=False)
print("Saved copy to experiments/001_baseline/submission.csv")

print(f"\n=== BASELINE SCORE: {total_score:.6f} ===")