# Experiment 001: Baseline with Pre-optimized CSV

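This experiment establishes the baseline score using the pre-optimized submission from public kernels. If that CSV cannot be downloaded, the notebook falls back to the competition's sample submission, and the reported baseline score is then the sample submission's score.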

In [1]:
import os
import subprocess
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
from shapely.strtree import STRtree
from decimal import Decimal

# Tree geometry: vertex coordinates of the tree outline in its local frame.
# (TX[i], TY[i]) is the i-th vertex; get_tree_polygon zips the two lists into a Polygon.
# The outline is mirror-symmetric about x = 0, spans x in [-0.35, 0.35] and
# y in [-0.2, 0.8] (tip at (0, 0.8)), so an unrotated tree is 0.7 wide and 1.0 tall.
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

def get_tree_polygon(x, y, deg):
    """Build the tree outline placed at (x, y) and turned by deg degrees.

    The outline described by TX/TY is rotated about its local origin (0, 0)
    first and only then shifted by (x, y), so (x, y) is the point where the
    outline's local origin ends up.
    """
    outline = Polygon(list(zip(TX, TY)))
    return affinity.translate(
        affinity.rotate(outline, deg, origin=(0, 0)),
        xoff=x,
        yoff=y,
    )

def load_trees_for_n(df, n):
    """Return the trees of the N-tree configuration as (x, y, deg) float tuples.

    Rows are selected by the zero-padded id prefix ``f'{n:03d}_'`` (for example
    ``'007_'`` for n = 7). Every ``x``, ``y`` and ``deg`` cell is converted with
    ``float(str(value).replace('s', ''))``, so plain numbers as well as
    's'-prefixed strings such as ``'s0.25'`` are accepted.

    Args:
        df: DataFrame with ``id``, ``x``, ``y`` and ``deg`` columns.
        n: Configuration size used to build the id prefix.

    Returns:
        List of ``(x, y, deg)`` tuples in row order; empty when no id matches.
    """
    prefix = f'{n:03d}_'
    # na=False: a missing id simply does not match, instead of putting NaN
    # into the mask (which boolean indexing rejects).
    mask = df['id'].str.startswith(prefix, na=False)
    rows = df.loc[mask, ['x', 'y', 'deg']]

    def to_float(value):
        # Same per-cell conversion as before: drop the 's' marker, then parse.
        return float(str(value).replace('s', ''))

    # Iterating the three columns in lockstep avoids building a Series per
    # row, which is what iterrows() does.
    return [
        (to_float(x), to_float(y), to_float(deg))
        for x, y, deg in zip(rows['x'], rows['y'], rows['deg'])
    ]

def get_bounding_box_side(trees):
    """Side length of the smallest axis-aligned square enclosing every tree.

    Returns 0 for an empty configuration; otherwise the larger of the overall
    x-extent and y-extent of the placed trees' polygon bounds.
    """
    if not trees:
        return 0

    # One (minx, miny, maxx, maxy) tuple per placed tree.
    boxes = [get_tree_polygon(x, y, deg).bounds for x, y, deg in trees]

    x_extent = max(box[2] for box in boxes) - min(box[0] for box in boxes)
    y_extent = max(box[3] for box in boxes) - min(box[1] for box in boxes)
    return max(x_extent, y_extent)

def has_overlap(trees):
    """Report whether any two trees intersect beyond merely touching.

    A pair counts as overlapping when the polygons intersect and are not just
    touching. Configurations with fewer than two trees never overlap.
    """
    if len(trees) < 2:
        return False

    shapes = [get_tree_polygon(x, y, deg) for x, y, deg in trees]
    index = STRtree(shapes)

    for i, shape in enumerate(shapes):
        # The query result is used to index `shapes`, i.e. it is expected to
        # hold integer positions of candidate neighbours.
        for j in index.query(shape):
            if i == j:
                continue  # a tree always matches itself
            neighbour = shapes[j]
            if not shape.intersects(neighbour):
                continue
            if not shape.touches(neighbour):
                return True
    return False

def calculate_total_score(df, max_n=200):
    """Sum ``side**2 / n`` over the configurations n = 1..max_n.

    Args:
        df: Submission DataFrame in the layout expected by load_trees_for_n.
        max_n: Largest configuration size to score (inclusive). Defaults to
            200, the range that used to be hard-coded; pass a smaller value to
            score only the first configurations.

    Returns:
        ``(total_score, per_n_scores)`` where ``per_n_scores`` is a list of
        ``(n, side, score)`` tuples in increasing n.

    Note:
        A configuration with no rows in ``df`` has side 0 and therefore
        contributes a score of 0; it is not reported as missing.
    """
    total_score = 0
    per_n_scores = []

    for n in range(1, max_n + 1):
        side = get_bounding_box_side(load_trees_for_n(df, n))
        score = side**2 / n
        total_score += score
        per_n_scores.append((n, side, score))

    return total_score, per_n_scores

# Progress marker: reached only after every definition in this cell has executed.
print("Functions defined successfully!")

Functions defined successfully!


In [2]:
# Download pre-optimized CSV from Kaggle dataset
# NOTE: the cells below use relative paths, so the working directory set here matters.
os.chdir('/home/code/experiments/001_baseline')

# Try to download the pre-optimized CSV (best effort: the loading cell falls
# back to the sample submission when the file is not there)
try:
    result = subprocess.run(
        ['kaggle', 'datasets', 'download', 'smartmanoj/santa-2025-csv', '-f', 'santa-2025.csv', '--force'],
        capture_output=True, text=True, timeout=120
    )
    print("Download stdout:", result.stdout)
    print("Download stderr:", result.stderr)
    # The CLI may print its error (e.g. an HTTP 403) on stdout, so surface the
    # exit status explicitly instead of relying on the reader to spot it.
    if result.returncode != 0:
        print(f"Download failed: kaggle exited with status {result.returncode}")
except Exception as e:
    # Covers a missing kaggle executable and the 120 s timeout, among others.
    print(f"Download error: {e}")

# Check what files we have (sorted so the listing is stable between runs)
print("\nFiles in directory:")
for f in sorted(os.listdir('.')):
    print(f"  {f}")

Download stdout: 403 Client Error: Forbidden for url: https://api.kaggle.com/v1/datasets.DatasetApiService/GetDatasetMetadata

Download stderr: 

Files in directory:
  baseline.ipynb


In [3]:
# Unpack a zipped download if one is present, then load the submission CSV
import zipfile

if os.path.exists('santa-2025.csv.zip'):
    with zipfile.ZipFile('santa-2025.csv.zip', 'r') as archive:
        archive.extractall('.')
    print("Unzipped santa-2025.csv.zip")

# Collect every CSV sitting in the working directory
csv_files = [entry for entry in os.listdir('.') if entry.endswith('.csv')]
print(f"CSV files found: {csv_files}")

# Prefer the pre-optimized submission; otherwise use the sample submission
if 'santa-2025.csv' not in csv_files:
    print("Pre-optimized CSV not found, using sample submission")
    df = pd.read_csv('/home/data/sample_submission.csv')
    print(f"Loaded sample_submission.csv with {len(df)} rows")
else:
    df = pd.read_csv('santa-2025.csv')
    print(f"Loaded santa-2025.csv with {len(df)} rows")
    print(df.head())

CSV files found: []
Pre-optimized CSV not found, using sample submission
Loaded sample_submission.csv with 20100 rows


In [4]:
# Score the loaded submission over every configuration
print("Calculating total score...")
total_score, per_n_scores = calculate_total_score(df)
print(f"\nTotal Score: {total_score:.6f}")

# Print the head (first 20) and the tail (last 10) of the per-N table with one shared loop
for heading, selection in (
    ("\nPer-N scores (first 20):", per_n_scores[:20]),
    ("\nPer-N scores (last 10):", per_n_scores[-10:]),
):
    print(heading)
    for n, side, score in selection:
        print(f"  N={n:3d}: side={side:.6f}, score={score:.6f}")

Calculating total score...



Total Score: 173.652299

Per-N scores (first 20):
  N=  1: side=1.000000, score=1.000000
  N=  2: side=1.211271, score=0.733589
  N=  3: side=1.670600, score=0.930301
  N=  4: side=2.039257, score=1.039642
  N=  5: side=2.121716, score=0.900336
  N=  6: side=2.172745, score=0.786803
  N=  7: side=2.901647, score=1.202794
  N=  8: side=3.441115, score=1.480159
  N=  9: side=3.441115, score=1.315697
  N= 10: side=3.441115, score=1.184127
  N= 11: side=3.441115, score=1.076479
  N= 12: side=3.441115, score=0.986773
  N= 13: side=4.147386, score=1.323139
  N= 14: side=4.147386, score=1.228629
  N= 15: side=4.147386, score=1.146721
  N= 16: side=4.147386, score=1.075051
  N= 17: side=4.147386, score=1.011812
  N= 18: side=4.147386, score=0.955601
  N= 19: side=4.147386, score=0.905306
  N= 20: side=4.147386, score=0.860041

Per-N scores (last 10):
  N=191: side=13.034470, score=0.889515
  N=192: side=13.034470, score=0.884882
  N=193: side=13.034470, score=0.880297
  N=194: side=13.034470,

In [5]:
# Spot-check a handful of configurations for overlapping trees
# (only these six N values are examined, not the whole submission)
print("Checking for overlaps (sample)...")
overlap_found = False
for n in (1, 10, 50, 100, 150, 200):
    trees = load_trees_for_n(df, n)
    overlapping = has_overlap(trees)
    overlap_found = overlap_found or overlapping
    print(f"  N={n}: OVERLAP DETECTED!" if overlapping else f"  N={n}: OK")

if not overlap_found:
    print("\nNo overlaps detected in sampled configurations.")

Checking for overlaps (sample)...
  N=1: OK
  N=10: OK
  N=50: OK
  N=100: OK
  N=150: OK
  N=200: OK

No overlaps detected in sampled configurations.


In [6]:
# Save the submission file
submission_path = '/home/submission/submission.csv'
# to_csv does not create missing parent directories, so make sure the target folder exists
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
df.to_csv(submission_path, index=False)
print(f"Saved submission to {submission_path}")

# Also save a copy in the experiment folder
# (relative path: it resolves against the current working directory, so the
# message reports the location the file was actually written to)
copy_path = 'submission.csv'
df.to_csv(copy_path, index=False)
print(f"Saved copy to {os.path.abspath(copy_path)}")

print(f"\n=== BASELINE SCORE: {total_score:.6f} ===")

Saved submission to /home/submission/submission.csv
Saved copy to experiments/001_baseline/submission.csv

=== BASELINE SCORE: 173.652299 ===


In [None]:
# Load the pre-optimized CSV from snapshots
# NOTE(review): the path points inside the experiment folder; the directory listing
# printed earlier in this notebook showed only baseline.ipynb there, so confirm the
# file has been copied in before running this cell (read_csv fails otherwise).
preopt_path = '/home/code/experiments/001_baseline/santa-2025.csv'
df_preopt = pd.read_csv(preopt_path)
print(f"Loaded pre-optimized CSV with {len(df_preopt)} rows")
print(df_preopt.head())

# Calculate the score
# Uses the same calculate_total_score as the baseline, so the two totals are comparable.
print("\nCalculating score for pre-optimized submission...")
total_score_preopt, per_n_scores_preopt = calculate_total_score(df_preopt)
print(f"\n=== PRE-OPTIMIZED SCORE: {total_score_preopt:.6f} ===")

In [None]:
# Also check the best_ensemble.csv
# NOTE(review): nothing visible in this notebook creates best_ensemble.csv; confirm it
# exists in the experiment folder before running this cell (read_csv fails otherwise).
ensemble_path = '/home/code/experiments/001_baseline/best_ensemble.csv'
df_ensemble = pd.read_csv(ensemble_path)
print(f"Loaded best_ensemble.csv with {len(df_ensemble)} rows")

# Score it with the same calculate_total_score used for the other submissions.
print("\nCalculating score for best_ensemble submission...")
total_score_ensemble, per_n_scores_ensemble = calculate_total_score(df_ensemble)
print(f"\n=== BEST ENSEMBLE SCORE: {total_score_ensemble:.6f} ===")