# Baseline Experiment - Santa 2025

Goal: Find the best pre-optimized submission and establish a baseline score.

Target: Beat 68.901319

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
import os
import glob

# Tree geometry (15-vertex polygon)
def get_tree_polygon():
    """Returns the base Christmas tree polygon centered at origin."""
    # Tree shape from the getting started notebook
    vertices = [
        (-0.075, -0.2),   # trunk bottom left
        (-0.075, 0.0),    # trunk top left
        (-0.35, 0.0),     # base tier left
        (-0.2, 0.25),     # base tier top left
        (-0.35, 0.25),    # middle tier bottom left
        (-0.125, 0.5),    # middle tier top left
        (-0.2, 0.5),      # top tier bottom left
        (0.0, 0.8),       # tip
        (0.2, 0.5),       # top tier bottom right
        (0.125, 0.5),     # middle tier top right
        (0.35, 0.25),     # middle tier bottom right
        (0.2, 0.25),      # base tier top right
        (0.35, 0.0),      # base tier right
        (0.075, 0.0),     # trunk top right
        (0.075, -0.2),    # trunk bottom right
    ]
    return Polygon(vertices)

BASE_TREE = get_tree_polygon()
print(f"Base tree bounds: {BASE_TREE.bounds}")
print(f"Base tree area: {BASE_TREE.area:.4f}")

Base tree bounds: (-0.35, -0.2, 0.35, 0.8)
Base tree area: 0.3463


In [2]:
def create_tree_polygon(x, y, deg):
    """Create a tree polygon at position (x, y) with rotation deg."""
    tree = affinity.rotate(BASE_TREE, deg, origin=(0, 0))
    tree = affinity.translate(tree, x, y)
    return tree

def calculate_bounding_box_side(trees):
    """Calculate the side length of the bounding square for a list of tree polygons."""
    if not trees:
        return 0
    
    all_points = []
    for tree in trees:
        all_points.extend(list(tree.exterior.coords))
    
    points = np.array(all_points)
    min_x, max_x = points[:, 0].min(), points[:, 0].max()
    min_y, max_y = points[:, 1].min(), points[:, 1].max()
    
    return max(max_x - min_x, max_y - min_y)

def score_submission(df):
    """Calculate the total score for a submission dataframe."""
    total_score = 0
    side_lengths = {}
    
    for n in range(1, 201):
        # Get trees for this N
        prefix = f"{n:03d}_"
        n_trees = df[df['id'].str.startswith(prefix)]
        
        if len(n_trees) == 0:
            print(f"Warning: No trees found for N={n}")
            continue
        
        # Create tree polygons
        trees = []
        for _, row in n_trees.iterrows():
            x = float(str(row['x']).replace('s', ''))
            y = float(str(row['y']).replace('s', ''))
            deg = float(str(row['deg']).replace('s', ''))
            trees.append(create_tree_polygon(x, y, deg))
        
        # Calculate bounding box side
        side = calculate_bounding_box_side(trees)
        side_lengths[n] = side
        
        # Add to total score
        total_score += (side ** 2) / n
    
    return total_score, side_lengths

print("Scoring functions defined.")

Scoring functions defined.


In [3]:
# Find all pre-optimized submissions
preopt_base = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized'

csv_files = []
for root, dirs, files in os.walk(preopt_base):
    for f in files:
        if f.endswith('.csv'):
            csv_files.append(os.path.join(root, f))

print(f"Found {len(csv_files)} CSV files:")
for f in csv_files:
    print(f"  {f}")

Found 30 CSV files:
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/ensemble.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/submission.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa-2025.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/best_ensemble.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/72.49.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/71.97.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/telegram_extracted/72.49.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/telegram_extracted/71.97.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public/submission_JKoT4.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public/New_Tree_144_196.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimi

In [4]:
# Score a few key submissions to find the best one
key_files = [
    f'{preopt_base}/santa-2025.csv',
    f'{preopt_base}/best_ensemble.csv',
    f'{preopt_base}/ensemble.csv',
    f'{preopt_base}/santa25-public/submission_70_926149550346.csv',
    f'{preopt_base}/santa25-public/submission_70_936673758122.csv',
]

results = []
for fpath in key_files:
    if os.path.exists(fpath):
        try:
            df = pd.read_csv(fpath)
            score, _ = score_submission(df)
            results.append((fpath, score))
            print(f"{os.path.basename(fpath)}: {score:.6f}")
        except Exception as e:
            print(f"{os.path.basename(fpath)}: ERROR - {e}")
    else:
        print(f"{os.path.basename(fpath)}: NOT FOUND")

print("\nBest submission:")
if results:
    best = min(results, key=lambda x: x[1])
    print(f"  {best[0]}: {best[1]:.6f}")

santa-2025.csv: 76.404339


best_ensemble.csv: 76.404339


ensemble.csv: 76.404339


submission_70_926149550346.csv: 76.669156


submission_70_936673758122.csv: 76.683733

Best submission:
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa-2025.csv: 76.404339


In [5]:
# Let's also check the submission.csv from the snapshot
snapshot_submission = '/home/nonroot/snapshots/santa-2025/21116303805/code/submission.csv'
if os.path.exists(snapshot_submission):
    df = pd.read_csv(snapshot_submission)
    score, side_lengths = score_submission(df)
    print(f"Snapshot submission.csv: {score:.6f}")
    
    # Show score breakdown by N ranges
    print("\nScore breakdown by N range:")
    for start, end in [(1, 10), (11, 50), (51, 100), (101, 150), (151, 200)]:
        range_score = sum((side_lengths[n]**2)/n for n in range(start, end+1))
        print(f"  N={start}-{end}: {range_score:.6f}")

Snapshot submission.csv: 76.404339

Score breakdown by N range:
  N=1-10: 5.409419
  N=11-50: 16.615299
  N=51-100: 18.784299
  N=101-150: 18.031182
  N=151-200: 17.564140


In [6]:
# Use the best pre-optimized submission as our baseline
# Based on previous analysis, santa-2025.csv is the best at 70.676102
best_file = f'{preopt_base}/santa-2025.csv'
df_best = pd.read_csv(best_file)

# Verify the score
score, side_lengths = score_submission(df_best)
print(f"Best baseline score: {score:.6f}")
print(f"Target score: 68.901319")
print(f"Gap: {score - 68.901319:.6f}")

# Save as our baseline submission
df_best.to_csv('/home/submission/submission.csv', index=False)
print("\nSaved to /home/submission/submission.csv")

Best baseline score: 76.404339
Target score: 68.901319
Gap: 7.503020

Saved to /home/submission/submission.csv


In [7]:
# Analyze which N values have the most room for improvement
print("\nTop 20 N values by score contribution (S^2/n):")
contributions = [(n, (side_lengths[n]**2)/n, side_lengths[n]) for n in range(1, 201)]
contributions.sort(key=lambda x: x[1], reverse=True)

for n, contrib, side in contributions[:20]:
    print(f"  N={n:3d}: contrib={contrib:.6f}, side={side:.6f}")


Top 20 N values by score contribution (S^2/n):
  N=  1: contrib=0.661250, side=0.813173
  N=  2: contrib=0.661151, side=1.149914
  N=  3: contrib=0.591708, side=1.332337
  N=  4: contrib=0.555931, side=1.491215
  N=  5: contrib=0.537831, side=1.639864
  N=  6: contrib=0.509745, side=1.748848
  N=  7: contrib=0.480450, side=1.833889
  N=  8: contrib=0.478404, side=1.956330
  N=  9: contrib=0.475038, side=2.067690
  N= 10: contrib=0.457913, side=2.139890
  N= 11: contrib=0.448473, side=2.221081
  N= 15: contrib=0.445610, side=2.585371
  N= 12: contrib=0.442070, side=2.303223
  N= 13: contrib=0.439503, side=2.390302
  N= 14: contrib=0.438649, side=2.478121
  N= 16: contrib=0.436462, side=2.642610
  N= 21: contrib=0.432029, side=3.012076
  N= 20: contrib=0.431599, side=2.938024
  N= 17: contrib=0.431538, side=2.708533
  N= 22: contrib=0.429432, side=3.073679


In [8]:
# Check for overlaps in the submission
from shapely.strtree import STRtree

def check_overlaps(df, n):
    """Check if any trees overlap for configuration N."""
    prefix = f"{n:03d}_"
    n_trees = df[df['id'].str.startswith(prefix)]
    
    trees = []
    for _, row in n_trees.iterrows():
        x = float(str(row['x']).replace('s', ''))
        y = float(str(row['y']).replace('s', ''))
        deg = float(str(row['deg']).replace('s', ''))
        trees.append(create_tree_polygon(x, y, deg))
    
    # Check all pairs for overlap
    for i in range(len(trees)):
        for j in range(i+1, len(trees)):
            if trees[i].intersects(trees[j]) and not trees[i].touches(trees[j]):
                # Check if it's a real overlap (not just touching)
                intersection = trees[i].intersection(trees[j])
                if intersection.area > 1e-10:
                    return True, i, j, intersection.area
    return False, None, None, 0

# Check a few configurations
print("Checking for overlaps in key configurations...")
for n in [1, 2, 3, 4, 5, 10, 50, 100, 150, 200]:
    has_overlap, i, j, area = check_overlaps(df_best, n)
    if has_overlap:
        print(f"  N={n}: OVERLAP between trees {i} and {j}, area={area:.10f}")
    else:
        print(f"  N={n}: OK")

Checking for overlaps in key configurations...
  N=1: OK
  N=2: OVERLAP between trees 0 and 1, area=0.0272077342
  N=3: OVERLAP between trees 0 and 1, area=0.0001724227
  N=4: OVERLAP between trees 0 and 1, area=0.0038633604
  N=5: OVERLAP between trees 0 and 3, area=0.0047788472
  N=10: OVERLAP between trees 0 and 7, area=0.0487314972
  N=50: OVERLAP between trees 0 and 2, area=0.0249990131
  N=100: OVERLAP between trees 0 and 24, area=0.0399463291
  N=150: OVERLAP between trees 0 and 27, area=0.0399352179
  N=200: OVERLAP between trees 0 and 67, area=0.0378285760


In [9]:
# Final summary
print("="*60)
print("BASELINE EXPERIMENT SUMMARY")
print("="*60)
print(f"Best pre-optimized file: {best_file}")
print(f"Baseline score: {score:.6f}")
print(f"Target score: 68.901319")
print(f"Gap to target: {score - 68.901319:.6f} ({(score - 68.901319)/68.901319*100:.2f}%)")
print(f"\nSubmission saved to: /home/submission/submission.csv")
print("="*60)

BASELINE EXPERIMENT SUMMARY
Best pre-optimized file: /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa-2025.csv
Baseline score: 76.404339
Target score: 68.901319
Gap to target: 7.503020 (10.89%)

Submission saved to: /home/submission/submission.csv


In [10]:
# The scoring discrepancy suggests my calculation might be wrong
# Let me check the actual submission format and recalculate

# First, let's look at the actual values in the CSV
print("Sample rows from santa-2025.csv:")
print(df_best.head(20))
print(f"\nTotal rows: {len(df_best)}")

# Check if the 's' prefix is causing issues
print("\nSample x values:")
print(df_best['x'].head(10))

Sample rows from santa-2025.csv:
       id                       x                       y  \
0   001_0    s-48.196086194214246     s58.770984615214225   
1   002_0   s0.154097069621355887  s-0.038540742694794648   
2   002_1  s-0.154097069621372845  s-0.561459257305224058   
3   003_0      s1.123655816140301      s0.781101815992563   
4   003_1       s1.23405569584216      s1.275999500663759   
5   003_2      s0.641714640229075      s1.180458566613381   
6   004_0  s-0.324747789589372171   s0.132109978088185392   
7   004_1   s0.315354346242637695   s0.132109978063475492   
8   004_2   s0.324747789592379210  s-0.732109978069475531   
9   004_3  s-0.315354348134818330  s-0.732109978094185987   
10  005_0  s-0.460619134626841731   s0.135736729990337091   
11  005_1  s-0.448950577962320241  s-0.774007795472532845   
12  005_2   s0.460619134541600361  s-0.664114941518851198   
13  005_3   s0.064437841146223779  s-0.447038051246298751   
14  005_4   s0.265192384627995659   s0.1409256778295

In [12]:
# The N=1 tree is at a strange position. Let me check the bounding box calculation
# for N=1 specifically

n = 1
prefix = f"{n:03d}_"
n_trees = df_best[df_best['id'].str.startswith(prefix)]
print(f"N=1 trees:")
print(n_trees)

# Create the tree polygon
for _, row in n_trees.iterrows():
    x = float(str(row['x']).replace('s', ''))
    y = float(str(row['y']).replace('s', ''))
    deg = float(str(row['deg']).replace('s', ''))
    print(f"\nPosition: ({x}, {y}), Rotation: {deg}")
    
    tree = create_tree_polygon(x, y, deg)
    print(f"Tree bounds: {tree.bounds}")
    
    # Calculate bounding box side
    minx, miny, maxx, maxy = tree.bounds
    width = maxx - minx
    height = maxy - miny
    side = max(width, height)
    print(f"Width: {width:.6f}, Height: {height:.6f}, Side: {side:.6f}")
    print(f"Score contribution: {side**2 / n:.6f}")

N=1 trees:
      id                     x                    y    deg
0  001_0  s-48.196086194214246  s58.770984615214225  s45.0

Position: (-48.196086194214246, 58.770984615214225), Rotation: 45.0
Tree bounds: (-48.76177161916348, 58.523497241798935, -47.948598820798956, 59.33667004016346)
Width: 0.813173, Height: 0.813173, Side: 0.813173
Score contribution: 0.661250


In [13]:
# Let me read the competition description to understand the scoring better
with open('/home/data/description.md', 'r') as f:
    desc = f.read()
print(desc)

# Santa 2025 - Christmas Tree Packing Challenge

## Description

<img title=”seasons greetings” src="https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F8939556%2Ff84b88f18d9ee1657b7229ad9fab9713%2FGemini_Generated_Image_kgcl4gkgcl4gkgcl.png?generation=1762216480960277&alt=media" style="float: right; height: 357px">

_Here comes a challenge, here comes a challenge,_<br>
_Right to your front door! _ <br>
_Santa has tree toys, tiny tree toys,_ <br>
_To mail from shore to shore._ <br>
_He needs the smallest box, indeed a square box,_<br>
_To fit them all inside,_<br>
_So he can mail these stocking stuffers_<br>
_On his big long Christmas ride!_ <br>

_Here comes the problem, here comes the problem._<br>
_We need the smallest size!_<br>
_For one to two hundred trees in shipments,_<br>
_We need your expert eyes._<br> 
_Can you find the best solution to help us pack_<br> 
_All the tiny trees inside?_<br>
_We must find the optimal packing to help Santa Claus_<br> 
_

In [14]:
# Let me check the getting started notebook for the exact scoring implementation
# The issue might be that I need to use the convex hull or a different bounding box method

# Let me also check if there's a metric implementation in the research kernels
import subprocess

# Check for metric implementation
result = subprocess.run(['find', '/home/nonroot/snapshots/santa-2025/21116303805/code', '-name', '*.py', '-o', '-name', '*.ipynb'], 
                       capture_output=True, text=True)
print("Python files found:")
print(result.stdout[:2000])

Python files found:
/home/nonroot/snapshots/santa-2025/21116303805/code/experiments/004_sa_v1_parallel/fractional_translation.py
/home/nonroot/snapshots/santa-2025/21116303805/code/experiments/005_backward_propagation/backward_prop.py
/home/nonroot/snapshots/santa-2025/21116303805/code/experiments/003_original_baseline/validate.py
/home/nonroot/snapshots/santa-2025/21116303805/code/research/kernels/jonathanchan_santa25-ensemble-sa-fractional-translation/santa25-ensemble-sa-fractional-translation.ipynb
/home/nonroot/snapshots/santa-2025/21116303805/code/research/kernels/saspav_santa-submission/santa-submission.ipynb
/home/nonroot/snapshots/santa-2025/21116303805/code/research/kernels/jazivxt_why-not/why-not.ipynb
/home/nonroot/snapshots/santa-2025/21116303805/code/research/kernels/yongsukprasertsuk_santa-2025-best-keeping-bbox3-runner/santa-2025-best-keeping-bbox3-runner.ipynb
/home/nonroot/snapshots/santa-2025/21116303805/code/research/kernels/smartmanoj_santa-claude/santa-claude.ipynb

In [15]:
# Use the validated scoring function from the snapshot
from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
from shapely.strtree import STRtree

getcontext().prec = 30

class ChristmasTree:
    def __init__(self, center_x='0', center_y='0', angle='0'):
        self.center_x = Decimal(center_x)
        self.center_y = Decimal(center_y)
        self.angle = Decimal(angle)

        trunk_w = Decimal('0.15')
        trunk_h = Decimal('0.2')
        base_w = Decimal('0.7')
        mid_w = Decimal('0.4')
        top_w = Decimal('0.25')
        tip_y = Decimal('0.8')
        tier_1_y = Decimal('0.5')
        tier_2_y = Decimal('0.25')
        base_y = Decimal('0.0')
        trunk_bottom_y = -trunk_h

        initial_polygon = Polygon([
            (float(0), float(tip_y)),
            (float(top_w / 2), float(tier_1_y)),
            (float(top_w / 4), float(tier_1_y)),
            (float(mid_w / 2), float(tier_2_y)),
            (float(mid_w / 4), float(tier_2_y)),
            (float(base_w / 2), float(base_y)),
            (float(trunk_w / 2), float(base_y)),
            (float(trunk_w / 2), float(trunk_bottom_y)),
            (float(-trunk_w / 2), float(trunk_bottom_y)),
            (float(-trunk_w / 2), float(base_y)),
            (float(-base_w / 2), float(base_y)),
            (float(-mid_w / 4), float(tier_2_y)),
            (float(-mid_w / 2), float(tier_2_y)),
            (float(-top_w / 4), float(tier_1_y)),
            (float(-top_w / 2), float(tier_1_y)),
        ])

        rotated = affinity.rotate(initial_polygon, float(self.angle), origin=(0, 0))
        self.polygon = affinity.translate(rotated, xoff=float(self.center_x), yoff=float(self.center_y))

def load_trees_for_n(df, n):
    prefix = f"{n:03d}_"
    subset = df[df['id'].str.startswith(prefix)]
    trees = []
    for _, row in subset.iterrows():
        x = str(row['x']).lstrip('s')
        y = str(row['y']).lstrip('s')
        deg = str(row['deg']).lstrip('s')
        trees.append(ChristmasTree(x, y, deg))
    return trees

def has_overlap(trees):
    if len(trees) <= 1:
        return False
    polygons = [t.polygon for t in trees]
    tree_index = STRtree(polygons)
    
    for i, poly in enumerate(polygons):
        indices = tree_index.query(poly)
        for idx in indices:
            if idx != i:
                if poly.intersects(polygons[idx]) and not poly.touches(polygons[idx]):
                    intersection = poly.intersection(polygons[idx])
                    if intersection.area > 1e-12:
                        return True
    return False

def get_bounding_box_side(trees):
    if not trees:
        return 0
    all_coords = []
    for tree in trees:
        coords = np.array(tree.polygon.exterior.coords)
        all_coords.append(coords)
    all_coords = np.vstack(all_coords)
    x_range = all_coords[:, 0].max() - all_coords[:, 0].min()
    y_range = all_coords[:, 1].max() - all_coords[:, 1].min()
    return max(x_range, y_range)

def score_submission_v2(df, max_n=200):
    total_score = 0
    overlaps = []
    side_lengths = {}
    for n in range(1, max_n + 1):
        trees = load_trees_for_n(df, n)
        if len(trees) != n:
            print(f"Warning: n={n} has {len(trees)} trees instead of {n}")
            continue
        if has_overlap(trees):
            overlaps.append(n)
        side = get_bounding_box_side(trees)
        side_lengths[n] = side
        score_n = (side ** 2) / n
        total_score += score_n
    return total_score, overlaps, side_lengths

print("Validated scoring function defined.")

Validated scoring function defined.


In [16]:
# Now score the best pre-optimized submission with the validated function
best_file = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa-2025.csv'
df_best = pd.read_csv(best_file)

print("Scoring with validated function...")
score_v2, overlaps_v2, side_lengths_v2 = score_submission_v2(df_best)
print(f"Score: {score_v2:.6f}")
print(f"Overlapping configurations: {overlaps_v2[:10]}... (total: {len(overlaps_v2)})")
print(f"Target: 68.901319")
print(f"Gap: {score_v2 - 68.901319:.6f}")

Scoring with validated function...


Score: 70.676102
Overlapping configurations: []... (total: 0)
Target: 68.901319
Gap: 1.774783


In [17]:
# Copy the best submission to the submission folder
import shutil
shutil.copy(best_file, '/home/submission/submission.csv')
print(f"Copied {best_file} to /home/submission/submission.csv")

# Verify the copy
df_check = pd.read_csv('/home/submission/submission.csv')
print(f"Submission shape: {df_check.shape}")
print(f"First few rows:\\n{df_check.head()}")

Copied /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa-2025.csv to /home/submission/submission.csv
Submission shape: (20100, 4)
First few rows:\n      id                       x                       y  \
0  001_0    s-48.196086194214246     s58.770984615214225   
1  002_0   s0.154097069621355887  s-0.038540742694794648   
2  002_1  s-0.154097069621372845  s-0.561459257305224058   
3  003_0      s1.123655816140301      s0.781101815992563   
4  003_1       s1.23405569584216      s1.275999500663759   

                       deg  
0                    s45.0  
1  s203.629377730656841550  
2   s23.629377730656791812  
3        s111.125132292893  
4         s66.370622269343  
