# Santa 2025 - Quick EDA

In [1]:
import pandas as pd
import numpy as np

# Load the sample submission into `df`; the cells below read this frame.
df = pd.read_csv('/home/data/sample_submission.csv')
# Report the frame's dimensions and column names before previewing it.
print(f'Shape: {df.shape}')
print(f'Columns: {df.columns.tolist()}')
# Last expression of the cell: renders the first 20 rows as the cell output.
df.head(20)

Shape: (20100, 4)
Columns: ['id', 'x', 'y', 'deg']


Unnamed: 0,id,x,y,deg
0,001_0,s0.0,s0.0,s90.0
1,002_0,s0.0,s0.0,s90.0
2,002_1,s0.202736,s-0.511271,s90.0
3,003_0,s0.0,s0.0,s90.0
4,003_1,s0.202736,s-0.511271,s90.0
5,003_2,s0.5206,s0.177413,s180.0
6,004_0,s0.0,s0.0,s90.0
7,004_1,s0.202736,s-0.511271,s90.0
8,004_2,s0.5206,s0.177413,s180.0
9,004_3,s-0.818657,s-0.228694,s180.0


In [2]:
# Row-count sanity check: compare the file length with 1 + 2 + ... + 200.
total_trees = sum(range(1, 201))
print(f'Expected rows: {total_trees}')
print(f'Actual rows: {len(df)}')

# Ids have the form '<n>_<tree index>'. Split each id once and keep both
# pieces as integer columns.
id_fields = [identifier.split('_') for identifier in df['id']]
df['n'] = [int(fields[0]) for fields in id_fields]
df['tree_idx'] = [int(fields[1]) for fields in id_fields]

# The numeric columns are stored as strings containing an 's' marker
# (e.g. 's0.202736'); remove it and convert to float, one parsed column
# per raw column.
for raw_col, parsed_col in {'x': 'x_val', 'y': 'y_val', 'deg': 'deg_val'}.items():
    df[parsed_col] = df[raw_col].str.replace('s', '').astype(float)

# Summarise the value ranges of the parsed columns.
print(f"\nN range: {df['n'].min()} to {df['n'].max()}")
print(f"X range: {df['x_val'].min():.4f} to {df['x_val'].max():.4f}")
print(f"Y range: {df['y_val'].min():.4f} to {df['y_val'].max():.4f}")
print(f"Deg range: {df['deg_val'].min():.1f} to {df['deg_val'].max():.1f}")

Expected rows: 20100
Actual rows: 20100

N range: 1 to 200
X range: -5.7984 to 5.2893
Y range: -5.6357 to 5.7987
Deg range: 0.0 to 270.0


In [3]:
# Calculate score for sample submission
from shapely.geometry import Polygon
from shapely import affinity
from shapely.ops import unary_union

# Template outline of a single tree, unrotated and anchored at the origin.
# TX[i], TY[i] are the coordinates of the i-th vertex, in polygon order.
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

# The template never changes between calls, so build it once here instead of
# once per tree. Re-run this cell after editing TX/TY to refresh it.
_BASE_TREE = Polygon(zip(TX, TY))


def get_tree_polygon(x, y, deg):
    """Return the tree outline rotated by `deg` and moved to (x, y).

    The template is first rotated about its own origin (0, 0) and only then
    translated, so (x, y) is where the template's origin ends up.

    Args:
        x: Offset applied along the x axis after rotation.
        y: Offset applied along the y axis after rotation.
        deg: Rotation angle passed to `affinity.rotate`, which interprets
            it in degrees by default.

    Returns:
        A new shapely geometry; the shared template is not modified.
    """
    rotated = affinity.rotate(_BASE_TREE, deg, origin=(0, 0))
    return affinity.translate(rotated, xoff=x, yoff=y)

# Calculate score for each N: all trees of puzzle n are placed, the side of
# the axis-aligned square covering them is measured, and the puzzle
# contributes side**2 / n. The total is the sum over n = 1..200.
TARGET_SCORE = 68.894234  # value reported below as the score to beat

# Group once up front instead of re-filtering the whole frame for every n.
trees_by_n = df.groupby('n')

# Fail early with a clear message rather than producing NaN (or an opaque
# error) for a puzzle that has no rows.
missing_n = sorted(set(range(1, 201)) - set(trees_by_n.groups))
if missing_n:
    raise ValueError(f"No rows found for n = {missing_n}")

scores = []
for n in range(1, 201):
    group = trees_by_n.get_group(n)
    # Zipping the three columns avoids the per-row Series that iterrows()
    # would build for every tree.
    polys = [
        get_tree_polygon(x, y, deg)
        for x, y, deg in zip(group['x_val'], group['y_val'], group['deg_val'])
    ]
    union = unary_union(polys)
    # bounds is (minx, miny, maxx, maxy); the square must cover the longer
    # of the two extents.
    bounds = union.bounds
    side = max(bounds[2] - bounds[0], bounds[3] - bounds[1])
    score_n = side**2 / n
    scores.append({'n': n, 'side': side, 'score': score_n})

scores_df = pd.DataFrame(scores)
total_score = scores_df['score'].sum()
print(f"Sample submission total score: {total_score:.6f}")
print(f"Target score to beat: {TARGET_SCORE:.6f}")
print("\nTop 10 worst scores (highest contribution):")
print(scores_df.nlargest(10, 'score')[['n', 'side', 'score']])

Sample submission total score: 173.652299
Target score to beat: 68.894234

Top 10 worst scores (highest contribution):
     n      side     score
7    8  3.441115  1.480159
12  13  4.147386  1.323139
8    9  3.441115  1.315697
13  14  4.147386  1.228629
6    7  2.901647  1.202794
9   10  3.441115  1.184127
14  15  4.147386  1.146721
10  11  3.441115  1.076479
15  16  4.147386  1.075051
3    4  2.039257  1.039642
