# Baseline Experiment: Evaluate Pre-optimized Submissions

This notebook:
1. Loads pre-optimized submissions from snapshots
2. Calculates scores for each
3. Selects the best one
4. Applies fix_direction optimization
5. Validates for overlaps
6. Creates submission

In [1]:
import numpy as np
import pandas as pd
from shapely.geometry import Polygon
from shapely.affinity import rotate, translate
from shapely.strtree import STRtree
from scipy.optimize import minimize_scalar
from scipy.spatial import ConvexHull
import os
import glob
from tqdm import tqdm

# Christmas tree polygon vertices (15 vertices)
TREE_VERTICES = np.array([
    (0.0, 0.8),      # Tip
    (0.125, 0.5),    # Right top tier
    (0.0625, 0.5),
    (0.2, 0.25),     # Right mid tier
    (0.1, 0.25),
    (0.35, 0.0),     # Right base
    (0.075, 0.0),    # Right trunk
    (0.075, -0.2),
    (-0.075, -0.2),  # Left trunk
    (-0.075, 0.0),
    (-0.35, 0.0),    # Left base
    (-0.1, 0.25),    # Left mid tier
    (-0.2, 0.25),
    (-0.0625, 0.5),  # Left top tier
    (-0.125, 0.5),
])

print("Tree vertices shape:", TREE_VERTICES.shape)

Tree vertices shape: (15, 2)


In [2]:
class ChristmasTree:
    """Represents a Christmas tree with position and rotation."""
    def __init__(self, x=0, y=0, deg=0):
        self.x = x
        self.y = y
        self.deg = deg
        self._polygon = None
    
    @property
    def polygon(self):
        if self._polygon is None:
            base = Polygon(TREE_VERTICES)
            rotated = rotate(base, self.deg, origin=(0, 0))
            self._polygon = translate(rotated, self.x, self.y)
        return self._polygon
    
    def invalidate(self):
        self._polygon = None

def load_submission(filepath):
    """Load a submission CSV file."""
    df = pd.read_csv(filepath)
    # Remove 's' prefix from values
    for col in ['x', 'y', 'deg']:
        df[col] = df[col].astype(str).str.replace('s', '', regex=False).astype(float)
    return df

def get_trees_for_n(df, n):
    """Get list of ChristmasTree objects for configuration n."""
    prefix = f"{n:03d}_"
    subset = df[df['id'].str.startswith(prefix)]
    trees = []
    for _, row in subset.iterrows():
        trees.append(ChristmasTree(row['x'], row['y'], row['deg']))
    return trees

def get_bounding_box_side(trees):
    """Calculate the side length of the square bounding box."""
    if not trees:
        return 0
    all_points = []
    for tree in trees:
        coords = np.array(tree.polygon.exterior.coords)
        all_points.append(coords)
    all_points = np.vstack(all_points)
    min_xy = all_points.min(axis=0)
    max_xy = all_points.max(axis=0)
    return max(max_xy - min_xy)

def calculate_total_score(df):
    """Calculate the total score for a submission."""
    total = 0
    for n in range(1, 201):
        trees = get_trees_for_n(df, n)
        side = get_bounding_box_side(trees)
        total += side**2 / n
    return total

print("Functions defined.")

Functions defined.


In [3]:
# Find all pre-optimized submissions
preopt_dir = "/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized"

# List all CSV files
csv_files = []
for root, dirs, files in os.walk(preopt_dir):
    for f in files:
        if f.endswith('.csv'):
            csv_files.append(os.path.join(root, f))

print(f"Found {len(csv_files)} CSV files")
for f in csv_files[:10]:
    print(f"  {f}")

Found 30 CSV files
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/ensemble.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/submission.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa-2025.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/best_ensemble.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/72.49.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/71.97.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/telegram_extracted/72.49.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/telegram/telegram_extracted/71.97.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public/submission_JKoT4.csv
  /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/santa25-public/New_Tree_144_196.csv


In [4]:
# Evaluate each submission and find the best one
results = []

for filepath in tqdm(csv_files):
    try:
        df = load_submission(filepath)
        # Check if it has the right number of rows (20100 = 1+2+...+200)
        if len(df) != 20100:
            print(f"Skipping {filepath}: wrong row count ({len(df)})")
            continue
        score = calculate_total_score(df)
        results.append({'filepath': filepath, 'score': score})
        print(f"{os.path.basename(filepath)}: {score:.6f}")
    except Exception as e:
        print(f"Error with {filepath}: {e}")

results_df = pd.DataFrame(results)
results_df = results_df.sort_values('score')
print("\nTop 5 submissions:")
print(results_df.head())

  0%|          | 0/30 [00:00<?, ?it/s]

  3%|▎         | 1/30 [00:02<01:24,  2.93s/it]

ensemble.csv: 70.676102


  7%|▋         | 2/30 [00:05<01:21,  2.93s/it]

submission.csv: 70.676501


 10%|█         | 3/30 [00:08<01:18,  2.92s/it]

santa-2025.csv: 70.676102


 13%|█▎        | 4/30 [00:11<01:15,  2.92s/it]

best_ensemble.csv: 70.676102


 17%|█▋        | 5/30 [00:14<01:12,  2.92s/it]

72.49.csv: 72.495739


 20%|██        | 6/30 [00:17<01:10,  2.92s/it]

71.97.csv: 71.972027


 23%|██▎       | 7/30 [00:20<01:07,  2.92s/it]

72.49.csv: 72.495739


 27%|██▋       | 8/30 [00:23<01:04,  2.92s/it]

71.97.csv: 71.972027


 30%|███       | 9/30 [00:26<01:01,  2.92s/it]

submission_JKoT4.csv: 72.489504


 33%|███▎      | 10/30 [00:29<00:58,  2.92s/it]

New_Tree_144_196.csv: 72.927920


 37%|███▋      | 11/30 [00:32<00:55,  2.92s/it]

submission_JKoT3.csv: 72.489488


 40%|████      | 12/30 [00:35<00:52,  2.91s/it]

santa2025_ver2_v61.csv: 72.951925


 43%|████▎     | 13/30 [00:37<00:49,  2.91s/it]

submission_JKoT2.csv: 72.489348


 47%|████▋     | 14/30 [00:40<00:46,  2.91s/it]

santa2025_ver2_v67.csv: 72.938567


 50%|█████     | 15/30 [00:43<00:43,  2.91s/it]

santa2025_ver2_v76.csv: 72.826444


 53%|█████▎    | 16/30 [00:46<00:40,  2.92s/it]

submission_70_936673758122.csv: 70.936674


 57%|█████▋    | 17/30 [00:49<00:37,  2.92s/it]

santa2025_ver2_v65.csv: 72.935294


 60%|██████    | 18/30 [00:52<00:34,  2.92s/it]

submission_70_926149550346.csv: 70.926150


 63%|██████▎   | 19/30 [00:55<00:32,  2.91s/it]

santa2025_ver2_v66.csv: 72.938599


 67%|██████▋   | 20/30 [00:58<00:29,  2.91s/it]

santa2025_ver2_v63.csv: 72.947427


 70%|███████   | 21/30 [01:01<00:26,  2.91s/it]

santa2025_ver2_v69.csv: 72.850110


 73%|███████▎  | 22/30 [01:04<00:23,  2.91s/it]

submission_JKoT1.csv: 72.489483


 77%|███████▋  | 23/30 [01:07<00:20,  2.91s/it]

submission_opt1.csv: 70.990692


 80%|████████  | 24/30 [01:09<00:17,  2.92s/it]

santa2025_ver2_v68.csv: 72.939233


 83%|████████▎ | 25/30 [01:12<00:14,  2.92s/it]

santa-2025.csv: 70.676102


 87%|████████▋ | 26/30 [01:15<00:11,  2.92s/it]

submission.csv: 70.676501


 90%|█████████ | 27/30 [01:18<00:08,  2.92s/it]

submission (77).csv: 72.135010


 93%|█████████▎| 28/30 [01:21<00:05,  2.92s/it]

submission.csv: 72.935294


 97%|█████████▋| 29/30 [01:24<00:02,  2.93s/it]

submission_sa.csv: 72.935294


100%|██████████| 30/30 [01:27<00:00,  2.92s/it]

100%|██████████| 30/30 [01:27<00:00,  2.92s/it]

submission_best.csv: 70.926150

Top 5 submissions:
                                             filepath      score
0   /home/nonroot/snapshots/santa-2025/21116303805...  70.676102
2   /home/nonroot/snapshots/santa-2025/21116303805...  70.676102
3   /home/nonroot/snapshots/santa-2025/21116303805...  70.676102
24  /home/nonroot/snapshots/santa-2025/21116303805...  70.676102
25  /home/nonroot/snapshots/santa-2025/21116303805...  70.676501





In [5]:
# Load the best submission
best_filepath = results_df.iloc[0]['filepath']
best_score = results_df.iloc[0]['score']
print(f"Best submission: {best_filepath}")
print(f"Best score: {best_score:.6f}")

best_df = load_submission(best_filepath)

Best submission: /home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/ensemble.csv
Best score: 70.676102


In [6]:
def has_overlap(trees):
    """Check if any trees overlap."""
    if len(trees) <= 1:
        return False
    polygons = [t.polygon for t in trees]
    tree_index = STRtree(polygons)
    
    for i, poly in enumerate(polygons):
        candidates = tree_index.query(poly)
        for j in candidates:
            if i < j:  # Only check each pair once
                if poly.intersects(polygons[j]) and not poly.touches(polygons[j]):
                    # Check if intersection is more than just touching
                    intersection = poly.intersection(polygons[j])
                    if intersection.area > 1e-10:
                        return True
    return False

# Validate the best submission for overlaps
print("Validating for overlaps...")
overlap_count = 0
for n in tqdm(range(1, 201)):
    trees = get_trees_for_n(best_df, n)
    if has_overlap(trees):
        overlap_count += 1
        print(f"  Overlap found in n={n}")

print(f"\nTotal configurations with overlaps: {overlap_count}")

Validating for overlaps...


  0%|          | 0/200 [00:00<?, ?it/s]

 12%|█▏        | 24/200 [00:00<00:00, 227.12it/s]

 24%|██▎       | 47/200 [00:00<00:00, 159.60it/s]

 32%|███▎      | 65/200 [00:00<00:01, 127.24it/s]

 40%|███▉      | 79/200 [00:00<00:01, 108.45it/s]

 46%|████▌     | 91/200 [00:00<00:01, 94.93it/s] 

 50%|█████     | 101/200 [00:00<00:01, 85.24it/s]

 55%|█████▌    | 110/200 [00:01<00:01, 77.16it/s]

 59%|█████▉    | 118/200 [00:01<00:01, 69.96it/s]

 63%|██████▎   | 126/200 [00:01<00:01, 64.44it/s]

 66%|██████▋   | 133/200 [00:01<00:01, 59.99it/s]

 70%|██████▉   | 139/200 [00:01<00:01, 56.82it/s]

 72%|███████▎  | 145/200 [00:01<00:01, 53.37it/s]

 76%|███████▌  | 151/200 [00:01<00:00, 50.64it/s]

 78%|███████▊  | 157/200 [00:02<00:00, 48.16it/s]

 81%|████████  | 162/200 [00:02<00:00, 46.18it/s]

 84%|████████▎ | 167/200 [00:02<00:00, 44.08it/s]

 86%|████████▌ | 172/200 [00:02<00:00, 42.85it/s]

 88%|████████▊ | 177/200 [00:02<00:00, 41.53it/s]

 91%|█████████ | 182/200 [00:02<00:00, 40.01it/s]

 93%|█████████▎| 186/200 [00:02<00:00, 38.56it/s]

 95%|█████████▌| 190/200 [00:02<00:00, 37.76it/s]

 97%|█████████▋| 194/200 [00:03<00:00, 37.22it/s]

 99%|█████████▉| 198/200 [00:03<00:00, 36.63it/s]

100%|██████████| 200/200 [00:03<00:00, 61.52it/s]


Total configurations with overlaps: 0





In [7]:
def optimize_rotation_for_config(trees):
    """Optimize global rotation angle to minimize bounding box."""
    if len(trees) <= 1:
        return 0, get_bounding_box_side(trees)
    
    # Get all points from all trees
    all_points = []
    for tree in trees:
        coords = np.array(tree.polygon.exterior.coords)
        all_points.append(coords)
    all_points = np.vstack(all_points)
    
    # Get convex hull for efficiency
    try:
        hull = ConvexHull(all_points)
        hull_points = all_points[hull.vertices]
    except:
        hull_points = all_points
    
    def bbox_at_angle(angle_deg):
        angle_rad = np.radians(angle_deg)
        c, s = np.cos(angle_rad), np.sin(angle_rad)
        rot_matrix = np.array([[c, s], [-s, c]])
        rotated = hull_points.dot(rot_matrix.T)
        min_xy = rotated.min(axis=0)
        max_xy = rotated.max(axis=0)
        return max(max_xy - min_xy)
    
    # Search for optimal angle
    res = minimize_scalar(bbox_at_angle, bounds=(0.001, 89.999), method='bounded')
    return res.x, res.fun

# Test on a few configurations
print("Testing fix_direction on a few configurations:")
for n in [10, 50, 100, 150, 200]:
    trees = get_trees_for_n(best_df, n)
    original_side = get_bounding_box_side(trees)
    opt_angle, opt_side = optimize_rotation_for_config(trees)
    improvement = original_side - opt_side
    print(f"  n={n}: original={original_side:.6f}, optimized={opt_side:.6f}, improvement={improvement:.6f}, angle={opt_angle:.2f}")

Testing fix_direction on a few configurations:
  n=10: original=1.940696, optimized=1.940728, improvement=-0.000032, angle=0.00
  n=50: original=4.247076, optimized=4.247145, improvement=-0.000069, angle=90.00
  n=100: original=5.878188, optimized=5.878257, improvement=-0.000069, angle=0.00
  n=150: original=7.110543, optimized=7.110588, improvement=-0.000045, angle=0.00
  n=200: original=8.218653, optimized=8.218772, improvement=-0.000119, angle=0.00


In [8]:
def apply_rotation_to_config(df, n, angle_deg):
    """Apply global rotation to a configuration."""
    prefix = f"{n:03d}_"
    mask = df['id'].str.startswith(prefix)
    
    # Get current positions
    subset = df[mask].copy()
    
    # Rotate all positions around origin
    angle_rad = np.radians(angle_deg)
    c, s = np.cos(angle_rad), np.sin(angle_rad)
    
    new_x = c * subset['x'] + s * subset['y']
    new_y = -s * subset['x'] + c * subset['y']
    new_deg = (subset['deg'] + angle_deg) % 360
    
    df.loc[mask, 'x'] = new_x.values
    df.loc[mask, 'y'] = new_y.values
    df.loc[mask, 'deg'] = new_deg.values
    
    return df

# Apply fix_direction to all configurations
print("Applying fix_direction to all configurations...")
optimized_df = best_df.copy()

improvements = []
for n in tqdm(range(1, 201)):
    trees = get_trees_for_n(optimized_df, n)
    original_side = get_bounding_box_side(trees)
    opt_angle, opt_side = optimize_rotation_for_config(trees)
    
    if opt_side < original_side - 1e-8:
        optimized_df = apply_rotation_to_config(optimized_df, n, opt_angle)
        improvements.append({'n': n, 'original': original_side, 'optimized': opt_side, 'improvement': original_side - opt_side})

print(f"\nApplied improvements to {len(improvements)} configurations")
if improvements:
    imp_df = pd.DataFrame(improvements)
    print(f"Total improvement: {imp_df['improvement'].sum():.6f}")
    print(f"Average improvement: {imp_df['improvement'].mean():.6f}")

Applying fix_direction to all configurations...


  0%|          | 0/200 [00:00<?, ?it/s]

 10%|█         | 21/200 [00:00<00:00, 201.55it/s]

 21%|██        | 42/200 [00:00<00:01, 149.63it/s]

 29%|██▉       | 58/200 [00:00<00:01, 122.98it/s]

 36%|███▌      | 71/200 [00:00<00:01, 106.36it/s]

 42%|████▏     | 83/200 [00:00<00:01, 93.39it/s] 

 46%|████▋     | 93/200 [00:00<00:01, 83.96it/s]

 51%|█████     | 102/200 [00:01<00:01, 76.35it/s]

 55%|█████▌    | 110/200 [00:01<00:01, 70.16it/s]

 59%|█████▉    | 118/200 [00:01<00:01, 64.72it/s]

 62%|██████▎   | 125/200 [00:01<00:01, 60.57it/s]

 66%|██████▌   | 132/200 [00:01<00:01, 56.75it/s]

 69%|██████▉   | 138/200 [00:01<00:01, 53.27it/s]

 72%|███████▏  | 144/200 [00:01<00:01, 50.71it/s]

 75%|███████▌  | 150/200 [00:02<00:01, 47.96it/s]

 78%|███████▊  | 155/200 [00:02<00:00, 45.58it/s]

 80%|████████  | 160/200 [00:02<00:00, 44.07it/s]

 82%|████████▎ | 165/200 [00:02<00:00, 42.54it/s]

 85%|████████▌ | 170/200 [00:02<00:00, 41.15it/s]

 88%|████████▊ | 175/200 [00:02<00:00, 39.48it/s]

 90%|████████▉ | 179/200 [00:02<00:00, 38.72it/s]

 92%|█████████▏| 183/200 [00:02<00:00, 37.97it/s]

 94%|█████████▎| 187/200 [00:03<00:00, 37.20it/s]

 96%|█████████▌| 191/200 [00:03<00:00, 36.37it/s]

 98%|█████████▊| 195/200 [00:03<00:00, 35.56it/s]

100%|█████████▉| 199/200 [00:03<00:00, 34.64it/s]

100%|██████████| 200/200 [00:03<00:00, 58.27it/s]


Applied improvements to 0 configurations





In [9]:
# Calculate new score
new_score = calculate_total_score(optimized_df)
print(f"Original score: {best_score:.6f}")
print(f"New score: {new_score:.6f}")
print(f"Improvement: {best_score - new_score:.6f}")

Original score: 70.676102
New score: 70.676102
Improvement: 0.000000


In [10]:
# Validate optimized submission for overlaps
print("Validating optimized submission for overlaps...")
overlap_count = 0
for n in tqdm(range(1, 201)):
    trees = get_trees_for_n(optimized_df, n)
    if has_overlap(trees):
        overlap_count += 1
        print(f"  Overlap found in n={n}")

print(f"\nTotal configurations with overlaps: {overlap_count}")

Validating optimized submission for overlaps...


  0%|          | 0/200 [00:00<?, ?it/s]

 12%|█▏        | 24/200 [00:00<00:00, 229.25it/s]

 24%|██▎       | 47/200 [00:00<00:00, 160.82it/s]

 32%|███▎      | 65/200 [00:00<00:01, 128.96it/s]

 40%|███▉      | 79/200 [00:00<00:01, 108.82it/s]

 46%|████▌     | 91/200 [00:00<00:01, 95.37it/s] 

 50%|█████     | 101/200 [00:00<00:01, 85.44it/s]

 55%|█████▌    | 110/200 [00:01<00:01, 77.12it/s]

 59%|█████▉    | 118/200 [00:01<00:01, 70.66it/s]

 63%|██████▎   | 126/200 [00:01<00:01, 65.39it/s]

 66%|██████▋   | 133/200 [00:01<00:01, 60.80it/s]

 70%|███████   | 140/200 [00:01<00:01, 55.94it/s]

 73%|███████▎  | 146/200 [00:01<00:01, 53.21it/s]

 76%|███████▌  | 152/200 [00:01<00:00, 50.58it/s]

 79%|███████▉  | 158/200 [00:02<00:00, 48.15it/s]

 82%|████████▏ | 163/200 [00:02<00:00, 46.42it/s]

 84%|████████▍ | 168/200 [00:02<00:00, 44.74it/s]

 86%|████████▋ | 173/200 [00:02<00:00, 42.77it/s]

 89%|████████▉ | 178/200 [00:02<00:00, 41.38it/s]

 92%|█████████▏| 183/200 [00:02<00:00, 39.81it/s]

 94%|█████████▎| 187/200 [00:02<00:00, 38.79it/s]

 96%|█████████▌| 191/200 [00:02<00:00, 37.90it/s]

 98%|█████████▊| 195/200 [00:03<00:00, 37.37it/s]

100%|█████████▉| 199/200 [00:03<00:00, 36.59it/s]

100%|██████████| 200/200 [00:03<00:00, 61.87it/s]


Total configurations with overlaps: 0





In [11]:
# Create submission file
def create_submission(df, output_path):
    """Create submission file with 's' prefix."""
    sub_df = df.copy()
    for col in ['x', 'y', 'deg']:
        sub_df[col] = 's' + sub_df[col].astype(str)
    sub_df.to_csv(output_path, index=False)
    print(f"Saved submission to {output_path}")
    return sub_df

# Save submission
submission_path = '/home/submission/submission.csv'
create_submission(optimized_df, submission_path)

# Verify row count
verify_df = pd.read_csv(submission_path)
print(f"Submission row count: {len(verify_df)} (expected: 20100)")

Saved submission to /home/submission/submission.csv
Submission row count: 20100 (expected: 20100)


In [12]:
# Summary
print("="*50)
print("BASELINE EXPERIMENT SUMMARY")
print("="*50)
print(f"Best pre-optimized submission: {os.path.basename(best_filepath)}")
print(f"Original score: {best_score:.6f}")
print(f"After fix_direction: {new_score:.6f}")
print(f"Improvement: {best_score - new_score:.6f}")
print(f"Overlaps: {overlap_count}")
print(f"Submission saved to: {submission_path}")
print("="*50)

BASELINE EXPERIMENT SUMMARY
Best pre-optimized submission: ensemble.csv
Original score: 70.676102
After fix_direction: 70.676102
Improvement: 0.000000
Overlaps: 0
Submission saved to: /home/submission/submission.csv


In [None]:
# Model class for submission (wrapper for pre-optimized solution)
class PreOptimizedModel:
    def __init__(self, data='single'):
        self.data = data
        self.best_filepath = "/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized/ensemble.csv"
    
    def train_model(self, X=None, Y=None):
        # No training needed - using pre-optimized solution
        pass
    
    def predict(self, X=None):
        # Load and return pre-optimized submission
        df = load_submission(self.best_filepath)
        return df

# Create model instance for submission
model = PreOptimizedModel(data='single')
print("Model created for submission")

In [None]:
# Generate submission using model
model = PreOptimizedModel(data='full')
predictions = model.predict()
print(f"Predictions shape: {len(predictions)}")