# Baseline Ensemble

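Create a baseline by ensembling all available pre-optimized solutions.
For each N (1-200), select the configuration with the lowest score, where the score is side² / N and side is the longer edge of the axis-aligned bounding box around all N trees.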

In [1]:
import pandas as pd
import numpy as np
import math
import os
from numba import njit
from glob import glob

# Tree polygon template, written as (x, y) vertex pairs and split into
# separate coordinate arrays (TX for x, TY for y).
_TREE_OUTLINE = [
    (0, 0.8),
    (0.125, 0.5),
    (0.0625, 0.5),
    (0.2, 0.25),
    (0.1, 0.25),
    (0.35, 0),
    (0.075, 0),
    (0.075, -0.2),
    (-0.075, -0.2),
    (-0.075, 0),
    (-0.35, 0),
    (-0.1, 0.25),
    (-0.2, 0.25),
    (-0.0625, 0.5),
    (-0.125, 0.5),
]
TX = np.array([vx for vx, _ in _TREE_OUTLINE])
TY = np.array([vy for _, vy in _TREE_OUTLINE])

print(f"Tree has {len(TX)} vertices")

Tree has 15 vertices


In [2]:
@njit
def score_group(xs, ys, degs, tx, ty):
    """Return ``side**2 / n`` for the square enclosing a group of polygons.

    Polygon ``i`` is the template ``(tx, ty)`` rotated by ``degs[i]`` degrees
    about the origin and then shifted by ``(xs[i], ys[i])``. ``side`` is the
    larger of the width and height of the axis-aligned box around every
    transformed vertex, and ``n`` is ``xs.size``.
    """
    count = xs.size
    n_vertices = tx.size

    # Running extents, seeded so that the first vertex always replaces them.
    lo_x = 1e300
    lo_y = 1e300
    hi_x = -1e300
    hi_y = -1e300

    for k in range(count):
        theta = degs[k] * math.pi / 180.0
        cos_t = math.cos(theta)
        sin_t = math.sin(theta)
        for v in range(n_vertices):
            # Rotate the template vertex, then translate to the placement.
            px = cos_t * tx[v] - sin_t * ty[v] + xs[k]
            py = sin_t * tx[v] + cos_t * ty[v] + ys[k]
            if px < lo_x:
                lo_x = px
            if px > hi_x:
                hi_x = px
            if py < lo_y:
                lo_y = py
            if py > hi_y:
                hi_y = py

    width = hi_x - lo_x
    height = hi_y - lo_y
    side = max(width, height)
    return side * side / count

def parse_value(val):
    """Convert a submission cell to ``float``.

    Strings that start with ``'s'`` have that one leading character removed
    before conversion; every other value is passed to ``float`` unchanged.
    """
    has_marker = isinstance(val, str) and val.startswith('s')
    return float(val[1:] if has_marker else val)

def load_submission(filepath):
    """Read a submission CSV and attach parsed helper columns.

    Adds ``N`` and ``tree_idx`` (the two integer fields of ``id``, split on
    ``'_'``) and ``x_val`` / ``y_val`` / ``deg_val`` (``x`` / ``y`` / ``deg``
    run through ``parse_value``).

    Returns the augmented DataFrame, or ``None`` when the file has no ``id``
    column or when anything raises while reading or parsing (the error is
    printed).
    """
    def _id_field(position):
        # Converter for one integer field of an id such as "<N>_<tree index>".
        return lambda ident: int(ident.split('_')[position])

    try:
        frame = pd.read_csv(filepath)
        if 'id' not in frame.columns:
            return None

        # Group size and tree index encoded in the id
        frame['N'] = frame['id'].apply(_id_field(0))
        frame['tree_idx'] = frame['id'].apply(_id_field(1))

        # Numeric versions of the coordinate / rotation columns
        for raw_col, parsed_col in (('x', 'x_val'), ('y', 'y_val'), ('deg', 'deg_val')):
            frame[parsed_col] = frame[raw_col].apply(parse_value)

        return frame
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return None

# Smoke test for score_group: one tree placed at the origin, rotated 90 degrees
test_xs = np.zeros(1)
test_ys = np.zeros(1)
test_degs = np.full(1, 90.0)
print(f"Score for single tree at origin with 90 deg: {score_group(test_xs, test_ys, test_degs, TX, TY):.6f}")

Score for single tree at origin with 90 deg: 1.000000


In [3]:
# Collect all CSV files from preoptimized folder.
# glob() returns matches in arbitrary order; sorting makes the processing
# order (and therefore which file wins when two files tie on score)
# reproducible from run to run.
preopt_base = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized'
csv_files = sorted(glob(f'{preopt_base}/**/*.csv', recursive=True))
print(f"Found {len(csv_files)} CSV files")

# Also check for other experiment outputs
exp_base = '/home/nonroot/snapshots/santa-2025/21116303805/code/experiments'
exp_csvs = sorted(glob(f'{exp_base}/**/*.csv', recursive=True))
print(f"Found {len(exp_csvs)} experiment CSV files")

# Preoptimized files are processed first, then experiment outputs
all_csvs = csv_files + exp_csvs
print(f"Total: {len(all_csvs)} CSV files to process")

Found 30 CSV files
Found 6 experiment CSV files
Total: 36 CSV files to process


In [4]:
# Best configuration seen so far for every N, seeded with a sentinel score
best = {
    size: {"score": 1e300, "data": None, "source": None}
    for size in range(1, 201)
}

# NOTE(review): candidates are ranked by score only; nothing in this cell
# checks that the trees within a group do not overlap — confirm the source
# files are already valid.
for csv_path in all_csvs:
    df = load_submission(csv_path)
    if df is None:
        continue

    for n, group in df.groupby('N'):
        # Ignore N outside 1..200 and groups that do not hold exactly n rows
        if not (1 <= n <= 200) or len(group) != n:
            continue

        xs, ys, degs = (
            group[col].to_numpy() for col in ('x_val', 'y_val', 'deg_val')
        )
        score = score_group(xs, ys, degs, TX, TY)

        # Strict comparison: on a tie the earlier file is kept
        if score < best[n]["score"]:
            best[n] = dict(
                score=score,
                data=group[['id', 'x', 'y', 'deg']].copy(),
                source=os.path.basename(csv_path),
            )

print("Done processing all files")

Done processing all files


In [5]:
def _score_over(first, last):
    # Sum of the best scores for N = first..last (inclusive), in ascending N.
    return sum(best[k]["score"] for k in range(first, last + 1))


# Overall ensemble score across N = 1..200
total_score = _score_over(1, 200)
print(f"Total ensemble score: {total_score:.6f}")

# Per-range subtotals
print("\nScore breakdown by N range:")
n_ranges = [(1, 20), (21, 50), (51, 100), (101, 150), (151, 200)]
for start, end in n_ranges:
    range_score = _score_over(start, end)
    print(f"  N={start:3d}-{end:3d}: {range_score:.6f}")

# Which file supplied the winner for the ten smallest N
print("\nBest sources for N=1-10:")
for n in range(1, 10 + 1):
    print(f"  N={n}: score={best[n]['score']:.6f}, source={best[n]['source']}")

Total ensemble score: 70.676102

Score breakdown by N range:
  N=  1- 20: 8.057295
  N= 21- 50: 10.984878
  N= 51-100: 17.641148
  N=101-150: 17.144118
  N=151-200: 16.848664

Best sources for N=1-10:
  N=1: score=0.661250, source=ensemble.csv
  N=2: score=0.450779, source=ensemble.csv
  N=3: score=0.434745, source=ensemble.csv
  N=4: score=0.416545, source=submission_v18.csv
  N=5: score=0.416850, source=submission_v18.csv
  N=6: score=0.399610, source=submission_v18.csv
  N=7: score=0.399897, source=ensemble.csv
  N=8: score=0.385407, source=ensemble.csv
  N=9: score=0.387415, source=ensemble.csv
  N=10: score=0.376630, source=submission_v18.csv


In [6]:
# Stack the winning rows for every N, in ascending N, into one frame
ensemble_rows = []
for n in range(1, 201):
    chosen = best[n]["data"]
    if chosen is None:
        # No usable configuration was found for this N in any file
        print(f"WARNING: No data for N={n}")
        continue
    ensemble_rows.append(chosen)

ensemble_df = pd.concat(ensemble_rows, ignore_index=True)
print(f"Ensemble has {len(ensemble_df)} rows")

# Eyeball the output format
print("\nFirst 10 rows:")
print(ensemble_df.head(10))

Ensemble has 20100 rows

First 10 rows:
      id                       x                       y  \
0  001_0    s-48.196086194214246     s58.770984615214225   
1  002_0   s0.154097069621355887  s-0.038540742694794648   
2  002_1  s-0.154097069621372845  s-0.561459257305224058   
3  003_0      s1.123655816140301      s0.781101815992563   
4  003_1       s1.23405569584216      s1.275999500663759   
5  003_2      s0.641714640229075      s1.180458566613381   
6  004_0     s-0.324747789589372      s0.132109978088185   
7  004_1      s0.315354346242638      s0.132109978063475   
8  004_2      s0.324747789592379     s-0.732109978069476   
9  004_3     s-0.315354348134818     s-0.732109978094186   

                       deg  
0                    s45.0  
1  s203.629377730656841550  
2   s23.629377730656791812  
3        s111.125132292893  
4         s66.370622269343  
5      s155.13405193710082  
6     s156.370622145636389  
7     s156.370622269264089  
8     s336.370622269264004  
9     s33

In [7]:
# Save ensemble submission
submission_path = '/home/code/experiments/001_baseline_ensemble/submission.csv'
# Create the experiment folder first so to_csv cannot fail on a missing directory
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
ensemble_df.to_csv(submission_path, index=False)
print(f"Saved ensemble to {submission_path}")

# Also copy to main submission folder
import shutil
os.makedirs('/home/submission', exist_ok=True)
shutil.copy(submission_path, '/home/submission/submission.csv')
print("Copied to /home/submission/submission.csv")

# Verify the saved file by reading it back from disk
verify_df = pd.read_csv(submission_path)
print(f"\nVerification: {len(verify_df)} rows")
print(verify_df.head())

Saved ensemble to /home/code/experiments/001_baseline_ensemble/submission.csv
Copied to /home/submission/submission.csv

Verification: 20100 rows
      id                       x                       y  \
0  001_0    s-48.196086194214246     s58.770984615214225   
1  002_0   s0.154097069621355887  s-0.038540742694794648   
2  002_1  s-0.154097069621372845  s-0.561459257305224058   
3  003_0      s1.123655816140301      s0.781101815992563   
4  003_1       s1.23405569584216      s1.275999500663759   

                       deg  
0                    s45.0  
1  s203.629377730656841550  
2   s23.629377730656791812  
3        s111.125132292893  
4         s66.370622269343  


In [8]:
# Final score verification.
# The target is defined once so the printed value and the gap computation
# cannot drift apart.
TARGET_SCORE = 68.922808

print(f"\n=== FINAL RESULTS ===")
print(f"Total Score: {total_score:.6f}")
print(f"Target: {TARGET_SCORE:.6f}")
print(f"Best public kernel: 71.78")
# Positive gap means the ensemble is still above (worse than) the target
print(f"Gap to target: {total_score - TARGET_SCORE:.6f}")


=== FINAL RESULTS ===
Total Score: 70.676102
Target: 68.922808
Best public kernel: 71.78
Gap to target: 1.753294
