# Experiment 004: Ensemble from Multiple Pre-optimized Files

Take the best N from each pre-optimized file to create an ensemble.

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
import os
import json
from glob import glob

# Tree geometry
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]
BASE_TREE = Polygon(zip(TX, TY))

def parse_value(s):
    if isinstance(s, str) and s.startswith('s'):
        return float(s[1:])
    return float(s)

def create_tree_polygon(x, y, deg):
    tree = affinity.rotate(BASE_TREE, deg, origin=(0, 0))
    tree = affinity.translate(tree, x, y)
    return tree

def get_score_for_n(df, n):
    """Get the bounding box side and score for a specific N"""
    prefix = f"{n:03d}_"
    n_trees = df[df['id'].str.startswith(prefix)]
    
    if len(n_trees) != n:
        return None, None
    
    all_coords = []
    for _, row in n_trees.iterrows():
        x = parse_value(row['x'])
        y = parse_value(row['y'])
        deg = parse_value(row['deg'])
        poly = create_tree_polygon(x, y, deg)
        coords = np.array(poly.exterior.coords)
        all_coords.append(coords)
    
    all_coords = np.vstack(all_coords)
    x_range = all_coords[:, 0].max() - all_coords[:, 0].min()
    y_range = all_coords[:, 1].max() - all_coords[:, 1].min()
    side = max(x_range, y_range)
    score = side**2 / n
    
    return side, score

print("Functions defined")

Functions defined


In [2]:
# Load all pre-optimized files
base_path = '/home/nonroot/snapshots/santa-2025/21116303805/code/preoptimized'
files = [
    f'{base_path}/santa-2025.csv',
    f'{base_path}/best_ensemble.csv',
    f'{base_path}/ensemble.csv',
] + glob(f'{base_path}/santa25-public/*.csv')

print(f"Found {len(files)} files")
for f in files:
    print(f"  {os.path.basename(f)}")

Found 19 files
  santa-2025.csv
  best_ensemble.csv
  ensemble.csv
  submission_JKoT4.csv
  New_Tree_144_196.csv
  submission_JKoT3.csv
  santa2025_ver2_v61.csv
  submission_JKoT2.csv
  santa2025_ver2_v67.csv
  santa2025_ver2_v76.csv
  submission_70_936673758122.csv
  santa2025_ver2_v65.csv
  submission_70_926149550346.csv
  santa2025_ver2_v66.csv
  santa2025_ver2_v63.csv
  santa2025_ver2_v69.csv
  submission_JKoT1.csv
  submission_opt1.csv
  santa2025_ver2_v68.csv


In [3]:
# Load all dataframes and calculate scores for each N
all_data = {}
for filepath in files:
    try:
        df = pd.read_csv(filepath)
        if len(df) != 20100:
            print(f"Skipping {os.path.basename(filepath)}: {len(df)} rows")
            continue
        
        scores = {}
        for n in range(1, 201):
            side, score = get_score_for_n(df, n)
            if score is not None:
                scores[n] = {'side': side, 'score': score, 'df': df}
        
        total = sum(s['score'] for s in scores.values())
        all_data[filepath] = {'scores': scores, 'total': total}
        print(f"{os.path.basename(filepath)}: total={total:.6f}")
    except Exception as e:
        print(f"Error loading {filepath}: {e}")

santa-2025.csv: total=70.676102


best_ensemble.csv: total=70.676102


ensemble.csv: total=70.676102


submission_JKoT4.csv: total=72.489504


New_Tree_144_196.csv: total=72.927920


submission_JKoT3.csv: total=72.489488


santa2025_ver2_v61.csv: total=72.951925


submission_JKoT2.csv: total=72.489348


santa2025_ver2_v67.csv: total=72.938567


santa2025_ver2_v76.csv: total=72.826444


submission_70_936673758122.csv: total=70.936674


santa2025_ver2_v65.csv: total=72.935294


submission_70_926149550346.csv: total=70.926150


santa2025_ver2_v66.csv: total=72.938599


santa2025_ver2_v63.csv: total=72.947427


santa2025_ver2_v69.csv: total=72.850110


submission_JKoT1.csv: total=72.489483


submission_opt1.csv: total=70.990692


santa2025_ver2_v68.csv: total=72.939233


In [4]:
# Find the best source for each N
best_for_n = {}
for n in range(1, 201):
    best_score = float('inf')
    best_source = None
    
    for filepath, data in all_data.items():
        if n in data['scores']:
            score = data['scores'][n]['score']
            if score < best_score:
                best_score = score
                best_source = filepath
    
    if best_source:
        best_for_n[n] = {'source': best_source, 'score': best_score}

# Calculate ensemble total
ensemble_total = sum(d['score'] for d in best_for_n.values())
print(f"\nEnsemble total score: {ensemble_total:.6f}")

# Compare with best single file
best_single = min(all_data.items(), key=lambda x: x[1]['total'])
print(f"Best single file: {os.path.basename(best_single[0])} = {best_single[1]['total']:.6f}")
print(f"Improvement from ensemble: {best_single[1]['total'] - ensemble_total:.6f}")


Ensemble total score: 70.676102
Best single file: santa-2025.csv = 70.676102
Improvement from ensemble: 0.000000


In [5]:
# Show which source is best for each N
from collections import Counter
source_counts = Counter(d['source'] for d in best_for_n.values())
print("\nSource distribution:")
for source, count in source_counts.most_common():
    print(f"  {os.path.basename(source)}: {count} N values")


Source distribution:
  santa-2025.csv: 200 N values


In [6]:
# Create the ensemble submission
ensemble_rows = []
for n in range(1, 201):
    source = best_for_n[n]['source']
    df = all_data[source]['scores'][n]['df']
    prefix = f"{n:03d}_"
    n_trees = df[df['id'].str.startswith(prefix)]
    ensemble_rows.append(n_trees)

ensemble_df = pd.concat(ensemble_rows, ignore_index=True)
print(f"Ensemble has {len(ensemble_df)} rows")
print(ensemble_df.head(10))

Ensemble has 20100 rows
      id                       x                       y  \
0  001_0    s-48.196086194214246     s58.770984615214225   
1  002_0   s0.154097069621355887  s-0.038540742694794648   
2  002_1  s-0.154097069621372845  s-0.561459257305224058   
3  003_0      s1.123655816140301      s0.781101815992563   
4  003_1       s1.23405569584216      s1.275999500663759   
5  003_2      s0.641714640229075      s1.180458566613381   
6  004_0  s-0.324747789589372171   s0.132109978088185392   
7  004_1   s0.315354346242637695   s0.132109978063475492   
8  004_2   s0.324747789592379210  s-0.732109978069475531   
9  004_3  s-0.315354348134818330  s-0.732109978094185987   

                       deg  
0                    s45.0  
1  s203.629377730656841550  
2   s23.629377730656791812  
3        s111.125132292893  
4         s66.370622269343  
5      s155.13405193710082  
6  s156.370622145636389178  
7  s156.370622269264089255  
8  s336.370622269264003990  
9  s336.37062214563644602

In [7]:
# Verify ensemble score
def calculate_total_score(df):
    total = 0
    for n in range(1, 201):
        side, score = get_score_for_n(df, n)
        if score:
            total += score
    return total

verified_score = calculate_total_score(ensemble_df)
print(f"Verified ensemble score: {verified_score:.6f}")

Verified ensemble score: 70.676102


In [8]:
# Save ensemble
os.makedirs('/home/code/experiments/004_ensemble', exist_ok=True)
ensemble_df.to_csv('/home/code/experiments/004_ensemble/ensemble.csv', index=False)
print("Saved ensemble to experiments/004_ensemble/ensemble.csv")

# Copy to submission
import shutil
shutil.copy('/home/code/experiments/004_ensemble/ensemble.csv', '/home/submission/submission.csv')
print("Copied to /home/submission/submission.csv")

Saved ensemble to experiments/004_ensemble/ensemble.csv
Copied to /home/submission/submission.csv


In [None]:
# Save metrics
metrics = {'cv_score': verified_score}
with open('/home/code/experiments/004_ensemble/metrics.json', 'w') as f:
    json.dump(metrics, f)
print(f"Saved metrics: {metrics}")