# Loop 4 Analysis: Build Ensemble from Multiple Sources

The evaluator correctly identified that we need to ensemble from more sources. We now have 20+ CSV files from different optimizers. Let's build a proper ensemble by selecting, for each N (the number of trees in a configuration), the candidate with the smallest bounding-box side across all files.

In [1]:
import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
from shapely import affinity
from shapely.geometry import Polygon
import glob
import os

# Use 30 significant digits for every Decimal operation in this notebook.
getcontext().prec = 30


class ChristmasTree:
    """A tree-shaped polygon placed at a given centre with a given rotation.

    Parameters
    ----------
    center_x, center_y : value accepted by ``Decimal`` (default ``'0'``)
        Translation applied to the outline after rotation.
    angle : value accepted by ``Decimal`` (default ``'0'``)
        Rotation about the origin, handed to ``shapely.affinity.rotate``
        with its default angle unit.

    Attributes
    ----------
    center_x, center_y, angle : Decimal
        The constructor arguments converted to ``Decimal``.
    polygon : shapely.geometry.Polygon
        The rotated and translated 15-vertex outline.
    """

    def __init__(self, center_x='0', center_y='0', angle='0'):
        self.center_x = Decimal(center_x)
        self.center_y = Decimal(center_y)
        self.angle = Decimal(angle)

        # Horizontal extents of each part of the outline.
        trunk_width = Decimal('0.15')
        base_width = Decimal('0.7')
        mid_width = Decimal('0.4')
        top_width = Decimal('0.25')

        # Vertical levels, from the tip down to the bottom of the trunk.
        tip_level = Decimal('0.8')
        tier_1_level = Decimal('0.5')
        tier_2_level = Decimal('0.25')
        base_level = Decimal('0.0')
        trunk_floor = -Decimal('0.2')

        # Right-hand side of the outline, walking down from just below the tip.
        right_side = [
            (float(px), float(py))
            for px, py in (
                (top_width / 2, tier_1_level),
                (top_width / 4, tier_1_level),
                (mid_width / 2, tier_2_level),
                (mid_width / 4, tier_2_level),
                (base_width / 2, base_level),
                (trunk_width / 2, base_level),
                (trunk_width / 2, trunk_floor),
            )
        ]
        # The left-hand side is the mirror image, walked back up to the tip.
        left_side = [(-px, py) for px, py in reversed(right_side)]

        outline = Polygon([(float(0), float(tip_level))] + right_side + left_side)

        # Rotate about the origin first, then move the shape to its centre.
        turned = affinity.rotate(outline, float(self.angle), origin=(0, 0))
        self.polygon = affinity.translate(
            turned,
            xoff=float(self.center_x),
            yoff=float(self.center_y),
        )

def load_trees_for_n(df, n):
    """Build the ``ChristmasTree`` objects for the group with tree count ``n``.

    Rows are selected by an ``id`` that starts with the zero-padded,
    three-digit ``n`` followed by an underscore. Leading ``'s'`` characters
    are stripped from the ``x``, ``y`` and ``deg`` values before they are
    passed to ``ChristmasTree``.

    Returns a list of trees in row order (empty when no row matches).
    """
    in_group = df['id'].str.startswith(f"{n:03d}_")

    def numeric_text(value):
        # Values may carry a leading 's' marker; drop it before parsing.
        return str(value).lstrip('s')

    return [
        ChristmasTree(
            numeric_text(record['x']),
            numeric_text(record['y']),
            numeric_text(record['deg']),
        )
        for _, record in df[in_group].iterrows()
    ]

def get_bounding_box_side(trees):
    """Return the longer side of the axis-aligned box enclosing all trees.

    Every exterior vertex of every tree polygon is considered. An empty
    ``trees`` collection yields ``float('inf')``.
    """
    if not trees:
        return float('inf')

    # Stack every exterior vertex into one (num_points, 2) array.
    points = np.vstack([np.array(t.polygon.exterior.coords) for t in trees])
    xs = points[:, 0]
    ys = points[:, 1]

    width = xs.max() - xs.min()
    height = ys.max() - ys.min()
    return max(width, height)

def score_submission(df, max_n=200):
    """Sum ``side**2 / n`` over every group size ``n`` from 1 to ``max_n``.

    ``side`` is the bounding-box side of the trees loaded for ``n``.
    A group whose number of loaded trees differs from ``n`` contributes
    nothing to the total.
    """
    running_total = 0
    for group_size in range(1, max_n + 1):
        group = load_trees_for_n(df, group_size)
        if len(group) == group_size:
            extent = get_bounding_box_side(group)
            running_total += (extent ** 2) / group_size
    return running_total

print("Functions defined")

Functions defined


In [2]:
# Find all CSV files. glob returns matches in arbitrary order, so sort them:
# the scan order becomes the insertion order of `scores`, which in turn
# decides how equal-scoring files are ordered when they are sorted later.
csv_files = sorted(glob.glob('/home/code/preoptimized/**/*.csv', recursive=True))
print(f"Found {len(csv_files)} CSV files")

# A usable submission needs these columns and a group for every N in 1..200.
required_columns = {'id', 'x', 'y', 'deg'}
required_ns = set(range(1, 201))

# Score each file
scores = {}
for f in csv_files:
    try:
        df = pd.read_csv(f)
        # Skip files that are not submissions at all.
        if not required_columns.issubset(df.columns):
            continue
        # The N of each row is the integer before the first underscore of its id.
        n_values = {int(str(idx).split('_')[0]) for idx in df['id']}
        # Require exactly the groups 1..200 to be present (extra groups are
        # tolerated). Counting distinct prefixes is not enough: a file missing
        # some N would be scored without that term and look better than it is.
        if not required_ns.issubset(n_values):
            continue
        score = score_submission(df)
        scores[f] = score
        print(f"{os.path.basename(f)}: {score:.6f}")
    except Exception as e:
        # Keep scanning; one unreadable or malformed file must not stop the run.
        print(f"Error with {f}: {e}")

print(f"\nValid files: {len(scores)}")
print(f"Best score: {min(scores.values()):.6f}" if scores else "No valid files")

Found 30 CSV files


ensemble.csv: 70.676102


submission.csv: 70.676501


santa-2025.csv: 70.676102


best_ensemble.csv: 70.676102


72.49.csv: 72.495739


71.97.csv: 71.972027


72.49.csv: 72.495739


71.97.csv: 71.972027


submission_JKoT4.csv: 72.489504


New_Tree_144_196.csv: 72.927920


submission_JKoT3.csv: 72.489488


santa2025_ver2_v61.csv: 72.951925


submission_JKoT2.csv: 72.489348


santa2025_ver2_v67.csv: 72.938567


santa2025_ver2_v76.csv: 72.826444


submission_70_936673758122.csv: 70.936674


santa2025_ver2_v65.csv: 72.935294


submission_70_926149550346.csv: 70.926150


santa2025_ver2_v66.csv: 72.938599


santa2025_ver2_v63.csv: 72.947427


santa2025_ver2_v69.csv: 72.850110


submission_JKoT1.csv: 72.489483


submission_opt1.csv: 70.990692


santa2025_ver2_v68.csv: 72.939233


santa-2025.csv: 70.676102


submission.csv: 70.676501


submission (77).csv: 72.135010


submission.csv: 72.935294


submission_sa.csv: 72.935294


submission_best.csv: 70.926150

Valid files: 30
Best score: 70.676102


In [3]:
# Rank the scored files from lowest (best) to highest score.
sorted_files = sorted(scores.items(), key=lambda entry: entry[1])

print("\nTop 10 files by score:")
for path, file_score in sorted_files[:10]:
    print(f"{file_score:.6f}: {os.path.basename(path)}")


Top 10 files by score:
70.676102: ensemble.csv
70.676102: santa-2025.csv
70.676102: best_ensemble.csv
70.676102: santa-2025.csv
70.676501: submission.csv
70.676501: submission.csv
70.926150: submission_70_926149550346.csv
70.926150: submission_best.csv
70.936674: submission_70_936673758122.csv
70.990692: submission_opt1.csv


In [4]:
# Now build ensemble: for each N, select the best configuration from all sources
print("Building ensemble from all sources...")

# Load all dataframes, in ascending score order.
dfs = {}
for f, s in sorted_files:
    try:
        dfs[f] = pd.read_csv(f)
    except Exception as e:
        # Report the file instead of silently dropping it, so a shrinking
        # candidate pool is visible in the output.
        print(f"Error with {f}: {e}")

print(f"Loaded {len(dfs)} dataframes")

# For each N, find the best configuration.
# NOTE(review): candidates are compared by bounding-box side only; nothing in
# this cell checks that the chosen configuration is free of overlapping trees.
best_configs = {}  # n -> (file, side, rows)

for n in range(1, 201):
    prefix = f"{n:03d}_"
    best_side = float('inf')
    best_file = None
    best_rows = None

    for f, df in dfs.items():
        trees = load_trees_for_n(df, n)
        # Ignore sources that do not hold exactly n trees for this group.
        if len(trees) != n:
            continue
        side = get_bounding_box_side(trees)
        # Strict comparison: on a tie the earlier (better-scoring) file wins.
        if side < best_side:
            best_side = side
            best_file = f
            # Keep the raw rows so the original text values are written back.
            best_rows = df[df['id'].str.startswith(prefix)].copy()

    if best_rows is not None:
        best_configs[n] = (best_file, best_side, best_rows)
    else:
        # Without this the group would be missing from the ensemble unnoticed.
        print(f"Warning: no complete configuration found for n={n}")

    if n % 50 == 0:
        print(f"Processed n={n}")

print(f"\nFound best configs for {len(best_configs)} N values")

Building ensemble from all sources...


Loaded 30 dataframes


Processed n=50


Processed n=100


Processed n=150


Processed n=200

Found best configs for 200 N values


In [5]:
# Total the per-N score terms (side squared over N) of the selected configurations.
ensemble_score = 0
for n in range(1, 201):
    entry = best_configs.get(n)
    if entry is None:
        continue
    side = entry[1]
    ensemble_score += (side ** 2) / n

print(f"Ensemble score: {ensemble_score:.6f}")

# Compare against the best individual file and the target score.
best_single_score = min(scores.values())
print(f"Best single file score: {best_single_score:.6f}")
print(f"Improvement: {best_single_score - ensemble_score:.6f}")
print(f"Target: 68.922808")
print(f"Gap to target: {ensemble_score - 68.922808:.6f}")

Ensemble score: 70.676102
Best single file score: 70.676102
Improvement: 0.000000
Target: 68.922808
Gap to target: 1.753294


In [6]:
# Show which files contribute to the ensemble.
# Group by full path, not basename: several source files share a basename
# (e.g. more than one submission.csv), and keying on the basename would add
# their counts together as if they were a single file.
file_counts = {}
for n, (f, side, rows) in best_configs.items():
    file_counts.setdefault(f, []).append(n)

print("\nFile contributions to ensemble:")
# Largest contributor first.
for path, ns in sorted(file_counts.items(), key=lambda x: -len(x[1])):
    print(f"{path}: {len(ns)} N values")


File contributions to ensemble:
ensemble.csv: 200 N values


In [None]:
# Collect the chosen rows for each N in ascending order and stack them.
ensemble_rows = [best_configs[n][2] for n in range(1, 201) if n in best_configs]

ensemble_df = pd.concat(ensemble_rows, ignore_index=True)
print(f"Ensemble has {len(ensemble_df)} rows")

# Write the combined submission to disk.
ensemble_df.to_csv('/home/code/preoptimized/new_ensemble.csv', index=False)
print("Saved to /home/code/preoptimized/new_ensemble.csv")

In [None]:
# Re-read the saved file and score it, to confirm what was actually written.
df_verify = pd.read_csv('/home/code/preoptimized/new_ensemble.csv')

verify_score = score_submission(df_verify)
print(f"Verified ensemble score: {verify_score:.6f}")

In [None]:
# Check for overlaps in the ensemble
def has_overlap_for_trees(trees):
    """Return True when at least one pair of tree polygons overlaps.

    A pair counts as overlapping when the polygons intersect, do not merely
    touch, and their intersection has an area greater than 1e-12.
    """
    if len(trees) <= 1:
        return False

    shapes = [tree.polygon for tree in trees]
    for position, first in enumerate(shapes):
        # Compare only with later shapes so each pair is checked once.
        for second in shapes[position + 1:]:
            if not first.intersects(second) or first.touches(second):
                continue
            if first.intersection(second).area > 1e-12:
                return True
    return False

# Scan every group of the verified ensemble and count those with an overlap.
print("Checking for overlaps...")
overlap_count = 0
for n in range(1, 201):
    trees = load_trees_for_n(df_verify, n)
    if not has_overlap_for_trees(trees):
        continue
    print(f"Overlap detected in n={n}")
    overlap_count += 1

if overlap_count != 0:
    print(f"Total overlaps: {overlap_count}")
else:
    print("No overlaps detected!")