# Loop 2 Analysis: External Dataset Ensemble

Create a comprehensive ensemble from ALL external data sources:
1. bucket-of-chump dataset
2. telegram-shared solutions (71.97, 72.49)
3. santa25-public dataset
4. santa-2025-try3 dataset
5. SmartManoj GitHub
6. All 78 snapshots

In [1]:
import numpy as np
import pandas as pd
import os
import json
import glob
from shapely.geometry import Polygon
from shapely import STRtree

os.chdir('/home/code')

# Tree vertices
TX = np.array([0,0.125,0.0625,0.2,0.1,0.35,0.075,0.075,-0.075,-0.075,-0.35,-0.1,-0.2,-0.0625,-0.125])
TY = np.array([0.8,0.5,0.5,0.25,0.25,0,0,-0.2,-0.2,0,0,0.25,0.25,0.5,0.5])

def get_tree_polygon(x, y, deg):
    """Get tree polygon at position (x, y) with rotation deg"""
    rad = np.radians(deg)
    c, s = np.cos(rad), np.sin(rad)
    vertices = []
    for j in range(len(TX)):
        vx = TX[j] * c - TY[j] * s + x
        vy = TX[j] * s + TY[j] * c + y
        vertices.append((vx, vy))
    return Polygon(vertices)

def score_group(xs, ys, degs):
    """Calculate score for a single N-tree configuration"""
    n = len(xs)
    all_x, all_y = [], []
    for i in range(n):
        rad = np.radians(degs[i])
        c, s = np.cos(rad), np.sin(rad)
        for j in range(len(TX)):
            x = TX[j] * c - TY[j] * s + xs[i]
            y = TX[j] * s + TY[j] * c + ys[i]
            all_x.append(x)
            all_y.append(y)
    side = max(max(all_x) - min(all_x), max(all_y) - min(all_y))
    return side * side / n

def check_overlaps(xs, ys, degs):
    """Check if any trees overlap using Shapely"""
    polygons = [get_tree_polygon(xs[i], ys[i], degs[i]) for i in range(len(xs))]
    tree = STRtree(polygons)
    for i, poly in enumerate(polygons):
        candidates = tree.query(poly)
        for j in candidates:
            if i < j:
                if polygons[i].intersection(polygons[j]).area > 0:
                    return True
    return False

def parse_submission(df):
    """Parse submission dataframe"""
    df = df.copy()
    df['x_val'] = df['x'].astype(str).str.replace('s', '').astype(float)
    df['y_val'] = df['y'].astype(str).str.replace('s', '').astype(float)
    df['deg_val'] = df['deg'].astype(str).str.replace('s', '').astype(float)
    df['n'] = df['id'].str.split('_').str[0].astype(int)
    return df

def calculate_per_n_scores(df):
    """Calculate score for each N value"""
    per_n_scores = {}
    for n in range(1, 201):
        group = df[df['n'] == n]
        if len(group) == n:
            xs = group['x_val'].values
            ys = group['y_val'].values
            degs = group['deg_val'].values
            per_n_scores[n] = score_group(xs, ys, degs)
    return per_n_scores

print("Functions defined.")

Functions defined.


In [2]:
# Collect ALL CSV files from external datasets
print("Collecting all CSV files from external datasets...")

external_csvs = []

# 1. bucket-of-chump
external_csvs.extend(glob.glob('/home/code/external_data/bucket-of-chump/*.csv'))

# 2. telegram-shared
external_csvs.extend(glob.glob('/home/code/external_data/telegram-shared/*.csv'))

# 3. santa25-public
external_csvs.extend(glob.glob('/home/code/external_data/santa25-public/*.csv'))

# 4. santa-2025-try3
external_csvs.extend(glob.glob('/home/code/external_data/santa-2025-try3/*.csv'))

# 5. SmartManoj GitHub (already downloaded)
if os.path.exists('/home/code/experiments/002_external_ensemble/smartmanoj_submission.csv'):
    external_csvs.append('/home/code/experiments/002_external_ensemble/smartmanoj_submission.csv')

print(f"Found {len(external_csvs)} external CSV files:")
for f in external_csvs:
    print(f"  {f}")

Collecting all CSV files from external datasets...
Found 22 external CSV files:
  /home/code/external_data/bucket-of-chump/submission.csv
  /home/code/external_data/telegram-shared/72.49.csv
  /home/code/external_data/telegram-shared/71.97.csv
  /home/code/external_data/santa25-public/submission_JKoT4.csv
  /home/code/external_data/santa25-public/New_Tree_144_196.csv
  /home/code/external_data/santa25-public/submission_JKoT3.csv
  /home/code/external_data/santa25-public/santa2025_ver2_v61.csv
  /home/code/external_data/santa25-public/submission_JKoT2.csv
  /home/code/external_data/santa25-public/santa2025_ver2_v67.csv
  /home/code/external_data/santa25-public/santa2025_ver2_v76.csv
  /home/code/external_data/santa25-public/submission_70_936673758122.csv
  /home/code/external_data/santa25-public/santa2025_ver2_v65.csv
  /home/code/external_data/santa25-public/submission_70_926149550346.csv
  /home/code/external_data/santa25-public/santa2025_ver2_v66.csv
  /home/code/external_data/santa2

In [3]:
# Load all submissions
submissions = {}

# Load external CSVs
print("\nLoading external submissions...")
for fp in external_csvs:
    try:
        df = pd.read_csv(fp)
        if {'id', 'x', 'y', 'deg'}.issubset(df.columns):
            df = parse_submission(df)
            name = os.path.basename(fp).replace('.csv', '')
            submissions[name] = df
            print(f"  Loaded: {name} ({len(df)} rows)")
    except Exception as e:
        print(f"  Failed: {fp} - {e}")

# Load current best ensemble
print("\nLoading current ensemble...")
df_current = pd.read_csv('/home/submission/submission.csv')
df_current = parse_submission(df_current)
submissions['current_ensemble'] = df_current

# Load all snapshots
print("\nLoading snapshot submissions...")
snapshot_dir = '/home/nonroot/snapshots/santa-2025'
for snap_id in os.listdir(snapshot_dir):
    sub_path = os.path.join(snapshot_dir, snap_id, 'submission', 'submission.csv')
    if os.path.exists(sub_path):
        try:
            df = pd.read_csv(sub_path)
            df = parse_submission(df)
            submissions[f'snapshot_{snap_id}'] = df
        except:
            pass

print(f"\nTotal submissions loaded: {len(submissions)}")


Loading external submissions...
  Loaded: submission (20100 rows)
  Loaded: 72.49 (20100 rows)


  Loaded: 71.97 (20100 rows)
  Loaded: submission_JKoT4 (20100 rows)
  Loaded: New_Tree_144_196 (20100 rows)
  Loaded: submission_JKoT3 (20100 rows)
  Loaded: santa2025_ver2_v61 (20100 rows)


  Loaded: submission_JKoT2 (20100 rows)
  Loaded: santa2025_ver2_v67 (20100 rows)
  Loaded: santa2025_ver2_v76 (20100 rows)
  Loaded: submission_70_936673758122 (20100 rows)


  Loaded: santa2025_ver2_v65 (20100 rows)
  Loaded: submission_70_926149550346 (20100 rows)
  Loaded: santa2025_ver2_v66 (20100 rows)
  Loaded: santa2025_ver2_v63 (20100 rows)


  Loaded: santa2025_ver2_v69 (20100 rows)
  Loaded: submission_JKoT1 (20100 rows)
  Loaded: submission_opt1 (20100 rows)
  Loaded: santa2025_ver2_v68 (20100 rows)


  Loaded: submission (20100 rows)
  Loaded: submission_sa (20100 rows)
  Loaded: smartmanoj_submission (20100 rows)

Loading current ensemble...

Loading snapshot submissions...



Total submissions loaded: 98


In [4]:
# Calculate per-N scores for all submissions
print("Calculating per-N scores for all submissions...")
all_per_n_scores = {}

for name, df in submissions.items():
    try:
        scores = calculate_per_n_scores(df)
        if len(scores) == 200:  # Valid submission
            all_per_n_scores[name] = scores
            total = sum(scores.values())
            print(f"  {name}: total={total:.6f}")
    except Exception as e:
        print(f"  {name}: ERROR - {e}")

print(f"\nValid submissions: {len(all_per_n_scores)}")

Calculating per-N scores for all submissions...


  submission: total=72.935294
  72.49: total=72.495739


  71.97: total=71.972027


  submission_JKoT4: total=72.489504


  New_Tree_144_196: total=72.927920


  submission_JKoT3: total=72.489488


  santa2025_ver2_v61: total=72.951925


  submission_JKoT2: total=72.489348


  santa2025_ver2_v67: total=72.938567


  santa2025_ver2_v76: total=72.826444


  submission_70_936673758122: total=70.936674


  santa2025_ver2_v65: total=72.935294


  submission_70_926149550346: total=70.926150


  santa2025_ver2_v66: total=72.938599


  santa2025_ver2_v63: total=72.947427


  santa2025_ver2_v69: total=72.850110


  submission_JKoT1: total=72.489483
  submission_opt1: total=70.990692


  santa2025_ver2_v68: total=72.939233
  submission_sa: total=72.935294


  smartmanoj_submission: total=70.743774


  current_ensemble: total=70.615745


  snapshot_21116303805: total=70.676102


  snapshot_21328309254: total=70.615745


  snapshot_21121776553: total=70.936674


  snapshot_21165872902: total=70.647306


  snapshot_21198893057: total=70.625918


  snapshot_21129617858: total=70.676764
  snapshot_20992536951: total=87.804045


  snapshot_21190224310: total=70.630465


  snapshot_21322576451: total=70.619825


  snapshot_21123768399: total=70.676102


  snapshot_20971964134: total=87.804045


  snapshot_21086827828: total=114.587809


  snapshot_21121942239: total=70.676102
  snapshot_21145965159: total=70.659944


  snapshot_21191207951: total=70.627608


  snapshot_21165876936: total=70.647306


  snapshot_20992150197: total=217.576225


  snapshot_21139436707: total=162.204811


  snapshot_21123763369: total=70.743774


  snapshot_21122904233: total=118.230882


  snapshot_20991308120: total=87.804045


  snapshot_21222392487: total=70.626088
  snapshot_21180221700: total=70.630478


  snapshot_21222390477: total=70.624381


  snapshot_20970671503: total=164.820045
  snapshot_21191209482: total=70.625918


  snapshot_21139436611: total=170.867211


  snapshot_21322577324: total=70.625376


  snapshot_21328308881: total=70.676102


  snapshot_21139436695: total=151.174322


  snapshot_21156852373: total=70.676102


  snapshot_20952569566: total=163.194569


  snapshot_21156853393: total=70.676102
  snapshot_21198891805: total=70.627582


  snapshot_21108486172: total=70.734327


  snapshot_21139436684: total=148.177124


  snapshot_21179744881: total=70.676102


  snapshot_21121943993: total=70.676102


  snapshot_21117626902: total=70.676145


  snapshot_21222373488: total=70.624381


  snapshot_21129619422: total=170.909275


  snapshot_21016257921: total=87.364112


  snapshot_21191212682: total=70.630455


  snapshot_21129622493: total=129.272924


  snapshot_21191206469: total=70.630455


  snapshot_20984924920: total=173.688052
  snapshot_21328310479: total=70.615745


  snapshot_21156851249: total=70.659437


  snapshot_21180223864: total=70.630429


  snapshot_21191211160: total=70.627582


  snapshot_21165878844: total=70.659436


  snapshot_21180219583: total=70.630478


  snapshot_21145966992: total=70.572798


  snapshot_21139436732: total=164.924007


  snapshot_21322576827: total=70.616145


  snapshot_21105319338: total=70.734327


  snapshot_21328309666: total=70.619825


  snapshot_21129625840: total=98.875886


  snapshot_21156850282: total=70.659437


  snapshot_21198925328: total=70.624381


  snapshot_21145968755: total=70.659959
  snapshot_21104669204: total=70.734327


  snapshot_21145961371: total=70.676102


  snapshot_21198928571: total=70.625918


  snapshot_21129620891: total=88.329998


  snapshot_21198790429: total=70.627582


  snapshot_21328310048: total=70.626088


  snapshot_21222377956: total=70.624381


  snapshot_21322578388: total=70.926150


  snapshot_21190222820: total=70.630455


  snapshot_21222375510: total=70.624381


  snapshot_21090949260: total=84.711359


  snapshot_21165874980: total=70.630478


  snapshot_21165870845: total=70.676102


  snapshot_21117525284: total=70.676104


  snapshot_21198927060: total=70.624381

Valid submissions: 98


In [None]:
# Find best VALID (no overlap) solution for each N
print("\nFinding best VALID solution for each N (checking overlaps)...")

best_valid_per_n = {}
best_valid_source = {}

for n in range(1, 201):
    best_score = float('inf')
    best_src = None
    
    # Check all submissions for this N
    for name, df in submissions.items():
        if name not in all_per_n_scores:
            continue
        
        group = df[df['n'] == n]
        if len(group) != n:
            continue
            
        xs = group['x_val'].values
        ys = group['y_val'].values
        degs = group['deg_val'].values
        
        # Check for overlaps
        if check_overlaps(xs, ys, degs):
            continue
        
        score = all_per_n_scores[name][n]
        if score < best_score:
            best_score = score
            best_src = name
    
    if best_src is not None:
        best_valid_per_n[n] = best_score
        best_valid_source[n] = best_src
    else:
        print(f"WARNING: No valid solution for N={n}!")
        # Use current ensemble as fallback
        best_valid_source[n] = 'current_ensemble'
        best_valid_per_n[n] = all_per_n_scores['current_ensemble'][n]

# Calculate total score
total_valid = sum(best_valid_per_n.values())
print(f"\nBest VALID ensemble total score: {total_valid:.6f}")
print(f"Target: 68.894234")
print(f"Gap to target: {total_valid - 68.894234:.6f}")

In [None]:
# Show improvements from external datasets
print("\nImprovements from external datasets:")
current_scores = all_per_n_scores.get('current_ensemble', {})
current_total = sum(current_scores.values())

improvements = []
for n in range(1, 201):
    if n in current_scores and n in best_valid_per_n:
        diff = current_scores[n] - best_valid_per_n[n]
        if diff > 1e-10:
            improvements.append((n, diff, best_valid_source[n]))

improvements.sort(key=lambda x: x[1], reverse=True)
print(f"Total N values improved: {len(improvements)}")
print(f"Total improvement: {current_total - total_valid:.6f}")

print("\nTop 30 improvements:")
for n, diff, src in improvements[:30]:
    print(f"  N={n}: improved by {diff:.6f} from {src}")

In [None]:
# Show source distribution
print("\nSource distribution for best solutions:")
source_counts = {}
for n, src in best_valid_source.items():
    source_counts[src] = source_counts.get(src, 0) + 1

for src, count in sorted(source_counts.items(), key=lambda x: -x[1]):
    print(f"  {src}: {count} N values")

In [None]:
# Create the valid ensemble submission
print("Creating valid ensemble submission...")

new_valid_rows = []
for n in range(1, 201):
    best_src = best_valid_source[n]
    df_best = submissions[best_src]
    group = df_best[df_best['n'] == n]
    for _, row in group.iterrows():
        new_valid_rows.append({
            'id': row['id'],
            'x': row['x'],
            'y': row['y'],
            'deg': row['deg']
        })

df_valid = pd.DataFrame(new_valid_rows)
print(f"Valid ensemble shape: {df_valid.shape}")

In [None]:
# Final verification - check no overlaps in valid ensemble
print("Final verification - checking no overlaps...")
df_valid_parsed = parse_submission(df_valid)

has_overlap = False
overlap_count = 0
for n in range(1, 201):
    group = df_valid_parsed[df_valid_parsed['n'] == n]
    xs = group['x_val'].values
    ys = group['y_val'].values
    degs = group['deg_val'].values
    
    if check_overlaps(xs, ys, degs):
        print(f"  WARNING: Overlap detected in N={n}!")
        has_overlap = True
        overlap_count += 1

if not has_overlap:
    print("  ✓ No overlaps detected - submission is valid!")
else:
    print(f"  ✗ {overlap_count} N values have overlaps!")

In [None]:
# Calculate and save final score
final_valid_scores = calculate_per_n_scores(df_valid_parsed)
final_valid_total = sum(final_valid_scores.values())
print(f"\nFinal VALID ensemble score: {final_valid_total:.6f}")
print(f"Previous best: {current_total:.6f}")
print(f"Improvement: {current_total - final_valid_total:.6f}")
print(f"Target: 68.894234")
print(f"Gap to target: {final_valid_total - 68.894234:.6f}")

# Save submission
df_valid.to_csv('/home/submission/submission.csv', index=False)
print("\nSaved to /home/submission/submission.csv")