# Loop 3 Analysis: Strategy Assessment

## Key Questions:
1. What public submissions are available for ensembling?
2. What is the best score achievable from existing submissions?
3. What techniques from top kernels haven't we tried yet?

In [1]:
import pandas as pd
import numpy as np
import os
import glob
from numba import njit
import math

# Tree polygon template
@njit
def make_polygon_template():
    tw=0.15; th=0.2; bw=0.7; mw=0.4; ow=0.25
    tip=0.8; t1=0.5; t2=0.25; base=0.0; tbot=-th
    x=np.array([0,ow/2,ow/4,mw/2,mw/4,bw/2,tw/2,tw/2,-tw/2,-tw/2,-bw/2,-mw/4,-mw/2,-ow/4,-ow/2],np.float64)
    y=np.array([tip,t1,t1,t2,t2,base,base,tbot,tbot,base,base,t2,t2,t1,t1],np.float64)
    return x,y

@njit
def score_group(xs,ys,degs,tx,ty):
    n=xs.size; V=tx.size
    mnx=1e300; mny=1e300; mxx=-1e300; mxy=-1e300
    for i in range(n):
        r=degs[i]*math.pi/180.0
        c=math.cos(r); s=math.sin(r)
        xi=xs[i]; yi=ys[i]
        for j in range(V):
            X=c*tx[j]-s*ty[j]+xi
            Y=s*tx[j]+c*ty[j]+yi
            if X<mnx: mnx=X
            if X>mxx: mxx=X
            if Y<mny: mny=Y
            if Y>mxy: mxy=Y
    side=max(mxx-mnx,mxy-mny)
    return side*side/n

def strip(a):
    return np.array([float(str(v).replace("s","")) for v in a],np.float64)

tx, ty = make_polygon_template()
print("Scoring functions ready")

Scoring functions ready


In [2]:
# Find all CSV files
all_csv = []
for root, dirs, files in os.walk('/home/code/datasets'):
    for f in files:
        if f.endswith('.csv'):
            all_csv.append(os.path.join(root, f))

print(f"Found {len(all_csv)} CSV files in datasets:")
for f in sorted(all_csv):
    print(f"  {f}")

Found 23 CSV files in datasets:
  /home/code/datasets/bucket-of-chump/submission.csv
  /home/code/datasets/santa-2025-csv/santa-2025.csv
  /home/code/datasets/santa-2025-try3/submission.csv
  /home/code/datasets/santa-2025-try3/submission_sa.csv
  /home/code/datasets/santa25-public/New_Tree_144_196.csv
  /home/code/datasets/santa25-public/santa2025_ver2_v61.csv
  /home/code/datasets/santa25-public/santa2025_ver2_v63.csv
  /home/code/datasets/santa25-public/santa2025_ver2_v65.csv
  /home/code/datasets/santa25-public/santa2025_ver2_v66.csv
  /home/code/datasets/santa25-public/santa2025_ver2_v67.csv
  /home/code/datasets/santa25-public/santa2025_ver2_v68.csv
  /home/code/datasets/santa25-public/santa2025_ver2_v69.csv
  /home/code/datasets/santa25-public/santa2025_ver2_v76.csv
  /home/code/datasets/santa25-public/submission_70_926149550346.csv
  /home/code/datasets/santa25-public/submission_70_936673758122.csv
  /home/code/datasets/santa25-public/submission_JKoT1.csv
  /home/code/datasets/

In [3]:
# Score each CSV file
results = []

for fp in all_csv:
    try:
        df = pd.read_csv(fp)
        if not {'id','x','y','deg'}.issubset(df.columns):
            continue
        
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
        total_score = 0
        n_count = 0
        
        for n, g in df.groupby('N'):
            if n < 1 or n > 200:
                continue
            xs = strip(g['x'].to_numpy())
            ys = strip(g['y'].to_numpy())
            ds = strip(g['deg'].to_numpy())
            if len(xs) != n:
                continue
            sc = score_group(xs, ys, ds, tx, ty)
            total_score += sc
            n_count += 1
        
        if n_count == 200:
            results.append({'file': fp, 'score': total_score})
            print(f"{fp.split('/')[-1]}: {total_score:.6f}")
    except Exception as e:
        pass

results_df = pd.DataFrame(results).sort_values('score')
print(f"\nTop 10 submissions:")
print(results_df.head(10))

smartmanoj_submission.csv: 70.743774


submission_JKoT4.csv: 72.489504


New_Tree_144_196.csv: 72.927920
submission_JKoT3.csv: 72.489488


santa2025_ver2_v61.csv: 72.951925


submission_JKoT2.csv: 72.489348


santa2025_ver2_v67.csv: 72.938567
santa2025_ver2_v76.csv: 72.826444


submission_70_936673758122.csv: 70.936674


santa2025_ver2_v65.csv: 72.935294


submission_70_926149550346.csv: 70.926150
santa2025_ver2_v66.csv: 72.938599


santa2025_ver2_v63.csv: 72.947427


santa2025_ver2_v69.csv: 72.850110


submission_JKoT1.csv: 72.489483
submission_opt1.csv: 70.990692
santa2025_ver2_v68.csv: 72.939233


santa-2025.csv: 70.734327


submission.csv: 70.750676
72.49.csv: 72.495739
71.97.csv: 71.972027


submission.csv: 72.935294


submission_sa.csv: 72.935294

Top 10 submissions:
                                                 file      score
17  /home/code/datasets/santa-2025-csv/santa-2025.csv  70.734327
0       /home/code/datasets/smartmanoj_submission.csv  70.743774
18  /home/code/datasets/bucket-of-chump/submission...  70.750676
10  /home/code/datasets/santa25-public/submission_...  70.926150
8   /home/code/datasets/santa25-public/submission_...  70.936674
15  /home/code/datasets/santa25-public/submission_...  70.990692
20      /home/code/datasets/telegram-public/71.97.csv  71.972027
5   /home/code/datasets/santa25-public/submission_...  72.489348
14  /home/code/datasets/santa25-public/submission_...  72.489483
3   /home/code/datasets/santa25-public/submission_...  72.489488


In [4]:
# Now let's do proper ensembling - pick best N from all sources
best = {n: {'score': 1e300, 'data': None, 'src': None} for n in range(1, 201)}

for fp in all_csv:
    try:
        df = pd.read_csv(fp)
        if not {'id','x','y','deg'}.issubset(df.columns):
            continue
        
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
        
        for n, g in df.groupby('N'):
            if n < 1 or n > 200:
                continue
            xs = strip(g['x'].to_numpy())
            ys = strip(g['y'].to_numpy())
            ds = strip(g['deg'].to_numpy())
            if len(xs) != n:
                continue
            sc = score_group(xs, ys, ds, tx, ty)
            if sc < best[n]['score']:
                best[n]['score'] = sc
                best[n]['data'] = g.drop(columns=['N']).copy()
                best[n]['src'] = fp.split('/')[-1]
    except Exception as e:
        pass

# Calculate ensemble score
ensemble_score = sum(best[n]['score'] for n in range(1, 201))
print(f"\nBest ensemble score from all sources: {ensemble_score:.10f}")
print(f"Current best: 70.734327013")
print(f"Target: 68.931058")
print(f"Gap to target: {ensemble_score - 68.931058:.6f}")

# Show source distribution
from collections import Counter
sources = Counter(best[n]['src'] for n in range(1, 201))
print(f"\nSource distribution:")
for src, count in sources.most_common():
    print(f"  {src}: {count} N values")


Best ensemble score from all sources: 70.7343270130
Current best: 70.734327013
Target: 68.931058
Gap to target: 1.803269

Source distribution:
  santa-2025.csv: 195 N values
  smartmanoj_submission.csv: 5 N values


In [5]:
# Identify N values where we're furthest from optimal
print("\nWorst N values (highest contribution to score):")
n_scores = [(n, best[n]['score'], best[n]['src']) for n in range(1, 201)]
n_scores.sort(key=lambda x: -x[1])  # Sort by score descending

for n, score, src in n_scores[:20]:
    print(f"  N={n:3d}: score={score:.6f} (src: {src})")


Worst N values (highest contribution to score):
  N=  1: score=0.661250 (src: smartmanoj_submission.csv)
  N=  2: score=0.450779 (src: smartmanoj_submission.csv)
  N=  3: score=0.434745 (src: smartmanoj_submission.csv)
  N=  5: score=0.416850 (src: santa-2025.csv)
  N=  4: score=0.416545 (src: smartmanoj_submission.csv)
  N=  7: score=0.399897 (src: smartmanoj_submission.csv)
  N=  6: score=0.399610 (src: santa-2025.csv)
  N=  9: score=0.387415 (src: santa-2025.csv)
  N=  8: score=0.385407 (src: santa-2025.csv)
  N= 15: score=0.379203 (src: santa-2025.csv)
  N= 10: score=0.376630 (src: santa-2025.csv)
  N= 21: score=0.376451 (src: santa-2025.csv)
  N= 20: score=0.376057 (src: santa-2025.csv)
  N= 11: score=0.375736 (src: santa-2025.csv)
  N= 22: score=0.375258 (src: santa-2025.csv)
  N= 16: score=0.374128 (src: santa-2025.csv)
  N= 26: score=0.373997 (src: santa-2025.csv)
  N= 12: score=0.372724 (src: santa-2025.csv)
  N= 13: score=0.372323 (src: santa-2025.csv)
  N= 25: score=0.37214

In [6]:
# Save the ensemble submission
rows = []
for n in range(1, 201):
    if best[n]['data'] is not None:
        rows.append(best[n]['data'])

ensemble_df = pd.concat(rows, ignore_index=True)
ensemble_df['sn'] = ensemble_df['id'].str.split('_').str[0].astype(int)
ensemble_df['si'] = ensemble_df['id'].str.split('_').str[1].astype(int)
ensemble_df = ensemble_df.sort_values(['sn', 'si']).drop(columns=['sn', 'si'])
ensemble_df = ensemble_df[['id', 'x', 'y', 'deg']]

ensemble_path = '/home/code/experiments/003_periodic_structures/ensemble_all.csv'
ensemble_df.to_csv(ensemble_path, index=False)
print(f"Saved ensemble to {ensemble_path}")
print(f"Ensemble score: {ensemble_score:.10f}")

Saved ensemble to /home/code/experiments/003_periodic_structures/ensemble_all.csv
Ensemble score: 70.7343270130


In [7]:
# Check if there are any telegram submissions we haven't looked at
telegram_files = glob.glob('/home/code/datasets/telegram-public/*.csv')
print(f"\nTelegram public submissions: {len(telegram_files)}")
for f in telegram_files[:10]:
    print(f"  {f}")


Telegram public submissions: 2
  /home/code/datasets/telegram-public/72.49.csv
  /home/code/datasets/telegram-public/71.97.csv


In [8]:
# Check santa-2025-try3 dataset
try3_files = glob.glob('/home/code/datasets/santa-2025-try3/*.csv')
print(f"\nsanta-2025-try3 submissions: {len(try3_files)}")
for f in try3_files[:10]:
    print(f"  {f}")


santa-2025-try3 submissions: 2
  /home/code/datasets/santa-2025-try3/submission.csv
  /home/code/datasets/santa-2025-try3/submission_sa.csv


In [9]:
# Key insight: What's the theoretical minimum?
# For N trees, the minimum bounding box is limited by the tree size
# Tree dimensions: width ~0.7, height ~1.0 (from -0.2 to 0.8)

# Let's calculate what the best possible score might be
print("Analysis of score components:")
print("="*50)

# Small N (1-20) - these are well-optimized
small_n_score = sum(best[n]['score'] for n in range(1, 21))
print(f"N=1-20 contribution: {small_n_score:.6f} ({small_n_score/ensemble_score*100:.1f}%)")

# Medium N (21-60)
medium_n_score = sum(best[n]['score'] for n in range(21, 61))
print(f"N=21-60 contribution: {medium_n_score:.6f} ({medium_n_score/ensemble_score*100:.1f}%)")

# Large N (61-200)
large_n_score = sum(best[n]['score'] for n in range(61, 201))
print(f"N=61-200 contribution: {large_n_score:.6f} ({large_n_score/ensemble_score*100:.1f}%)")

print(f"\nTotal: {ensemble_score:.6f}")
print(f"Target: 68.931058")
print(f"Need to save: {ensemble_score - 68.931058:.6f} points")

Analysis of score components:
N=1-20 contribution: 8.057838 (11.4%)
N=21-60 contribution: 14.585859 (20.6%)
N=61-200 contribution: 48.090629 (68.0%)

Total: 70.734327
Target: 68.931058
Need to save: 1.803269 points
