# Loop 3 Analysis: Strategy Assessment

## Key Questions:
1. What public submissions are available for ensembling?
2. What is the best score achievable from existing submissions?
3. What techniques from top kernels haven't we tried yet?

In [None]:
import pandas as pd
import numpy as np
import os
import glob
from numba import njit
import math

# Tree polygon template
@njit
def make_polygon_template():
    """Return the vertex coordinates of the unrotated tree outline.

    The outline has 15 vertices. It starts at the tip, runs down the
    right-hand side (positive x), crosses the bottom of the trunk and comes
    back up the left-hand side (negative x).

    Returns:
        A pair ``(x, y)`` of float64 arrays holding the vertex coordinates.
    """
    # Horizontal extents of the trunk and of the three foliage tiers.
    trunk_w = 0.15
    trunk_h = 0.2
    base_w = 0.7
    mid_w = 0.4
    top_w = 0.25

    # Vertical levels, from the tip down to the bottom of the trunk.
    tip_y = 0.8
    tier1_y = 0.5
    tier2_y = 0.25
    base_y = 0.0
    trunk_bottom_y = -trunk_h

    x = np.array([
        0.0,
        top_w / 2, top_w / 4,
        mid_w / 2, mid_w / 4,
        base_w / 2,
        trunk_w / 2, trunk_w / 2,
        -trunk_w / 2, -trunk_w / 2,
        -base_w / 2,
        -mid_w / 4, -mid_w / 2,
        -top_w / 4, -top_w / 2,
    ], np.float64)
    y = np.array([
        tip_y,
        tier1_y, tier1_y,
        tier2_y, tier2_y,
        base_y,
        base_y, trunk_bottom_y,
        trunk_bottom_y, base_y,
        base_y,
        tier2_y, tier2_y,
        tier1_y, tier1_y,
    ], np.float64)
    return x, y

@njit
def score_group(xs, ys, degs, tx, ty):
    """Score one group of placed trees.

    Every tree is the template polygon ``(tx, ty)`` rotated by ``degs[i]``
    degrees (standard rotation matrix) and then shifted by
    ``(xs[i], ys[i])``. The axis-aligned bounding box of all transformed
    vertices is found, and the score is the square of its longer side
    divided by the number of trees.

    Args:
        xs, ys: per-tree translation offsets.
        degs: per-tree rotation angles in degrees.
        tx, ty: template polygon vertex coordinates.

    Returns:
        ``side * side / n`` where ``n = xs.size``. The group is expected to
        contain at least one tree; ``n == 0`` would divide by zero.
    """
    count = xs.size
    n_vertices = tx.size

    # Sentinels that any real coordinate will replace on first comparison.
    min_x = 1e300
    min_y = 1e300
    max_x = -1e300
    max_y = -1e300

    for k in range(count):
        theta = degs[k]*math.pi/180.0
        cos_t = math.cos(theta)
        sin_t = math.sin(theta)
        off_x = xs[k]
        off_y = ys[k]
        for v in range(n_vertices):
            vx = tx[v]
            vy = ty[v]
            px = cos_t*vx-sin_t*vy+off_x
            py = sin_t*vx+cos_t*vy+off_y
            if px < min_x:
                min_x = px
            if px > max_x:
                max_x = px
            if py < min_y:
                min_y = py
            if py > max_y:
                max_y = py

    # The enclosing square must cover the longer of the two extents.
    side = max(max_x-min_x, max_y-min_y)
    return side*side/count

def strip(a):
    """Parse an iterable of submission values into a float64 array.

    Each element is converted to ``str``, every ``"s"`` character is removed
    (so ``"s1.25"`` becomes ``"1.25"``), and the remainder is parsed with
    ``float``.
    """
    parsed = []
    for raw in a:
        text = str(raw)
        parsed.append(float(text.replace("s", "")))
    return np.array(parsed, np.float64)

# Build the tree outline once at module level; the scoring cells pass tx/ty
# to score_group for every group they evaluate.
tx, ty = make_polygon_template()
print("Scoring functions ready")

In [None]:
# Walk the datasets directory recursively and collect the path of every file
# whose name ends in ".csv".
all_csv = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/home/code/datasets')
    for name in names
    if name.endswith('.csv')
]

print(f"Found {len(all_csv)} CSV files in datasets:")
for csv_path in sorted(all_csv):
    print(f"  {csv_path}")

In [None]:
# Score each CSV file: sum the per-N group scores and keep only files that
# provide a valid group for all 200 values of N.
results = []

for fp in all_csv:
    try:
        df = pd.read_csv(fp)
        if not {'id','x','y','deg'}.issubset(df.columns):
            continue

        # Row ids are "<N>_<index>"; the part before "_" is the group size N.
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
        total_score = 0.0
        n_count = 0

        for n, g in df.groupby('N'):
            if n < 1 or n > 200:
                continue
            xs = strip(g['x'].to_numpy())
            ys = strip(g['y'].to_numpy())
            ds = strip(g['deg'].to_numpy())
            # A group for N must contain exactly N rows to be scorable.
            if len(xs) != n:
                continue
            total_score += score_group(xs, ys, ds, tx, ty)
            n_count += 1

        if n_count == 200:
            results.append({'file': fp, 'score': total_score})
            print(f"{os.path.basename(fp)}: {total_score:.6f}")
    except Exception as exc:
        # Best effort: one unreadable or malformed file must not abort the
        # scan, but report it instead of hiding the failure.
        print(f"Skipping {os.path.basename(fp)}: {type(exc).__name__}: {exc}")

# Naming the columns explicitly keeps sort_values('score') valid even when no
# file qualified and `results` is empty.
results_df = pd.DataFrame(results, columns=['file', 'score']).sort_values('score')
print("\nTop 10 submissions:")
print(results_df.head(10))

In [None]:
from collections import Counter

# Ensemble: for every N keep the lowest-scoring group found in any source CSV.
# 'score' starts at a 1e300 placeholder so the first valid group always wins.
best = {n: {'score': 1e300, 'data': None, 'src': None} for n in range(1, 201)}

for fp in all_csv:
    try:
        df = pd.read_csv(fp)
        if not {'id','x','y','deg'}.issubset(df.columns):
            continue

        # Row ids are "<N>_<index>"; the part before "_" is the group size N.
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)

        for n, g in df.groupby('N'):
            if n < 1 or n > 200:
                continue
            xs = strip(g['x'].to_numpy())
            ys = strip(g['y'].to_numpy())
            ds = strip(g['deg'].to_numpy())
            # A group for N must contain exactly N rows to be scorable.
            if len(xs) != n:
                continue
            sc = score_group(xs, ys, ds, tx, ty)
            if sc < best[n]['score']:
                best[n]['score'] = sc
                # Store the original rows (without the helper column) so the
                # saved submission keeps the source formatting.
                best[n]['data'] = g.drop(columns=['N']).copy()
                best[n]['src'] = os.path.basename(fp)
    except Exception as exc:
        # Best effort: groups accepted before the failure are kept; the rest
        # of the file is skipped, and the failure is reported.
        print(f"Skipping rest of {os.path.basename(fp)}: {type(exc).__name__}: {exc}")

# An N with no candidate still carries the 1e300 placeholder, which would
# silently dominate the total; make that visible.
missing = [n for n in range(1, 201) if best[n]['data'] is None]
if missing:
    print(f"WARNING: no valid group for {len(missing)} N values "
          f"(first few: {missing[:10]}); their 1e300 placeholder scores "
          f"are included in the total below.")

# Calculate ensemble score
ensemble_score = sum(best[n]['score'] for n in range(1, 201))
print(f"\nBest ensemble score from all sources: {ensemble_score:.10f}")
print("Current best: 70.734327013")
print("Target: 68.931058")
print(f"Gap to target: {ensemble_score - 68.931058:.6f}")

# Show how many N values each source file contributed.
sources = Counter(best[n]['src'] for n in range(1, 201))
print("\nSource distribution:")
for src, count in sources.most_common():
    print(f"  {src}: {count} N values")

In [None]:
# Rank the ensemble's per-N scores from largest to smallest and show the 20
# values of N that contribute most to the total.
print("\nWorst N values (highest contribution to score):")
n_scores = sorted(
    ((n, best[n]['score'], best[n]['src']) for n in range(1, 201)),
    key=lambda entry: -entry[1],  # negate to get descending order
)

for size, contribution, origin in n_scores[:20]:
    print(f"  N={size:3d}: score={contribution:.6f} (src: {origin})")

In [None]:
# Save the ensemble submission: concatenate the best group for every N,
# order rows by (N, index within group) and write the four submission columns.
rows = [best[n]['data'] for n in range(1, 201) if best[n]['data'] is not None]
if not rows:
    # pd.concat would fail on an empty list with a less helpful message.
    raise ValueError("No valid groups were collected; there is nothing to save")
if len(rows) < 200:
    print(f"WARNING: only {len(rows)} of 200 N values have data; "
          f"the saved submission will be incomplete.")

ensemble_df = pd.concat(rows, ignore_index=True)

# Ids are "<N>_<index>"; cast to str first, as the scoring cells do, so the
# .str accessor works whatever dtype pandas inferred for the column.
id_parts = ensemble_df['id'].astype(str).str.split('_')
ensemble_df = (
    ensemble_df
    .assign(sn=id_parts.str[0].astype(int), si=id_parts.str[1].astype(int))
    .sort_values(['sn', 'si'])
    .drop(columns=['sn', 'si'])
)
ensemble_df = ensemble_df[['id', 'x', 'y', 'deg']]

ensemble_path = '/home/code/experiments/003_periodic_structures/ensemble_all.csv'
# to_csv does not create missing parent directories.
os.makedirs(os.path.dirname(ensemble_path), exist_ok=True)
ensemble_df.to_csv(ensemble_path, index=False)
print(f"Saved ensemble to {ensemble_path}")
print(f"Ensemble score: {ensemble_score:.10f}")

In [None]:
# List the CSVs directly inside the telegram-public dataset folder and print
# at most the first ten paths.
telegram_files = glob.glob('/home/code/datasets/telegram-public/*.csv')
print(f"\nTelegram public submissions: {len(telegram_files)}")
telegram_preview = telegram_files[:10]
for csv_path in telegram_preview:
    print(f"  {csv_path}")

In [None]:
# List the CSVs directly inside the santa-2025-try3 dataset folder and print
# at most the first ten paths.
try3_files = glob.glob('/home/code/datasets/santa-2025-try3/*.csv')
print(f"\nsanta-2025-try3 submissions: {len(try3_files)}")
try3_preview = try3_files[:10]
for csv_path in try3_preview:
    print(f"  {csv_path}")

In [None]:
# Break the ensemble score down by ranges of N to see where the remaining
# gap to the target sits.
# Template dimensions for reference: width ~0.7, height ~1.0 (y from -0.2 to 0.8).

print("Analysis of score components:")
print("="*50)

# Partial sums over three bands of N: 1-20, 21-60 and 61-200.
small_n_score = sum(best[n]['score'] for n in range(1, 21))
medium_n_score = sum(best[n]['score'] for n in range(21, 61))
large_n_score = sum(best[n]['score'] for n in range(61, 201))

# Print each band's absolute contribution and its share of the total.
bands = (
    ("N=1-20", small_n_score),
    ("N=21-60", medium_n_score),
    ("N=61-200", large_n_score),
)
for label, part in bands:
    print(f"{label} contribution: {part:.6f} ({part/ensemble_score*100:.1f}%)")

print(f"\nTotal: {ensemble_score:.6f}")
print(f"Target: 68.931058")
print(f"Need to save: {ensemble_score - 68.931058:.6f} points")