# External Ensemble - Access External Data Sources

Access external data sources to find better solutions for specific N values:
1. SmartManoj GitHub submission
2. Kaggle datasets (bucket-of-chump, santa25-public, telegram-shared)
3. Kernel outputs

In [1]:
import numpy as np
import pandas as pd
import os
import json
from shapely.geometry import Polygon
from shapely import STRtree

# NOTE(review): hard-coded absolute working directory. Relative paths used later in
# this notebook ('smartmanoj_submission.csv', 'metrics.json') resolve against it.
os.chdir('/home/code/experiments/002_external_ensemble')

# Tree vertices
# Outline of a single tree in its own local frame (before rotation/translation):
# 15 points, TX[j] paired with TY[j]. get_tree_polygon and score_group consume the
# points in index order, so the ordering here defines the polygon ring.
TX = np.array([0,0.125,0.0625,0.2,0.1,0.35,0.075,0.075,-0.075,-0.075,-0.35,-0.1,-0.2,-0.0625,-0.125])
TY = np.array([0.8,0.5,0.5,0.25,0.25,0,0,-0.2,-0.2,0,0,0.25,0.25,0.5,0.5])

def get_tree_polygon(x, y, deg):
    """Build the Shapely polygon for one tree.

    The template outline (TX, TY) is rotated by `deg` degrees about the local
    origin and then translated by (x, y).

    Args:
        x: Translation applied to every vertex along the x axis.
        y: Translation applied to every vertex along the y axis.
        deg: Rotation angle in degrees (converted with np.radians).

    Returns:
        A shapely Polygon whose ring follows the vertex order of TX/TY.
    """
    theta = np.radians(deg)
    cos_t = np.cos(theta)
    sin_t = np.sin(theta)
    # Standard 2-D rotation followed by translation, one template vertex at a time.
    outline = [
        (px * cos_t - py * sin_t + x, px * sin_t + py * cos_t + y)
        for px, py in zip(TX, TY)
    ]
    return Polygon(outline)

def score_group(xs, ys, degs):
    """Score one N-tree configuration.

    Every tree's template vertices are rotated and translated, the axis-aligned
    bounding box of all resulting points is measured, and the score is the
    square of the longer box side divided by the number of trees.

    Args:
        xs: Per-tree x translations; its length defines the tree count.
        ys: Per-tree y translations, indexed in step with `xs`.
        degs: Per-tree rotations in degrees, indexed in step with `xs`.

    Returns:
        side ** 2 / len(xs), where side is the larger of the bounding-box
        width and height.
    """
    count = len(xs)
    corners_x = []
    corners_y = []
    for k in range(count):
        theta = np.radians(degs[k])
        cos_t, sin_t = np.cos(theta), np.sin(theta)
        # Same rotate-then-translate transform as get_tree_polygon.
        corners_x.extend(vx * cos_t - vy * sin_t + xs[k] for vx, vy in zip(TX, TY))
        corners_y.extend(vx * sin_t + vy * cos_t + ys[k] for vx, vy in zip(TX, TY))
    width = max(corners_x) - min(corners_x)
    height = max(corners_y) - min(corners_y)
    side = max(width, height)
    return side * side / count

def check_overlaps(xs, ys, degs):
    """Report whether any two trees in a configuration overlap.

    Builds one polygon per tree, indexes them in an STRtree, and tests each
    candidate pair once. Two trees count as overlapping only when their
    intersection has strictly positive area.

    Args:
        xs: Per-tree x translations; its length defines the tree count.
        ys: Per-tree y translations, indexed in step with `xs`.
        degs: Per-tree rotations in degrees, indexed in step with `xs`.

    Returns:
        True as soon as one overlapping pair is found, otherwise False.
    """
    shapes = [get_tree_polygon(xs[k], ys[k], degs[k]) for k in range(len(xs))]
    spatial_index = STRtree(shapes)
    for a, shape in enumerate(shapes):
        # The query results are used as positions into `shapes`.
        for b in spatial_index.query(shape):
            if b <= a:
                # Skip self-matches and pairs already tested from the other side.
                continue
            if shape.intersection(shapes[b]).area > 0:
                return True
    return False

def parse_submission(df):
    """Return a copy of a submission frame with numeric helper columns added.

    The first character of each `x`, `y` and `deg` string is dropped and the
    remainder parsed as float into `x_val`, `y_val` and `deg_val`. The part of
    `id` before the first underscore is parsed as int into `n`. The input
    frame is not modified.

    Args:
        df: DataFrame with string columns `id`, `x`, `y` and `deg`.

    Returns:
        A new DataFrame with the original columns followed by `x_val`,
        `y_val`, `deg_val` and `n`.
    """
    def numeric_part(column):
        # Drop the one-character prefix, then convert the rest to float.
        return df[column].str[1:].astype(float)

    return df.assign(
        x_val=numeric_part('x'),
        y_val=numeric_part('y'),
        deg_val=numeric_part('deg'),
        n=df['id'].str.split('_').str[0].astype(int),
    )

def calculate_per_n_scores(df):
    """Score every complete N-tree group in a parsed submission.

    For each N from 1 to 200 the rows with `n == N` are selected. A group is
    scored only when it contains exactly N rows; other groups are left out of
    the result.

    Args:
        df: Parsed submission with `n`, `x_val`, `y_val` and `deg_val` columns.

    Returns:
        Dict mapping N to the score_group result for that group.
    """
    scores = {}
    for n in range(1, 201):
        rows = df[df['n'] == n]
        if len(rows) != n:
            # Missing or surplus trees: this N is not scoreable.
            continue
        scores[n] = score_group(
            rows['x_val'].values,
            rows['y_val'].values,
            rows['deg_val'].values,
        )
    return scores

print("Functions defined.")

Functions defined.


In [2]:
# Try to download SmartManoj GitHub submission
# Best-effort: on any failure df_smartmanoj is set to None and later cells skip it.
import shutil
import urllib.request

smartmanoj_url = 'https://raw.githubusercontent.com/SmartManoj/Santa-Scoreboard/main/submission.csv'
# urlretrieve offers no timeout, so a stalled connection would block Run All forever.
DOWNLOAD_TIMEOUT_S = 30

try:
    print(f"Downloading from {smartmanoj_url}...")
    # Stream the response to disk; both handles are closed even if the copy fails.
    with urllib.request.urlopen(smartmanoj_url, timeout=DOWNLOAD_TIMEOUT_S) as response:
        with open('smartmanoj_submission.csv', 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
    print("Downloaded SmartManoj submission!")

    df_smartmanoj = pd.read_csv('smartmanoj_submission.csv')
    print(f"Shape: {df_smartmanoj.shape}")
    print(df_smartmanoj.head())
except Exception as e:
    # Covers network errors, timeouts and unreadable CSV content alike.
    print(f"Failed to download: {e}")
    df_smartmanoj = None

Downloading from https://raw.githubusercontent.com/SmartManoj/Santa-Scoreboard/main/submission.csv...
Downloaded SmartManoj submission!
Shape: (20100, 4)
      id                       x                      y  \
0  001_0  s-48.19608619421424578  s58.77098461521422479   
1  002_0    s0.15409706962136058  s-0.03854074269477708   
2  002_1   s-0.15409706962135647  s-0.56145925730522794   
3  003_0    s1.12365581614030097   s0.78110181599256301   
4  003_1    s1.23405569584216002   s1.27599950066375900   

                      deg  
0   s45.00000000000000000  
1  s203.62937773064953717  
2   s23.62937773064970415  
3  s111.12513229289299943  
4   s66.37062226934300213  


In [3]:
# Check for Kaggle datasets in research folder
# Lists candidate submission files, then every CSV one level below the kernels folder.
import glob

print("Checking research kernels for submission files...")
kernel_paths = glob.glob('/home/code/research/kernels/*/submission*.csv')
print(f"Found {len(kernel_paths)} submission files in kernels:")
for kernel_csv in kernel_paths:
    print(f"  {kernel_csv}")

# Also check for any CSV files that might be submissions
all_csvs = glob.glob('/home/code/research/kernels/*/*.csv')
print(f"\nAll CSV files in kernels ({len(all_csvs)}):")
# Only the first 20 paths are printed to keep the output short.
csv_preview = all_csvs[:20]
for csv_path in csv_preview:
    print(f"  {csv_path}")

Checking research kernels for submission files...
Found 0 submission files in kernels:

All CSV files in kernels (0):


In [4]:
# Check for datasets in data folder
# Lists the CSV files directly under /home/data.
print("Checking data folder...")
data_csvs = glob.glob('/home/data/*.csv')
print(f"Found {len(data_csvs)} CSV files in data:")
for csv_path in data_csvs:
    print(f"  {csv_path}")

# Check for any additional data sources
print("\nChecking for Kaggle input datasets...")
if not os.path.exists('/kaggle/input'):
    print("  /kaggle/input not found")
else:
    # One line per entry found directly under /kaggle/input.
    for entry in os.listdir('/kaggle/input'):
        print(f"  /kaggle/input/{entry}")

Checking data folder...
Found 1 CSV files in data:
  /home/data/sample_submission.csv

Checking for Kaggle input datasets...
  /kaggle/input not found


In [None]:
# Load all available submissions and score them
# `submissions` maps a source name to its parsed DataFrame. Insertion order matters:
# later cells break score ties in favour of the source inserted first.
submissions = {}

# 1. Current best ensemble from snapshots
# This one is required: a failure here is allowed to stop the notebook.
print("Loading current ensemble submission...")
df_current = parse_submission(pd.read_csv('/home/submission/submission.csv'))
submissions['current_ensemble'] = df_current
print(f"  Current ensemble loaded: {df_current.shape}")

# 2. SmartManoj if available
if df_smartmanoj is not None:
    try:
        df_sm = parse_submission(df_smartmanoj)
        submissions['smartmanoj'] = df_sm
        print(f"  SmartManoj loaded: {df_sm.shape}")
    except Exception as e:
        print(f"  Failed to parse SmartManoj: {e}")

# 3. Load all snapshot submissions
print("\nLoading snapshot submissions...")
snapshot_dir = '/home/nonroot/snapshots/santa-2025'
if os.path.isdir(snapshot_dir):
    # os.listdir order is arbitrary; sorting makes the load order (and therefore
    # tie-breaking between snapshots) reproducible across runs.
    snapshot_ids = sorted(os.listdir(snapshot_dir))
else:
    print(f"  Snapshot directory not found: {snapshot_dir}")
    snapshot_ids = []

for snap_id in snapshot_ids:
    sub_path = os.path.join(snapshot_dir, snap_id, 'submission', 'submission.csv')
    if not os.path.exists(sub_path):
        continue
    try:
        df_snapshot = parse_submission(pd.read_csv(sub_path))
    except Exception as e:
        # Still best-effort, but report the skip instead of hiding it; unlike a
        # bare `except:`, this no longer swallows KeyboardInterrupt.
        print(f"  Skipping snapshot {snap_id}: {e}")
        continue
    submissions[f'snapshot_{snap_id}'] = df_snapshot

print(f"Total submissions loaded: {len(submissions)}")

In [None]:
# Calculate per-N scores for all submissions
# Only submissions that yield a score for all 200 N values are kept; a source that
# raises while being scored is reported and skipped.
print("Calculating per-N scores for all submissions...")
all_per_n_scores = {}

for name, df in submissions.items():
    try:
        scores = calculate_per_n_scores(df)
        if len(scores) != 200:
            # Incomplete submission: not usable as an ensemble source.
            continue
        all_per_n_scores[name] = scores
        total = sum(scores.values())
        print(f"  {name}: total={total:.6f}")
    except Exception as e:
        print(f"  {name}: ERROR - {e}")

print(f"\nValid submissions: {len(all_per_n_scores)}")

In [None]:
# Find best score for each N across all submissions
# A lower score is only useful if the configuration is legal, so candidates are
# tried from best to worst score and the first one without overlapping trees wins.
print("Finding best score for each N...")
best_per_n = {}
best_source = {}


def _config_is_valid(source_name, n):
    """Return True when `source_name`'s N=n configuration has no overlapping trees."""
    group = submissions[source_name]
    group = group[group['n'] == n]
    return not check_overlaps(
        group['x_val'].values, group['y_val'].values, group['deg_val'].values
    )


for n in range(1, 201):
    # `< inf` also filters out NaN scores, which would make the ranking undefined.
    candidates = [
        name for name, scores in all_per_n_scores.items()
        if n in scores and scores[n] < float('inf')
    ]
    # list.sort is stable: on equal scores the source inserted first stays first.
    candidates.sort(key=lambda name: all_per_n_scores[name][n])

    if not candidates:
        # Nothing usable for this N; leave the sentinel values in place.
        print(f"  WARNING: no scored source available for N={n}")
        best_per_n[n] = float('inf')
        best_source[n] = None
        continue

    chosen = next((name for name in candidates if _config_is_valid(name, n)), None)
    if chosen is None:
        # Every candidate overlaps: fall back to pure best-score selection so the
        # later verification cell can report the problem.
        chosen = candidates[0]
        print(f"  WARNING: every candidate for N={n} has overlaps; keeping {chosen}")

    best_per_n[n] = all_per_n_scores[chosen][n]
    best_source[n] = chosen

# Calculate total score
total_best = sum(best_per_n.values())
print(f"\nBest ensemble total score: {total_best:.6f}")

# Compare to current ensemble
current_scores = all_per_n_scores.get('current_ensemble', {})
if current_scores:
    current_total = sum(current_scores.values())
    print(f"Current ensemble total: {current_total:.6f}")
    print(f"Improvement: {current_total - total_best:.6f}")
else:
    # Without a baseline, an "improvement" measured against 0 would be meaningless.
    current_total = float('nan')
    print("Current ensemble total: unavailable (current ensemble was not scored)")

In [None]:
# Show which N values improved and from which source
# An N counts as improved when the selected score beats the current ensemble's
# score by more than 1e-10.
print("\nN values with improvements:")
improvements = [
    (n, current_scores[n] - best_per_n[n], best_source[n])
    for n in range(1, 201)
    if n in current_scores
    and n in best_per_n
    and current_scores[n] - best_per_n[n] > 1e-10
]

# Largest gain first.
improvements.sort(key=lambda entry: entry[1], reverse=True)
print(f"Total N values improved: {len(improvements)}")
print("\nTop 20 improvements:")
for n, gain, source in improvements[:20]:
    print(f"  N={n}: improved by {gain:.6f} from {source}")

In [None]:
# Create new ensemble submission by taking best solution for each N
# The original string columns are copied unchanged from the chosen source for each N.
print("Creating new ensemble submission...")

SUBMISSION_COLUMNS = ['id', 'x', 'y', 'deg']

best_groups = []
for n in range(1, 201):
    best_src = best_source[n]
    if best_src is None:
        # Fail with a readable message instead of an opaque `KeyError: None`.
        raise ValueError(f"No source submission available for N={n}")
    df_best = submissions[best_src]
    # Slice whole groups rather than iterating row by row.
    best_groups.append(df_best.loc[df_best['n'] == n, SUBMISSION_COLUMNS])

# A single concat yields a fresh 0..len-1 index, the same as building from row dicts.
df_new = pd.concat(best_groups, ignore_index=True)
print(f"New ensemble shape: {df_new.shape}")
print(df_new.head())

In [None]:
# Verify no overlaps in new ensemble
# Re-parses the assembled ensemble and runs the overlap check for every N;
# `has_overlap` ends up True if at least one group contains overlapping trees.
print("Verifying no overlaps in new ensemble...")
df_new_parsed = parse_submission(df_new)

has_overlap = False
for n in range(1, 201):
    trees = df_new_parsed[df_new_parsed['n'] == n]
    overlapping = check_overlaps(
        trees['x_val'].values,
        trees['y_val'].values,
        trees['deg_val'].values,
    )
    if not overlapping:
        continue
    print(f"  WARNING: Overlap detected in N={n}!")
    has_overlap = True

if has_overlap:
    print("  OVERLAPS FOUND - submission may be rejected!")
else:
    print("  No overlaps detected!")

In [None]:
# Calculate final score
# Score the assembled ensemble, compare it with the target, and persist the results.
TARGET_SCORE = 68.894234

final_scores = calculate_per_n_scores(df_new_parsed)
final_total = sum(final_scores.values())
print(f"\nFinal ensemble score: {final_total:.6f}")
print(f"Target: {TARGET_SCORE:.6f}")
print(f"Gap to target: {final_total - TARGET_SCORE:.6f}")

# Save submission
# /home/submission/submission.csv is also the baseline this notebook loads as the
# current ensemble, so it must not be replaced by an ensemble that is known to be
# invalid (overlapping trees) or incomplete (fewer than 200 scored groups).
if has_overlap:
    print("\nNOT saving submission: overlaps were detected in the new ensemble.")
elif len(final_scores) != 200:
    print(f"\nNOT saving submission: only {len(final_scores)} of 200 groups are complete.")
else:
    df_new.to_csv('/home/submission/submission.csv', index=False)
    print("\nSaved to /home/submission/submission.csv")

# Save metrics
# Written on every run; cv_score describes the ensemble built by this run even
# when the submission file above was left untouched.
metrics = {'cv_score': final_total}
with open('metrics.json', 'w') as f:
    json.dump(metrics, f)
print(f"Saved metrics: {metrics}")