# External Ensemble - Access External Data Sources

Access external data sources to find better solutions for specific N values:
1. SmartManoj GitHub submission
2. Kaggle datasets (bucket-of-chump, santa25-public, telegram-shared)
3. Kernel outputs

In [1]:
import numpy as np
import pandas as pd
import os
import json
from shapely.geometry import Polygon
from shapely import STRtree

os.chdir('/home/code/experiments/002_external_ensemble')

# Tree vertices
TX = np.array([0,0.125,0.0625,0.2,0.1,0.35,0.075,0.075,-0.075,-0.075,-0.35,-0.1,-0.2,-0.0625,-0.125])
TY = np.array([0.8,0.5,0.5,0.25,0.25,0,0,-0.2,-0.2,0,0,0.25,0.25,0.5,0.5])

def get_tree_polygon(x, y, deg):
    """Get tree polygon at position (x, y) with rotation deg"""
    rad = np.radians(deg)
    c, s = np.cos(rad), np.sin(rad)
    vertices = []
    for j in range(len(TX)):
        vx = TX[j] * c - TY[j] * s + x
        vy = TX[j] * s + TY[j] * c + y
        vertices.append((vx, vy))
    return Polygon(vertices)

def score_group(xs, ys, degs):
    """Calculate score for a single N-tree configuration"""
    n = len(xs)
    all_x, all_y = [], []
    for i in range(n):
        rad = np.radians(degs[i])
        c, s = np.cos(rad), np.sin(rad)
        for j in range(len(TX)):
            x = TX[j] * c - TY[j] * s + xs[i]
            y = TX[j] * s + TY[j] * c + ys[i]
            all_x.append(x)
            all_y.append(y)
    side = max(max(all_x) - min(all_x), max(all_y) - min(all_y))
    return side * side / n

def check_overlaps(xs, ys, degs):
    """Check if any trees overlap using Shapely"""
    polygons = [get_tree_polygon(xs[i], ys[i], degs[i]) for i in range(len(xs))]
    tree = STRtree(polygons)
    for i, poly in enumerate(polygons):
        candidates = tree.query(poly)
        for j in candidates:
            if i < j:
                if polygons[i].intersection(polygons[j]).area > 0:
                    return True
    return False

def parse_submission(df):
    """Parse submission dataframe"""
    df = df.copy()
    df['x_val'] = df['x'].str[1:].astype(float)
    df['y_val'] = df['y'].str[1:].astype(float)
    df['deg_val'] = df['deg'].str[1:].astype(float)
    df['n'] = df['id'].str.split('_').str[0].astype(int)
    return df

def calculate_per_n_scores(df):
    """Calculate score for each N value"""
    per_n_scores = {}
    for n in range(1, 201):
        group = df[df['n'] == n]
        if len(group) == n:
            xs = group['x_val'].values
            ys = group['y_val'].values
            degs = group['deg_val'].values
            per_n_scores[n] = score_group(xs, ys, degs)
    return per_n_scores

print("Functions defined.")

Functions defined.


In [2]:
# Try to download SmartManoj GitHub submission
import urllib.request

smartmanoj_url = 'https://raw.githubusercontent.com/SmartManoj/Santa-Scoreboard/main/submission.csv'

try:
    print(f"Downloading from {smartmanoj_url}...")
    urllib.request.urlretrieve(smartmanoj_url, 'smartmanoj_submission.csv')
    print("Downloaded SmartManoj submission!")
    
    df_smartmanoj = pd.read_csv('smartmanoj_submission.csv')
    print(f"Shape: {df_smartmanoj.shape}")
    print(df_smartmanoj.head())
except Exception as e:
    print(f"Failed to download: {e}")
    df_smartmanoj = None

Downloading from https://raw.githubusercontent.com/SmartManoj/Santa-Scoreboard/main/submission.csv...
Downloaded SmartManoj submission!
Shape: (20100, 4)
      id                       x                      y  \
0  001_0  s-48.19608619421424578  s58.77098461521422479   
1  002_0    s0.15409706962136058  s-0.03854074269477708   
2  002_1   s-0.15409706962135647  s-0.56145925730522794   
3  003_0    s1.12365581614030097   s0.78110181599256301   
4  003_1    s1.23405569584216002   s1.27599950066375900   

                      deg  
0   s45.00000000000000000  
1  s203.62937773064953717  
2   s23.62937773064970415  
3  s111.12513229289299943  
4   s66.37062226934300213  


In [3]:
# Check for Kaggle datasets in research folder
import glob

print("Checking research kernels for submission files...")
kernel_paths = glob.glob('/home/code/research/kernels/*/submission*.csv')
print(f"Found {len(kernel_paths)} submission files in kernels:")
for p in kernel_paths:
    print(f"  {p}")

# Also check for any CSV files that might be submissions
all_csvs = glob.glob('/home/code/research/kernels/*/*.csv')
print(f"\nAll CSV files in kernels ({len(all_csvs)}):")
for p in all_csvs[:20]:  # Show first 20
    print(f"  {p}")

Checking research kernels for submission files...
Found 0 submission files in kernels:

All CSV files in kernels (0):


In [4]:
# Check for datasets in data folder
print("Checking data folder...")
data_csvs = glob.glob('/home/data/*.csv')
print(f"Found {len(data_csvs)} CSV files in data:")
for p in data_csvs:
    print(f"  {p}")

# Check for any additional data sources
print("\nChecking for Kaggle input datasets...")
if os.path.exists('/kaggle/input'):
    for d in os.listdir('/kaggle/input'):
        print(f"  /kaggle/input/{d}")
else:
    print("  /kaggle/input not found")

Checking data folder...
Found 1 CSV files in data:
  /home/data/sample_submission.csv

Checking for Kaggle input datasets...
  /kaggle/input not found


In [5]:
# Load all available submissions and score them
submissions = {}

# 1. Current best ensemble from snapshots
print("Loading current ensemble submission...")
df_current = pd.read_csv('/home/submission/submission.csv')
df_current = parse_submission(df_current)
submissions['current_ensemble'] = df_current
print(f"  Current ensemble loaded: {df_current.shape}")

# 2. SmartManoj if available
if df_smartmanoj is not None:
    try:
        df_sm = parse_submission(df_smartmanoj)
        submissions['smartmanoj'] = df_sm
        print(f"  SmartManoj loaded: {df_sm.shape}")
    except Exception as e:
        print(f"  Failed to parse SmartManoj: {e}")

# 3. Load all snapshot submissions
print("\nLoading snapshot submissions...")
snapshot_dir = '/home/nonroot/snapshots/santa-2025'
for snap_id in os.listdir(snapshot_dir):
    sub_path = os.path.join(snapshot_dir, snap_id, 'submission', 'submission.csv')
    if os.path.exists(sub_path):
        try:
            df = pd.read_csv(sub_path)
            df = parse_submission(df)
            submissions[f'snapshot_{snap_id}'] = df
        except:
            pass

print(f"Total submissions loaded: {len(submissions)}")

Loading current ensemble submission...
  Current ensemble loaded: (20100, 8)
  SmartManoj loaded: (20100, 8)

Loading snapshot submissions...


Total submissions loaded: 78


In [6]:
# Calculate per-N scores for all submissions
print("Calculating per-N scores for all submissions...")
all_per_n_scores = {}

for name, df in submissions.items():
    try:
        scores = calculate_per_n_scores(df)
        if len(scores) == 200:  # Valid submission
            all_per_n_scores[name] = scores
            total = sum(scores.values())
            print(f"  {name}: total={total:.6f}")
    except Exception as e:
        print(f"  {name}: ERROR - {e}")

print(f"\nValid submissions: {len(all_per_n_scores)}")

Calculating per-N scores for all submissions...


  current_ensemble: total=70.615745


  smartmanoj: total=70.743774


  snapshot_21116303805: total=70.676102


  snapshot_21328309254: total=70.615745


  snapshot_21121776553: total=70.936674


  snapshot_21165872902: total=70.647306


  snapshot_21198893057: total=70.625918


  snapshot_21129617858: total=70.676764


  snapshot_20992536951: total=87.804045


  snapshot_21190224310: total=70.630465


  snapshot_21322576451: total=70.619825


  snapshot_21123768399: total=70.676102


  snapshot_20971964134: total=87.804045
  snapshot_21086827828: total=114.587809


  snapshot_21121942239: total=70.676102


  snapshot_21145965159: total=38.909987


  snapshot_21191207951: total=70.627608


  snapshot_21165876936: total=70.647306


  snapshot_20992150197: total=217.576225


  snapshot_21139436707: total=162.204811


  snapshot_21123763369: total=70.743774


  snapshot_21122904233: total=118.230882


  snapshot_20991308120: total=87.804045


  snapshot_21222392487: total=70.626088
  snapshot_21180221700: total=70.630478


  snapshot_21222390477: total=70.624381


  snapshot_20970671503: total=164.820045


  snapshot_21191209482: total=70.625918


  snapshot_21139436611: total=170.867211


  snapshot_21322577324: total=70.625376


  snapshot_21328308881: total=70.676102


  snapshot_21139436695: total=151.174322


  snapshot_21156852373: total=70.676102


  snapshot_20952569566: total=163.194569
  snapshot_21156853393: total=70.676102


  snapshot_21198891805: total=70.627582
  snapshot_21108486172: total=70.734327


  snapshot_21139436684: total=148.177124


  snapshot_21179744881: total=70.676102


  snapshot_21121943993: total=70.676102


  snapshot_21117626902: total=70.676145


  snapshot_21222373488: total=70.624381


  snapshot_21129619422: total=170.909275


  snapshot_21016257921: total=87.364112
  snapshot_21191212682: total=70.630455


  snapshot_21129622493: total=129.272924
  snapshot_21191206469: total=70.630455


  snapshot_20984924920: total=173.688052


  snapshot_21328310479: total=70.615745
  snapshot_21156851249: total=70.659437


  snapshot_21180223864: total=70.630429


  snapshot_21191211160: total=70.627582


  snapshot_21165878844: total=70.659436
  snapshot_21180219583: total=70.630478


  snapshot_21145966992: total=70.572798


  snapshot_21139436732: total=164.924007


  snapshot_21322576827: total=70.616145


  snapshot_21105319338: total=70.734327
  snapshot_21328309666: total=70.619825


  snapshot_21129625840: total=98.875886


  snapshot_21156850282: total=70.659437


  snapshot_21198925328: total=70.624381


  snapshot_21145968755: total=70.659959


  snapshot_21104669204: total=70.734327


  snapshot_21145961371: total=70.676102


  snapshot_21198928571: total=70.625918


  snapshot_21129620891: total=88.329998


  snapshot_21198790429: total=70.627582


  snapshot_21328310048: total=70.626088


  snapshot_21222377956: total=70.624381


  snapshot_21322578388: total=70.926150


  snapshot_21190222820: total=70.630455


  snapshot_21222375510: total=70.624381
  snapshot_21090949260: total=84.711359


  snapshot_21165874980: total=70.630478


  snapshot_21165870845: total=70.676102


  snapshot_21117525284: total=70.676104


  snapshot_21198927060: total=70.624381

Valid submissions: 78


In [7]:
# Find best score for each N across all submissions
print("Finding best score for each N...")
best_per_n = {}
best_source = {}

for n in range(1, 201):
    best_score = float('inf')
    best_src = None
    for name, scores in all_per_n_scores.items():
        if n in scores and scores[n] < best_score:
            best_score = scores[n]
            best_src = name
    best_per_n[n] = best_score
    best_source[n] = best_src

# Calculate total score
total_best = sum(best_per_n.values())
print(f"\nBest ensemble total score: {total_best:.6f}")

# Compare to current ensemble
current_scores = all_per_n_scores.get('current_ensemble', {})
current_total = sum(current_scores.values()) if current_scores else 0
print(f"Current ensemble total: {current_total:.6f}")
print(f"Improvement: {current_total - total_best:.6f}")

Finding best score for each N...

Best ensemble total score: 37.714008
Current ensemble total: 70.615745
Improvement: 32.901736


In [8]:
# Show which N values improved and from which source
print("\nN values with improvements:")
improvements = []
for n in range(1, 201):
    if n in current_scores and n in best_per_n:
        diff = current_scores[n] - best_per_n[n]
        if diff > 1e-10:
            improvements.append((n, diff, best_source[n]))

improvements.sort(key=lambda x: x[1], reverse=True)
print(f"Total N values improved: {len(improvements)}")
print("\nTop 20 improvements:")
for n, diff, src in improvements[:20]:
    print(f"  N={n}: improved by {diff:.6f} from {src}")


N values with improvements:
Total N values improved: 192

Top 20 improvements:
  N=170: improved by 0.214495 from snapshot_21145965159
  N=149: improved by 0.213597 from snapshot_21145965159
  N=191: improved by 0.213593 from snapshot_21145965159
  N=185: improved by 0.213412 from snapshot_21145965159
  N=187: improved by 0.213275 from snapshot_21145965159
  N=192: improved by 0.212948 from snapshot_21145965159
  N=200: improved by 0.212607 from snapshot_21145965159
  N=198: improved by 0.211845 from snapshot_21145965159
  N=189: improved by 0.211286 from snapshot_21145965159
  N=130: improved by 0.210859 from snapshot_21145965159
  N=150: improved by 0.210813 from snapshot_21145965159
  N=136: improved by 0.210700 from snapshot_21145965159
  N=120: improved by 0.210693 from snapshot_21145965159
  N=105: improved by 0.210492 from snapshot_21145965159
  N=145: improved by 0.210159 from snapshot_21145965159
  N=171: improved by 0.209623 from snapshot_21145965159
  N=172: improved by 0.2

In [9]:
# Create new ensemble submission by taking best solution for each N
print("Creating new ensemble submission...")

new_rows = []
for n in range(1, 201):
    best_src = best_source[n]
    df_best = submissions[best_src]
    group = df_best[df_best['n'] == n]
    for _, row in group.iterrows():
        new_rows.append({
            'id': row['id'],
            'x': row['x'],
            'y': row['y'],
            'deg': row['deg']
        })

df_new = pd.DataFrame(new_rows)
print(f"New ensemble shape: {df_new.shape}")
print(df_new.head())

Creating new ensemble submission...


New ensemble shape: (20100, 4)
      id                          x                         y  \
0  001_0  s-48.19608619421424577922  s58.77098461521422478882   
1  002_0      s0.154097069621355887    s-0.038540742694794648   
2  002_1     s-0.154097069621372845    s-0.561459257305224058   
3  003_0         s0.254937643697833       s-0.233436061549416   
4  003_1         s0.357722754471247        s0.250360566787394   

                        deg  
0  s45.00000000000000000000  
1   s144.272760863123583164  
2   s324.272760863123533426  
3       s113.56326044172948  
4          s66.370622269343  


In [10]:
# Verify no overlaps in new ensemble
print("Verifying no overlaps in new ensemble...")
df_new_parsed = parse_submission(df_new)

has_overlap = False
for n in range(1, 201):
    group = df_new_parsed[df_new_parsed['n'] == n]
    xs = group['x_val'].values
    ys = group['y_val'].values
    degs = group['deg_val'].values
    
    if check_overlaps(xs, ys, degs):
        print(f"  WARNING: Overlap detected in N={n}!")
        has_overlap = True

if not has_overlap:
    print("  No overlaps detected!")
else:
    print("  OVERLAPS FOUND - submission may be rejected!")

Verifying no overlaps in new ensemble...












  OVERLAPS FOUND - submission may be rejected!


In [11]:
# Calculate final score
final_scores = calculate_per_n_scores(df_new_parsed)
final_total = sum(final_scores.values())
print(f"\nFinal ensemble score: {final_total:.6f}")
print(f"Target: 68.894234")
print(f"Gap to target: {final_total - 68.894234:.6f}")

# Save submission
df_new.to_csv('/home/submission/submission.csv', index=False)
print("\nSaved to /home/submission/submission.csv")

# Save metrics
metrics = {'cv_score': final_total}
with open('metrics.json', 'w') as f:
    json.dump(metrics, f)
print(f"Saved metrics: {metrics}")


Final ensemble score: 37.714008
Target: 68.894234
Gap to target: -31.180226

Saved to /home/submission/submission.csv
Saved metrics: {'cv_score': 37.71400835684302}


In [12]:
# Investigate the best snapshot - snapshot_21145965159
print("Investigating snapshot_21145965159...")
df_best_snap = submissions['snapshot_21145965159']

# Check overlaps for each N in this snapshot
overlap_ns = []
for n in range(1, 201):
    group = df_best_snap[df_best_snap['n'] == n]
    xs = group['x_val'].values
    ys = group['y_val'].values
    degs = group['deg_val'].values
    
    if check_overlaps(xs, ys, degs):
        overlap_ns.append(n)

print(f"N values with overlaps in snapshot_21145965159: {len(overlap_ns)}")
print(f"First 20 overlap N values: {overlap_ns[:20]}")

Investigating snapshot_21145965159...


N values with overlaps in snapshot_21145965159: 196
First 20 overlap N values: [2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24]


In [13]:
# Create a VALID ensemble - only use solutions that pass overlap detection
print("Creating VALID ensemble (only overlap-free solutions)...")

# First, identify which submissions have valid (no overlap) solutions for each N
valid_solutions = {}  # {n: [(source, score), ...]}

for n in range(1, 201):
    valid_solutions[n] = []
    for name, df in submissions.items():
        group = df[df['n'] == n]
        if len(group) == n:
            xs = group['x_val'].values
            ys = group['y_val'].values
            degs = group['deg_val'].values
            
            # Check for overlaps
            if not check_overlaps(xs, ys, degs):
                score = all_per_n_scores[name][n]
                valid_solutions[n].append((name, score))

# Count how many N values have valid solutions
valid_counts = {n: len(v) for n, v in valid_solutions.items()}
print(f"N values with at least 1 valid solution: {sum(1 for v in valid_counts.values() if v > 0)}")
print(f"N values with NO valid solution: {sum(1 for v in valid_counts.values() if v == 0)}")

Creating VALID ensemble (only overlap-free solutions)...


N values with at least 1 valid solution: 200
N values with NO valid solution: 0


In [14]:
# Find best VALID solution for each N
print("Finding best VALID solution for each N...")
best_valid_per_n = {}
best_valid_source = {}

for n in range(1, 201):
    if valid_solutions[n]:
        # Sort by score and take best
        valid_solutions[n].sort(key=lambda x: x[1])
        best_valid_source[n] = valid_solutions[n][0][0]
        best_valid_per_n[n] = valid_solutions[n][0][1]
    else:
        print(f"WARNING: No valid solution for N={n}!")
        # Use current ensemble as fallback
        best_valid_source[n] = 'current_ensemble'
        best_valid_per_n[n] = all_per_n_scores['current_ensemble'][n]

# Calculate total score
total_valid = sum(best_valid_per_n.values())
print(f"\nBest VALID ensemble total score: {total_valid:.6f}")
print(f"Current ensemble total: {current_total:.6f}")
print(f"Improvement: {current_total - total_valid:.6f}")

Finding best VALID solution for each N...

Best VALID ensemble total score: 70.615745
Current ensemble total: 70.615745
Improvement: -0.000000


In [15]:
# Show improvements from valid ensemble
print("\nN values with improvements (valid only):")
valid_improvements = []
for n in range(1, 201):
    if n in current_scores and n in best_valid_per_n:
        diff = current_scores[n] - best_valid_per_n[n]
        if diff > 1e-10:
            valid_improvements.append((n, diff, best_valid_source[n]))

valid_improvements.sort(key=lambda x: x[1], reverse=True)
print(f"Total N values improved: {len(valid_improvements)}")
print("\nTop 20 improvements:")
for n, diff, src in valid_improvements[:20]:
    print(f"  N={n}: improved by {diff:.6f} from {src}")


N values with improvements (valid only):
Total N values improved: 0

Top 20 improvements:


In [16]:
# Create the valid ensemble submission
print("Creating valid ensemble submission...")

new_valid_rows = []
for n in range(1, 201):
    best_src = best_valid_source[n]
    df_best = submissions[best_src]
    group = df_best[df_best['n'] == n]
    for _, row in group.iterrows():
        new_valid_rows.append({
            'id': row['id'],
            'x': row['x'],
            'y': row['y'],
            'deg': row['deg']
        })

df_valid = pd.DataFrame(new_valid_rows)
print(f"Valid ensemble shape: {df_valid.shape}")

Creating valid ensemble submission...


Valid ensemble shape: (20100, 4)


In [17]:
# Final verification - check no overlaps in valid ensemble
print("Final verification - checking no overlaps...")
df_valid_parsed = parse_submission(df_valid)

has_overlap = False
overlap_count = 0
for n in range(1, 201):
    group = df_valid_parsed[df_valid_parsed['n'] == n]
    xs = group['x_val'].values
    ys = group['y_val'].values
    degs = group['deg_val'].values
    
    if check_overlaps(xs, ys, degs):
        print(f"  WARNING: Overlap detected in N={n}!")
        has_overlap = True
        overlap_count += 1

if not has_overlap:
    print("  ✓ No overlaps detected - submission is valid!")
else:
    print(f"  ✗ {overlap_count} N values have overlaps!")

Final verification - checking no overlaps...


  ✓ No overlaps detected - submission is valid!


In [18]:
# Calculate and save final score
final_valid_scores = calculate_per_n_scores(df_valid_parsed)
final_valid_total = sum(final_valid_scores.values())
print(f"\nFinal VALID ensemble score: {final_valid_total:.6f}")
print(f"Target: 68.894234")
print(f"Gap to target: {final_valid_total - 68.894234:.6f}")

# Save submission
df_valid.to_csv('/home/submission/submission.csv', index=False)
print("\nSaved to /home/submission/submission.csv")

# Save metrics
metrics = {'cv_score': final_valid_total}
with open('metrics.json', 'w') as f:
    json.dump(metrics, f)
print(f"Saved metrics: {metrics}")


Final VALID ensemble score: 70.615745


Target: 68.894234
Gap to target: 1.721511

Saved to /home/submission/submission.csv
Saved metrics: {'cv_score': 70.615744771173}


In [19]:
# Check SmartManoj submission for valid improvements
print("Checking SmartManoj submission for valid improvements...")

smartmanoj_scores = all_per_n_scores.get('smartmanoj', {})
current_scores = all_per_n_scores.get('current_ensemble', {})

sm_improvements = []
for n in range(1, 201):
    if n in smartmanoj_scores and n in current_scores:
        # Check if SmartManoj has better score
        if smartmanoj_scores[n] < current_scores[n]:
            # Check if SmartManoj solution is valid (no overlaps)
            group = submissions['smartmanoj'][submissions['smartmanoj']['n'] == n]
            xs = group['x_val'].values
            ys = group['y_val'].values
            degs = group['deg_val'].values
            
            if not check_overlaps(xs, ys, degs):
                diff = current_scores[n] - smartmanoj_scores[n]
                sm_improvements.append((n, diff))

print(f"SmartManoj valid improvements: {len(sm_improvements)}")
if sm_improvements:
    sm_improvements.sort(key=lambda x: x[1], reverse=True)
    print("Top improvements:")
    for n, diff in sm_improvements[:10]:
        print(f"  N={n}: {diff:.6f}")

Checking SmartManoj submission for valid improvements...
SmartManoj valid improvements: 0
