# Evolver Loop 10 Analysis

## Goal: Find more diverse sources and ensemble them to beat 70.364

Current best: 70.364392 (exp_010_full_ensemble)
Target: 68.879467
Gap: 1.485 points

In [None]:
import pandas as pd
import numpy as np
import os
import math
from numba import njit
from decimal import Decimal, getcontext
from shapely.geometry import Polygon
from shapely import affinity

# Use 30 significant digits for all Decimal arithmetic in this notebook.
getcontext().prec = 30
# Integer scale factor: the strict overlap check multiplies polygon
# coordinates by this value before truncating them to int.
SCALE = 10**18

@njit
def make_polygon_template():
    """Return the 15-vertex outline of the unplaced, unrotated shape.

    The shape is symmetric about the y axis (presumably a tree silhouette
    with three tiers and a trunk -- confirm against the problem statement).

    Returns:
        (xs, ys): two float64 arrays of length 15 holding the vertex
        coordinates in outline order.
    """
    trunk_w = 0.15
    trunk_h = 0.2
    base_w = 0.7
    mid_w = 0.4
    top_w = 0.25

    y_tip = 0.8
    y_top = 0.5
    y_mid = 0.25
    y_base = 0.0
    y_trunk = -trunk_h

    top_half = top_w / 2
    top_quarter = top_w / 4
    mid_half = mid_w / 2
    mid_quarter = mid_w / 4
    base_half = base_w / 2
    trunk_half = trunk_w / 2

    # Right side from the tip downwards, then the mirrored left side upwards.
    xs = np.array(
        [
            0.0, top_half, top_quarter, mid_half, mid_quarter, base_half,
            trunk_half, trunk_half, -trunk_half, -trunk_half,
            -base_half, -mid_quarter, -mid_half, -top_quarter, -top_half,
        ],
        np.float64,
    )
    ys = np.array(
        [
            y_tip, y_top, y_top, y_mid, y_mid, y_base,
            y_base, y_trunk, y_trunk, y_base,
            y_base, y_mid, y_mid, y_top, y_top,
        ],
        np.float64,
    )
    return xs, ys

@njit
def score_group(xs, ys, degs, tx, ty):
    """Score one group of placed shapes.

    Each shape is the template (tx, ty) rotated by degs[i] degrees and
    translated by (xs[i], ys[i]). The score is side**2 / n, where side is
    the longer edge of the axis-aligned bounding box around every
    transformed vertex and n is the number of shapes.

    Args:
        xs, ys: per-shape translation offsets.
        degs: per-shape rotation angles in degrees.
        tx, ty: template vertex coordinates.

    Returns:
        The bounding-square area divided by the number of shapes.
    """
    count = xs.size
    n_vertices = tx.size

    lo_x = 1e300
    lo_y = 1e300
    hi_x = -1e300
    hi_y = -1e300

    for k in range(count):
        theta = degs[k] * math.pi / 180.0
        cos_t = math.cos(theta)
        sin_t = math.sin(theta)
        off_x = xs[k]
        off_y = ys[k]
        for v in range(n_vertices):
            # Rotate the template vertex, then translate it into place.
            px = cos_t * tx[v] - sin_t * ty[v] + off_x
            py = sin_t * tx[v] + cos_t * ty[v] + off_y
            if px < lo_x:
                lo_x = px
            if px > hi_x:
                hi_x = px
            if py < lo_y:
                lo_y = py
            if py > hi_y:
                hi_y = py

    width = hi_x - lo_x
    height = hi_y - lo_y
    side = max(width, height)
    return side * side / count

def strip(a):
    """Convert an iterable of 's'-tagged values to a float64 array.

    Every element is turned into a string, all 's' characters are removed,
    and the remainder is parsed as a float.
    """
    parsed = []
    for raw in a:
        text = str(raw).replace("s", "")
        parsed.append(float(text))
    return np.array(parsed, np.float64)

# Build the polygon template once; compute_score_for_n reads tx/ty as
# module-level globals.
tx, ty = make_polygon_template()

def compute_score_for_n(df, n):
    """Return the group score for the rows of *df* whose 'N' column equals *n*.

    *df* must already carry an 'N' column. When no row matches, the result
    is float('inf'). Otherwise the 'x', 'y' and 'deg' columns are parsed
    with strip() and scored by score_group() against the module-level
    template (tx, ty).
    """
    rows = df.loc[df['N'] == n]
    if rows.shape[0] == 0:
        return float('inf')
    xs, ys, ds = (strip(rows[col].to_numpy()) for col in ('x', 'y', 'deg'))
    return score_group(xs, ys, ds, tx, ty)

def compute_total_score(df):
    """Return the sum of the per-N scores for N = 1..200.

    Works on a copy of *df*, so the caller's frame is not modified. The
    'N' column is derived from the part of 'id' before the first
    underscore. Any N with no rows contributes float('inf').
    """
    tagged = df.copy()
    tagged['N'] = tagged['id'].astype(str).str.split('_').str[0].astype(int)
    return sum(compute_score_for_n(tagged, n) for n in range(1, 201))

def load_and_prepare(filepath):
    """Read a CSV and attach an integer 'N' column.

    'N' is the part of each 'id' value before the first underscore,
    converted to int.
    """
    frame = pd.read_csv(filepath)
    id_prefix = frame['id'].astype(str).str.split('_').str[0]
    frame['N'] = id_prefix.astype(int)
    return frame

print("Functions loaded successfully")

In [None]:
# Recursively gather every CSV file under the kaggle_datasets directory
import glob

csv_files = [
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/home/code/kaggle_datasets')
    for fname in fnames
    if fname.endswith('.csv')
]

print(f"Found {len(csv_files)} CSV files:")
for path in csv_files:
    print(f"  {path}")

In [None]:
# Compute the total score of every CSV that has at least 'id' and 'x' columns
scores = {}
for csv_file in csv_files:
    label = os.path.basename(csv_file)
    try:
        df = pd.read_csv(csv_file)
        if not {'id', 'x'}.issubset(df.columns):
            # Not shaped like a submission file; skip it without reporting.
            continue
        score = compute_total_score(df)
        scores[csv_file] = score
        print(f"{label}: {score:.6f}")
    except Exception as e:
        # Report the failure and keep going with the remaining files.
        print(f"{label}: ERROR - {e}")

print(f"\nScored {len(scores)} files")

In [None]:
# Load current best (exp_010) and compute its total score over N = 1..200.
# best_score is printed again by the final summary cell, so it must still
# hold this value when that cell runs.
best_df = pd.read_csv('/home/code/experiments/010_full_ensemble/submission.csv')
best_score = compute_total_score(best_df)
print(f"Current best (exp_010): {best_score:.6f}")

# Also load exp_009 for comparison
exp009_df = pd.read_csv('/home/code/experiments/009_santa_ensemble/submission.csv')
exp009_score = compute_total_score(exp009_df)
print(f"exp_009: {exp009_score:.6f}")

In [None]:
# Check chistyakov submission_best.csv specifically: load it and compute its
# total score. chistyakov_best is reused by the overlap scan further down.
chistyakov_best = pd.read_csv('/home/code/kaggle_datasets/chistyakov_best_public/submission_best.csv')
chistyakov_score = compute_total_score(chistyakov_best)
print(f"chistyakov submission_best.csv: {chistyakov_score:.6f}")

# Check if it has overlaps
def check_overlaps_strict(df, n):
    """Test the layout for group *n* for overlapping polygons.

    Rows are selected by an 'id' prefix of *n* zero-padded to three digits
    plus an underscore. Each row's polygon has its exterior coordinates
    multiplied by SCALE and truncated to integers before the pairwise test.
    Two polygons count as overlapping when they intersect but do not
    merely touch.

    NOTE(review): parse_value and get_tree_polygon are not defined in the
    visible part of this file -- confirm they are in scope before this
    runs. get_tree_polygon is assumed to return an object exposing
    `.exterior.coords` (e.g. a shapely Polygon).

    Returns:
        (ok, overlaps): ok is True when no pair overlaps; overlaps is the
        list of (i, j) position pairs (i < j, in row order) that do.
        A group with no rows yields (True, []).
    """
    prefix = str(n).zfill(3) + '_'
    group = df[df['id'].str.startswith(prefix)]
    if len(group) == 0:
        return True, []

    scaled_polys = []
    for _, rec in group.iterrows():
        # The index after the underscore is zero-based; tree ids are one-based.
        tree_id = int(rec['id'].split('_')[1]) + 1
        px = parse_value(rec['x'])
        py = parse_value(rec['y'])
        angle = parse_value(rec['deg'])
        outline = get_tree_polygon(tree_id, px, py, angle)

        # Scale to integers for strict comparison
        ring = []
        for pt in outline.exterior.coords:
            ix = int(Decimal(str(pt[0])) * SCALE)
            iy = int(Decimal(str(pt[1])) * SCALE)
            ring.append((ix, iy))
        scaled_polys.append(Polygon(ring))

    overlaps = [
        (i, j)
        for i, first in enumerate(scaled_polys)
        for j in range(i + 1, len(scaled_polys))
        if first.intersects(scaled_polys[j]) and not first.touches(scaled_polys[j])
    ]
    return not overlaps, overlaps

# Collect every N whose chistyakov layout fails the strict overlap check
overlap_ns = []
for n in range(1, 201):
    ok, overlaps = check_overlaps_strict(chistyakov_best, n)
    if ok:
        continue
    overlap_ns.append(n)

# Only the first ten offending N values are shown.
print(f"\nchistyakov submission_best.csv has overlaps in {len(overlap_ns)} N values: {overlap_ns[:10]}...")

In [None]:
# Find the best source for each N
print("\nFinding best source for each N...")

# Load all valid sources. Every frame must carry the integer 'N' column,
# because compute_score_for_n() filters on df['N']; a frame straight from
# pd.read_csv() does not have it and would raise KeyError: 'N'.
sources = {}

# exp_010 (current best)
sources['exp_010'] = load_and_prepare('/home/code/experiments/010_full_ensemble/submission.csv')

# All kaggle_datasets CSVs
for csv_file in csv_files:
    try:
        df = pd.read_csv(csv_file)
        if 'id' not in df.columns or 'x' not in df.columns:
            # Not shaped like a submission file; skip quietly.
            continue
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
    except Exception as e:
        # Best effort: report the unreadable file and move on instead of
        # hiding the failure.
        print(f"  skipping {csv_file}: {e}")
        continue
    # Key by the path relative to the dataset root. Basenames alone collide
    # (e.g. several folders each holding a submission.csv), which would
    # silently overwrite earlier sources.
    name = os.path.relpath(csv_file, '/home/code/kaggle_datasets')
    sources[name] = df

print(f"Loaded {len(sources)} sources")

# For each N, find the lowest-scoring source that passes the overlap check.
# The per-N trackers use their own names so that the global best_score
# (the exp_010 total printed in the final summary) is not overwritten.
best_per_n = {}
for n in range(1, 201):
    n_best_score = float('inf')
    n_best_source = None

    for name, df in sources.items():
        score = compute_score_for_n(df, n)
        if score < n_best_score:
            # Run the overlap check only for candidates that would improve
            # on the current best for this N.
            ok, _ = check_overlaps_strict(df, n)
            if ok:
                n_best_score = score
                n_best_source = name

    best_per_n[n] = (n_best_source, n_best_score)
    if n <= 10 or n % 20 == 0:
        print(f"N={n}: best={n_best_source} score={n_best_score:.6f}")

In [None]:
# Tally how many N values each source wins
source_counts = {}
for source, _score in best_per_n.values():
    source_counts[source] = source_counts.get(source, 0) + 1

print("\nSource contributions:")
ranked = sorted(source_counts.items(), key=lambda item: -item[1])
for source, count in ranked:
    print(f"  {source}: {count} N values")

# Sum the per-N winners to get the score of the combined selection.
total_best = sum(entry[1] for entry in best_per_n.values())
print(f"\nTotal score from best per N: {total_best:.6f}")
# NOTE(review): best_score is expected to hold the exp_010 total computed
# when the current best was loaded; verify no later cell has rebound it.
print(f"Current best (exp_010): {best_score:.6f}")
print(f"Improvement: {best_score - total_best:.6f}")