# Experiment 015: New Sources Ensemble

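Per-N ensemble of the current best submission (experiment 012) with newly downloaded kernel outputs. For each N from 1 to 200, a configuration from a new source replaces the current one only if it scores lower and passes the strict overlap check. New sources: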
- egortrushin_improved_sa
- roshaw_keep_trying
- eyestrain_blending
- seshurajup_tpu
- datafad_bronze
- telegram_public
- chistyakov_fix_direction

In [None]:
import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
from shapely.geometry import Polygon
from shapely import affinity
from shapely.ops import unary_union
import json
import os
import glob
import warnings
# Suppress all warnings from this point on.
warnings.filterwarnings('ignore')

# Decimal arithmetic is carried out with 30 significant digits.
getcontext().prec = 30
# Multiplier that create_high_precision_tree applies to the tree vertices and
# to the (x, y) offset before building the polygon used for overlap checks.
SCALE_FACTOR = Decimal('1e18')

print("Setup complete")

In [None]:
# Tree shape vertices as (x, y) pairs, in polygon order.
_TREE_OUTLINE = [
    (0, 0.8),
    (0.125, 0.5),
    (0.0625, 0.5),
    (0.2, 0.25),
    (0.1, 0.25),
    (0.35, 0),
    (0.075, 0),
    (0.075, -0.2),
    (-0.075, -0.2),
    (-0.075, 0),
    (-0.35, 0),
    (-0.1, 0.25),
    (-0.2, 0.25),
    (-0.0625, 0.5),
    (-0.125, 0.5),
]
# Split the pairs into the separate x / y coordinate arrays used by the helpers.
TX = np.array([vx for vx, _ in _TREE_OUTLINE])
TY = np.array([vy for _, vy in _TREE_OUTLINE])

def create_tree_polygon(x, y, angle):
    """Return the tree outline rotated about the origin and then moved to (x, y).

    All three arguments are converted with ``float()``, so numeric strings
    are accepted. ``angle`` is handed unchanged to ``affinity.rotate``.
    """
    dx = float(x)
    dy = float(y)
    rotation = float(angle)
    # The outline is defined around the origin, so rotate first, translate second.
    outline = Polygon(list(zip(TX, TY)))
    rotated = affinity.rotate(outline, rotation, origin=(0, 0))
    return affinity.translate(rotated, dx, dy)

def create_high_precision_tree(x, y, angle):
    """Build the tree polygon with its coordinates multiplied by ``SCALE_FACTOR``.

    The outline vertices and the (x, y) offset are scaled in ``Decimal``
    arithmetic and only then converted to ``float``. The rotation angle is
    parsed through ``Decimal`` as well but is not scaled.
    """
    x_dec = Decimal(str(x))
    y_dec = Decimal(str(y))
    angle_dec = Decimal(str(angle))
    scale = SCALE_FACTOR

    # Outline kept as decimal strings so each vertex is scaled exactly.
    outline = (
        ('0.0', '0.8'),
        ('0.125', '0.5'),
        ('0.0625', '0.5'),
        ('0.2', '0.25'),
        ('0.1', '0.25'),
        ('0.35', '0.0'),
        ('0.075', '0.0'),
        ('0.075', '-0.2'),
        ('-0.075', '-0.2'),
        ('-0.075', '0.0'),
        ('-0.35', '0.0'),
        ('-0.1', '0.25'),
        ('-0.2', '0.25'),
        ('-0.0625', '0.5'),
        ('-0.125', '0.5'),
    )
    scaled_vertices = [
        (float(Decimal(vx) * scale), float(Decimal(vy) * scale))
        for vx, vy in outline
    ]

    tree = Polygon(scaled_vertices)
    tree = affinity.rotate(tree, float(angle_dec), origin=(0, 0))
    return affinity.translate(
        tree,
        xoff=float(x_dec * scale),
        yoff=float(y_dec * scale),
    )

def validate_no_overlap_strict(trees_data):
    """Return True unless some pair of trees intersects without merely touching.

    Each entry of ``trees_data`` is a mapping with ``'x'``, ``'y'`` and
    ``'deg'`` keys. Collections of zero or one tree are always valid.
    """
    if len(trees_data) <= 1:
        return True

    shapes = [
        create_high_precision_tree(tree['x'], tree['y'], tree['deg'])
        for tree in trees_data
    ]
    # Check every unordered pair once: each shape against the ones after it.
    for idx, first in enumerate(shapes):
        for second in shapes[idx + 1:]:
            if not first.intersects(second):
                continue
            if not first.touches(second):
                return False
    return True

def get_bbox_side(trees):
    """Return the longer side of the axis-aligned bounding box around all trees.

    Returns 0 when ``trees`` is empty.
    """
    count = len(trees)
    if not count:
        return 0

    merged = unary_union(
        [create_tree_polygon(tree['x'], tree['y'], tree['deg']) for tree in trees]
    )
    min_x, min_y, max_x, max_y = merged.bounds
    width = max_x - min_x
    height = max_y - min_y
    return max(width, height)

def get_score(trees, n):
    """Return the squared bounding-box side of ``trees`` divided by ``n``."""
    bbox_side = get_bbox_side(trees)
    squared_side = bbox_side ** 2
    return squared_side / n

# Progress marker: the geometry, validation and scoring helpers are now defined.
print("Core functions defined")

In [None]:
# Load current best submission (exp_012)
print("Loading current best submission...")
df_best = pd.read_csv('/home/code/experiments/012_full_snapshot_ensemble/submission.csv')
# The part of the id before the first underscore is the group size N.
df_best['N'] = df_best['id'].astype(str).str.split('_').str[0].astype(int)

best_trees = {}
best_scores = {}

for n, g in df_best.groupby('N'):
    # Remove the 's' marker from every field; values are kept as strings.
    trees = [
        {
            'x': str(x_raw).replace('s', ''),
            'y': str(y_raw).replace('s', ''),
            'deg': str(deg_raw).replace('s', ''),
        }
        for x_raw, y_raw, deg_raw in zip(g['x'], g['y'], g['deg'])
    ]
    best_trees[n] = trees
    best_scores[n] = get_score(trees, n)

# Sum of the per-N scores of the baseline submission.
baseline_total = sum(best_scores.values())
print(f"Current best score: {baseline_total:.6f}")

In [None]:
def _strip_marker(value):
    """Return ``value`` as text with every 's' character and outer whitespace removed."""
    return str(value).replace('s', '').strip()


def load_csv_trees(filepath):
    """Load tree configurations from a submission-style CSV file.

    The file needs ``id``, ``x``, ``y`` and ``deg`` columns. The part of
    ``id`` before the first underscore is taken as the group size N, and the
    's' marker is stripped from the ``x`` / ``y`` / ``deg`` fields.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A dict mapping N to a list of ``{'x': ..., 'y': ..., 'deg': ...}``
        dicts whose values are strings. Groups whose row count differs from N
        are left out. ``None`` is returned when the file cannot be read or
        parsed, or when it has no ``id`` column.
    """
    try:
        # Read every field as text. Letting pandas parse unprefixed numbers as
        # float64 would round away any digits beyond double precision before
        # they are written back into the submission.
        df = pd.read_csv(filepath, dtype=str)
        if 'id' not in df.columns:
            return None
        df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)

        trees_by_n = {}
        for n, group in df.groupby('N'):
            # Keep only complete groups: exactly N rows for group N.
            if len(group) != n:
                continue
            trees_by_n[n] = [
                {
                    'x': _strip_marker(x_raw),
                    'y': _strip_marker(y_raw),
                    'deg': _strip_marker(deg_raw),
                }
                for x_raw, y_raw, deg_raw in zip(group['x'], group['y'], group['deg'])
            ]
        return trees_by_n
    except Exception:
        # Deliberately best-effort: callers treat None as "source unusable".
        return None

# Progress marker: load_csv_trees is now defined.
print("CSV loader defined")

In [None]:
# Candidate submission files from the newly downloaded sources (fixed list).
new_sources = [
    '/home/code/kaggle_datasets/egortrushin_improved_sa/submission.csv',
    '/home/code/kaggle_datasets/roshaw_keep_trying/submission.csv',
    '/home/code/kaggle_datasets/roshaw_keep_trying/solutions/submission_72.469522.csv',
    '/home/code/kaggle_datasets/eyestrain_blending/submission.csv',
    '/home/code/kaggle_datasets/seshurajup_tpu/submission_ensemble.csv',
    '/home/code/kaggle_datasets/seshurajup_tpu/solutions/submission_71.655734.csv',
    '/home/code/kaggle_datasets/datafad_bronze/submission.csv',
    '/home/code/kaggle_datasets/telegram_public/71.97.csv',
    '/home/code/kaggle_datasets/telegram_public/72.49.csv',
    '/home/code/kaggle_datasets/chistyakov_fix_direction/results.csv',
]

print(f"New sources to check: {len(new_sources)}")
for src in new_sources:
    # Label each source by its parent directory and file name.
    parent_dir, file_name = src.split('/')[-2:]
    status = 'EXISTS' if os.path.exists(src) else 'MISSING'
    print(f"  {parent_dir}/{file_name}: {status}")

In [None]:
# Load and evaluate each new source
print("\n" + "="*60)
print("EVALUATING NEW SOURCES")
print("="*60)

# Maps source path -> {N: trees} for every source that covers all 200 N values.
source_data = {}
for src in new_sources:
    if not os.path.exists(src):
        continue

    # Several sources share the file name "submission.csv", so every message
    # is labelled with parent directory + file name to stay unambiguous.
    src_label = '/'.join(src.split('/')[-2:])

    trees_by_n = load_csv_trees(src)
    if trees_by_n is None:
        print(f"  {src_label}: FAILED to load")
        continue

    # Score the N values in 1..200 that the source provides.
    scores = [get_score(trees_by_n[n], n) for n in range(1, 201) if n in trees_by_n]
    total_score = sum(scores)
    valid_n = len(scores)

    # Only sources with a configuration for every N are kept.
    if valid_n == 200:
        source_data[src] = trees_by_n
        print(f"  {src_label}: Score = {total_score:.6f}")
    else:
        print(f"  {src_label}: Only {valid_n}/200 N values")

print(f"\nLoaded {len(source_data)} new sources")

In [None]:
# Find improvements from new sources
print("\n" + "="*60)
print("FINDING IMPROVEMENTS FROM NEW SOURCES")
print("="*60)

# Entries are (N, gain over current best, source label, trees, new score).
improvements = []
for src, trees_by_n in source_data.items():
    src_name = '/'.join(src.split('/')[-2:])
    present_ns = [n for n in range(1, 201) if n in trees_by_n]
    for n in present_ns:
        candidate_trees = trees_by_n[n]
        candidate_score = get_score(candidate_trees, n)

        # Require a gain of more than 1e-9 over the current best score.
        beats_best = candidate_score < best_scores[n] - 1e-9
        # The overlap check only runs for candidates that actually score better.
        if beats_best and validate_no_overlap_strict(candidate_trees):
            gain = best_scores[n] - candidate_score
            improvements.append((n, gain, src_name, candidate_trees, candidate_score))

print(f"Found {len(improvements)} potential improvements")

# Largest gain first; ties keep their insertion order.
improvements.sort(key=lambda entry: entry[1], reverse=True)

# Show top improvements
print("\nTop 20 improvements:")
for n, imp, src, _trees, _score in improvements[:20]:
    print(f"  N={n:3d}: +{imp:.6f} from {src}")

In [None]:
# Apply improvements
print("\n" + "="*60)
print("APPLYING IMPROVEMENTS")
print("="*60)

# `improvements` can hold several candidates for the same N (one per source
# that beat the baseline). A candidate is accepted only if it beats the score
# currently stored for its N, so a weaker candidate can never overwrite a
# stronger one that was applied earlier. With the list sorted by largest gain
# first, each N is therefore replaced at most once and `applied` equals the
# number of N values that improved.
applied = 0
for n, imp, src, trees, score in improvements:
    if score >= best_scores[n]:
        continue
    best_trees[n] = trees
    best_scores[n] = score
    applied += 1

print(f"Applied {applied} improvements")

# Calculate final score
final_total = sum(best_scores.values())
print(f"\nBaseline score: {baseline_total:.6f}")
print(f"Final score: {final_total:.6f}")
print(f"Improvement: {baseline_total - final_total:.6f}")
print(f"Target: 68.879235")
print(f"Gap to target: {final_total - 68.879235:.6f}")

In [None]:
# Final validation
print("\n" + "="*60)
print("FINAL VALIDATION")
print("="*60)

# N values (1..200) whose selected configuration fails the strict overlap check.
final_overlaps = [
    n for n in range(1, 201)
    if not validate_no_overlap_strict(best_trees[n])
]

if not final_overlaps:
    print("All N values pass strict validation!")
else:
    print(f"WARNING: {len(final_overlaps)} N values have overlaps!")
    # Only the first 20 offending N values are listed.
    print(f"Overlapping N values: {final_overlaps[:20]}...")

In [None]:
# Create submission
print("\n" + "="*60)
print("CREATE SUBMISSION")
print("="*60)


def _prefixed(value):
    """Return ``value`` as text carrying exactly one leading 's' marker."""
    return "s" + str(value).replace('s', '')


# One row per tree: id is "<N zero-padded to 3 digits>_<index within group>".
rows = []
for n in range(1, 201):
    for i, t in enumerate(best_trees[n]):
        rows.append({
            'id': f"{n:03d}_{i}",
            'x': _prefixed(t['x']),
            'y': _prefixed(t['y']),
            'deg': _prefixed(t['deg']),
        })

submission_df = pd.DataFrame(rows)
print(f"Submission shape: {submission_df.shape}")

for out_path in (
    '/home/code/experiments/015_new_sources_ensemble/submission.csv',
    '/home/submission/submission.csv',
):
    # Create the target directory first so a missing folder does not abort
    # the save after all the ensemble work has been done.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    submission_df.to_csv(out_path, index=False)
print("Submission saved!")

In [None]:
# Save metrics
target_score = 68.879235
total_gain = baseline_total - final_total

# Key order is kept as-is because it determines the layout of the JSON output.
metrics = {
    'cv_score': final_total,
    'baseline_score': baseline_total,
    'improvement': total_gain,
    'n_improved': applied,
    'new_sources_used': len(source_data),
    'final_overlaps': len(final_overlaps),
    'target': target_score,
    'gap': final_total - target_score
}

metrics_path = '/home/code/experiments/015_new_sources_ensemble/metrics.json'
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=2)

print("\nMetrics saved!")
print(json.dumps(metrics, indent=2))