# Experiment 018: New Sources Ensemble

Ensemble new kernel outputs with current best:
- egortrushin_sa_translations
- datafad_boxes_shrunk
- hvanphucs_ensemble
- aikhmelnytskyy_sa
- jazivxt_team_blend

In [1]:
import pandas as pd
import numpy as np
from decimal import Decimal, getcontext
from shapely.geometry import Polygon
from shapely import affinity
from shapely.ops import unary_union
import glob
import json
import time
import warnings
warnings.filterwarnings('ignore')

getcontext().prec = 30
SCALE_FACTOR = Decimal('1e18')

print("Setup complete")

Setup complete


In [2]:
# Tree shape vertices
TX = np.array([0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125])
TY = np.array([0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5])

def create_tree_polygon(x, y, angle):
    x, y, angle = float(x), float(y), float(angle)
    coords = list(zip(TX, TY))
    poly = Polygon(coords)
    poly = affinity.rotate(poly, angle, origin=(0, 0))
    poly = affinity.translate(poly, x, y)
    return poly

def create_high_precision_tree(x, y, angle):
    x = Decimal(str(x))
    y = Decimal(str(y))
    angle = Decimal(str(angle))
    sf = SCALE_FACTOR
    vertices = [
        (float(Decimal('0.0') * sf), float(Decimal('0.8') * sf)),
        (float(Decimal('0.125') * sf), float(Decimal('0.5') * sf)),
        (float(Decimal('0.0625') * sf), float(Decimal('0.5') * sf)),
        (float(Decimal('0.2') * sf), float(Decimal('0.25') * sf)),
        (float(Decimal('0.1') * sf), float(Decimal('0.25') * sf)),
        (float(Decimal('0.35') * sf), float(Decimal('0.0') * sf)),
        (float(Decimal('0.075') * sf), float(Decimal('0.0') * sf)),
        (float(Decimal('0.075') * sf), float(Decimal('-0.2') * sf)),
        (float(Decimal('-0.075') * sf), float(Decimal('-0.2') * sf)),
        (float(Decimal('-0.075') * sf), float(Decimal('0.0') * sf)),
        (float(Decimal('-0.35') * sf), float(Decimal('0.0') * sf)),
        (float(Decimal('-0.1') * sf), float(Decimal('0.25') * sf)),
        (float(Decimal('-0.2') * sf), float(Decimal('0.25') * sf)),
        (float(Decimal('-0.0625') * sf), float(Decimal('0.5') * sf)),
        (float(Decimal('-0.125') * sf), float(Decimal('0.5') * sf)),
    ]
    poly = Polygon(vertices)
    poly = affinity.rotate(poly, float(angle), origin=(0, 0))
    poly = affinity.translate(poly, xoff=float(x * sf), yoff=float(y * sf))
    return poly

def validate_no_overlap_strict(trees_data):
    if len(trees_data) <= 1:
        return True
    polygons = [create_high_precision_tree(t['x'], t['y'], t['deg']) for t in trees_data]
    for i in range(len(polygons)):
        for j in range(i+1, len(polygons)):
            if polygons[i].intersects(polygons[j]) and not polygons[i].touches(polygons[j]):
                return False
    return True

def get_bbox_side(trees):
    if len(trees) == 0:
        return 0
    polygons = [create_tree_polygon(t['x'], t['y'], t['deg']) for t in trees]
    union = unary_union(polygons)
    bounds = union.bounds
    return max(bounds[2] - bounds[0], bounds[3] - bounds[1])

def get_score(trees, n):
    side = get_bbox_side(trees)
    return (side ** 2) / n

print("Core functions defined")

Core functions defined


In [3]:
# Load current best (exp_016/017)
print("Loading current best...")
baseline_path = '/home/code/experiments/016_jazivxt_ensemble/submission.csv'
df = pd.read_csv(baseline_path)
df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)

best_trees = {}
best_scores = {}
best_sources = {}

for n, g in df.groupby('N'):
    trees = []
    for _, row in g.iterrows():
        x = str(row['x']).replace('s', '')
        y = str(row['y']).replace('s', '')
        deg = str(row['deg']).replace('s', '')
        trees.append({'x': x, 'y': y, 'deg': deg})
    best_trees[n] = trees
    best_scores[n] = get_score(trees, n)
    best_sources[n] = 'baseline'

baseline_total = sum(best_scores.values())
print(f"Baseline score: {baseline_total:.6f}")

Loading current best...


Baseline score: 70.329514


In [4]:
# New sources to try
new_sources = [
    '/home/code/kaggle_datasets/egortrushin_sa_translations/submission.csv',
    '/home/code/kaggle_datasets/datafad_boxes_shrunk/submission.csv',
    '/home/code/kaggle_datasets/hvanphucs_ensemble/submission.csv',
    '/home/code/kaggle_datasets/hvanphucs_ensemble/submission_ensemble.csv',
    '/home/code/kaggle_datasets/aikhmelnytskyy_sa/submission.csv',
    '/home/code/kaggle_datasets/jazivxt_team_blend/submission.csv',
    '/home/code/kaggle_datasets/jazivxt_team_blend/submission_ensemble.csv',
]

print(f"New sources to try: {len(new_sources)}")
for s in new_sources:
    print(f"  {s.split('/')[-2]}/{s.split('/')[-1]}")

New sources to try: 7
  egortrushin_sa_translations/submission.csv
  datafad_boxes_shrunk/submission.csv
  hvanphucs_ensemble/submission.csv
  hvanphucs_ensemble/submission_ensemble.csv
  aikhmelnytskyy_sa/submission.csv
  jazivxt_team_blend/submission.csv
  jazivxt_team_blend/submission_ensemble.csv


In [5]:
def load_trees_from_csv(csv_path, n):
    try:
        df = pd.read_csv(csv_path)
        if 'id' in df.columns:
            df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
        elif 'n' in df.columns:
            df['N'] = df['n']
        else:
            return None
        
        g = df[df['N'] == n]
        if len(g) != n:
            return None
        
        trees = []
        for _, row in g.iterrows():
            x = str(row['x']).replace('s', '')
            y = str(row['y']).replace('s', '')
            deg = str(row['deg']).replace('s', '')
            trees.append({'x': x, 'y': y, 'deg': deg})
        return trees
    except Exception as e:
        return None

print("Load function defined")

Load function defined


In [6]:
# Process new sources
print("\n" + "=" * 60)
print("PROCESSING NEW SOURCES")
print("=" * 60)

improvements = []
start_time = time.time()

for csv_file in new_sources:
    source_name = csv_file.split('/')[-2] + '/' + csv_file.split('/')[-1]
    print(f"\nProcessing {source_name}...")
    source_improvements = 0
    
    for n in range(1, 201):
        trees = load_trees_from_csv(csv_file, n)
        if trees is None:
            continue
        
        # Calculate score
        score = get_score(trees, n)
        
        # Check if better
        if score < best_scores[n] - 1e-9:
            # Validate with strict 1e18 precision
            if validate_no_overlap_strict(trees):
                improvement = best_scores[n] - score
                improvements.append((n, improvement, source_name))
                best_trees[n] = trees
                best_scores[n] = score
                best_sources[n] = source_name
                source_improvements += 1
                print(f"  N={n:3d}: +{improvement:.6f}")
    
    print(f"  Total improvements from this source: {source_improvements}")

print(f"\nTotal time: {time.time() - start_time:.1f}s")
print(f"Total N values improved: {len(improvements)}")
if improvements:
    total_improvement = sum(imp for _, imp, _ in improvements)
    print(f"Total improvement: {total_improvement:.6f}")


PROCESSING NEW SOURCES

Processing egortrushin_sa_translations/submission.csv...


  Total improvements from this source: 0

Processing datafad_boxes_shrunk/submission.csv...


  N=171: +0.000000


  Total improvements from this source: 1

Processing hvanphucs_ensemble/submission.csv...


  N=  9: +0.000078


  N= 21: +0.001467


  N= 29: +0.000008


  N= 46: +0.000003
  N= 48: +0.000118


  N= 51: +0.000002
  N= 53: +0.000059


  N= 58: +0.000000


  N= 62: +0.000000
  N= 65: +0.000000


  N= 67: +0.001368
  N= 68: +0.000000
  N= 69: +0.000054


  N= 71: +0.000130
  N= 73: +0.000116
  N= 74: +0.000283


  N= 77: +0.000000


  N= 81: +0.000002
  N= 82: +0.000226
  N= 83: +0.000162


  N= 84: +0.000056
  N= 87: +0.000153


  N= 90: +0.000001
  N= 91: +0.000000


  N= 94: +0.001100
  N= 95: +0.000000


  N=100: +0.000000
  N=103: +0.000000


  N=106: +0.000028
  N=108: +0.000000


  N=109: +0.000000
  N=111: +0.000000


  N=112: +0.000994
  N=113: +0.000000
  N=114: +0.000389


  N=115: +0.000238
  N=116: +0.000001


  N=119: +0.000139


  N=122: +0.000440
  N=123: +0.002018
  N=124: +0.000187


  N=125: +0.000326
  N=126: +0.000006


  N=131: +0.000000
  N=133: +0.000001


  N=134: +0.000576
  N=135: +0.000115


  N=142: +0.000000


  N=146: +0.000060
  N=147: +0.000000


  N=151: +0.000041


  N=153: +0.000000


  N=159: +0.000238
  N=160: +0.000038


  N=161: +0.000000


  N=164: +0.000007


  N=170: +0.000004
  N=171: +0.000289


  N=181: +0.000000


  N=187: +0.000736


  N=197: +0.000048


  Total improvements from this source: 61

Processing hvanphucs_ensemble/submission_ensemble.csv...


  Total improvements from this source: 0

Processing aikhmelnytskyy_sa/submission.csv...


  Total improvements from this source: 0

Processing jazivxt_team_blend/submission.csv...


  N= 22: +0.000004
  N= 26: +0.000037


  N= 37: +0.000435


  N= 85: +0.000025


  N=131: +0.000001


  N=138: +0.000000


  Total improvements from this source: 6

Processing jazivxt_team_blend/submission_ensemble.csv...


  Total improvements from this source: 0

Total time: 73.9s
Total N values improved: 68
Total improvement: 0.012806


In [7]:
# Calculate final score
final_total = sum(best_scores.values())
print(f"\n" + "=" * 60)
print("RESULTS")
print("=" * 60)
print(f"Baseline score: {baseline_total:.6f}")
print(f"Final score: {final_total:.6f}")
print(f"Improvement: {baseline_total - final_total:.6f}")
print(f"Target: 68.876781")
print(f"Gap to target: {final_total - 68.876781:.6f}")


RESULTS
Baseline score: 70.329514
Final score: 70.316708
Improvement: 0.012806
Target: 68.876781
Gap to target: 1.439927


In [8]:
# Final validation
print("\n" + "=" * 60)
print("FINAL VALIDATION")
print("=" * 60)

final_overlaps = []
for n in range(1, 201):
    if not validate_no_overlap_strict(best_trees[n]):
        final_overlaps.append(n)

if final_overlaps:
    print(f"WARNING: {len(final_overlaps)} N values have overlaps!")
else:
    print("All N values pass strict validation!")


FINAL VALIDATION


All N values pass strict validation!


In [9]:
# Create submission
print("\n" + "=" * 60)
print("CREATE SUBMISSION")
print("=" * 60)

rows = []
for n in range(1, 201):
    trees = best_trees[n]
    for i, t in enumerate(trees):
        x_val = str(t['x']).replace('s', '')
        y_val = str(t['y']).replace('s', '')
        deg_val = str(t['deg']).replace('s', '')
        rows.append({
            'id': f"{n:03d}_{i}",
            'x': f"s{x_val}",
            'y': f"s{y_val}",
            'deg': f"s{deg_val}"
        })

submission_df = pd.DataFrame(rows)
print(f"Submission shape: {submission_df.shape}")

submission_df.to_csv('/home/code/experiments/018_rebuild_corners_fixed/submission.csv', index=False)
submission_df.to_csv('/home/submission/submission.csv', index=False)
print("Submission saved!")


CREATE SUBMISSION
Submission shape: (20100, 4)
Submission saved!


In [None]:
# Save metrics
metrics = {
    'cv_score': final_total,
    'baseline_score': baseline_total,
    'improvement': baseline_total - final_total,
    'n_improved': len(improvements),
    'new_sources': len(new_sources),
    'final_overlaps': len(final_overlaps),
    'target': 68.876781,
    'gap': final_total - 68.876781
}

with open('/home/code/experiments/018_rebuild_corners_fixed/metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)

print("\nMetrics saved!")
print(json.dumps(metrics, indent=2))