# Experiment 006: Preserved Precision Ensemble

Fix the coordinate truncation bug by preserving ORIGINAL string values from source CSVs.

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely.affinity import rotate, translate
from shapely.ops import unary_union
from glob import glob
import json

# Visible marker that the import cell ran to completion.
print("Imports done")

Imports done


In [2]:
# Tree geometry
def get_tree_polygon():
    """Build the template tree outline as a shapely Polygon.

    The outline is listed as 15 vertices, starting at the tip (0.0, 0.8),
    running down the right-hand side to the trunk bottom at y = -0.2 and
    back up the left-hand side. The left-side points mirror the right-side
    ones across the y axis.

    Returns:
        Polygon: the un-rotated, un-translated tree shape.
    """
    # Tip, then the right edge of each tier, ending at the trunk's lower-right corner.
    right_side = [
        (0.0, 0.8),
        (0.125, 0.5), (0.0625, 0.5),
        (0.2, 0.25), (0.1, 0.25),
        (0.35, 0.0), (0.075, 0.0),
        (0.075, -0.2),
    ]
    # Trunk's lower-left corner, then the left edge of each tier going back up.
    left_side = [
        (-0.075, -0.2),
        (-0.075, 0.0), (-0.35, 0.0),
        (-0.1, 0.25), (-0.2, 0.25),
        (-0.0625, 0.5), (-0.125, 0.5),
    ]
    return Polygon(right_side + left_side)

# Build the template outline once; create_tree() rotates and translates this
# shared polygon for every placement.
TREE_POLY = get_tree_polygon()
# exterior.coords includes the closing coordinate, so the count printed here is
# one more than the 15 vertices listed in get_tree_polygon().
print(f"Tree: {len(TREE_POLY.exterior.coords)} vertices")

Tree: 16 vertices


In [3]:
def parse_s(s_val):
    """Convert a submission field to a float, dropping a leading 's' if present.

    Args:
        s_val: either a string such as 's0.154' (the 's' is stripped before
            conversion) or any other value accepted by float().

    Returns:
        float: the numeric value. Used for scoring only; the original string
        is not modified.

    Raises:
        ValueError: if the remaining text is not a valid float literal.
    """
    is_prefixed = isinstance(s_val, str) and s_val.startswith('s')
    numeric_part = s_val[1:] if is_prefixed else s_val
    return float(numeric_part)

def create_tree(x, y, deg):
    """Place one tree: rotate the template about (0, 0), then shift it by (x, y).

    Args:
        x: x offset applied after rotation.
        y: y offset applied after rotation.
        deg: rotation angle passed to shapely's rotate() (degrees unless
            use_radians is set, which it is not here).

    Returns:
        A new geometry; TREE_POLY itself is not modified.
    """
    rotated = rotate(TREE_POLY, deg, origin=(0, 0))
    return translate(rotated, x, y)

def get_bbox_side(polygons):
    """Return the side length of the axis-aligned square enclosing all polygons.

    The bounding box of a set of geometries is the envelope of the individual
    bounding boxes, so the extents are taken directly from each geometry's
    `.bounds` instead of first merging everything with a geometric union.
    The union was only being used to read its bounds, and it is by far the
    most expensive step when this is called for every group of every file.

    Args:
        polygons: sequence of objects exposing `.bounds` as
            (minx, miny, maxx, maxy). Geometries are assumed non-empty.

    Returns:
        The larger of the overall width and height, or 0 for an empty sequence.
    """
    if not polygons:
        return 0
    # Transpose [(minx, miny, maxx, maxy), ...] into four per-axis tuples.
    min_xs, min_ys, max_xs, max_ys = zip(*(poly.bounds for poly in polygons))
    width = max(max_xs) - min(min_xs)
    height = max(max_ys) - min(min_ys)
    return max(width, height)

def check_overlaps_zero_tol(polygons):
    """Report the first pair of polygons whose interiors overlap.

    Pairs are examined in (i, j) order with i < j. A pair counts as
    overlapping when it intersects, does not merely touch, and the computed
    intersection has area greater than zero; there is no area threshold.
    A pair whose computed intersection area is exactly 0 is not reported.

    Args:
        polygons: sequence of geometries supporting intersects(), touches()
            and intersection().

    Returns:
        (True, message) for the first overlapping pair, or for the first pair
        whose intersection could not be computed (treated as invalid);
        (False, None) if no pair overlaps or there are fewer than two polygons.
    """
    if len(polygons) <= 1:
        return False, None
    for i, first in enumerate(polygons):
        for j in range(i + 1, len(polygons)):
            second = polygons[j]
            # Disjoint or boundary-only contact is allowed.
            if not first.intersects(second) or first.touches(second):
                continue
            try:
                overlap_area = first.intersection(second).area
            except Exception:
                # Geometry errors are treated conservatively as an invalid
                # pair. Only Exception is caught so that KeyboardInterrupt /
                # SystemExit still propagate instead of being reported as
                # an overlap.
                return True, f"Trees {i},{j} error"
            if overlap_area > 0:
                return True, f"Trees {i},{j} overlap (area={overlap_area:.2e})"
    return False, None

# Visible marker that the helper definitions in this cell were executed.
print("Functions defined")

Functions defined


In [4]:
def load_submission_preserve_strings(path):
    """Load a submission CSV, keeping every field as its original string.

    All columns are read with dtype=str so that the 'x', 'y' and 'deg' text
    can later be written back unchanged. Parsed copies are added alongside
    for scoring only.

    Args:
        path: file path (or file-like object) accepted by pd.read_csv.

    Returns:
        DataFrame with the original string columns plus:
          - 'x_float', 'y_float', 'deg_float': values parsed by parse_s
          - 'n': int taken from the part of 'id' before the first '_'
        or None if the file cannot be read, lacks any of the 'id', 'x', 'y',
        'deg' columns, or contains a value that fails to parse. Loading is
        best-effort: callers skip files that yield None.
    """
    required_columns = ('id', 'x', 'y', 'deg')
    try:
        # Load as strings to preserve precision
        df = pd.read_csv(path, dtype=str)
        if any(column not in df.columns for column in required_columns):
            return None
        # Parse floats for scoring only
        for column in ('x', 'y', 'deg'):
            df[f'{column}_float'] = df[column].apply(parse_s)
        df['n'] = df['id'].apply(lambda row_id: int(row_id.split('_')[0]))
        return df
    except Exception:
        # Unreadable or malformed file: report "not usable" to the caller.
        # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
        return None

# Visible marker that the loader definition in this cell was executed.
print("Load function defined")

Load function defined


In [5]:
# Load the VALID baseline (this PASSED Kaggle validation)
baseline_path = '/home/nonroot/snapshots/santa-2025/21328309254/submission/submission.csv'
baseline_df = load_submission_preserve_strings(baseline_path)
# The loader returns None for a missing/unreadable/malformed file. Everything
# below depends on the baseline, so fail here with a clear message instead of
# an opaque TypeError from len(None).
if baseline_df is None:
    raise RuntimeError(f"Could not load baseline submission from {baseline_path}")
print(f"Loaded baseline: {len(baseline_df)} rows")

# Verify N=2 precision is preserved: print the raw strings, not the parsed floats
n2 = baseline_df[baseline_df['n'] == 2]
print(f"\nN=2 from baseline (original strings):")
print(f"  x[0] = {n2['x'].iloc[0]}")
print(f"  y[0] = {n2['y'].iloc[0]}")

Loaded baseline: 20100 rows

N=2 from baseline (original strings):
  x[0] = s0.1540970696213643
  y[0] = s-0.03854074269478543


In [6]:
# Compute baseline scores and store data with ORIGINAL strings
# For each N in 1..200: score = (bounding-square side)^2 / N.
baseline_scores = {}
baseline_data = {}
malformed_groups = []

for n in range(1, 201):
    n_df = baseline_df[baseline_df['n'] == n]
    if len(n_df) != n:
        # Group N must contain exactly N trees.
        print(f"ERROR: N={n}")
        malformed_groups.append(n)
        continue

    xs = n_df['x_float'].tolist()
    ys = n_df['y_float'].tolist()
    degs = n_df['deg_float'].tolist()

    polygons = [create_tree(x, y, deg) for x, y, deg in zip(xs, ys, degs)]
    side = get_bbox_side(polygons)
    score = (side ** 2) / n

    baseline_scores[n] = score
    baseline_data[n] = {
        'xs': xs, 'ys': ys, 'degs': degs,
        # CRITICAL: Store ORIGINAL strings
        'x_strs': n_df['x'].tolist(),
        'y_strs': n_df['y'].tolist(),
        'deg_strs': n_df['deg'].tolist()
    }

# The ensemble is seeded with baseline_scores[n] for every N in 1..200, so an
# incomplete baseline would otherwise give a silently partial total here and a
# KeyError later. Stop now, after reporting every bad group.
if malformed_groups:
    raise ValueError(f"Baseline has wrong row count for N in {malformed_groups}")

baseline_total = sum(baseline_scores.values())
print(f"Baseline total: {baseline_total:.6f}")

Baseline total: 70.647327


In [7]:
# Find submission files
# glob() returns matches in arbitrary (filesystem-dependent) order. Sorting
# makes the processing order reproducible, and with it the tie-breaking, the
# recorded 'source' and the improvement count.
csv_files = sorted(glob('/home/nonroot/snapshots/santa-2025/*/submission/submission.csv'))
print(f"Found {len(csv_files)} submission files")

Found 88 submission files


In [8]:
# Build ensemble with PRESERVED precision
# Start from the baseline for every N, then scan each submission file and keep,
# per N, the lowest-scoring configuration that passes the overlap check.
best_per_n = {n: {
    'score': baseline_scores[n],
    'data': baseline_data[n],
    'source': 'baseline'
} for n in range(1, 201)}

# Counts every replacement of a current best (the same N can be counted more
# than once as successive files improve it).
improvement_count = 0

for idx, csv_path in enumerate(csv_files):
    if idx % 20 == 0:
        print(f"Processing {idx+1}/{len(csv_files)}...")

    df = load_submission_preserve_strings(csv_path)
    if df is None:
        continue

    for n in range(1, 201):
        n_df = df[df['n'] == n]
        if len(n_df) != n:
            continue

        xs = n_df['x_float'].tolist()
        ys = n_df['y_float'].tolist()
        degs = n_df['deg_float'].tolist()

        try:
            polygons = [create_tree(x, y, deg) for x, y, deg in zip(xs, ys, degs)]
            side = get_bbox_side(polygons)
            score = (side ** 2) / n
        except Exception:
            # Unscorable group: skip it, but let Ctrl-C / SystemExit through.
            continue

        # Only consider if better. A NaN score (e.g. from a missing cell)
        # compares False against everything, so it must be rejected
        # explicitly or it would be accepted as an "improvement".
        if not np.isfinite(score) or score >= best_per_n[n]['score']:
            continue

        # ZERO tolerance overlap check
        has_overlap, msg = check_overlaps_zero_tol(polygons)
        if has_overlap:
            continue

        # Valid improvement - store with ORIGINAL strings
        improvement = best_per_n[n]['score'] - score
        if improvement > 0.001:
            # Only log the larger gains to keep the output readable.
            print(f"  N={n}: {best_per_n[n]['score']:.6f} -> {score:.6f} ({improvement:.6f})")

        best_per_n[n] = {
            'score': score,
            'data': {
                'xs': xs, 'ys': ys, 'degs': degs,
                'x_strs': n_df['x'].tolist(),  # ORIGINAL strings!
                'y_strs': n_df['y'].tolist(),
                'deg_strs': n_df['deg'].tolist()
            },
            'source': csv_path
        }
        improvement_count += 1

print(f"\nFound {improvement_count} valid improvements")

Processing 1/88...


  N=15: 0.379203 -> 0.376950 (0.002253)
  N=43: 0.370040 -> 0.367065 (0.002975)


  N=54: 0.359140 -> 0.356435 (0.002705)
  N=64: 0.350468 -> 0.348740 (0.001728)


  N=76: 0.351218 -> 0.349724 (0.001494)


  N=88: 0.350643 -> 0.348173 (0.002471)


  N=95: 0.349092 -> 0.347754 (0.001338)
  N=100: 0.345531 -> 0.343397 (0.002134)


  N=65: 0.363285 -> 0.361611 (0.001674)


  N=87: 0.353691 -> 0.350389 (0.003302)


Processing 21/88...


Processing 41/88...


Processing 61/88...


Processing 81/88...



Found 364 valid improvements


In [9]:
# Compute ensemble score
# Single definition of the target so the printed value and the gap arithmetic
# cannot drift apart.
target_score = 68.888293

ensemble_total = sum(best_per_n[n]['score'] for n in range(1, 201))
improvement = baseline_total - ensemble_total

print(f"\n{'='*50}")
print(f"Baseline: {baseline_total:.6f}")
print(f"Ensemble: {ensemble_total:.6f}")
print(f"Improvement: {improvement:.6f}")
print(f"{'='*50}")
print(f"\nTarget: {target_score}")
# Positive gap means the ensemble total is still above the target.
print(f"Gap: {ensemble_total - target_score:.6f}")


Baseline: 70.647327
Ensemble: 70.615106
Improvement: 0.032221

Target: 68.888293
Gap: 1.726813


In [10]:
# Final validation with ZERO tolerance
# Re-check every selected configuration (baseline entries included) and revert
# any overlapping one to the baseline entry for that N.
print("\nFinal validation (ZERO tolerance)...")
all_valid = True
for n in range(1, 201):
    data = best_per_n[n]['data']
    polygons = [
        create_tree(data['xs'][i], data['ys'][i], data['degs'][i])
        for i in range(n)
    ]
    has_overlap, msg = check_overlaps_zero_tol(polygons)
    if not has_overlap:
        continue

    print(f"OVERLAP at N={n}: {msg}")
    all_valid = False
    # Fall back to baseline
    best_per_n[n] = {
        'score': baseline_scores[n],
        'data': baseline_data[n],
        'source': 'fallback'
    }

if not all_valid:
    # At least one entry was replaced, so the total must be recomputed.
    ensemble_total = sum(best_per_n[n]['score'] for n in range(1, 201))
    print(f"After fallbacks: {ensemble_total:.6f}")
else:
    print("All 200 configurations VALID!")


Final validation (ZERO tolerance)...


All 200 configurations VALID!


In [11]:
# Verify N=2 precision is preserved in output
# Prints the stored string fields (not the parsed floats) of the first N=2
# tree, together with which file the selected N=2 configuration came from, so
# the digits can be compared by eye against the source CSV.
print("\nVerifying N=2 precision:")
print(f"  Source: {best_per_n[2]['source']}")
print(f"  x[0] = {best_per_n[2]['data']['x_strs'][0]}")
print(f"  y[0] = {best_per_n[2]['data']['y_strs'][0]}")


Verifying N=2 precision:
  Source: baseline
  x[0] = s0.1540970696213643
  y[0] = s-0.03854074269478543


In [12]:
# Save submission with ORIGINAL strings
# Row ids are regenerated as '<N zero-padded to 3>_<index>'; x/y/deg are the
# untouched strings read from whichever file supplied the best configuration.
rows = []
for n in range(1, 201):
    data = best_per_n[n]['data']
    rows.extend(
        {
            'id': f'{n:03d}_{i}',
            'x': data['x_strs'][i],  # ORIGINAL string!
            'y': data['y_strs'][i],
            'deg': data['deg_strs'][i]
        }
        for i in range(n)
    )

df_out = pd.DataFrame(rows)
# The same frame is written to the experiment folder and to the submission folder.
for out_path in (
    '/home/code/experiments/006_preserved_precision/submission.csv',
    '/home/submission/submission.csv',
):
    df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows")

# Verify output
print("\nFirst 5 rows of output:")
print(df_out.head())

Saved 20100 rows

First 5 rows of output:
      id                     x                      y                  deg
0  001_0  s-48.196086194214246    s58.770984615214225                s45.0
1  002_0   s0.1540970696213643  s-0.03854074269478543  s203.62937773065684
2  002_1  s-0.1540970696213643   s-0.5614592573052146  s23.629377730656792
3  003_0    s1.123655816140301     s0.781101815992563    s111.125132292893
4  003_1     s1.23405569584216     s1.275999500663759     s66.370622269343


In [13]:
# Save metrics
# Summary of this experiment: ensemble total, baseline total, their difference,
# and the distance of the ensemble total from the target score.
target_score = 68.888293
metrics = {
    'cv_score': ensemble_total,
    'baseline_score': baseline_total,
    'improvement': baseline_total - ensemble_total,
    'target': target_score,
    'gap': ensemble_total - target_score
}

metrics_path = '/home/code/experiments/006_preserved_precision/metrics.json'
with open(metrics_path, 'w') as metrics_file:
    json.dump(metrics, metrics_file, indent=2)

print(f"Metrics: {metrics}")

Metrics: {'cv_score': 70.61510625340564, 'baseline_score': 70.64732689763682, 'improvement': 0.03222064423117388, 'target': 68.888293, 'gap': 1.7268132534056377}
