# Experiment 007: Fixed Prefix Ensemble

Fix the 's' prefix issue - ensure ALL values have 's' prefix when writing output.
Also implement fractional translation for additional improvement.

In [None]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely.affinity import rotate, translate
from shapely.ops import unary_union
from glob import glob
import json
import math

# Progress marker: reaching this line means every import in this cell succeeded.
print("Imports done")

In [None]:
# Tree geometry
def get_tree_polygon():
    """Return the tree outline as a shapely Polygon in local coordinates.

    The outline is mirror-symmetric about the y axis: the tip sits at
    (0, 0.8) and the trunk bottom at y = -0.2.  Only the right-hand side is
    spelled out; the left-hand side is generated by mirroring it, which
    yields the 15 vertices in tip -> right side down -> left side up order.
    """
    tip = (0.0, 0.8)
    # Right-hand outline, walking from just below the tip down to the
    # bottom-right corner of the trunk.
    right_side = [
        (0.125, 0.5), (0.0625, 0.5),
        (0.2, 0.25), (0.1, 0.25),
        (0.35, 0.0), (0.075, 0.0),
        (0.075, -0.2),
    ]
    # Mirror across x = 0 and reverse so the ring keeps a single winding
    # direction (trunk bottom-left back up towards the tip).
    left_side = [(-px, py) for px, py in reversed(right_side)]
    return Polygon([tip, *right_side, *left_side])

# Template outline built once; create_tree() rotates/translates this shared polygon.
TREE_POLY = get_tree_polygon()
# NOTE(review): shapely rings usually repeat the first point at the end, so this
# probably prints 16 rather than the 15 vertices listed above — confirm.
print(f"Tree: {len(TREE_POLY.exterior.coords)} vertices")

In [None]:
def parse_s(s_val):
    """Convert a submission cell to float, dropping a leading 's' if present.

    Strings such as ``'s1.5'`` and ``'1.5'`` both parse to ``1.5``;
    non-string inputs are passed straight to ``float``.
    """
    if not isinstance(s_val, str):
        return float(s_val)
    # Only a single leading 's' is removed; anything else is left for float().
    numeric_text = s_val[1:] if s_val.startswith('s') else s_val
    return float(numeric_text)

def format_s(val):
    """Format a value for the submission file with exactly one 's' prefix.

    Numbers are rendered with their default string form and prefixed with
    's'.  Strings are stripped of surrounding whitespace and only prefixed
    when they do not already start with 's', so a value that was read from
    a submission and passed through unparsed is not written as ``'ss...'``.
    """
    if isinstance(val, str):
        text = val.strip()
        # Already-prefixed input is returned as is: the function is idempotent.
        return text if text.startswith('s') else f's{text}'
    return f's{val}'

def create_tree(x, y, deg):
    """Return the template tree rotated by ``deg`` about (0, 0), then moved by (x, y).

    ``deg`` is handed unchanged to ``shapely.affinity.rotate``
    (NOTE(review): shapely interprets it as degrees by default — confirm).
    """
    # Rotate about the local origin first so (x, y) is a pure offset.
    rotated = rotate(TREE_POLY, deg, origin=(0, 0))
    return translate(rotated, x, y)

def get_bbox_side(polygons):
    """Return the longer side of the axis-aligned box enclosing all polygons.

    Args:
        polygons: sequence of geometries exposing a ``bounds`` attribute of
            the form ``(minx, miny, maxx, maxy)``.

    Returns:
        ``max(width, height)`` of the joint bounding box, or ``0`` when
        ``polygons`` is empty.
    """
    if not polygons:
        return 0
    # The bounding box of a union is the min/max over the individual boxes,
    # so there is no need to build the union geometry.  This function is
    # called for every candidate move, so the saving matters.
    boxes = [poly.bounds for poly in polygons]
    width = max(box[2] for box in boxes) - min(box[0] for box in boxes)
    height = max(box[3] for box in boxes) - min(box[1] for box in boxes)
    return max(width, height)

def check_overlaps_zero_tol(polygons):
    """Check for ANY overlap, no matter how small.

    Every unordered pair is tested.  A pair counts as overlapping when the
    geometries intersect, do not merely touch, and their intersection has
    strictly positive area.

    Args:
        polygons: sequence of shapely-like geometries.

    Returns:
        ``(True, message)`` for the first offending pair (or for a pair
        whose intersection could not be computed), else ``(False, None)``.
        Fewer than two polygons always yields ``(False, None)``.
    """
    count = len(polygons)
    for i, first in enumerate(polygons):
        for j in range(i + 1, count):
            second = polygons[j]
            # Cheap predicates first: disjoint or boundary-only contact is fine.
            if not first.intersects(second) or first.touches(second):
                continue
            try:
                inter = first.intersection(second)
                if inter.area > 0:
                    return True, f"Trees {i},{j} overlap (area={inter.area:.2e})"
            except Exception:
                # Geometry failures are treated as overlaps (conservative).
                # Only Exception is caught so that Ctrl-C / SystemExit still
                # propagate instead of being reported as an overlap.
                return True, f"Trees {i},{j} error"
    return False, None

# Progress marker: the helper functions in this cell are now defined.
print("Functions defined")

In [None]:
def load_submission(path):
    """Load a submission CSV and add parsed numeric columns.

    The file is read with every column as ``str`` so 's'-prefixed values
    survive untouched.  On success the returned DataFrame carries the
    original columns plus:

    * ``x_float``, ``y_float``, ``deg_float`` - values parsed via ``parse_s``
    * ``n`` - the integer before the first ``'_'`` in ``id``

    Args:
        path: path of the CSV file to read.

    Returns:
        The augmented DataFrame, or ``None`` when the file cannot be read,
        lacks one of the ``id``/``x``/``y``/``deg`` columns, or contains a
        value that cannot be parsed (best-effort loader: bad files are
        skipped by the caller rather than aborting the run).
    """
    required = {'id', 'x', 'y', 'deg'}
    try:
        df = pd.read_csv(path, dtype=str)
        if not required.issubset(df.columns):
            return None
        df['x_float'] = df['x'].apply(parse_s)
        df['y_float'] = df['y'].apply(parse_s)
        df['deg_float'] = df['deg'].apply(parse_s)
        df['n'] = df['id'].apply(lambda row_id: int(row_id.split('_')[0]))
        return df
    except Exception:
        # Deliberately broad (I/O, parser and value errors all mean "skip this
        # file"), but not bare: KeyboardInterrupt/SystemExit must propagate.
        return None

# Progress marker: load_submission is now available to the following cells.
print("Load function defined")

In [None]:
# Load the VALID baseline (this PASSED Kaggle validation)
baseline_path = '/home/nonroot/snapshots/santa-2025/21328309254/submission/submission.csv'
baseline_df = load_submission(baseline_path)
if baseline_df is None:
    # load_submission returns None on any failure; stop here with a clear
    # message instead of the unrelated TypeError that len(None) would raise.
    raise RuntimeError(f"Could not load baseline submission: {baseline_path}")
print(f"Loaded baseline: {len(baseline_df)} rows")

# Compute baseline scores and store data.
# baseline_scores[n] -> side**2 / n for the n-tree configuration
# baseline_data[n]   -> parsed float coordinates/angles for that configuration
baseline_scores = {}
baseline_data = {}

for n in range(1, 201):
    n_df = baseline_df[baseline_df['n'] == n]
    if len(n_df) != n:
        # Group n must contain exactly n trees; report and leave it out.
        print(f"ERROR: N={n}")
        continue

    xs = n_df['x_float'].tolist()
    ys = n_df['y_float'].tolist()
    degs = n_df['deg_float'].tolist()

    polygons = [create_tree(x, y, deg) for x, y, deg in zip(xs, ys, degs)]
    side = get_bbox_side(polygons)
    score = (side ** 2) / n

    baseline_scores[n] = score
    baseline_data[n] = {'xs': xs, 'ys': ys, 'degs': degs}

baseline_total = sum(baseline_scores.values())
print(f"Baseline total: {baseline_total:.6f}")

In [None]:
# Find all submission files.
# glob() returns matches in arbitrary (filesystem-dependent) order; sorting
# makes the processing order - and therefore which file wins a tie in the
# ensemble - reproducible between runs.
csv_files = sorted(glob('/home/nonroot/snapshots/santa-2025/*/submission/submission.csv'))
print(f"Found {len(csv_files)} submission files")

In [None]:
# Build ensemble - store FLOAT values, not strings.
# best_per_n[n] starts as the baseline entry and is replaced whenever another
# submission has a strictly smaller, overlap-free configuration for that n.
best_per_n = {n: {
    'score': baseline_scores[n],
    'data': baseline_data[n],
    'source': 'baseline'
} for n in range(1, 201)}

improvement_count = 0

for idx, csv_path in enumerate(csv_files):
    if idx % 20 == 0:
        print(f"Processing {idx+1}/{len(csv_files)}...")

    df = load_submission(csv_path)
    if df is None:
        # Unreadable or malformed file: skip it.
        continue

    for n in range(1, 201):
        n_df = df[df['n'] == n]
        if len(n_df) != n:
            continue

        xs = n_df['x_float'].tolist()
        ys = n_df['y_float'].tolist()
        degs = n_df['deg_float'].tolist()

        try:
            polygons = [create_tree(x, y, deg) for x, y, deg in zip(xs, ys, degs)]
            side = get_bbox_side(polygons)
            score = (side ** 2) / n
        except Exception:
            # Bad geometry in this group: ignore it (Ctrl-C still propagates).
            continue

        # Only consider if strictly better.  Written as "not <" rather than
        # ">=" so that a NaN score (e.g. from an empty/NaN cell) is rejected
        # too: every comparison with NaN is False, so ">=" would let it pass.
        if not score < best_per_n[n]['score']:
            continue

        # ZERO tolerance overlap check (done last because it is the costly step)
        has_overlap, msg = check_overlaps_zero_tol(polygons)
        if has_overlap:
            continue

        # Valid improvement - store FLOAT values
        improvement = best_per_n[n]['score'] - score
        if improvement > 0.001:
            print(f"  N={n}: {best_per_n[n]['score']:.6f} -> {score:.6f} ({improvement:.6f})")

        best_per_n[n] = {
            'score': score,
            'data': {'xs': xs, 'ys': ys, 'degs': degs},
            'source': csv_path
        }
        improvement_count += 1

print(f"\nFound {improvement_count} valid improvements")

In [None]:
# Compute ensemble score before fractional translation
# ensemble_total is the sum of the best per-n scores found so far; it is kept
# as a reference point for measuring what fractional translation adds later.
ensemble_total = sum(best_per_n[n]['score'] for n in range(1, 201))
# Positive value = the ensemble total is lower than the baseline total.
improvement = baseline_total - ensemble_total

print(f"\n{'='*50}")
print(f"Baseline: {baseline_total:.6f}")
print(f"Ensemble (before frac trans): {ensemble_total:.6f}")
print(f"Improvement: {improvement:.6f}")
print(f"{'='*50}")

In [None]:
# Implement fractional translation
def fractional_translation(xs, ys, degs, n, max_iter=100):
    """Apply fractional translation to improve score.

    Greedy local search: each tree in turn is nudged by a small step in one
    of eight directions; a nudge is kept when it lowers ``side**2 / n`` by
    more than 1e-12 and leaves the layout free of overlaps according to
    ``check_overlaps_zero_tol``.  Sweeps repeat until one full sweep makes no
    progress or ``max_iter`` sweeps have run.

    Args:
        xs, ys: x / y positions of the trees (not modified).
        degs: rotation of each tree; never changed by this function.
        n: number of trees; the first ``n`` entries of the inputs are used.
        max_iter: maximum number of full sweeps over all trees.

    Returns:
        ``(best_xs, best_ys, best_score)`` - new position lists and the
        score of the final layout.
    """
    frac_steps = [0.001, 0.0005, 0.0002, 0.0001, 0.00005, 0.00002, 0.00001]
    directions = [(0, 1), (0, -1), (1, 0), (-1, 0), (1, 1), (1, -1), (-1, 1), (-1, -1)]

    best_xs = list(xs)
    best_ys = list(ys)

    # `polygons` always mirrors (best_xs, best_ys): only the moved tree is
    # rebuilt per candidate instead of all n.
    polygons = [create_tree(best_xs[i], best_ys[i], degs[i]) for i in range(n)]
    best_score = (get_bbox_side(polygons) ** 2) / n

    for _ in range(max_iter):
        improved = False
        for tree_idx in range(n):
            for step in frac_steps:
                for dx, dy in directions:
                    new_x = best_xs[tree_idx] + dx * step
                    new_y = best_ys[tree_idx] + dy * step

                    try:
                        candidate = polygons.copy()
                        candidate[tree_idx] = create_tree(new_x, new_y, degs[tree_idx])

                        # Score first: it is far cheaper than the pairwise
                        # overlap test and most nudges do not improve it.  A
                        # move is still accepted only if BOTH tests pass.
                        new_score = (get_bbox_side(candidate) ** 2) / n
                        if not new_score < best_score - 1e-12:
                            continue

                        has_overlap, _ = check_overlaps_zero_tol(candidate)
                    except Exception:
                        # Geometry failure for this candidate: skip it.  Not a
                        # bare except, so Ctrl-C can interrupt this long loop.
                        continue

                    if has_overlap:
                        continue

                    best_xs[tree_idx] = new_x
                    best_ys[tree_idx] = new_y
                    polygons = candidate
                    best_score = new_score
                    improved = True

        if not improved:
            break

    return best_xs, best_ys, best_score

# Progress marker: fractional_translation is now defined.
print("Fractional translation function defined")

In [None]:
# Polish every per-n entry of the ensemble with fractional translation and
# keep the result only when it beats the stored score by more than 1e-10.
print("Applying fractional translation...")
frac_improvements = 0

for n in range(1, 201):
    if n % 20 == 0:
        print(f"  Processing N={n}...")

    data = best_per_n[n]['data']
    # list(...) produces a fresh list whether or not the stored value already
    # is one, so the stored configuration is never aliased.
    xs = list(data['xs'])
    ys = list(data['ys'])
    degs = list(data['degs'])

    old_score = best_per_n[n]['score']

    # Sweeps are capped at 50 for speed.
    new_xs, new_ys, new_score = fractional_translation(xs, ys, degs, n, max_iter=50)

    if not new_score < old_score - 1e-10:
        continue

    improvement = old_score - new_score
    if improvement > 0.0001:
        print(f"    N={n}: {old_score:.8f} -> {new_score:.8f} ({improvement:.8f})")
    best_per_n[n].update(
        data={'xs': new_xs, 'ys': new_ys, 'degs': degs},
        score=new_score,
    )
    frac_improvements += 1

print(f"\nFractional translation improved {frac_improvements} N values")

In [None]:
# Compute final score
# final_total sums the per-n scores after fractional translation.
final_total = sum(best_per_n[n]['score'] for n in range(1, 201))
# Gain attributable to the fractional-translation stage alone.
frac_improvement = ensemble_total - final_total

print(f"\n{'='*50}")
print(f"Baseline: {baseline_total:.6f}")
print(f"After ensemble: {ensemble_total:.6f}")
print(f"After frac trans: {final_total:.6f}")
print(f"Frac trans improvement: {frac_improvement:.6f}")
print(f"Total improvement: {baseline_total - final_total:.6f}")
print(f"{'='*50}")
# 68.888293 is a hard-coded target score; the same literal is repeated in the
# metrics cell, so both places must be edited together if it changes.
print(f"\nTarget: 68.888293")
print(f"Gap: {final_total - 68.888293:.6f}")

In [None]:
# Final validation: re-check every stored configuration for overlaps and
# revert any offender to the baseline entry for that n.
print("\nFinal validation (ZERO tolerance)...")
all_valid = True
for n in range(1, 201):
    data = best_per_n[n]['data']
    polygons = [
        create_tree(x, y, deg)
        for x, y, deg in zip(data['xs'], data['ys'], data['degs'])
    ]
    has_overlap, msg = check_overlaps_zero_tol(polygons)
    if has_overlap:
        print(f"OVERLAP at N={n}: {msg}")
        all_valid = False
        # Fall back to baseline
        best_per_n[n] = {
            'score': baseline_scores[n],
            'data': baseline_data[n],
            'source': 'fallback'
        }

if all_valid:
    print("All 200 configurations VALID!")
else:
    final_total = sum(best_per_n[n]['score'] for n in range(1, 201))
    # Keep the derived figure in step with the corrected total; otherwise the
    # metrics would mix a post-fallback total with a pre-fallback delta.
    frac_improvement = ensemble_total - final_total
    print(f"After fallbacks: {final_total:.6f}")

In [None]:
# Write the submission: one row per tree, ids of the form NNN_i, and every
# numeric field passed through format_s so it carries the 's' prefix.
rows = [
    {
        'id': f'{n:03d}_{i}',
        'x': format_s(best_per_n[n]['data']['xs'][i]),
        'y': format_s(best_per_n[n]['data']['ys'][i]),
        'deg': format_s(best_per_n[n]['data']['degs'][i]),
    }
    for n in range(1, 201)
    for i in range(n)
]

df_out = pd.DataFrame(rows)
# The same frame is written twice: the experiment folder copy and the
# submission location.
for out_path in (
    '/home/code/experiments/007_fixed_prefix_ensemble/submission.csv',
    '/home/submission/submission.csv',
):
    df_out.to_csv(out_path, index=False)
print(f"Saved {len(df_out)} rows")

# Eyeball check of the output format
print("\nFirst 5 rows of output:")
print(df_out.head())

# Show a few N=17 rows to confirm they have the 's' prefix
is_n17 = df_out['id'].str.startswith('017_')
n17 = df_out[is_n17]
print(f"\nN=17 sample:")
print(n17.head(3))

In [None]:
# Collect the headline numbers of this experiment and persist them as JSON
# next to the experiment's submission file.
metrics = dict(
    cv_score=final_total,
    baseline_score=baseline_total,
    ensemble_score=ensemble_total,
    frac_trans_improvement=frac_improvement,
    total_improvement=baseline_total - final_total,
    target=68.888293,
    gap=final_total - 68.888293,
)

with open('/home/code/experiments/007_fixed_prefix_ensemble/metrics.json', 'w') as f:
    f.write(json.dumps(metrics, indent=2))

print(f"Metrics: {metrics}")