# Experiment 008: REPAIR + ENSEMBLE Strategy

Implement the REPAIR strategy from the yongsukprasertsuk kernel:
1. Load optimized zaburo solutions (88.33 score, 183 overlaps)
2. Validate each N with Shapely
3. REPAIR every N whose optimized configuration overlaps by replacing it with the baseline configuration
4. Keep only valid improvements
5. Create ensemble submission

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely.affinity import rotate, translate
from shapely.strtree import STRtree
import json
import os
import shutil

# Tree shape coordinates
# TX[i], TY[i] are the x/y coordinates of the i-th vertex of the tree outline;
# create_tree_polygon() zips the two lists into a single Shapely Polygon that is
# then rotated about (0, 0) and translated to each tree's position.
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

# Directory that receives this experiment's metrics.json.
WORK_DIR = '/home/code/experiments/008_repair_ensemble'
# Baseline submission: used to repair every N and copied verbatim as a fallback.
BASELINE_CSV = '/home/code/exploration/datasets/submission.csv'
# Candidate submission from experiment 007.
# NOTE(review): the file name suggests a score of 88.326787 — confirm.
OPTIMIZED_CSV = '/home/code/experiments/007_sa_optimization/solutions/submission_88.326787.csv'

def create_tree_polygon(x, y, deg):
    """Build the Shapely polygon of one tree placed at (x, y) with rotation deg.

    The outline described by TX/TY is first rotated by ``deg`` about the
    origin (0, 0) and afterwards shifted by (x, y).

    Args:
        x: Horizontal offset applied after the rotation.
        y: Vertical offset applied after the rotation.
        deg: Rotation angle handed to ``shapely.affinity.rotate`` (which
            interprets angles in degrees by default).

    Returns:
        The transformed ``shapely.geometry.Polygon``.
    """
    outline = Polygon(list(zip(TX, TY)))
    # Rotate about the outline's own origin first so that (x, y) stays a pure
    # positional offset.
    turned = rotate(outline, deg, origin=(0, 0))
    return translate(turned, x, y)

def has_overlap(trees):
    """Report whether any two trees of a configuration overlap.

    Two trees count as overlapping when their polygons intersect in more than
    a shared boundary: pairs that merely touch are accepted.

    Args:
        trees: Sequence of ``(x, y, deg)`` tuples, one per tree.

    Returns:
        True as soon as one overlapping pair is found, otherwise False.
        Configurations with zero or one tree never overlap.
    """
    if len(trees) <= 1:
        return False

    shapes = [create_tree_polygon(cx, cy, angle) for cx, cy, angle in trees]
    # The spatial index narrows each polygon's comparison to the candidates the
    # tree returns, instead of comparing every pair.
    spatial_index = STRtree(shapes)

    for position, shape in enumerate(shapes):
        # query() is used as returning positions into `shapes`.
        for candidate in spatial_index.query(shape):
            if candidate == position:
                continue  # a polygon always matches itself
            neighbour = shapes[candidate]
            if not shape.intersects(neighbour):
                continue
            if shape.touches(neighbour):
                continue  # boundary contact only: allowed
            return True
    return False

def parse_submission(filepath):
    """Read a submission CSV and group its trees by configuration size N.

    The file must provide the columns ``id``, ``x``, ``y`` and ``deg``. The
    part of ``id`` before the first underscore is parsed as the integer N.
    Coordinate values may carry a leading ``'s'`` (e.g. ``'s0.25'``), which is
    stripped before conversion to float.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        Dict mapping N to the list of ``(x, y, deg)`` float tuples of that
        configuration, in file order.
    """
    def _to_float(raw):
        # Values are stored as strings of the form 's<number>'; plain numbers
        # are accepted as well.
        prefixed = isinstance(raw, str) and raw.startswith('s')
        return float(raw[1:]) if prefixed else float(raw)

    table = pd.read_csv(filepath)
    for column in ('x', 'y', 'deg'):
        table[f'{column}_val'] = table[column].apply(_to_float)
    table['N'] = table['id'].apply(lambda ident: int(ident.split('_')[0]))

    return {
        n: list(zip(rows['x_val'], rows['y_val'], rows['deg_val']))
        for n, rows in table.groupby('N')
    }

def calculate_bounding_box_side(trees):
    """Return the side of the smallest axis-aligned square enclosing all trees.

    Args:
        trees: Sequence of ``(x, y, deg)`` tuples, one per tree.

    Returns:
        The larger of the overall width and height of the union of the
        trees' bounding boxes.

    Raises:
        ValueError: If ``trees`` is empty.
    """
    # Each entry is (minx, miny, maxx, maxy) for one placed tree.
    boxes = [create_tree_polygon(cx, cy, angle).bounds for cx, cy, angle in trees]
    width = max(box[2] for box in boxes) - min(box[0] for box in boxes)
    height = max(box[3] for box in boxes) - min(box[1] for box in boxes)
    return max(width, height)

def calculate_score_contribution(trees, n):
    """Return the score contribution of one configuration.

    Args:
        trees: Sequence of ``(x, y, deg)`` tuples, one per tree.
        n: Value the squared bounding-square side is divided by.

    Returns:
        ``side ** 2 / n`` where ``side`` comes from
        ``calculate_bounding_box_side(trees)``.
    """
    side_length = calculate_bounding_box_side(trees)
    square_area = side_length ** 2
    return square_area / n

# Marker line so the cell output shows that the helper definitions above ran.
print("Functions defined")

Functions defined


In [2]:
# Read both submissions into {N: [(x, y, deg), ...]} dictionaries.
# The baseline file is parsed first, then the optimized one.
print("Loading solutions...")
baseline_trees, optimized_trees = map(parse_submission, (BASELINE_CSV, OPTIMIZED_CSV))

# Report how many distinct N values each file provides.
print(f"Baseline: {len(baseline_trees)} configurations")
print(f"Optimized: {len(optimized_trees)} configurations")

Loading solutions...
Baseline: 200 configurations
Optimized: 200 configurations


In [3]:
# Validate each N and implement REPAIR strategy
#
# For every N the optimized configuration is kept only when it is overlap-free
# AND strictly better than the baseline; in every other case the baseline
# configuration is used instead.
print("\nImplementing REPAIR + ENSEMBLE strategy...")
print("For each N: check overlap, compare scores, pick best valid solution")

final_trees = {}   # N -> list of (x, y, deg) selected for the ensemble
repair_log = []    # one dict per N: which source won, why, and both scores

baseline_wins = 0
optimized_wins = 0
overlap_repairs = 0   # baseline used because the optimized layout overlaps
score_repairs = 0     # baseline used because the optimized layout is not strictly better

for n in range(1, 201):
    base_trees = baseline_trees[n]
    opt_trees = optimized_trees[n]

    # Check if optimized has overlap
    opt_has_overlap = has_overlap(opt_trees)

    # Calculate scores (lower is better)
    base_score = calculate_score_contribution(base_trees, n)
    opt_score = calculate_score_contribution(opt_trees, n)

    # Decision logic
    if opt_has_overlap:
        # REPAIR: Use baseline for overlapping N
        final_trees[n] = base_trees
        baseline_wins += 1
        overlap_repairs += 1
        repair_log.append({'n': n, 'reason': 'overlap', 'base_score': base_score, 'opt_score': opt_score})
    elif opt_score < base_score:
        # KEEP: Use optimized if it's better AND valid
        final_trees[n] = opt_trees
        optimized_wins += 1
        improvement = base_score - opt_score
        repair_log.append({'n': n, 'reason': 'improved', 'base_score': base_score, 'opt_score': opt_score, 'improvement': improvement})
    else:
        # KEEP: Use baseline if it's better (ties also go to the baseline)
        final_trees[n] = base_trees
        baseline_wins += 1
        score_repairs += 1
        repair_log.append({'n': n, 'reason': 'baseline_better', 'base_score': base_score, 'opt_score': opt_score})

# The two "repairs" counters are a breakdown of the baseline wins
# (overlap_repairs + score_repairs == baseline_wins), so they are printed
# directly beneath that line rather than beneath "Optimized wins".
print("\n=== REPAIR SUMMARY ===")
print(f"Baseline wins: {baseline_wins}")
print(f"  - Overlap repairs: {overlap_repairs}")
print(f"  - Score repairs (baseline better): {score_repairs}")
print(f"Optimized wins: {optimized_wins}")


Implementing REPAIR + ENSEMBLE strategy...
For each N: check overlap, compare scores, pick best valid solution



=== REPAIR SUMMARY ===
Baseline wins: 200
Optimized wins: 0
  - Overlap repairs: 183
  - Score repairs (baseline better): 17


In [4]:
# List every N for which the optimized configuration replaced the baseline,
# largest gain first, and accumulate the total gain.
improvements = list(filter(lambda entry: entry['reason'] == 'improved', repair_log))
print(f"\n=== IMPROVEMENTS ({len(improvements)} N values) ===")

# Negating the key sorts in descending order of improvement.
ranked_improvements = sorted(improvements, key=lambda entry: -entry.get('improvement', 0))

total_improvement = 0
for r in ranked_improvements:
    imp = r.get('improvement', 0)
    total_improvement = total_improvement + imp
    print(f"N={r['n']}: {r['base_score']:.6f} -> {r['opt_score']:.6f} (improvement: {imp:.6f})")

print(f"\nTotal improvement from optimized N values: {total_improvement:.6f}")


=== IMPROVEMENTS (0 N values) ===

Total improvement from optimized N values: 0.000000


In [5]:
# Total score of a submission = sum of the per-N contributions for N = 1..200.
# The baseline contributions are evaluated first, then the ensemble's.
n_values = range(1, 201)
baseline_parts = [calculate_score_contribution(baseline_trees[n], n) for n in n_values]
ensemble_parts = [calculate_score_contribution(final_trees[n], n) for n in n_values]

baseline_total = sum(baseline_parts)
ensemble_total = sum(ensemble_parts)

# Lower is better, so a positive "Improvement" means the ensemble beats the baseline.
print(f"\n=== FINAL SCORES ===")
print(f"Baseline score: {baseline_total:.6f}")
print(f"Ensemble score: {ensemble_total:.6f}")
print(f"Improvement: {baseline_total - ensemble_total:.6f}")
print(f"Target: 68.919")
print(f"Gap to target: {ensemble_total - 68.919:.6f}")


=== FINAL SCORES ===
Baseline score: 70.647327
Ensemble score: 70.647327
Improvement: 0.000000
Target: 68.919
Gap to target: 1.728327


In [6]:
# Re-run the overlap check on every configuration that made it into the
# ensemble and collect the N values that fail.
print("\nFinal validation: Checking all ensemble configurations for overlaps...")
failed_n = []
for n in range(1, 201):
    if not has_overlap(final_trees[n]):
        continue
    failed_n.append(n)
    print(f"  N={n}: OVERLAP DETECTED")

# An empty list means every configuration is overlap-free.
if failed_n:
    print(f"\nFailed N values: {failed_n}")
else:
    print("All configurations passed overlap check!")


Final validation: Checking all ensemble configurations for overlaps...


All configurations passed overlap check!


In [7]:
# Save final submission
#
# Writes the ensemble when the final validation passed, otherwise falls back
# to a verbatim copy of the baseline file. `final_score` records the score of
# whichever file ends up on disk.
SUBMISSION_DIR = '/home/submission'
SUBMISSION_CSV = os.path.join(SUBMISSION_DIR, 'submission.csv')

# Create the output directory up front: BOTH branches below write into it, and
# shutil.copy() fails if the destination directory does not exist.
os.makedirs(SUBMISSION_DIR, exist_ok=True)

if len(failed_n) == 0:
    print("\nSaving final submission...")

    # One row per tree; ids are '<N zero-padded to 3 digits>_<tree index>' and
    # every numeric value is written with the 's' prefix that
    # parse_submission() strips when reading.
    rows = [
        {
            'id': f"{n:03d}_{i}",
            'x': f"s{x}",
            'y': f"s{y}",
            'deg': f"s{deg}"
        }
        for n in range(1, 201)
        for i, (x, y, deg) in enumerate(final_trees[n])
    ]

    final_df = pd.DataFrame(rows)
    final_df.to_csv(SUBMISSION_CSV, index=False)
    print(f"Saved {len(final_df)} rows to {SUBMISSION_CSV}")

    final_score = ensemble_total
else:
    # NOTE(review): optimized configurations are only kept when they are
    # overlap-free, so any N that failed the final validation should itself be
    # a baseline configuration — copying the baseline would then still contain
    # it. Confirm this fallback is the intended behaviour.
    print("\nUsing baseline due to validation failures")
    shutil.copy(BASELINE_CSV, SUBMISSION_CSV)
    final_score = baseline_total


Saving final submission...
Saved 20100 rows to /home/submission/submission.csv


In [8]:
# Save metrics
#
# Persists the outcome of the experiment as WORK_DIR/metrics.json and prints
# the final score next to the target.
TARGET_SCORE = 68.919  # score to beat (lower is better)

metrics = {
    'cv_score': final_score,
    'baseline_score': baseline_total,
    'ensemble_score': ensemble_total,
    'improvement': baseline_total - ensemble_total,
    'optimized_wins': optimized_wins,
    'baseline_wins': baseline_wins,
    'overlap_repairs': overlap_repairs,
    'score_repairs': score_repairs,
    'validation_passed': len(failed_n) == 0
}

# Make sure the experiment directory exists so open() cannot fail with
# FileNotFoundError when the notebook is run from a fresh checkout.
os.makedirs(WORK_DIR, exist_ok=True)
with open(os.path.join(WORK_DIR, 'metrics.json'), 'w') as f:
    json.dump(metrics, f, indent=2)

print(f"\nFinal Score: {final_score:.6f}")
print(f"Target: {TARGET_SCORE}")
print(f"Gap to target: {final_score - TARGET_SCORE:.6f}")


Final Score: 70.647327
Target: 68.919
Gap to target: 1.728327
