# Experiment 002: Fix Ensemble Overlaps

The previous ensemble submission failed with "Overlapping trees in group 002".
We need to:
1. Check all N values with strict tolerance (1e-15)
2. Replace overlapping configurations with valid ones from the baseline submission
3. Verify the fixed submission

In [None]:
# Put /home/code at the front of the module search path so the `utils`
# import below can be resolved (presumably utils lives there — confirm).
import sys
sys.path.insert(0, '/home/code')

import pandas as pd
import numpy as np
# Project helpers for loading submissions, building trees for a given N,
# overlap checks and scoring; their exact semantics are defined in utils.
from utils import (
    ChristmasTree, load_submission, load_trees_for_n, get_trees_data_for_n,
    has_overlap, has_overlap_strict, get_bounding_box_side, calculate_score_for_n,
    score_submission, verify_submission_no_overlaps
)
import json

# Reaching this line means every import above succeeded.
print("Utilities loaded successfully!")

In [None]:
# Read the ensemble submission that was rejected for overlapping trees.
ensemble_path = '/home/code/experiments/001_ensemble/submission.csv'
ensemble_df = load_submission(ensemble_path)
print(f"Ensemble loaded: {ensemble_df.shape}")

# Find which N groups overlap. No tolerance is passed here, so the 1e-15
# in the message is presumably the helper's own default — confirm in utils.
print("\nChecking for overlaps with strict tolerance (1e-15)...")
is_valid, overlapping_ns = verify_submission_no_overlaps(ensemble_df)
print(
    f"Is valid: {is_valid}",
    f"Overlapping N values: {overlapping_ns}",
    sep="\n",
)

In [None]:
# Read the baseline submission, which is used as the fallback source of
# configurations and is expected to be overlap-free.
baseline_path = '/home/code/experiments/000_baseline/submission.csv'
baseline_df = load_submission(baseline_path)
print(f"Baseline loaded: {baseline_df.shape}")

# Run the same overlap check on the baseline rather than trusting it blindly.
is_valid_baseline, baseline_overlaps = verify_submission_no_overlaps(baseline_df)
print(
    f"Baseline is valid: {is_valid_baseline}",
    f"Baseline overlapping N values: {baseline_overlaps}",
    sep="\n",
)

In [None]:
# Build the fixed submission: every N group that overlaps in the ensemble is
# swapped for the baseline's configuration of the same N.
fixed_df = ensemble_df.copy()

# Rows to drop and baseline rows to add are collected first and applied with
# a single concat, instead of re-filtering and re-concatenating the whole
# frame once per N.
replace_mask = pd.Series(False, index=fixed_df.index, dtype=bool)
replacement_frames = []
unresolved_ns = []  # N values for which the baseline overlaps as well

# dict.fromkeys drops duplicate N values while keeping their order, so a
# repeated N cannot append the baseline rows twice.
for n in dict.fromkeys(overlapping_ns):
    print(f"\nFixing N={n}...")

    # Only use the baseline group if it passes the strict overlap check.
    baseline_trees = load_trees_for_n(baseline_df, n)
    has_ovlp, _ = has_overlap_strict(baseline_trees)

    if has_ovlp:
        print(f"  WARNING: Baseline also has overlap for N={n}!")
        unresolved_ns.append(n)
        continue

    # Per-N scores get their own names so they cannot be mistaken for the
    # whole-submission `ensemble_score` / `baseline_score` computed later.
    ensemble_trees = load_trees_for_n(ensemble_df, n)
    ensemble_score_n = calculate_score_for_n(ensemble_trees, n)
    baseline_score_n = calculate_score_for_n(baseline_trees, n)

    print(f"  Ensemble score: {ensemble_score_n:.6f}")
    print(f"  Baseline score: {baseline_score_n:.6f}")
    print(f"  Score change: {baseline_score_n - ensemble_score_n:+.6f}")

    # Ids look like "<NNN>_<tree index>"; mark this group's ensemble rows for
    # removal and queue the baseline rows that take their place.
    prefix = f"{n:03d}_"
    replace_mask |= fixed_df['id'].str.startswith(prefix)
    replacement_frames.append(get_trees_data_for_n(baseline_df, n))

fixed_df = pd.concat(
    [fixed_df[~replace_mask], *replacement_frames],
    ignore_index=True,
)

# Restore (N, tree index) order using temporary numeric sort keys parsed from
# the id, then drop the keys again.
fixed_df = (
    fixed_df
    .assign(
        n=fixed_df['id'].apply(lambda x: int(x.split('_')[0])),
        tree_idx=fixed_df['id'].apply(lambda x: int(x.split('_')[1])),
    )
    .sort_values(['n', 'tree_idx'])
    .drop(columns=['n', 'tree_idx'])
    .reset_index(drop=True)
)

print(f"\nFixed submission shape: {fixed_df.shape}")
if unresolved_ns:
    print(f"Unresolved N values (baseline also overlaps): {unresolved_ns}")

In [None]:
# Re-run the overlap check on the patched submission.
print("Verifying fixed submission...")
is_valid_fixed, fixed_overlaps = verify_submission_no_overlaps(fixed_df)
print(
    f"Fixed submission is valid: {is_valid_fixed}",
    f"Fixed overlapping N values: {fixed_overlaps}",
    sep="\n",
)

# Score the patched submission; the overlap check is skipped here because it
# was just performed above.
fixed_score, fixed_scores_by_n, _ = score_submission(
    fixed_df,
    check_overlaps=False,
)
print(f"\nFixed submission score: {fixed_score:.6f}")

In [None]:
# Whole-submission scores for the original ensemble and the baseline, to set
# against the fixed submission's score.
ensemble_score, _, _ = score_submission(ensemble_df, check_overlaps=False)
baseline_score, _, _ = score_submission(baseline_df, check_overlaps=False)

# Assemble the report first, then emit it in one call.
comparison_lines = [
    "Score comparison:",
    f"  Original ensemble: {ensemble_score:.6f} (had overlaps)",
    f"  Fixed ensemble:    {fixed_score:.6f}",
    f"  Baseline:          {baseline_score:.6f}",
    f"\nImprovement over baseline: {baseline_score - fixed_score:.6f}",
    f"\nTarget: 68.890873",
    f"Gap to target: {fixed_score - 68.890873:.6f}",
]
print("\n".join(comparison_lines))

In [None]:
# Persist the fixed submission and its metrics, but only when the overlap
# check on it passed.
if is_valid_fixed:
    import os
    import shutil

    # Convert to builtin types so json.dump cannot fail on numpy scalars
    # (e.g. int64 / bool_). Built before anything is written, so a bad value
    # stops the cell before the submission files are touched.
    metrics = {
        'cv_score': float(fixed_score),
        'original_ensemble_score': float(ensemble_score),
        'baseline_score': float(baseline_score),
        'overlapping_ns_fixed': [int(n) for n in overlapping_ns],
        'is_valid': bool(is_valid_fixed)
    }

    # Save to experiment folder, creating it first so to_csv does not fail
    # on a missing directory.
    fixed_path = '/home/code/experiments/002_fixed_ensemble/submission.csv'
    os.makedirs(os.path.dirname(fixed_path), exist_ok=True)
    fixed_df.to_csv(fixed_path, index=False)
    print(f"Saved fixed submission to {fixed_path}")

    # Copy to submission folder (created on demand as well).
    submission_path = '/home/submission/submission.csv'
    os.makedirs(os.path.dirname(submission_path), exist_ok=True)
    shutil.copy(fixed_path, submission_path)
    print("Copied to /home/submission/submission.csv")

    # Save metrics next to the experiment's submission file.
    with open('/home/code/experiments/002_fixed_ensemble/metrics.json', 'w') as f:
        json.dump(metrics, f, indent=2)
    print(f"\nMetrics saved: {metrics}")
else:
    print("ERROR: Fixed submission still has overlaps!")

In [None]:
# Closing report for the experiment, framed by 60-character rules.
rule = "=" * 60
summary_lines = [
    rule,
    "EXPERIMENT 002: FIXED ENSEMBLE SUMMARY",
    rule,
    f"Original ensemble score: {ensemble_score:.6f} (INVALID - overlaps)",
    f"Fixed ensemble score: {fixed_score:.6f}",
    f"Baseline score: {baseline_score:.6f}",
    f"N values fixed: {overlapping_ns}",
    f"Improvement over baseline: {baseline_score - fixed_score:.6f}",
    f"\nTarget: 68.890873",
    # Absolute gap to the target score, and the same gap as a percentage.
    f"Gap to target: {fixed_score - 68.890873:.6f} ({(fixed_score - 68.890873) / 68.890873 * 100:.2f}%)",
    f"\nSubmission is valid: {is_valid_fixed}",
    rule,
]
print("\n".join(summary_lines))