# Baseline Validation

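Validate the best available baseline submission (checking every N-tree configuration for overlapping trees) and calculate the per-N scores, i.e. each configuration's bounding-square contribution `side² / N` for N = 1…200.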

In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
from shapely import affinity
from shapely.ops import unary_union
import json
import warnings
warnings.filterwarnings('ignore')

# Tree shape vertices (from competition description).
# Each pair is one (x, y) vertex of the tree outline in the tree's own local
# frame. The 15 vertices start at the top point (0, 0.8), run down the
# positive-x side and come back up the negative-x side.
_TREE_VERTICES = [
    (0, 0.8),
    (0.125, 0.5), (0.0625, 0.5),
    (0.2, 0.25), (0.1, 0.25),
    (0.35, 0), (0.075, 0),
    (0.075, -0.2), (-0.075, -0.2),
    (-0.075, 0), (-0.35, 0),
    (-0.1, 0.25), (-0.2, 0.25),
    (-0.0625, 0.5), (-0.125, 0.5),
]
# Parallel coordinate lists, kept because the rest of the notebook zips TX with TY.
TX = [vx for vx, _ in _TREE_VERTICES]
TY = [vy for _, vy in _TREE_VERTICES]

def create_tree_polygon(x, y, angle):
    """Build the tree outline, rotate it, then move it to (x, y).

    Args:
        x: Horizontal offset applied after rotation.
        y: Vertical offset applied after rotation.
        angle: Rotation passed to ``affinity.rotate`` (degrees by shapely's
            default), applied about the outline's local origin (0, 0).

    Returns:
        A shapely ``Polygon`` for the placed tree.
    """
    outline = Polygon(list(zip(TX, TY)))
    # Rotate about the local origin first so the translation below places
    # that same origin exactly at (x, y).
    rotated = affinity.rotate(outline, angle, origin=(0, 0))
    return affinity.translate(rotated, xoff=x, yoff=y)

def parse_value(val):
    """Convert a submission cell to ``float``.

    Submission values are stored as strings with a leading ``'s'``
    (e.g. ``'s-0.25'``); that marker is stripped before conversion.
    Anything else is handed to ``float`` unchanged.
    """
    is_prefixed = isinstance(val, str) and val.startswith('s')
    return float(val[1:] if is_prefixed else val)

# Confirmation that this cell ran to the end and the helpers above are defined.
print("Functions defined successfully")

Functions defined successfully


In [2]:
# Load submission
submission_path = '/home/code/experiments/000_baseline/submission.csv'
df = pd.read_csv(submission_path)

# One row per tree: N rows for every N in 1..200.
n_expected_rows = sum(range(1, 201))

print(f"Submission shape: {df.shape}")
print(f"Expected rows: {n_expected_rows} (1+2+...+200 = 20100)")
print("\nFirst few rows:")
print(df.head(10))

Submission shape: (20100, 4)
Expected rows: 20100 (1+2+...+200 = 20100)

First few rows:
      id                       x                       y  \
0  001_0    s-48.196086194214246     s58.770984615214225   
1  002_0   s0.154097069621355887  s-0.038540742694794648   
2  002_1  s-0.154097069621372845  s-0.561459257305224058   
3  003_0      s1.123655816140301      s0.781101815992563   
4  003_1       s1.23405569584216      s1.275999500663759   
5  003_2      s0.641714640229075      s1.180458566613381   
6  004_0  s-0.324747789589372171   s0.132109978088185392   
7  004_1   s0.315354346242637695   s0.132109978063475492   
8  004_2   s0.324747789592379210  s-0.732109978069475531   
9  004_3  s-0.315354348134818330  s-0.732109978094185987   

                       deg  
0                    s45.0  
1  s203.629377730656841550  
2   s23.629377730656791812  
3        s111.125132292893  
4         s66.370622269343  
5      s155.13405193710082  
6  s156.370622145636389178  
7  s156.3706222692

In [3]:
def has_overlap(polygons):
    """Return True if any two polygons share interior area.

    Every unordered pair is examined once. A pair counts as overlapping only
    when the shapes intersect, are not merely touching, and the area of their
    intersection exceeds 1e-10 (so numerically negligible slivers are ignored).
    """
    for idx, first in enumerate(polygons):
        for second in polygons[idx + 1:]:
            if not first.intersects(second):
                continue
            if first.touches(second):
                # Boundary-only contact is allowed.
                continue
            # Check if intersection area is significant
            if first.intersection(second).area > 1e-10:
                return True
    return False

def calculate_per_n_score(df, max_n=200):
    """Score every N-tree configuration in a submission.

    For each N from 1 to ``max_n`` the rows whose ``id`` starts with the
    zero-padded prefix ``"NNN_"`` are turned into tree polygons, checked for
    pairwise overlap, and scored as ``side ** 2 / N``, where ``side`` is the
    longer edge of the axis-aligned bounding box around all N trees.

    Args:
        df: Submission frame with columns ``id``, ``x``, ``y`` and ``deg``.
            The numeric columns may carry the submission's ``'s'`` prefix;
            they are decoded with ``parse_value``.
        max_n: Largest N to evaluate (inclusive). Defaults to 200, the range
            the original notebook hard-coded.

    Returns:
        A tuple ``(per_n_scores, per_n_sides, overlap_errors)``:
        ``per_n_scores`` maps N to its score, ``per_n_sides`` maps N to the
        bounding-square side, and ``overlap_errors`` lists every N whose
        trees overlap. An N whose row count is not exactly N is reported
        with a warning and left out of both dicts. Overlapping
        configurations are still scored.
    """
    per_n_scores = {}
    per_n_sides = {}
    overlap_errors = []

    for n in range(1, max_n + 1):
        # Get trees for this N. na=False keeps a missing id from turning the
        # boolean mask into an object column that cannot be used for indexing.
        prefix = f"{n:03d}_"
        group = df[df['id'].str.startswith(prefix, na=False)]

        if len(group) != n:
            print(f"WARNING: N={n} has {len(group)} trees, expected {n}")
            continue

        # Create polygons straight from the three columns; this avoids building
        # a Series per row as iterrows() does.
        polygons = [
            create_tree_polygon(parse_value(x), parse_value(y), parse_value(deg))
            for x, y, deg in zip(group['x'], group['y'], group['deg'])
        ]

        # Check for overlaps (recorded, but the configuration is still scored)
        if has_overlap(polygons):
            overlap_errors.append(n)

        # Calculate bounding box of all trees together
        minx, miny, maxx, maxy = unary_union(polygons).bounds
        side = max(maxx - minx, maxy - miny)

        # Score contribution: side^2 / n
        per_n_scores[n] = (side ** 2) / n
        per_n_sides[n] = side

    return per_n_scores, per_n_sides, overlap_errors

print("Calculating per-N scores...")
per_n_scores, per_n_sides, overlap_errors = calculate_per_n_score(df)
print(f"Done! Processed {len(per_n_scores)} N values")
print(f"Overlap errors: {len(overlap_errors)}")
if overlap_errors:
    # Show at most the first 10 offenders, and add "..." only when the list
    # really was truncated so the output never implies hidden entries.
    shown = overlap_errors[:10]
    suffix = "..." if len(overlap_errors) > len(shown) else ""
    print(f"N values with overlaps: {shown}{suffix}")

Calculating per-N scores...


Done! Processed 200 N values
Overlap errors: 0


In [4]:
# Calculate total score
TARGET_SCORE = 68.888293  # target this baseline is compared against
total_score = sum(per_n_scores.values())
print(f"Total Score: {total_score:.6f}")
print(f"Target Score: {TARGET_SCORE:.6f}")
print(f"Gap: {total_score - TARGET_SCORE:.6f}")

ranges = [(1, 1), (2, 5), (6, 10), (11, 50), (51, 100), (101, 200)]

# calculate_per_n_score leaves out any N whose row count was wrong. Say so
# explicitly, because the total above would otherwise be silently too low.
missing_n = [
    n
    for lo, hi in ranges
    for n in range(lo, hi + 1)
    if n not in per_n_scores
]
if missing_n:
    print(f"WARNING: {len(missing_n)} N values have no score and are excluded "
          f"from the totals (first few: {missing_n[:10]})")

# Show score breakdown by N ranges
print("\n=== Score Breakdown by N Range ===")
for start, end in ranges:
    # .get() keeps a skipped N from raising KeyError; it contributes 0 here,
    # matching its absence from total_score.
    range_score = sum(per_n_scores.get(n, 0.0) for n in range(start, end+1))
    print(f"N={start}-{end}: {range_score:.4f}")

Total Score: 70.615791
Target Score: 68.888293
Gap: 1.727498

=== Score Breakdown by N Range ===
N=1-1: 0.6612
N=2-5: 1.7189
N=6-10: 1.9490
N=11-50: 14.7036
N=51-100: 17.6063
N=101-200: 33.9768


In [5]:
# Show individual scores for small N (highest impact)
print("\n=== Small N Scores (Highest Impact) ===")
for n in range(1, 21):
    if n not in per_n_scores:
        # calculate_per_n_score omits an N whose row count was wrong; report
        # that instead of failing with KeyError. Scores and sides are always
        # stored together, so checking one dict is enough.
        print(f"N={n:3d}: no score (skipped during validation)")
        continue
    print(f"N={n:3d}: side={per_n_sides[n]:.6f}, score={per_n_scores[n]:.6f}")


=== Small N Scores (Highest Impact) ===
N=  1: side=0.813173, score=0.661250
N=  2: side=0.949504, score=0.450779
N=  3: side=1.142031, score=0.434745
N=  4: side=1.290806, score=0.416545
N=  5: side=1.443692, score=0.416850
N=  6: side=1.548438, score=0.399610
N=  7: side=1.673104, score=0.399897
N=  8: side=1.755921, score=0.385407
N=  9: side=1.867280, score=0.387415
N= 10: side=1.940696, score=0.376630
N= 11: side=2.030803, score=0.374924
N= 12: side=2.114873, score=0.372724
N= 13: side=2.199960, score=0.372294
N= 14: side=2.274555, score=0.369543
N= 15: side=2.377862, score=0.376949
N= 16: side=2.446640, score=0.374128
N= 17: side=2.508124, score=0.370040
N= 18: side=2.576409, score=0.368771
N= 19: side=2.646449, score=0.368615
N= 20: side=2.742469, score=0.376057


In [6]:
# Save metrics and per-N scores
metrics_path = '/home/code/experiments/000_baseline/metrics.json'
per_n_path = '/home/code/experiments/000_baseline/per_n_scores.json'
target_score = 68.888293

# Summary of this run; 'cv_score' and 'total_score' hold the same number.
metrics = {
    'cv_score': total_score,
    'total_score': total_score,
    'overlap_errors': len(overlap_errors),
    'target': target_score,
    'gap': total_score - target_score,
    'source': 'snapshot_21337107511'
}
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=2)

# Save per-N scores for future comparison.
# The integer N keys are written as strings, as JSON object keys always are.
per_n_data = {
    'scores': per_n_scores,
    'sides': per_n_sides
}
with open(per_n_path, 'w') as f:
    json.dump(per_n_data, f)

print("Metrics saved!")
print(f"\nFinal CV Score: {total_score:.6f}")

Metrics saved!

Final CV Score: 70.615791
