# Loop 6 Analysis: Understanding the Gap and Finding a Path Forward

## Current Situation
- Best LB: 71.81 (valid, non-touching)
- Target: 68.92
- Gap: 2.89 points (4.2% of the target, 4.0% of the current score)

## Key Insight from Experiments
- All 6 experiments converged to 71.81 (valid ensemble)
- 3 experiments used baseline fallback (approaches FAILED)
- The valid ensemble is a LOCAL OPTIMUM for the 'convert existing solutions' approach

## The Core Problem
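Kaggle requires every pair of trees to be separated by a distance strictly greater than 0; trees that merely touch are rejected, not just trees that overlap. The optimized solutions with touching trees score 70.65 but are rejected. The gap between the valid score (71.81) and the touching score (70.65) is ~1.2 points.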

In [1]:
import pandas as pd
import numpy as np
from shapely import affinity
from shapely.geometry import Polygon
from itertools import combinations
import json
import os

# The submission Kaggle accepted as valid; df_valid is the frame the analysis cells read.
valid_submission_csv = '/home/code/submission_candidates/candidate_001.csv'
df_valid = pd.read_csv(valid_submission_csv)

class ChristmasTree:
    """A fixed 15-vertex polygon placed at a given rotation and position.

    The outline is defined around the origin, rotated by ``angle`` about
    (0, 0) with ``shapely.affinity.rotate`` (degrees, per that function's
    default), and then translated by ``(center_x, center_y)``.

    All three constructor arguments are coerced with ``float()``, so numeric
    strings are accepted.

    Attributes:
        center_x: x offset applied after the rotation.
        center_y: y offset applied after the rotation.
        angle: rotation angle handed to ``affinity.rotate``.
        polygon: the rotated and translated shapely ``Polygon``.
    """

    def __init__(self, center_x, center_y, angle):
        self.center_x, self.center_y, self.angle = (
            float(center_x),
            float(center_y),
            float(angle),
        )

        outline = [
            # apex
            (0.0, 0.8),
            # positive-x edge, walking from top to bottom
            (0.125, 0.5), (0.0625, 0.5),
            (0.2, 0.25), (0.1, 0.25),
            (0.35, 0.0), (0.075, 0.0),
            (0.075, -0.2),
            # negative-x edge, walking from bottom back to top
            (-0.075, -0.2), (-0.075, 0.0), (-0.35, 0.0),
            (-0.1, 0.25), (-0.2, 0.25),
            (-0.0625, 0.5), (-0.125, 0.5),
        ]

        # Rotate about the origin first so the translation is not rotated too.
        turned = affinity.rotate(Polygon(outline), self.angle, origin=(0, 0))
        self.polygon = affinity.translate(
            turned, xoff=self.center_x, yoff=self.center_y
        )

def parse_value(val):
    """Return ``val`` as a string, dropping one leading ``'s'`` if present.

    Only ``str`` inputs that start with ``'s'`` are stripped (exactly one
    character is removed). Everything else is returned as ``str(val)``.
    """
    has_s_prefix = isinstance(val, str) and val.startswith('s')
    return val[1:] if has_s_prefix else str(val)

def load_trees_for_n(df, n):
    """Build one ChristmasTree per row of ``df`` belonging to group ``n``.

    A row belongs to group ``n`` when its ``id`` starts with the zero-padded
    prefix ``f"{n:03d}_"``. The ``x``, ``y`` and ``deg`` cells are passed
    through ``parse_value`` before being handed to ``ChristmasTree``.

    Args:
        df: DataFrame with string ``id`` plus ``x``, ``y`` and ``deg`` columns.
        n: group number to select.

    Returns:
        List of ChristmasTree objects in row order; empty when no id matches.
    """
    in_group = df['id'].str.startswith(f"{n:03d}_")
    return [
        ChristmasTree(
            parse_value(record['x']),
            parse_value(record['y']),
            parse_value(record['deg']),
        )
        for _, record in df[in_group].iterrows()
    ]

def get_bounding_box_side(trees):
    """Return the longer side of the axis-aligned box around all tree vertices.

    Every tree's ``polygon.exterior.coords`` is pooled into one point array;
    the result is ``max(x extent, y extent)`` of that array.

    Args:
        trees: non-empty sequence of objects exposing ``polygon.exterior.coords``.

    Returns:
        The larger of the pooled x range and y range.
    """
    vertices = np.concatenate(
        [np.array(tree.polygon.exterior.coords) for tree in trees], axis=0
    )
    x_low, y_low = vertices.min(axis=0)
    x_high, y_high = vertices.max(axis=0)
    width = x_high - x_low
    height = y_high - y_low
    return max(width, height)

def get_min_distance(trees):
    """Return the smallest pairwise ``polygon.distance`` among ``trees``.

    Every unordered pair is checked once. With fewer than two trees there is
    no pair, and the result is ``float('inf')``.

    Args:
        trees: indexable sequence of objects exposing ``polygon.distance(other)``.

    Returns:
        The minimum distance found, or ``float('inf')`` if no pair exists.
    """
    closest = float('inf')
    count = len(trees)
    for first in range(count):
        for second in range(first + 1, count):
            gap = trees[first].polygon.distance(trees[second].polygon)
            if gap < closest:
                closest = gap
    return closest

# Marker output confirming the definitions in this cell were executed.
print("Functions loaded")

Functions loaded


In [2]:
# Where does the valid submission's score come from?
# Sum the per-N contribution side^2 / N inside each N range.
print("Score breakdown by N range:")
print("="*60)

ranges = [(1, 10), (11, 50), (51, 100), (101, 150), (151, 200)]
range_scores = {}

for start, end in ranges:
    range_total = 0
    for n in range(start, end + 1):
        trees = load_trees_for_n(df_valid, n)
        if len(trees) != n:
            # Group n does not hold exactly n rows; it adds nothing to the subtotal.
            continue
        side = get_bounding_box_side(trees)
        range_total += (side ** 2) / n
    range_scores[(start, end)] = range_total
    print(f"N={start:3d}-{end:3d}: {range_total:.4f}")

print(f"\nTotal: {sum(range_scores.values()):.4f}")

Score breakdown by N range:
N=  1- 10: 4.3317


N= 11- 50: 14.8588


N= 51-100: 17.8879


N=101-150: 17.4440


N=151-200: 17.2904

Total: 71.8128


In [3]:
# Rank every N by its score contribution; the largest contributors are where
# an improvement would pay off most.
print("\nTop 20 N values by score contribution:")
print("="*60)

per_n_stats = []
for n in range(1, 201):
    trees = load_trees_for_n(df_valid, n)
    if len(trees) != n:
        # Skip groups that do not hold exactly n trees.
        continue
    side = get_bounding_box_side(trees)
    contrib = (side ** 2) / n
    per_n_stats.append((n, contrib, side, get_min_distance(trees)))

# Tuple layout: (n, contribution, bounding side, minimum pairwise distance).
n_scores = sorted(per_n_stats, key=lambda record: record[1], reverse=True)
for n, contrib, side, min_d in n_scores[:20]:
    print(f"N={n:3d}: score={contrib:.4f}, side={side:.4f}, min_dist={min_d:.6f}")


Top 20 N values by score contribution:


N=  1: score=0.6612, side=0.8132, min_dist=inf
N=  2: score=0.4508, side=0.9495, min_dist=0.000000
N=  3: score=0.4347, side=1.1420, min_dist=0.000000
N=  5: score=0.4170, side=1.4440, min_dist=0.000000
N=  4: score=0.4166, side=1.2909, min_dist=0.000000
N=  7: score=0.4002, side=1.6737, min_dist=0.000000
N=  6: score=0.3998, side=1.5489, min_dist=0.000000
N=  9: score=0.3875, side=1.8676, min_dist=0.000000
N=  8: score=0.3866, side=1.7586, min_dist=0.000000
N= 14: score=0.3821, side=2.3130, min_dist=0.000000
N= 15: score=0.3795, side=2.3858, min_dist=0.000000
N= 22: score=0.3790, side=2.8876, min_dist=0.000000
N= 19: score=0.3789, side=2.6830, min_dist=0.000000
N= 20: score=0.3786, side=2.7517, min_dist=0.000000
N= 27: score=0.3779, side=3.1941, min_dist=0.000000
N= 29: score=0.3774, side=3.3083, min_dist=0.000000
N= 21: score=0.3772, side=2.8145, min_dist=0.000000
N= 10: score=0.3771, side=1.9418, min_dist=0.000000
N= 11: score=0.3770, side=2.0364, min_dist=0.000000
N= 16: score=0.37

In [4]:
# Packing-density estimate of the achievable score.
# The tree polygon's area is ~0.2456 (printed below), so N trees need a square
# of side >= sqrt(area * N / efficiency) at a given packing efficiency.
# The per-N contribution side^2 / N is therefore area / efficiency for every N:
# ~0.2456 at 100% packing and ~0.351 at the 70% assumed here.

print("\nTheoretical analysis:")
print("="*60)

PACKING_EFFICIENCY = 0.7      # assumed fraction of the bounding square covered by trees
N_MAX = 200                   # the estimate sums contributions for N = 1..N_MAX
CURRENT_VALID_SCORE = 71.81   # best valid leaderboard score so far
TARGET_SCORE = 68.92

# Tree area calculation (same outline as ChristmasTree, unrotated and untranslated)
tree_polygon = Polygon([
    (0.0, 0.8), (0.125, 0.5), (0.0625, 0.5),
    (0.2, 0.25), (0.1, 0.25), (0.35, 0.0),
    (0.075, 0.0), (0.075, -0.2), (-0.075, -0.2),
    (-0.075, 0.0), (-0.35, 0.0), (-0.1, 0.25),
    (-0.2, 0.25), (-0.0625, 0.5), (-0.125, 0.5),
])
min_x, min_y, max_x, max_y = tree_polygon.bounds
print(f"Tree area: {tree_polygon.area:.4f}")
print(f"Tree bounding box: {tree_polygon.bounds}")
print(f"Tree width: {max_x - min_x:.4f}")
print(f"Tree height: {max_y - min_y:.4f}")

# Score if every N reached the assumed packing efficiency. Taking the square
# root for the side and squaring it again cancels, so each N contributes the
# same constant and no per-N loop is needed.
contrib_per_n = tree_polygon.area / PACKING_EFFICIENCY
theoretical_min = N_MAX * contrib_per_n

# NOTE: this is an estimate under an assumed efficiency, not a true lower bound.
# At 70% it lands above the target, so reaching the target needs roughly 71%
# efficiency on average (area * N_MAX / TARGET_SCORE).
print(f"\nTheoretical minimum score ({PACKING_EFFICIENCY:.0%} packing): {theoretical_min:.4f}")
print(f"Current valid score: {CURRENT_VALID_SCORE:.2f}")
print(f"Target score: {TARGET_SCORE:.2f}")
print(f"Gap to target: {CURRENT_VALID_SCORE - TARGET_SCORE:.2f}")


Theoretical analysis:
Tree area: 0.2456
Tree bounding box: (-0.35, -0.2, 0.35, 0.8)
Tree width: 0.7000
Tree height: 1.0000

Theoretical minimum score (70% packing): 70.1786
Current valid score: 71.81
Target score: 68.92
Gap to target: 2.89


In [5]:
# CRITICAL: at 6 decimals the valid submission prints min_dist=0.000000 for
# most N values, yet Kaggle accepted it. Print the same distances with 15
# decimals to see whether the gaps are really zero.

print("Checking min distances with higher precision:")
print("="*60)

for n in (2, 3, 4, 5, 10, 20, 50, 100, 200):
    trees = load_trees_for_n(df_valid, n)
    if len(trees) != n:
        # Only report groups that hold exactly n trees.
        continue
    min_d = get_min_distance(trees)
    print(f"N={n:3d}: min_distance = {min_d:.15f}")

Checking min distances with higher precision:
N=  2: min_distance = 0.000000001970956
N=  3: min_distance = 0.000000038787745
N=  4: min_distance = 0.000000001092745
N=  5: min_distance = 0.000000001979614
N= 10: min_distance = 0.000000026993416
N= 20: min_distance = 0.000000002109242
N= 50: min_distance = 0.000000013050365
N=100: min_distance = 0.000000003614744


N=200: min_distance = 0.000000001060337


In [6]:
# The key question: How do we get from 71.81 to 68.92?
# That's a 2.89 point improvement needed (4.0% of the current score)

# Let's compare with the touching ensemble to understand the gap
df_touching = pd.read_csv('/home/code/submission_candidates/candidate_000.csv')

print("Comparing valid vs touching ensemble:")
print("="*60)

target_score = 68.92  # leaderboard score we are trying to reach

valid_total = 0
touching_total = 0
gap_by_range = {}  # (start, end) -> valid subtotal minus touching subtotal

for start, end in [(1, 10), (11, 50), (51, 100), (101, 150), (151, 200)]:
    valid_range = 0
    touching_range = 0
    for n in range(start, end + 1):
        trees_v = load_trees_for_n(df_valid, n)
        trees_t = load_trees_for_n(df_touching, n)
        # Compare only N values that are complete in BOTH submissions so the
        # two subtotals cover the same set of N.
        if len(trees_v) == n and len(trees_t) == n:
            side_v = get_bounding_box_side(trees_v)
            side_t = get_bounding_box_side(trees_t)
            valid_range += (side_v ** 2) / n
            touching_range += (side_t ** 2) / n
    valid_total += valid_range
    touching_total += touching_range
    gap = valid_range - touching_range
    gap_by_range[(start, end)] = gap
    print(f"N={start:3d}-{end:3d}: Valid={valid_range:.4f}, Touching={touching_range:.4f}, Gap={gap:.4f}")

print(f"\nTotal: Valid={valid_total:.4f}, Touching={touching_total:.4f}, Gap={valid_total-touching_total:.4f}")
print(f"\nTarget: {target_score:.2f}")
print(f"Gap from valid to target: {valid_total - target_score:.4f}")
print(f"Gap from touching to target: {touching_total - target_score:.4f}")

In [None]:
# Check if there are any pre-optimized CSVs with better valid configurations
import glob

print("Searching for pre-optimized CSVs with valid configurations...")
print("="*60)

# Find all CSVs in snapshots. glob makes no ordering promise, so sort the
# result to make the sample below the same on every run.
csv_files = sorted(
    glob.glob('/home/nonroot/snapshots/santa-2025/**/*.csv', recursive=True)
)
print(f"Found {len(csv_files)} CSV files")

# Columns load_trees_for_n reads; a file missing any of them is not a submission.
required_columns = {'id', 'x', 'y', 'deg'}

# Sample a few to check their scores
sample_files = csv_files[:10]
for f in sample_files:
    try:
        df_candidate = pd.read_csv(f)
        if not required_columns.issubset(df_candidate.columns):
            continue
        # Check if it has all 200 groups
        groups = df_candidate['id'].str.split('_').str[0].unique()
        if len(groups) < 200:
            continue
        # Calculate score for N=1-10
        total = 0
        for n in range(1, 11):
            trees = load_trees_for_n(df_candidate, n)
            if len(trees) == n:
                side = get_bounding_box_side(trees)
                total += (side ** 2) / n
        print(f"{os.path.basename(f)}: N=1-10 score = {total:.4f}")
    except Exception as e:
        # Still best-effort: one bad file must not stop the scan, but report
        # it so a missing result is not mistaken for "no such file".
        print(f"{os.path.basename(f)}: skipped ({type(e).__name__}: {e})")