# Tessellation with Gap Constraints

Implements a tessellation/translation approach for large N values under gap constraints.
Based on the egortrushin kernel, but with the added requirement that the distance between every pair of trees is strictly greater than 0.

In [1]:
import pandas as pd
import numpy as np
from shapely import affinity
from shapely.geometry import Polygon
from itertools import combinations
import json
import copy
import random
import time

class ChristmasTree:
    """A fixed tree-shaped outline placed at a position with a rotation.

    The outline is defined around the origin. Each instance rotates it by
    ``angle`` about (0, 0) via ``shapely.affinity.rotate`` and then shifts
    it by (``center_x``, ``center_y``); the result is stored in ``polygon``.
    """

    # Outline vertices in local coordinates: tip first, down the right-hand
    # side, across the trunk, then back up the left-hand side.
    _OUTLINE = (
        (0.0, 0.8),
        (0.125, 0.5),
        (0.0625, 0.5),
        (0.2, 0.25),
        (0.1, 0.25),
        (0.35, 0.0),
        (0.075, 0.0),
        (0.075, -0.2),
        (-0.075, -0.2),
        (-0.075, 0.0),
        (-0.35, 0.0),
        (-0.1, 0.25),
        (-0.2, 0.25),
        (-0.0625, 0.5),
        (-0.125, 0.5),
    )

    def __init__(self, center_x, center_y, angle):
        # Coerce up front so numeric strings are accepted as well as numbers.
        self.center_x = float(center_x)
        self.center_y = float(center_y)
        self.angle = float(angle)

        # Rotate about the local origin first, then move into place.
        turned = affinity.rotate(Polygon(self._OUTLINE), self.angle, origin=(0, 0))
        self.polygon = affinity.translate(
            turned, xoff=self.center_x, yoff=self.center_y
        )

def parse_value(val):
    """Return ``val`` as a string, dropping one leading 's' from string input.

    Non-string values are simply converted with ``str``; only a single
    leading 's' is removed from strings that start with it.
    """
    if not isinstance(val, str):
        return str(val)
    return val[1:] if val.startswith('s') else str(val)

def load_trees_for_n(df, n):
    """Build the trees of the ``n``-tree configuration stored in ``df``.

    Rows are selected by an ``id`` that starts with the zero-padded prefix
    ``"{n:03d}_"``; each row's ``x``, ``y`` and ``deg`` values go through
    ``parse_value`` before being handed to ``ChristmasTree``.
    Returns the trees in row order (an empty list when no row matches).
    """
    selected = df[df['id'].str.startswith(f"{n:03d}_")]
    return [
        ChristmasTree(*(parse_value(row[column]) for column in ('x', 'y', 'deg')))
        for _, row in selected.iterrows()
    ]

def get_min_distance(trees):
    """Return the smallest pairwise polygon distance among ``trees``.

    With fewer than two trees there is no pair, so the result is ``inf``.
    """
    # Seeding with inf covers the "no pairs" case and keeps the same
    # left-to-right minimum fold as comparing each pair against a running
    # minimum that starts at infinity.
    distances = [float('inf')]
    distances.extend(
        first.polygon.distance(second.polygon)
        for first, second in combinations(trees, 2)
    )
    return min(distances)

def get_bounding_box_side(trees):
    """Return the longer side of the axis-aligned box around all trees.

    Every exterior vertex of every tree polygon is stacked into one array;
    the result is the larger of the x-extent and the y-extent.
    """
    vertices = np.vstack(
        [np.array(tree.polygon.exterior.coords) for tree in trees]
    )
    lower = vertices.min(axis=0)
    upper = vertices.max(axis=0)
    width, height = upper - lower
    return max(width, height)

def has_collision_with_gap(trees, min_gap=1e-9):
    """Return True if any two trees are closer than ``min_gap``.

    Pairs are examined in index order and the scan stops at the first
    violation. Fewer than two trees can never collide.
    """
    return any(
        first.polygon.distance(second.polygon) < min_gap
        for first, second in combinations(trees, 2)
    )

# Marker output confirming that the helper definitions above were executed.
print("Functions defined")

Functions defined


In [4]:
def create_tessellation(n, base_trees, dx, dy):
    """
    Lay out ``n`` trees by repeating the base trees on a square grid.

    base_trees: motif trees copied into every grid cell (the SA driver
        passes two, but any non-empty list works)
    dx, dy: translation step between neighbouring cells along x and y

    Cells are filled column by column (x index outermost, y index inner),
    and within a cell the base trees are placed in list order, until ``n``
    trees exist. Returns a list of ``n`` new ChristmasTree objects.
    """
    # One extra cell per axis beyond ceil(sqrt(n / motif size)).
    cells_per_axis = int(np.ceil(np.sqrt(n / len(base_trees)))) + 1

    placed = []
    for col in range(cells_per_axis):
        for row in range(cells_per_axis):
            for base in base_trees:
                if len(placed) >= n:
                    return placed[:n]
                placed.append(
                    ChristmasTree(
                        base.center_x + col * dx,
                        base.center_y + row * dy,
                        base.angle,
                    )
                )
    return placed[:n]

def tessellation_sa(n, iterations=5000, min_gap=1e-9):
    """
    Simulated annealing over the parameters of a two-tree tessellation.

    Optimizes: the two base tree positions, their angles, and the
    translation distances ``dx`` / ``dy``. The objective is the side of the
    bounding box of the ``n``-tree layout built by ``create_tessellation``.
    Layouts in which any pair of trees is closer than ``min_gap`` are
    rejected outright.

    n: number of trees to place (also seeds the RNG as ``42 + n``, which
        reseeds the global ``random`` module)
    iterations: number of annealing steps; ``0`` or less skips the search
        and returns the initial layout
    min_gap: minimum allowed distance between any two trees

    Returns ``(trees, best_side)`` for the best layout found, or ``None``
    when no collision-free starting layout could be built.
    """
    random.seed(42 + n)

    # Two base trees at different angles, offset from each other.
    base_trees = [
        ChristmasTree(0, 0, 0),
        ChristmasTree(0.5, 0.5, 180),
    ]

    # Start with generous spacing so the first layout is likely valid.
    dx = 1.5
    dy = 1.5
    trees = create_tessellation(n, base_trees, dx, dy)

    if has_collision_with_gap(trees, min_gap):
        # Try progressively larger spacing; give up if none of them works.
        for scale in [2.0, 2.5, 3.0, 4.0]:
            dx = scale
            dy = scale
            trees = create_tessellation(n, base_trees, dx, dy)
            if not has_collision_with_gap(trees, min_gap):
                break
        else:
            print(f"N={n}: Could not create valid initial tessellation")
            return None

    best_side = get_bounding_box_side(trees)
    best_params = {
        'base_trees': [(t.center_x, t.center_y, t.angle) for t in base_trees],
        'dx': dx,
        'dy': dy
    }

    current_side = best_side
    T = 1.0
    T_min = 0.0001
    # Geometric cooling from T to T_min over `iterations` steps. With no
    # steps there is nothing to cool, and 1.0 / iterations would divide by
    # zero, so fall back to a neutral factor.
    alpha = (T_min / T) ** (1.0 / iterations) if iterations > 0 else 1.0

    for _ in range(iterations):
        move_type = random.randint(0, 4)
        step = T  # move sizes shrink together with the temperature

        # Build a candidate state; the current state is only replaced on
        # acceptance, so a rejected move needs no explicit revert.
        cand_bases = list(base_trees)
        cand_dx, cand_dy = dx, dy

        if move_type in (0, 1):  # Shift base tree 0 or 1
            src = base_trees[move_type]
            cand_bases[move_type] = ChristmasTree(
                src.center_x + random.uniform(-0.1, 0.1) * step,
                src.center_y + random.uniform(-0.1, 0.1) * step,
                src.angle
            )
        elif move_type == 2:  # Rotate one of the base trees
            idx = random.randint(0, 1)
            src = base_trees[idx]
            cand_bases[idx] = ChristmasTree(
                src.center_x,
                src.center_y,
                (src.angle + random.uniform(-10, 10) * step) % 360
            )
        elif move_type == 3:  # Perturb dx (never below 0.5)
            cand_dx = max(0.5, dx + random.uniform(-0.1, 0.1) * step)
        else:  # Perturb dy (never below 0.5)
            cand_dy = max(0.5, dy + random.uniform(-0.1, 0.1) * step)

        cand_trees = create_tessellation(n, cand_bases, cand_dx, cand_dy)

        if not has_collision_with_gap(cand_trees, min_gap):
            new_side = get_bounding_box_side(cand_trees)
            delta = new_side - current_side

            # Metropolis rule: always take improvements, sometimes take
            # worse layouts depending on the temperature.
            if delta < 0 or random.random() < np.exp(-delta / T):
                base_trees = cand_bases
                dx, dy = cand_dx, cand_dy
                current_side = new_side
                if new_side < best_side:
                    best_side = new_side
                    best_params = {
                        'base_trees': [
                            (t.center_x, t.center_y, t.angle) for t in base_trees
                        ],
                        'dx': dx,
                        'dy': dy
                    }

        T *= alpha

    # Rebuild the layout from the best parameters seen (the current state
    # may be a worse, later-accepted layout).
    base_trees = [ChristmasTree(p[0], p[1], p[2]) for p in best_params['base_trees']]
    trees = create_tessellation(n, base_trees, best_params['dx'], best_params['dy'])

    return trees, best_side

# Marker output confirming that the tessellation functions above were defined.
print("Tessellation SA defined")

Tessellation SA defined


In [7]:
# The tessellation approach is 2.7-3.5x worse than valid ensemble
# Let's analyze where the valid ensemble is weakest compared to touching solutions

# Load both ensembles
df_valid = pd.read_csv('/home/code/submission_candidates/candidate_001.csv')
df_touching = pd.read_csv('/home/code/submission_candidates/candidate_002.csv')

print("Comparing valid vs touching ensemble by N:")
print(f"{'N':>4} {'Valid':>10} {'Touching':>10} {'Gap':>10} {'Gap%':>8}")
print("-" * 50)

# One entry per N: (n, valid_contrib, touch_contrib, gap, gap_pct), where a
# contribution is side**2 / n for that N's layout.
gaps = []
for n in range(1, 201):
    trees_valid = load_trees_for_n(df_valid, n)
    trees_touch = load_trees_for_n(df_touching, n)

    valid_side = get_bounding_box_side(trees_valid)
    touch_side = get_bounding_box_side(trees_touch)

    valid_contrib = (valid_side ** 2) / n
    touch_contrib = (touch_side ** 2) / n

    gap = valid_contrib - touch_contrib
    gap_pct = (gap / touch_contrib) * 100
    gaps.append((n, valid_contrib, touch_contrib, gap, gap_pct))

    # Only print the first 20 rows and every 20th row to keep output short.
    if n <= 20 or n % 20 == 0:
        print(f"{n:4d} {valid_contrib:10.4f} {touch_contrib:10.4f} {gap:10.4f} {gap_pct:8.2f}%")

# Find N values with largest gaps
# (the heading previously contained an escaped "\\n" that printed a literal
# backslash-n in front of "Top 10")
print("\nTop 10 N values with largest gaps (potential for improvement):")
gaps_sorted = sorted(gaps, key=lambda x: x[3], reverse=True)
for n, valid_c, touch_c, gap, gap_pct in gaps_sorted[:10]:
    print(f"N={n:3d}: gap={gap:.4f} ({gap_pct:.2f}%)")

Comparing valid vs touching ensemble by N:
   N      Valid   Touching        Gap     Gap%
--------------------------------------------------
   1     0.6612     0.6612     0.0000     0.00%
   2     0.4508     0.4508     0.0000     0.00%
   3     0.4347     0.4347     0.0000     0.00%
   4     0.4166     0.4166     0.0000     0.00%
   5     0.4170     0.4170     0.0001     0.01%
   6     0.3998     0.3998     0.0000     0.01%
   7     0.4002     0.4000     0.0002     0.04%
   8     0.3866     0.3856     0.0010     0.25%
   9     0.3875     0.3875     0.0000     0.00%
  10     0.3771     0.3771     0.0000     0.00%
  11     0.3770     0.3770     0.0000     0.00%
  12     0.3752     0.3752     0.0000     0.00%
  13     0.3732     0.3727     0.0005     0.13%
  14     0.3821     0.3821     0.0000     0.00%
  15     0.3795     0.3795     0.0000     0.00%
  16     0.3760     0.3760     0.0000     0.00%
  17     0.3708     0.3708     0.0000     0.00%
  18     0.3709     0.3709     0.0000     0

  40     0.3621     0.3621     0.0000     0.00%


  60     0.3687     0.3687     0.0000     0.00%


  80     0.3449     0.3449     0.0000     0.00%


 100     0.3461     0.3461     0.0000     0.00%


 120     0.3377     0.3377     0.0000     0.00%


 140     0.3401     0.3401     0.0000     0.00%


 160     0.3408     0.3408     0.0000     0.00%


 180     0.3342     0.3342     0.0000     0.00%


 200     0.3380     0.3380     0.0000     0.00%

\nTop 10 N values with largest gaps (potential for improvement):
N=  8: gap=0.0010 (0.25%)
N= 13: gap=0.0005 (0.13%)
N=  7: gap=0.0002 (0.04%)
N=  5: gap=0.0001 (0.01%)
N=  6: gap=0.0000 (0.01%)
N=  1: gap=0.0000 (0.00%)
N=  2: gap=0.0000 (0.00%)
N=  3: gap=0.0000 (0.00%)
N=  4: gap=0.0000 (0.00%)
N=  9: gap=0.0000 (0.00%)


In [8]:
# Sum the per-N contributions collected in `gaps`
# (each entry is (n, valid_contrib, touch_contrib, gap, gap_pct)).
valid_total = sum(valid_c for _, valid_c, _, _, _ in gaps)
touch_total = sum(touch_c for _, _, touch_c, _, _ in gaps)
total_gap = valid_total - touch_total

print(f"Valid ensemble total: {valid_total:.6f}")
print(f"Touching ensemble total: {touch_total:.6f}")
print(f"Total gap: {total_gap:.6f}")
print(f"Target: 68.919154")
print(f"Gap to target from valid: {valid_total - 68.919154:.6f}")
print(f"Gap to target from touching: {touch_total - 68.919154:.6f}")

# Observation: the two ensembles score almost identically, so the valid
# ensemble already consists of (near) non-touching solutions.
# The remaining gap to the target is ~2.89 points - closing it needs
# fundamentally better solutions.

Valid ensemble total: 71.812779
Touching ensemble total: 71.811060
Total gap: 0.001720
Target: 68.919154
Gap to target from valid: 2.893625
Gap to target from touching: 2.891906


In [9]:
# Inspect the 70.559 ensemble - it scores better than our 71.81.
df_70 = pd.read_csv('/home/nonroot/snapshots/santa-2025/21156852373/code/ensemble_70.559.csv')
print(f"Shape: {df_70.shape}")
print(df_70.head())

# Score every N and count how many layouts keep all trees strictly apart.
n_values = range(1, 201)
total_score = 0
valid_count = 0

for n in n_values:
    trees = load_trees_for_n(df_70, n)
    total_score += (get_bounding_box_side(trees) ** 2) / n
    if get_min_distance(trees) > 0:
        valid_count += 1

# Every N that is not strictly separated counts as touching.
touching_count = len(n_values) - valid_count

print(f"\nTotal score: {total_score:.6f}")
print(f"Valid N values (dist > 0): {valid_count}")
print(f"Touching N values (dist = 0): {touching_count}")

Shape: (20100, 4)
      id                       x                        y  \
0  001_0  s43.591192092102147626  s-31.783267068741778871   
1  002_0   s0.154097069621355887   s-0.038540742694794648   
2  002_1  s-0.154097069621372845   s-0.561459257305224058   
3  003_0   s1.131270585068746337    s0.792202872326948637   
4  003_1   s1.234055695842160016    s1.275999500663759001   

                       deg  
0   s44.999999999999978684  
1  s144.272760863123583164  
2  s324.272760863123533426  
3  s113.563260441729482864  
4   s66.370622269343002131  



Total score: 70.559048
Valid N values (dist > 0): 118
Touching N values (dist = 0): 82


In [10]:
# Same check for the 70.926 submission.
df_70_9 = pd.read_csv('/home/nonroot/snapshots/santa-2025/21180221700/code/exploration/datasets/solutions/submission_70.926137.csv')
print(f"Shape: {df_70_9.shape}")

# Score every N and count how many layouts keep all trees strictly apart.
n_values = range(1, 201)
total_score = 0
valid_count = 0

for n in n_values:
    trees = load_trees_for_n(df_70_9, n)
    total_score += (get_bounding_box_side(trees) ** 2) / n
    if get_min_distance(trees) > 0:
        valid_count += 1

# Every N that is not strictly separated counts as touching.
touching_count = len(n_values) - valid_count

print(f"\nTotal score: {total_score:.6f}")
print(f"Valid N values (dist > 0): {valid_count}")
print(f"Touching N values (dist = 0): {touching_count}")

Shape: (20100, 4)



Total score: 70.926137
Valid N values (dist > 0): 200
Touching N values (dist = 0): 0


In [11]:
# Key finding: submission_70.926137.csv
# - scores 70.926 (0.89 points better than our 71.81)
# - has dist > 0 for all 200 N values

# Re-verify by recording the minimum pairwise distance for every N.
print("Verifying submission_70.926137.csv...")

min_distances = [
    (n, get_min_distance(load_trees_for_n(df_70_9, n)))
    for n in range(1, 201)
]
for n, min_dist in min_distances:
    if min_dist <= 0:
        print(f"WARNING: N={n} has min_dist={min_dist}")

# Smallest separation over the whole submission.
all_min = min(dist for _, dist in min_distances)
print(f"\nMinimum distance across all N: {all_min:.2e}")

# The ten tightest configurations.
sorted_dists = sorted(min_distances, key=lambda pair: pair[1])
print("\nN values with smallest distances:")
for n, d in sorted_dists[:10]:
    print(f"  N={n}: {d:.2e}")

Verifying submission_70.926137.csv...



Minimum distance across all N: 3.65e-17

N values with smallest distances:
  N=2: 3.65e-17
  N=3: 6.36e-16
  N=156: 2.51e-15
  N=185: 4.12e-15
  N=195: 4.55e-15
  N=28: 4.71e-15
  N=68: 4.74e-15
  N=82: 4.88e-15
  N=72: 5.94e-15
  N=176: 6.81e-15


In [12]:
# The distances are extremely small (1e-17) - essentially touching
# Kaggle might reject these. Let's search for better valid solutions.

# First, let's find all CSVs in snapshots and check their validity
import os
import glob

csv_files = glob.glob('/home/nonroot/snapshots/**/*.csv', recursive=True)
print(f"Found {len(csv_files)} CSV files")

# Each entry: (csv_path, partial_score, min_gap) for files that pass the
# quick check below.
valid_submissions = []
# Files that raised while being read or scored, kept so failures are
# visible instead of silently dropped: (csv_path, "ExcType: message").
scan_errors = []

# Sample some to check (only the first 100 paths returned by glob)
for csv_path in csv_files[:100]:
    try:
        df = pd.read_csv(csv_path)
        # Only files with exactly 20100 rows and an 'id' column are considered.
        if df.shape[0] != 20100 or 'id' not in df.columns:
            continue

        # Quick check - just check a few N values
        all_valid = True
        total_score = 0
        min_gap = float('inf')

        for n in [1, 10, 50, 100, 200]:
            trees = load_trees_for_n(df, n)
            if len(trees) != n:
                all_valid = False
                break
            side = get_bounding_box_side(trees)
            total_score += (side ** 2) / n
            min_dist = get_min_distance(trees)
            min_gap = min(min_gap, min_dist)
            if min_dist <= 0:
                all_valid = False
                break

        if all_valid and min_gap > 1e-10:  # Require meaningful gap
            valid_submissions.append((csv_path, total_score, min_gap))
            print(f"Valid: {os.path.basename(csv_path)}, partial_score={total_score:.4f}, min_gap={min_gap:.2e}")
    except Exception as exc:
        # Best-effort scan: an unreadable or malformed file must not stop
        # the search, but remember it so the skip is reported below.
        scan_errors.append((csv_path, f"{type(exc).__name__}: {exc}"))
        continue

print(f"\nFound {len(valid_submissions)} potentially valid submissions with meaningful gaps")
if scan_errors:
    print(f"Skipped {len(scan_errors)} files that could not be read or scored (see scan_errors)")

Found 1323 CSV files


Valid: candidate_000.csv, partial_score=3.6094, min_gap=4.09e-05


Valid: santa2025_ver2_v63.csv, partial_score=2.1008, min_gap=1.87e-10


Valid: submission (77).csv, partial_score=2.0962, min_gap=2.10e-10


Valid: sample_submission.csv, partial_score=4.7200, min_gap=4.09e-05


Valid: sample_submission.csv, partial_score=4.7200, min_gap=4.09e-05


Valid: sample_submission.csv, partial_score=4.7200, min_gap=4.09e-05


Valid: sample_submission.csv, partial_score=4.7200, min_gap=4.09e-05


Valid: submission.csv, partial_score=3.6326, min_gap=3.11e-05


Valid: submission.csv, partial_score=3.6215, min_gap=1.05e-06


Valid: submission.csv, partial_score=3.4540, min_gap=1.05e-06


Valid: submission.csv, partial_score=3.6317, min_gap=4.07e-05


Valid: submission.csv, partial_score=3.6215, min_gap=1.05e-06


Valid: submission.csv, partial_score=3.6078, min_gap=6.84e-06


Valid: candidate_000.csv, partial_score=3.6215, min_gap=1.05e-06


Valid: candidate_001.csv, partial_score=3.4540, min_gap=1.05e-06


Valid: sample_submission.csv, partial_score=4.7200, min_gap=4.09e-05


Valid: submission.csv, partial_score=3.0048, min_gap=7.86e-07


Valid: submission_v21.csv, partial_score=3.0048, min_gap=7.86e-07


Valid: submission_bp.csv, partial_score=3.0048, min_gap=7.86e-07


Valid: submission.csv, partial_score=3.9089, min_gap=8.94e-08


Valid: candidate_000.csv, partial_score=3.9553, min_gap=4.09e-05


Valid: candidate_001.csv, partial_score=3.0048, min_gap=7.86e-07


Valid: sample_submission.csv, partial_score=4.7200, min_gap=4.09e-05

Found 23 potentially valid submissions with meaningful gaps


In [13]:
# Take a closer look at submission_70.926137.csv.
# The gaps are tiny (1e-17), but let's see whether Kaggle accepts it.

# Copy it into the submission location, then re-read it to verify the format.
import shutil

src = '/home/nonroot/snapshots/santa-2025/21180221700/code/exploration/datasets/solutions/submission_70.926137.csv'
dst = '/home/submission/submission.csv'
shutil.copy(src, dst)

# Read back the copied file (not the source) so the check covers what
# will actually be submitted.
df_sub = pd.read_csv(dst)
print(f"Submission shape: {df_sub.shape}")
print(f"Columns: {df_sub.columns.tolist()}")
print(f"First few rows:\n{df_sub.head()}")

# Full score: sum of side**2 / n over N = 1..200.
total_score = 0
for n in range(1, 201):
    side = get_bounding_box_side(load_trees_for_n(df_sub, n))
    total_score += (side ** 2) / n

print(f"\nFull score: {total_score:.6f}")
print(f"Target: 68.919154")
print(f"Gap to target: {total_score - 68.919154:.6f}")

Submission shape: (20100, 4)
Columns: ['id', 'x', 'y', 'deg']
First few rows:
      id                     x                    y                   deg
0  001_0  s-48.462025735646897  s59.036924156646883   s45.000000000000000
1  002_0    s0.354635418797758   s0.795710882346675  s203.629377730552051
2  002_1    s0.046441279555199   s0.272792367735944   s23.629377730552051
3  003_0    s0.647087199308396   s1.202535626558404  s155.134051937100821
4  003_1    s1.239428254921482   s1.298076560608782   s66.370622269343002



Full score: 70.926137
Target: 68.919154
Gap to target: 2.006983


In [None]:
# Let's search more thoroughly for valid submissions
# Focus on files that might have better scores

# Check all CSVs with scores in their names
import re

# NOTE(review): this pattern is compiled but never applied in this cell;
# kept in case a later cell relies on it.
score_pattern = re.compile(r'(\d{2}\.\d+)')

best_valid = None
best_score = float('inf')
# Files that raised while being read or scored, kept so failures are
# visible instead of silently dropped: (csv_path, "ExcType: message").
scan_errors = []

for csv_path in csv_files:
    # Skip if not a submission file
    if 'sample' in csv_path.lower():
        continue

    try:
        df = pd.read_csv(csv_path)
        # Only files with exactly 20100 rows and an 'id' column are considered.
        if df.shape[0] != 20100 or 'id' not in df.columns:
            continue

        # Calculate full score and check validity
        total_score = 0
        all_valid = True
        can_win = True
        min_gap = float('inf')

        for n in range(1, 201):
            trees = load_trees_for_n(df, n)
            if len(trees) != n:
                all_valid = False
                break
            side = get_bounding_box_side(trees)
            total_score += (side ** 2) / n
            # Contributions are non-negative, so once the running total
            # reaches the best score this file can no longer win; stop
            # before paying for more pairwise distance checks.
            if total_score >= best_score:
                can_win = False
                break
            min_dist = get_min_distance(trees)
            min_gap = min(min_gap, min_dist)
            if min_dist <= 0:
                all_valid = False
                break

        if all_valid and can_win and total_score < best_score:
            best_score = total_score
            best_valid = csv_path
            print(f"New best: {csv_path.split('/')[-1]}, score={total_score:.6f}, min_gap={min_gap:.2e}")

    except Exception as exc:
        # Best-effort scan: an unreadable or malformed file must not stop
        # the search, but remember it so the skip is reported below.
        scan_errors.append((csv_path, f"{type(exc).__name__}: {exc}"))
        continue

print(f"\nBest valid submission: {best_valid}")
print(f"Best score: {best_score:.6f}")
if scan_errors:
    print(f"Skipped {len(scan_errors)} files that could not be read or scored (see scan_errors)")