# Experiment 013: Ensemble with nctuan_challenge Dataset

Ensemble the current best solution with the 158 new CSV files from `nctuan_challenge/bbox_sub/`.

In [1]:
import functools
import glob
import json
import os
import time
import warnings
from decimal import Decimal, getcontext

import numpy as np
import pandas as pd
from shapely import affinity
from shapely.geometry import Polygon
from shapely.ops import unary_union
# Install a blanket "ignore" filter so library warnings do not clutter cell output.
warnings.filterwarnings('ignore')

# Decimal arithmetic in this notebook runs with 30 significant digits.
getcontext().prec = 30
# Multiplier that create_high_precision_tree applies to template vertices and
# tree offsets before building the polygons used for overlap checks.
SCALE_FACTOR = Decimal('1e18')

print('Setup complete')

Setup complete


In [2]:
# Tree shape vertices
# TX holds the x coordinates and TY the y coordinates of the 15 outline
# vertices; the two lists are paired by index (create_tree_polygon zips them).
TX = [0, 0.125, 0.0625, 0.2, 0.1, 0.35, 0.075, 0.075, -0.075, -0.075, -0.35, -0.1, -0.2, -0.0625, -0.125]
TY = [0.8, 0.5, 0.5, 0.25, 0.25, 0, 0, -0.2, -0.2, 0, 0, 0.25, 0.25, 0.5, 0.5]

def create_tree_polygon(x, y, angle):
    """Build one tree outline as a shapely polygon at plain float precision.

    Args:
        x: Horizontal offset of the tree; anything ``float()`` accepts.
        y: Vertical offset of the tree; anything ``float()`` accepts.
        angle: Rotation passed to ``affinity.rotate`` (shapely interprets it
            as degrees by default); anything ``float()`` accepts.

    Returns:
        The TX/TY template rotated about the origin and then shifted by (x, y).
    """
    dx, dy, theta = (float(value) for value in (x, y, angle))
    outline = Polygon([(px, py) for px, py in zip(TX, TY)])
    # Rotate first (about the template origin), then move into place.
    rotated = affinity.rotate(outline, theta, origin=(0, 0))
    return affinity.translate(rotated, dx, dy)

def create_high_precision_tree(x, y, angle):
    """Build one tree outline with every coordinate multiplied by SCALE_FACTOR.

    The inputs are parsed as ``Decimal`` (via ``str``), the template vertices
    and the (x, y) offset are scaled by ``SCALE_FACTOR`` in Decimal arithmetic,
    and the results are converted to ``float`` before being handed to shapely.

    Args:
        x: Horizontal offset of the tree (number or numeric string).
        y: Vertical offset of the tree (number or numeric string).
        angle: Rotation passed to ``affinity.rotate`` (degrees by shapely's
            default), applied about the origin before the translation.

    Returns:
        The scaled, rotated and translated shapely polygon.
    """
    sx, sy, rotation = (Decimal(str(value)) for value in (x, y, angle))
    scale = SCALE_FACTOR

    # Template outline as decimal literals, in drawing order.
    template = (
        ('0.0', '0.8'),
        ('0.125', '0.5'),
        ('0.0625', '0.5'),
        ('0.2', '0.25'),
        ('0.1', '0.25'),
        ('0.35', '0.0'),
        ('0.075', '0.0'),
        ('0.075', '-0.2'),
        ('-0.075', '-0.2'),
        ('-0.075', '0.0'),
        ('-0.35', '0.0'),
        ('-0.1', '0.25'),
        ('-0.2', '0.25'),
        ('-0.0625', '0.5'),
        ('-0.125', '0.5'),
    )
    scaled_outline = [
        (float(Decimal(vx) * scale), float(Decimal(vy) * scale))
        for vx, vy in template
    ]

    tree = Polygon(scaled_outline)
    tree = affinity.rotate(tree, float(rotation), origin=(0, 0))
    return affinity.translate(tree, xoff=float(sx * scale), yoff=float(sy * scale))

def validate_no_overlap_strict(trees_data):
    """Check that no two trees overlap, using the SCALE_FACTOR-scaled polygons.

    Args:
        trees_data: Sequence of dicts with ``'x'``, ``'y'`` and ``'deg'`` keys.

    Returns:
        True when there are fewer than two trees, or when every pair either
        does not intersect or merely touches; False as soon as one pair
        intersects without just touching.
    """
    if len(trees_data) <= 1:
        return True

    shapes = [
        create_high_precision_tree(tree['x'], tree['y'], tree['deg'])
        for tree in trees_data
    ]
    # Pairwise scan over each unordered pair; any() stops at the first overlap.
    has_overlap = any(
        first.intersects(second) and not first.touches(second)
        for index, first in enumerate(shapes)
        for second in shapes[index + 1:]
    )
    return not has_overlap

def get_bbox_side(trees):
    """Return the side of the smallest axis-aligned square enclosing all trees.

    The enclosing box of a set of polygons is the min/max of the individual
    polygon bounds, so the bounds are combined directly instead of building a
    geometric union first (the union was only ever used for its envelope).

    Args:
        trees: Sequence of dicts with ``'x'``, ``'y'`` and ``'deg'`` keys.

    Returns:
        The larger of the bounding box's width and height, or 0 when ``trees``
        is empty.
    """
    if len(trees) == 0:
        return 0

    # Each entry is shapely's (minx, miny, maxx, maxy) tuple for one tree.
    per_tree_bounds = [
        create_tree_polygon(t['x'], t['y'], t['deg']).bounds for t in trees
    ]
    min_x = min(b[0] for b in per_tree_bounds)
    min_y = min(b[1] for b in per_tree_bounds)
    max_x = max(b[2] for b in per_tree_bounds)
    max_y = max(b[3] for b in per_tree_bounds)
    return max(max_x - min_x, max_y - min_y)

def get_score(trees, n):
    """Return the per-N score: squared bounding-square side divided by ``n``.

    Args:
        trees: Sequence of tree dicts, forwarded to ``get_bbox_side``.
        n: Divisor for the squared side (the group's N).
    """
    side_length = get_bbox_side(trees)
    square_area = side_length ** 2
    return square_area / n

print('Core functions defined')

Core functions defined


In [3]:
# Load baseline (exp_011 mega ensemble)
# Every N group of the baseline submission becomes the initial "best" entry.
print('Loading baseline...')
baseline_df = pd.read_csv('/home/code/experiments/011_mega_ensemble_all/submission.csv')
# The N of a row is the part of its id before the first underscore.
baseline_df['N'] = baseline_df['id'].astype(str).str.split('_').str[0].astype(int)

best_trees, best_scores, best_sources = {}, {}, {}

for n, group in baseline_df.groupby('N'):
    # Drop the 's' marker from each field and keep the values as strings.
    layout = [
        {field: str(row[field]).replace('s', '') for field in ('x', 'y', 'deg')}
        for _, row in group.iterrows()
    ]
    best_trees[n] = layout
    best_scores[n] = get_score(layout, n)
    best_sources[n] = 'baseline'

baseline_total = sum(best_scores.values())
print(f'Baseline score: {baseline_total:.6f}')

Loading baseline...


Baseline score: 70.340960


In [4]:
# Collect all new CSV files from nctuan_challenge (plus chistyakov_packed)
print('\nCollecting new CSV files...')
candidate_patterns = [
    '/home/code/kaggle_datasets/nctuan_challenge/bbox_sub/*.csv',
    '/home/code/kaggle_datasets/nctuan_challenge/*.csv',
    '/home/code/kaggle_datasets/chistyakov_packed/*.csv',
]
# glob returns paths in arbitrary order, and the processing order decides which
# candidate wins near-ties, so sort (and de-duplicate) for reproducible runs.
new_files = sorted(
    {path for pattern in candidate_patterns for path in glob.glob(pattern)}
)
print(f'Total new files: {len(new_files)}')


Collecting new CSV files...
Total new files: 81


In [5]:
@functools.lru_cache(maxsize=4)
def _read_candidate_frame(csv_path, mtime_ns):
    """Parse one candidate CSV once and attach an integer-like ``N`` column.

    ``mtime_ns`` is only part of the cache key, so a file that changes on disk
    is re-read instead of served stale from the cache.

    Returns:
        The parsed DataFrame, or None when the file cannot be parsed or has
        neither an ``id`` nor an ``n`` column. Failures are cached as None so
        a broken file is reported once, not once per N.
    """
    try:
        # Keep coordinates as text: letting pandas parse them as floats would
        # drop digits for files that do not use the 's' prefix.
        df = pd.read_csv(csv_path, dtype={'x': str, 'y': str, 'deg': str})
        if 'id' in df.columns:
            df['N'] = df['id'].astype(str).str.split('_').str[0].astype(int)
        elif 'n' in df.columns:
            df['N'] = df['n']
        else:
            return None
        return df
    except Exception as exc:
        print(f'  Skipping unreadable file {csv_path}: {exc!r}')
        return None


def load_trees_from_csv(csv_path, n):
    """Load the N=``n`` configuration from a candidate CSV.

    Args:
        csv_path: Path to a CSV with ``x``, ``y``, ``deg`` columns and either
            an ``id`` column (``<N>_<index>``) or an ``n`` column.
        n: Group to extract; the group must contain exactly ``n`` rows.

    Returns:
        A list of ``{'x', 'y', 'deg'}`` dicts of strings with any 's' marker
        removed, or None when the file is unusable or the group is missing or
        incomplete. This function never raises.
    """
    try:
        frame = _read_candidate_frame(csv_path, os.stat(csv_path).st_mtime_ns)
        if frame is None:
            return None

        group = frame[frame['N'] == n]
        if len(group) != n:
            return None

        return [
            {
                'x': str(x).replace('s', ''),
                'y': str(y).replace('s', ''),
                'deg': str(deg).replace('s', ''),
            }
            for x, y, deg in zip(group['x'], group['y'], group['deg'])
        ]
    except Exception:
        # Best effort by design: a missing file or missing column means
        # "no candidate here", not a reason to stop the sweep.
        return None

print('Load function defined')

Load function defined


In [6]:
# Process all new files
# For every candidate file and every N in 1..200, adopt the candidate layout
# when it scores better than the current best by more than 1e-9 and passes the
# strict overlap check.
print('\n' + '='*60)
print('PROCESSING NEW FILES')
print('='*60)

improvements = []        # (n, score gain, source file) per accepted candidate
evaluation_errors = []   # (source file, n, error repr) per candidate that raised
files_processed = 0
start_time = time.time()

for files_processed, csv_file in enumerate(new_files, start=1):
    if files_processed % 20 == 0:
        elapsed = time.time() - start_time
        print(f'  Processed {files_processed}/{len(new_files)} files ({elapsed:.1f}s), improvements: {len(improvements)}')

    for n in range(1, 201):
        try:
            trees = load_trees_from_csv(csv_file, n)
            if trees is None:
                continue

            # Calculate score first (fast)
            score = get_score(trees, n)

            # Only validate if potentially better (validation is the slow part)
            if score < best_scores[n] - 1e-9 and validate_no_overlap_strict(trees):
                improvement = best_scores[n] - score
                improvements.append((n, improvement, csv_file))
                best_trees[n] = trees
                best_scores[n] = score
                best_sources[n] = csv_file.split('/')[-1]
        except Exception as exc:
            # Keep the sweep going, but remember the failure so that a run with
            # zero improvements cannot silently be a run where everything broke.
            evaluation_errors.append((csv_file, n, repr(exc)))

print(f'\nTotal time: {time.time() - start_time:.1f}s')
print(f'Files processed: {files_processed}')
# One N can be improved by several files in turn; report distinct N values.
print(f'N values improved: {len({n_value for n_value, _, _ in improvements})}')
if evaluation_errors:
    first_file, first_n, first_error = evaluation_errors[0]
    print(f'Candidates skipped after errors: {len(evaluation_errors)} '
          f'(first: N={first_n} in {first_file.split("/")[-1]}: {first_error})')


PROCESSING NEW FILES


  Processed 20/81 files (190.3s), improvements: 0


  Processed 40/81 files (391.2s), improvements: 0


  Processed 60/81 files (591.7s), improvements: 0


  Processed 80/81 files (793.0s), improvements: 0



Total time: 812.7s
Files processed: 81
N values improved: 0


In [7]:
# Show improvements
# Lists the 20 largest accepted gains and the summed gain (top 20 and overall).
if not improvements:
    print('\nNo improvements found!')
else:
    print('\nTop 20 improvements found:')
    ranked = sorted(improvements, key=lambda entry: entry[1], reverse=True)
    total_improvement = 0
    for n, imp, source in ranked[:20]:
        file_name = source.split("/")[-1]
        print(f'  N={n:3d}: +{imp:.6f} from {file_name}')
        total_improvement += imp
    print(f'\nTotal improvement from top 20: {total_improvement:.6f}')
    print(f'Total improvement from all: {sum(x[1] for x in improvements):.6f}')


No improvements found!


In [8]:
# Calculate final score
# The total is the sum of the per-N best scores after the ensemble pass.
final_total = sum(best_scores.values())

rule = '=' * 60
print('\n' + rule)
print('RESULTS')
print(rule)
for label, value in (
    ('Baseline score', baseline_total),
    ('Final score', final_total),
    ('Improvement', baseline_total - final_total),
):
    print(f'{label}: {value:.6f}')
print('Target: 68.879235')
print(f'Gap to target: {final_total - 68.879235:.6f}')


RESULTS
Baseline score: 70.340960
Final score: 70.340960
Improvement: 0.000000
Target: 68.879235
Gap to target: 1.461725


In [9]:
# Final validation
# Re-run the strict overlap check on the selected layout of every N in 1..200.
print('\n' + '='*60)
print('FINAL VALIDATION')
print('='*60)

final_overlaps = [
    n for n in range(1, 201)
    if not validate_no_overlap_strict(best_trees[n])
]

if not final_overlaps:
    print('All N values pass strict validation!')
else:
    print(f'WARNING: {len(final_overlaps)} N values have overlaps: {final_overlaps[:10]}...')


FINAL VALIDATION


All N values pass strict validation!


In [10]:
# Create submission
# One row per tree: id is '<N zero-padded to 3>_<index>', and each coordinate
# is written back with a single leading 's'.
print('\n' + '='*60)
print('CREATE SUBMISSION')
print('='*60)

rows = []
for n in range(1, 201):
    for i, t in enumerate(best_trees[n]):
        # Strip any existing marker first so the prefix is never doubled.
        cleaned = {field: str(t[field]).replace('s', '') for field in ('x', 'y', 'deg')}
        record = {'id': f'{n:03d}_{i}'}
        record.update({field: f's{value}' for field, value in cleaned.items()})
        rows.append(record)

submission_df = pd.DataFrame(rows)
print(f'Submission shape: {submission_df.shape}')

for destination in (
    '/home/code/experiments/013_nctuan_ensemble/submission.csv',
    '/home/submission/submission.csv',
):
    submission_df.to_csv(destination, index=False)
print('Submission saved!')


CREATE SUBMISSION
Submission shape: (20100, 4)
Submission saved!


In [11]:
# Save metrics
# Summary of this experiment, written next to the submission as JSON.
target_score = 68.879235

metrics = {
    'cv_score': final_total,
    'baseline_score': baseline_total,
    'improvement': baseline_total - final_total,
    # improvements holds one entry per accepted candidate, and the same N can
    # be improved more than once, so count distinct N values.
    'n_improved': len({n_value for n_value, _, _ in improvements}),
    'total_new_files': len(new_files),
    'final_overlaps': len(final_overlaps),
    'target': target_score,
    'gap': final_total - target_score
}

with open('/home/code/experiments/013_nctuan_ensemble/metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)

print('\nMetrics saved!')
print(json.dumps(metrics, indent=2))


Metrics saved!
{
  "cv_score": 70.34095966509364,
  "baseline_score": 70.34095966509364,
  "improvement": 0.0,
  "n_improved": 0,
  "total_new_files": 81,
  "final_overlaps": 0,
  "target": 68.879235,
  "gap": 1.4617246650936409
}
