# CHM-BASIERTE POSITIONS- UND HÖHENKORREKTUR
## Zweck: Korrektur von Baumpositionen via Snap-to-Peak und Neuzuweisung der Höhen aus CHM für Berlin, Hamburg und Rostock

## 1. SETUP & KONFIGURATION

In [1]:
# Install the third-party packages imported by the next cell.
# NOTE(review): a bare `pip ...` line is not Python; presumably this relies on
# the notebook's automagic — confirm, or consider the explicit `%pip` form.
pip install geopandas rasterio tqdm



In [2]:
# Imports
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from rasterio.windows import from_bounds, Window
from scipy.ndimage import maximum_filter
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm import tqdm
import json
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# not just a specific category.
warnings.filterwarnings('ignore')

# Plotting configuration
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

print("✓ Imports erfolgreich")

✓ Imports erfolgreich


## PFADE KONFIGURIEREN

In [3]:
from google.colab import drive
drive.mount('/content/drive')

# Project root on the mounted Drive
BASE_DIR = Path("/content/drive/MyDrive/Studium/Geoinformation/Module/Projektarbeit")

# Inputs: tree cadastre, city boundaries and one CHM raster per city
INPUT_PATHS = dict(
    cadastre=BASE_DIR / 'data/tree_cadastres/processed/trees_filtered_viable_edge_15m.gpkg',
    boundaries=BASE_DIR / 'data/boundaries/city_boundaries.gpkg',
    chm=dict(
        Berlin=BASE_DIR / 'data/CHM/processed/CHM_1m_Berlin.tif',
        Hamburg=BASE_DIR / 'data/CHM/processed/CHM_1m_Hamburg.tif',
        Rostock=BASE_DIR / 'data/CHM/processed/CHM_1m_Rostock.tif',
    ),
)

# Outputs: corrected data sets and validation artefacts
OUTPUT_PATHS = dict(
    processed_dir=BASE_DIR / 'data/training_data/tree_cadastres/processed',
    validation_dir=BASE_DIR / 'data/training_data/tree_cadastres/validation',
)

# Make sure both output directories exist before anything is written
OUTPUT_PATHS['processed_dir'].mkdir(exist_ok=True, parents=True)
OUTPUT_PATHS['validation_dir'].mkdir(exist_ok=True, parents=True)

print("✓ Pfade konfiguriert")
print("  Base: {}".format(BASE_DIR))
print("  Outputs: {}".format(OUTPUT_PATHS['processed_dir']))

Mounted at /content/drive
✓ Pfade konfiguriert
  Base: /content/drive/MyDrive/Studium/Geoinformation/Module/Projektarbeit
  Outputs: /content/drive/MyDrive/Studium/Geoinformation/Module/Projektarbeit/data/training_data/tree_cadastres/processed


## PROCESSING-PARAMETER

In [4]:
CITIES = ['Berlin', 'Hamburg', 'Rostock']

PROCESSING_PARAMS = dict(
    # Snap-to-peak
    search_radius_m=5,            # maximum distance cadastre position -> peak
    min_peak_height_m=3,          # minimum CHM height (DIN 18916)
    footprint_size=3,             # local-maximum neighbourhood (3x3 cells)

    # Height extraction
    height_buffer_m=1,            # buffer around the corrected point
    height_method='max',          # take the maximum inside the buffer

    # Plausibility filter
    min_tree_height_m=3,
    max_tree_height_m=45,

    # Validation
    validation_sample_size=5000,  # number of trees in the pre-validation sample
    random_seed=42,
)

# Genera that get their own stratum when sampling
STRATIFY_GENERA = [
    'TILIA', 'ACER', 'QUERCUS', 'PLATANUS',
    'ROBINIA', 'AESCULUS', 'BETULA', 'PRUNUS',
]

print("✓ Parameter geladen")
print("  Städte: " + ', '.join(CITIES))
print("  Snap-Radius: {}m".format(PROCESSING_PARAMS['search_radius_m']))
print("  Min. Höhe: {}m".format(PROCESSING_PARAMS['min_peak_height_m']))

✓ Parameter geladen
  Städte: Berlin, Hamburg, Rostock
  Snap-Radius: 5m
  Min. Höhe: 3m


## 2. HELPER FUNCTIONS

In [9]:
def stratified_sample(trees_gdf, n=5000, genera_list=None, seed=42):
    """
    Draw a sample stratified by genus and by a coarse spatial grid.

    Every tree is assigned to a stratum built from its genus (genera not in
    ``genera_list`` are pooled as ``'OTHER'``) and one cell of a 4x4 grid over
    the x/y range of the geometries. Each stratum contributes proportionally
    to its size, but at least one tree. The result is then topped up or
    trimmed at random so that it holds ``min(n, len(trees_gdf))`` rows.

    If the stratified draw fails, a simple random sample of the same size is
    returned instead and a warning is printed.

    Parameters
    ----------
    trees_gdf : GeoDataFrame
        Trees with a ``genus_latin`` column; ``geometry.x`` / ``geometry.y``
        are used for the spatial grid. The input is not modified.
    n : int
        Target sample size.
    genera_list : list of str, optional
        Genera that get their own stratum. Defaults to ``STRATIFY_GENERA``.
    seed : int
        Passed as ``random_state`` to every draw. The global NumPy random
        state is not touched.

    Returns
    -------
    GeoDataFrame
        The sampled rows. For non-empty input they carry the helper columns
        ``genus_stratify``, ``spatial_grid`` and ``stratum``.
    """
    if genera_list is None:
        genera_list = STRATIFY_GENERA

    trees_sample = trees_gdf.copy()
    if len(trees_sample) == 0:
        # Nothing to stratify or draw from.
        return trees_sample

    # Genus stratum: listed genera keep their name, everything else is pooled.
    genus = trees_sample['genus_latin']
    trees_sample['genus_stratify'] = np.where(genus.isin(genera_list), genus, 'OTHER')

    # Spatial stratum: 4x4 grid over the coordinate range.
    try:
        grid_x = pd.cut(trees_sample.geometry.x, bins=4, labels=['X1', 'X2', 'X3', 'X4'])
        grid_y = pd.cut(trees_sample.geometry.y, bins=4, labels=['Y1', 'Y2', 'Y3', 'Y4'])
        trees_sample['spatial_grid'] = grid_x.astype(str) + '_' + grid_y.astype(str)
    except Exception as e:
        # Best effort: fall back to genus-only strata, but say so.
        print(f"  ⚠️  Räumliche Stratifizierung nicht möglich, verwende nur Gattung: {e}")
        trees_sample['spatial_grid'] = 'GRID_1'

    # Combined strata (genus + space only)
    trees_sample['stratum'] = (
        trees_sample['genus_stratify'].astype(str) + '_' +
        trees_sample['spatial_grid'].astype(str)
    )

    total_size = len(trees_sample)

    def safe_sample(group):
        """Proportional draw from one stratum, never more than it contains."""
        group_target = max(1, int(n * len(group) / total_size))
        return group.sample(n=min(len(group), group_target), random_state=seed)

    try:
        # Iterate over the groups instead of using groupby.apply so the result
        # does not depend on how pandas treats the grouping column in apply.
        sampled = pd.concat(
            [safe_sample(group) for _, group in trees_sample.groupby('stratum')]
        )

        # Too few rows: top up with a random draw from the unused trees.
        if len(sampled) < n:
            remaining_trees = trees_sample[~trees_sample.index.isin(sampled.index)]
            additional_n = min(n - len(sampled), len(remaining_trees))
            if additional_n > 0:
                additional = remaining_trees.sample(n=additional_n, random_state=seed)
                sampled = pd.concat([sampled, additional])

        # Too many rows (the "at least one per stratum" rule): trim at random.
        if len(sampled) > n:
            sampled = sampled.sample(n=n, random_state=seed)

        return sampled

    except Exception as e:
        # Fallback: simple random sample
        print(f"  ⚠️  Stratifizierung fehlgeschlagen, verwende Random Sample: {str(e)}")
        actual_n = min(n, len(trees_sample))
        return trees_sample.sample(n=actual_n, random_state=seed)


def extract_chm_metrics_validation(point_geom, chm_src, buffer_m=5):
    """
    Extract CHM metrics around an (uncorrected) tree position for validation.

    The CHM is read in an integer, pixel-aligned window centred on the raster
    cell that contains the point. Aligning the window to whole cells keeps the
    centre index exact; a window with fractional offsets makes the position of
    the tree inside the returned array ambiguous by one cell.

    Parameters
    ----------
    point_geom : Point
        Tree position (``.x`` / ``.y``) in the CRS of ``chm_src``.
    chm_src : open rasterio dataset
        Canopy height model; band 1 is read.
    buffer_m : float
        Half-width of the window in map units, rounded up to whole cells.

    Returns
    -------
    dict
        ``chm_at_point``        CHM value of the cell containing the point
        ``chm_max_5m``          maximum CHM value in the window (the key name
                                is fixed, whatever ``buffer_m`` is)
        ``distance_to_chm_max`` distance from the point's cell centre to the
                                nearest local maximum's cell centre
        ``chm_at_nearest_peak`` CHM value at that local maximum
        ``local_peak_exists``   whether a local maximum > 0 was found
        Missing values are NaN. Any error while reading or processing yields
        an all-NaN result with ``local_peak_exists=False``.
    """
    x, y = point_geom.x, point_geom.y

    try:
        # Integer window centred on the cell that contains the point.
        row, col = (int(v) for v in chm_src.index(x, y))
        res_x, res_y = chm_src.res
        half_cols = int(np.ceil(buffer_m / res_x))
        half_rows = int(np.ceil(buffer_m / res_y))
        window = Window(
            col - half_cols, row - half_rows,
            2 * half_cols + 1, 2 * half_rows + 1
        )

        chm_data = chm_src.read(1, window=window, boundless=True, fill_value=chm_src.nodata)

        # Mask NoData and negative heights
        nodata_value = chm_src.nodata
        if nodata_value is not None:
            chm_data = np.where(chm_data == nodata_value, np.nan, chm_data)
        chm_data = np.where(chm_data < 0, np.nan, chm_data)

        # By construction the point's cell is the centre of the window.
        chm_at_point = chm_data[half_rows, half_cols]

        valid_mask = ~np.isnan(chm_data)
        distance_to_max_m = np.nan
        chm_max_nearby = np.nan
        chm_max_buffer = np.nan

        if np.any(valid_mask):
            chm_max_buffer = np.nanmax(chm_data)

            # Local maxima in a 3x3 neighbourhood; invalid cells never win.
            chm_temp = np.where(valid_mask, chm_data, -np.inf)
            local_max = maximum_filter(chm_temp, footprint=np.ones((3, 3)))
            is_peak = (chm_data == local_max) & valid_mask & (chm_data > 0)

            if np.any(is_peak):
                peak_rows, peak_cols = np.nonzero(is_peak)
                # Cell-centre distances, honouring the resolution of each axis
                distances_m = np.hypot(
                    (peak_rows - half_rows) * res_y,
                    (peak_cols - half_cols) * res_x
                )
                nearest_idx = int(np.argmin(distances_m))
                distance_to_max_m = distances_m[nearest_idx]
                chm_max_nearby = chm_data[peak_rows[nearest_idx], peak_cols[nearest_idx]]

        return {
            'chm_at_point': float(chm_at_point),
            'chm_max_5m': float(chm_max_buffer),
            'distance_to_chm_max': float(distance_to_max_m),
            'chm_at_nearest_peak': float(chm_max_nearby),
            'local_peak_exists': bool(not np.isnan(distance_to_max_m))
        }

    except Exception:
        # Deliberate best effort: one unreadable tree must not abort a batch.
        return {
            'chm_at_point': np.nan,
            'chm_max_5m': np.nan,
            'distance_to_chm_max': np.nan,
            'chm_at_nearest_peak': np.nan,
            'local_peak_exists': False
        }


def snap_to_peak(tree_point, chm_src, search_radius_m=5, min_height_m=3, footprint_size=3):
    """
    Snap a tree position to the nearest local CHM maximum.

    The CHM is read in an integer, pixel-aligned window centred on the raster
    cell that contains the tree. This keeps the tree's index inside the array
    and the back-projection of the peak exact; a window with fractional
    offsets makes both ambiguous by one cell.

    A cell counts as a peak when it equals the maximum of its
    ``footprint_size`` x ``footprint_size`` neighbourhood and is at least
    ``min_height_m`` high. Distances are measured between cell centres (the
    tree's cell to the peak's cell), so they are quantised to the raster grid.

    Parameters
    ----------
    tree_point : Point
        Tree position (``.x`` / ``.y``) in the CRS of ``chm_src``.
    chm_src : open rasterio dataset
        Canopy height model; band 1 is read.
    search_radius_m : float
        Maximum accepted distance to a peak, in map units.
    min_height_m : float
        Minimum CHM value for a cell to qualify as a peak.
    footprint_size : int
        Edge length (in cells) of the local-maximum neighbourhood.

    Returns
    -------
    dict
        ``snap_success``       bool
        ``corrected_geometry`` Point at the peak's cell centre, or None
        ``snap_distance_m``    float or NaN
        ``peak_height_m``      float or NaN
        ``chm_at_original``    CHM value of the tree's cell, float or NaN
        Any error while reading or processing yields an unsuccessful result
        with NaN values.
    """
    from shapely.geometry import Point

    def failed(chm_at_original=np.nan):
        """Result for every unsuccessful outcome."""
        return {
            'snap_success': False,
            'corrected_geometry': None,
            'snap_distance_m': np.nan,
            'peak_height_m': np.nan,
            'chm_at_original': chm_at_original
        }

    x, y = tree_point.x, tree_point.y

    try:
        # Integer window centred on the cell that contains the tree.
        row_orig, col_orig = (int(v) for v in chm_src.index(x, y))
        res_x, res_y = chm_src.res
        half_cols = int(np.ceil(search_radius_m / res_x))
        half_rows = int(np.ceil(search_radius_m / res_y))
        window = Window(
            col_orig - half_cols, row_orig - half_rows,
            2 * half_cols + 1, 2 * half_rows + 1
        )

        chm_data = chm_src.read(1, window=window, boundless=True, fill_value=chm_src.nodata)

        # Mask NoData and negative heights
        if chm_src.nodata is not None:
            chm_data = np.where(chm_data == chm_src.nodata, np.nan, chm_data)
        chm_data = np.where(chm_data < 0, np.nan, chm_data)

        # By construction the tree's cell is the centre of the window.
        chm_at_original = float(chm_data[half_rows, half_cols])

        valid_mask = ~np.isnan(chm_data)
        if not np.any(valid_mask):
            return failed(chm_at_original)

        # Local maxima; invalid cells are set to -inf so they never win.
        footprint = np.ones((footprint_size, footprint_size))
        chm_temp = np.where(valid_mask, chm_data, -np.inf)
        local_max = maximum_filter(chm_temp, footprint=footprint)
        is_peak = (chm_data == local_max) & valid_mask & (chm_data >= min_height_m)
        if not np.any(is_peak):
            return failed(chm_at_original)

        # Cell-centre distances, honouring the resolution of each axis
        peak_rows, peak_cols = np.nonzero(is_peak)
        distances_m = np.hypot(
            (peak_rows - half_rows) * res_y,
            (peak_cols - half_cols) * res_x
        )

        # The window is square, so its corners lie beyond the search radius.
        within_radius = distances_m <= search_radius_m
        if not np.any(within_radius):
            return failed(chm_at_original)

        candidates = np.flatnonzero(within_radius)
        nearest = candidates[np.argmin(distances_m[candidates])]
        peak_row = int(peak_rows[nearest])
        peak_col = int(peak_cols[nearest])

        # Window index -> dataset index -> map coordinates of the cell centre
        peak_x, peak_y = rasterio.transform.xy(
            chm_src.transform,
            row_orig - half_rows + peak_row,
            col_orig - half_cols + peak_col,
            offset='center'
        )

        return {
            'snap_success': True,
            'corrected_geometry': Point(peak_x, peak_y),
            'snap_distance_m': float(distances_m[nearest]),
            'peak_height_m': float(chm_data[peak_row, peak_col]),
            'chm_at_original': chm_at_original
        }

    except Exception:
        # Deliberate best effort: one unreadable tree must not abort a batch.
        return failed()


def extract_height_from_chm(corrected_point, chm_src, buffer_m=1, method='max'):
    """
    Read the final tree height from the CHM around a corrected position.

    The CHM is read in an integer, pixel-aligned window centred on the raster
    cell that contains the point, so the buffer is symmetric around that cell.
    A window with fractional offsets does not guarantee that.

    Parameters
    ----------
    corrected_point : Point
        Position (``.x`` / ``.y``) in the CRS of ``chm_src``.
    chm_src : open rasterio dataset
        Canopy height model; band 1 is read.
    buffer_m : float
        Half-width of the window in map units, rounded up to whole cells.
    method : {'max', 'mean', 'p90'}
        Statistic computed over the valid cells of the window.

    Returns
    -------
    float
        The requested statistic. NaN is returned for an unknown ``method``,
        when the window holds no valid cell, or when reading fails.
    """
    reducers = {
        'max': np.nanmax,
        'mean': np.nanmean,
        'p90': lambda values: np.nanpercentile(values, 90),
    }
    reducer = reducers.get(method)
    if reducer is None:
        # Unknown statistic: nothing to compute, so skip the raster read.
        return np.nan

    x, y = corrected_point.x, corrected_point.y

    try:
        # Integer window centred on the cell that contains the point.
        row, col = (int(v) for v in chm_src.index(x, y))
        res_x, res_y = chm_src.res
        half_cols = int(np.ceil(buffer_m / res_x))
        half_rows = int(np.ceil(buffer_m / res_y))
        window = Window(
            col - half_cols, row - half_rows,
            2 * half_cols + 1, 2 * half_rows + 1
        )

        chm_data = chm_src.read(1, window=window, boundless=True, fill_value=chm_src.nodata)

        # Mask NoData and negative heights
        if chm_src.nodata is not None:
            chm_data = np.where(chm_data == chm_src.nodata, np.nan, chm_data)
        chm_data = np.where(chm_data < 0, np.nan, chm_data)

        # Handle the all-NaN window explicitly instead of relying on the
        # nan-aware reducers emitting a warning and returning NaN.
        if np.all(np.isnan(chm_data)):
            return np.nan

        return float(reducer(chm_data))

    except Exception:
        # Deliberate best effort: one unreadable tree must not abort a batch.
        return np.nan


print("✓ Helper Functions definiert")

✓ Helper Functions definiert


## 3. MAIN PROCESSING FUNCTION

In [8]:
def process_city(city_name):
    """
    Run the complete position and height correction workflow for one city.

    Phases
    ------
    1. Load the tree cadastre, keep the rows of ``city_name`` and reproject
       them to the CRS of the city's CHM if the two differ.
    2. Pre-validation: draw a stratified sample, extract CHM metrics at the
       original positions and write ``stats_before_<city>.json``.
    3. Snap every tree to the nearest local CHM peak and read its height from
       the CHM. Trees whose snap fails are not carried forward.
    4. Keep the trees with a plausible height and write
       ``stats_after_<city>.json``.
    5. Save the before/after validation plots as PNG.
    6. Export the corrected trees and the excluded trees as GeoPackage.

    Paths and parameters come from the module-level ``INPUT_PATHS``,
    ``OUTPUT_PATHS`` and ``PROCESSING_PARAMS``.

    Parameters
    ----------
    city_name : str
        Value of the cadastre's ``city`` column and key of
        ``INPUT_PATHS['chm']``.

    Returns
    -------
    tuple of (dict, dict)
        ``stats_before`` (pre-validation) and ``stats_after`` (post-filter).

    Raises
    ------
    ValueError
        If the cadastre holds no tree for ``city_name`` or if no tree could
        be snapped to a peak.
    """
    print(f"\n{'='*80}")
    print(f"PROCESSING: {city_name}")
    print(f"{'='*80}")

    # ========================================================================
    # PHASE 1: LOAD DATA
    # ========================================================================

    print("\n--- PHASE 1: DATEN LADEN ---")

    # Load the tree cadastre and keep this city's rows
    trees_all = gpd.read_file(INPUT_PATHS['cadastre'])
    trees_city = trees_all[trees_all['city'] == city_name].copy()
    print(f"✓ Baumkataster: {len(trees_city):,} Bäume")

    # Every later phase divides by the tree count or merges on the sample.
    if len(trees_city) == 0:
        raise ValueError(f"Keine Bäume für '{city_name}' im Kataster gefunden")

    # Inspect the CHM (only metadata is needed here)
    chm_path = INPUT_PATHS['chm'][city_name]
    with rasterio.open(chm_path) as src:
        chm_crs = src.crs
        print(f"✓ CHM geladen: {chm_path.name}")
        print(f"  - CRS: {chm_crs}")
        print(f"  - Shape: {src.shape}")

    # CRS check: raster lookups require tree coordinates in the CHM's CRS
    if trees_city.crs != chm_crs:
        print(f"⚠️  Reprojiziere Bäume: {trees_city.crs} → {chm_crs}")
        trees_city = trees_city.to_crs(chm_crs)

    # ========================================================================
    # PHASE 2: PRE-VALIDATION
    # ========================================================================

    print("\n--- PHASE 2: PRE-VALIDIERUNG ---")

    # The sample is built exactly once. Building it a second time with the
    # default size would undo the small-city reduction below.
    sample_size = min(PROCESSING_PARAMS['validation_sample_size'], len(trees_city))

    # Small city: reduce the sample size
    if len(trees_city) < 1000:
        sample_size = min(500, len(trees_city))
        print(f"  ⚠️  Kleine Stadt, reduziere Sample auf {sample_size}")

    sample_trees = stratified_sample(
        trees_city,
        n=sample_size,
        seed=PROCESSING_PARAMS['random_seed']
    )
    print(f"✓ Sample erstellt: {len(sample_trees):,} Bäume")

    # Extract CHM metrics at the original positions
    print("Extrahiere CHM-Metriken (Original-Positionen)...")
    chm_metrics_list = []

    with rasterio.open(chm_path) as chm_src:
        sample_iter = zip(sample_trees['tree_id'], sample_trees.geometry)
        for tree_id, geom in tqdm(sample_iter, total=len(sample_trees), desc="CHM-Extraktion"):
            metrics = extract_chm_metrics_validation(
                geom,
                chm_src,
                buffer_m=PROCESSING_PARAMS['search_radius_m']
            )
            metrics['tree_id'] = tree_id
            chm_metrics_list.append(metrics)

    chm_metrics_df = pd.DataFrame(chm_metrics_list)
    sample_trees = sample_trees.merge(chm_metrics_df, on='tree_id', how='left')

    # Statistics are based on the trees that have a CHM value at their position
    valid_data = sample_trees[
        sample_trees['chm_at_point'].notna()
    ].copy()

    print(f"\n✓ Validierung abgeschlossen: {len(valid_data):,} / {len(sample_trees):,} mit CHM-Daten")

    stats_before = {
        'city': city_name,
        'n_trees_total': len(trees_city),
        'n_trees_sampled': len(sample_trees),
        'n_trees_valid_chm': len(valid_data),
        'pct_chm_valid': len(valid_data) / len(sample_trees) * 100,
        'pct_local_peak_exists': sample_trees['local_peak_exists'].mean() * 100,
        'median_distance_to_peak': valid_data['distance_to_chm_max'].median(),
        'mean_distance_to_peak': valid_data['distance_to_chm_max'].mean(),
        # NaN distances compare as False and therefore count as "not within"
        'pct_distance_lt_1m': (valid_data['distance_to_chm_max'] < 1).mean() * 100,
        'pct_distance_lt_2m': (valid_data['distance_to_chm_max'] < 2).mean() * 100,
        'pct_distance_lt_3m': (valid_data['distance_to_chm_max'] < 3).mean() * 100,
    }

    # Correlation cadastre height vs. CHM (only with more than 100 cadastre heights)
    has_cadastre_heights = (
        'height_m' in valid_data.columns and valid_data['height_m'].notna().sum() > 100
    )
    if has_cadastre_heights:
        height_valid = valid_data[valid_data['height_m'].notna()].copy()
        r, p_value = pearsonr(height_valid['chm_at_point'], height_valid['height_m'])
        stats_before['pearson_r'] = r
        stats_before['pearson_p'] = p_value
        stats_before['median_height_diff'] = np.abs(height_valid['chm_at_point'] - height_valid['height_m']).median()
        print(f"  - Korrelation (Kataster vs. CHM): r={r:.3f}")

    print(f"  - CHM verfügbar: {stats_before['pct_chm_valid']:.1f}%")
    print(f"  - Lokaler Peak vorhanden: {stats_before['pct_local_peak_exists']:.1f}%")
    print(f"  - Median Distanz zu Peak: {stats_before['median_distance_to_peak']:.2f}m")

    # Save
    stats_path = OUTPUT_PATHS['validation_dir'] / f'stats_before_{city_name}.json'
    with open(stats_path, 'w') as f:
        json.dump(stats_before, f, indent=2)
    print(f"✓ Gespeichert: {stats_path.name}")

    # ========================================================================
    # PHASE 3: SNAP-TO-PEAK & HEIGHT CORRECTION
    # ========================================================================

    print("\n--- PHASE 3: SNAP-TO-PEAK & HÖHEN-KORREKTUR ---")
    print(f"Verarbeite {len(trees_city):,} Bäume...")

    results = []

    with rasterio.open(chm_path) as chm_src:
        for idx, tree in tqdm(trees_city.iterrows(), total=len(trees_city), desc=f"Processing {city_name}"):
            # Snap-to-peak
            snap_result = snap_to_peak(
                tree.geometry,
                chm_src,
                search_radius_m=PROCESSING_PARAMS['search_radius_m'],
                min_height_m=PROCESSING_PARAMS['min_peak_height_m'],
                footprint_size=PROCESSING_PARAMS['footprint_size']
            )

            # Trees without a peak are dropped here; they end up in the
            # "excluded" export of phase 6.
            if not snap_result['snap_success']:
                continue

            # Read the height from the CHM at the corrected position
            height_chm = extract_height_from_chm(
                snap_result['corrected_geometry'],
                chm_src,
                buffer_m=PROCESSING_PARAMS['height_buffer_m'],
                method=PROCESSING_PARAMS['height_method']
            )

            # Plausibility check
            height_plausible = (
                not np.isnan(height_chm) and
                PROCESSING_PARAMS['min_tree_height_m'] <= height_chm <= PROCESSING_PARAMS['max_tree_height_m']
            )

            results.append({
                'tree_id': tree['tree_id'],
                'genus_latin': tree['genus_latin'],
                'species_latin': tree['species_latin'],
                'plant_year': tree['plant_year'],
                'city': tree['city'],
                'original_x': tree.geometry.x,
                'original_y': tree.geometry.y,
                'geometry': snap_result['corrected_geometry'],
                'height_m': height_chm,
                'snap_success': True,
                'snap_distance_m': snap_result['snap_distance_m'],
                'height_plausible': height_plausible,
                'chm_at_original': snap_result['chm_at_original'],
                'chm_at_corrected': height_chm,
            })

    # Without a single result the frame below would lack every column.
    if not results:
        raise ValueError(
            f"Snap-to-Peak für '{city_name}' ohne Treffer – CHM-Abdeckung und CRS prüfen"
        )

    # Convert to GeoDataFrame
    trees_corrected = gpd.GeoDataFrame(results, crs=trees_city.crs)

    print("\n✓ Snap-to-Peak abgeschlossen")
    print(f"  - Snap erfolgreich: {len(trees_corrected):,} / {len(trees_city):,} ({len(trees_corrected)/len(trees_city)*100:.1f}%)")

    # ========================================================================
    # PHASE 4: FILTER & POST-VALIDATION
    # ========================================================================

    print("\n--- PHASE 4: FILTER & POST-VALIDIERUNG ---")

    # Apply filter
    trees_final = trees_corrected[
        trees_corrected['snap_success'] &
        trees_corrected['height_plausible']
    ].copy()

    print("✓ Filter angewendet")
    print(f"  - Retained: {len(trees_final):,} / {len(trees_city):,} ({len(trees_final)/len(trees_city)*100:.1f}%)")

    # Post statistics
    stats_after = {
        'city': city_name,
        'n_trees_original': len(trees_city),
        'n_trees_after_snap': len(trees_corrected),
        'n_trees_final': len(trees_final),
        'pct_retained': len(trees_final) / len(trees_city) * 100,
        'median_snap_distance': trees_final['snap_distance_m'].median(),
        'mean_snap_distance': trees_final['snap_distance_m'].mean(),
        'pct_snap_lt_1m': (trees_final['snap_distance_m'] < 1).mean() * 100,
        'pct_snap_lt_2m': (trees_final['snap_distance_m'] < 2).mean() * 100,
        'height_min': trees_final['height_m'].min(),
        'height_max': trees_final['height_m'].max(),
        'height_mean': trees_final['height_m'].mean(),
        'height_median': trees_final['height_m'].median(),
        'height_std': trees_final['height_m'].std(),
        'n_genera': trees_final['genus_latin'].nunique(),
    }

    # Per-genus statistics
    genus_stats = trees_final.groupby('genus_latin').agg({
        'tree_id': 'count',
        'height_m': ['mean', 'std', 'min', 'max']
    }).round(2)
    genus_stats.columns = ['count', 'height_mean', 'height_std', 'height_min', 'height_max']
    stats_after['height_by_genus'] = genus_stats.to_dict('index')

    print(f"  - Median Snap-Distanz: {stats_after['median_snap_distance']:.2f}m")
    print(f"  - Höhen-Range: {stats_after['height_min']:.1f}-{stats_after['height_max']:.1f}m")
    print(f"  - Median Höhe: {stats_after['height_median']:.1f}m")
    print(f"  - Anzahl Gattungen: {stats_after['n_genera']}")

    # Save
    stats_path = OUTPUT_PATHS['processed_dir'] / f'stats_after_{city_name}.json'
    with open(stats_path, 'w') as f:
        json.dump(stats_after, f, indent=2, default=str)
    print(f"✓ Gespeichert: {stats_path.name}")

    # ========================================================================
    # PHASE 5: VISUALISATIONS
    # ========================================================================

    print("\n--- PHASE 5: VISUALISIERUNGEN ---")

    # Plot 1: pre-validation (only if the sample has CHM data)
    if len(valid_data) > 0:
        fig, axes = plt.subplots(1, 2, figsize=(16, 6))

        # Scatter: CHM vs. cadastre (only if cadastre heights are available)
        if has_cadastre_heights:
            ax = axes[0]
            height_valid = valid_data[valid_data['height_m'].notna()]
            ax.scatter(height_valid['height_m'], height_valid['chm_at_point'],
                      alpha=0.4, s=20, edgecolors='none')

            # Regression line
            z = np.polyfit(height_valid['height_m'], height_valid['chm_at_point'], 1)
            fitted_line = np.poly1d(z)
            x_line = np.linspace(height_valid['height_m'].min(), height_valid['height_m'].max(), 100)
            ax.plot(x_line, fitted_line(x_line), "r--", linewidth=2, label=f'y={z[0]:.2f}x+{z[1]:.2f}')
            ax.plot([0, height_valid['height_m'].max()], [0, height_valid['height_m'].max()],
                   'k-', linewidth=1, alpha=0.5, label='1:1')

            ax.set_xlabel('Kataster Höhe (m)')
            ax.set_ylabel('CHM Höhe (m)')
            ax.set_title(f'{city_name}: Höhenvergleich (Pre-Korrektur)\nr={stats_before.get("pearson_r", np.nan):.3f}')
            ax.legend()
            ax.grid(True, alpha=0.3)
        else:
            axes[0].text(0.5, 0.5, 'Keine Kataster-Höhen verfügbar',
                        ha='center', va='center', transform=axes[0].transAxes)
            axes[0].set_title(f'{city_name}: Höhenvergleich (Pre-Korrektur)')

        # Histogram: distance to peak
        ax = axes[1]
        ax.hist(valid_data['distance_to_chm_max'].dropna(), bins=30, edgecolor='black', alpha=0.7)
        ax.axvline(stats_before['median_distance_to_peak'], color='red', linestyle='--',
                  linewidth=2, label=f'Median: {stats_before["median_distance_to_peak"]:.2f}m')
        ax.set_xlabel('Distanz zu CHM-Maximum (m)')
        ax.set_ylabel('Anzahl Bäume')
        ax.set_title(f'{city_name}: Räumlicher Offset (Pre-Korrektur)')
        ax.legend()
        ax.grid(True, alpha=0.3, axis='y')

        plt.tight_layout()
        plot_path = OUTPUT_PATHS['validation_dir'] / f'validation_before_{city_name}.png'
        plt.savefig(plot_path, dpi=150, bbox_inches='tight')
        plt.close()
        print(f"✓ Plot gespeichert: {plot_path.name}")

    # Plot 2: post-correction
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Histogram: snap distances
    ax = axes[0]
    ax.hist(trees_final['snap_distance_m'], bins=30, edgecolor='black', alpha=0.7)
    ax.axvline(stats_after['median_snap_distance'], color='red', linestyle='--',
              linewidth=2, label=f'Median: {stats_after["median_snap_distance"]:.2f}m')
    ax.set_xlabel('Snap-Distanz (m)')
    ax.set_ylabel('Anzahl Bäume')
    ax.set_title(f'{city_name}: Positionskorrektur-Distanzen')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    # Histogram: CHM heights
    ax = axes[1]
    ax.hist(trees_final['height_m'], bins=40, edgecolor='black', alpha=0.7)
    ax.axvline(stats_after['height_median'], color='red', linestyle='--',
              linewidth=2, label=f'Median: {stats_after["height_median"]:.1f}m')
    ax.set_xlabel('CHM-Höhe (m)')
    ax.set_ylabel('Anzahl Bäume')
    ax.set_title(f'{city_name}: Finale Höhenverteilung (aus CHM)')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plot_path = OUTPUT_PATHS['processed_dir'] / f'validation_after_{city_name}.png'
    plt.savefig(plot_path, dpi=150, bbox_inches='tight')
    plt.close()
    print(f"✓ Plot gespeichert: {plot_path.name}")

    # ========================================================================
    # PHASE 6: EXPORT
    # ========================================================================

    print("\n--- PHASE 6: EXPORT ---")

    # Columns of the final data set
    FINAL_COLUMNS = [
        'tree_id',
        'city',
        'genus_latin',
        'species_latin',
        'plant_year',
        'height_m',
        'snap_distance_m',
        'geometry',
    ]

    trees_export = trees_final[FINAL_COLUMNS].copy()

    # Save
    output_path = OUTPUT_PATHS['processed_dir'] / f'trees_corrected_{city_name}.gpkg'
    trees_export.to_file(output_path, driver='GPKG')

    print(f"✓ Datensatz gespeichert: {output_path.name}")
    print(f"  - Bäume: {len(trees_export):,}")
    print(f"  - Gattungen: {trees_export['genus_latin'].nunique()}")
    print(f"  - Höhen-Range: {trees_export['height_m'].min():.1f}-{trees_export['height_m'].max():.1f}m")

    # Excluded trees: every cadastre tree that is not in the final data set
    trees_excluded = trees_city[~trees_city['tree_id'].isin(trees_final['tree_id'])].copy()
    excluded_path = OUTPUT_PATHS['processed_dir'] / f'trees_excluded_{city_name}.gpkg'
    trees_excluded.to_file(excluded_path, driver='GPKG')
    print(f"✓ Ausgeschlossene Bäume gespeichert: {excluded_path.name} ({len(trees_excluded):,})")

    print(f"\n{'='*80}")
    print(f"✓ {city_name} ABGESCHLOSSEN")
    print(f"{'='*80}\n")

    return stats_before, stats_after


print("✓ Main Function definiert")

✓ Main Function definiert


## 4. PROCESSING - ALLE STÄDTE

In [10]:
# Collected statistics per city; a city that fails is reported and skipped
all_stats = {}

for city in CITIES:
    try:
        stats_before, stats_after = process_city(city)
    except Exception as e:
        import traceback
        print(f"\n⚠️  FEHLER bei {city}:")
        print(f"  {e}")
        traceback.print_exc()
    else:
        all_stats[city] = dict(before=stats_before, after=stats_after)


PROCESSING: Berlin

--- PHASE 1: DATEN LADEN ---
✓ Baumkataster: 245,614 Bäume
✓ CHM geladen: CHM_1m_Berlin.tif
  - CRS: EPSG:25832
  - Shape: (37360, 46092)

--- PHASE 2: PRE-VALIDIERUNG ---
✓ Sample erstellt: 5,000 Bäume
✓ Sample erstellt: 5,000 Bäume
Extrahiere CHM-Metriken (Original-Positionen)...


CHM-Extraktion: 100%|██████████| 5000/5000 [02:31<00:00, 32.93it/s] 



✓ Validierung abgeschlossen: 4,994 / 5,000 mit CHM-Daten
  - Korrelation (Kataster vs. CHM): r=0.676
  - CHM verfügbar: 99.9%
  - Lokaler Peak vorhanden: 99.9%
  - Median Distanz zu Peak: 1.00m
✓ Gespeichert: stats_before_Berlin.json

--- PHASE 3: SNAP-TO-PEAK & HÖHEN-KORREKTUR ---
Verarbeite 245,614 Bäume...


Processing Berlin: 100%|██████████| 245614/245614 [1:22:53<00:00, 49.39it/s]



✓ Snap-to-Peak abgeschlossen
  - Snap erfolgreich: 240,654 / 245,614 (98.0%)

--- PHASE 4: FILTER & POST-VALIDIERUNG ---
✓ Filter angewendet
  - Retained: 219,900 / 245,614 (89.5%)
  - Median Snap-Distanz: 1.00m
  - Höhen-Range: 3.0-44.2m
  - Median Höhe: 11.4m
  - Anzahl Gattungen: 8
✓ Gespeichert: stats_after_Berlin.json

--- PHASE 5: VISUALISIERUNGEN ---
✓ Plot gespeichert: validation_before_Berlin.png
✓ Plot gespeichert: validation_after_Berlin.png

--- PHASE 6: EXPORT ---
✓ Datensatz gespeichert: trees_corrected_Berlin.gpkg
  - Bäume: 219,900
  - Gattungen: 8
  - Höhen-Range: 3.0-44.2m
✓ Ausgeschlossene Bäume gespeichert: trees_excluded_Berlin.gpkg (25,714)

✓ Berlin ABGESCHLOSSEN


PROCESSING: Hamburg

--- PHASE 1: DATEN LADEN ---
✓ Baumkataster: 97,275 Bäume
✓ CHM geladen: CHM_1m_Hamburg.tif
  - CRS: EPSG:25832
  - Shape: (39000, 40363)

--- PHASE 2: PRE-VALIDIERUNG ---
✓ Sample erstellt: 5,000 Bäume
✓ Sample erstellt: 5,000 Bäume
Extrahiere CHM-Metriken (Original-Positionen)..

CHM-Extraktion: 100%|██████████| 5000/5000 [03:10<00:00, 26.30it/s]



✓ Validierung abgeschlossen: 4,978 / 5,000 mit CHM-Daten
  - CHM verfügbar: 99.6%
  - Lokaler Peak vorhanden: 99.5%
  - Median Distanz zu Peak: 1.41m
✓ Gespeichert: stats_before_Hamburg.json

--- PHASE 3: SNAP-TO-PEAK & HÖHEN-KORREKTUR ---
Verarbeite 97,275 Bäume...


Processing Hamburg: 100%|██████████| 97275/97275 [31:06<00:00, 52.11it/s]



✓ Snap-to-Peak abgeschlossen
  - Snap erfolgreich: 78,579 / 97,275 (80.8%)

--- PHASE 4: FILTER & POST-VALIDIERUNG ---
✓ Filter angewendet
  - Retained: 78,577 / 97,275 (80.8%)
  - Median Snap-Distanz: 1.41m
  - Höhen-Range: 3.0-41.2m
  - Median Höhe: 13.3m
  - Anzahl Gattungen: 8
✓ Gespeichert: stats_after_Hamburg.json

--- PHASE 5: VISUALISIERUNGEN ---
✓ Plot gespeichert: validation_before_Hamburg.png
✓ Plot gespeichert: validation_after_Hamburg.png

--- PHASE 6: EXPORT ---
✓ Datensatz gespeichert: trees_corrected_Hamburg.gpkg
  - Bäume: 78,577
  - Gattungen: 8
  - Höhen-Range: 3.0-41.2m
✓ Ausgeschlossene Bäume gespeichert: trees_excluded_Hamburg.gpkg (18,698)

✓ Hamburg ABGESCHLOSSEN


PROCESSING: Rostock

--- PHASE 1: DATEN LADEN ---
✓ Baumkataster: 20,682 Bäume
✓ CHM geladen: CHM_1m_Rostock.tif
  - CRS: EPSG:25832
  - Shape: (22953, 19822)

--- PHASE 2: PRE-VALIDIERUNG ---
✓ Sample erstellt: 5,000 Bäume
✓ Sample erstellt: 5,000 Bäume
Extrahiere CHM-Metriken (Original-Positionen).

CHM-Extraktion: 100%|██████████| 5000/5000 [01:38<00:00, 50.56it/s]



✓ Validierung abgeschlossen: 5,000 / 5,000 mit CHM-Daten
  - Korrelation (Kataster vs. CHM): r=0.657
  - CHM verfügbar: 100.0%
  - Lokaler Peak vorhanden: 99.5%
  - Median Distanz zu Peak: 2.00m
✓ Gespeichert: stats_before_Rostock.json

--- PHASE 3: SNAP-TO-PEAK & HÖHEN-KORREKTUR ---
Verarbeite 20,682 Bäume...


Processing Rostock: 100%|██████████| 20682/20682 [08:01<00:00, 42.93it/s]



✓ Snap-to-Peak abgeschlossen
  - Snap erfolgreich: 17,500 / 20,682 (84.6%)

--- PHASE 4: FILTER & POST-VALIDIERUNG ---
✓ Filter angewendet
  - Retained: 17,500 / 20,682 (84.6%)
  - Median Snap-Distanz: 2.00m
  - Höhen-Range: 3.0-35.6m
  - Median Höhe: 7.9m
  - Anzahl Gattungen: 8
✓ Gespeichert: stats_after_Rostock.json

--- PHASE 5: VISUALISIERUNGEN ---
✓ Plot gespeichert: validation_before_Rostock.png
✓ Plot gespeichert: validation_after_Rostock.png

--- PHASE 6: EXPORT ---
✓ Datensatz gespeichert: trees_corrected_Rostock.gpkg
  - Bäume: 17,500
  - Gattungen: 8
  - Höhen-Range: 3.0-35.6m
✓ Ausgeschlossene Bäume gespeichert: trees_excluded_Rostock.gpkg (3,182)

✓ Rostock ABGESCHLOSSEN



## 5. FINALE ZUSAMMENFASSUNG

In [1]:
print(f"\n{'=' * 80}")
print("FINALE ZUSAMMENFASSUNG")
print("=" * 80)

# One formatted row per city that was processed successfully
summary_data = []

for city in CITIES:
    if city not in all_stats:
        continue
    stats = all_stats[city]['after']
    summary_data.append({
        'Stadt': city,
        'Original': format(stats['n_trees_original'], ','),
        'Final': format(stats['n_trees_final'], ','),
        'Retained %': format(stats['pct_retained'], '.1f') + '%',
        'Median Snap (m)': format(stats['median_snap_distance'], '.2f'),
        'Median Höhe (m)': format(stats['height_median'], '.1f'),
        'Gattungen': stats['n_genera'],
    })

summary_df = pd.DataFrame(summary_data)
print("\n")
print(summary_df.to_string(index=False))

# Save the table next to the processed data sets
summary_path = OUTPUT_PATHS['processed_dir'] / 'summary_all_cities.csv'
summary_df.to_csv(summary_path, index=False)
print(f"\n✓ Zusammenfassung gespeichert: {summary_path}")

print(f"\n{'=' * 80}")
print("ALLE STÄDTE ABGESCHLOSSEN ✓")
print("=" * 80)


FINALE ZUSAMMENFASSUNG


NameError: name 'CITIES' is not defined