# PopSim Data Preparation (Census Mode)

This notebook prepares all input files for PopulationSim using German Census grid data.

## Requirements

1. **GeoPackage** (`.gpkg`) - polygon defining your study area boundary
2. **Census data** (parquet or CSV) - 100m and 1km grid cells with population attributes
3. **MiD seed data** (CSV) - household and person survey data (`MiD2023_Haushalte.csv`, `MiD2023_Personen.csv`)

## Census Data Format

Census files must have cell IDs in the **first column** matching this format:
- **100m**: `CRS3035RES100mN{northing}E{easting}` (e.g., `CRS3035RES100mN2689100E4337000`)
- **1km**: `CRS3035RES1000mN{northing}E{easting}` (e.g., `CRS3035RES1000mN2689000E4337000`)

Coordinates are **EPSG:3035** (ETRS89-extended / LAEA Europe). All other columns become available as control totals.

## What Gets Generated

### Single Mode (`regiostar_split=False`)
- `popsim/data/geo_cross_walk.csv` - geographic hierarchy
- `popsim/data/control_totals_*.csv` - census data formatted as control totals
- `popsim/data/seed_persons.csv`, `seed_households.csv` - filtered MiD data
- `popsim/configs/controls.csv` - control definitions for PopSim

### RegioStar Split Mode (`regiostar_split=True`)
Creates separate folders for each RegioStaR17 value found in the study area:
- `popsim_regiostar_121/` - census AND MiD filtered to RegioStaR17=121
- `popsim_regiostar_125/` - census AND MiD filtered to RegioStaR17=125
- etc.

Each folder contains complete PopSim inputs filtered to that regional type. The controls file is edited once in `popsim/configs/_prep3_controls.csv` and copied to all folders.

## Configuration

**Edit all paths and settings below before running.**

In [7]:
# =============================================================================
# USER CONFIGURATION
# =============================================================================

# --- Paths (relative to this notebook) ---
inputs_dir = "inputs"      # Shared input files (MiD, census, geopackages)
popsim_dir = "popsim"      # Base PopSim folder (for controls template, settings)

# --- Study Area ---
geopackage_path = f"{inputs_dir}/outlineBS.gpkg"
geopackage_crs = None  # Set CRS if not embedded (None = auto-detect)

# --- Census Data (parquet or CSV) ---
census_100m_path = f"{inputs_dir}/cells_100m_with_gender_backf_binneds_happyorphans_with_aggs_regiostar.parquet"
census_1km_path = f"{inputs_dir}/cells_1km_with_binneds.parquet"

# Column containing number of households (run Step 1 with None to see options)
household_column = "Insgesamt_Haushalte_Groesse_des_privaten_Haushalts_100m-Gitter_adj"  # e.g., "Insgesamt_Haushalte_100m-Gitter"

# --- MiD Seed Data (semicolon-separated CSVs) ---
mid_households_path = f"{inputs_dir}/MiD2023_Haushalte.csv"
mid_persons_path = f"{inputs_dir}/MiD2023_Personen.csv"

# --- MiD Filtering (set to None to skip that filter) ---
kernwo = [1,2,3]        # Day of week: [2,3] or None to skip. 1=Mon, 2=Tue-Thu, 3=Fri, 4=Sat-Sun
regiostar17 = None   # Regional types: [121,123,124] or None to skip (used when regiostar_split=False)

# --- RegioStar Split Mode ---
# When True: Creates separate popsim folders for each RegioStaR17 value in the study area
# Each folder gets census AND MiD data filtered to that RegioStaR17 value
regiostar_split = True

# --- CSV Separators ---
census_csv_sep = ";"   # For input CSVs (ignored for parquet)
intermediate_sep = ";"  # For intermediate files (use ";" for German Excel)
# Note: Final PopSim files are always comma-separated

# --- Advanced ---
output_everything = False  # True = output all PopSim intermediates
seed_geography = "STAAT"   # Geography level for seed data (usually unchanged)

# =============================================================================
# END CONFIGURATION
# =============================================================================

## Step 1: Load Study Area and Filter Census

Loads your GeoPackage, filters census cells to the study area, and shows available columns.

In [4]:
import os
import re
import pandas as pd
import geopandas as gpd
from shapely.geometry import box

print("[Step 1/4] Loading study area and filtering census...")
print("=" * 60)

# Ensure output directories exist
os.makedirs(f"{popsim_dir}/data", exist_ok=True)
os.makedirs(f"{popsim_dir}/configs", exist_ok=True)

# Load GeoPackage
print(f"Loading GeoPackage: {geopackage_path}")
study_area = gpd.read_file(geopackage_path)

# Handle CRS
if study_area.crs is None and geopackage_crs:
    study_area = study_area.set_crs(geopackage_crs)
    print(f"  Set CRS to: {geopackage_crs}")
elif study_area.crs is None:
    raise ValueError("GeoPackage has no CRS. Please set geopackage_crs in configuration.")

# Transform to EPSG:3035 (Census CRS)
study_area_3035 = study_area.to_crs("EPSG:3035")
bounds = study_area_3035.total_bounds  # minx, miny, maxx, maxy
print(f"  Study area bounds (EPSG:3035): {bounds}")

# Parse cell ID to extract coordinates
def parse_cell_id_100m(cell_id):
    """Extract N,E coordinates from 100m cell ID like CRS3035RES100mN2689100E4337000"""
    match = re.match(r'CRS3035RES100mN(\d+)E(\d+)', str(cell_id))
    if match:
        return int(match.group(1)), int(match.group(2))
    return None, None

def parse_cell_id_1km(cell_id):
    """Extract N,E coordinates from 1km cell ID like CRS3035RES1000mN2689000E4337000"""
    match = re.match(r'CRS3035RES1000mN(\d+)E(\d+)', str(cell_id))
    if match:
        return int(match.group(1)), int(match.group(2))
    return None, None

def get_1km_id_from_100m(cell_id_100m):
    """Convert 100m cell ID to corresponding 1km cell ID."""
    n, e = parse_cell_id_100m(cell_id_100m)
    if n is None:
        return None
    n_1km = (n // 1000) * 1000
    e_1km = (e // 1000) * 1000
    return f"CRS3035RES1000mN{n_1km}E{e_1km}"

# Load 100m census
print(f"\nLoading 100m census: {census_100m_path}")

if census_100m_path.endswith('.parquet'):
    import pyarrow.parquet as pq
    pf_100m = pq.ParquetFile(census_100m_path)
    print(f"  Total rows: {pf_100m.metadata.num_rows:,}")
    print(f"  Total columns: {pf_100m.metadata.num_columns}")
    
    print("  Filtering to study area (this may take a moment)...")
    filtered_chunks = []
    total_read = 0
    
    for batch in pf_100m.iter_batches(batch_size=100000):
        df_batch = batch.to_pandas()
        total_read += len(df_batch)
        
        coords = df_batch.iloc[:, 0].apply(parse_cell_id_100m)
        df_batch['_N'] = coords.apply(lambda x: x[0])
        df_batch['_E'] = coords.apply(lambda x: x[1])
        
        mask = (
            (df_batch['_N'] >= bounds[1]) & (df_batch['_N'] <= bounds[3]) &
            (df_batch['_E'] >= bounds[0]) & (df_batch['_E'] <= bounds[2])
        )
        df_filtered = df_batch[mask].drop(columns=['_N', '_E'])
        
        if len(df_filtered) > 0:
            filtered_chunks.append(df_filtered)
        
        if total_read % 500000 == 0:
            print(f"    Processed {total_read:,} rows...")
    
    census_100m = pd.concat(filtered_chunks, ignore_index=True)
else:
    print(f"  Loading CSV with separator: '{census_csv_sep}'")
    census_100m_full = pd.read_csv(census_100m_path, sep=census_csv_sep)
    print(f"  Total rows: {len(census_100m_full):,}")
    
    coords = census_100m_full.iloc[:, 0].apply(parse_cell_id_100m)
    census_100m_full['_N'] = coords.apply(lambda x: x[0])
    census_100m_full['_E'] = coords.apply(lambda x: x[1])
    
    mask = (
        (census_100m_full['_N'] >= bounds[1]) & (census_100m_full['_N'] <= bounds[3]) &
        (census_100m_full['_E'] >= bounds[0]) & (census_100m_full['_E'] <= bounds[2])
    )
    census_100m = census_100m_full[mask].drop(columns=['_N', '_E']).copy()

print(f"  Filtered to {len(census_100m):,} cells in bounding box")

# Fine filter: check actual intersection with study area polygon
print("  Performing precise polygon intersection...")
id_col_100m = census_100m.columns[0]

def cell_intersects_study_area(cell_id):
    n, e = parse_cell_id_100m(cell_id)
    if n is None:
        return False
    cell_geom = box(e, n, e + 100, n + 100)
    return study_area_3035.geometry.intersects(cell_geom).any()

sample_mask = census_100m[id_col_100m].sample(min(100, len(census_100m))).apply(cell_intersects_study_area)
if sample_mask.mean() > 0.9:
    print("  Bounding box is tight, skipping detailed intersection.")
else:
    mask = census_100m[id_col_100m].apply(cell_intersects_study_area)
    census_100m = census_100m[mask]
    print(f"  After polygon intersection: {len(census_100m):,} cells")

# Find likely household columns
# print(f"\n{'='*60}")
# print("SUGGESTED HOUSEHOLD COLUMNS (first 5 values):")
# print(f"{'='*60}")

# hh_keywords = ['haushalt', 'household', 'hh_', 'wohnung']
# suggested = []
# for col in census_100m.columns:
#     col_lower = col.lower()
#     if any(kw in col_lower for kw in hh_keywords):
#         suggested.append(col)

# if suggested:
#     # Show all columns without truncation
#     with pd.option_context('display.max_columns', None, 'display.width', None):
#         display(census_100m[suggested].head())
# else:
#     print("  No household-related columns found.")
with pd.option_context('display.max_columns', None, 'display.width', None):
    display(census_100m.head())

# print(f"\nTotal columns available: {len(census_100m.columns)}")
# print("Use census_100m.columns to see all column names.")

# Load 1km census
print(f"\n{'='*60}")
print(f"Loading 1km census: {census_1km_path}")

if census_1km_path.endswith('.parquet'):
    census_1km_full = pd.read_parquet(census_1km_path)
else:
    census_1km_full = pd.read_csv(census_1km_path, sep=census_csv_sep)
print(f"  Total rows: {len(census_1km_full):,}")

# Filter 1km by deriving from 100m cells
km_ids_needed = set(census_100m[id_col_100m].apply(get_1km_id_from_100m).dropna())
id_col_1km = census_1km_full.columns[1]  # Usually GITTER_ID_1km
census_1km = census_1km_full[census_1km_full[id_col_1km].isin(km_ids_needed)].copy()
print(f"  Filtered to {len(census_1km):,} 1km cells")

# Save filtered data as parquet
census_100m.to_parquet(f'{popsim_dir}/data/_census_100m_filtered.parquet', index=False)
census_1km.to_parquet(f'{popsim_dir}/data/_census_1km_filtered.parquet', index=False)
print(f"\nSaved filtered census to {popsim_dir}/data/_census_*_filtered.parquet")

print(f"\n{'='*60}")
print("SUMMARY")
print(f"{'='*60}")
print(f"  100m cells in study area: {len(census_100m):,}")
print(f"  1km cells in study area: {len(census_1km):,}")
print(f"  MiD households: {mid_households_path}")
print(f"  MiD persons: {mid_persons_path}")
kernwo_list = kernwo if isinstance(kernwo, list) else ([kernwo] if kernwo else None)
regiostar17_list = regiostar17 if isinstance(regiostar17, list) else ([regiostar17] if regiostar17 else None)
print(f"  MiD filters: kernwo={kernwo_list}, regiostar17={regiostar17_list}")
print(f"  RegioStar split mode: {regiostar_split}")
print(f"\nSet 'household_column' in Configuration and re-run Step 1,")
print("or proceed to Step 2 if already set.")
print("\n[Step 1/4] Complete.")

[Step 1/4] Loading study area and filtering census...
Loading GeoPackage: inputs/outlineBS.gpkg
  Study area bounds (EPSG:3035): [4349047.63188537 3230733.57551223 4364792.73274846 3249763.51079757]

Loading 100m census: inputs/cells_100m_with_gender_backf_binneds_happyorphans_with_aggs_regiostar.parquet
  Total rows: 3,148,482
  Total columns: 570
  Filtering to study area (this may take a moment)...
    Processed 500,000 rows...
    Processed 1,000,000 rows...
    Processed 1,500,000 rows...
    Processed 2,000,000 rows...
    Processed 2,500,000 rows...
    Processed 3,000,000 rows...
  Filtered to 6,338 cells in bounding box
  Performing precise polygon intersection...
  After polygon intersection: 5,147 cells


Unnamed: 0,GITTER_ID_100m,Insgesamt_Bevoelkerung_Alter_in_10er-Jahresgruppen_100m-Gitter,Unter10_Alter_in_10er-Jahresgruppen_100m-Gitter,a10bis19_Alter_in_10er-Jahresgruppen_100m-Gitter,a20bis29_Alter_in_10er-Jahresgruppen_100m-Gitter,a30bis39_Alter_in_10er-Jahresgruppen_100m-Gitter,a40bis49_Alter_in_10er-Jahresgruppen_100m-Gitter,a50bis59_Alter_in_10er-Jahresgruppen_100m-Gitter,a60bis69_Alter_in_10er-Jahresgruppen_100m-Gitter,a70bis79_Alter_in_10er-Jahresgruppen_100m-Gitter,a80undaelter_Alter_in_10er-Jahresgruppen_100m-Gitter,Insgesamt_Bevoelkerung_Alter_in_5_Altersklassen_100m-Gitter,Unter18_Alter_in_5_Altersklassen_100m-Gitter,a18bis29_Alter_in_5_Altersklassen_100m-Gitter,a30bis49_Alter_in_5_Altersklassen_100m-Gitter,a50bis64_Alter_in_5_Altersklassen_100m-Gitter,a65undaelter_Alter_in_5_Altersklassen_100m-Gitter,AnteilAuslaender_Anteil_Auslaender_100m-Gitter,AnteilUeber65_Anteil_ueber_65_100m-Gitter,AnteilUnter18_Anteil_unter_18_100m-Gitter,AnteilAuslaenderAb18_Auslaenderanteil_ab18_100m-Gitter,Einwohner_Bevoelkerungszahl_100m-Gitter,Deutsche_ab18_Deutsche_Staatsangehoerige_ab18_100m-Gitter,durchschnFlaechejeBew_Durchschn_Flaeche_je_Bewohner_100m-Gitter,durchschnFlaechejeWohn_Durchschn_Flaeche_je_Wohnung_100m-Gitter,DurchschnHHGroesse_Durchschn_Haushaltsgroesse_100m-Gitter,durchschnMieteQM_Durchschn_Nettokaltmiete_100m-Gitter,durchschnMieteQM_Durchschn_Nettokaltmiete_Anzahl_der_Wohnungen_100m-Gitter,AnzahlWohnungen_Durchschn_Nettokaltmiete_Anzahl_der_Wohnungen_100m-Gitter,Durchschnittsalter_Durchschnittsalter_100m-Gitter,Eigentuemerquote_Eigentuemerquote_100m-Gitter,Insgesamt_Energietraeger_Energietraeger_100m-Gitter,Gas_Energietraeger_100m-Gitter,Heizoel_Energietraeger_100m-Gitter,Holz_Holzpellets_Energietraeger_100m-Gitter,Biomasse_Biogas_Energietraeger_100m-Gitter,Solar_Geothermie_Waermepumpen_Energietraeger_100m-Gitter,Strom_Energietraeger_100m-Gitter,Kohle_Energietraeger_100m-Gitter,Fernwaerme_Energietraeger_100m-Gitter,kein_Energietraeger_Energietraeger_100m-Gitter,Insgesamt_Bevoelkerung_Familienstand_100m-Gitter,Ledig_Familienstand_100m-Gitter,Verheiratet_Familienstand_100m-Gitter,Verwitwet_Familienstand_100m-Gitter,Geschieden_Familienstand_100m-Gitter,EingetrLebenspartnerschaft_Familienstand_100m-Gitter,EingetrLebenspartVerstorben_Familienstand_100m-Gitter,EingetrLebenspartAufgehoben_Familienstand_100m-Gitter,OhneAngabe_Familienstand_100m-Gitter,Insgesamt_Wohnungen_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,unter30_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,30bis39_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,40bis49_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,50bis59_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,60bis69_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,70bis79_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,80bis89_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,90bis99_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,100bis109_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,110bis119_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,120bis129_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,130bis139_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,140bis149_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,150bis159_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,160bis169_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,170bis179_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,180undmehr_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter,Insgesamt_Gebaeude_Geb_Gebaeudetyp_Groesse_100m-Gitter,FreiEFH_Geb_Gebaeudetyp_Groesse_100m-Gitter,EFH_DHH_Geb_Gebaeudetyp_Groesse_100m-Gitter,EFH_Reihenhaus_Geb_Gebaeudetyp_Groesse_100m-Gitter,Freist_ZFH_Geb_Gebaeudetyp_Groesse_100m-Gitter,ZFH_DHH_Geb_Gebaeudetyp_Groesse_100m-Gitter,ZFH_Reihenhaus_Geb_Gebaeudetyp_Groesse_100m-Gitter,MFH_3bis6Wohnungen_Geb_Gebaeudetyp_Groesse_100m-Gitter,MFH_7bis12Wohnungen_Geb_Gebaeudetyp_Groesse_100m-Gitter,MFH_13undmehrWohnungen_Geb_Gebaeudetyp_Groesse_100m-Gitter,AndererGebaeudetyp_Geb_Gebaeudetyp_Groesse_100m-Gitter,Insgesamt_Gebaeude_Gebaeude_nach_Anzahl_der_Wohnungen_100m-Gitter,1_Wohnung_Gebaeude_nach_Anzahl_der_Wohnungen_100m-Gitter,2_Wohnungen_Gebaeude_nach_Anzahl_der_Wohnungen_100m-Gitter,3bis6_Wohnungen_Gebaeude_nach_Anzahl_der_Wohnungen_100m-Gitter,7bis12_Wohnungen_Gebaeude_nach_Anzahl_der_Wohnungen_100m-Gitter,13undmehr_Wohnungen_Gebaeude_nach_Anzahl_der_Wohnungen_100m-Gitter,Insgesamt_Gebaeude_Gebaeude_nach_Baujahr_in_MZ_Klassen_100m-Gitter,Vor1919_Gebaeude_nach_Baujahr_in_MZ_Klassen_100m-Gitter,a1919bis1948_Gebaeude_nach_Baujahr_in_MZ_Klassen_100m-Gitter,a1949bis1978_Gebaeude_nach_Baujahr_in_MZ_Klassen_100m-Gitter,a1979bis1990_Gebaeude_nach_Baujahr_in_MZ_Klassen_100m-Gitter,a1991bis2000_Gebaeude_nach_Baujahr_in_MZ_Klassen_100m-Gitter,a2001bis2010_Gebaeude_nach_Baujahr_in_MZ_Klassen_100m-Gitter,a2011bis2019_Gebaeude_nach_Baujahr_in_MZ_Klassen_100m-Gitter,a2020undspaeter_Gebaeude_nach_Baujahr_in_MZ_Klassen_100m-Gitter,Insgesamt_Energietraeger_Gebaeude_nach_Energietraeger_der_Heizung_100m-Gitter,Gas_Gebaeude_nach_Energietraeger_der_Heizung_100m-Gitter,Heizoel_Gebaeude_nach_Energietraeger_der_Heizung_100m-Gitter,Holz_Holzpellets_Gebaeude_nach_Energietraeger_der_Heizung_100m-Gitter,Biomasse_Biogas_Gebaeude_nach_Energietraeger_der_Heizung_100m-Gitter,Solar_Geothermie_Waermepumpen_Gebaeude_nach_Energietraeger_der_Heizung_100m-Gitter,Strom_Gebaeude_nach_Energietraeger_der_Heizung_100m-Gitter,Kohle_Gebaeude_nach_Energietraeger_der_Heizung_100m-Gitter,Fernwaerme_Gebaeude_nach_Energietraeger_der_Heizung_100m-Gitter,kein_Energietraeger_Gebaeude_nach_Energietraeger_der_Heizung_100m-Gitter,Insgesamt_Heizungsart_Gebaeude_nach_ueberwiegender_Heizungsart_100m-Gitter,Fernheizung_Gebaeude_nach_ueberwiegender_Heizungsart_100m-Gitter,Etagenheizung_Gebaeude_nach_ueberwiegender_Heizungsart_100m-Gitter,Blockheizung_Gebaeude_nach_ueberwiegender_Heizungsart_100m-Gitter,Zentralheizung_Gebaeude_nach_ueberwiegender_Heizungsart_100m-Gitter,Einzel_Mehrraumoefen_Gebaeude_nach_ueberwiegender_Heizungsart_100m-Gitter,keine_Heizung_Gebaeude_nach_ueberwiegender_Heizungsart_100m-Gitter,Insgesamt_Bevoelkerung_Geburtsland_Gruppen_100m-Gitter,Deutschland_Geburtsland_Gruppen_100m-Gitter,Ausland_Sonstige_Geburtsland_Gruppen_100m-Gitter,EU27_Land_Geburtsland_Gruppen_100m-Gitter,Sonstiges_Europa_Geburtsland_Gruppen_100m-Gitter,Sonstige_Welt_Geburtsland_Gruppen_100m-Gitter,Sonstige_Geburtsland_Gruppen_100m-Gitter,Insgesamt_Haushalte_Groesse_des_privaten_Haushalts_100m-Gitter,1_Person_Groesse_des_privaten_Haushalts_100m-Gitter,2_Personen_Groesse_des_privaten_Haushalts_100m-Gitter,3_Personen_Groesse_des_privaten_Haushalts_100m-Gitter,4_Personen_Groesse_des_privaten_Haushalts_100m-Gitter,5_Personen_Groesse_des_privaten_Haushalts_100m-Gitter,6_Personen_und_mehr_Groesse_des_privaten_Haushalts_100m-Gitter,Insgesamt_Familien_Grosse_Kernfamilie_bis6undmehrPers_100m-Gitter,a2Personen_Grosse_Kernfamilie_bis6undmehrPers_100m-Gitter,a3Personen_Grosse_Kernfamilie_bis6undmehrPers_100m-Gitter,a4Personen_Grosse_Kernfamilie_bis6undmehrPers_100m-Gitter,a5Personen_Grosse_Kernfamilie_bis6undmehrPers_100m-Gitter,a6Pers_und_mehr_Grosse_Kernfamilie_bis6undmehrPers_100m-Gitter,Insgesamt_Heizungsart_Heizungsart_100m-Gitter,Fernheizung_Heizungsart_100m-Gitter,Etagenheizung_Heizungsart_100m-Gitter,Blockheizung_Heizungsart_100m-Gitter,Zentralheizung_Heizungsart_100m-Gitter,Einzel_Mehrraumoefen_Heizungsart_100m-Gitter,keine_Heizung_Heizungsart_100m-Gitter,Leerstandsquote_Leerstandsquote_100m-Gitter,marktaktive_Leerstandsquote_Marktaktive_Leerstandsquote_100m-Gitter,Insgesamt_Bevoelkerung_Religion_100m-Gitter,Roemisch_katholisch_Religion_100m-Gitter,Evangelisch_Religion_100m-Gitter,Sonstige_keine_ohneAngabe_Religion_100m-Gitter,Insgesamt_Haushalte_Seniorenstatus_eines_privaten_Haushalts_100m-Gitter,HH_nurSenioren_Seniorenstatus_eines_privaten_Haushalts_100m-Gitter,HH_mitSenioren_Seniorenstatus_eines_privaten_Haushalts_100m-Gitter,HH_ohneSenioren_Seniorenstatus_eines_privaten_Haushalts_100m-Gitter,Insgesamt_Bevoelkerung_Staatsangehoerigkeit_100m-Gitter,Deutschland_Staatsangehoerigkeit_100m-Gitter,Ausland_Sonstige_Staatsangehoerigkeit_100m-Gitter,Insgesamt_Bevoelkerung_Staatsangehoerigkeit_Gruppen_100m-Gitter,Deutschland_Staatsangehoerigkeit_Gruppen_100m-Gitter,Ausland_Sonstige_Staatsangehoerigkeit_Gruppen_100m-Gitter,EU27_Land_Staatsangehoerigkeit_Gruppen_100m-Gitter,Sonstiges_Europa_Staatsangehoerigkeit_Gruppen_100m-Gitter,Sonstige_Welt_Staatsangehoerigkeit_Gruppen_100m-Gitter,Sonstige_Staatsangehoerigkeit_Gruppen_100m-Gitter,Insgesamt_Familie_Typ_der_Kernfamilie_nach_Kindern_100m-Gitter,Ehep_ohneKind_Typ_der_Kernfamilie_nach_Kindern_100m-Gitter,Ehep_mind_1Kind_unter18_Typ_der_Kernfamilie_nach_Kindern_100m-Gitter,Ehep_Kinder_ab18_Typ_der_Kernfamilie_nach_Kindern_100m-Gitter,EingetrLP_ohneKind_Typ_der_Kernfamilie_nach_Kindern_100m-Gitter,EingetrLP_mind_1Kind_unter18_Typ_der_Kernfamilie_nach_Kindern_100m-Gitter,EingetrLP_Kinder_ab18_Typ_der_Kernfamilie_nach_Kindern_100m-Gitter,NichtehelLG_ohneKind_Typ_der_Kernfamilie_nach_Kindern_100m-Gitter,NichtehelLG_mind_1Kind_unter18_Typ_der_Kernfamilie_nach_Kindern_100m-Gitter,NichtehelLG_Kinder_ab18_Typ_der_Kernfamilie_nach_Kindern_100m-Gitter,Vater_mind_1Kind_unter18_Typ_der_Kernfamilie_nach_Kindern_100m-Gitter,Vater_Kinder_ab18_Typ_der_Kernfamilie_nach_Kindern_100m-Gitter,Mutter_mind_1Kind_unter18_Typ_der_Kernfamilie_nach_Kindern_100m-Gitter,Mutter_Kinder_ab18_Typ_der_Kernfamilie_nach_Kindern_100m-Gitter,Insgesamt_Haushalte_Typ_priv_HH_Familie_100m-Gitter,EinpersHH_SingleHH_Typ_priv_HH_Familie_100m-Gitter,Paare_ohneKind_Typ_priv_HH_Familie_100m-Gitter,Paare_mitKind_Typ_priv_HH_Familie_100m-Gitter,Alleinerziehende_Typ_priv_HH_Familie_100m-Gitter,MehrpersHHohneKernfam_Typ_priv_HH_Familie_100m-Gitter,Insgesamt_Haushalte_Typ_priv_HH_Lebensform_100m-Gitter,EinpersHH_SingleHH_Typ_priv_HH_Lebensform_100m-Gitter,Ehepaare_Typ_priv_HH_Lebensform_100m-Gitter,EingetrLebensp_Typ_priv_HH_Lebensform_100m-Gitter,NichtehelLebensg_Typ_priv_HH_Lebensform_100m-Gitter,AlleinerzMuetter_Typ_priv_HH_Lebensform_100m-Gitter,AlleinerzVaeter_Typ_priv_HH_Lebensform_100m-Gitter,MehrpersHHohneKernfam_Typ_priv_HH_Lebensform_100m-Gitter,Insgesamt_Wohnungen_Wohnung_Gebaeudetyp_Groesse_100m-Gitter,FreiEFH_Wohnung_Gebaeudetyp_Groesse_100m-Gitter,EFH_DHH_Wohnung_Gebaeudetyp_Groesse_100m-Gitter,EFH_Reihenhaus_Wohnung_Gebaeudetyp_Groesse_100m-Gitter,Freist_ZFH_Wohnung_Gebaeudetyp_Groesse_100m-Gitter,ZFH_DHH_Wohnung_Gebaeudetyp_Groesse_100m-Gitter,ZFH_Reihenhaus_Wohnung_Gebaeudetyp_Groesse_100m-Gitter,MFH_3bis6Wohnungen_Wohnung_Gebaeudetyp_Groesse_100m-Gitter,MFH_7bis12Wohnungen_Wohnung_Gebaeudetyp_Groesse_100m-Gitter,MFH_13undmehrWohnungen_Wohnung_Gebaeudetyp_Groesse_100m-Gitter,AndererGebaeudetyp_Wohnung_Gebaeudetyp_Groesse_100m-Gitter,Insgesamt_Wohnungen_Wohnungen_nach_Zahl_der_Raeume_100m-Gitter,1Raum_Wohnungen_nach_Zahl_der_Raeume_100m-Gitter,2Raeume_Wohnungen_nach_Zahl_der_Raeume_100m-Gitter,3Raeume_Wohnungen_nach_Zahl_der_Raeume_100m-Gitter,4Raeume_Wohnungen_nach_Zahl_der_Raeume_100m-Gitter,5Raeume_Wohnungen_nach_Zahl_der_Raeume_100m-Gitter,6Raeume_Wohnungen_nach_Zahl_der_Raeume_100m-Gitter,7undmehrRaeume_Wohnungen_nach_Zahl_der_Raeume_100m-Gitter,Insgesamt_Bevoelkerung_Zahl_der_Staatsangehoerigkeiten_100m-Gitter,EineStaatsang_Zahl_der_Staatsangehoerigkeiten_100m-Gitter,Mehrere_deutsch_und_auslaendisch_Zahl_der_Staatsangehoerigkeiten_100m-Gitter,Mehrere_nur_auslaendisch_Zahl_der_Staatsangehoerigkeiten_100m-Gitter,Nicht_bekannt_Zahl_der_Staatsangehoerigkeiten_100m-Gitter,GITTER_ID_1km,GITTER_ID_10km,POP_TOTAL_100m,scale,is_orphan,POP_TOTAL_100m_adj,AGE_0,AGE_1,AGE_2,AGE_3,AGE_4,AGE_5,AGE_6,AGE_7,AGE_8,AGE_9,AGE_10,AGE_11,AGE_12,AGE_13,AGE_14,AGE_15,AGE_16,AGE_17,AGE_18,AGE_19,AGE_20,AGE_21,AGE_22,AGE_23,AGE_24,AGE_25,AGE_26,AGE_27,AGE_28,AGE_29,AGE_30,AGE_31,AGE_32,AGE_33,AGE_34,AGE_35,AGE_36,AGE_37,AGE_38,AGE_39,AGE_40,AGE_41,AGE_42,AGE_43,AGE_44,AGE_45,AGE_46,AGE_47,AGE_48,AGE_49,AGE_50,AGE_51,AGE_52,AGE_53,AGE_54,AGE_55,AGE_56,AGE_57,AGE_58,AGE_59,AGE_60,AGE_61,AGE_62,AGE_63,AGE_64,AGE_65,AGE_66,AGE_67,AGE_68,AGE_69,AGE_70,AGE_71,AGE_72,AGE_73,AGE_74,AGE_75,AGE_76,AGE_77,AGE_78,AGE_79,AGE_80,AGE_81,AGE_82,AGE_83,AGE_84,AGE_85,AGE_86,AGE_87,AGE_88,AGE_89,AGE_90,AGE_91,AGE_92,AGE_93,AGE_94,AGE_95,AGE_96,AGE_97,AGE_98,AGE_99,AGE_100,AGE_0_9_agg,AGE_10_19_agg,AGE_20_29_agg,AGE_30_39_agg,AGE_40_49_agg,AGE_50_59_agg,AGE_60_69_agg,AGE_70_79_agg,AGE_80_plus_agg,RegionalSchl√ºssel_ARS,Land,Regierungsbezirk,Kreis,VerwaltungsgemeinschaftTeil1,VerwaltungsgemeinschaftTeil2,Gemeinde,M_AGE_0,F_AGE_0,M_AGE_1,F_AGE_1,M_AGE_2,F_AGE_2,M_AGE_3,F_AGE_3,M_AGE_4,F_AGE_4,M_AGE_5,F_AGE_5,M_AGE_6,F_AGE_6,M_AGE_7,F_AGE_7,M_AGE_8,F_AGE_8,M_AGE_9,F_AGE_9,M_AGE_10,F_AGE_10,M_AGE_11,F_AGE_11,M_AGE_12,F_AGE_12,M_AGE_13,F_AGE_13,M_AGE_14,F_AGE_14,M_AGE_15,F_AGE_15,M_AGE_16,F_AGE_16,M_AGE_17,F_AGE_17,M_AGE_18,F_AGE_18,M_AGE_19,F_AGE_19,M_AGE_20,F_AGE_20,M_AGE_21,F_AGE_21,M_AGE_22,F_AGE_22,M_AGE_23,F_AGE_23,M_AGE_24,F_AGE_24,M_AGE_25,F_AGE_25,M_AGE_26,F_AGE_26,M_AGE_27,F_AGE_27,M_AGE_28,F_AGE_28,M_AGE_29,F_AGE_29,M_AGE_30,F_AGE_30,M_AGE_31,F_AGE_31,M_AGE_32,F_AGE_32,M_AGE_33,F_AGE_33,M_AGE_34,F_AGE_34,M_AGE_35,F_AGE_35,M_AGE_36,F_AGE_36,M_AGE_37,F_AGE_37,M_AGE_38,F_AGE_38,M_AGE_39,F_AGE_39,M_AGE_40,F_AGE_40,M_AGE_41,F_AGE_41,M_AGE_42,F_AGE_42,M_AGE_43,F_AGE_43,M_AGE_44,F_AGE_44,M_AGE_45,F_AGE_45,M_AGE_46,F_AGE_46,M_AGE_47,F_AGE_47,M_AGE_48,F_AGE_48,M_AGE_49,F_AGE_49,M_AGE_50,F_AGE_50,M_AGE_51,F_AGE_51,M_AGE_52,F_AGE_52,M_AGE_53,F_AGE_53,M_AGE_54,F_AGE_54,M_AGE_55,F_AGE_55,M_AGE_56,F_AGE_56,M_AGE_57,F_AGE_57,M_AGE_58,F_AGE_58,M_AGE_59,F_AGE_59,M_AGE_60,F_AGE_60,M_AGE_61,F_AGE_61,M_AGE_62,F_AGE_62,M_AGE_63,F_AGE_63,M_AGE_64,F_AGE_64,M_AGE_65,F_AGE_65,M_AGE_66,F_AGE_66,M_AGE_67,F_AGE_67,M_AGE_68,F_AGE_68,M_AGE_69,F_AGE_69,M_AGE_70,F_AGE_70,M_AGE_71,F_AGE_71,M_AGE_72,F_AGE_72,M_AGE_73,F_AGE_73,M_AGE_74,F_AGE_74,M_AGE_75,F_AGE_75,M_AGE_76,F_AGE_76,M_AGE_77,F_AGE_77,M_AGE_78,F_AGE_78,M_AGE_79,F_AGE_79,M_AGE_80,F_AGE_80,M_AGE_81,F_AGE_81,M_AGE_82,F_AGE_82,M_AGE_83,F_AGE_83,M_AGE_84,F_AGE_84,M_AGE_85,F_AGE_85,M_AGE_86,F_AGE_86,M_AGE_87,F_AGE_87,M_AGE_88,F_AGE_88,M_AGE_89,F_AGE_89,M_AGE_90,F_AGE_90,M_AGE_91,F_AGE_91,M_AGE_92,F_AGE_92,M_AGE_93,F_AGE_93,M_AGE_94,F_AGE_94,M_AGE_95,F_AGE_95,M_AGE_96,F_AGE_96,M_AGE_97,F_AGE_97,M_AGE_98,F_AGE_98,M_AGE_99,F_AGE_99,M_AGE_100,F_AGE_100,M_AGE_0_9_agg,M_AGE_10_19_agg,M_AGE_20_29_agg,M_AGE_30_39_agg,M_AGE_40_49_agg,M_AGE_50_59_agg,M_AGE_60_69_agg,M_AGE_70_79_agg,M_AGE_80_plus_agg,F_AGE_0_9_agg,F_AGE_10_19_agg,F_AGE_20_29_agg,F_AGE_30_39_agg,F_AGE_40_49_agg,F_AGE_50_59_agg,F_AGE_60_69_agg,F_AGE_70_79_agg,F_AGE_80_plus_agg,M_TOTAL,F_TOTAL,Insgesamt_Bevoelkerung_Familienstand_100m-Gitter_adj,Insgesamt_Energietraeger_Energietraeger_100m-Gitter_adj,Insgesamt_Heizungsart_Gebaeude_nach_ueberwiegender_Heizungsart_100m-Gitter_adj,Insgesamt_Haushalte_Groesse_des_privaten_Haushalts_100m-Gitter_adj,Insgesamt_Haushalte_Typ_priv_HH_Lebensform_100m-Gitter_adj,Insgesamt_Wohnungen_Wohnungen_nach_Zahl_der_Raeume_100m-Gitter_adj,Insgesamt_Wohnungen_Flaeche_der_Wohnung_10m2_Intervalle_100m-Gitter_adj,Insgesamt_Bevoelkerung_Geburtsland_Gruppen_100m-Gitter_adj,RegioStaR2,RegioStaR4,RegioStaR17,RegioStaR7,RegioStaR5,RegioStaRGem7,RegioStaRGem5
307,CRS3035RES100mN3232400E4354600,10.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0,0.0,10.0,0.0,0.0,0.0,4.0,3.0,0.0,27.27,0.0,0.0,10.0,7.0,47.72,146.29,2.67,0.0,0.0,0.0,48.98,50.0,6.0,5.150002,0.0,0.0,0.0,0.153839,0.0,0.0,0.606161,0.0,10.0,4.363409,5.601559,0.035396,0.24184,0.0,0.0,0.0,0.0,6.0,0.017168,0.040054,0.07861,0.149132,0.449982,0.180063,0.085318,0.337546,0.254032,0.380877,0.576643,0.963797,1.651989,0.328639,0.011996,0.291125,0.113706,6.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,6.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.620928,0.330023,0.0,4.579844,0.368157,0.0,10.0,8.266619,0.931931,0.395972,0.4103708,0.2373113,0.0,3.0,0.337831,1.371412,0.649378,0.678791,0.158842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,10.0,0.0,8.0,0.0,3.0,0.0,3.0,0.0,10.0,10.0,0.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.179216,2.800776,0.0,0.111516,0.104747,0.0,0.0,6.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.542127,0.707387,3.173129,1.173655,0.314379,10.0,10.0,0.0,0.0,0.0,CRS3035RES1000mN3232000E4354000,CRS3035RES10000mN3230000E4350000,10.242424,1.024242,False,10.243213,0.054517,0.056788,0.056872,0.039542,0.04035,0.040812,0.049604,0.048798,0.047613,0.046785,0.063672,0.064029,0.063752,0.064531,0.065349,0.063785,0.117599,0.121235,0.07259,0.134506,0.080776,0.085145,0.089084,0.091083,0.094214,0.054957,0.05352,0.053427,0.055347,0.056868,0.041248,0.044546,0.045694,0.045911,0.046202,0.045155,0.043867,0.043673,0.043377,0.04402,0.112278,0.112171,0.109022,0.105052,0.104577,0.102062,0.099098,0.099206,0.099717,0.103211,0.361297,0.378736,0.39658,0.420846,0.43489,0.437678,0.442403,0.446577,0.446858,0.434617,0.123489,0.118922,0.1152,0.109065,0.104753,0.177907,0.172083,0.102928,0.098443,0.096225,0.125387,0.121082,0.11865,0.109028,0.097278,0.133085,0.093872,0.121446,0.134748,0.125662,0.015242,0.014613,0.01564,0.013615,0.011848,0.010444,0.009112,0.007968,0.005641,0.004535,0.003957,0.003434,0.00279,0.002174,0.001621,0.001185,0.000854,0.000589,0.000376,0.000252,0.00035,0.48168,0.831048,0.714422,0.443692,1.046394,4.200483,1.219015,1.180238,0.126241,31010000000,3,1,1,0,0,0,0.027968,0.026549,0.028998,0.027789,0.028642,0.02823,0.020175,0.019367,0.02094,0.01941,0.020279,0.020533,0.024306,0.025297,0.025159,0.023639,0.022922,0.024691,0.024602,0.022183,0.032961,0.03071,0.031507,0.032522,0.033366,0.030386,0.032136,0.032395,0.033723,0.031625,0.033877,0.029908,0.057606,0.059993,0.059603,0.061632,0.036439,0.036151,0.067339,0.067167,0.040672,0.040103,0.042891,0.042254,0.045465,0.043619,0.047611,0.043472,0.050589,0.043625,0.029247,0.02571,0.028554,0.024966,0.028729,0.024698,0.0295,0.025848,0.03085,0.026019,0.022619,0.018628,0.024459,0.020087,0.024747,0.020948,0.024439,0.021472,0.024364,0.021839,0.023884,0.021271,0.022739,0.021128,0.023401,0.020272,0.022897,0.02048,0.022848,0.021172,0.057909,0.054369,0.055394,0.056776,0.054776,0.054246,0.053083,0.051969,0.051597,0.05298,0.050406,0.051656,0.051246,0.047853,0.049446,0.04976,0.049466,0.050251,0.050808,0.052403,0.191903,0.169393,0.189985,0.188751,0.203165,0.193415,0.21947,0.201377,0.224669,0.210221,0.224754,0.212924,0.224677,0.217727,0.229716,0.216861,0.227747,0.219111,0.212266,0.222351,0.060469,0.06302,0.056937,0.061985,0.05712,0.05808,0.051517,0.057548,0.050355,0.054397,0.08435,0.093557,0.08293,0.089153,0.048668,0.05426,0.045132,0.053311,0.042351,0.053873,0.057679,0.067708,0.052424,0.068658,0.053163,0.065487,0.049655,0.059373,0.042495,0.054783,0.058585,0.0745,0.040633,0.053239,0.051554,0.069892,0.058515,0.076232,0.05356,0.072102,0.006666,0.008576,0.006234,0.008379,0.006504,0.009135,0.005344,0.00827,0.004764,0.007084,0.004111,0.006333,0.003242,0.00587,0.002911,0.005056,0.001998,0.003643,0.001517,0.003018,0.00129,0.002667,0.001094,0.00234,0.000747,0.002043,0.000511,0.001662,0.000391,0.00123,0.000298,0.000888,0.000178,0.000676,0.000123,0.000466,5.2e-05,0.000324,3.3e-05,0.000219,5e-05,0.0003,0.243991,0.418557,0.374107,0.236396,0.524131,2.148352,0.579832,0.518264,0.048061,0.237688,0.41249,0.340314,0.207296,0.522263,2.052131,0.639183,0.661973,0.07818,5.091692,5.15152,10.242204,5.910003,5.898952,3.196254,3.196254,5.910677,5.910677,10.242204,1.0,12.0,121.0,72.0,52.0,72.0,52.0
313,CRS3035RES100mN3232500E4355600,38.0,3.0,3.0,4.0,5.0,10.0,0.0,5.0,5.0,0.0,38.0,3.0,7.0,16.0,4.0,11.0,0.0,28.95,7.89,0.0,38.0,32.0,79.84,109.77,1.73,6.22,6.22,6.0,42.97,72.73,24.0,19.982843,0.83331,0.060727,0.0,0.103952,0.0,0.0,3.080765,0.058412,38.0,15.000245,11.588311,6.336227,5.074399,0.0,0.0,0.0,0.0,24.0,0.183748,0.138035,2.180466,0.331421,0.714245,2.410231,2.853961,0.510087,1.146947,2.185062,2.82937,0.865993,0.657371,2.006549,2.848926,1.528785,0.731564,17.0,8.0,0.0,0.0,5.0,0.0,0.0,3.0,0.0,0.0,3.0,17.0,9.0,5.0,3.0,0.0,0.0,17.0,11.0,5.0,0.0,0.0,0.0,0.0,3.0,0.0,17.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,17.0,2.069229,0.425267,0.140438,14.054,0.0,0.05341,38.0,31.622267,3.233068,3.143846,3.209996e-13,1.717574e-13,0.0,22.0,11.551389,2.899494,3.430558,2.496081,1.726946,0.010263,9.0,4.0,3.0,3.0,0.0,0.0,24.0,5.0,3.0,0.0,16.0,0.0,0.0,12.5,0.0,38.0,4.0,13.0,23.0,22.0,6.0,0.0,14.0,38.0,38.0,3.0,38.0,38.0,3.0,3.0,0.0,0.0,0.0,9.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,15.0,3.0,5.0,0.0,0.0,22.0,13.145438,8.126558,0.0,0.401948,0.319024,0.087375,0.034388,24.0,6.0,0.0,0.0,9.0,0.0,0.0,8.0,0.0,0.0,3.0,24.0,0.0,2.873641,2.332311,3.590188,8.226087,2.744143,4.35639,38.0,38.0,0.0,0.0,0.0,CRS3035RES1000mN3232000E4355000,CRS3035RES10000mN3230000E4350000,38.0,1.0,False,38.002928,0.249678,0.260078,0.260465,0.278891,0.284588,0.287846,0.337925,0.332435,0.324361,0.318721,0.14954,0.150379,0.149729,0.151558,0.153479,0.149807,0.164284,0.169365,0.816372,0.948399,0.450914,0.475306,0.497295,0.508452,0.525933,0.349478,0.340341,0.339752,0.351962,0.361632,0.428913,0.463204,0.475148,0.477406,0.480433,0.46954,0.45615,0.454128,0.45105,0.457737,0.899804,0.898942,0.873711,0.841894,0.838084,0.817933,0.79418,0.795041,0.799141,0.827139,0.271448,0.284551,0.297957,0.316189,0.32674,0.328835,0.332385,0.335521,0.335732,0.326535,0.541417,0.521393,0.505077,0.478179,0.459271,0.471405,0.455974,0.734246,0.702255,0.68643,0.602463,0.581781,0.570092,0.523863,0.467407,0.554382,0.391036,0.505901,0.56131,0.523462,0.10518,0.10084,0.107924,0.093949,0.08176,0.07207,0.06288,0.054983,0.038928,0.031293,0.027303,0.023699,0.019255,0.015001,0.011189,0.008181,0.005895,0.004065,0.002597,0.001741,0.002417,2.934987,3.002913,4.201065,4.613709,8.385869,3.155893,5.555647,5.281697,0.871149,31010000000,3,1,1,0,0,0,0.128087,0.121591,0.132808,0.12727,0.131176,0.12929,0.142296,0.136594,0.147691,0.136897,0.143025,0.14482,0.165586,0.172339,0.171397,0.161038,0.156155,0.168205,0.167599,0.151122,0.077413,0.072127,0.073997,0.076382,0.078365,0.071365,0.075476,0.076082,0.079204,0.074276,0.079564,0.070244,0.080474,0.08381,0.083265,0.0861,0.409803,0.406569,0.474805,0.473593,0.227046,0.223869,0.239431,0.235875,0.253801,0.243494,0.265779,0.242674,0.282404,0.243529,0.185982,0.163495,0.18158,0.158762,0.182691,0.15706,0.187592,0.16437,0.196176,0.165455,0.235207,0.193706,0.254332,0.208872,0.257327,0.217821,0.254131,0.223275,0.253344,0.227088,0.248355,0.221185,0.236453,0.219697,0.243331,0.210798,0.238091,0.212959,0.237581,0.220156,0.464088,0.435715,0.443932,0.45501,0.438979,0.434731,0.42541,0.416484,0.4135,0.424584,0.403955,0.413978,0.410686,0.383494,0.396265,0.398776,0.396427,0.402715,0.407177,0.419962,0.14418,0.127268,0.142739,0.141812,0.152641,0.145316,0.164891,0.151298,0.168797,0.157943,0.168861,0.159974,0.168803,0.163582,0.17259,0.162931,0.17111,0.164622,0.159479,0.167056,0.265118,0.276299,0.249632,0.271761,0.250434,0.254642,0.22587,0.252309,0.220775,0.238497,0.223505,0.2479,0.219743,0.236232,0.347178,0.387068,0.321957,0.380298,0.302118,0.384312,0.277138,0.325325,0.25189,0.329891,0.25544,0.314653,0.238587,0.285276,0.204183,0.263224,0.244042,0.310339,0.169262,0.221774,0.214756,0.291144,0.243754,0.317556,0.223111,0.300351,0.045999,0.059181,0.043018,0.057822,0.044885,0.063038,0.03688,0.05707,0.032877,0.048883,0.028372,0.043699,0.022375,0.040505,0.020091,0.034892,0.013786,0.025142,0.010465,0.020828,0.0089,0.018403,0.007552,0.016146,0.005158,0.014097,0.00353,0.011471,0.002699,0.00849,0.002055,0.006126,0.001228,0.004667,0.000852,0.003213,0.000361,0.002236,0.000229,0.001512,0.000345,0.002072,1.485821,1.512366,2.202481,2.458151,4.20042,1.614093,2.62633,2.322163,0.331656,1.449166,1.490547,1.998584,2.155558,4.185448,1.5418,2.929317,2.959533,0.539493,18.753483,19.249449,37.999184,24.12001,16.742344,22.114733,22.114733,24.122763,24.122763,37.999184,1.0,12.0,121.0,72.0,52.0,72.0,52.0
314,CRS3035RES100mN3232500E4357700,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,106.08,187.85,2.0,0.0,0.0,0.0,48.88,100.0,3.0,0.803011,0.781443,0.518112,6e-05,0.015045,0.009732,0.000268,0.682461,0.002156,4.0,1.773147,2.484794,0.060419,0.071492,0.000576,6.5e-05,0.000313,0.009098,3.0,0.031546,0.0736,0.144447,0.274032,0.311563,0.246943,0.156773,0.46453,0.081359,0.059667,0.071273,0.04211,0.040989,0.03138,0.022043,0.013431,0.746921,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.215176,0.097048,0.010041,2.290057,0.009223,0.002988,4.0,3.825823,0.288362,0.28572,0.0,0.0,0.0,3.0,3.028874,0.770881,0.277214,1.412679,0.056766,0.024201,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,4.0,4.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,2.240298,2.951375,0.001134,0.16904,0.118637,0.025056,0.065075,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.038842,0.138092,0.304586,0.672939,0.7661,0.058013,0.834037,4.0,4.0,0.0,0.0,0.0,CRS3035RES1000mN3232000E4357000,CRS3035RES10000mN3230000E4350000,4.4,1.1,False,4.400339,0.008596,0.008954,0.008967,0.008509,0.008683,0.008782,0.042639,0.041946,0.040928,0.040216,0.053302,0.053601,0.05337,0.054021,0.054706,0.053397,0.058145,0.059943,0.225307,0.000111,0.1005,0.105936,0.110837,0.113324,0.11722,0.07222,0.070332,0.07021,0.072733,0.074732,0.034129,0.036857,0.037808,0.037987,0.038228,0.037362,0.036296,0.036135,0.03589,0.036422,0.080157,0.080081,0.077833,0.074999,0.074659,0.072864,0.070748,0.070825,0.07119,0.073684,0.059874,0.062764,0.065721,0.069742,0.07207,0.072532,0.073315,0.074007,0.074053,0.072025,0.085137,0.081988,0.079423,0.075193,0.07222,9e-06,9e-06,0.042267,0.040425,0.039514,0.017723,0.017114,0.016771,0.015411,0.01375,0.020781,0.014658,0.018964,0.021041,0.019622,0.012785,0.012258,0.013119,0.01142,0.009938,0.008761,0.007643,0.006684,0.004732,0.003804,0.003319,0.002881,0.002341,0.001823,0.00136,0.000994,0.000717,0.000494,0.000316,0.000212,0.000294,0.21822,0.665904,0.908044,0.367115,0.74704,0.696102,0.516185,0.175836,0.105893,31010000000,3,1,1,0,0,0,0.00441,0.004186,0.004572,0.004382,0.004516,0.004451,0.004342,0.004168,0.004506,0.004177,0.004364,0.004419,0.020894,0.021746,0.021627,0.02032,0.019704,0.021224,0.021148,0.019068,0.027593,0.025709,0.026375,0.027225,0.027932,0.025437,0.026903,0.027119,0.028231,0.026475,0.02836,0.025038,0.028482,0.029663,0.02947,0.030473,0.1131,0.112208,5.5e-05,5.5e-05,0.050604,0.049896,0.053364,0.052572,0.056567,0.05427,0.059237,0.054087,0.062942,0.054278,0.038433,0.033787,0.037524,0.032808,0.037753,0.032457,0.038766,0.033967,0.04054,0.034192,0.018716,0.015413,0.020237,0.01662,0.020476,0.017332,0.020221,0.017766,0.020159,0.01807,0.019762,0.0176,0.018815,0.017481,0.019362,0.016773,0.018945,0.016945,0.018904,0.017518,0.041342,0.038815,0.039547,0.040534,0.039106,0.038727,0.037897,0.037102,0.036836,0.037823,0.035986,0.036878,0.036585,0.034163,0.035301,0.035524,0.035315,0.035875,0.036273,0.037412,0.031802,0.028072,0.031484,0.03128,0.033668,0.032053,0.03637,0.033372,0.037232,0.034838,0.037246,0.035286,0.037233,0.036082,0.038068,0.035938,0.037742,0.036311,0.035177,0.036848,0.04169,0.043448,0.039254,0.042734,0.039381,0.040042,0.035518,0.039675,0.034717,0.037503,4e-06,5e-06,4e-06,4e-06,0.019985,0.022281,0.018533,0.021892,0.017391,0.022123,0.008153,0.00957,0.00741,0.009705,0.007514,0.009256,0.007019,0.008392,0.006007,0.007743,0.009148,0.011633,0.006345,0.008313,0.00805,0.010914,0.009137,0.011904,0.008363,0.011259,0.005591,0.007194,0.005229,0.007029,0.005456,0.007663,0.004483,0.006937,0.003996,0.005942,0.003449,0.005312,0.00272,0.004924,0.002442,0.004241,0.001676,0.003056,0.001272,0.002532,0.001082,0.002237,0.000918,0.001963,0.000627,0.001714,0.000429,0.001394,0.000328,0.001032,0.00025,0.000745,0.000149,0.000567,0.000104,0.000391,4.4e-05,0.000272,2.8e-05,0.000184,4.2e-05,0.000252,0.110081,0.336502,0.475731,0.195596,0.374187,0.356024,0.246477,0.077146,0.040315,0.108139,0.329402,0.432313,0.171519,0.372853,0.340078,0.269708,0.098689,0.065578,2.212059,2.18828,4.399905,2.812288,2.624534,5.570615,5.570615,2.812609,2.812609,4.399905,1.0,12.0,121.0,72.0,52.0,72.0,52.0
321,CRS3035RES100mN3232600E4354800,8.0,0.0,0.0,3.0,0.0,0.0,8.0,3.0,0.0,0.0,8.0,0.0,3.0,0.0,6.0,3.0,0.0,0.0,0.0,0.0,8.0,9.0,55.48,139.39,4.0,0.0,0.0,0.0,44.53,100.0,3.0,2.547157,0.0,0.0,0.0,0.082556,0.0,0.0,0.325288,0.0,8.0,2.370456,5.656062,0.021353,0.145893,0.0,0.0,0.0,0.0,3.0,0.011167,0.026053,0.051131,0.097002,0.292688,0.117121,0.055495,0.219555,0.165234,0.247739,0.375074,0.626896,0.1853,0.213761,0.007803,0.189361,0.07396,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.587875,0.312456,0.0,1.700585,0.34856,0.0,8.0,1.471544,2.376074,0.137388,3.417421,0.7913361,0.0,3.0,0.337831,1.371412,0.649378,0.678791,0.158842,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,7.0,4.0,3.0,0.0,3.0,3.0,8.0,8.0,3.0,8.0,8.0,3.0,0.0,3.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.179216,2.800776,0.0,0.111516,0.104747,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.139502,1.021804,0.345886,1.367249,0.080897,8.0,6.0,5.0,0.0,0.0,CRS3035RES1000mN3232000E4354000,CRS3035RES10000mN3230000E4350000,8.193939,1.024242,False,8.194571,0.026622,0.027731,0.027772,0.01931,0.019704,0.01993,0.024223,0.023829,0.023251,0.022846,0.031313,0.031489,0.031353,0.031736,0.032138,0.031369,0.057834,0.059623,0.03489,0.064649,0.175742,0.185248,0.193818,0.198167,0.20498,0.119569,0.116443,0.116241,0.120418,0.123727,0.020203,0.021818,0.02238,0.022487,0.022629,0.022116,0.021485,0.02139,0.021245,0.02156,0.054992,0.054939,0.053397,0.051453,0.05122,0.049989,0.048537,0.048589,0.04884,0.050551,0.271664,0.284776,0.298193,0.316439,0.326999,0.329095,0.332648,0.335787,0.335998,0.326794,0.024427,0.023524,0.022787,0.021574,0.020721,0.518598,0.501622,0.300035,0.286962,0.280495,0.005739,0.005542,0.005431,0.00499,0.004453,0.006091,0.004297,0.005559,0.006168,0.005752,0.006716,0.006439,0.006891,0.005999,0.00522,0.004602,0.004015,0.003511,0.002486,0.001998,0.001743,0.001513,0.001229,0.000958,0.000714,0.000522,0.000376,0.00026,0.000166,0.000111,0.000154,0.235218,0.406395,1.554353,0.217314,0.512508,3.158394,2.000745,0.054021,0.055624,31010000000,3,1,1,0,0,0,0.013657,0.012965,0.014161,0.01357,0.013987,0.013786,0.009852,0.009457,0.010226,0.009478,0.009903,0.010027,0.011869,0.012353,0.012286,0.011543,0.011193,0.012057,0.012014,0.010833,0.01621,0.015103,0.015495,0.015994,0.016409,0.014944,0.015805,0.015931,0.016585,0.015553,0.016661,0.014709,0.02833,0.029504,0.029312,0.03031,0.017514,0.017376,0.032366,0.032283,0.08849,0.087252,0.093317,0.091931,0.098918,0.094901,0.103586,0.094581,0.110066,0.094914,0.063631,0.055938,0.062125,0.054318,0.062505,0.053736,0.064182,0.056237,0.067119,0.056608,0.011079,0.009124,0.011979,0.009838,0.012121,0.01026,0.01197,0.010517,0.011933,0.010696,0.011698,0.010418,0.011137,0.010348,0.011461,0.009929,0.011215,0.010031,0.01119,0.01037,0.028363,0.026629,0.027131,0.027808,0.026829,0.026569,0.025999,0.025454,0.025271,0.025949,0.024688,0.025301,0.025099,0.023437,0.024218,0.024371,0.024228,0.024612,0.024885,0.025666,0.144295,0.127369,0.142852,0.141924,0.152762,0.145431,0.165022,0.151418,0.168931,0.158068,0.168995,0.1601,0.168937,0.163711,0.172727,0.16306,0.171246,0.164752,0.159605,0.167188,0.011961,0.012466,0.011263,0.012261,0.011299,0.011489,0.010191,0.011383,0.009961,0.01076,0.245881,0.272717,0.241741,0.259881,0.141867,0.158167,0.131561,0.155401,0.123454,0.157041,0.00264,0.003099,0.0024,0.003143,0.002433,0.002997,0.002273,0.002718,0.001945,0.002507,0.002681,0.00341,0.00186,0.002437,0.00236,0.003199,0.002678,0.003489,0.002452,0.0033,0.002937,0.003779,0.002747,0.003692,0.002866,0.004025,0.002355,0.003644,0.002099,0.003121,0.001812,0.00279,0.001429,0.002586,0.001283,0.002228,0.00088,0.001605,0.000668,0.00133,0.000568,0.001175,0.000482,0.001031,0.000329,0.0009,0.000225,0.000732,0.000172,0.000542,0.000131,0.000391,7.8e-05,0.000298,5.4e-05,0.000205,2.3e-05,0.000143,1.5e-05,9.7e-05,2.2e-05,0.000132,0.119148,0.204687,0.813938,0.115783,0.256711,1.615372,0.939178,0.023722,0.021177,0.11607,0.201708,0.740415,0.101531,0.255796,1.543022,1.061567,0.030299,0.034447,4.109714,4.084854,8.193763,2.955001,2.949476,3.196254,3.196254,2.955338,2.955338,8.193763,1.0,12.0,121.0,72.0,52.0,72.0,52.0
322,CRS3035RES100mN3232600E4354900,24.0,3.0,3.0,0.0,0.0,9.0,3.0,6.0,0.0,0.0,24.0,6.0,0.0,9.0,9.0,3.0,12.5,0.0,25.0,12.5,24.0,16.0,57.18,129.78,3.0,0.0,0.0,0.0,41.68,100.0,8.0,4.49765,0.0,0.0,0.0,0.176151,0.0,0.0,3.206202,0.0,24.0,8.32239,13.659271,0.039366,2.560261,0.0,0.0,0.0,0.0,8.0,0.020484,0.047791,0.093794,0.177939,0.536902,0.214845,0.101798,0.402748,0.303102,1.607206,1.994949,1.149968,0.339912,0.39212,0.014314,0.34736,0.13567,8.0,7.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,8.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,4.0,6.0,0.0,0.0,8.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,8.0,3.4478,0.353431,0.0,3.669769,0.394269,0.0,24.0,9.106261,8.310781,4.672733,0.2052264,2.286288,0.0,8.0,0.441387,1.791792,2.539163,3.543471,0.207532,0.0,8.0,3.0,0.0,0.0,0.0,0.0,8.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,24.0,6.0,8.0,6.0,8.0,0.0,0.0,8.0,24.0,18.0,3.0,24.0,18.0,3.0,3.0,0.0,0.0,0.0,8.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,9.0,0.0,0.0,8.0,0.428729,7.57726,0.0,0.266775,0.250581,0.0,0.0,8.0,7.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.447503,2.001266,2.405946,2.766679,0.259507,24.0,19.0,3.0,0.0,0.0,CRS3035RES1000mN3232000E4354000,CRS3035RES10000mN3230000E4350000,24.581818,1.024242,False,24.583712,0.215364,0.224334,0.224669,0.156209,0.1594,0.161225,0.195955,0.192771,0.188089,0.184819,0.27877,0.280333,0.279123,0.282533,0.286114,0.279268,0.514875,0.530798,0.11082,0.205344,0.12359,0.130276,0.136303,0.139361,0.144152,0.084087,0.081888,0.081746,0.084684,0.087011,0.063077,0.06812,0.069876,0.070208,0.070653,0.069052,0.067082,0.066785,0.066332,0.067316,0.67209,0.671447,0.6526,0.628835,0.62599,0.610938,0.593196,0.593839,0.596902,0.617815,0.435599,0.456624,0.478138,0.507395,0.524327,0.527688,0.533385,0.538418,0.538756,0.523998,0.641324,0.617605,0.598278,0.566417,0.54402,0.857604,0.829532,0.496167,0.474548,0.463855,0.023238,0.022441,0.02199,0.020207,0.018029,0.024665,0.017398,0.022508,0.024973,0.023289,0.027193,0.026071,0.027903,0.02429,0.021138,0.018633,0.016257,0.014216,0.010064,0.00809,0.007059,0.006127,0.004978,0.003878,0.002893,0.002115,0.001524,0.001051,0.000671,0.00045,0.000625,1.902835,3.047979,1.093098,0.678502,6.263652,5.064328,6.08935,0.218739,0.225229,31010000000,3,1,1,0,0,0,0.110484,0.10488,0.114556,0.109779,0.113148,0.111521,0.079701,0.076508,0.082723,0.076677,0.08011,0.081115,0.09602,0.099935,0.099389,0.093382,0.090551,0.097538,0.097187,0.087632,0.144313,0.134457,0.137944,0.142389,0.146086,0.133037,0.140701,0.141831,0.14765,0.138464,0.148322,0.130947,0.252211,0.262664,0.260956,0.269841,0.05563,0.055191,0.102803,0.102541,0.062231,0.06136,0.065625,0.064651,0.069564,0.066739,0.072847,0.066514,0.077404,0.066748,0.044748,0.039338,0.043689,0.038199,0.043957,0.03779,0.045136,0.039548,0.047201,0.03981,0.03459,0.028487,0.037403,0.030717,0.037843,0.032033,0.037373,0.032835,0.037257,0.033396,0.036524,0.032528,0.034773,0.032309,0.035785,0.031,0.035014,0.031318,0.034939,0.032377,0.346641,0.325448,0.331586,0.339861,0.327887,0.324714,0.317751,0.311084,0.308856,0.317134,0.301726,0.309212,0.306753,0.286443,0.295982,0.297858,0.296103,0.300799,0.304133,0.313682,0.231369,0.20423,0.229056,0.227568,0.244947,0.233191,0.264604,0.242791,0.270873,0.253454,0.270975,0.256713,0.270882,0.262503,0.276958,0.261459,0.274584,0.264172,0.255919,0.268078,0.31404,0.327284,0.295696,0.321909,0.296647,0.301631,0.267549,0.298868,0.261514,0.282506,0.406612,0.450992,0.399767,0.429765,0.234606,0.261561,0.217562,0.256986,0.204156,0.259699,0.01069,0.012549,0.009716,0.012725,0.009853,0.012137,0.009203,0.011004,0.007876,0.010153,0.010858,0.013807,0.007531,0.009867,0.009555,0.012953,0.010845,0.014128,0.009926,0.013363,0.011893,0.015301,0.011122,0.014949,0.011605,0.016298,0.009535,0.014755,0.0085,0.012638,0.007335,0.011298,0.005785,0.010472,0.005194,0.009021,0.003564,0.0065,0.002706,0.005385,0.002301,0.004758,0.001953,0.004174,0.001333,0.003645,0.000913,0.002966,0.000698,0.002195,0.000531,0.001584,0.000318,0.001207,0.00022,0.000831,9.3e-05,0.000578,5.9e-05,0.000391,8.9e-05,0.000536,0.963868,1.536616,0.572402,0.361501,3.137417,2.590169,2.89815,0.096052,0.085747,0.938968,1.511363,0.520697,0.317001,3.126235,2.474159,3.1912,0.122686,0.139482,12.241924,12.341792,24.581289,7.880003,7.865269,8.523345,8.523345,7.880902,7.880902,24.581289,1.0,12.0,121.0,72.0,52.0,72.0,52.0



Loading 1km census: inputs/cells_1km_with_binneds.parquet
  Total rows: 212,758
  Filtered to 165 1km cells

Saved filtered census to popsim/data/_census_*_filtered.parquet

SUMMARY
  100m cells in study area: 5,147
  1km cells in study area: 165
  MiD households: inputs/MiD2023_Haushalte.csv
  MiD persons: inputs/MiD2023_Personen.csv
  MiD filters: kernwo=[1, 2, 3], regiostar17=None
  RegioStar split mode: True

Set 'household_column' in Configuration and re-run Step 1,
or proceed to Step 2 if already set.

[Step 1/4] Complete.


## Step 2: Generate Geo Crosswalk and Control Totals

Creates the geographic hierarchy and control totals from filtered census data:
- `geo_cross_walk.csv` - mapping ZENSUS100m ‚Üí ZENSUS1km ‚Üí STAAT ‚Üí WELT
- `control_totals_*.csv` - one file per geography level

In [None]:
import pandas as pd
import numpy as np
import os
import re
import yaml
from unidecode import unidecode

print("[Step 2/4] Generating geo crosswalk and control totals...")
print("=" * 60)

if household_column is None:
    raise ValueError("household_column not set! Set it in Configuration and re-run Step 1.")

# Load filtered census from Step 1
census_100m = pd.read_parquet(f'{popsim_dir}/data/_census_100m_filtered.parquet')
census_1km = pd.read_parquet(f'{popsim_dir}/data/_census_1km_filtered.parquet')

print(f"Loaded {len(census_100m):,} 100m cells, {len(census_1km):,} 1km cells")

# Validate household column
if household_column not in census_100m.columns:
    raise ValueError(f"household_column '{household_column}' not found in census data.")

# Check household values
hh_values = census_100m[household_column]
if hh_values.isna().any():
    na_count = hh_values.isna().sum()
    print(f"WARNING: {na_count} cells have missing household values (will be set to 0)")
if (hh_values < 0).any():
    neg_count = (hh_values < 0).sum()
    print(f"WARNING: {neg_count} cells have negative household values")

# Helper to get 1km ID from 100m ID
def get_1km_from_100m(cell_id):
    """Convert 100m cell ID to corresponding 1km cell ID."""
    match = re.match(r'CRS3035RES100mN(\d+)E(\d+)', str(cell_id))
    if match:
        n, e = int(match.group(1)), int(match.group(2))
        n_1km = (n // 1000) * 1000
        e_1km = (e // 1000) * 1000
        return f"CRS3035RES1000mN{n_1km}E{e_1km}"
    return None

# Standardize column names
def clean_col_name(name):
    return unidecode(name).replace(" ", "").replace(".", "").replace(",", "").replace("-", "_")

# Rename columns
census_100m.columns = [clean_col_name(c) for c in census_100m.columns]
census_1km.columns = [clean_col_name(c) for c in census_1km.columns]
household_column_clean = clean_col_name(household_column)

# Find the ID columns after cleaning
id_col_100m_clean = census_100m.columns[0]
id_col_1km_clean = [c for c in census_1km.columns if '1km' in c.lower()][0]

# Create geo_cross_walk (hierarchy: ZENSUS100m -> ZENSUS1km -> STAAT -> WELT)
print("\nCreating geo_cross_walk...")
geo_cross = pd.DataFrame()
geo_cross['ZENSUS100m'] = census_100m[id_col_100m_clean]
geo_cross['ZENSUS1km'] = geo_cross['ZENSUS100m'].apply(get_1km_from_100m)
geo_cross['STAAT'] = 1
geo_cross['WELT'] = 1

geo_cross.to_csv(f'{popsim_dir}/data/geo_cross_walk.csv', index=False)
print(f"  Created {popsim_dir}/data/geo_cross_walk.csv ({len(geo_cross)} rows)")

# Create control_totals for 100m (lowest level)
print("\nCreating control totals...")

# Geography names (hierarchy from lowest to highest)
geo_names = ['ZENSUS100m', 'ZENSUS1km', 'STAAT', 'WELT']

# Add geography columns (NO rename of household column - keep original name)
census_100m = census_100m.rename(columns={id_col_100m_clean: 'ZENSUS100m'})
census_100m['ZENSUS1km'] = census_100m['ZENSUS100m'].apply(get_1km_from_100m)
census_100m['STAAT'] = 1
census_100m['WELT'] = 1

# Suffix non-geo columns
for col in census_100m.columns:
    if col not in geo_names:
        census_100m.rename(columns={col: f"{col}_ZENSUS100m"}, inplace=True)

# The household column with suffix
household_column_suffixed = f"{household_column_clean}_ZENSUS100m"

census_100m = census_100m.fillna(0)
census_100m.to_csv(f'{popsim_dir}/data/control_totals_ZENSUS100m.csv', index=False)
print(f"  Created {popsim_dir}/data/control_totals_ZENSUS100m.csv")

# Create control_totals for 1km
census_1km = census_1km.rename(columns={id_col_1km_clean: 'ZENSUS1km'})
census_1km['STAAT'] = 1
census_1km['WELT'] = 1

for col in census_1km.columns:
    if col not in geo_names:
        census_1km.rename(columns={col: f"{col}_ZENSUS1km"}, inplace=True)

census_1km = census_1km.fillna(0)
census_1km.to_csv(f'{popsim_dir}/data/control_totals_ZENSUS1km.csv', index=False)
print(f"  Created {popsim_dir}/data/control_totals_ZENSUS1km.csv")

# Create control_totals for STAAT
staat_df = pd.DataFrame({'STAAT': [1], 'WELT': [1]})
staat_df.to_csv(f'{popsim_dir}/data/control_totals_STAAT.csv', index=False)
print(f"  Created {popsim_dir}/data/control_totals_STAAT.csv")

# Create control_totals for WELT (top level)
welt_df = pd.DataFrame({'WELT': [1]})
welt_df.to_csv(f'{popsim_dir}/data/control_totals_WELT.csv', index=False)
print(f"  Created {popsim_dir}/data/control_totals_WELT.csv")

# Create controls template
print("\nCreating controls template...")
controls_rows = []
other_100m_rows = []

total_hh_control = None

# Add 100m controls - separate household from others
for col in census_100m.columns:
    if col not in geo_names:
        target = f"{col}_target"
        row = {
            'target': target,
            'geography': 'ZENSUS100m',
            'seed_table': '',
            'importance': '',
            'control_field': col,
            'expression': ''
        }
        # Identify household control by matching the suffixed household column name
        if col == household_column_suffixed:
            total_hh_control = target
            row['seed_table'] = 'households'
            row['importance'] = 1000
            row['expression'] = '(households.H_GEW > 0) & (households.H_GEW < np.inf)'
            controls_rows.insert(0, row)  # Insert at beginning
        else:
            other_100m_rows.append(row)

# Add other 100m rows after household
controls_rows.extend(other_100m_rows)

# Add 1km controls
for col in census_1km.columns:
    if col not in geo_names:
        controls_rows.append({
            'target': f"{col}_target",
            'geography': 'ZENSUS1km',
            'seed_table': '',
            'importance': '',
            'control_field': col,
            'expression': ''
        })

controls_df = pd.DataFrame(controls_rows)
controls_df.to_csv(f'{popsim_dir}/configs/_prep3_controls.csv', index=False, sep=intermediate_sep)
print(f"  Created {popsim_dir}/configs/_prep3_controls.csv ({len(controls_df)} controls, sep='{intermediate_sep}')")

if total_hh_control is None:
    raise ValueError(f"Could not find household control column '{household_column_suffixed}'!")

print(f"  Household control: {total_hh_control}")

# Update settings.yaml
print("\nUpdating PopSim configuration...")
with open(f'{popsim_dir}/configs/settings.yaml', 'r') as f:
    settings = yaml.safe_load(f)

# Geographies from top to bottom: WELT -> STAAT -> ZENSUS1km -> ZENSUS100m
settings['geographies'] = ['WELT', 'STAAT', 'ZENSUS1km', 'ZENSUS100m']
settings['seed_geography'] = seed_geography
settings['total_hh_control'] = total_hh_control

# Update input tables
idx = next((i for i, t in enumerate(settings['input_table_list']) if t['tablename'] == 'geo_cross_walk'), None)
if idx is not None:
    settings['input_table_list'] = settings['input_table_list'][:idx + 1]

for geo in ['ZENSUS100m', 'ZENSUS1km', 'STAAT', 'WELT']:
    settings['input_table_list'].append({
        'tablename': f'{geo}_control_data',
        'filename': f'control_totals_{geo}.csv'
    })

# Update output tables
if output_everything:
    settings['output_tables'] = {'action': 'skip', 'tables': 'geo_cross_walk'}
else:
    settings['output_tables'] = {
        'action': 'include',
        'tables': ['expanded_household_ids', 
                   'summary_ZENSUS100m', 'summary_ZENSUS1km', 'summary_STAAT', 'summary_WELT',
                   f'summary_ZENSUS100m_{seed_geography}']
    }

# Update models - add sub_balancing for each geography level below seed
settings['models'] = [m for m in settings['models'] if 'sub_balancing' not in m]
idx = settings['models'].index('integerize_final_seed_weights')
# Add sub_balancing for intermediate geographies (ZENSUS1km) then lowest (ZENSUS100m)
settings['models'].insert(idx + 1, 'sub_balancing.geography=ZENSUS1km')
settings['models'].insert(idx + 2, 'sub_balancing.geography=ZENSUS100m')

with open(f'{popsim_dir}/configs/settings.yaml', 'w') as f:
    yaml.dump(settings, f, default_flow_style=False)
print(f"  Updated {popsim_dir}/configs/settings.yaml")

# Update verification.yaml
with open(f'{popsim_dir}/scripts/verification.yaml', 'r') as f:
    verify = yaml.safe_load(f)

verify['group_geographies'] = ['WELT', 'STAAT', 'ZENSUS1km', 'ZENSUS100m']
verify['seed_cols']['geog'] = seed_geography
verify['summaries'] = [
    'output/final_summary_ZENSUS100m.csv',
    'output/final_summary_ZENSUS1km.csv',
    'output/final_summary_STAAT.csv',
    'output/final_summary_WELT.csv',
    f'output/final_summary_ZENSUS100m_{seed_geography}.csv'
]

with open(f'{popsim_dir}/scripts/verification.yaml', 'w') as f:
    yaml.dump(verify, f, default_flow_style=False)
print(f"  Updated {popsim_dir}/scripts/verification.yaml")

# Summary
print(f"\n{'='*60}")
print("SUMMARY")
print(f"{'='*60}")
total_hh = census_100m[household_column_suffixed].sum()
print(f"  Geographic hierarchy: WELT -> STAAT -> ZENSUS1km -> ZENSUS100m")
print(f"  Geographic units: {len(census_100m):,} (100m), {len(census_1km):,} (1km)")
print(f"  Total households: {total_hh:,.0f}")
print(f"  Household column: {household_column_suffixed}")
print(f"  Controls defined: {len(controls_df)}")
print(f"  Intermediate separator: '{intermediate_sep}'")
print(f"\nNext: Edit {popsim_dir}/configs/_prep3_controls.csv to add expressions")
print("for the controls you want, then run Step 3.")
print("\n[Step 2/4] Complete.")

## Step 3: Process Controls, Integerize, and Create PopSim Folders

1. Edit `popsim/configs/_prep3_controls.csv` to add expressions for the controls you want (done ONCE)
2. Run this cell to:
   - Filter census data to only columns needed by controls with expressions
   - **Smart integerize** 100m control totals (preserves 1km sums using largest remainder method)
   - If `regiostar_split=False`: Create single popsim folder with seed files
   - If `regiostar_split=True`: Create multiple `popsim_regiostar_{value}/` folders, each with census AND MiD filtered by that RegioStaR17 value

In [None]:
import pandas as pd
import numpy as np
import os
import re
import shutil
import yaml
from unidecode import unidecode

print("[Step 3/4] Processing controls, integerizing, and creating PopSim folders...")
print("=" * 60)

# Normalize filter lists
kernwo_list = kernwo if isinstance(kernwo, list) else ([kernwo] if kernwo else None)
regiostar17_list = regiostar17 if isinstance(regiostar17, list) else ([regiostar17] if regiostar17 else None)

# Load controls (intermediate file - use configured separator)
print(f"Loading controls template (separator: '{intermediate_sep}')...")
controls_df_full = pd.read_csv(f'{popsim_dir}/configs/_prep3_controls.csv', sep=intermediate_sep)
print(f"  Loaded {len(controls_df_full)} total controls from _prep3_controls.csv")

# Filter to controls that have expressions (these are the ones actually used)
controls_df = controls_df_full[controls_df_full['expression'].notna() & (controls_df_full['expression'] != '')].copy()
print(f"  {len(controls_df)} controls have expressions (will be used)")

# Extract control_field values - these are the census columns we actually need
needed_control_fields = set(controls_df['control_field'].tolist())
needed_100m_cols = {c for c in needed_control_fields if c.endswith('_ZENSUS100m')}
needed_1km_cols = {c for c in needed_control_fields if c.endswith('_ZENSUS1km')}
print(f"  100m columns needed: {len(needed_100m_cols)}")
print(f"  1km columns needed: {len(needed_1km_cols)}")

# =============================================================================
# SMART INTEGERIZATION FUNCTIONS
# =============================================================================

def get_1km_parent(id_100m: str) -> str:
    """Convert 100m cell ID to its parent 1km cell ID."""
    match = re.match(r'CRS3035RES100mN(\d+)E(\d+)', str(id_100m))
    if match:
        n, e = int(match.group(1)), int(match.group(2))
        n_1km = (n // 1000) * 1000
        e_1km = (e // 1000) * 1000
        return f'CRS3035RES1000mN{n_1km}E{e_1km}'
    return None

def largest_remainder_round(values: np.ndarray, target_sum: int) -> np.ndarray:
    """Distribute integer values using largest remainder method (Hamilton apportionment)."""
    if target_sum == 0 or len(values) == 0:
        return np.zeros(len(values), dtype=int)
    
    total = values.sum()
    if total == 0:
        return np.zeros(len(values), dtype=int)
    
    # Scale values to sum to target
    scaled = values * (target_sum / total)
    
    # Floor all values
    floored = np.floor(scaled).astype(int)
    
    # Calculate remainders and distribute deficit
    remainders = scaled - floored
    deficit = target_sum - floored.sum()
    
    if deficit > 0:
        indices = np.argsort(-remainders)[:deficit]
        floored[indices] += 1
    elif deficit < 0:
        indices = np.argsort(remainders)[:-deficit]
        floored[indices] -= 1
    
    return floored

def smart_integerize_column(df: pd.DataFrame, col: str, group_col: str = 'ZENSUS1km') -> pd.Series:
    """Integerize a single column, preserving 1km sums."""
    result = pd.Series(index=df.index, dtype=int)
    
    for group_id, group_df in df.groupby(group_col):
        values = group_df[col].values.astype(float)
        target = int(round(values.sum()))
        if target < 0:
            target = 0
        int_values = largest_remainder_round(values, target)
        result.loc[group_df.index] = int_values
    
    return result

def smart_integerize_census(df_100m: pd.DataFrame, cols_to_integerize: set, id_col: str) -> pd.DataFrame:
    """Smart integerize specified columns in 100m census data."""
    df = df_100m.copy()
    
    # Add 1km parent mapping
    df['_ZENSUS1km'] = df[id_col].apply(get_1km_parent)
    
    # Find which columns to integerize (intersection of needed and available)
    available_cols = set(df.columns)
    cols_to_process = cols_to_integerize.intersection(available_cols)
    
    if not cols_to_process:
        print("    No columns to integerize")
        df = df.drop(columns=['_ZENSUS1km'])
        return df
    
    print(f"    Integerizing {len(cols_to_process)} columns...")
    
    for i, col in enumerate(cols_to_process):
        if (i + 1) % 10 == 0 or i == 0:
            print(f"      Processing {i+1}/{len(cols_to_process)}: {col[:50]}...")
        df[col] = smart_integerize_column(df, col, '_ZENSUS1km')
    
    df = df.drop(columns=['_ZENSUS1km'])
    return df

# =============================================================================
# LOAD SEED DATA
# =============================================================================

print(f"\nLoading MiD seed data...")
print(f"  Households: {mid_households_path}")
print(f"  Persons: {mid_persons_path}")
seed_households_full = pd.read_csv(mid_households_path, sep=',')
seed_persons_full = pd.read_csv(mid_persons_path, sep=',')
print(f"  Loaded {len(seed_persons_full):,} persons, {len(seed_households_full):,} households")

# Apply kernwo filter globally (only if not None) - this is done once
if kernwo_list:
    print(f"\nApplying kernwo filter:")
    persons_before = len(seed_persons_full)
    if 'kernwo' in seed_persons_full.columns:
        seed_persons_full = seed_persons_full[seed_persons_full['kernwo'].isin(kernwo_list)]
        print(f"  kernwo {kernwo_list}: {persons_before:,} -> {len(seed_persons_full):,} persons")

# Essential columns
essential_cols = {'H_ID', 'H_GEW', 'HP_ID', 'P_ID', 'P_GEW'}
needed_cols = essential_cols.copy()

# Extract columns from expressions
pattern = r'\.(?P<col>[A-Za-z_][A-Za-z0-9_]*)'
for expr in controls_df['expression'].dropna():
    for match in re.finditer(pattern, str(expr)):
        needed_cols.add(match.group('col'))

print(f"\nColumns needed from expressions: {needed_cols - essential_cols}")

# Standardize column names helper
def clean_col_name(name):
    return unidecode(name).replace(" ", "").replace(".", "").replace(",", "").replace("-", "_")

# Compute the suffixed household column name (same logic as Step 2)
household_column_clean = clean_col_name(household_column)
household_column_suffixed = f"{household_column_clean}_ZENSUS100m"

# Helper function to process and save seed data for a specific folder
def create_popsim_folder(output_dir, census_100m_filtered, census_1km_filtered, 
                         seed_persons_filtered, seed_households_filtered,
                         apply_integerization=True):
    """Create a complete popsim folder with all required files."""
    
    # Create directory structure
    os.makedirs(f"{output_dir}/data", exist_ok=True)
    os.makedirs(f"{output_dir}/configs", exist_ok=True)
    os.makedirs(f"{output_dir}/scripts", exist_ok=True)
    os.makedirs(f"{output_dir}/output", exist_ok=True)
    
    # Helper to get 1km ID from 100m ID
    def get_1km_from_100m(cell_id):
        match = re.match(r'CRS3035RES100mN(\d+)E(\d+)', str(cell_id))
        if match:
            n, e = int(match.group(1)), int(match.group(2))
            n_1km = (n // 1000) * 1000
            e_1km = (e // 1000) * 1000
            return f"CRS3035RES1000mN{n_1km}E{e_1km}"
        return None
    
    # Process census 100m
    census_100m_proc = census_100m_filtered.copy()
    census_100m_proc.columns = [clean_col_name(c) for c in census_100m_proc.columns]
    
    id_col_100m_clean = census_100m_proc.columns[0]
    
    # Geography names
    geo_names = ['ZENSUS100m', 'ZENSUS1km', 'STAAT', 'WELT']
    
    # Create geo_cross_walk
    geo_cross = pd.DataFrame()
    geo_cross['ZENSUS100m'] = census_100m_proc[id_col_100m_clean]
    geo_cross['ZENSUS1km'] = geo_cross['ZENSUS100m'].apply(get_1km_from_100m)
    geo_cross['STAAT'] = 1
    geo_cross['WELT'] = 1
    geo_cross.to_csv(f'{output_dir}/data/geo_cross_walk.csv', index=False)
    
    # Add geography columns
    census_100m_proc = census_100m_proc.rename(columns={id_col_100m_clean: 'ZENSUS100m'})
    census_100m_proc['ZENSUS1km'] = census_100m_proc['ZENSUS100m'].apply(get_1km_from_100m)
    census_100m_proc['STAAT'] = 1
    census_100m_proc['WELT'] = 1
    
    # Suffix non-geo columns
    for col in list(census_100m_proc.columns):
        if col not in geo_names:
            census_100m_proc.rename(columns={col: f"{col}_ZENSUS100m"}, inplace=True)
    
    census_100m_proc = census_100m_proc.fillna(0)
    
    # Smart integerize 100m control columns (only the ones needed)
    if apply_integerization:
        print(f"  Smart integerizing 100m control totals...")
        census_100m_proc = smart_integerize_census(census_100m_proc, needed_100m_cols, 'ZENSUS100m')
    
    census_100m_proc.to_csv(f'{output_dir}/data/control_totals_ZENSUS100m.csv', index=False)
    
    # Process census 1km
    census_1km_proc = census_1km_filtered.copy()
    census_1km_proc.columns = [clean_col_name(c) for c in census_1km_proc.columns]
    id_col_1km_clean = [c for c in census_1km_proc.columns if '1km' in c.lower()][0]
    
    census_1km_proc = census_1km_proc.rename(columns={id_col_1km_clean: 'ZENSUS1km'})
    census_1km_proc['STAAT'] = 1
    census_1km_proc['WELT'] = 1
    
    for col in list(census_1km_proc.columns):
        if col not in geo_names:
            census_1km_proc.rename(columns={col: f"{col}_ZENSUS1km"}, inplace=True)
    
    census_1km_proc = census_1km_proc.fillna(0)
    
    # Integerize 1km columns (simple rounding since no sub-cells)
    if apply_integerization:
        for col in needed_1km_cols:
            if col in census_1km_proc.columns:
                census_1km_proc[col] = census_1km_proc[col].round().astype(int)
    
    census_1km_proc.to_csv(f'{output_dir}/data/control_totals_ZENSUS1km.csv', index=False)
    
    # Create STAAT and WELT control totals
    staat_df = pd.DataFrame({'STAAT': [1], 'WELT': [1]})
    staat_df.to_csv(f'{output_dir}/data/control_totals_STAAT.csv', index=False)
    
    welt_df = pd.DataFrame({'WELT': [1]})
    welt_df.to_csv(f'{output_dir}/data/control_totals_WELT.csv', index=False)
    
    # Process seed data - filter to needed columns
    p_cols = list(needed_cols.intersection(seed_persons_filtered.columns))
    h_cols = list(needed_cols.intersection(seed_households_filtered.columns))
    
    seed_persons_out = seed_persons_filtered[p_cols].copy()
    seed_households_out = seed_households_filtered[h_cols].copy()
    
    # Add STAAT geography
    seed_persons_out['STAAT'] = 1
    seed_households_out['STAAT'] = 1
    
    # Save seed files (comma-separated for PopSim)
    seed_persons_out.to_csv(f'{output_dir}/data/seed_persons.csv', index=False)
    seed_households_out.to_csv(f'{output_dir}/data/seed_households.csv', index=False)
    
    # Copy controls.csv (only rows with expressions)
    controls_df.to_csv(f'{output_dir}/configs/controls.csv', index=False)
    
    # Copy and adapt settings.yaml from base popsim folder
    shutil.copy(f'{popsim_dir}/configs/settings.yaml', f'{output_dir}/configs/settings.yaml')
    
    # Copy other config files if they exist
    for config_file in ['logging.yaml']:
        src = f'{popsim_dir}/configs/{config_file}'
        if os.path.exists(src):
            shutil.copy(src, f'{output_dir}/configs/{config_file}')
    
    # Copy scripts
    for script_file in ['verification.yaml']:
        src = f'{popsim_dir}/scripts/{script_file}'
        if os.path.exists(src):
            shutil.copy(src, f'{output_dir}/scripts/{script_file}')
    
    # Copy run script
    run_script = f'{popsim_dir}/run_populationsim.py'
    if os.path.exists(run_script):
        shutil.copy(run_script, f'{output_dir}/run_populationsim.py')
    
    return {
        'cells_100m': len(census_100m_proc),
        'cells_1km': len(census_1km_proc),
        'households': census_100m_proc[household_column_suffixed].sum(),
        'seed_persons': len(seed_persons_out),
        'seed_households': len(seed_households_out)
    }

# =============================================================================
# MAIN LOGIC: Split by RegioStar or single folder
# =============================================================================

if regiostar_split:
    print(f"\n{'='*60}")
    print("REGIOSTAR SPLIT MODE")
    print(f"{'='*60}")
    
    # Load the census data (already spatially filtered in Step 1)
    census_100m_base = pd.read_parquet(f'{popsim_dir}/data/_census_100m_filtered.parquet')
    census_1km_base = pd.read_parquet(f'{popsim_dir}/data/_census_1km_filtered.parquet')
    
    # Get RegioStaR17 column name from 100m census (may vary in case)
    regiostar_col_100m = [c for c in census_100m_base.columns if c.lower() == 'regiostar17'][0]
    
    # Get unique RegioStaR17 values from census (these are the ones in the study area)
    unique_regiostar = sorted(census_100m_base[regiostar_col_100m].dropna().unique())
    print(f"\nFound {len(unique_regiostar)} unique RegioStaR17 values in study area: {unique_regiostar}")
    
    # Check MiD has RegioStaR17 column
    if 'RegioStaR17' not in seed_persons_full.columns:
        raise ValueError("MiD persons data does not have 'RegioStaR17' column!")
    if 'RegioStaR17' not in seed_households_full.columns:
        raise ValueError("MiD households data does not have 'RegioStaR17' column!")
    
    created_folders = []
    
    for rs_value in unique_regiostar:
        rs_int = int(rs_value)  # Convert from float if needed
        folder_name = f"popsim_regiostar_{rs_int}"
        print(f"\n--- Creating {folder_name} ---")
        
        # Filter census by RegioStaR17
        census_100m_rs = census_100m_base[census_100m_base[regiostar_col_100m] == rs_value].copy()
        
        # Get 1km cells that correspond to the filtered 100m cells
        def get_1km_from_100m_id(cell_id):
            match = re.match(r'CRS3035RES100mN(\d+)E(\d+)', str(cell_id))
            if match:
                n, e = int(match.group(1)), int(match.group(2))
                n_1km = (n // 1000) * 1000
                e_1km = (e // 1000) * 1000
                return f"CRS3035RES1000mN{n_1km}E{e_1km}"
            return None
        
        id_col_100m = census_100m_base.columns[0]
        km_ids_needed = set(census_100m_rs[id_col_100m].apply(get_1km_from_100m_id).dropna())
        id_col_1km = census_1km_base.columns[1]
        census_1km_rs = census_1km_base[census_1km_base[id_col_1km].isin(km_ids_needed)].copy()
        
        # Filter MiD by RegioStaR17
        seed_persons_rs = seed_persons_full[seed_persons_full['RegioStaR17'] == rs_int].copy()
        seed_households_rs = seed_households_full[seed_households_full['RegioStaR17'] == rs_int].copy()
        
        print(f"  Census: {len(census_100m_rs)} 100m cells, {len(census_1km_rs)} 1km cells")
        print(f"  MiD: {len(seed_persons_rs):,} persons, {len(seed_households_rs):,} households")
        
        if len(census_100m_rs) == 0:
            print(f"  WARNING: No census cells for RegioStaR17={rs_int}, skipping!")
            continue
        
        if len(seed_persons_rs) == 0 or len(seed_households_rs) == 0:
            print(f"  WARNING: No MiD data for RegioStaR17={rs_int}, skipping!")
            continue
        
        # Create the folder (with smart integerization)
        stats = create_popsim_folder(
            folder_name, 
            census_100m_rs, 
            census_1km_rs, 
            seed_persons_rs, 
            seed_households_rs,
            apply_integerization=True
        )
        
        created_folders.append({
            'folder': folder_name,
            'regiostar17': rs_int,
            **stats
        })
        print(f"  Created {folder_name}/ with {stats['cells_100m']} cells, {stats['households']:.0f} target households")
    
    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY - RegioStar Split Mode")
    print(f"{'='*60}")
    print(f"\nCreated {len(created_folders)} popsim folders:")
    for info in created_folders:
        print(f"  {info['folder']}: RS17={info['regiostar17']}, "
              f"{info['cells_100m']} cells, {info['households']:.0f} HH, "
              f"{info['seed_persons']:,} seed persons")

else:
    # Original single-folder behavior
    print(f"\n{'='*60}")
    print("SINGLE FOLDER MODE")
    print(f"{'='*60}")
    
    # Apply regiostar17 filter if specified (only in single mode)
    seed_persons = seed_persons_full.copy()
    seed_households = seed_households_full.copy()
    
    if regiostar17_list:
        print(f"\nApplying regiostar17 filter:")
        persons_before = len(seed_persons)
        households_before = len(seed_households)
        if 'RegioStaR17' in seed_persons.columns:
            seed_persons = seed_persons[seed_persons['RegioStaR17'].isin(regiostar17_list)]
        if 'RegioStaR17' in seed_households.columns:
            seed_households = seed_households[seed_households['RegioStaR17'].isin(regiostar17_list)]
        print(f"  regiostar17 {regiostar17_list}: {len(seed_persons):,} persons, {len(seed_households):,} households")
    
    print(f"\nFinal counts:")
    print(f"  Persons: {len(seed_persons):,}")
    print(f"  Households: {len(seed_households):,}")
    
    # Load census data
    census_100m = pd.read_parquet(f'{popsim_dir}/data/_census_100m_filtered.parquet')
    census_1km = pd.read_parquet(f'{popsim_dir}/data/_census_1km_filtered.parquet')
    
    # Create the folder (with smart integerization)
    stats = create_popsim_folder(
        popsim_dir, 
        census_100m, 
        census_1km, 
        seed_persons, 
        seed_households,
        apply_integerization=True
    )
    
    print(f"\nCreated (all comma-separated for PopSim):")
    print(f"  {popsim_dir}/data/seed_persons.csv ({stats['seed_persons']} rows)")
    print(f"  {popsim_dir}/data/seed_households.csv ({stats['seed_households']} rows)")
    print(f"  {popsim_dir}/configs/controls.csv ({len(controls_df)} controls)")

print("\n[Step 3/4] Complete.")

## Step 4: Validate and Run

Validates the setup and provides instructions for running PopSim.

In [None]:
import os
import glob
import json
import yaml
import pandas as pd

print("[Step 4/4] Validating setup...")
print("=" * 60)

def validate_popsim_folder(folder_path, folder_name):
    """Validate a single popsim folder and return errors list."""
    errors = []
    
    required_files = [
        'data/geo_cross_walk.csv',
        'data/seed_persons.csv',
        'data/seed_households.csv',
        'data/control_totals_ZENSUS100m.csv',
        'data/control_totals_ZENSUS1km.csv',
        'data/control_totals_STAAT.csv',
        'data/control_totals_WELT.csv',
        'configs/settings.yaml',
        'configs/controls.csv',
    ]
    
    print(f"\nChecking {folder_name}...")
    for f in required_files:
        full_path = f"{folder_path}/{f}"
        if os.path.exists(full_path):
            size = os.path.getsize(full_path)
            print(f"  [OK] {f} ({size:,} bytes)")
        else:
            print(f"  [MISSING] {f}")
            errors.append(f"Missing: {f}")
    
    # Check controls
    controls_path = f"{folder_path}/configs/controls.csv"
    if os.path.exists(controls_path):
        try:
            controls = pd.read_csv(controls_path)
            empty = controls['expression'].isna().sum()
            if empty > 0:
                errors.append(f"{empty} controls missing expressions")
            else:
                print(f"  {len(controls)} controls, all have expressions")
        except Exception as e:
            errors.append(f"Error reading controls: {e}")
    
    # Check settings
    settings_path = f"{folder_path}/configs/settings.yaml"
    if os.path.exists(settings_path):
        try:
            with open(settings_path) as f:
                settings = yaml.safe_load(f)
            print(f"  Geographies: {settings.get('geographies')}")
            print(f"  Total HH control: {settings.get('total_hh_control')}")
        except Exception as e:
            errors.append(f"Error reading settings: {e}")
    
    return errors

# =============================================================================
# VALIDATION
# =============================================================================

all_errors = {}

if regiostar_split:
    # Find all popsim_regiostar_* folders
    regiostar_folders = sorted(glob.glob("popsim_regiostar_*"))
    
    if not regiostar_folders:
        print("\nWARNING: No popsim_regiostar_* folders found!")
        print("Run Step 3 first to create them.")
    else:
        print(f"\nFound {len(regiostar_folders)} RegioStar folders to validate")
        
        for folder in regiostar_folders:
            folder_errors = validate_popsim_folder(folder, folder)
            if folder_errors:
                all_errors[folder] = folder_errors
        
        # Summary
        print(f"\n{'='*60}")
        if all_errors:
            print("VALIDATION FAILED")
            for folder, errors in all_errors.items():
                print(f"\n  {folder}:")
                for e in errors:
                    print(f"    - {e}")
        else:
            print("VALIDATION PASSED")
            print(f"\n{len(regiostar_folders)} folders ready to run PopSim.")
            print("\nTo run all folders:")
            print("  for d in popsim_regiostar_*/; do")
            print("    echo \"Running $d...\"")
            print("    (cd \"$d\" && python run_populationsim.py)")
            print("  done")
else:
    # Single folder validation
    folder_errors = validate_popsim_folder(popsim_dir, popsim_dir)
    if folder_errors:
        all_errors[popsim_dir] = folder_errors
    
    # Summary
    print(f"\n{'='*60}")
    if all_errors:
        print("VALIDATION FAILED")
        for e in all_errors[popsim_dir]:
            print(f"  - {e}")
    else:
        print("VALIDATION PASSED")
        print(f"\nReady to run PopSim:")
        print(f"  cd {popsim_dir}")
        print("  conda activate popsim")
        print("  python run_populationsim.py")

print(f"{'='*60}")
print("\n[Step 4/4] Complete.")

## Utilities: Reset

Clean up generated files to start fresh.

In [None]:
import os
import glob
import shutil

def reset(confirm=False, include_regiostar_folders=False):
    """Delete all generated files.
    
    Args:
        confirm: Set to True to actually delete files
        include_regiostar_folders: Set to True to also delete popsim_regiostar_* folders
    """
    files = [
        f'{popsim_dir}/data/geo_cross_walk.csv',
        f'{popsim_dir}/data/seed_persons.csv',
        f'{popsim_dir}/data/seed_households.csv',
        f'{popsim_dir}/data/control_totals_ZENSUS100m.csv',
        f'{popsim_dir}/data/control_totals_ZENSUS1km.csv',
        f'{popsim_dir}/data/control_totals_STAAT.csv',
        f'{popsim_dir}/data/control_totals_WELT.csv',
        f'{popsim_dir}/data/_census_100m_filtered.parquet',
        f'{popsim_dir}/data/_census_1km_filtered.parquet',
        f'{popsim_dir}/configs/controls.csv',
        f'{popsim_dir}/configs/_prep3_controls.csv',
    ]
    
    existing_files = [f for f in files if os.path.exists(f)]
    
    # Find RegioStar folders
    regiostar_folders = sorted(glob.glob("popsim_regiostar_*")) if include_regiostar_folders else []
    
    if not existing_files and not regiostar_folders:
        print("No files to delete.")
        return
    
    if existing_files:
        print("Files to delete:")
        for f in existing_files:
            print(f"  {f}")
    
    if regiostar_folders:
        print(f"\nRegioStar folders to delete ({len(regiostar_folders)}):")
        for f in regiostar_folders:
            print(f"  {f}/")
    
    if not confirm:
        cmd = "reset(confirm=True"
        if regiostar_folders:
            cmd += ", include_regiostar_folders=True"
        cmd += ")"
        print(f"\nRun {cmd} to delete.")
        return
    
    for f in existing_files:
        os.remove(f)
        print(f"Deleted: {f}")
    
    for folder in regiostar_folders:
        shutil.rmtree(folder)
        print(f"Deleted: {folder}/")
    
    print("\nReset complete.")

# Show what would be deleted
reset(confirm=False, include_regiostar_folders=True)