In [1]:
# ============================================================
# PIPELINE S — BLOCK S1
# IBGE Census Tracts — Brazil (2022)
# Canonical spatial base for Paper 3
# ============================================================

import os
import json
import time
import geopandas as gpd

# ------------------------------------------------------------
# CONFIGURATION
# ------------------------------------------------------------

INPUT_GPKG = (
    "/Users/rafaelalbuquerque/Desktop/"
    "Census Tract GeoJson/BR_setores_CD2022.gpkg"
)

OUTPUT_DIR = (
    "/Users/rafaelalbuquerque/Desktop/"
    "Output Pipeline S (Shapefiles)/S1"
)
os.makedirs(OUTPUT_DIR, exist_ok=True)

OUTPUT_GPKG = os.path.join(
    OUTPUT_DIR,
    "census_tracts_brazil_ibge_2022.gpkg"
)

QC_REPORT = os.path.join(
    OUTPUT_DIR,
    "spatial_qc_report.json"
)

start_time = time.time()

print("[INFO] Block S1 starting — IBGE official census tracts (Brazil, 2022)")
print(f" - Input GPKG      : {INPUT_GPKG}")
print(f" - Output geometry : {OUTPUT_GPKG}")
print(f" - QC report       : {QC_REPORT}")

# ------------------------------------------------------------
# STEP 1 — LOAD GEOMETRIES
# ------------------------------------------------------------

print("\n[STEP 1/4] Reading IBGE GeoPackage...")

if not os.path.exists(INPUT_GPKG):
    raise FileNotFoundError(f"GPKG not found: {INPUT_GPKG}")

# Robust engine fallback (macOS-safe)
try:
    gdf = gpd.read_file(INPUT_GPKG, engine="pyogrio")
    engine_used = "pyogrio"
except Exception as e:
    print("[WARN] pyogrio failed — falling back to fiona")
    gdf = gpd.read_file(INPUT_GPKG, engine="fiona")
    engine_used = "fiona"

print(f"[INFO] Engine used: {engine_used}")
print(f"[INFO] Total geometries loaded: {len(gdf):,}")
print(f"[INFO] Columns detected: {list(gdf.columns)}")

# ------------------------------------------------------------
# STEP 2 — STRUCTURAL & ID CHECKS
# ------------------------------------------------------------

print("\n[STEP 2/4] Structural checks...")

if "CD_SETOR" not in gdf.columns:
    raise ValueError(
        "Expected column 'CD_SETOR' not found. "
        "Verify IBGE layer integrity."
    )

# Canonical census tract ID
gdf["ct_id"] = gdf["CD_SETOR"].astype(str)

invalid_geoms = (~gdf.geometry.is_valid).sum()
empty_geoms = gdf.geometry.is_empty.sum()

print(f"[INFO] Invalid geometries : {invalid_geoms}")
print(f"[INFO] Empty geometries   : {empty_geoms}")
print(f"[INFO] CRS                : {gdf.crs}")

# ------------------------------------------------------------
# STEP 3 — QUALITY CONTROL METRICS
# ------------------------------------------------------------

print("\n[STEP 3/4] Building QC report...")

qc_report = {
    "source": "IBGE — Malha de Setores Censitários 2022",
    "engine_used": engine_used,
    "n_geometries": int(len(gdf)),
    "n_unique_ct_id": int(gdf["ct_id"].nunique()),
    "invalid_geometries": int(invalid_geoms),
    "empty_geometries": int(empty_geoms),
    "crs": str(gdf.crs),
    "columns": list(gdf.columns),
}

# ------------------------------------------------------------
# STEP 4 — WRITE CANONICAL OUTPUT
# ------------------------------------------------------------

print("\n[STEP 4/4] Writing canonical GeoPackage...")

gdf_out = gdf[["ct_id", "geometry"]].copy()

gdf_out.to_file(
    OUTPUT_GPKG,
    driver="GPKG"
)

with open(QC_REPORT, "w") as f:
    json.dump(qc_report, f, indent=2)

elapsed = time.time() - start_time

print("\n[DONE] Block S1 completed successfully.")
print(f" - Output geometry : {OUTPUT_GPKG}")
print(f" - QC report       : {QC_REPORT}")
print(f" - Runtime         : {elapsed:.2f} seconds")

[INFO] Block S1 starting — IBGE official census tracts (Brazil, 2022)
 - Input GPKG      : /Users/rafaelalbuquerque/Desktop/Census Tract GeoJson/BR_setores_CD2022.gpkg
 - Output geometry : /Users/rafaelalbuquerque/Desktop/Output Pipeline S (Shapefiles)/S1/census_tracts_brazil_ibge_2022.gpkg
 - QC report       : /Users/rafaelalbuquerque/Desktop/Output Pipeline S (Shapefiles)/S1/spatial_qc_report.json

[STEP 1/4] Reading IBGE GeoPackage...
[INFO] Engine used: pyogrio
[INFO] Total geometries loaded: 472,780
[INFO] Columns detected: ['CD_SETOR', 'SITUACAO', 'CD_SIT', 'CD_TIPO', 'AREA_KM2', 'CD_REGIAO', 'NM_REGIAO', 'CD_UF', 'NM_UF', 'CD_MUN', 'NM_MUN', 'CD_DIST', 'NM_DIST', 'CD_SUBDIST', 'NM_SUBDIST', 'CD_BAIRRO', 'NM_BAIRRO', 'CD_NU', 'NM_NU', 'CD_FCU', 'NM_FCU', 'CD_AGLOM', 'NM_AGLOM', 'CD_RGINT', 'NM_RGINT', 'CD_RGI', 'NM_RGI', 'CD_CONCURB', 'NM_CONCURB', 'geometry']

[STEP 2/4] Structural checks...
[INFO] Invalid geometries : 30
[INFO] Empty geometries   : 0
[INFO] CRS                :

In [5]:
# ============================================================
# PIPELINE S — BLOCK S2
# Spatial Anchoring of Mobility Infrastructure Index (MII)
# Many-to-One Join (Geometry → Mobility)
# ============================================================

import os
import json
import time
import pandas as pd
import geopandas as gpd

# ------------------------------------------------------------
# CONFIGURATION
# ------------------------------------------------------------

MOBILITY_INPUT = (
    "/Users/rafaelalbuquerque/Desktop/"
    "Output Pipeline A (Mobility)/A4/"
    "mobility_by_tract_aug2024_with_mii_FINAL.csv.gz"
)

GEOMETRY_INPUT = (
    "/Users/rafaelalbuquerque/Desktop/"
    "Output Pipeline S (Shapefiles)/S1/"
    "census_tracts_brazil_ibge_2022.gpkg"
)

OUTPUT_DIR = (
    "/Users/rafaelalbuquerque/Desktop/"
    "Output Pipeline S (Shapefiles)/S2"
)
os.makedirs(OUTPUT_DIR, exist_ok=True)

OUTPUT_GPKG = os.path.join(
    OUTPUT_DIR,
    "census_tracts_brazil_mobility_mii.gpkg"
)

QC_REPORT = os.path.join(
    OUTPUT_DIR,
    "spatial_join_qc_report.json"
)

start_time = time.time()

print("[INFO] Block S2 starting — Spatial anchoring of MII (m:1 join)")
print(f" - Mobility input : {MOBILITY_INPUT}")
print(f" - Geometry input : {GEOMETRY_INPUT}")
print(f" - Output GPKG    : {OUTPUT_GPKG}")

# ------------------------------------------------------------
# STEP 1 — LOAD MOBILITY DATA
# ------------------------------------------------------------

print("\n[STEP 1/4] Loading mobility data...")

mob = pd.read_csv(
    MOBILITY_INPUT,
    compression="gzip",
    dtype={"ct_id": str}
)

if "mii" not in mob.columns:
    raise ValueError("Column 'mii' not found in mobility data.")

print(f"[INFO] Mobility rows loaded: {len(mob):,}")

# ------------------------------------------------------------
# STEP 2 — LOAD GEOMETRIES
# ------------------------------------------------------------

print("\n[STEP 2/4] Loading census tract geometries...")

gdf = gpd.read_file(GEOMETRY_INPUT)
gdf["ct_id"] = gdf["ct_id"].astype(str)

print(f"[INFO] Geometry rows loaded: {len(gdf):,}")
print(f"[INFO] CRS: {gdf.crs}")

# ------------------------------------------------------------
# STEP 3 — MANY-TO-ONE ATTRIBUTE JOIN (CORRECT)
# ------------------------------------------------------------

print("\n[STEP 3/4] Joining mobility to geometries (m:1)...")

gdf_merged = gdf.merge(
    mob,
    on="ct_id",
    how="left",
    validate="m:1"   # ← THIS IS THE CORRECT LOGIC
)

matched = int(gdf_merged["mii"].notna().sum())
unmatched = int(gdf_merged["mii"].isna().sum())

print(f"[INFO] Geometries with MII   : {matched:,}")
print(f"[INFO] Geometries without MII: {unmatched:,}")

# ------------------------------------------------------------
# STEP 3a — SPATIAL DESCRIPTORS (MINIMAL)
# ------------------------------------------------------------

print("\n[STEP 3a/4] Computing area and MII density...")

gdf_metric = gdf_merged.to_crs(epsg=5880)
gdf_merged["area_km2"] = gdf_metric.geometry.area / 1e6
gdf_merged["mii_density"] = gdf_merged["mii"] / gdf_merged["area_km2"]

# ------------------------------------------------------------
# STEP 4 — SAVE OUTPUTS
# ------------------------------------------------------------

print("\n[STEP 4/4] Writing outputs...")

gdf_merged.to_file(
    OUTPUT_GPKG,
    driver="GPKG"
)

qc = {
    "mobility_rows": int(len(mob)),
    "geometry_rows": int(len(gdf)),
    "matched_geometries": matched,
    "unmatched_geometries": unmatched,
    "join_type": "many-to-one (geometry → mobility)",
    "crs": str(gdf.crs),
    "derived_metrics": ["area_km2", "mii_density"]
}

with open(QC_REPORT, "w") as f:
    json.dump(qc, f, indent=2)

elapsed = time.time() - start_time

print("\n[DONE] Block S2 completed successfully.")
print(f" - Output: {OUTPUT_GPKG}")
print(f" - QC    : {QC_REPORT}")
print(f" - Time  : {elapsed:.2f} seconds")

[INFO] Block S2 starting — Spatial anchoring of MII (m:1 join)
 - Mobility input : /Users/rafaelalbuquerque/Desktop/Output Pipeline A (Mobility)/A4/mobility_by_tract_aug2024_with_mii_FINAL.csv.gz
 - Geometry input : /Users/rafaelalbuquerque/Desktop/Output Pipeline S (Shapefiles)/S1/census_tracts_brazil_ibge_2022.gpkg
 - Output GPKG    : /Users/rafaelalbuquerque/Desktop/Output Pipeline S (Shapefiles)/S2/census_tracts_brazil_mobility_mii.gpkg

[STEP 1/4] Loading mobility data...
[INFO] Mobility rows loaded: 436,868

[STEP 2/4] Loading census tract geometries...
[INFO] Geometry rows loaded: 472,780
[INFO] CRS: EPSG:4674

[STEP 3/4] Joining mobility to geometries (m:1)...
[INFO] Geometries with MII   : 398,774
[INFO] Geometries without MII: 74,006

[STEP 3a/4] Computing area and MII density...

[STEP 4/4] Writing outputs...

[DONE] Block S2 completed successfully.
 - Output: /Users/rafaelalbuquerque/Desktop/Output Pipeline S (Shapefiles)/S2/census_tracts_brazil_mobility_mii.gpkg
 - QC    :