In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Load a shapefile into BigQuery with a GEOGRAPHY column.
- Reprojects to EPSG:4326
- Repairs invalid polygons
- Loads via NDJSON to avoid CSV quoting issues
"""

import os, re, sys, json, tempfile
import pandas as pd
import geopandas as gpd
from shapely.geometry import mapping
from shapely.validation import make_valid
from google.cloud import bigquery
from google.api_core.exceptions import NotFound, Forbidden, BadRequest
import google.auth

# -----------------------
# CONFIG — edit as needed
# -----------------------
sys.path.insert(0, '../../2-Python/global-functions')
import BigQuery  # provided by you

# Source path handed to gpd.read_file() further down. It names the .dbf
# component of the shapefile set.
# NOTE(review): presumably the reader resolves the sibling .shp/.prj files from
# this path — confirm, or point at the .shp directly.
SHAPEFILE = r"M:\GitHub\WF-TDM-v9x\1_Inputs\1_TAZ\Districts\Dist_Small.dbf"
# Fully qualified destination; the helpers below split it on "." and expect
# exactly three parts (project, dataset, table).
TARGET_TABLE = "wfrc-data.personal_vehicle.small_district_geo"
# Passed to the load job as its write_disposition.
WRITE_DISPOSITION = bigquery.WriteDisposition.WRITE_TRUNCATE

# -----------------------
# Helpers
# -----------------------
def bq_safe_name(s: str) -> str:
    """Return a lower-cased, underscore-separated column name derived from *s*.

    Runs of non-word characters become a single underscore, repeated
    underscores are collapsed, and leading/trailing underscores are removed.
    If the result is empty or does not start with an ASCII letter or
    underscore, it is prefixed with "_" (an empty result becomes "_col").
    """
    collapsed = re.sub(r"[^\w]+", "_", s.strip())
    trimmed = re.sub(r"__+", "_", collapsed).strip("_")
    if trimmed and re.match(r"^[A-Za-z_]", trimmed):
        return trimmed.lower()
    # Empty, or starts with a digit / non-ASCII character: add a safe prefix.
    fallback = "_" + (trimmed or "col")
    return fallback.lower()

def infer_bq_type(dtype) -> str:
    """Map a pandas/numpy dtype to a BigQuery column type name.

    Checks are tried in order (integer, float, bool, datetime); anything
    that matches none of them is reported as "STRING".
    """
    type_checks = (
        (pd.api.types.is_integer_dtype, "INT64"),
        (pd.api.types.is_float_dtype, "FLOAT64"),
        (pd.api.types.is_bool_dtype, "BOOL"),
        (pd.api.types.is_datetime64_any_dtype, "TIMESTAMP"),
    )
    for matches, bq_type in type_checks:
        if matches(dtype):
            return bq_type
    return "STRING"

def ensure_dataset(client: bigquery.Client, full_table_id: str):
    """Create the dataset that holds *full_table_id* if it is not there yet.

    Args:
        client: BigQuery client used for the lookup and the creation.
        full_table_id: "project.dataset.table"; must have exactly three
            dot-separated parts, otherwise the unpacking raises ValueError.
    """
    project_id, dataset_name, _table_name = full_table_id.split(".")
    dataset = bigquery.Dataset(f"{project_id}.{dataset_name}")
    try:
        client.get_dataset(dataset)
    except NotFound:
        # exists_ok=True keeps this safe if the dataset appears in between.
        client.create_dataset(dataset, exists_ok=True)

def assert_can_create_table(client: bigquery.Client, full_table_id: str):
    """Verify that tables can be created in the dataset of *full_table_id*.

    A throw-away table named ``__perm_probe__`` is created in the dataset and
    deleted again.

    Args:
        client: BigQuery client whose credentials are being checked.
        full_table_id: "project.dataset.table"; must have exactly three
            dot-separated parts, otherwise the unpacking raises ValueError.

    Raises:
        RuntimeError: if the dataset does not exist, if creating the probe
            table is forbidden, or if the probe was created but deleting it
            is forbidden (in which case it has to be removed manually).
    """
    proj, dset, _ = full_table_id.split(".")
    ds_id = f"{proj}.{dset}"
    try:
        client.get_dataset(ds_id)
    except NotFound as err:
        raise RuntimeError(f"Dataset {ds_id} does not exist.") from err

    probe_id = f"{proj}.{dset}.__perm_probe__"
    probe = bigquery.Table(probe_id, schema=[bigquery.SchemaField("p", "STRING")])
    try:
        client.create_table(probe)
    except Forbidden as err:
        raise RuntimeError(
            f"No table-create rights on {ds_id}. Grant roles/bigquery.dataEditor."
        ) from err

    # Deletion is handled separately so that a delete failure is not reported
    # as a missing create permission.
    try:
        client.delete_table(probe, not_found_ok=True)
    except Forbidden as err:
        raise RuntimeError(
            f"Created probe table {probe_id} but could not delete it. "
            "Remove it manually and grant roles/bigquery.dataEditor."
        ) from err

def print_running_identity_from_client(client):
    """Print which identity and project *client* is operating with.

    The identity is read from the credentials object attached to the client
    itself; when no service-account e-mail is available the text
    "user credentials" is shown instead.
    """
    credentials = getattr(client, "_credentials", None)
    identity = getattr(credentials, "service_account_email", None)
    if not identity:
        identity = "user credentials"
    print(f"[INFO] Running as: {identity} | Project: {client.project}")

# Create the client first; everything below (identity report, dataset check,
# load job) uses this one object.
# Option A: call the project helper with no arguments.
# NOTE(review): presumably it picks the service-account JSON up from the
# environment — confirm against the BigQuery helper module.
client = BigQuery.getBigQueryClient_WfrcData()

# Option B: pass the SA JSON path explicitly
# client = BigQuery.getBigQueryClient_WfrcData(r"D:\keys\wfrc-data.json")

# Report the identity taken from the client actually in use.
print_running_identity_from_client(client)

# Pre-flight: create the dataset if it is missing, then confirm that tables
# can be created in it before any shapefile processing starts.
ensure_dataset(client, TARGET_TABLE)
assert_can_create_table(client, TARGET_TABLE)

# Read shapefile
print(f"[INFO] Reading shapefile: {SHAPEFILE}")
gdf = gpd.read_file(SHAPEFILE)
if gdf.empty:
    raise ValueError("The shapefile appears to be empty.")
if gdf.crs is None:
    raise ValueError("Shapefile has no CRS. Please define the source CRS before loading.")

# Reproject to EPSG:4326
if gdf.crs.to_epsg() != 4326:
    print("[INFO] Reprojecting to EPSG:4326…")
    gdf = gdf.to_crs(4326)

# Remember the row count before any filtering so the warning below reports
# every dropped row, whether it was null on input or empty after repair.
before = len(gdf)

# Remove null geometries BEFORE repairing: make_valid is applied per element
# and expects a geometry object, so a None entry would abort the whole run.
gdf = gdf[gdf.geometry.notna()].copy()

# Repair invalid geometries
print("[INFO] Repairing invalid geometries (make_valid + buffer(0))…")
gdf["geometry"] = gdf.geometry.apply(make_valid)
gdf["geometry"] = gdf.geometry.buffer(0)

# Drop rows whose geometry ended up empty (or null) after the repair
gdf = gdf[~gdf.geometry.is_empty & gdf.geometry.notna()].copy()
dropped = before - len(gdf)
if dropped:
    print(f"[WARN] Dropped {dropped} rows with empty/null geometry after repair.")

# Prepare attributes: everything except the geometry column, with column
# names rewritten by bq_safe_name().
attr_df = gdf.drop(columns=[c for c in gdf.columns if c.lower() == "geometry"], errors="ignore").copy()
safe_names = [bq_safe_name(c) for c in attr_df.columns]

# Sanitising can map different source names onto the same result. Duplicate
# column labels would break the per-column dtype lookup below, so fail early
# with a message that names the offenders.
clashing = sorted({name for name in safe_names if safe_names.count(name) > 1})
if clashing:
    raise ValueError(f"Column names collide after sanitising: {clashing}")
# "geom" is reserved for the geometry column added next; an attribute with
# that name would otherwise be overwritten without notice.
if "geom" in safe_names:
    raise ValueError(
        "An attribute column maps to the reserved name 'geom'; rename it in the source data."
    )

attr_df.columns = safe_names
# Geometry is carried as a GeoJSON string in the "geom" column.
attr_df["geom"] = gdf.geometry.apply(lambda geom: json.dumps(mapping(geom), ensure_ascii=False))

# Build schema: GEOGRAPHY for geom, inferred type for every attribute column
schema = [
    bigquery.SchemaField("geom", "GEOGRAPHY")
    if col == "geom"
    else bigquery.SchemaField(col, infer_bq_type(attr_df[col].dtype))
    for col in attr_df.columns
]

def _to_json_value(value):
    """Convert one cell value into something json.dumps can emit as valid JSON.

    Missing scalars (None, NaN, NaT, pd.NA) become None so they are written
    as JSON null rather than the non-standard token NaN. Date/time objects
    are written as ISO-8601 strings. Everything else is returned unchanged.
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    if hasattr(value, "isoformat"):
        return value.isoformat()
    return value


# Write NDJSON and load
tmp_path = None
try:
    print("[INFO] Writing NDJSON temp file…")
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8") as tmp:
        # Record the path first so the cleanup below also runs if a write fails.
        tmp_path = tmp.name
        for rec in attr_df.to_dict(orient="records"):
            row = {key: _to_json_value(val) for key, val in rec.items()}
            tmp.write(json.dumps(row, ensure_ascii=False) + "\n")

    print(f"[INFO] Loading into BigQuery table: {TARGET_TABLE}")
    job_config = bigquery.LoadJobConfig(
        schema=schema,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        write_disposition=WRITE_DISPOSITION,
        encoding="UTF-8",
    )

    with open(tmp_path, "rb") as f:
        load_job = client.load_table_from_file(f, TARGET_TABLE, job_config=job_config)
    load_job.result()
except BadRequest as e:
    print("[ERROR] BigQuery load failed.")
    if hasattr(e, "errors") and e.errors:
        for err in e.errors:
            print(" -", err.get("message"))
    raise
finally:
    # Best-effort cleanup: a leftover temp file must not mask the real outcome,
    # but it is reported instead of being ignored silently.
    if tmp_path is not None:
        try:
            os.remove(tmp_path)
        except OSError as cleanup_err:
            print(f"[WARN] Could not remove temp file {tmp_path}: {cleanup_err}")

tbl = client.get_table(TARGET_TABLE)
print(f"[SUCCESS] Loaded {tbl.num_rows} rows into {TARGET_TABLE}.")

# Sample SQL for the freshly loaded table: counts the rows whose geometry
# intersects a fixed bounding polygon.
example_query = f"""
WITH bbox AS (
  SELECT ST_GeogFromText('POLYGON((
    -112.3 40.3,
    -111.6 40.3,
    -111.6 40.95,
    -112.3 40.95,
    -112.3 40.3
  ))') AS g
)
SELECT COUNT(*) AS n_tracts
FROM `{TARGET_TABLE}` t
JOIN bbox b
ON ST_INTERSECTS(t.geom, b.g);
"""

print("[INFO] Example spatial-join query:")
print(example_query)


bhereth
C:\Users\bhereth\wfrc-data-f89de5afadfb.json
[INFO] Running as: bhereth-wfrcbq@wfrc-data.iam.gserviceaccount.com | Project: wfrc-data
[INFO] Reading shapefile: M:\GitHub\WF-TDM-v9x\1_Inputs\1_TAZ\Districts\Dist_Small.dbf
[INFO] Reprojecting to EPSG:4326…
[INFO] Repairing invalid geometries (make_valid + buffer(0))…
[INFO] Writing NDJSON temp file…
[INFO] Loading into BigQuery table: wfrc-data.personal_vehicle.small_district_geo
[SUCCESS] Loaded 131 rows into wfrc-data.personal_vehicle.small_district_geo.
[INFO] Example spatial-join query:

WITH bbox AS (
  SELECT ST_GeogFromText('POLYGON((
    -112.3 40.3,
    -111.6 40.3,
    -111.6 40.95,
    -112.3 40.95,
    -112.3 40.3
  ))') AS g
)
SELECT COUNT(*) AS n_tracts
FROM `wfrc-data.personal_vehicle.small_district_geo` t
JOIN bbox b
ON ST_INTERSECTS(t.geom, b.g);

