In [1]:
# --- SCRIPT 1: Convert all .nc hazard maps into CSVs ---

import os
import re
import glob
import pandas as pd
import xarray as xr
from unidecode import unidecode

# === PATHS ===
nc_dir = "../tests/tests_data/hazards"

csv_out_dir = "../workspace/Climate Data/Climate Maps CSVs"
os.makedirs(csv_out_dir, exist_ok=True)

# === FILENAME PATTERN PARSER ===
# No complex patterns needed - just extract from directory structure

def parse_hazard_from_fname(path):
    # Extract hazard type and indicator from directory structure
    # Path format: .../hazards/Drought/SPI6/ensemble/ensemble_return_period.nc
    path_parts = path.split(os.sep)
    
    # Find the "hazards" directory and get the next two parts
    hazards_idx = path_parts.index("hazards")
    if hazards_idx + 2 >= len(path_parts):
        raise ValueError(f"Invalid path structure: {path}")
    
    hazard_type = path_parts[hazards_idx + 1]  # "Drought"
    hazard_indicator = path_parts[hazards_idx + 2]  # "SPI6"
    return (hazard_type, hazard_indicator)

def nc_to_csv(nc_path, out_dir):
    ds = xr.open_dataset(nc_path)
    var_name = list(ds.data_vars.keys())[0]
    da = ds[var_name]
    
    # Expecting dims: ensemble, GWL, return_period, lat, lon
    df = da.to_dataframe(name="value").reset_index().dropna(subset=["value"])
    
    # Add hazard info from filename
    h_type, h_ind = parse_hazard_from_fname(nc_path)
    df["hazard_type"] = h_type
    df["hazard_indicator"] = h_ind
    
    # Save to CSV
    out_name = os.path.splitext(os.path.basename(nc_path))[0] + ".csv"
    out_path = os.path.join(out_dir, out_name)
    df.to_csv(out_path, index=False, encoding="utf-8")
    
    print(f"✅ Saved: {out_name}")
    return out_path

# === RUN ===
# Recursively search for .nc files in nc_dir and subdirectories
# Look for files containing "ensemble_return_period" in the name
nc_files = sorted(glob.glob(os.path.join(nc_dir, "**", "*ensemble_return_period*.nc"), recursive=True))
if not nc_files:
    raise FileNotFoundError(f"No .nc files found in {nc_dir} (searched recursively)")

for fp in nc_files:
    print(f"Processing {os.path.basename(fp)} ...")
    nc_to_csv(fp, csv_out_dir)

print("\n✅ All NetCDFs converted to CSVs.")
print(f"CSV files saved in:\n{csv_out_dir}")


Processing ensemble_return_period.nc ...
✅ Saved: ensemble_return_period.csv

✅ All NetCDFs converted to CSVs.
CSV files saved in:
../workspace/Climate Data/Climate Maps CSVs


In [2]:
# --- SCRIPT 2: Aggregate CSVs over ADM1 and ADM2 ---

import os
import glob
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from unidecode import unidecode

# === PATHS ===
csv_dir = csv_out_dir
adm1_path = "../tests/tests_data/areas/province/geoBoundaries-BRA-ADM1.shp"
adm2_path = "../tests/tests_data/areas/municipality/geoBoundaries-BRA-ADM2.shp"
output_dir = "../workspace/Climate Data/Precomputed Regional Data"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "precomputed_adm_nc.csv")

# === LOAD ADM SHAPEFILES ===
def load_adm(adm_path):
    gdf = gpd.read_file(adm_path)
    
    # Set CRS if not already set (many shapefiles don't have CRS defined)
    if gdf.crs is None:
        # Try to infer from .prj file or assume WGS84
        gdf = gdf.set_crs("EPSG:4326", allow_override=True)
    
    # Transform to WGS84 if not already
    if gdf.crs != "EPSG:4326":
        gdf = gdf.to_crs("EPSG:4326")
    

    return gdf, "shapeName"


adm1, adm1_name = load_adm(adm1_path)
adm2, adm2_name = load_adm(adm2_path)

# === AGGREGATION FUNCTION ===
def summarize_points_over_adm(points_df, adm_gdf, adm_name_col, adm_level):
    gdf_pts = gpd.GeoDataFrame(
        points_df,
        geometry=gpd.points_from_xy(points_df["lon"], points_df["lat"]),
        crs="EPSG:4326",
    )
    joined = gpd.sjoin(gdf_pts, adm_gdf, how="inner", predicate="within")

    def q(p): return lambda x: float(np.nanpercentile(x, p))

    agg = (
        joined.groupby(
            ["GWL", "return_period", "ensemble", "hazard_type", "hazard_indicator", adm_name_col],
            dropna=False,
        )
        .agg(
            min=("value", "min"),
            max=("value", "max"),
            mean=("value", "mean"),
            median=("value", "median"),
            p2_5=("value", q(2.5)),
            p5=("value", q(5)),
            p10=("value", q(10)),
            p90=("value", q(90)),
            p95=("value", q(95)),
            p97_5=("value", q(97.5)),
        )
        .reset_index()
    )

    agg = agg.rename(
        columns={
            "GWL": "scenario_code",
            "return_period": "hazard_return_period",
            adm_name_col: "region",
        }
    )
    agg["scenario_name"] = agg["scenario_code"]
    agg["adm_level"] = adm_level

    cols = [
        "region",
        "adm_level",
        "scenario_code",
        "scenario_name",
        "hazard_return_period",
        "hazard_type",
        "hazard_indicator",
        "min",
        "max",
        "mean",
        "median",
        "p2_5",
        "p5",
        "p10",
        "p90",
        "p95",
        "p97_5",
    ]
    return agg[cols]

# === RUN AGGREGATION ===
csv_files = sorted(glob.glob(os.path.join(csv_dir, "*.csv")))
if not csv_files:
    raise FileNotFoundError(f"No CSVs found in {csv_dir}")

all_results = []

for csv_fp in csv_files:
    print(f"Aggregating: {os.path.basename(csv_fp)}")
    df = pd.read_csv(csv_fp, encoding="utf-8")
    df = df[df["ensemble"] == 'mean']
    if "value" not in df.columns:
        possible_val = [c for c in df.columns if c not in ["GWL", "lon", "lat", "return_period", "ensemble", "hazard_type", "hazard_indicator"]]
        if possible_val:
            df = df.rename(columns={possible_val[0]: "value"})

    res_adm1 = summarize_points_over_adm(df, adm1, adm1_name, adm_level='ADM1')
    res_adm2 = summarize_points_over_adm(df, adm2, adm2_name, adm_level='ADM2')

    res_adm1["ensemble"] = "mean"
    res_adm2["ensemble"] = "mean"

    all_results.extend([res_adm1, res_adm2])
    

# === COMBINE AND REPAIR ENCODING ===
final_df = pd.concat(all_results, ignore_index=True)

# --- FIX MOJIBAKE (e.g. √Ågua → Água) ---

# try several encodings automatically

def fix_text(text):
    return unidecode(text)

# Apply only to region names
final_df["region"] = final_df["region"].apply(fix_text)

# --- ENSURE UTF-8 OUTPUT ---
output_path = os.path.join(output_dir, "precomputed_adm_nc.csv")
final_df.to_csv(output_path, index=False, encoding="utf-8-sig")

print("\n✅ Aggregation complete! Encoding repaired with ftfy.")
print(f"Saved to:\n{output_path}")
print(final_df.head())


Aggregating: ensemble_return_period.csv


  df = pd.read_csv(csv_fp, encoding="utf-8")


KeyboardInterrupt: 

In [None]:
### add flood hazard
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
import rasterio.mask
from unidecode import unidecode

# --- INPUTS & OUTPUTS ---

# ADM boundaries
adm_levels = [
    ("ADM1", "../tests/tests_data/areas/province/geoBoundaries-BRA-ADM1.shp"),
    ("ADM2", "../tests/tests_data/areas/municipality/geoBoundaries-BRA-ADM2.shp"),
]

# Flood maps directory
flood_maps_dir = "../tests/tests_data/hazards/FloodTIF"

# Scenarios and return periods
scenario_codes = ["pc", "rcp26", "rcp85"]
scenario_labels = {"pc": "CurrentClimate", "rcp26": "RCP2.6", "rcp85": "RCP8.5"}
scenario_code_map_for_output = {"pc": "present", "rcp26": "rcp26", "rcp85": "rcp85"}
return_periods = [5, 10, 25, 50, 100, 1000]

# Output CSV
output_csv = "../workspace/Climate Data/Precomputed Regional Data/precomputed_adm_flood.csv"

# --- HELPERS ---

def fix_text(s):
    """Try to repair mojibake / encoding artefacts; safe no-op if already fine."""
    return unidecode(s)

def load_adm(adm_path):
    """Load a shapefile and pick a reasonable region-name column."""
    try:
        gdf = gpd.read_file(adm_path)
    except UnicodeDecodeError:
        gdf = gpd.read_file(adm_path, encoding="latin1")
    # choose name column
    name_col = None
    for c in ["shapeName", "NAME_2", "NAME_1", "NAME", "name", "prov_name"]:
        if c in gdf.columns:
            name_col = c
            break
    if name_col is None:
        gdf["region"] = gdf.index.astype(str)
    else:
        gdf["region"] = gdf[name_col]
    gdf["region"] = gdf["region"].apply(fix_text)
    return gdf

def percentile(arr, q):
    return float(np.percentile(arr, q)) if arr.size else np.nan

# --- PROCESSING ---

all_rows = []

for adm_label, adm_path in adm_levels:
    gdf = load_adm(adm_path)

    for sc in scenario_codes:
        sc_label = scenario_labels[sc]
        sc_code_out = scenario_code_map_for_output[sc]

        for rp in return_periods:
            tif_name = f"global_{sc}_h{rp}glob.tif"
            tif_path = os.path.join(flood_maps_dir, tif_name)

            # Only process existing .tif files
            if not (tif_path.endswith(".tif") and os.path.exists(tif_path)):
                print(f"Skipping (not found): {tif_path}")
                continue

            with rasterio.open(tif_path) as src:
                nodata = src.nodata

                # Reproject ADM to raster CRS if needed
                if gdf.crs is None:
                    gdf = gdf.set_crs("EPSG:4326", allow_override=True)
                if gdf.crs != src.crs:
                    gdf_proj = gdf.to_crs(src.crs)
                else:
                    gdf_proj = gdf

                for idx, row in gdf_proj.iterrows():
                    geom = [row.geometry]
                    try:
                        out_image, out_transform = rasterio.mask.mask(src, geom, crop=True)
                    except Exception as e:
                        print(f"Mask error ({adm_label}, {row['region']}, {sc_label}, RP {rp}): {e}")
                        continue

                    data = out_image[0]
                    if nodata is not None:
                        valid_mask = data != nodata
                    else:
                        valid_mask = np.ones(data.shape, dtype=bool)

                    valid = data[valid_mask]
                    # If masked array, compress
                    if np.ma.isMaskedArray(valid):
                        valid = valid.compressed()

                    # Compute stats
                    if valid.size == 0:
                        stats = dict(
                            min=np.nan, max=np.nan, mean=np.nan, median=np.nan,
                            p2_5=np.nan, p5=np.nan, p10=np.nan, p90=np.nan, p95=np.nan, p97_5=np.nan
                        )
                    else:
                        stats = dict(
                            min=float(np.min(valid)),
                            max=float(np.max(valid)),
                            mean=float(np.mean(valid)),
                            median=percentile(valid, 50),
                            p2_5=percentile(valid, 2.5),
                            p5=percentile(valid, 5),
                            p10=percentile(valid, 10),
                            p90=percentile(valid, 90),
                            p95=percentile(valid, 95),
                            p97_5=percentile(valid, 97.5),
                        )

                    # Build output row (match precomputed_adm_hazards.csv format)
                    out_row = {
                        "region": fix_text(gdf.loc[idx, "region"]),   # original name in original CRS GeoDF
                        "adm_level": adm_label,
                        "scenario_code": sc_code_out,                 # 'present' | 'rcp26' | 'rcp85'
                        "scenario_name": sc_label,                    # 'CurrentClimate' | 'RCP2.6' | 'RCP8.5'
                        "hazard_return_period": rp,
                        "hazard_type": "FloodTIF",
                        "hazard_indicator": "Flood Height",
                        "min": stats["min"],
                        "max": stats["max"],
                        "mean": stats["mean"],
                        "median": stats["median"],
                        "p2_5": stats["p2_5"],
                        "p5": stats["p5"],
                        "p10": stats["p10"],
                        "p90": stats["p90"],
                        "p95": stats["p95"],
                        "p97_5": stats["p97_5"],
                        "ensemble": np.nan,
                    }
                    all_rows.append(out_row)

# --- EXPORT ---

df_out = pd.DataFrame(all_rows, columns=[
    "region","adm_level","scenario_code","scenario_name","hazard_return_period",
    "hazard_type","hazard_indicator","min","max","mean","median","p2_5","p5","p10","p90","p95","p97_5"
])

# Ensure accent-safe output for Excel (UTF-8 with BOM)
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df_out.to_csv(output_csv, index=False, encoding="utf-8-sig")

print(f"✅ Saved flood precompute to:\n{output_csv}")
print(df_out.head())


Skipping (not found): ../tests/tests_data/hazards/FloodTIF/global_pc_h5glob.tif
Skipping (not found): ../tests/tests_data/hazards/FloodTIF/global_pc_h25glob.tif
Skipping (not found): ../tests/tests_data/hazards/FloodTIF/global_pc_h50glob.tif
Skipping (not found): ../tests/tests_data/hazards/FloodTIF/global_pc_h100glob.tif
Skipping (not found): ../tests/tests_data/hazards/FloodTIF/global_pc_h1000glob.tif
Skipping (not found): ../tests/tests_data/hazards/FloodTIF/global_rcp26_h5glob.tif
Skipping (not found): ../tests/tests_data/hazards/FloodTIF/global_rcp26_h10glob.tif
Skipping (not found): ../tests/tests_data/hazards/FloodTIF/global_rcp26_h25glob.tif
Skipping (not found): ../tests/tests_data/hazards/FloodTIF/global_rcp26_h50glob.tif
Skipping (not found): ../tests/tests_data/hazards/FloodTIF/global_rcp26_h100glob.tif
Skipping (not found): ../tests/tests_data/hazards/FloodTIF/global_rcp26_h1000glob.tif
Skipping (not found): ../tests/tests_data/hazards/FloodTIF/global_rcp85_h5glob.tif
Skip

In [None]:
import pandas as pd

# === INPUT FILES ===
nc_file = "../workspace/Climate Data/Precomputed Regional Data/precomputed_adm_nc.csv"
flood_file = "../workspace/Climate Data/Precomputed Regional Data/precomputed_adm_flood.csv"

# === OUTPUT FILE ===
output_file = "../workspace/Climate Data/Precomputed Regional Data/precomputed_adm_hazards.csv"

# === LOAD FILES (UTF-8 SAFE) ===
df_nc = pd.read_csv(nc_file, encoding="utf-8-sig")
df_flood = pd.read_csv(flood_file, encoding="utf-8-sig")

# === ALIGN COLUMNS ===
# Get the union of all columns
all_cols = sorted(set(df_nc.columns).union(set(df_flood.columns)))

# Add missing columns as NaN so both match perfectly
for df in [df_nc, df_flood]:
    for col in all_cols:
        if col not in df.columns:
            df[col] = pd.NA
    df = df[all_cols]

# === MERGE (STACK) ===
df_merged = pd.concat([df_nc, df_flood], ignore_index=True)

# === SAVE OUTPUT ===
df_merged.to_csv(output_file, index=False, encoding="utf-8-sig")

print(f"✅ Merged dataset saved successfully to:\n{output_file}")
print(f"Total records: {len(df_merged)}")
print("Columns:", list(df_merged.columns))

FileNotFoundError: [Errno 2] No such file or directory: '../workspace/Climate Data/Precomputated Regional Data/precomputed_adm_nc.csv'