In [None]:
import geopandas as gpd
import rasterio
import rasterio.mask
import numpy as np
import pandas as pd
import os
from tqdm import tqdm



In [None]:
# --- INPUTS & OUTPUTS ---

# Administrative boundary layers as (label, shapefile path) pairs.
# ADM1 = province, ADM2 = municipality.
adm_levels = [
    ("ADM1", "../workspace/boundaries/geoBoundaries-BRA-ADM1-all/geoBoundaries-BRA-ADM1.shp"),
    ("ADM2", "../workspace/boundaries/geoBoundaries-BRA-ADM2-all/geoBoundaries-BRA-ADM2.shp"),
]

# Directory holding the flood-map rasters. The maps are split by scenario and
# return period; file names are assembled from those two parts downstream.
flood_maps_dir = "../workspace/hazards_world/flood"

# Scenario code (as used in the file names) -> human-readable label.
scenario_labels = dict(
    pc="CurrentClimate",
    rcp26="RCP2.6",
    rcp85="RCP8.5",
)

# Codes in processing order (same order as the mapping above).
scenario_codes = list(scenario_labels)

# Return periods to process.
# NOTE(review): an earlier comment stated that only 10 and 100 are currently
# used, yet 1000 is listed as well — confirm which is intended.
return_periods = [10, 100, 1000]

# One CSV per flood map is written here; the final merged table goes to output_csv.
output_dir = "../workspace/precomputed_region_results"
output_csv = "../workspace/ADM_allfloods.csv"

# Make sure the per-file output directory exists before anything is written.
os.makedirs(output_dir, exist_ok=True)



In [None]:

# --- HELPER FUNCTION ---

def fix_text(s):
    """
    Repair mojibake: UTF-8 text that was mistakenly decoded with a single-byte codec.

    The string is re-encoded with the codec it was presumably mis-decoded with
    and then decoded as UTF-8. CP1252 is tried first, then Latin-1. The Latin-1
    pass matters because Python's CP1252 codec cannot encode the C1 control
    characters (U+0080-U+009F) that a Latin-1 read produces for UTF-8
    continuation bytes, e.g. in the mis-decoded forms of "Á" or "É".

    Parameters
    ----------
    s : any
        Value to repair. Non-string values are returned unchanged.

    Returns
    -------
    The repaired string, or `s` itself when no round trip yields valid UTF-8
    (which is the case for text that was decoded correctly in the first place).
    """
    if not isinstance(s, str):
        return s

    for codec in ("cp1252", "latin-1"):
        try:
            return s.encode(codec).decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Either the text has characters outside this codec, or the bytes
            # are not valid UTF-8; try the next codec before giving up.
            continue
    return s


# --- LOAD ADM SHAPEFILES ---
print("Loading ADM boundary shapefiles...")
adm_gdfs = {}

# Columns that may hold the region name, in order of preference.
name_candidates = ("shapeName", "NAME", "name")

for adm_label, adm_path in adm_levels:
    # Read with the default settings first; retry as latin1 only if decoding fails.
    try:
        gdf = gpd.read_file(adm_path)
    except UnicodeDecodeError:
        gdf = gpd.read_file(adm_path, encoding='latin1')

    # Add an identifier column (row index plus one) when the layer has none.
    if "unique_id" not in gdf.columns:
        gdf["unique_id"] = gdf.index + 1

    # Use the first candidate column that exists; otherwise fall back to the id.
    name_field = next((col for col in name_candidates if col in gdf.columns), None)
    if name_field is not None:
        gdf["region"] = gdf[name_field]
    else:
        print(f"Warning: No standard region name field found for {adm_label}. Available fields:")
        print(gdf.columns)
        gdf["region"] = gdf["unique_id"]

    # Run the encoding repair on string names only; other values pass through untouched.
    gdf["region"] = gdf["region"].apply(
        lambda value: fix_text(value) if isinstance(value, str) else value
    )

    adm_gdfs[adm_label] = gdf
    print(f"  ✓ Loaded {adm_label}: {len(gdf)} regions")


# --- PROCESSING ---

def _summarize_depths(band, nodata_val, transform):
    """
    Summarise the valid pixels of one cropped raster band.

    Parameters
    ----------
    band : 2-D array, optionally a numpy masked array
        Pixel values for one region. Masked entries are treated as invalid.
    nodata_val : number or None
        The raster's nodata value; pixels equal to it are treated as invalid.
    transform : affine transform
        Transform of `band`, used to turn the pixel position of the maximum
        into coordinates.

    Returns
    -------
    (stats, n_obs, max_coord)
        `stats` maps min/max/mean/median/p2_5/p5/p95/p97_5 to floats (all NaN
        when there is no valid pixel), `n_obs` is the number of valid pixels,
        and `max_coord` is the (x, y) of the first pixel holding the maximum
        (NaN, NaN when there is no valid pixel).
    """
    data = np.ma.getdata(band)
    valid_mask = ~np.ma.getmaskarray(band)

    # A NaN nodata value never compares equal to anything, so the equality
    # test is only meaningful for finite nodata values.
    if nodata_val is not None and not np.isnan(nodata_val):
        valid_mask &= data != nodata_val
    # NaN pixels (including NaN-coded nodata) would turn every statistic into NaN.
    if np.issubdtype(data.dtype, np.floating):
        valid_mask &= ~np.isnan(data)

    valid_data = data[valid_mask]
    n_obs = int(valid_data.size)

    if n_obs == 0:
        empty_stats = dict.fromkeys(
            ("min", "max", "mean", "median", "p2_5", "p5", "p95", "p97_5"), np.nan
        )
        return empty_stats, n_obs, (np.nan, np.nan)

    # One pass for all percentiles instead of five separate calls.
    p2_5, p5, median, p95, p97_5 = np.percentile(valid_data, [2.5, 5, 50, 95, 97.5])
    peak = np.max(valid_data)
    stats = {
        "min": float(np.min(valid_data)),
        "max": float(peak),
        "mean": float(np.mean(valid_data)),
        "median": float(median),
        "p2_5": float(p2_5),
        "p5": float(p5),
        "p95": float(p95),
        "p97_5": float(p97_5),
    }

    # First valid pixel (row-major order) that holds the maximum value.
    peak_rows, peak_cols = np.nonzero(valid_mask & (data == peak))
    max_coord = rasterio.transform.xy(transform, peak_rows[0], peak_cols[0])
    return stats, n_obs, max_coord


print("\nStarting flood map processing...")

# Calculate total number of flood maps to process
total_flood_maps = len(scenario_codes) * len(return_periods)

# Create main progress bar for flood maps
pbar_files = tqdm(total=total_flood_maps, desc="Processing flood maps", unit="file", position=0, leave=True, dynamic_ncols=True)

try:
    # Loop over each scenario and return period (i.e., each flood file)
    for scenario_code in scenario_codes:
        scenario_label = scenario_labels[scenario_code]

        for rp in return_periods:
            # Construct the flood map path
            floodmap_filename = f"global_{scenario_code}_h{rp}glob.tif"
            floodmap_path = os.path.join(flood_maps_dir, floodmap_filename)

            # Update progress bar description
            pbar_files.set_description(f"Processing {scenario_label} - RP{rp}")

            # The per-file CSV doubles as the "already done" marker
            output_filename = f"flood_{scenario_code}_rp{rp}.csv"
            output_path = os.path.join(output_dir, output_filename)

            if os.path.exists(output_path):
                pbar_files.write(f"⏭  Already processed: {output_filename}. Skipping.")
                pbar_files.update(1)
                continue

            # If the flood map file does not exist, skip it
            if not os.path.exists(floodmap_path):
                pbar_files.write(f"⚠ Flood map not found: {floodmap_filename}. Skipping.")
                pbar_files.update(1)
                continue

            # Results for this specific flood file
            file_results = []

            # NOTE(review): polygons are passed to the raster as-is, which assumes
            # the boundaries and the flood maps share a CRS — confirm.
            with rasterio.open(floodmap_path) as src:
                nodata_val = src.nodata

                # Process each ADM level for this flood file
                for adm_label, gdf in adm_gdfs.items():

                    # Loop through each polygon in the shapefile
                    for idx, row in gdf.iterrows():
                        try:
                            # filled=False returns a masked array, so pixels outside
                            # the polygon stay excluded even when the raster declares
                            # no nodata value (filling would turn them into zeros
                            # that count as data).
                            out_image, out_transform = rasterio.mask.mask(
                                src, [row["geometry"]], crop=True, filled=False
                            )
                        except Exception as e:
                            # Best effort: report the region and carry on with the rest.
                            pbar_files.write(f"⚠ Error processing region {row['region']} in {adm_label}: {e}")
                            continue

                        # Statistics of the first band
                        stats, n_obs, max_coord = _summarize_depths(out_image[0], nodata_val, out_transform)

                        # Build the result record
                        file_results.append({
                            "region": row["region"],
                            "adm_level": adm_label,
                            "scenario_code": scenario_code,
                            "scenario_name": scenario_label,
                            "hazard_return": rp,
                            "hazard_type": "flood",
                            **stats,
                            "n_obs": n_obs,
                            "max_x": max_coord[0],
                            "max_y": max_coord[1],
                        })

            # Save results for this file to individual CSV
            if file_results:
                df_file = pd.DataFrame(file_results)
                # Write to a temporary name and rename afterwards: an interrupted
                # run must not leave a partial CSV that the skip check above
                # would later mistake for a finished file.
                tmp_path = output_path + ".tmp"
                df_file.to_csv(tmp_path, index=False, encoding='utf-8-sig')
                os.replace(tmp_path, output_path)
                pbar_files.write(f"  ✓ Saved {len(file_results)} records to {output_filename}")

            # Update file progress bar
            pbar_files.update(1)
finally:
    # Close the progress bar even if a flood map fails part-way through
    pbar_files.close()

print(f"\n✓ Processing complete!")



In [None]:

# --- COMBINE AND EXPORT ---

print("\nCombining individual CSV files...")

# Every CSV in the output directory is combined. The names are sorted because
# os.listdir returns them in arbitrary order, which would make the row order
# of the combined file differ between runs and machines.
csv_files = sorted(f for f in os.listdir(output_dir) if f.endswith('.csv'))

if csv_files:
    # Read and combine all CSV files
    dfs = []
    for csv_file in tqdm(csv_files, desc="Reading CSV files", unit="file"):
        csv_path = os.path.join(output_dir, csv_file)
        # utf-8-sig matches the encoding used when writing the combined file
        # below and strips a leading BOM if one is present.
        df = pd.read_csv(csv_path, encoding='utf-8-sig')
        dfs.append(df)

    # Concatenate all dataframes
    df_combined = pd.concat(dfs, ignore_index=True)

    # Fill missing values with 0. This applies to every column, so a missing
    # statistic becomes indistinguishable from a true zero in that column.
    df_combined = df_combined.fillna(0)

    # Save combined results
    df_combined.to_csv(output_csv, index=False, encoding='utf-8-sig')

    print("\n" + "="*60)
    print(f"✓ Combined results saved to: {output_csv}")
    print(f"✓ Individual file results saved to: {output_dir}/")
    print(f"✓ Total records: {len(df_combined)}")
    print(f"✓ CSV files combined: {len(csv_files)}")
    print("✓ Missing values filled with: 0")
    print("="*60)
else:
    print("\n⚠ No CSV files found to combine. Check if any flood maps were processed.")
