# Filter GIRI Flood Maps for Brazil

Clips each global GIRI flood raster to Brazil's boundaries. No aggregation or resampling is performed, so the original resolution and pixel values are preserved.

In [None]:
import os
import glob
import geopandas as gpd
import rasterio
from rasterio.mask import mask
from pathlib import Path
import numpy as np

# Clip every global GIRI flood GeoTIFF found in GIRI_raw to the Brazil
# boundary and store the result, at the original resolution and dtype, in
# GIRI_raw/brazil. Files that already have a finished output are skipped.

# Define paths
project_root = Path().resolve().parent
giri_raw_dir = project_root / 'workspace' / 'GIRI_raw'
# Use ADM2 boundaries to ensure all municipalities (including Fernando de Noronha) are included
# ADM1 boundaries only extend to -34.66°E, but Fernando de Noronha extends to -32.38°E
brazil_boundaries_file = project_root / 'tests' / 'tests_data' / 'areas' / 'municipality' / 'geoBoundaries-BRA-ADM2_simplified.geojson'
output_dir = giri_raw_dir / 'brazil'
# parents=True so the notebook also works on a fresh checkout where GIRI_raw
# itself has not been created yet.
output_dir.mkdir(parents=True, exist_ok=True)

# Load Brazil boundaries (ADM2 includes all municipalities including Fernando de Noronha)
brazil_gdf = gpd.read_file(brazil_boundaries_file)
brazil_union = brazil_gdf.geometry.union_all()

# Verify the bounds include Fernando de Noronha
print(f"Brazil boundaries bounds: {brazil_gdf.total_bounds}")
print(f"Easternmost longitude: {brazil_gdf.total_bounds[2]}")
print("(Fernando de Noronha is at ~-32.38°E, so this should be >= -32.38)")


def _valid_value_range(dataset, band=1):
    """Return (min, max) of the valid pixels of *band*, or (None, None).

    The band is scanned one internal block at a time so that a multi-gigabyte
    raster never has to be held in memory as a whole. A pixel is considered
    invalid when it equals the dataset's nodata value; when nodata is unset
    or NaN, NaN pixels are the ones excluded.
    """
    nodata = dataset.nodata
    # `nodata or nan` would be wrong here: a nodata value of 0 is falsy and
    # would be mistaken for "no nodata defined".
    nan_is_nodata = nodata is None or np.isnan(nodata)

    lowest = None
    highest = None
    for _, window in dataset.block_windows(band):
        block = dataset.read(band, window=window)
        if nan_is_nodata:
            valid = block[~np.isnan(block)]
        else:
            valid = block[block != nodata]
        if valid.size == 0:
            continue
        block_min = valid.min()
        block_max = valid.max()
        lowest = block_min if lowest is None else min(lowest, block_min)
        highest = block_max if highest is None else max(highest, block_max)
    return lowest, highest


# Find all TIFF files
tif_files = sorted(glob.glob(str(giri_raw_dir / '*.tif')))

print(f'Found {len(tif_files)} TIFF files to process')

# Process each file
for tif_file in tif_files:
    tif_name = Path(tif_file).name
    output_file = output_dir / tif_name
    # The raster is first written under a temporary name and only renamed
    # once complete; otherwise an interrupted run would leave a truncated
    # file that the "already exists" check below would skip forever.
    partial_file = output_file.with_name(output_file.name + '.part')

    if output_file.exists():
        print(f'Skipping {tif_name} (already exists)')
        continue

    print(f'Processing: {tif_name}')

    # Get original file size
    original_size_gb = os.path.getsize(tif_file) / (1024**3)
    print(f'   Original size: {original_size_gb:.2f} GB')

    with rasterio.open(tif_file) as src:
        print(f'   Original dimensions: {src.width} x {src.height}')
        print(f'   Original bounds: {src.bounds}')
        print(f'   Original dtype: {src.dtypes[0]}')
        print(f'   Original nodata: {src.nodata}')

        # Reproject Brazil geometry to match raster CRS
        if brazil_gdf.crs != src.crs:
            brazil_reprojected = gpd.GeoSeries([brazil_union], crs=brazil_gdf.crs).to_crs(src.crs)
            brazil_geom = brazil_reprojected.iloc[0]
        else:
            brazil_geom = brazil_union

        # Clip the raster. crop=True shrinks the extent to the geometry's
        # bounding box. The default filled=True returns a plain ndarray in
        # which pixels outside the polygon are set to the raster's nodata
        # value (0 if the raster defines none), while pixels inside keep
        # their exact original values. A masked array (filled=False) is
        # avoided because how its masked cells get written depends on the
        # rasterio version.
        out_image, out_transform = mask(src, [brazil_geom], crop=True)

        print(f'   Clipped dimensions: {out_image.shape[2]} x {out_image.shape[1]}')

        # Preserve ALL original metadata for exact value reproduction
        out_meta = src.meta.copy()
        out_meta.update({
            'driver': 'GTiff',
            'height': out_image.shape[1],
            'width': out_image.shape[2],
            'transform': out_transform,
            # Compression settings - these don't change values, just file size
            'compress': 'DEFLATE',
            'predictor': 2,  # Works with DEFLATE for numeric data
            'tiled': True,
            'blockxsize': 256,
            'blockysize': 256,
            'ZLEVEL': 9,
            # Compressed output size cannot be predicted up front, so let
            # GDAL switch to BigTIFF whenever the file might exceed 4 GB.
            'BIGTIFF': 'IF_SAFER',
        })

        # Write with exact same data type and nodata value
        with rasterio.open(partial_file, 'w', **out_meta) as dst:
            dst.write(out_image)

    # Release the (potentially multi-gigabyte) clipped array before verifying.
    del out_image
    os.replace(partial_file, output_file)

    # Verify the output
    print(f'   Verifying output integrity...')
    with rasterio.open(output_file) as verify:
        print(f'   Output dtype: {verify.dtypes[0]} (should match original)')
        print(f'   Output nodata: {verify.nodata} (should match original)')

        # Scan the band block by block to check the value range
        range_min, range_max = _valid_value_range(verify)
        if range_min is not None:
            print(f'   Data range: [{range_min:.6f}, {range_max:.6f}]')

    # Get output file size
    output_size_gb = os.path.getsize(output_file) / (1024**3)
    print(f'   Clipped size: {output_size_gb:.2f} GB')

    if output_size_gb < original_size_gb:
        reduction_pct = (1 - output_size_gb / original_size_gb) * 100
        print(f'   Size reduction: {reduction_pct:.1f}%')
    else:
        increase_pct = (output_size_gb / original_size_gb - 1) * 100
        print(f'   WARNING: Size increased by {increase_pct:.1f}%!')

    print(f'   ✓ Saved to: {output_file}')
    print()

print('Processing complete')
print(f'Output files saved in: {output_dir}')


Brazil boundaries bounds: [-73.99047852 -33.75079346 -28.84918213   5.27111816]
Easternmost longitude: -28.849182128999928
(Fernando de Noronha is at ~-32.38°E, so this should be >= -32.38)
Found 21 TIFF files to process
Processing: flood_pc_100_glob.tif
   Original size: 3.59 GB
   Original dimensions: 432000 x 216000
   Original bounds: BoundingBox(left=-180.0, bottom=-90.0, right=180.0, top=90.0)
   Original dtype: uint32
   Original nodata: 0.0
   Clipped dimensions: 54170 x 46827
