Quick analysis of

as_con_3s.tif
as_msk_3s.tif
hyd_as_lup_15s.tif 


| File name              | Content / Purpose                                                             | Quick analysis idea                                      |
|------------------------|-------------------------------------------------------------------------------|---------------------------------------------------------|
| **as_con_3s.tif**      | Conditioned DEM — elevation model hydrologically corrected (stream burning, sinks filled) | Clip to Bhutan + buffer, plot hillshade, compare vs. raw DEM |
| **as_msk_3s.tif**      | Land mask — mask of land vs. ocean                                            | Clip to Bhutan + buffer, check unique values (the clipped subset below contains 1 and 3, not 0/1 — confirm the class meanings in the HydroSHEDS documentation) |
| **hyd_as_lup_15s.tif** | Lookup table at 15″ resolution (land/sea mask, auxiliary layer for hydrological atlas) | Clip to Bhutan extent, print unique values, verify categories |

In [1]:
from pathlib import Path
import os
os.environ["GEOPANDAS_IO_ENGINE"] = "pyogrio"   
import geopandas as gpd
import rasterio   #This is a library for working with raster geospatial data (GeoTIFF)
import rioxarray as rxr # extension of xarray + rasterio, simplifies reading/writing GeoTIFF and reprojection to different CRS


# Input rasters from the HydroSHEDS data folder (paths relative to the working directory)
con_path = Path("../../data/HydroSHEDS/as_con_3s.tif")
con_msk = Path("../../data/HydroSHEDS/as_msk_3s.tif")
hyd_as_lup = Path("../../data/HydroSHEDS/hyd_as_lup_15s.tif")

# Fail early, and name the absent inputs, before any raster I/O starts
missing_inputs = [str(p) for p in (con_path, con_msk, hyd_as_lup) if not p.exists()]
if missing_inputs:
    raise FileNotFoundError(
        "One or more input files are missing. Please ensure all required files are present."
        f" Missing: {', '.join(missing_inputs)}"
    )

# Clip window around Bhutan including a buffer; longitude/latitude limits in degrees
min_lon = 87.0
max_lon = 93.5
min_lat = 25.0
max_lat = 29.5


from typing import Dict
import xarray as xr

# Clip window (EPSG:4326, degrees), taken unchanged from the lon/lat limits
minx, maxx = min_lon, max_lon
miny, maxy = min_lat, max_lat

# Source raster → file name of its clipped copy (written to the same folder)
out_map: Dict[Path, str] = {
    con_path: "as_con_Bhutan_and_buffer.tif",
    con_msk: "as_msk_Bhutan_and_buffer.tif",
    hyd_as_lup: "hyd_as_lup_Bhutan_and_buffer.tif",
}

def clip_to_bbox_save(in_path: Path, out_name: str):
    """Clip a raster to the module-level bounding box and save it as a GeoTIFF.

    The window is given by the module-level ``minx``, ``miny``, ``maxx`` and
    ``maxy`` values and is applied without reprojection, so it is presumably
    expected to be expressed in the raster's own CRS (verify for new inputs).
    The result is written LZW-compressed into the same folder as the input,
    and a one-line summary with the output path and shape is printed.

    Args:
        in_path: Path of the raster to read.
        out_name: File name (not a full path) for the clipped GeoTIFF.
    """
    out_path = in_path.parent / out_name

    # Open as a context manager so the underlying file handle is released
    # once the clipped raster has been written (it was previously left open).
    with rxr.open_rasterio(in_path, masked=True) as da:
        # Clip by bbox (without reprojection)
        clipped = da.rio.clip_box(minx=minx, miny=miny, maxx=maxx, maxy=maxy)

        # Save GeoTIFF (same CRS as input); must happen while the source is open
        clipped.rio.to_raster(out_path, compress="LZW")
        clipped_shape = tuple(clipped.shape)

    print(f"Saved: {out_path} | shape={clipped_shape}")

# Clip every input raster and write the result under its mapped output name
for in_path, out_name in out_map.items():
    clip_to_bbox_save(in_path=in_path, out_name=out_name)

Saved: ../../data/HydroSHEDS/as_con_Bhutan_and_buffer.tif | shape=(1, 5400, 7801)
Saved: ../../data/HydroSHEDS/as_msk_Bhutan_and_buffer.tif | shape=(1, 5400, 7801)
Saved: ../../data/HydroSHEDS/hyd_as_lup_Bhutan_and_buffer.tif | shape=(1, 1081, 1561)


In [3]:
from pathlib import Path
import numpy as np
import rasterio
from rasterio.transform import xy
import pandas as pd

# The three clipped GeoTIFFs to inspect
files = [
    Path(raster)
    for raster in (
        "../../data/HydroSHEDS/as_con_Bhutan_and_buffer.tif",
        "../../data/HydroSHEDS/as_msk_Bhutan_and_buffer.tif",
        "../../data/HydroSHEDS/hyd_as_lup_Bhutan_and_buffer.tif",
    )
]

def quick_stats(fpath: Path):
    """Print basic metadata and value statistics for band 1 of a GeoTIFF.

    Reports CRS, shape, dtype and nodata value, then min/max/mean/std computed
    on the unmasked pixels only. When there are at most 20 distinct values the
    values themselves are listed; otherwise only their count is printed.

    Args:
        fpath: Path of the raster to summarise.
    """
    with rasterio.open(fpath) as src:
        arr = src.read(1, masked=True)  # band 1 as a masked array
        print(f"\nFile: {fpath.name}")
        print(f"  CRS: {src.crs}")
        print(f"  Shape (rows, cols): {arr.shape}")
        print(f"  Dtype: {arr.dtype}")
        print(f"  Nodata: {src.nodata}")

        # Statistics are computed on valid (unmasked) pixels only
        valid = arr.compressed()
        if valid.size == 0:
            # min()/max() raise on an empty array, so report and stop here
            print("  No valid (unmasked) pixels - statistics skipped")
            return

        print(f"  Min: {valid.min()}, Max: {valid.max()}")
        print(f"  Mean: {valid.mean():.3f}, Std: {valid.std():.3f}")

        # If it looks categorical (few unique values), list them
        uniq = np.unique(valid)
        if uniq.size <= 20:
            print(f"  Unique values: {uniq.tolist()}")
        else:
            print(f"  Unique values: {uniq.size} (too many to list)")

def _subsample_steps(rows: int, cols: int, max_points: int) -> tuple:
    """Return (row_step, col_step) strides that keep at most about max_points pixels."""
    if rows * cols <= max_points:
        return (1, 1)
    # Aim for roughly sqrt(max_points) samples along each axis
    side = np.sqrt(max_points)
    return (
        max(1, int(np.ceil(rows / side))),
        max(1, int(np.ceil(cols / side))),
    )


def raster_to_csv(fpath: Path, max_points: int = 1_000_000):
    """
    Export band 1 of a raster to a flat CSV with three columns:
    - lon, lat, value  (if CRS is geographic)
    - x,   y,   value  (otherwise)

    Coordinates are pixel centres. Masked (nodata) pixels are dropped. The CSV
    is written next to the input as ``<input name without suffix>_points.csv``.

    max_points limits the output size by regular sub-sampling if needed: when
    the raster has more pixels than max_points, every step-th row/column is
    kept, with the steps chosen so that about max_points pixels remain.

    Args:
        fpath: Path of the raster to export.
        max_points: Approximate upper bound on the number of sampled pixels.

    Raises:
        ValueError: If max_points is smaller than 1.
    """
    if max_points < 1:
        raise ValueError(f"max_points must be a positive integer, got {max_points!r}")

    with rasterio.open(fpath) as src:
        arr = src.read(1, masked=True)
        transform = src.transform
        is_geographic = bool(src.crs and src.crs.is_geographic)

    rows, cols = arr.shape
    step = _subsample_steps(rows, cols, max_points)
    r_idx = np.arange(0, rows, step[0])
    c_idx = np.arange(0, cols, step[1])

    # Sub-sample first, then split into plain data and a full-shape validity
    # mask; getmaskarray also copes with a scalar "no mask" marker.
    sampled = arr[np.ix_(r_idx, c_idx)]
    valid = ~np.ma.getmaskarray(sampled)
    values = np.ma.getdata(sampled)

    # Pixel-centre coordinates for the sampled columns (taken along row 0) and
    # sampled rows (taken along column 0). Equal-length index arrays are passed
    # so the call does not depend on scalar/array broadcasting inside xy().
    x_vec = np.asarray(xy(transform, np.zeros_like(c_idx), c_idx, offset='center')[0])
    y_vec = np.asarray(xy(transform, r_idx, np.zeros_like(r_idx), offset='center')[1])

    # Broadcast the 1-D vectors to the sampled grid without allocating a mesh
    X = np.broadcast_to(x_vec[None, :], valid.shape)
    Y = np.broadcast_to(y_vec[:, None], valid.shape)

    # Name columns depending on CRS type; keep only valid pixels
    cols_names = ("lon", "lat", "value") if is_geographic else ("x", "y", "value")
    df = pd.DataFrame(
        {
            cols_names[0]: X[valid],
            cols_names[1]: Y[valid],
            cols_names[2]: values[valid],
        }
    )

    out_csv = fpath.with_suffix("").as_posix() + "_points.csv"
    # NOTE(review): this sets a process-wide pandas *display* option; it does
    # not change what to_csv writes. Kept for backward compatibility - if fixed
    # CSV precision is wanted, pass float_format to to_csv instead.
    pd.options.display.float_format = '{:.6f}'.format
    df.to_csv(out_csv, index=False)
    print(f"  → CSV saved: {out_csv} | rows={len(df)} (subsample step={step})")

# First pass: print metadata and value statistics for every clipped raster
for f in files:
    quick_stats(fpath=f)

# Second pass (optional): dump each raster to CSV as lon/lat (or x/y) plus
# value, sub-sampled to at most about 500,000 points per file
for f in files:
    raster_to_csv(fpath=f, max_points=500_000)


File: as_con_Bhutan_and_buffer.tif
  CRS: EPSG:4326
  Shape (rows, cols): (5400, 7801)
  Dtype: int16
  Nodata: 32767.0
  Min: -19, Max: 8084
  Mean: 2322.381, Std: 2108.095
  Unique values: 7755 (too many to list)

File: as_msk_Bhutan_and_buffer.tif
  CRS: EPSG:4326
  Shape (rows, cols): (5400, 7801)
  Dtype: uint8
  Nodata: 255.0
  Min: 1, Max: 3
  Mean: 1.000, Std: 0.000
  Unique values: [1, 3]

File: hyd_as_lup_Bhutan_and_buffer.tif
  CRS: EPSG:4326
  Shape (rows, cols): (1081, 1561)
  Dtype: uint32
  Nodata: 4294967295.0
  Min: 0, Max: 3077080
  Mean: 6885.105, Std: 88565.652
  Unique values: 21265 (too many to list)
  → CSV saved: ../../data/HydroSHEDS/as_con_Bhutan_and_buffer_points.csv | rows=439425 (subsample step=(8, 12))
  → CSV saved: ../../data/HydroSHEDS/as_msk_Bhutan_and_buffer_points.csv | rows=439425 (subsample step=(8, 12))
  → CSV saved: ../../data/HydroSHEDS/hyd_as_lup_Bhutan_and_buffer_points.csv | rows=281861 (subsample step=(2, 3))
