This notebook will be used to develop and compare regression models that correlate the eHydro bathymetric surveys with cloud-masked Sentinel-2 surface reflectances. These models will hopefully provide USACE and the eHydro program with a new, robust, accurate tool for unmanned bathymetric estimates. This will be possible at 10-meter resolution with a revisit interval as short as 5 days.
- First starting with XGBoost, RF, and SVM-RBF regressors in the SWG. May try some NN as well
- band maths here with the green and blue bands (short wavelengths penetrate water columns more)
- include some metadata (AD, CX, BD? Single vs dual beam?)? Will look into more that may be beneficial

In [23]:
import os
import rasterio
from rasterio.warp import reproject, Resampling, calculate_default_transform
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import geopandas as gpd
import pandas as pd
import numpy as np

# Functions

In [36]:
def _valid_pixel_mask(bands, nodata_values):
    """Return a boolean mask that is True where every band holds usable data.

    Args:
        bands: Sequence of equally shaped 1-D arrays, one per band.
        nodata_values: Sequence parallel to ``bands`` giving each band's
            NoData value (``None`` when the raster declares none).

    Returns:
        1-D boolean array, True only where no band is NaN or NoData.
    """
    mask = np.ones(bands[0].shape, dtype=bool)
    for band, nodata in zip(bands, nodata_values):
        # NaN never compares equal to anything (itself included), so a
        # ``!= nodata`` test alone lets NaN pixels through; test it explicitly.
        mask &= ~np.isnan(band)
        if nodata is not None and not np.isnan(nodata):
            mask &= band != nodata
    return mask


def extract_raster_data(raster_dict):
    """Extract per-site bathymetry targets and Sentinel-2 features.

    For every site the bathymetry raster (band 1) and the first four bands of
    the Sentinel-2 raster are read, flattened, and filtered so that only
    pixels valid in all five bands remain.

    Args:
        raster_dict: Mapping of site name to a sequence whose first item is
            the bathymetry raster path and whose second item is the
            Sentinel-2 raster path.

    Returns:
        Dict mapping each site name to ``[site_targets, site_features]``:
        ``site_targets`` is a 1-D array of valid bathymetry values and
        ``site_features`` is an ``(n_valid, 4)`` array holding Sentinel-2
        bands 1-4 for the same pixels.

    Raises:
        ValueError: If the two rasters of a site differ in shape or in
            affine transform.
    """
    images_data = {}

    for site, paths in raster_dict.items():
        bathy_path = paths[0]
        s2_path = paths[1]

        # --- Step 1: Open Bathymetry Raster ---
        with rasterio.open(bathy_path) as bathy:
            bathy_data = bathy.read(1)
            bathy_nodata = bathy.nodata
            bathy_transform = bathy.transform
            bathy_shape = bathy.shape

        # --- Step 2: Open Sentinel-2 Raster ---
        with rasterio.open(s2_path) as s2:
            # Pixels are paired purely by array position, so both rasters
            # must share the same grid.
            if s2.shape != bathy_shape or s2.transform != bathy_transform:
                raise ValueError(
                    f"\nRaster shape or transform mismatch for site {site}. "
                    f"\nBathymetry Shape:\n{bathy_shape}, \nSentinel-2 Shape:\n{s2.shape}."
                    f"\nBathymetry Transform:\n{bathy_transform}, \nSentinel-2 Transform:\n{s2.transform}."
                    "\nEnsure Sentinel-2 and bathymetry rasters have identical extents and resolutions."
                )

            # NOTE(review): band order is assumed to be red, green, blue, NIR
            # (per the original notebook comments) -- confirm against how the
            # Sentinel-2 rasters were exported.
            s2_bands = [s2.read(index) for index in (1, 2, 3, 4)]
            s2_nodata = s2.nodata

        # --- Step 3: Flatten Bands ---
        flatbands = [band.flatten() for band in [bathy_data, *s2_bands]]

        # --- Step 4: Mask NoData Values ---
        nodata_values = [bathy_nodata] + [s2_nodata] * len(s2_bands)
        valid_mask = _valid_pixel_mask(flatbands, nodata_values)
        valid_bands = [band[valid_mask] for band in flatbands]

        # --- Step 5: Combine Features and Targets ---
        site_features = np.column_stack(valid_bands[1:])  # Sentinel-2 bands 1-4
        site_targets = valid_bands[0]  # Bathymetry (target)

        images_data[site] = [site_targets, site_features]

    return images_data

# Establish working directories

In [28]:
# Root folder holding the CESWG rasters; both inputs live beneath it.
_CESWG_ROOT = '/mnt/d/eHydro/CESWG'

# Sentinel-2 rasters and eHydro bathymetry rasters, respectively.
S2_PATH = f'{_CESWG_ROOT}/s2_rasters'
BATHY_PATH = f'{_CESWG_ROOT}/bathy_rasters'

In [29]:
# Survey names are the bathymetry file names with the '.tif' extension removed.
# os.listdir returns entries in arbitrary order, so sort them to keep the site
# ordering (and anything derived from it, such as data splits) reproducible.
surveynames = sorted(f[:-4] for f in os.listdir(BATHY_PATH) if f.endswith('.tif'))

In [30]:
# Map each survey name to its [bathymetry path, Sentinel-2 path] pair; both
# rasters share the survey's file name and differ only in their folder.
images = {
    survey: [
        os.path.join(BATHY_PATH, f'{survey}.tif'),
        os.path.join(S2_PATH, f'{survey}.tif'),
    ]
    for survey in surveynames
}

In [None]:
# Extract the data for each image pair. The result is a dict keyed by survey
# name; each value is the [targets, features] list of NumPy arrays built by
# extract_raster_data (not a data frame).

images_data = extract_raster_data(images)

# Prepare data for train_test_split
- k-fold segmentation for training?
- tiling of images or whole images?
- try 3 regression models for now: SVM, RF, and XGBoost
- may try ElasticNet from cuML, and some shallow NNs

In [None]:
# import the needed libraries for the models
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
