This notebook will be used to develop and compare regression models that correlate the eHydro bathymetric surveys with cloud-masked Sentinel-2 surface reflectances. These models will hopefully provide USACE and the eHydro program with a new, robust, accurate tool for unmanned bathymetric estimates. This will be possible at 10-meter resolution at a frequency of up to 5 days.
- First starting with XGBoost, RF, and SVM-RBF regressors in the SWG. May try some NN as well
- band maths here with the green and blue bands (short wavelengths penetrate water columns more)
- include some metadata (AD, CX, BD? Single vs dual beam?)? Will look into more that may be beneficial

In [23]:
import os
import rasterio
from rasterio.warp import reproject, Resampling, calculate_default_transform
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Functions

In [33]:
# Function to normalize an array
def normalize(array):
    """Min-max scale ``array`` to the range [0, 1], ignoring NaNs.

    Args:
        array: Array-like of numbers; NaN entries are skipped when the
            minimum and maximum are computed and stay NaN in the result.

    Returns:
        numpy.ndarray with the same shape as ``array``. When every non-NaN
        value is identical (zero range) the non-NaN entries are returned
        as 0.0 instead of the NaNs a 0/0 division would produce.
    """
    array = np.asarray(array)
    low = np.nanmin(array)
    span = np.nanmax(array) - low

    # A constant (or all-NaN) input has no range to scale by; dividing
    # would turn the whole array into NaN and poison downstream features.
    if span == 0 or np.isnan(span):
        return np.where(np.isnan(array), np.nan, 0.0)

    return (array - low) / span

# Function to read .tif files
def read_tif(file_path):
    """Load a raster file's band data together with its profile.

    Args:
        file_path: Path of the raster file to open with rasterio.

    Returns:
        Tuple ``(array, profile)``: the result of the dataset's ``read()``
        (all bands) and the dataset's ``profile`` metadata.
    """
    with rasterio.open(file_path) as dataset:
        # Pull both pieces out while the dataset is still open.
        bands, metadata = dataset.read(), dataset.profile
    return bands, metadata

# Function to extract features and labels
def extract_features_and_labels(sentinel_tif, bathymetry_tif):
    """Build per-pixel features and depth labels from one image pair.

    Args:
        sentinel_tif: Path to the Sentinel-2 raster; its first four bands
            are used as features (assumed to be R, G, B and NIR).
        bathymetry_tif: Path to the bathymetry raster; its first band is
            used as the label.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape (n_valid_pixels, 4) holding
        the per-image min-max normalized bands and ``y`` has shape
        (n_valid_pixels,) holding the matching bathymetry values.

    Raises:
        ValueError: If the Sentinel-2 raster has fewer than four bands or
            the two rasters do not share the same height and width.
    """
    # Read Sentinel-2 data (RGB and NIR bands)
    sentinel_data, _sentinel_profile = read_tif(sentinel_tif)
    if sentinel_data.shape[0] < 4:
        raise ValueError(
            f"Expected at least 4 Sentinel-2 bands in {sentinel_tif}, "
            f"found {sentinel_data.shape[0]}."
        )
    rgb_nir = sentinel_data[:4]  # Assuming R, G, B, and NIR are the first 4 bands

    # Normalize each band independently.
    # NOTE(review): the min/max are taken over the whole band, so any
    # nodata fill value stored in the Sentinel-2 raster takes part in the
    # scaling -- confirm the inputs are already masked with NaN.
    normalized_rgb_nir = np.array([normalize(band) for band in rgb_nir])

    # Read bathymetry data; only the first band carries the label.
    bathymetry_data, bathymetry_profile = read_tif(bathymetry_tif)
    depth = bathymetry_data[0]
    if depth.shape != rgb_nir.shape[1:]:
        # Without identical grids the flattened pixels would not line up.
        raise ValueError(
            f"Raster shape mismatch: Sentinel-2 {rgb_nir.shape[1:]} vs "
            f"bathymetry {depth.shape}."
        )

    # Flatten so that row i of X and element i of y are the same pixel.
    X = normalized_rgb_nir.reshape(4, -1).T
    y = depth.flatten()

    # Keep a pixel only when its label is real data ...
    valid_mask = ~np.isnan(y) & (y != -9999)
    nodata = bathymetry_profile.get("nodata")
    if nodata is not None and not np.isnan(nodata):
        # Honour the raster's own nodata value as well as the -9999 default.
        valid_mask &= y != nodata
    # ... and all four of its features are finite.
    valid_mask &= np.isfinite(X).all(axis=1)

    return X[valid_mask], y[valid_mask]


In [31]:
def prepare_dataset(file_dict):
    """Pool the per-pixel features and labels of every image pair.

    Args:
        file_dict: Mapping of a name to a sequence whose first element is
            the Sentinel-2 raster path and whose second element is the
            bathymetry raster path.

    Returns:
        Tuple ``(all_X, all_y)``: the feature rows of every pair stacked
        vertically and the matching labels concatenated, in the mapping's
        iteration order.

    Raises:
        ValueError: If ``file_dict`` is empty.
    """
    if not file_dict:
        # np.vstack would raise a far less helpful ValueError here.
        raise ValueError("file_dict is empty: no image pairs to build a dataset from")

    # Only the path pairs are needed; the keys are just labels.
    per_pair = [
        extract_features_and_labels(paths[0], paths[1])
        for paths in file_dict.values()
    ]

    # Concatenate all data
    all_X = np.vstack([features for features, _ in per_pair])
    all_y = np.hstack([labels for _, labels in per_pair])

    return all_X, all_y

In [25]:

def extract_raster_data(raster_dict):
    """Extract the valid pixels of every Sentinel-2 / bathymetry pair.

    Args:
        raster_dict: Mapping of a site name to a sequence of two raster
            paths, ``[sentinel2_path, bathymetry_path]`` -- the same order
            ``prepare_dataset`` expects and the notebook's ``images`` dict
            provides.

    Returns:
        dict mapping each site name to ``[targets, features]`` where
        ``targets`` is a 1-D array of bathymetry values and ``features`` is
        an (n_valid_pixels, 4) array of the Sentinel-2 bands 1-4 (read as
        red, green, blue, NIR).

    Raises:
        ValueError: If the two rasters of a site differ in shape or
            affine transform.
    """
    images_data = {}

    # Loop through each pair in the dictionary
    for site, paths in raster_dict.items():
        s2_path = paths[0]
        bathy_path = paths[1]

        # --- Step 1: Open Bathymetry Raster ---
        with rasterio.open(bathy_path) as bathy:
            bathy_data = bathy.read(1)  # Bathymetry band
            bathy_nodata = bathy.nodata  # NoData value (may be None)
            bathy_transform = bathy.transform
            bathy_shape = bathy.shape

        # --- Step 2: Open Sentinel-2 Raster ---
        with rasterio.open(s2_path) as s2:
            # Pixels are paired by position, so both grids must be identical.
            if s2.shape != bathy_shape or s2.transform != bathy_transform:
                raise ValueError(
                    f"\nRaster shape or transform mismatch for site {site}. "
                    f"\nBathymetry Shape:\n{bathy_shape}, \nSentinel-2 Shape:\n{s2.shape}."
                    f"\nBathymetry Transform:\n{bathy_transform}, \nSentinel-2 Transform:\n{s2.transform}."
                    "\nEnsure Sentinel-2 and bathymetry rasters have identical extents and resolutions."
                )

            # Read Sentinel-2 bands 1-4 (red, green, blue, NIR)
            s2_bands = [s2.read(index) for index in (1, 2, 3, 4)]
            s2_nodata = s2.nodata  # NoData value (may be None)

        # --- Step 3: Flatten Bands ---
        targets = bathy_data.flatten()
        band_columns = [band.flatten() for band in s2_bands]

        # --- Step 4: Mask NoData Values ---
        valid_mask = ~np.isnan(targets)
        if bathy_nodata is not None:
            valid_mask &= targets != bathy_nodata
        for column in band_columns:
            # NaN features are as unusable as a declared nodata value.
            valid_mask &= ~np.isnan(column)
            if s2_nodata is not None:
                valid_mask &= column != s2_nodata

        # --- Step 5: Combine Features and Targets ---
        site_features = np.column_stack([column[valid_mask] for column in band_columns])
        site_targets = targets[valid_mask]

        images_data[site] = [site_targets, site_features]

    return images_data

# Establish working directories

In [10]:
# Directories holding the processed Sentinel-2 and bathymetry GeoTIFFs.
# Set the SDB_S2_PATH / SDB_BATHY_PATH environment variables to run the
# notebook on another machine; the original locations remain the defaults.
S2_PATH = os.environ.get('SDB_S2_PATH', '/home/clay/Documents/SDB/CESWG/processed/S2')
BATHY_PATH = os.environ.get('SDB_BATHY_PATH', '/home/clay/Documents/SDB/CESWG/processed/Bathy')

In [11]:
# Survey names are the bathymetry file names without the '.tif' suffix.
# Sorted because os.listdir returns entries in arbitrary order, which would
# otherwise change the pooled row order (and the seeded split) between runs.
surveynames = sorted(f[:-4] for f in os.listdir(BATHY_PATH) if f.endswith('.tif'))

In [29]:
# Map every survey name to its [Sentinel-2 path, bathymetry path] pair.
# Both rasters of a pair carry the same file name in their own directory.
images = {
    survey: [
        os.path.join(S2_PATH, f'{survey}.tif'),
        os.path.join(BATHY_PATH, f'{survey}.tif'),
    ]
    for survey in surveynames
}

In [None]:
# Pool the per-pixel features (X) and bathymetry labels (y) of every image pair
X, y = prepare_dataset(images)

In [None]:
# Bare expression: shows the pooled feature matrix as the notebook cell's output
X

In [18]:
# Extract the valid pixels of each image pair into a dict keyed by survey name;
# each value is [bathymetry targets, Sentinel-2 band features] as NumPy arrays.

images_data = extract_raster_data(images)

# Prepare data for train_test_split
- k-fold segmentation for training?
- tiling of images or whole images?
- try 3 regression models for now: SVM, RF, and XGBoost
- may try ElasticNet from cuML, and some shallow NNs

In [None]:
# import the needed libraries for the models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [42]:
# Train-test split: hold out 20% of the samples; the fixed seed makes the split repeatable.
# NOTE(review): each row of X is a single pixel pooled across all images, so pixels
# from the same survey can land in both train and test -- consider splitting by survey.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

1. Initial RF Regression:
- R2 Score: 0.3354
- RMSE: 12.8882
- MAE: 9.9499

In [None]:
# Random Forest Model
# Seeded like the train/test split (random_state=42) so the fit and the
# metrics printed below are repeatable; n_jobs=-1 trains on all CPU cores.
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# Score the held-out predictions
r2 = r2_score(y_test, rf_predictions)
rmse = np.sqrt(mean_squared_error(y_test, rf_predictions))
mae = mean_absolute_error(y_test, rf_predictions)
print(f"R2 Score: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

2. Initial SVM regression:
- R2 Score: 
- RMSE: 
- MAE: 

In [None]:
# Support Vector Machine Model (default SVR hyper-parameters)
# NOTE(review): this fits on every training pixel; kernel SVMs are known to
# scale poorly with sample count -- confirm the fit finishes in reasonable time.
svm_model = SVR()
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)

# Score the held-out predictions and report all three metrics at once
mae = mean_absolute_error(y_test, svm_predictions)
rmse = np.sqrt(mean_squared_error(y_test, svm_predictions))
r2 = r2_score(y_test, svm_predictions)
print(
    f"R2 Score: {r2:.4f}",
    f"RMSE: {rmse:.4f}",
    f"MAE: {mae:.4f}",
    sep="\n",
)

3. Initial XGBoost regression:
- R2 Score: 
- RMSE: 
- MAE: 

In [None]:
# XGBoost Model (default XGBRegressor hyper-parameters)
xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)

# Score the held-out predictions and report all three metrics at once
mae = mean_absolute_error(y_test, xgb_predictions)
rmse = np.sqrt(mean_squared_error(y_test, xgb_predictions))
r2 = r2_score(y_test, xgb_predictions)
print(
    f"R2 Score: {r2:.4f}",
    f"RMSE: {rmse:.4f}",
    f"MAE: {mae:.4f}",
    sep="\n",
)

In [None]:
# Fitted regressors keyed by the display name used when reporting their metrics
models = {
    "Random Forest": rf_model,
    "Support Vector Machine": svm_model,
    "XGBoost": xgb_model
}

In [None]:
# Report R2, RMSE, MAE and MAPE on the held-out test set for every fitted model
for name, model in models.items():
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)

    # MAPE divides by the true value, so targets equal to 0 would turn the
    # whole metric into inf/NaN; compute it over the non-zero targets only.
    nonzero = y_test != 0
    if np.any(nonzero):
        mape = np.mean(np.abs((y_test[nonzero] - y_pred[nonzero]) / y_test[nonzero])) * 100
    else:
        mape = float("nan")  # undefined when every target is 0

    print(f"Model: {name}")
    print(f"R2 Score: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"MAPE: {mape:.2f}%")
    print("-" * 30)