This notebook will be used to develop and compare regression models to correlate the eHydro bathymetric surveys with cloud-masked Sentinel-2 surface refelctances. These models will hopefully provide USACE and the eHydro program with a new, robust, accurate tool for unmanned bathymetric estiamtes. This will be possible at 10-meter resolution at a frequency of up to 5 days.
- First starting with XGBoost, RF, and SVM-RBF regressors in the SWG. May try some NN as well
- band maths here with the green and blue bands (short wavelengths penetrate water columns more)
- include some metadata (AD, CX, BD? Single vs dual beam?)? Will look into more that may be beneficial

In [21]:
import os
import random
import rasterio
from rasterio.warp import reproject, Resampling, calculate_default_transform
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Functions

In [14]:
# Function to normalize an array
def normalize(array):
    return (array - np.nanmin(array)) / (np.nanmax(array) - np.nanmin(array))

# Function to read .tif files
def read_tif(file_path):
    with rasterio.open(file_path) as src:
        array = src.read()  # Read all bands
        profile = src.profile  # Get metadata (optional, for reference)
    return array, profile

def extract_raster_data(pair_tuple):
    images_data = []

    for paths in pair_tuple:
        bathy_path = paths[1]
        s2_path = paths[0]

        # --- Step 1: Open Bathymetry Raster ---
        with rasterio.open(bathy_path) as bathy:
            bathy_data = bathy.read(1)  # Bathymetry data (band 1)
            bathy_nodata = bathy.nodata  # NoData value
            bathy_transform = bathy.transform
            bathy_shape = bathy.shape

        # --- Step 2: Open Sentinel-2 Raster ---
        with rasterio.open(s2_path) as s2:
            if s2.shape != bathy_shape or s2.transform != bathy_transform:
                raise ValueError(
                    f"Inconsistent shapes or transforms:\n"
                    f"Bathymetry Shape: {bathy_shape}, Sentinel-2 Shape: {s2.shape}.\n"
                    f"Bathymetry Transform: {bathy_transform}, Sentinel-2 Transform: {s2.transform}.\n"
                    f"Ensure rasters have identical extents and resolutions."
                )

            # Read Sentinel-2 bands
            bands = {
                "red": normalize(s2.read(3)),
                "green": normalize(s2.read(2)),
                "blue": normalize(s2.read(1)),
                "nir": normalize(s2.read(4))
            }
            s2_nodata = s2.nodata  # Sentinel-2 NoData value

        # --- Step 3: Flatten Bands ---
        flat_bathy = bathy_data.flatten()
        flat_bands = {key: band.flatten() for key, band in bands.items()}

        # --- Step 4: Mask NoData Values ---
        valid_mask = (
            ~np.isnan(flat_bathy) &  # Valid bathy pixels
            (flat_bathy != bathy_nodata)  # Exclude bathy NoData
        )

        for band in flat_bands.values():
            valid_mask &= (band != s2_nodata)  # Exclude Sentinel-2 NoData

        # Apply the mask
        valid_bathy = flat_bathy[valid_mask].reshape(-1, 1)  # Reshape bathy to (n_pixels, 1)
        valid_features = np.column_stack([band[valid_mask] for band in flat_bands.values()])

        # --- Step 5: Combine Features and Targets ---
        combined_features = np.concatenate((valid_bathy, valid_features), axis=1)  # Combine bathy and S2
        images_data.append((combined_features, valid_bathy.flatten()))  # Flatten bathy for targets

    return images_data

def prepare_data(pairs):
    X = []
    y = []
    for features, targets in pairs:
        X.append(features[:,1:])  # Keep features from this pair
        y.append(targets)   # Keep corresponding targets
    return np.vstack(X), np.hstack(y)

# Establish working directories

In [4]:
S2_PATH = '/home/clay/Documents/SDB/CESWG/processed/S2'
BATHY_PATH = '/home/clay/Documents/SDB/CESWG/processed/Bathy'

In [None]:
# get paths to each image for each eHydro-Sentinel2 image pair

images = []
for name in [f[:-4] for f in os.listdir(BATHY_PATH) if f.endswith('.tif')]:
    images.append((os.path.join(S2_PATH, f'{name}.tif'), os.path.join(BATHY_PATH, f'{name}.tif')))

images_data = extract_raster_data(images)

# Sentinel-2 band manipulation and including other data (good reference is Chybicki et al. 2023)
- Blue Green ratios (Blue/Green, Green/Blue, looking for others)
- Stumpf log ratio of blue green (https://aslopubs.onlinelibrary.wiley.com/doi/10.4319/lo.2003.48.1_part_2.0547)
- Coordinates
- NCF channel ID name
- Survey type
- Single vs Double beam
- Spectral indices?? (NDVI and NDWI, NDWI could make sense but would leave this to the end)


The Chybicki 2023 paper had models perform extremely well when including all bands, the Stumpf log ratio, and the UTM coordinates. I think including more blue-green ratios and the survey type as well will increase my accuracy

In [None]:
def blue_green_ratios(pairs):
    s2 = pairs[0][0][:, 1:]     # in order B, G, R, NIR
    bathy = pairs[0][1]

    bluegreen = images_data[0][0][:, 1] / images_data[0][0][:, 2] 
    greenblue = images_data[0][0][:, 2] / images_data[0][0][:, 1] 

    

In [None]:
# first, create simple ratios of blue/green and green/blue
# blue penetrates, while green attenuates
# can check correlation between features and apply PCA if needed

bluegreen = images_data[0][0][:, 1] / images_data[0][0][:, 2] 
greenblue = images_data[0][0][:, 2] / images_data[0][0][:, 1] 


In [None]:
# Stumpf log ratio
lograt = np.log(images_data[0][0][:, 1]) / np.log(images_data[0][0][:, 2])

In [None]:
# Coordinates (of survey or S2, they should match)
# extract UTM coords

file_name = [os.path.join(S2_PATH, f) for f in os.listdir(S2_PATH)][0]
with rasterio.open(file_name) as src:
            band1 = src.read(1)
            print('Band1 has shape', band1.shape)
            height = band1.shape[0]
            width = band1.shape[1]
            cols, rows = np.meshgrid(np.arange(width), np.arange(height))
            xs, ys = rasterio.transform.xy(src.transform, rows, cols)
            lons= np.array(xs)
            lats = np.array(ys)
            print('lons shape', lons.shape)

In [None]:
# NCF Channel ID from the file names



In [None]:
# Survey type from the name


In [None]:
# Single vs double beam?


In [None]:
# check the correlation between each feature

df = pd.DataFrame(X_train, columns=['Blue', 'Green', 'Red', 'NIR'])
df['Bathymetry'] = y_train

# Correlation matrix
print(df.corr())

# Prepare data for test_train_split
- k-fold segmentation for training?
- tiling of images or whole images?
- try 3 regression models for now: SVM, RF, and XGBoost
- may try ElasticNet from cuML, and some shallow NNs

In [115]:
# Define split ratios
train_ratio = 0.7  # 70% for training
val_ratio = 0.2    # 20% for validation
test_ratio = 0.1   # 10% for testing

train_pairs, temp_pairs = train_test_split(images_data, test_size=(1 - train_ratio), random_state=42)
val_pairs, test_pairs = train_test_split(temp_pairs, test_size=(test_ratio / (val_ratio + test_ratio)), random_state=42)

In [134]:
# Prepare data for each split
X_train, y_train = prepare_data(train_pairs)
X_val, y_val = prepare_data(val_pairs)
X_test, y_test = prepare_data(test_pairs)

1. XGBoost Regression

In [None]:

# Define and configure the XGBoost regressor
xgb_model = XGBRegressor(
    n_estimators=200,      # Number of trees
    learning_rate=0.1,     # Learning rate
    max_depth=6,           # Maximum tree depth
    random_state=42        # Random seed for reproducibility
)

# Train the model
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],  # For validation during training
)

In [None]:
xgb_predictions = xgb_model.predict(X_test)

r2 = r2_score(y_test, xgb_predictions)
rmse = np.sqrt(mean_squared_error(y_test, xgb_predictions))
mae = mean_absolute_error(y_test, xgb_predictions)
print(f"R2 Score: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

1. RF Regression:
- R2 Score: 0.3354
- RMSE: 12.8882
- MAE: 9.9499

In [None]:
# Random Forest Model
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

r2 = r2_score(y_test, rf_predictions)
rmse = np.sqrt(mean_squared_error(y_test, rf_predictions))
mae = mean_absolute_error(y_test, rf_predictions)
print(f"R2 Score: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

2. SVM regression:
- R2 Score: 
- RMSE: 
- MAE: 

In [None]:
# Support Vector Machine Model
svm_model = SVR()
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)


r2 = r2_score(y_test, svm_predictions)
rmse = np.sqrt(mean_squared_error(y_test, svm_predictions))
mae = mean_absolute_error(y_test, svm_predictions)
print(f"R2 Score: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")