This notebook is used to develop and compare regression models that correlate the eHydro bathymetric surveys with cloud-masked Sentinel-2 surface reflectances. These models will hopefully provide USACE and the eHydro program with a new, robust, and accurate tool for unmanned bathymetric estimates. Estimates will be possible at 10-meter resolution at a frequency of up to once every 5 days.
- Will train an RF Regression using RAPIDS/cuML, CatBoost using GPU, a custom NN, and maybe an XGBoost model on GPU

In [None]:
import os
import re
import pickle
import rasterio
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
import geopandas as gpd
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Functions

In [None]:
def plot_histograms(df):
    """Plot a histogram for each of the first nine columns of ``df``.

    The histograms are drawn on a fixed 3x3 subplot grid, so any columns
    beyond the ninth are not plotted. The figure is shown immediately.

    Args:
        df: DataFrame whose columns are plotted; each plotted column is
            passed directly to ``plt.hist`` with 100 bins.
    """
    max_plots = 9  # the 3x3 grid below holds at most nine histograms
    plt.figure(figsize=(15, 10))

    for position, column in enumerate(df.columns[:max_plots], start=1):
        plt.subplot(3, 3, position)  # subplot positions are 1-based
        plt.hist(df[column], bins=100, alpha=0.75, color='blue', edgecolor='black')
        plt.title(f'Histogram of {column}')
        plt.xlabel(column)
        plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()


# Establish working directories

In [None]:
# Working directory for this notebook's data files; the path is
# machine-specific, so edit it to match your own setup.
WORK_DIR = '/mnt/Crucial/SDB/CESWG'

# Load data from parquet file saved in 02_data_prep.ipynb

In [None]:
# Read the prepared dataset from the working directory using the pyarrow engine.
parquet_path = os.path.join(WORK_DIR, 'SDB_data.parquet')
combined_data = pd.read_parquet(parquet_path, engine='pyarrow')

# Compare the different variables

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(12, 10))

heatmap_options = {
    "annot": True,              # write the coefficient inside every cell
    "fmt": ".2f",
    "cmap": "coolwarm",
    "linewidths": 0.5,
    "annot_kws": {"size": 10},
    "cbar_kws": {"shrink": 0.8},
}
correlation_matrix = combined_data.corr()
ax = sns.heatmap(correlation_matrix, **heatmap_options)

# Slant the x labels so long column names do not overlap.
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Summary statistics, transposed so that each variable occupies one row.
combined_data.describe().transpose()

# Prepare data for train_test_split
- Trains per pixel


- k-fold segmentation for training?
- try 3 regression models for now: SVM, RF, and XGBoost
- may try ElasticNet from cuML, and some shallow NNs

In [None]:
# Separate the target column from the predictor columns.
y = combined_data['Bathymetry']
X = combined_data.drop(columns=['Bathymetry'])

# Keep 70% of the rows for training and set the remaining 30% aside.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Halve the held-out rows into validation (15%) and test (15%) sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [None]:
# Report the number of rows that ended up in each split.
print("Training set size:", len(X_train))
print("Validation set size:", len(X_val))
print("Testing set size:", len(X_test))
# X_test.to_csv(os.path.join(WORK_DIR, 'data.csv'), sep='\t', encoding='utf-8', index=False, header=True)

The X columns include 'Blue', 'Green', 'Red', 'NIR', 'Blue/Green', 'Green/Blue', 'Stumpf', 'NSMI', 'TI', 'X', 'Y', and 'Channel_Name_Encoded'.

In [None]:
# Names of the feature columns selected from X_train / X_test / X_val
# to build the reduced feature frames.
testcols = ['Blue', 'Green', 'Stumpf','X', 'Y', 'Channel_Name_Encoded']

In [None]:
# Independent copies of each split restricted to the `testcols` features.
new_train, new_test, new_val = (
    split[testcols].copy() for split in (X_train, X_test, X_val)
)

In [None]:
# Build a series of progressively larger feature subsets of each split.
Xcols = list(X_train.columns)

# Columns present in every subset: the first four and the last three.
constant_cols = Xcols[:4] + Xcols[-3:]

# Remaining columns, kept in their original DataFrame order. Building this
# from a set made the order (and therefore the contents of each subset)
# vary from run to run.
non_constant_cols = [col for col in Xcols if col not in constant_cols]

traindfs = []
testdfs = []
valdfs = []

for extra_col in non_constant_cols:
    # NOTE: `constant_cols` grows in place, so the k-th subset holds the
    # base columns plus the first k remaining columns.
    constant_cols.append(extra_col)
    traindfs.append(X_train[constant_cols].copy())
    testdfs.append(X_test[constant_cols].copy())
    valdfs.append(X_val[constant_cols].copy())

# 1. RF Regression:
- Interesting note: with RF (not sure whether this applies to all bagging or tree-based models), does accuracy decrease as the number of variables increases?

For gridsearch param tuning:
1. {'criterion': 'squared_error',
 'max_depth': 8,
 'max_features': 'sqrt',
 'n_estimators': 200}
2. {'criterion': 'squared_error',
 'max_depth': 12,
 'max_features': 'sqrt',
 'n_estimators': 100}
3. {'criterion': 'squared_error',
 'max_depth': 20,
 'max_features': 'sqrt',
 'n_estimators': 100}
 4. {'criterion': 'friedman_mse',
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 500}

Try cross-validation and binned splitting to ensure less bias during training?

In [None]:
# Hyperparameter grid explored for the random-forest regressor.
param_grid = {
    'n_estimators': [100, 250],
    'max_depth': [None, 500],
    # An integer min_samples_split must be at least 2; scikit-learn rejects 1,
    # which made half of the candidate fits fail.
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt'],
    'criterion': ['squared_error', 'friedman_mse'],
}

rfr = RandomForestRegressor(n_jobs=18, random_state=42)

# Tune with 5-fold cross-validation on the training split only. The search was
# previously fitted on the test split, which leaks the data later used to
# report the final test metrics.
CV_rfr = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=5)
CV_rfr.fit(new_train, y_train)
CV_rfr.best_params_

In [None]:
# Random Forest Model
# better performance without the blue/green and green/blue bands

# Every constructor argument is spelled out explicitly so the configuration
# is visible in one place.
rf_params = dict(
    n_estimators=300,
    max_depth=None,
    criterion='squared_error',
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',  # 'sqrt', 'log2', int, or float
    min_samples_leaf=1,
    min_samples_split=2,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=42,
    verbose=0,
    warm_start=False,
    ccp_alpha=0.0,
    max_samples=None,
    monotonic_cst=None,
)
rf_model = RandomForestRegressor(**rf_params)

# Alternative kept for reference: fit on one of the incremental feature subsets.
# rf_model.fit(traindfs[i], y_train)
# scores = -1 * cross_val_score(rf_model, testdfs[0], y_test, cv=3, scoring= 'neg_root_mean_squared_error', n_jobs=18)
# display(scores.mean())

# Fit on the reduced `testcols` feature frame.
rf_model.fit(new_train, y_train)

In [None]:
# Score the fitted random forest on the test and validation splits.
# Each entry of `train_val_metrics` is [R2, RMSE, MAE] keyed by split name.
train_val_metrics = {}

evaluation_sets = (('test', new_test, y_test), ('val', new_val, y_val))
for split_name, features, target in evaluation_sets:
    predictions = rf_model.predict(features)

    r2_value = r2_score(target, predictions)
    rmse_value = np.sqrt(mean_squared_error(target, predictions))
    mae_value = mean_absolute_error(target, predictions)
    train_val_metrics[split_name] = [r2_value, rmse_value, mae_value]

    print(f"{split_name}")

    # Adjusted R2 penalises R2 by the number of predictors relative to the
    # number of samples.
    n_samples, n_features = features.shape
    a_r2 = 1 - ((1 - r2_value) * (n_samples - 1)) / (n_samples - n_features - 1)
    print(f"Adjusted R2 score: {a_r2}")

    print(f"RMSE Score: {rmse_value} ft")
    print(f"MAE Score: {mae_value} ft")

- ['Blue', 'Green', 'Stumpf','X', 'Y', 'Channel_Name_Encoded']
test
Adjusted R2 score: 0.9556086812104388
RMSE Score: 3.330541268867474 ft
MAE Score: 1.9717861590043395 ft
val
Adjusted R2 score: 0.9556332613989742
RMSE Score: 3.328907595959278 ft
MAE Score: 1.9708214148581906 ft
- ['Blue', 'Green', 'Stumpf', 'TI','X', 'Y', 'Channel_Name_Encoded']
test
Adjusted R2 score: 0.9396524517812164
RMSE Score: 3.883250836483786 ft
MAE Score: 2.340178418710885 ft
val
Adjusted R2 score: 0.9397703292743896
RMSE Score: 3.878627440716498 ft
MAE Score: 2.3377579017410937 ft

# 2. XGBoost Regression

n_estimators=500, learning_rate=0.3, max_depth=10, grow_policy= 'lossguide', booster= 'gbtree',:
- R2 Score= 0.8529
- RMSE= 6.0626
- MAE= 3.9175

In [None]:
# Candidate hyperparameter values for the XGBoost grid search.
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'min_child_weight': [3, 5, 7],
    'gamma': [0.0, 0.1, 0.2],
    'colsample_bytree': [0.3, 0.4],
}

xgb_model = XGBRegressor()

# 3-fold cross-validated search scored by negative MSE, using all cores.
search_options = {
    'cv': 3,
    'scoring': 'neg_mean_squared_error',
    'n_jobs': -1,
    'verbose': 0,
}
grid_search = GridSearchCV(xgb_model, param_grid, **search_options)
grid_search.fit(new_train, y_train)

print("Best set of params:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

'colsample_bytree': 0.4, 'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 5

In [None]:
# Configuration of the XGBoost regressor.
xgb_params = dict(
    n_estimators=250,        # number of boosted trees
    learning_rate=0.5,
    max_depth=20,            # maximum depth of each tree
    colsample_bytree=0.5,
    min_child_weight=5,
    grow_policy='lossguide',
    booster='gbtree',
    gamma=0.0,
    n_jobs=-1,
    random_state=42,         # fixed seed for reproducibility
)
xgb_model = XGBRegressor(**xgb_params)

# Fit on the reduced `testcols` feature frame.
xgb_model.fit(new_train, y_train)

In [None]:
# Score the fitted XGBoost model on the test and validation splits.
# Each entry of `train_val_metrics` is [R2, RMSE, MAE] keyed by split name.
train_val_metrics = {}

evaluation_sets = (('test', new_test, y_test), ('val', new_val, y_val))
for split_name, features, target in evaluation_sets:
    predictions = xgb_model.predict(features)

    r2_value = r2_score(target, predictions)
    rmse_value = np.sqrt(mean_squared_error(target, predictions))
    mae_value = mean_absolute_error(target, predictions)
    train_val_metrics[split_name] = [r2_value, rmse_value, mae_value]

    print(f"{split_name}")

    # Adjusted R2 penalises R2 by the number of predictors relative to the
    # number of samples.
    n_samples, n_features = features.shape
    a_r2 = 1 - ((1 - r2_value) * (n_samples - 1)) / (n_samples - n_features - 1)
    print(f"Adjusted R2 score: {a_r2}")

    print(f"RMSE Score: {rmse_value} ft")
    print(f"MAE Score: {mae_value} ft")

test:
- R2 Score: 0.8458
- Adjusted R2 score: 0.845824251970853
- RMSE: 6.2069
- MAE: 3.9638

val:
- R2 Score: 0.8461
- Adjusted R2 score: 0.8460827592967077
- RMSE: 6.2003
- MAE: 3.9620

# 3. Neural Network

In [None]:
# Display the scaled training features.
# NOTE(review): X_scaled is only assigned in the following cell, so this cell
# raises NameError when the notebook is run top to bottom — confirm intended order.
X_scaled

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
import numpy as np
from tensorflow.keras.metrics import RootMeanSquaredError

# Standardize the features; the fitted scaler is reused for later predictions.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(new_train)

# Small fully connected regression network with dropout after the first
# two hidden layers.
model = Sequential([
    Dense(64, activation='relu', input_shape=(new_train.shape[1],)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1)  # Output layer for regression
])

# Optimise MSE and report RMSE alongside it.
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=[RootMeanSquaredError()]
)

# Train on the *scaled* features. The model was previously fitted on the
# unscaled frame while predictions were made on X_scaled, so training and
# inference saw inputs on different scales.
history = model.fit(
    X_scaled,
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,  # last 20% of the rows are held out for validation
    verbose=1
)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the scaled training features; the scores below are therefore
# computed against the training targets.
y_pred = model.predict(X_scaled)

r2 = r2_score(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)

print(f"MSE: {mse:.4f}")
print(f"R2 Score: {r2:.4f}")

In [None]:
import matplotlib.pyplot as plt

# Training and validation loss recorded by the most recent `model.fit` call.
loss_curves = (('loss', 'Training Loss'), ('val_loss', 'Validation Loss'))

plt.figure(figsize=(10, 6))
for history_key, curve_label in loss_curves:
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Model Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Standardize the target so the network trains on a zero-mean, unit-variance
# signal; the statistics are kept to map predictions back to feet.
y_mean = y_train.mean()
y_std = y_train.std()
y_train_scaled = ((y_train - y_mean) / y_std).to_numpy()

# Scale the training features with the scaler already fitted on `new_train`.
X_train_scaled = scaler.transform(new_train)
X_train_scaled = np.nan_to_num(X_train_scaled, nan=0.0)  # Replace NaNs with 0

# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# NOTE(review): `model` is not re-created here, so training continues from
# whatever weights it already holds — confirm that is intended.
history = model.fit(
    X_train_scaled, y_train_scaled,
    epochs=100, batch_size=32,  # upper bound; early stopping decides when to stop
    validation_split=0.2,
    callbacks=[early_stopping],
    shuffle=True
)


def inverse_transform_predictions(y_pred_scaled):
    """Map standardized predictions back to the original target units.

    Args:
        y_pred_scaled: Predictions on the standardized target scale.

    Returns:
        The predictions multiplied by ``y_std`` and shifted by ``y_mean``.
    """
    return (y_pred_scaled * y_std) + y_mean  # Reverse standardization


# Scale the test features. The scaler was fitted on the `testcols` subset, so
# it must be given `new_test` rather than the full-width `X_test`.
X_test_scaled = scaler.transform(new_test)
X_test_scaled = np.nan_to_num(X_test_scaled, nan=0.0)  # Replace NaNs with 0

# Predict bathymetry
y_pred_scaled = model.predict(X_test_scaled).flatten()  # Flatten ensures it's 1D
y_pred = inverse_transform_predictions(y_pred_scaled)  # Convert back to feet

# RMSE on the held-out test split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Root Mean Squared Error (RMSE): {rmse:.2f} ft")

