# This notebook downloads eHydro bathymetry data from the USACE ArcGIS REST repository and checks Google Earth Engine for Sentinel-2 imagery of the same location and survey date, to build training data for Satellite Derived Bathymetry (SDB) model(s) for the National Channel Framework (NCF). The cloud-masked imagery itself is retrieved in 01b_get_s2.ipynb.

In [1]:
import requests
import os
import zipfile
import numpy as np
import ee
from osgeo import gdal
import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.mask import mask
from datetime import datetime, timedelta
import time
from tqdm import tqdm
import geopandas as gpd
from pykrige.ok import OrdinaryKriging  # PyKrige for Kriging interpolation
import fiona

In [None]:
# Enter your project name here as a string to initialize exchanges with the ee API.
ee.Initialize(project='')

# Functions

In [8]:
def get_gee_search_dates(time):
    """Return the date window (day before, day after) around a timestamp.

    Args:
        time: Epoch timestamp in milliseconds, interpreted as UTC.

    Returns:
        Tuple ``(start, end)`` of 'YYYY-MM-DD' strings: one day before and one
        day after the UTC date-time of ``time``.
    """
    # Plain epoch arithmetic replaces datetime.utcfromtimestamp(), which is
    # deprecated since Python 3.12; it also accepts negative (pre-1970) values
    # on every platform.
    date_obj = datetime(1970, 1, 1) + timedelta(milliseconds=time)
    one_day = timedelta(days=1)
    return ((date_obj - one_day).strftime('%Y-%m-%d'), (date_obj + one_day).strftime('%Y-%m-%d'))

def ehydro_date_convert(time):
    """Convert an epoch timestamp in milliseconds to a 'YYYY-MM-DD' UTC date string.

    Args:
        time: Epoch timestamp in milliseconds, interpreted as UTC.

    Returns:
        The UTC calendar date of ``time`` formatted as 'YYYY-MM-DD'.
    """
    # Epoch arithmetic instead of the deprecated datetime.utcfromtimestamp().
    return (datetime(1970, 1, 1) + timedelta(milliseconds=time)).strftime('%Y-%m-%d')

def download_file(url, destination, timeout=60):
    """Stream ``url`` to ``destination`` while showing a tqdm progress bar.

    Args:
        url: Address of the file to download.
        destination: Local path the content is written to.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never saved as if it were the requested file.
        requests.RequestException: On connection problems or timeouts.
    """
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        # Servers may omit content-length; tqdm then shows an open-ended bar.
        total_size = int(response.headers.get('content-length', 0))

        completed = False
        try:
            with open(destination, 'wb') as file, tqdm(
                desc=f"Downloading {os.path.basename(destination)}",
                total=total_size,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:  # skip keep-alive chunks
                        file.write(chunk)
                        bar.update(len(chunk))
            completed = True
        finally:
            # Do not leave a truncated file behind if the transfer was cut short.
            if not completed and os.path.exists(destination):
                os.remove(destination)

def get_s2_sr_cld_col(aoi, start_date, end_date, cloud_filter):
    """Return Sentinel-2 SR images joined with their s2cloudless counterparts.

    Args:
        aoi: Geometry used to spatially filter both collections.
        start_date: Start of the date filter.
        end_date: End of the date filter.
        cloud_filter: Upper limit for the CLOUDY_PIXEL_PERCENTAGE property.

    Returns:
        An ee.ImageCollection of the filtered S2 SR images, each carrying its
        first matching cloud-probability image under the 's2cloudless' property.
    """
    # Surface-reflectance scenes: inside the AOI, inside the date window,
    # and at or below the requested cloudy-pixel percentage.
    max_cloud = ee.Filter.lte('CLOUDY_PIXEL_PERCENTAGE', cloud_filter)
    sr_images = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
    sr_images = sr_images.filterBounds(aoi).filterDate(start_date, end_date).filter(max_cloud)

    # Cloud-probability scenes for the same area and dates.
    cloud_prob_images = ee.ImageCollection('COPERNICUS/S2_CLOUD_PROBABILITY')
    cloud_prob_images = cloud_prob_images.filterBounds(aoi).filterDate(start_date, end_date)

    # Pair the two collections on their shared 'system:index' property.
    same_index = ee.Filter.equals(leftField='system:index', rightField='system:index')
    joined = ee.Join.saveFirst('s2cloudless').apply(
        primary=sr_images,
        secondary=cloud_prob_images,
        condition=same_index,
    )
    return ee.ImageCollection(joined)

def interpolate_bathymetry(surveyname, resolution, storage_dir):
    """Grid a survey's ``Bathymetry_Vector`` layer with IDW and save a GeoTIFF.

    The layer is interpolated with ``gdal.Grid`` (inverse distance weighting of
    the ``depthMean`` attribute, multiplied by -1), clipped to the union of the
    layer's geometries, and warped to ``resolution`` with bilinear resampling.
    The result is written to ``<storage_dir>/<surveyname>.tif``.

    Args:
        surveyname: Name of the survey folder inside the module-level
            ``DOWNLOAD_DIR``; the folder must contain a ``.gdb`` entry.
        resolution: Output pixel size, in the units of the layer's CRS.
        storage_dir: Directory that receives the final raster.

    Raises:
        FileNotFoundError: If the survey folder holds no ``.gdb`` entry.
        RuntimeError: If ``gdal.Grid`` or ``gdal.Warp`` returns no dataset.
    """
    import tempfile  # local import: only needed for the scratch directory

    folder_path = os.path.join(DOWNLOAD_DIR, surveyname)
    gdb_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.gdb')]
    if not gdb_paths:
        raise FileNotFoundError(f"No .gdb found in {folder_path}")
    input_gdb = gdb_paths[0]
    resampled_raster = os.path.join(storage_dir, f'{surveyname}.tif')
    z_field = "depthMean"  # Attribute containing bathymetry or depth values

    gdf = gpd.read_file(input_gdb, layer="Bathymetry_Vector")
    xmin, ymin, xmax, ymax = gdf.total_bounds  # Extent of the layer

    # Raster size for the requested resolution; never smaller than one cell.
    # (The caller's `resolution` is honoured; it used to be overwritten with 10.)
    width = max(1, round((xmax - xmin) / resolution))
    height = max(1, round((ymax - ymin) / resolution))

    # Intermediate rasters live in a private scratch directory that is removed
    # even when a step fails, and cannot collide with another run.
    with tempfile.TemporaryDirectory(prefix="bathy_") as scratch:
        gridded_raster = os.path.join(scratch, "bathy_interp.tif")
        clipped_raster = os.path.join(scratch, "bath_clip.tif")

        # --- Create the raster using gdal.Grid ---
        grid_ds = gdal.Grid(
            gridded_raster,               # Output raster path
            input_gdb,                    # Input vector data
            format="GTiff",               # Output file format
            algorithm="invdist",          # Interpolation method (IDW)
            zfield=z_field,               # Attribute containing bathymetry values
            # GDAL documents outputBounds as [ulx, uly, lrx, lry], so the upper
            # edge (ymax) comes first to obtain a north-up raster.
            outputBounds=[xmin, ymax, xmax, ymin],
            width=width,                  # Number of columns
            height=height,                # Number of rows
            layers="Bathymetry_Vector",   # Specify the layer
            z_multiply=-1                 # Flip depths to negative
        )
        if grid_ds is None:
            raise RuntimeError(f"gdal.Grid failed for survey {surveyname}")
        grid_ds = None  # Close the dataset so it is flushed to disk before reading

        # --- Clip the raster to the layer geometry ---
        # Combine all geometries into a single boundary (union_all() on newer
        # geopandas, unary_union on older releases).
        geoms = gdf.geometry
        boundary = geoms.union_all() if hasattr(geoms, "union_all") else geoms.unary_union

        with rasterio.open(gridded_raster) as src:
            clipped_image, clipped_transform = mask(
                src, [boundary], crop=True, nodata=np.nan
            )
            clipped_meta = src.meta.copy()
            clipped_meta.update({
                "driver": "GTiff",
                "height": clipped_image.shape[1],
                "width": clipped_image.shape[2],
                "transform": clipped_transform,
                "nodata": np.nan
            })

        with rasterio.open(clipped_raster, "w", **clipped_meta) as dst:
            dst.write(clipped_image[0], 1)  # First (only) band

        # --- Resample the clipped raster to the requested resolution ---
        warp_ds = gdal.Warp(
            resampled_raster,              # Output resampled raster path
            clipped_raster,                # Input clipped raster
            xRes=resolution,               # Pixel size in x direction
            yRes=resolution,               # Pixel size in y direction
            resampleAlg=gdal.GRA_Bilinear, # Bilinear interpolation for resampling
            targetAlignedPixels=True,      # Align pixels to the grid
            dstNodata=np.nan               # NoData value
        )
        if warp_ds is None:
            raise RuntimeError(f"gdal.Warp failed for survey {surveyname}")
        warp_ds = None  # Close/flush the output

    print(f"Resampled raster saved to: {resampled_raster}")


# Densify lines and extract points
def extract_points_from_contours(gdf, spacing, z_field="elevation"):
    """Sample points at a regular interval along each contour geometry.

    Args:
        gdf: Table whose rows expose a ``geometry`` (with ``length`` and
            ``interpolate``) and a value column named ``z_field``.
        spacing: Distance between consecutive samples along a geometry, in the
            geometry's own units. Must be positive.
        z_field: Name of the column holding the value assigned to every point
            sampled from a row. It used to be read from an undefined global,
            which raised NameError.

    Returns:
        Tuple ``(points, depths)`` of equal-length lists: the sampled points and
        the ``z_field`` value of the row each point came from.

    Raises:
        ValueError: If ``spacing`` is not positive.
    """
    if spacing <= 0:
        raise ValueError(f"spacing must be positive, got {spacing}")

    points = []
    depths = []
    for _, row in gdf.iterrows():
        line = row.geometry
        # Rows without a usable geometry contribute no samples.
        if line is None or line.is_empty:
            continue
        depth = row[z_field]
        # Distances 0, spacing, 2*spacing, ... strictly below the line length.
        for distance in np.arange(0, line.length, spacing):
            points.append(line.interpolate(distance))
            depths.append(depth)
    return points, depths

def interpolate_bathymetry_with_kriging_from_contours(surveyname, resolution, storage_dir):
    """Krige a survey's elevation contours onto a grid and save it in EPSG:4326.

    Points are sampled along the ``ElevationContour_ALL`` layer, interpolated
    with ordinary kriging (linear variogram) on a regular grid in the layer's
    own CRS, clipped to the union of the contour geometries, and reprojected to
    EPSG:4326. The result is written to
    ``<storage_dir>/<surveyname>_epsg4326.tif``.

    Args:
        surveyname: Name of the survey folder inside the module-level
            ``DOWNLOAD_DIR``; the folder must contain a ``.gdb`` entry.
        resolution: Grid cell size, in the units of the layer's CRS. Contours
            are sampled at half this distance.
        storage_dir: Directory that receives the final raster.

    Raises:
        FileNotFoundError: If the survey folder holds no ``.gdb`` entry.
        ValueError: If the layer has no CRS or yields no finite sample points.
        RuntimeError: If ``gdal.Warp`` returns no dataset.
    """
    import tempfile  # local import: only needed for the scratch directory

    folder_path = os.path.join(DOWNLOAD_DIR, surveyname)
    gdb_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.gdb')]
    if not gdb_paths:
        raise FileNotFoundError(f"No .gdb found in {folder_path}")
    input_gdb = gdb_paths[0]
    reprojected_raster = os.path.join(storage_dir, f'{surveyname}_epsg4326.tif')

    # --- Step 1: Load contours and extract sample points ---
    gdf = gpd.read_file(input_gdb, layer="ElevationContour_ALL")
    if gdf.crs is None:
        raise ValueError(f"Layer ElevationContour_ALL of {surveyname} has no CRS; cannot reproject")
    spacing = resolution / 2  # Half the grid resolution for point extraction
    points, depths = extract_points_from_contours(gdf, spacing)

    coords = np.array([(point.x, point.y) for point in points], dtype=float)
    values = np.asarray(depths, dtype=float)
    finite = np.isfinite(values)  # kriging cannot use missing elevations
    coords, values = coords[finite], values[finite]
    if len(values) == 0:
        raise ValueError(f"No usable contour points found for survey {surveyname}")

    # --- Step 2: Define the grid (pixel centres, ascending in x and y) ---
    xmin, ymin = coords.min(axis=0)
    xmax, ymax = coords.max(axis=0)
    n_cols = max(1, int(np.ceil((xmax - xmin) / resolution)))
    n_rows = max(1, int(np.ceil((ymax - ymin) / resolution)))
    grid_x = xmin + (np.arange(n_cols) + 0.5) * resolution
    grid_y = ymin + (np.arange(n_rows) + 0.5) * resolution
    top = ymin + n_rows * resolution  # upper edge of the top row of cells

    # --- Step 3: Kriging interpolation ---
    kriging_model = OrdinaryKriging(
        coords[:, 0], coords[:, 1], values, variogram_model="linear"
    )
    grid_z, _ = kriging_model.execute("grid", grid_x, grid_y)

    # execute("grid", ...) returns an array shaped (len(grid_y), len(grid_x))
    # whose rows follow grid_y, i.e. south to north here. The transform below
    # is north-up, so the rows are flipped (not transposed) before writing.
    raster = np.flipud(np.ma.filled(grid_z, np.nan)).astype(np.float32)

    transform = rasterio.transform.from_origin(xmin, top, resolution, resolution)
    meta = {
        "driver": "GTiff",
        "dtype": "float32",
        "nodata": np.nan,
        "width": n_cols,
        "height": n_rows,
        "count": 1,
        # The grid coordinates come from the layer, so the raster carries the
        # layer's CRS; EPSG:4326 is only the target of the final warp.
        "crs": gdf.crs.to_wkt(),
        "transform": transform
    }

    # Intermediate rasters live in a private scratch directory that is removed
    # even when a step fails.
    with tempfile.TemporaryDirectory(prefix="bathy_krige_") as scratch:
        kriged_raster = os.path.join(scratch, "bathy_interp_kriging.tif")
        clipped_raster = os.path.join(scratch, "bathy_clip.tif")

        with rasterio.open(kriged_raster, "w", **meta) as dst:
            dst.write(raster, 1)

        # --- Step 4: Clip to the union of the contour geometries ---
        # NOTE(review): the clip shape is built from the contour geometries
        # themselves; confirm that clipping to these (rather than to a survey
        # footprint polygon) is the intended extent.
        geoms = gdf.geometry
        boundary = geoms.union_all() if hasattr(geoms, "union_all") else geoms.unary_union
        with rasterio.open(kriged_raster) as src:
            clipped_image, clipped_transform = mask(src, [boundary], crop=True, nodata=np.nan)
            clipped_meta = src.meta.copy()
            clipped_meta.update({
                "driver": "GTiff",
                "height": clipped_image.shape[1],
                "width": clipped_image.shape[2],
                "transform": clipped_transform,
                "nodata": np.nan
            })

        with rasterio.open(clipped_raster, "w", **clipped_meta) as dst:
            dst.write(clipped_image[0], 1)

        # --- Step 5: Reproject to EPSG:4326 ---
        # `resolution` is expressed in the source CRS units, so it is not
        # passed as xRes/yRes here (they would be read as degrees); GDAL
        # derives the output pixel size from the source raster instead.
        warp_ds = gdal.Warp(
            reprojected_raster, clipped_raster, dstSRS="EPSG:4326",
            resampleAlg=gdal.GRA_Bilinear, dstNodata=np.nan
        )
        if warp_ds is None:
            raise RuntimeError(f"gdal.Warp failed for survey {surveyname}")
        warp_ds = None  # Close/flush the output

    print(f"Reprojected raster saved to: {reprojected_raster}")


# Query bathy data
AVAILABLE FIELD NAMES:
- Field Name: OBJECTID, Type: esriFieldTypeOID
- Field Name: surveyjobidpk, Type: esriFieldTypeString
- Field Name: sdsid, Type: esriFieldTypeString
- Field Name: sdsfeaturename, Type: esriFieldTypeString
- Field Name: sdsmetadataid, Type: esriFieldTypeString
- Field Name: surveytype, Type: esriFieldTypeString
- Field Name: channelareaidfk, Type: esriFieldTypeString
- Field Name: dateuploaded, Type: esriFieldTypeDate
- Field Name: usacedistrictcode, Type: esriFieldTypeString
- Field Name: surveydatestart, Type: esriFieldTypeDate
- Field Name: surveydateend, Type: esriFieldTypeDate
- Field Name: sourcedatalocation, Type: esriFieldTypeString
- Field Name: sourceprojection, Type: esriFieldTypeString
- Field Name: mediaidfk, Type: esriFieldTypeString
- Field Name: projectedarea, Type: esriFieldTypeDouble
- Field Name: sdsfeaturedescription, Type: esriFieldTypeString
- Field Name: dateloadedenterprise, Type: esriFieldTypeDate
- Field Name: datenotified, Type: esriFieldTypeDate
- Field Name: sourcedatacontent, Type: esriFieldTypeString
- Field Name: plotsheetlocation, Type: esriFieldTypeString
- Field Name: sourceagency, Type: esriFieldTypeString
- Field Name: globalid, Type: esriFieldTypeGlobalID
- Field Name: Shape__Area, Type: esriFieldTypeDouble
- Field Name: Shape__Length, Type: esriFieldTypeDouble

For training the model, we will probably want to include options for filtering by:
- usace district
- time of year (date and season)
- NCF ID
- survey type (single vs dual beam; XC, BD, AD, etc.)

In [9]:
# Search parameters for the eHydro query and the Sentinel-2 lookup.

# Maximum percentage of clouds allowed in the Sentinel-2 multispectral imagery;
# a lower value means more of the surface is visible.
s2_cloud_cov = 20
# Date threshold: surveys from 2018 to the present.
search_date = '2018-01-01'
# Galveston District (for now).
usace_code = "CESWG"

# Number of iterations for the request to run.
NUM_OF_QUERIES = 3
# Delay between queries in seconds, used when requesting all features.
QUERY_TIME_DELAY = 2
URL = (
    "https://services7.arcgis.com/n1YM8pTrFmm7L4hs/ArcGIS/rest/services/"
    "eHydro_Survey_Data/FeatureServer/0/query"
)


In [None]:
# Parameters for the initial query
params = {
    'where': f"surveydatestart >= '{search_date}' AND usacedistrictcode='{usace_code}'",
    'outFields': '*',  # Retrieve all fields
    'resultRecordCount': 2000,  # Maximum records per request
    'resultOffset': 0,  # Starting offset
    'f': 'json',  # Output format
    'outSR': '4326',  # Spatial reference
}

all_features = []
more_available = False  # whether the last page reported further records

# Page through the results, at most NUM_OF_QUERIES requests.
for i in range(NUM_OF_QUERIES):
    response = requests.get(URL, params=params, timeout=60)
    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        break

    data = response.json()
    # The service can report a failed query inside a 200 response body.
    if 'error' in data:
        print(f"Error: {data['error']}")
        break

    features = data.get('features', [])
    if not features:
        more_available = False
        break
    all_features.extend(features)
    print(f"Retrieved {len(features)} features.")

    # Advance by the number of records actually returned: the server may cap a
    # page below the requested 'resultRecordCount', and stepping by the
    # requested size would then skip records.
    params['resultOffset'] += len(features)
    more_available = bool(data.get('exceededTransferLimit', False))

    if i < NUM_OF_QUERIES - 1:
        time.sleep(QUERY_TIME_DELAY)  # Pause between requests (QUERY_TIME_DELAY seconds)
else:
    # The request cap was reached without running out of features.
    if more_available:
        print(f"Stopped after {NUM_OF_QUERIES} queries; more records are available. "
              "Increase NUM_OF_QUERIES to retrieve them.")

1. Extract date and aoi from the surveys for GEE
- plan is to iterate through the queries (probably by district code) and check to see if GEE has a corresponding Sentinel-2 image

In [11]:
# Collect, per survey id, the survey footprint and the date window to search
# in GEE; also keep the plain survey start dates.
geeinfo = {}
dates = []
for feature in all_features:
    attrs = feature['attributes']
    start_ms = attrs['surveydatestart']

    dates.append(ehydro_date_convert(start_ms))
    # Footprint built from the first ring of the feature geometry.
    area = ee.Geometry.Polygon(feature['geometry']['rings'][0])
    date_tuple = get_gee_search_dates(start_ms)

    geeinfo[attrs['surveyjobidpk']] = [area, date_tuple]

surveykeys = list(geeinfo)

2. Iterate through responses and check if GEE has corresponding image(s)
- if not, the survey is left without an image collection and is excluded from the later steps

In [12]:
# Attach the matching Sentinel-2 collection to every survey that has at least
# one image in its date window.
for survey, items in geeinfo.items():
    aoi, dates = items[0], items[1]

    coll = get_s2_sr_cld_col(aoi, dates[0], dates[1], s2_cloud_cov)
    image_count = coll.size().getInfo()

    if image_count > 0:
        items.append(coll)

Get all surveys that have initial images

In [22]:
# A survey entry holds more than two items only when an image collection was
# appended to it.
goodsurveys = [survey for survey, items in geeinfo.items() if len(items) > 2]

if not goodsurveys:
    print('No appropriate images were found')

3. Extract the eHydro bathy data download urls

In [None]:
# Key each download URL by the feature's own survey id rather than by its
# position in the list, so the mapping stays correct even when several features
# share an id (positional pairing with `surveykeys` would misalign or fail).
bathyinfo = {
    feature['attributes']['surveyjobidpk']: feature['attributes']['sourcedatalocation']
    for feature in all_features
}

4. Download the eHydro data locally for training models

In [None]:
DOWNLOAD_DIR = f'/media/clay/SamsungExt/SDB/eHydro/{usace_code}/bathy'
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

failed_downloads = []  # surveys whose archive could not be fetched
for survey in goodsurveys:
    file_path = os.path.join(DOWNLOAD_DIR, f"{survey}.zip")
    url = bathyinfo.get(survey)
    if not url:
        print(f"Failed to download {url}: no source data location recorded for {survey}")
        failed_downloads.append(survey)
        continue
    try:
        download_file(url, file_path)
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        failed_downloads.append(survey)
        # Drop a partially written archive so later steps do not try to unzip it.
        if os.path.exists(file_path):
            os.remove(file_path)

print('='*250)
if failed_downloads:
    print(f"Finished with {len(failed_downloads)} failed download(s): {failed_downloads}")
else:
    print("All files downloaded.")

# Unzip downloaded data

In [41]:
# Extract every downloaded archive into a folder named after it, then delete
# the archive.
zipnames = [f[:-4] for f in os.listdir(DOWNLOAD_DIR) if f.endswith('.zip')]
for name in zipnames:
    zipfile_path = os.path.join(DOWNLOAD_DIR, f'{name}.zip')
    try:
        with zipfile.ZipFile(zipfile_path, 'r') as zip_ref:
            zip_ref.extractall(zipfile_path[:-4])
    except zipfile.BadZipFile as e:
        # One corrupt archive should not stop the remaining ones.
        print(f"Skipping {name}.zip: not a valid zip archive ({e})")
        continue
    # Remove the archive only after it has been closed.
    os.remove(zipfile_path)

# Survey folders only: stray files (e.g. archives that failed to extract)
# cannot be opened as survey folders by the following cells.
surveynames = [f for f in os.listdir(DOWNLOAD_DIR) if os.path.isdir(os.path.join(DOWNLOAD_DIR, f))]

# Extract the .gdb files from the survey data

In [55]:
# Load both bathymetry layers from the .gdb of every survey folder.
# gdbinfo maps survey name -> [Bathymetry_Vector, ElevationContour_ALL].
gdbinfo = {}
for name in surveynames:
    folder_path = os.path.join(DOWNLOAD_DIR, name)
    gdb_entries = [entry for entry in os.listdir(folder_path) if entry.endswith('.gdb')]
    gdb_file = os.path.join(folder_path, gdb_entries[0])

    gdbinfo[name] = [
        gpd.read_file(gdb_file, layer=layer_name)
        for layer_name in ('Bathymetry_Vector', 'ElevationContour_ALL')
    ]

In [None]:
# Preview the contour layer of up to the first three surveys (slicing avoids
# an IndexError when fewer than three surveys are available).
for name in surveynames[:3]:
    display(gdbinfo[name][1])

# Generate 10m pixel resolution bathymetry rasters from the .gdb files
Each .zip file comes with required and optional files. One of the required files is the .gdb (file geodatabase), which contains the needed bathymetry and geospatial extents. Two layers within the .gdb can be used to get the bathymetry: Bathymetry_Vector or ElevationContour_ALL. Bathymetry_Vector is a layer of polygons, each assigned a mean depth. ElevationContour_ALL is a layer of multipart lines, with each line denoting an elevation. The bathymetry rasters here are generated from the Bathymetry_Vector multipolygon layer.

In [None]:
STORAGE_DIR = f'/media/clay/SamsungExt/SDB/eHydro/{usace_code}/bathy_rasters'
os.makedirs(STORAGE_DIR, exist_ok=True)

# The previously called `interpolate_bathymetry_with_kriging` is not defined in
# this notebook (NameError). `interpolate_bathymetry` grids the
# Bathymetry_Vector layer; to krige the ElevationContour_ALL layer instead,
# call `interpolate_bathymetry_with_kriging_from_contours` with the same
# arguments.
for key in gdbinfo:
    interpolate_bathymetry(key, 10, STORAGE_DIR)

# Now, go to 01b_get_s2.ipynb to use the extent of the valid data in the created bathymetry rasters to get cloud-masked Sentinel-2 L2A products from Google Earth Engine.