# This notebook downloads eHydro bathymetry data from the USACE ArcGIS REST repository and retrieves cloud-masked imagery of the same location and time, for training Satellite-Derived Bathymetry (SDB) model(s) for the National Channel Framework (NCF).

In [17]:
import geopandas as gpd
import fiona
import numpy as np
from rasterio.crs import CRS
from rasterio.features import rasterize
from rasterio.transform import from_origin, from_bounds
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.mask import mask
from datetime import datetime, timedelta
import rasterio
import matplotlib.pyplot as plt
import tqdm
import time
import ee
from osgeo import gdal
import requests
import os
import zipfile

In [2]:
# Initialize the Earth Engine client. Set `project` to your project name (as a string) before running.
ee.Initialize(project = '')

# Functions

In [33]:
def get_gee_search_dates(time):
    """Return the date window used to search for imagery around a survey.

    Args:
        time: Survey timestamp in milliseconds since the Unix epoch (UTC).
            Note that this parameter shadows the ``time`` module inside the
            function body.

    Returns:
        A ``(start, end)`` tuple of ``'YYYY-MM-DD'`` strings: the day before
        and the day after the UTC date of ``time``.
    """
    # Epoch + timedelta replaces datetime.utcfromtimestamp(), which is
    # deprecated since Python 3.12; the resulting naive UTC datetime is the same.
    date_obj = datetime(1970, 1, 1) + timedelta(seconds=time / 1000)
    one_day = timedelta(days=1)
    return (
        (date_obj - one_day).strftime('%Y-%m-%d'),
        (date_obj + one_day).strftime('%Y-%m-%d'),
    )

def ehydro_date_convert(time):
    """Convert an epoch-milliseconds timestamp to a ``'YYYY-MM-DD'`` UTC date string.

    Args:
        time: Timestamp in milliseconds since the Unix epoch (UTC). Note that
            this parameter shadows the ``time`` module inside the function body.

    Returns:
        The UTC calendar date of ``time`` formatted as ``'YYYY-MM-DD'``.
    """
    # Epoch + timedelta replaces datetime.utcfromtimestamp(), which is
    # deprecated since Python 3.12; the resulting naive UTC datetime is the same.
    utc_moment = datetime(1970, 1, 1) + timedelta(seconds=time / 1000)
    return utc_moment.strftime('%Y-%m-%d')

def download_file(url, destination, timeout=60):
    """Stream a remote file to disk while showing a progress bar.

    The payload is first written to ``<destination>.part`` and only moved to
    ``destination`` once the transfer has finished, so an interrupted download
    never leaves a truncated file under the final name.

    Args:
        url: Address of the file to download.
        destination: Local path the file is saved to.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the local file cannot be written or moved.
    """
    # Imported locally under an alias: the file-level ``import tqdm`` binds the
    # *module*, which is not callable, so calling ``tqdm(...)`` raised TypeError.
    from tqdm import tqdm as progress_bar

    partial_path = f"{destination}.part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly instead of saving an HTML error page as the archive.
            response.raise_for_status()
            # A missing content-length becomes None so the bar has no bogus total.
            total_size = int(response.headers.get('content-length', 0)) or None
            with open(partial_path, 'wb') as file, progress_bar(
                desc=f"Downloading {os.path.basename(destination)}",
                total=total_size,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for chunk in response.iter_content(chunk_size=1024):
                    if not chunk:
                        continue
                    file.write(chunk)
                    bar.update(len(chunk))
        os.replace(partial_path, destination)
    finally:
        # After a successful os.replace the partial file no longer exists;
        # this only cleans up after a failed or interrupted transfer.
        if os.path.exists(partial_path):
            os.remove(partial_path)

def get_s2_sr_cld_col(aoi, start_date, end_date, cloud_filter):
    """Build a Sentinel-2 SR collection with the matching s2cloudless image attached.

    Args:
        aoi: Earth Engine geometry used to spatially filter both collections.
        start_date: Start of the date range handed to ``filterDate``.
        end_date: End of the date range handed to ``filterDate``.
        cloud_filter: Upper bound for the ``CLOUDY_PIXEL_PERCENTAGE`` property.

    Returns:
        An ``ee.ImageCollection`` of the filtered surface-reflectance images,
        each carrying its first matching cloud-probability image in the
        ``s2cloudless`` property.
    """
    # Surface-reflectance scenes over the AOI, within the dates, under the cloud limit.
    cloud_limit = ee.Filter.lte('CLOUDY_PIXEL_PERCENTAGE', cloud_filter)
    surface_reflectance = (
        ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
        .filterBounds(aoi)
        .filterDate(start_date, end_date)
        .filter(cloud_limit)
    )

    # s2cloudless cloud-probability scenes for the same area and dates.
    cloud_probability = (
        ee.ImageCollection('COPERNICUS/S2_CLOUD_PROBABILITY')
        .filterBounds(aoi)
        .filterDate(start_date, end_date)
    )

    # Pair the two collections on their shared 'system:index' property.
    same_index = ee.Filter.equals(
        leftField='system:index',
        rightField='system:index',
    )
    joined = ee.Join.saveFirst('s2cloudless').apply(
        primary=surface_reflectance,
        secondary=cloud_probability,
        condition=same_index,
    )
    return ee.ImageCollection(joined)
def export_rasterized_bathymetry_native_crs(gdf, outpath, resolution):
    """
    Rasterizes a bathymetry GeoDataFrame and writes it as a single-band
    float32 GeoTIFF in the GeoDataFrame's original CRS.

    Args:
        gdf: GeoDataFrame with bathymetry polygons and a "depthMean" column.
        outpath: Output file path for the raster.
        resolution: Pixel size, used for both x and y, in the units of gdf's CRS.

    Raises:
        ValueError: If resolution is not positive or gdf has no rows.
    """
    if resolution <= 0:
        raise ValueError(f"resolution must be positive, got {resolution}")
    if len(gdf) == 0:
        raise ValueError("gdf contains no features to rasterize")

    # Get the bounding box of all features
    xmin, ymin, xmax, ymax = gdf.total_bounds

    # Define the transform: the origin is the upper-left corner, so rows run from ymax down
    transform = from_origin(xmin, ymax, resolution, resolution)

    # Prepare (geometry, burn value) pairs
    shapes = list(zip(gdf.geometry, gdf["depthMean"]))

    # Calculate the raster dimensions. Round UP so the partial pixel row/column at
    # the ymin/xmax edge is kept; truncating dropped data there and could produce
    # a zero-sized raster for extents smaller than one pixel.
    height = max(1, int(np.ceil((ymax - ymin) / resolution)))
    width = max(1, int(np.ceil((xmax - xmin) / resolution)))

    # Rasterize the bathymetry data
    raster = rasterize(
        shapes,
        out_shape=(height, width),
        transform=transform,
        fill=np.nan,  # No data outside polygons
        dtype="float32",
    )

    # Save the raster in its original CRS
    with rasterio.open(
        outpath,
        "w",
        driver="GTiff",
        height=height,
        width=width,
        count=1,
        dtype="float32",
        crs=gdf.crs,  # Keep the original CRS
        transform=transform,
        nodata=np.nan,
    ) as dst:
        dst.write(raster, 1)

    print(f"Raster in original CRS saved to {outpath}")

def visualize_raster(path):
    """Plot band 1 of a raster file with a colorbar, using the raster's bounds as axes.

    Args:
        path: Path of the raster file to open with rasterio.
    """
    with rasterio.open(path) as src:
        bathy = src.read(1)
        xmin, ymin, xmax, ymax = src.bounds

    plt.imshow(
        bathy,
        extent=(xmin, xmax, ymin, ymax),
        # Row 0 of a north-up raster (as written with from_origin in this file)
        # is the ymax edge, so it must be drawn at the top; "lower" displayed
        # the data upside down relative to the extent.
        origin="upper",
        cmap="viridis"
    )
    plt.colorbar(label="Depth (Feet)")
    plt.title("Rasterized Bathymetry")
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.show()

# Query bathy data
AVAILABLE FIELD NAMES:
- Field Name: OBJECTID, Type: esriFieldTypeOID
- Field Name: surveyjobidpk, Type: esriFieldTypeString
- Field Name: sdsid, Type: esriFieldTypeString
- Field Name: sdsfeaturename, Type: esriFieldTypeString
- Field Name: sdsmetadataid, Type: esriFieldTypeString
- Field Name: surveytype, Type: esriFieldTypeString
- Field Name: channelareaidfk, Type: esriFieldTypeString
- Field Name: dateuploaded, Type: esriFieldTypeDate
- Field Name: usacedistrictcode, Type: esriFieldTypeString
- Field Name: surveydatestart, Type: esriFieldTypeDate
- Field Name: surveydateend, Type: esriFieldTypeDate
- Field Name: sourcedatalocation, Type: esriFieldTypeString
- Field Name: sourceprojection, Type: esriFieldTypeString
- Field Name: mediaidfk, Type: esriFieldTypeString
- Field Name: projectedarea, Type: esriFieldTypeDouble
- Field Name: sdsfeaturedescription, Type: esriFieldTypeString
- Field Name: dateloadedenterprise, Type: esriFieldTypeDate
- Field Name: datenotified, Type: esriFieldTypeDate
- Field Name: sourcedatacontent, Type: esriFieldTypeString
- Field Name: plotsheetlocation, Type: esriFieldTypeString
- Field Name: sourceagency, Type: esriFieldTypeString
- Field Name: globalid, Type: esriFieldTypeGlobalID
- Field Name: Shape__Area, Type: esriFieldTypeDouble
- Field Name: Shape__Length, Type: esriFieldTypeDouble

For training the model, we will probably want to include options for filtering by:
- usace district
- time of year (date and season)
- NCF ID
- survey type (single vs dual beam; XC, BD, AD, etc.)

In [4]:
# Search parameters for the eHydro query and the Sentinel-2 availability check

s2_cloud_cov = 20 ## maximum CLOUDY_PIXEL_PERCENTAGE accepted for a Sentinel-2 scene; lower means you see more surface
search_date = '2018-01-01'  # Date threshold: only surveys whose surveydatestart is on or after this date are requested
usace_code = "CESWG"        # USACE district code to filter on: Galveston District (for now)

NUM_OF_QUERIES = 3          # maximum number of paged requests sent to the feature service
QUERY_TIME_DELAY = 2        # delay in seconds between successive paged requests
URL = "https://services7.arcgis.com/n1YM8pTrFmm7L4hs/ArcGIS/rest/services/eHydro_Survey_Data/FeatureServer/0/query"
DOWNLOAD_DIR = f'/home/clay/Documents/SDB/{usace_code}/bathy'


In [None]:
# Parameters for the initial query
params = {
    'where': f"surveydatestart >= '{search_date}' AND usacedistrictcode='{usace_code}'",
    'outFields': '*',  # Retrieve all fields
    'resultRecordCount': 2000,  # Maximum records per request
    'resultOffset': 0,  # Starting offset
    'f': 'json',  # Output format
    'outSR': '4326',  # Spatial reference
}

all_features = []

# Page through the results, at most NUM_OF_QUERIES requests.
for _ in range(NUM_OF_QUERIES):
    response = requests.get(URL, params=params, timeout=60)
    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        break

    data = response.json()
    # The service can report a failure inside the JSON body of an HTTP 200
    # response; surface it instead of treating it as "no more features".
    if 'error' in data:
        print(f"Error: {data['error']}")
        break

    features = data.get('features', [])
    if not features:
        break
    all_features.extend(features)
    # Advance by what was actually received: if the server returns fewer
    # records than requested, stepping by resultRecordCount would skip records.
    params['resultOffset'] += len(features)
    print(f"Retrieved {len(features)} features.")
    time.sleep(QUERY_TIME_DELAY)  # Pause between requests (QUERY_TIME_DELAY seconds)

1. Extract date and aoi from the surveys for GEE
- plan is to iterate through the queries (probably by district code) and check to see if GEE has a corresponding Sentinel-2 image

In [9]:
# Map each survey job ID to [area of interest, (search start date, search end date)].
geeinfo = {}
dates = []
for feature in all_features:
    attributes = feature['attributes']
    survey_start = attributes['surveydatestart']

    # Keep the survey start date as a 'YYYY-MM-DD' string.
    dates.append(ehydro_date_convert(survey_start))

    # The area of interest is built from the first ring of the survey geometry.
    area = ee.Geometry.Polygon(feature['geometry']['rings'][0])
    date_tuple = get_gee_search_dates(survey_start)

    geeinfo[attributes['surveyjobidpk']] = [area, date_tuple]

surveykeys = list(geeinfo)

2. Iterate through responses and check if GEE has corresponding image(s)
- if not, the survey gets no image collection attached and is filtered out in the next step

In [None]:
# Attach the Sentinel-2 collection to every survey that has at least one image.
for survey, items in geeinfo.items():
    aoi, dates = items[0], items[1]
    start_date, end_date = dates[0], dates[1]

    coll = get_s2_sr_cld_col(aoi, start_date, end_date, s2_cloud_cov)

    # getInfo() fetches the collection size from Earth Engine.
    if coll.size().getInfo() <= 0:
        continue

    # `items` is the list stored in geeinfo[survey], so this extends that entry.
    items.append(coll)

Get all surveys that have initial images

In [22]:
# A survey entry longer than [aoi, dates] had an image collection appended to it.
goodsurveys = [survey for survey, items in geeinfo.items() if len(items) > 2]

if not goodsurveys:
    print('No appropriate images were found')

3. Extract the eHydro bathy data download urls

In [None]:
# Map every survey job ID to its eHydro download URL.
# The ID is read from the feature itself and all features are covered: the
# earlier version only handled the first 10 features and paired them by
# position with the de-duplicated key list, which left most surveys without a
# URL and could pair a survey with the wrong URL.
bathyinfo = {
    feature['attributes']['surveyjobidpk']: feature['attributes']['sourcedatalocation']
    for feature in all_features
}

4. Download the eHydro data locally for training models

In [None]:
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Surveys that could not be downloaded, reported in the summary below.
failed_surveys = []

for survey in goodsurveys:
    # Look the URL up outside the try block: indexing bathyinfo inside the
    # except handler could itself raise KeyError and abort the whole loop.
    url = bathyinfo.get(survey)
    if not url:
        print(f"No download URL recorded for {survey}; skipping.")
        failed_surveys.append(survey)
        continue

    file_path = os.path.join(DOWNLOAD_DIR, f"{survey}.zip")
    try:
        download_file(url, file_path)
    except Exception as e:  # best effort: one bad survey must not stop the rest
        print(f"Failed to download {url}: {e}")
        failed_surveys.append(survey)

print('='*250)
if failed_surveys:
    print(f"{len(failed_surveys)} of {len(goodsurveys)} surveys were not downloaded: {failed_surveys}")
else:
    print("All files downloaded.")

# Unzip downloaded data

In [5]:
# Extract every downloaded archive into a folder named after it, then delete the archive.
zipnames = [f[:-4] for f in os.listdir(DOWNLOAD_DIR) if f.endswith('.zip')]
for name in zipnames:
    zipfile_path = os.path.join(DOWNLOAD_DIR, f'{name}.zip')
    try:
        with zipfile.ZipFile(zipfile_path, 'r') as zip_ref:
            zip_ref.extractall(zipfile_path[:-4])
    except zipfile.BadZipFile as e:
        # A corrupt or truncated archive should not stop the remaining ones.
        print(f"Skipping invalid archive {zipfile_path}: {e}")
        continue
    # Remove only after the archive has been closed; deleting a file that is
    # still open fails on Windows.
    os.remove(zipfile_path)

# Survey folders are the non-hidden directories in DOWNLOAD_DIR; stray files
# (e.g. an archive that failed to extract) are not surveys.
surveynames = [
    f for f in os.listdir(DOWNLOAD_DIR)
    if not f.startswith('.') and os.path.isdir(os.path.join(DOWNLOAD_DIR, f))
]

# Get the .gdb file, specifically the "Bathymetry_Vector" layer and the "depthMean" and "geometry" columns
- gets bathymetry raster in the native crs within eHydro
- Sentinel-2 retrieval notebook will get the bbox and convert to EPSG:4326 for image searching

In [None]:
# Maps survey name -> [GeoDataFrame of the bathymetry layer, path of the exported raster]
gdbinfo = {}
output_dir = DOWNLOAD_DIR + '_rasters'
os.makedirs(output_dir, exist_ok=True)

bathy_layer = 'Bathymetry_Vector'
# 32.8084 ft is roughly equal to 10 meters (assumes the survey CRS is in feet - TODO confirm per survey)
raster_resolution = 32.8084

for name in surveynames:
    folder_path = os.path.join(DOWNLOAD_DIR, name)
    if not os.path.isdir(folder_path):
        print(f"Skipping {name}: not a survey folder")
        continue

    # Sorted so the same geodatabase is picked on every run if several exist.
    gdbfiles = sorted(
        os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.gdb')
    )
    if not gdbfiles:
        print(f"Skipping {name}: no .gdb found in {folder_path}")
        continue
    gdbfile = gdbfiles[0]

    # Check the layer exists before reading it, so one incomplete survey does
    # not abort the whole run.
    layers = fiona.listlayers(gdbfile)
    if bathy_layer not in layers:
        print(f"Skipping {name}: layer '{bathy_layer}' not found (available: {layers})")
        continue

    gdf = gpd.read_file(gdbfile, layer=bathy_layer)

    outfile = os.path.join(output_dir, f"{name}.tif")

    export_rasterized_bathymetry_native_crs(gdf, outfile, raster_resolution)
    gdbinfo[name] = [gdf, outfile]

In [None]:
# for key in gdbinfo.keys():
#     visualize_raster(gdbinfo[key][1])

# Now, go to 01b_get_s2.ipynb to use the extent of the valid data in the created bathymetry rasters to get cloud-masked Sentinel-2 L2A products from Google Earth Engine.