This notebook is going to leverage asf_search to retrieve Sentinel-2 .SAFE images that correspond to the eHydro hydrographic surveys. These .SAFE files will then be fed into ACOLITE for the needed preprocessing. Once preprocessed, the images for the hydrographic surveys and the Sentinel-2 images will be fed into 02_data_prep.ipynb to ensure the same area coverage

Note to self: instead of querying eHydro, I can simply reuse the already-created time cubes that are used for csat.

Include:
- OSM for identifying bridges
- Use the TUR and SPM bands to identify adequate imagery, e.g., to reject images that are not clear enough
- Maybe try to get bathy survey bounds and dates directly from API search? Not sure how much more efficient it would actually be

In [1]:
import sys, os
from tqdm import tqdm
from osgeo import gdal
import rasterio
import numpy as np
from pyproj import Transformer
from datetime import datetime, timedelta
import re
import matplotlib.pyplot as plt
from collections import Counter
import asf_search as asf
from shapely.geometry import Polygon
import pandas as pd
import requests
from sentinelhub import (
    SHConfig,
    DataCollection,
    SentinelHubCatalog,
    SentinelHubRequest,
    SentinelHubDownloadClient,
    BBox,
    bbox_to_dimensions,
    CRS,
    MimeType,
    Geometry,
)
import netCDF4 as nc
import xarray as xr
import rioxarray as rxr
from pyproj import CRS

# Functions

In [2]:
def visualize_bathy_raster(path):
    """Plot band 1 of a bathymetry raster in its own map coordinates.

    Args:
        path: Path to a raster that rasterio can open.
    """
    with rasterio.open(path) as src:
        bathy = src.read(1)
        xmin, ymin, xmax, ymax = src.bounds

    plt.imshow(
        bathy,
        extent=(xmin, xmax, ymin, ymax),
        # Row 0 of the array is the top edge of a north-up raster, so it has
        # to be drawn at ymax. origin="lower" put row 0 at ymin and rendered
        # the survey upside down relative to the axis coordinates.
        origin="upper",
        cmap="viridis",
    )
    plt.colorbar(label="Depth (Feet)")
    plt.title("Rasterized Bathymetry")
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.show()

def extract_date(filepath):
    """Return the one-day-either-side search window around a survey date.

    The first run of eight consecutive digits found in ``filepath`` is parsed
    as ``YYYYMMDD``.

    Args:
        filepath: Path or file name that contains the survey date.

    Returns:
        Tuple ``(day_before, day_after)``, each formatted as ``YYYY-MM-DD``.

    Raises:
        ValueError: If ``filepath`` holds no eight-digit run, or the digits do
            not form a valid calendar date.
    """
    match = re.search(r'\d{8}', filepath)
    if match is None:
        # Without this guard the failure surfaced as an AttributeError on None.
        raise ValueError(f"No YYYYMMDD date found in {filepath!r}")

    survey_day = datetime.strptime(match.group(), '%Y%m%d')
    one_day = timedelta(days=1)
    return (
        (survey_day - one_day).strftime('%Y-%m-%d'),
        (survey_day + one_day).strftime('%Y-%m-%d'),
    )

def get_reach_date(surveyname):
    """Pull the reach/date token out of a survey name.

    The token is two letters, two alphanumerics, three or four letters and
    eight digits, joined by underscores (e.g. ``VT_03_MME_20230814``).

    Returns:
        The matched token, or ``None`` when ``surveyname`` contains none.
    """
    found = re.search(r'([A-Za-z]{2}_[A-Za-z0-9]{2}_[A-Za-z]{3,4}_\d{8})', surveyname)
    return found.group(1) if found else None

def extract_valid_bounds_to_epsg4326(raster_path):
    """Return the EPSG:4326 bounding box of a raster's non-NaN pixels as WKT.

    Args:
        raster_path: Path to a raster that rasterio can open. Only band 1 is
            inspected, and only NaN is treated as "no data".

    Returns:
        A ``POLYGON((...))`` WKT string in ``lon lat`` order. The ring is
        closed and listed clockwise starting from the top-left corner.

    Raises:
        ValueError: If band 1 contains no non-NaN pixel.
    """
    with rasterio.open(raster_path) as src:
        data = src.read(1)  # Assuming a single band
        transform = src.transform
        src_crs = src.crs

    valid_rows, valid_cols = np.nonzero(~np.isnan(data))
    if valid_rows.size == 0:
        raise ValueError("No valid data in the raster.")

    # A pixel covers [col, col + 1) x [row, row + 1) in pixel space, so the far
    # edge of the last valid pixel is max + 1. Using the upper-left corner of
    # the last pixel dropped one row and one column from the footprint.
    col_start, col_stop = int(valid_cols.min()), int(valid_cols.max()) + 1
    row_start, row_stop = int(valid_rows.min()), int(valid_rows.max()) + 1
    pixel_corners = [
        (col_start, row_start),
        (col_stop, row_start),
        (col_stop, row_stop),
        (col_start, row_stop),
    ]

    # Reproject all four corners and take the envelope: a rectangle in the
    # source CRS is generally not axis-aligned in lon/lat, and taking min/max
    # also keeps the result independent of the raster's row direction.
    transformer = Transformer.from_crs(src_crs, "EPSG:4326", always_xy=True)
    lon_lat = [transformer.transform(*(transform * corner)) for corner in pixel_corners]
    lons = [lon for lon, _ in lon_lat]
    lats = [lat for _, lat in lon_lat]
    west, east = min(lons), max(lons)
    south, north = min(lats), max(lats)

    ring = [
        (west, north),   # top-left
        (east, north),   # top-right
        (east, south),   # bottom-right
        (west, south),   # bottom-left
        (west, north),   # back to top-left to close the polygon
    ]
    coord_str = ','.join(f'{x} {y}' for x, y in ring)
    return f'POLYGON(({coord_str}))'

def run_acolite_processing(surveyname, processed_acolite_path):
    """Run ACOLITE on every Sentinel-2 .SAFE bundle stored for one survey.

    Relies on the notebook-level globals ``S2_PATH`` (the bundles are read
    from ``<S2_PATH>/<surveyname>``) and ``BATHY_RASTER_PATH`` (which must
    hold ``<reach_date>.tif``; its valid-data footprint is passed to ACOLITE
    as the ``polygon`` setting). Outputs go to
    ``<processed_acolite_path>/<reach_date>``.

    Args:
        surveyname: Survey folder name under ``S2_PATH``. It must contain a
            token that ``get_reach_date`` can parse.
        processed_acolite_path: Root directory for the ACOLITE outputs.

    Raises:
        ValueError: If no reach/date token can be parsed from ``surveyname``.
    """
    import os, sys
    # ACOLITE is used from a local checkout rather than an installed package.
    acolite_dir = os.path.join(os.path.expanduser("~"), 'tools/acolite')
    if acolite_dir not in sys.path:  # avoid piling up duplicates on every call
        sys.path.append(acolite_dir)
    import acolite as ac

    reach_date = get_reach_date(surveyname)
    if reach_date is None:
        # Previously None flowed into os.path.join and failed with a TypeError.
        raise ValueError(f"Could not parse a reach/date token from survey name {surveyname!r}")

    # Create the ACOLITE output directory
    acolite_survey = os.path.join(processed_acolite_path, reach_date)
    os.makedirs(acolite_survey, exist_ok=True)

    # Find all .SAFE bundles for the survey (sorted for a reproducible order)
    survey_dir = os.path.join(S2_PATH, surveyname)
    safe_sets = [
        os.path.join(survey_dir, f)
        for f in sorted(os.listdir(survey_dir))
        if f.endswith('.SAFE')
    ]

    # WKT polygon (EPSG:4326) around the valid pixels of the survey raster
    bounds = extract_valid_bounds_to_epsg4326(
        os.path.join(BATHY_RASTER_PATH, f'{reach_date}.tif')
    )

    settings_file = None

    # Process each SAFE bundle
    for bundle in safe_sets:
        # Parse default/empty settings
        settings = ac.acolite.settings.parse(settings_file)

        # General options
        settings['inputfile'] = bundle
        settings['output'] = acolite_survey
        settings['polygon'] = bounds
        settings['s2_target_res'] = 10  # (10, 20, or 60 for Sentinel-2)
        settings['atmospheric_correction_option'] = 'dark_spectrum'
        settings['merge_tiles'] = True

        # L2W parameters
        settings['l2w_parameters'] = ['rhow_*', 'Rrs_*', 'tur_novoa2017', 'spm_novoa2017']
        settings['dsf_aot_estimate'] = 'fixed'
        settings['dsf_residual_glint_correction'] = True

        # Gains
        settings['gains'] = True

        # Disable the rhot/rhos image (PNG) outputs
        settings['rgb_rhot'] = False
        settings['rgb_rhos'] = False

        # Keep XY in NetCDF (optional)
        settings['output_xy'] = True

        # GeoTIFFs
        settings['l2w_export_geotiff'] = True
        # NOTE(review): rgb_rhow stays enabled while rgb_rhot/rgb_rhos are
        # switched off above -- confirm this RGB output is actually wanted.
        settings['rgb_rhow'] = True
        settings['l1r_export_geotiff'] = False
        settings['l2r_export_geotiff'] = False
        settings['export_geotiff_coordinates'] = True
        settings['export_geotiff_match_file'] = False
        settings['export_cloud_optimized_geotiff'] = False

        # Delete intermediate NetCDF files, but keep L2W
        settings['l1r_delete_netcdf'] = True
        settings['l2r_delete_netcdf'] = True
        # (Do NOT set l2w_delete_netcdf=True or you'll lose the L2W file!)

        # Run ACOLITE processing
        ac.acolite.acolite_run(settings=settings)

def get_access_token(username: str, password: str) -> str:
    """Request an access token from the Copernicus Data Space identity service.

    Sends a password-grant request for the public ``cdse-public`` client.

    Args:
        username: Copernicus Data Space account name.
        password: Password for that account.

    Returns:
        The ``access_token`` field of the server's JSON reply.

    Raises:
        Exception: If the request cannot be sent, or the server answers with
            an error status. The underlying ``requests`` exception is chained.
    """
    data = {
        "client_id": "cdse-public",
        "username": username,
        "password": password,
        "grant_type": "password",
        "scope": "openid"
    }
    try:
        r = requests.post(
            "https://identity.dataspace.copernicus.eu/auth/realms/CDSE/protocol/openid-connect/token",
            data=data,
            timeout=60,  # do not hang the notebook if the service is unreachable
        )
    except requests.exceptions.RequestException as e:
        # No response object exists here; the original handler referenced `r`
        # anyway and masked the real error with an unbound-variable failure.
        raise Exception(
            f"Access token creation failed. No response from the server: {e}"
        ) from e

    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # r.text instead of r.json(): an error body is not guaranteed to be JSON.
        raise Exception(
            f"Access token creation failed. Reponse from the server was: {r.text}"
        ) from e
    return r.json()["access_token"]

def stack_tifs_to_multiband(tif_files, output_file):
    """Stack band 1 of several rasters into one multi-band raster.

    The output reuses the metadata of the first input (with ``count`` set to
    the number of inputs). Band ``i`` of the output is band 1 of
    ``tif_files[i - 1]``, cast to the first input's dtype.

    Args:
        tif_files: Sequence of raster paths, all with the same width/height.
        output_file: Path of the raster to create.

    Raises:
        ValueError: If ``tif_files`` is empty, or an input's dimensions differ
            from the first file's.
    """
    if not tif_files:
        raise ValueError("No input files provided.")

    # The first file supplies the spatial metadata and data type
    with rasterio.open(tif_files[0]) as src:
        meta = src.meta.copy()
        width, height = src.width, src.height
        dtype = src.dtypes[0]

    meta.update({
        'count': len(tif_files),
    })

    # Validate every input up front so a mismatch never leaves a
    # half-written output file behind.
    for tif in tif_files[1:]:
        with rasterio.open(tif) as src:
            if src.width != width or src.height != height:
                raise ValueError(f"File {tif} dimensions do not match the first file.")

    # Write one band at a time instead of holding the whole stack in memory
    with rasterio.open(output_file, 'w', **meta) as dst:
        for band_index, tif in enumerate(tif_files, start=1):
            with rasterio.open(tif) as src:
                dst.write(src.read(1).astype(dtype, copy=False), band_index)

    print(f"Multi-band file written to: {output_file}")

In [3]:
# Text file holding the Sentinel Hub client id and client secret on its first
# two lines, and the Copernicus Browser username and password on its last two.
# Create your own copy; it lives outside the notebook so credentials are not
# typed in (or committed) here.

with open('/home/clay/Desktop/s2_login_stuff.txt') as f:        
    # Kept as raw lines: later cells index `lines` and strip the newline themselves.
    lines = f.readlines()

config = SHConfig()
# rstrip('\r\n') rather than [:-1]: also copes with Windows line endings and
# never eats a real character when the trailing newline is missing.
config.sh_client_id = lines[0].rstrip('\r\n')
config.sh_client_secret = lines[1].rstrip('\r\n')
config.sh_base_url = 'https://sh.dataspace.copernicus.eu'
config.sh_token_url = 'https://identity.dataspace.copernicus.eu/auth/realms/CDSE/protocol/openid-connect/token'

# Set working directory

In [4]:
# Presumably the USACE district code; it selects the data folder used below.
usace_code = 'CESWG'
BATHY_PATH = f'/home/clay/Documents/SDB/{usace_code}/bathy'        # listed below to obtain the survey names
# Holds the <reach_date>.tif rasters read by run_acolite_processing.
BATHY_RASTER_PATH = f'/home/clay/Documents/SDB/{usace_code}/bathy_rasters'
# One sub-folder per survey; downloaded Sentinel-2 products are stored here.
S2_PATH = f'/home/clay/Documents/SDB/{usace_code}/s2_SAFE'
os.makedirs(S2_PATH, exist_ok=True)

In [5]:
# os.listdir returns entries in arbitrary order; sort them so that positional
# lookups such as surveynames[10] pick the same survey on every run.
surveynames = sorted(os.listdir(BATHY_PATH))

# Search for appropriate Sentinel-2 L1C .SAFE files
- .SAFE needed for input into ACOLITE

In [None]:
# Token is requested here and reused by the download cell below.
access_token = get_access_token(lines[-2][:-1], lines[-1][:-1])
data_collection = 'SENTINEL-2'

# survey name -> (download URLs, product names) of the matching L1C products
survey_info = {}
for survey_name in surveynames:

    # NOTE(review): run_acolite_processing reads the survey raster from
    # BATHY_RASTER_PATH/<reach_date>.tif, while this cell reads
    # BATHY_PATH/<survey_name>.tif -- confirm which location is intended.
    raster = os.path.join(BATHY_PATH, f"{survey_name}.tif")
    date = extract_date(raster)
    bounds = extract_valid_bounds_to_epsg4326(raster) 

    # NOTE(review): the upper limit is midnight at the start of the day after
    # the survey, so acquisitions made on that following day are excluded --
    # confirm that is the intended window.
    response = requests.get(f"https://catalogue.dataspace.copernicus.eu/odata/v1/Products?$filter=Collection/Name eq '{data_collection}' and OData.CSC.Intersects(area=geography'SRID=4326;{bounds}') and ContentDate/Start gt {date[0]}T00:00:00.000Z and ContentDate/Start lt {date[1]}T00:00:00.000Z")
    # Fail loudly on an HTTP error instead of a KeyError on the missing 'value' key.
    response.raise_for_status()
    results = pd.DataFrame.from_dict(response.json()['value'])

    if results.empty:
        continue

    urls = []
    s2_names = []
    # Walk name/id pairs directly rather than re-filtering the frame per product.
    for s2_name, product_id in zip(results.Name, results.Id):
        if 'L1C' in s2_name:
            urls.append(f"https://zipper.dataspace.copernicus.eu/odata/v1/Products({product_id})/$value")
            s2_names.append(s2_name)
    survey_info[survey_name] = (urls, s2_names)

# Download Sentinel-2 L1C .SAFE files from the Copernicus hub
- iterate by eHydro name. Create a folder for each survey
- store all .SAFE files in designated survey folder
- .SAFE files named appropriately as stored in .items
- Mosaic together during ACOLITE processing, or in 02_data_prep.ipynb

Products that did not work:
- S2A_MSIL1C_20221025T165401_N0510_R026_T15RTN_20240728T042150.SAFE
- S2A_MSIL1C_20230413T164841_N0509_R026_T15RUN_20230413T220740.SAFE
- S2A_MSIL1C_20230413T164841_N0510_R026_T15RUN_20240824T151650.SAFE
- S2B_MSIL1C_20230518T164849_N0509_R026_T15RVP_20230518T220853.SAFE
- S2B_MSIL1C_20230518T164849_N0509_R026_T15RUP_20230518T220853.SAFE
- S2A_MSIL1C_20230304T165201_N0510_R026_T15RUP_20240819T071321.SAFE
- S2B_MSIL1C_20200811T164849_N0500_R026_T15RUP_20230510T185335.SAFE
- S2B_MSIL1C_20200811T164849_N0500_R026_T15RTN_20230510T185335.SAFE
- S2B_MSIL1C_20200811T164849_N0500_R026_T15RUN_20230510T185335.SAFE
- S2B_MSIL1C_20221013T170239_N0400_R069_T15RTN_20221013T215905.SAFE
- S2A_MSIL1C_20200501T165901_N0500_R069_T14RQT_20230330T120915.SAFE
- S2A_MSIL1C_20200501T165901_N0500_R069_T15RTM_20230330T120915.SAFE
- S2A_MSIL1C_20230722T164901_N0509_R026_T15RUN_20230722T215211.SAFE
- S2B_MSIL1C_20191218T170719_N0500_R069_T14RQS_20230607T033311.SAFE
- S2B_MSIL1C_20191218T170719_N0500_R069_T15RTM_20230607T033311.SAFE

In [None]:
for key, items in tqdm(survey_info.items(), desc="Processing surveys"):
    urls, s2_names = items
    if not urls:
        continue

    os.makedirs(os.path.join(S2_PATH, key), exist_ok=True)

    # Refresh the token BEFORE building the session headers. The original
    # refreshed it afterwards, so every survey was downloaded with the token
    # obtained for the previous one.
    access_token = get_access_token(lines[-2][:-1], lines[-1][:-1])

    with requests.Session() as session:
        session.headers.update({"Authorization": f"Bearer {access_token}"})

        # Process each file within the survey
        for url, s2_name in zip(urls, s2_names):
            # s2_name[:-5] drops the trailing ".SAFE"
            file_path = os.path.join(S2_PATH, f"{key}/{s2_name[:-5]}.zip")
            # Download to a temporary name so an interrupted transfer never
            # leaves a truncated .zip that looks complete.
            partial_path = f"{file_path}.part"

            try:
                with session.get(url, stream=True) as response:
                    response.raise_for_status()  # Raise an exception for bad status codes

                    total_size = int(response.headers.get('content-length', 0))

                    # Progress bar for individual file download
                    with tqdm(
                        total=total_size,
                        unit='B',
                        unit_scale=True,
                        desc=f"Downloading {s2_name}",
                        leave=True  # Keep the progress bar after completion
                    ) as pbar, open(partial_path, "wb") as file:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                file.write(chunk)
                                pbar.update(len(chunk))

                os.replace(partial_path, file_path)

            except requests.exceptions.RequestException as e:
                print(f"Error downloading {s2_name}: {str(e)}")
                if os.path.exists(partial_path):
                    os.remove(partial_path)
                continue

In [None]:
# unzip the files
# you may need to use sudo apt install parallel in a bash terminal
# !find . -type f -name "*.zip" | parallel unzip -o {} -d {//}

# Feed to ACOLITE
- mosaic images if multiple corresponding to a single eHydro survey, possible due to survey covering multiple S2 tiles
- will do ACOLITE processing in this notebook once all .SAFE files are downloaded
- will reproject Bathy and S2 rasters to common CRS in 02_data_prep.ipynb

In [6]:
# Root folder for ACOLITE outputs; run_acolite_processing creates one
# sub-folder per survey inside it.
processed_path = f'/home/clay/Documents/SDB/{usace_code}/processed_acolite'
os.makedirs(processed_path, exist_ok=True)

verbose_acolite = False      # change this if you want to see what's going on in ACOLITE processing

In [None]:
import contextlib       # using this to supress the verbosity since it isn't possible within ACOLITE itself

for name in surveynames:
    # Only surveys that have downloaded Sentinel-2 products can be processed
    if not os.path.exists(os.path.join(S2_PATH, name)):
        continue

    # run_acolite_processing writes to <processed_path>/<reach_date>, so the
    # "already processed" check has to look in that same folder.
    output_dir = os.path.join(processed_path, get_reach_date(name) or name)
    if os.path.isdir(output_dir) and os.listdir(output_dir):
        continue

    with contextlib.ExitStack() as stack:
        if not verbose_acolite:
            # Send everything ACOLITE prints to the null device
            sink = stack.enter_context(open(os.devnull, "w"))
            stack.enter_context(contextlib.redirect_stdout(sink))
            stack.enter_context(contextlib.redirect_stderr(sink))
        run_acolite_processing(name, processed_path)



In [None]:


# Stack the per-band rhow GeoTIFFs of one test survey into a single file.
test_dir = '/media/clay/Crucial/test_acolite/VT_03_MME_20230814'


def _rhow_sort_key(filename):
    """Order files by the number after 'rhow_' (presumably the band wavelength)."""
    found = re.search(r'rhow_(\d+)', filename)
    return (int(found.group(1)) if found else float('inf'), filename)


# os.listdir order is arbitrary; without sorting, the band order of the
# stacked file could differ from run to run.
test_tifs = [
    os.path.join(test_dir, f)
    for f in sorted(os.listdir(test_dir), key=_rhow_sort_key)
    if f.endswith('.tif') and 'rhow_' in f
]
stack_tifs_to_multiband(test_tifs, '/media/clay/Crucial/test_acolite/VT_03_MME_20230814_rhow.tif')

Just some notes after first test run for ACOLITE
- will want to get the L2W.nc file, this is the only one I really need
- output only the rhos values, rhot is pointless for my bathy model
- need to figure out something to mask the land, if not done by default.
- make sure glint correction is applied, it should be by default
- DSF vs RAdCor? In manual it seems like RAdCor is best for inland waterways like the NCF
- Should look into developing some code that can split the survey aoi between scenes if they overlap. Would let only certain aois be processed reducing the run time and storage sizes
- DOCKER FOR PARALLEL PROCESSING (doing this last after I have figured out all the processing settings I want)

In [None]:
# Define file paths
target_survey = surveynames[10]

survey_dir = os.path.join(processed_path, target_survey)
# First L2W NetCDF in the survey's output folder (IndexError if there is none)
nc_file = [os.path.join(survey_dir, f) for f in os.listdir(survey_dir) if f.endswith('L2W.nc')][0]
bathy_file = os.path.join(BATHY_PATH, f'{target_survey}.tif')  # Assuming bathy is a GeoTIFF

# Select bands to visualize
bands_to_plot = ['Rrs_665', 'rhow_665', 'TUR_Novoa2017', 'SPM_Novoa2017']  # Modify based on dataset

# Create figure and axes (1 row, 5 columns: bathy-overlay + 4 bands)
fig, axes = plt.subplots(1, len(bands_to_plot) + 1, figsize=(5 * (len(bands_to_plot) + 1), 5))

# The context manager closes the NetCDF file even if plotting raises
with nc.Dataset(nc_file, mode='r') as dataset:
    # List all available variables
    print("Available Variables in L2W File:")
    print(dataset.variables.keys())

    ### **Load rhow_665 (Base Layer) and Overlay Bathymetry**
    if 'rhow_665' in dataset.variables:
        rhow_data = np.ma.masked_invalid(dataset.variables['rhow_665'][:])  # Mask invalid values

        with rasterio.open(bathy_file) as bathy:
            bathy_data = np.ma.masked_invalid(bathy.read(1))  # Read first band, mask invalid values

        # Ensure bathymetry data matches rhow dimensions (optional resizing)
        # NOTE(review): this only stretches the array for display; it does not
        # georeference the bathymetry onto the Sentinel-2 grid.
        if bathy_data.shape != rhow_data.shape:
            print("Resizing bathymetry raster to match rhow_665 dimensions...")
            from skimage.transform import resize
            bathy_data = resize(bathy_data, rhow_data.shape, mode='constant', preserve_range=True)

        # Plot rhow_665 first
        im1 = axes[0].imshow(rhow_data, cmap='viridis', interpolation='nearest')

        # Overlay bathymetry with transparency
        im2 = axes[0].imshow(bathy_data, cmap='terrain', alpha=0.5, interpolation='nearest')

        axes[0].set_title("rhow_665 + Bathymetry Overlay")
        axes[0].axis('off')
        fig.colorbar(im1, ax=axes[0], fraction=0.046, pad=0.04, label="rhow_665 Reflectance")
        fig.colorbar(im2, ax=axes[0], fraction=0.046, pad=0.04, label="Bathymetry Depth")
    else:
        # Without the base layer there is nothing to overlay; label the panel
        # instead of failing later on an undefined rhow_data.
        print("rhow_665 not found in dataset.")
        axes[0].set_title("rhow_665 Not Found")
        axes[0].axis('off')

    ### **Plot Remaining ACOLITE Bands**
    for i, band in enumerate(bands_to_plot, start=1):  # Start at index 1 after overlay
        if band in dataset.variables:
            data = np.ma.masked_invalid(dataset.variables[band][:])  # Mask invalid values

            # Plot reflectance or water quality parameters
            im = axes[i].imshow(data, cmap='viridis', interpolation='nearest')
            axes[i].set_title(band)
            axes[i].axis('off')
            fig.colorbar(im, ax=axes[i], fraction=0.046, pad=0.04)
        else:
            axes[i].set_title(f"{band} Not Found")
            axes[i].axis('off')

# Show the plots
plt.tight_layout()
plt.show()


# The above cell processes the L1C images (still need to mess with settings a bit), but does not clip to the boundaries of the survey. Clipping to only survey pixels will probably be done in 02_data_prep.ipynb