# Use this notebook to extract some of the data needed for training the model. It assumes the bathymetry data has already been downloaded with 01_get_data.ipynb and the imagery with 01b_get_s2_SAFE.ipynb.

In [1]:
import os
import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling
import numpy as np

# Functions

In [2]:
def normalize(array):
    """Min-max scale ``array`` to the range [0, 1], ignoring NaNs.

    Args:
        array: Numeric NumPy array. NaN entries are skipped when the minimum
            and maximum are determined and stay NaN in the result.

    Returns:
        Array of the same shape in which the smallest non-NaN value maps to
        0 and the largest to 1. A constant input (max == min) maps to 0
        instead of dividing by zero.
    """
    lo = np.nanmin(array)
    span = np.nanmax(array) - lo
    shifted = array - lo
    if span == 0:
        # Constant input: the division would be 0/0 and turn every pixel into
        # NaN. Multiplying by 0.0 keeps existing NaNs in place and promotes
        # integer input to float, exactly as the division would.
        return shifted * 0.0
    return shifted / span

def reproject_sentinel2(bathy_raster, s2_raster, reprojected_s2):
    """Warp a Sentinel-2 raster into the CRS of a bathymetry raster.

    Args:
        bathy_raster: Path of the bathymetry raster whose CRS is the target.
        s2_raster: Path of the Sentinel-2 raster to warp.
        reprojected_s2: Path the warped raster is written to.
    """
    # Only the CRS is taken from the bathymetry raster.
    with rasterio.open(bathy_raster) as bathy_ds:
        target_crs = bathy_ds.crs

    with rasterio.open(s2_raster) as src:
        # Output grid (affine transform and pixel dimensions) in the target CRS.
        dst_transform, dst_width, dst_height = calculate_default_transform(
            src.crs, target_crs, src.width, src.height, *src.bounds
        )

        # Start from the source metadata and swap in the new georeferencing.
        out_meta = src.meta.copy()
        out_meta.update(
            crs=target_crs,
            transform=dst_transform,
            width=dst_width,
            height=dst_height,
        )

        # Warp settings shared by every band.
        warp_kwargs = dict(
            src_transform=src.transform,
            src_crs=src.crs,
            dst_transform=dst_transform,
            dst_crs=target_crs,
            resampling=Resampling.cubic_spline,
        )

        with rasterio.open(reprojected_s2, "w", **out_meta) as out:
            # Band indexes are 1-based.
            for band_idx in range(1, src.count + 1):
                reproject(
                    source=rasterio.band(src, band_idx),
                    destination=rasterio.band(out, band_idx),
                    **warp_kwargs,
                )

    print(f"Reprojected Sentinel-2 raster saved to: {reprojected_s2}")

def resample_bathy_to_sentinel2(bathy_raster, sentinel_raster, resampled_bathy):
    """Resample band 1 of a bathymetry raster onto a Sentinel-2 raster's grid.

    Args:
        bathy_raster: Path of the bathymetry raster to resample.
        sentinel_raster: Path of the Sentinel-2 raster that supplies the
            target transform, CRS, width and height.
        resampled_bathy: Path the resampled bathymetry raster is written to.
    """
    # Capture the target grid, then release the Sentinel-2 file.
    with rasterio.open(sentinel_raster) as s2:
        grid_transform, grid_crs = s2.transform, s2.crs
        n_rows, n_cols = s2.height, s2.width

    with rasterio.open(bathy_raster) as src:
        bathy_dtype = src.dtypes[0]

        # Output metadata: bathymetry profile placed on the Sentinel-2 grid.
        out_meta = src.meta.copy()
        out_meta.update(
            transform=grid_transform,
            crs=grid_crs,
            width=n_cols,
            height=n_rows,
            dtype=bathy_dtype,
        )

        # In-memory target that reproject() fills with bilinear-resampled values.
        warped = np.empty((n_rows, n_cols), dtype=bathy_dtype)
        reproject(
            source=rasterio.band(src, 1),
            destination=warped,
            src_transform=src.transform,
            src_crs=src.crs,
            dst_transform=grid_transform,
            dst_crs=grid_crs,
            dst_width=n_cols,
            dst_height=n_rows,
            resampling=Resampling.bilinear,
        )

    with rasterio.open(resampled_bathy, "w", **out_meta) as out:
        out.write(warped, 1)

    print(f"Resampled bathymetry raster saved to: {resampled_bathy}")

def clip_sentinel_by_bathy(bathy_raster, sentinel_raster, output_sentinel):
    """Blank out Sentinel-2 pixels that lack valid bathymetry or imagery.

    A pixel is kept when band 1 of the bathymetry raster is not NaN and band 1
    of the Sentinel-2 raster is non-zero; every other pixel becomes NaN in all
    bands. The result is written as float32 with NaN as nodata.

    Args:
        bathy_raster: Path of the bathymetry raster.
        sentinel_raster: Path of the Sentinel-2 raster; it must share the
            bathymetry raster's CRS and transform.
        output_sentinel: Path the masked Sentinel-2 raster is written to.

    Raises:
        ValueError: If the two rasters differ in CRS or transform.
    """
    with rasterio.open(bathy_raster) as bathy_ds:
        # Band 1 only; NaN marks missing bathymetry.
        has_bathy = ~np.isnan(bathy_ds.read(1))
        ref_transform, ref_crs = bathy_ds.transform, bathy_ds.crs

    with rasterio.open(sentinel_raster) as s2:
        # The masks are combined pixel by pixel, so the grids must be identical.
        if s2.crs != ref_crs or s2.transform != ref_transform:
            raise ValueError("Sentinel-2 raster must already be aligned with bathymetry raster.")

        all_bands = s2.read()

        # Zero in the first band is treated as "no imagery" for every band.
        has_imagery = all_bands[0] != 0
        keep = has_bathy & has_imagery

        # The 2-D mask broadcasts across the band axis.
        masked_bands = np.where(keep, all_bands, np.nan)

        out_meta = s2.meta.copy()
        out_meta.update(dtype="float32", nodata=np.nan)

    with rasterio.open(output_sentinel, "w", **out_meta) as out:
        out.write(masked_bands)

    print(f"Clipped Sentinel-2 raster saved to: {output_sentinel}")

def clip_bathy_by_sentinel(bathy_raster, clipped_sentinel_raster, output_bathy):
    """Blank out bathymetry pixels where the clipped Sentinel-2 raster is NaN.

    Band 1 of the bathymetry raster is kept wherever band 1 of the clipped
    Sentinel-2 raster is not NaN; all other pixels become NaN. The result is
    written as float32 with NaN as nodata.

    Args:
        bathy_raster: Path of the bathymetry raster; it must share the
            Sentinel-2 raster's CRS and transform.
        clipped_sentinel_raster: Path of the already-masked Sentinel-2 raster.
        output_bathy: Path the masked bathymetry raster is written to.

    Raises:
        ValueError: If the two rasters differ in CRS or transform.
    """
    with rasterio.open(clipped_sentinel_raster) as s2:
        # Only the first band is consulted for validity.
        has_imagery = ~np.isnan(s2.read(1))
        ref_transform, ref_crs = s2.transform, s2.crs

    with rasterio.open(bathy_raster) as bathy_ds:
        # The mask is applied pixel by pixel, so the grids must be identical.
        if bathy_ds.crs != ref_crs or bathy_ds.transform != ref_transform:
            raise ValueError("Bathymetry raster must already be aligned with Sentinel-2 raster.")

        depths = bathy_ds.read(1)
        masked_depths = np.where(has_imagery, depths, np.nan)

        out_meta = bathy_ds.meta.copy()
        out_meta.update(dtype="float32", nodata=np.nan)

    with rasterio.open(output_bathy, "w", **out_meta) as out:
        out.write(masked_depths, 1)

    print(f"Clipped bathymetry raster saved to: {output_bathy}")

# Establish working directories

In [3]:
S2_PATH = '/home/clay/Documents/SDB/CESWG/processed_acolite'
BATHY_PATH = '/home/clay/Documents/SDB/CESWG/bathy_rasters'
S2_PROJ = S2_PATH.replace('s2_rasters','s2_proj')
BATHY_PROJ = BATHY_PATH.replace('bathy_rasters','bathy_proj')

FINAL_PATH = '/home/clay/Documents/SDB/CESWG/processed'
S2_FINAL = os.path.join(FINAL_PATH, 'S2')
BATHY_FINAL = os.path.join(FINAL_PATH, 'Bathy')


os.makedirs(S2_PROJ, exist_ok=True)
os.makedirs(BATHY_PROJ, exist_ok=True)
os.makedirs(FINAL_PATH, exist_ok=True)
os.makedirs(S2_FINAL, exist_ok=True)
os.makedirs(BATHY_FINAL, exist_ok=True)

In [19]:
# A survey is named after its bathymetry GeoTIFF (file name minus '.tif').
surveynames = [
    fname[:-len('.tif')]
    for fname in os.listdir(BATHY_PATH)
    if fname.endswith('.tif')
]

# survey name -> [bathymetry raster path, Sentinel-2 raster path]
surveyinfo = {
    name: [
        os.path.join(BATHY_PATH, f"{name}.tif"),
        os.path.join(S2_PATH, f"{name}.tif"),
    ]
    for name in surveynames
}

# Filter out the L2W.nc files that have no pixels

# Rasterize the merged L2W.nc files produced by ACOLITE
- ensure each band is included in the tif
- if a survey has multiple files, rasterize each one, then take the median where they overlap

In [None]:
def netcdf_to_raster(nc_path, variable, out_tif):
    """
    Converts one variable of a NetCDF (.nc) file to a single-band GeoTIFF.

    NaN cells are written as the nodata value -9999.0 and the output is tagged
    as EPSG:4326.

    Args:
        nc_path (str): Path to the input NetCDF file.
        variable (str): Name of the variable to extract (e.g., 'rhow_442' or 'TUR_Novoa2017').
        out_tif (str): Output path for the GeoTIFF raster.

    Raises:
        ValueError: If ``variable`` is not present in the dataset.

    NOTE(review): ``xr`` (presumably ``import xarray as xr``) is not imported in
    the visible notebook cells -- confirm it is in scope before calling this.
    NOTE(review): the transform passes the minimum longitude / maximum latitude
    coordinate to ``from_origin``, which expects the outer corner of the
    upper-left pixel. If "lat"/"lon" hold cell centres the raster is shifted by
    half a pixel -- confirm against the data.
    """
    # rasterio.transform is not imported at the top of the notebook.
    from rasterio.transform import from_origin

    nodata_value = -9999.0

    # The context manager releases the NetCDF file handle even when the
    # variable check or the read fails.
    with xr.open_dataset(nc_path) as ds:
        if variable not in ds.variables:
            raise ValueError(f"Variable '{variable}' not found in the NetCDF file. Available variables: {list(ds.variables.keys())}")

        da = ds[variable]

        # Spatial coordinates; indexed below as 1-D arrays.
        lat = ds["lat"].values
        lon = ds["lon"].values

        # GeoTIFF rows run north to south, so flip south-to-north data.
        if lat[0] < lat[-1]:
            da = da[::-1, :]
            lat = lat[::-1]

        # Pull the values into memory while the dataset is still open.
        data_array = da.values.astype("float32")

    # Pixel size is taken from the first coordinate step (assumes a regular grid).
    transform = from_origin(lon.min(), lat.max(), abs(lon[1] - lon[0]), abs(lat[1] - lat[0]))

    # GeoTIFF consumers get an explicit nodata value instead of NaN.
    data_array = np.where(np.isnan(data_array), nodata_value, data_array)

    with rasterio.open(
        out_tif,
        "w",
        driver="GTiff",
        height=data_array.shape[0],
        width=data_array.shape[1],
        count=1,
        dtype="float32",
        crs="EPSG:4326",  # Set CRS, modify if needed
        transform=transform,
        nodata=nodata_value,
    ) as dst:
        dst.write(data_array, 1)


# Reproject ACOLITE tif products from EPSG:4326 to the eHydro CRS

In [None]:
# Warp each survey's Sentinel-2 raster into the CRS of its bathymetry raster.
for name, (bathy_tif, s2_tif) in surveyinfo.items():
    reproject_sentinel2(bathy_tif, s2_tif, os.path.join(S2_PROJ, f"{name}.tif"))

# Resample bathy rasters from 10 ft resolution to same resolution as S2 rasters
- I think I resampled these to 32.8084 ft (10-meter) already, but can check just in case

In [None]:
# Put each bathymetry raster on the grid of its reprojected Sentinel-2 raster.
for name, (bathy_tif, _s2_tif) in surveyinfo.items():
    tif_name = f"{name}.tif"
    resample_bathy_to_sentinel2(
        bathy_tif,
        os.path.join(S2_PROJ, tif_name),
        os.path.join(BATHY_PROJ, tif_name),
    )

In [33]:
# New dictionary with the rasters that share a CRS and spatial resolution (~10 meters):
# survey name -> [resampled bathymetry path, reprojected Sentinel-2 path].
#
# The paths are rebuilt from BATHY_PROJ / S2_PROJ, the directories the
# reprojection and resampling loops write to. Matching on the directory names
# 'bathy_rasters' / 's2_rasters' inside the input paths breaks as soon as an
# input directory is called something else: the Sentinel-2 branch never
# matches and the path is left undefined or stale from a previous survey.
reprojected_rasters = {
    name: [
        os.path.join(BATHY_PROJ, f"{name}.tif"),
        os.path.join(S2_PROJ, f"{name}.tif"),
    ]
    for name in surveyinfo
}

# Clip the S2 rasters by the valid bathymetry pixels
- need the bounds of the non-NaN bathymetry pixels for clipping

In [None]:
# Mask every reprojected Sentinel-2 raster with its bathymetry coverage.
for name, (bathy_tif, s2_tif) in reprojected_rasters.items():
    clipped_s2 = os.path.join(S2_FINAL, f"{name}.tif")
    clip_sentinel_by_bathy(bathy_tif, s2_tif, clipped_s2)

# Clip the bathymetry rasters by the valid cloud-masked S2 pixels
- need the bounds of the valid pixels; these appear to be the values above 0.0, since GEE does not apply a NaN nodata value

In [None]:
# Mask every resampled bathymetry raster with its final (clipped) Sentinel-2 raster.
for name, (bathy_tif, _s2_tif) in reprojected_rasters.items():
    tif_name = f"{name}.tif"
    clip_bathy_by_sentinel(
        bathy_tif,
        os.path.join(S2_FINAL, tif_name),
        os.path.join(BATHY_FINAL, tif_name),
    )

The required preprocessing should now be complete, so the next step is training the model on the data
- will try traditional ML regression models, as well as CNN
- May try majority voting of multiple training set models like in Tan et al. 2022

# Exporting the data to a .csv or parquet file may alleviate some of my memory issues, so I am going to try it

In [None]:

def prepare_train_data(surveynames):
    """Build a per-pixel training table from the survey rasters and split it.

    For every survey whose Sentinel-2 raster has at least one row, the band
    features, pixel positions, channel name and bathymetry are combined into
    one DataFrame. The channel name is label-encoded and the fitted encoder is
    pickled to ``WORK_DIR/Channel_Name_label_encoders.pkl``.

    Args:
        surveynames: Survey names; each is expected as ``<name>.tif`` in both
            S2_PATH and BATHY_PATH.

    Returns:
        Tuple ``(combined_df, X_train, y_train, X_test, y_test, X_val, y_val)``.
        ``combined_df`` holds all rows (features, encoded channel and
        'Bathymetry'); the split is 70 % train, 15 % validation, 15 % test
        with ``random_state=42``.

    NOTE(review): this function relies on names that are not defined in the
    visible notebook cells -- ``pd``, ``LabelEncoder``, ``train_test_split``,
    ``WORK_DIR``, ``extract_raster_data``, ``survey_name_type`` and
    ``get_pixel_positions``. Confirm they are in scope before calling it.
    """
    # pickle is not imported at the top of the notebook.
    import pickle

    # Output column names, in the column order of the feature array.
    feature_columns = (
        "Blue", "Green", "Red", "NIR",
        "Blue/Green", "Green/Blue", "Stumpf", "NSMI", "TI",
    )

    pairs = [(os.path.join(S2_PATH, f'{name}.tif'), os.path.join(BATHY_PATH, f'{name}.tif')) for name in surveynames]

    # Keep only the surveys whose Sentinel-2 raster actually contains rows.
    good_pairs = []
    goodnames = []
    for name, pair in zip(surveynames, pairs):
        with rasterio.open(pair[0]) as src:
            has_rows = src.read(1).shape[0] != 0
        if has_rows:
            good_pairs.append(pair)
            goodnames.append(name)

    images_data = [extract_raster_data(pair) for pair in good_pairs]
    # The second return value (survey types) is not used here.
    ncf_channels, _ = survey_name_type(goodnames)
    pixel_positions = [get_pixel_positions(os.path.join(S2_PATH, f'{name}.tif')) for name in goodnames]

    data = {}
    for i, name in enumerate(goodnames):
        # Element [0][0] is indexed as a 2-D array with one column per entry
        # of feature_columns; element [0][1] supplies the bathymetry column.
        # NOTE(review): layout inferred from the indexing only -- confirm
        # against extract_raster_data.
        features = images_data[i][0][0]
        depths = images_data[i][0][1]
        positions = pixel_positions[i]  # columns 0 and 1 become X and Y

        columns = {col: features[:, k] for k, col in enumerate(feature_columns)}
        columns["X"] = positions[:, 0]
        columns["Y"] = positions[:, 1]
        columns["Channel_Name"] = [ncf_channels[i]] * len(features)
        columns["Bathymetry"] = depths
        data[name] = pd.DataFrame(columns)

    combined_df = pd.concat(data.values(), ignore_index=True)

    encoder = LabelEncoder()
    combined_df['Channel_Name_Encoded'] = encoder.fit_transform(combined_df['Channel_Name'])

    # Persist the encoder; the context manager closes the file even if
    # pickling fails.
    with open(os.path.join(WORK_DIR, 'Channel_Name_label_encoders.pkl'), 'wb') as output:
        pickle.dump(encoder, output)

    # The encoded column replaces the original categorical column.
    combined_df = combined_df.drop(columns=['Channel_Name'])

    X = combined_df.drop(columns=['Bathymetry'])
    y = combined_df['Bathymetry']

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

    # Split the held-out 30 % evenly into validation (15 %) and test (15 %).
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    return combined_df, X_train, y_train, X_test, y_test, X_val, y_val