# Use this notebook to extract some of the needed data for training the model. Assuming the bathy data is downloaded from 01_get_data.ipynb, and the imagery is downloaded from 01b_get_s2_SAFE.ipynb

In [1]:
import os
import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling, transform_bounds, transform as rio_transform
from rasterio import Affine
import numpy as np
import re
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd

# Functions

In [2]:
def mask_acolite_images(rhow_paths, rrs_paths, spm_tur_paths):
    def mask_image(image_path):
        with rasterio.open(image_path) as src:
            meta = src.meta.copy()
            meta['descriptions'] = src.descriptions
            bands = [src.read(i) for i in range(1, src.count + 1)]
            max_val = bands[0].max()
            masked_bands = [np.where(band == max_val, np.nan, band) for band in bands]
        return masked_bands, meta

    def process_and_write(paths):
        masked_files = []
        for path in paths:
            masked_bands, meta = mask_image(path)
            meta['count'] = len(masked_bands)
            mask_path = path.replace('stacked', 'masked')
            os.makedirs(os.path.dirname(mask_path), exist_ok=True)
            meta['dtype'] = 'float32'
            with rasterio.open(mask_path, 'w', **meta) as dst:
                for i, band in enumerate(masked_bands, start=1):
                    dst.write(band.astype(np.float32), i)
                    # Reassign original band description if available
                    if meta.get('descriptions') and len(meta['descriptions']) >= i:
                        dst.set_band_description(i, meta['descriptions'][i-1])
            masked_files.append(mask_path)
        return masked_files

    masked_rhow_files = process_and_write(rhow_paths)
    masked_rrs_files = process_and_write(rrs_paths)
    masked_spm_files = process_and_write(spm_tur_paths)
    
    return masked_rhow_files, masked_rrs_files, masked_spm_files

def reproject_acolite(bathy_raster, paths, target_resolution_m=10):
    """
    Reprojects a list of rasters (paths) to a common grid defined by the union 
    of their extents in the target CRS (from bathy_raster) using a target resolution
    (in meters, then converted to feet). The output rasters are saved to disk with 
    'masked' replaced by 'projected' in their filenames.
    
    Parameters:
        bathy_raster (str): File path to a bathy raster whose CRS will be used.
        paths (list of str): List of file paths to the input rasters.
        target_resolution_m (float): Desired resolution in meters. (Default: 10)
        
    Returns:
        out_paths (list of str): List of reprojected raster file paths.
        dst_transform (Affine): The common affine transform used.
        width (int): Width (in pixels) of the common grid.
        height (int): Height (in pixels) of the common grid.
    """
    # Get the target CRS from the bathy raster.
    with rasterio.open(bathy_raster) as bathy_src:
        target_crs = bathy_src.crs

    # Convert target resolution to feet.
    meter_to_feet = 3.28084
    target_res = target_resolution_m * meter_to_feet

    # Compute the union bounds (in target CRS) of all input rasters.
    union_xmin, union_ymin, union_xmax, union_ymax = None, None, None, None
    for path in paths:
        with rasterio.open(path) as src:
            # Transform the source bounds to the target CRS.
            src_bounds = src.bounds
            dst_bounds = transform_bounds(
                src.crs, target_crs,
                src_bounds.left, src_bounds.bottom,
                src_bounds.right, src_bounds.top,
                densify_pts=21  # densify to improve accuracy on curved transforms
            )
            # dst_bounds is (xmin, ymin, xmax, ymax)
            if union_xmin is None:
                union_xmin, union_ymin, union_xmax, union_ymax = dst_bounds
            else:
                union_xmin = min(union_xmin, dst_bounds[0])
                union_ymin = min(union_ymin, dst_bounds[1])
                union_xmax = max(union_xmax, dst_bounds[2])
                union_ymax = max(union_ymax, dst_bounds[3])

    # Define the common affine transform.
    # In raster space, the transform is typically defined as:
    # Affine(pixel_width, 0, x_min, 0, -pixel_height, y_max)
    dst_transform = Affine(target_res, 0, union_xmin,
                           0, -target_res, union_ymax)
    
    # Compute the dimensions of the common grid.
    width = int(np.ceil((union_xmax - union_xmin) / target_res))
    height = int(np.ceil((union_ymax - union_ymin) / target_res))

    out_paths = []
    # Reproject each raster to the common grid.
    for path in paths:
        with rasterio.open(path) as src:
            meta = src.meta.copy()
            meta['descriptions'] = src.descriptions
            source_nodata = src.nodata if src.nodata is not None else np.nan
            
            # Update metadata for the new common grid.
            meta.update({
                "crs": target_crs,
                "transform": dst_transform,
                "width": width,
                "height": height,
                "dtype": "float32",
                "nodata": source_nodata
            })
            
            # Prepare an array to hold all bands.
            count = src.count
            dest_array = np.empty((count, height, width), dtype=src.dtypes[0])
            
            # Reproject each band.
            for i in range(1, count + 1):
                band = src.read(i)
                dest = np.empty((height, width), dtype=src.dtypes[0])
                reproject(
                    source=band,
                    destination=dest,
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=dst_transform,
                    dst_crs=target_crs,
                    resampling=Resampling.bilinear,
                    src_nodata=source_nodata,
                    dst_nodata=source_nodata
                )
                dest_array[i - 1, :, :] = dest

        # Build output file path (e.g., replacing 'masked' with 'projected').
        out_path = path.replace('masked', 'projected')
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        with rasterio.open(out_path, 'w', **meta) as dst:
            dst.write(dest_array.astype(np.float32))
            for i, desc in enumerate(meta.get('descriptions', []), start=1):
                dst.set_band_description(i, desc)
        out_paths.append(out_path)
    
    return out_paths

def visualize_rasters(raster_path, hydro_path, spm_path):
    import math
    import matplotlib.gridspec as gridspec
    """
    Visualize all bands from a masked tif file (rhow or Rrs) along with the bathymetry raster,
    and additionally visualize the two-band SPM_TUR file on the last row.
    
    Parameters:
        raster_path (str): File path of the masked rhow or Rrs tif file.
        hydro_path (str): File path of the bathymetry tif.
        spm_path (str): File path of the SPM_TUR tif file (assumed to contain two bands).
    """
    # Read main raster bands and metadata
    with rasterio.open(raster_path) as src:
        meta = src.meta.copy()
        masked_bands = [src.read(i) for i in range(1, src.count + 1)]
        band_names = src.descriptions

    # Read bathymetry band
    with rasterio.open(hydro_path) as src_hydro:
        hydro_band = src_hydro.read(1)
    
    # Read SPM_TUR bands (assumed two bands)
    with rasterio.open(spm_path) as src_spm:
        spm_bands = [src_spm.read(i) for i in range(1, src_spm.count + 1)]
        spm_band_names = src_spm.descriptions

    # Set up grid for main group (masked bands plus bathymetry)
    n_main = len(masked_bands) + 1  # main bands + bathy
    n_cols = 4
    n_rows_main = math.ceil(n_main / n_cols)
    total_rows = n_rows_main + 1  # extra row for spm_tur

    fig = plt.figure(figsize=(5 * n_cols, 4 * total_rows))
    gs = gridspec.GridSpec(total_rows, n_cols)

    # Plot main bands
    for idx, band in enumerate(masked_bands):
        ax = fig.add_subplot(gs[idx])
        im = ax.imshow(band, cmap='viridis')
        if band_names is not None and idx < len(band_names) and band_names[idx]:
            ax.set_title(f"Wavelength: {band_names[idx]} mm")
        else:
            ax.set_title(f"Band {idx + 1}")
        plt.colorbar(im, ax=ax)

    # Plot bathymetry in the next available slot in the main group
    ax_bathy = fig.add_subplot(gs[len(masked_bands)])
    im_bathy = ax_bathy.imshow(hydro_band, cmap='viridis')
    ax_bathy.set_title("Hydro Survey")
    plt.colorbar(im_bathy, ax=ax_bathy)

    # Turn off any empty main group axes
    for j in range(n_main, n_rows_main * n_cols):
        ax = fig.add_subplot(gs[j])
        ax.axis('off')

    # Center the SPM_TUR bands in the final row
    spm_count = len(spm_bands)
    offset = (n_cols - spm_count) // 2
    for col in range(n_cols):
        ax_spm = fig.add_subplot(gs[n_rows_main, col])
        if offset <= col < offset + spm_count:
            idx = col - offset
            im_spm = ax_spm.imshow(spm_bands[idx], cmap='viridis')
            if spm_band_names is not None and idx < len(spm_band_names) and spm_band_names[idx]:
                ax_spm.set_title(spm_band_names[idx])
            else:
                ax_spm.set_title(f"SPM_TUR Band {idx + 1}")
            plt.colorbar(im_spm, ax=ax_spm)
        else:
            ax_spm.axis('off')

    plt.tight_layout()
    plt.show()

def resample_bathy_to_acolite(bathy_raster, sentinel_raster, output_path):
    # Open the sentinel raster to get its spatial resolution and CRS
    with rasterio.open(sentinel_raster) as sentinel_src:
        sentinel_transform = sentinel_src.transform
        sentinel_crs = sentinel_src.crs
        sentinel_width = sentinel_src.width
        sentinel_height = sentinel_src.height

    # Open the bathymetry raster and create an empty array with the sentinel's shape
    with rasterio.open(bathy_raster) as bathy_src:
        resampled_bathy = np.empty((sentinel_height, sentinel_width), dtype=bathy_src.dtypes[0])
        
        # Resample the bathymetry raster to match the sentinel raster's resolution and CRS
        reproject(
            source=rasterio.band(bathy_src, 1),  # Only the first band is used
            destination=resampled_bathy,
            src_transform=bathy_src.transform,
            src_crs=bathy_src.crs,
            dst_transform=sentinel_transform,
            dst_crs=sentinel_crs,
            dst_width=sentinel_width,
            dst_height=sentinel_height,
            resampling=Resampling.bilinear
        )
        
        # Update metadata for the output raster
        bathy_meta = bathy_src.meta.copy()
        bathy_meta.update({
            "transform": sentinel_transform,
            "crs": sentinel_crs,
            "width": sentinel_width,
            "height": sentinel_height,
            "dtype": bathy_src.dtypes[0]
        })

    # Save the resampled bathymetry raster to the provided output location
    with rasterio.open(output_path, "w", **bathy_meta) as dst:
        dst.write(resampled_bathy, 1)

def clip_acolite_by_bathy(file_list):
    # first file is the bathymetry raster
    bathy_raster = file_list[0]
    
    # Read bathymetry data and compute valid mask once.
    with rasterio.open(bathy_raster) as bathy_src:
        bathy_data = bathy_src.read(1)  # Assumes single-band bathy
        valid_bathy_mask = ~np.isnan(bathy_data)
        bathy_transform = bathy_src.transform
        bathy_crs = bathy_src.crs

    # Loop over the last three rasters in the file_list.
    for sentinel_raster in file_list[-3:]:
        with rasterio.open(sentinel_raster) as sentinel_src:
            # Save band descriptions before processing
            band_descriptions = sentinel_src.descriptions
            # Ensure CRS and transform match
            if sentinel_src.crs != bathy_crs or sentinel_src.transform != bathy_transform:
                raise ValueError("Sentinel-2 raster must already be aligned with bathymetry raster.")
            sentinel_data = sentinel_src.read()  # Read all bands
            # Create a mask for valid sentinel pixels (exclude zero values in first band)
            valid_sentinel_mask = sentinel_data[0, :, :] != 0
            # Combine the two masks
            combined_mask = valid_bathy_mask & valid_sentinel_mask
            # Apply mask to all bands in sentinel_data
            clipped_sentinel_data = np.where(combined_mask, sentinel_data, np.nan)
            # Copy metadata and update as needed
            sentinel_meta = sentinel_src.meta.copy()
            sentinel_meta.update({
                "dtype": "float32",
                "nodata": np.nan
            })
        
        # Define output path; update the folder name.
        if 'median_acolite' in sentinel_raster:
            output_sentinel = sentinel_raster.replace("median_acolite", "clipped_acolite")
            out_str = re.sub(r'_\d{2}_\d{2}_\d{2}(?=\.tif$)', '', output_sentinel)
        elif 'projected_acolite' in sentinel_raster:
            output_sentinel = sentinel_raster.replace("projected_acolite", "clipped_acolite")
            out_str = re.sub(r'_\d{2}_\d{2}_\d{2}(?=\.tif$)', '', output_sentinel)

        os.makedirs(os.path.dirname(out_str), exist_ok=True)
        with rasterio.open(out_str, "w", **sentinel_meta) as dst:
            dst.write(clipped_sentinel_data.astype(np.float32))
            # Preserve original band descriptions
            for i, desc in enumerate(band_descriptions, start=1):
                dst.set_band_description(i, desc)

def clip_bathy_with_acolite_nan(bathy_path, acolite_path, out_path):
    """
    Clip 'bathy_path' by the valid pixels of 'acolite_path'.
    Both rasters must already match exactly in dimensions, transform, and CRS.
    ACOLITE's NoData is assumed to be NaN outside coverage.
    """
    with rasterio.open(bathy_path) as bathy_src, rasterio.open(acolite_path) as aco_src:
        # 1) Check alignment
        if (bathy_src.width != aco_src.width) or (bathy_src.height != aco_src.height):
            raise ValueError("ERROR: Rasters have different dimensions.")
        if bathy_src.transform != aco_src.transform:
            raise ValueError("ERROR: Rasters have different transforms.")
        if bathy_src.crs != aco_src.crs:
            raise ValueError("ERROR: Rasters have different CRS.")

        # 2) Read data
        bathy_data = bathy_src.read(1)    # First band of bathy
        acolite_data = aco_src.read(1)   # First band of ACOLITE

        # 3) Build a valid mask where ACOLITE is finite (non-NaN)
        valid_mask = np.isfinite(acolite_data)

        # 4) Apply the mask: keep bathy where valid_mask==True, else set NaN
        clipped_bathy = np.where(valid_mask, bathy_data, np.nan)

        # 5) Update metadata for float32 + NaN
        out_meta = bathy_src.meta.copy()
        out_meta.update({
            "dtype": "float32",
            "nodata": np.nan,  
            "count": 1
        })

    # 6) Write the clipped bathy
    with rasterio.open(out_path, "w", **out_meta) as dst:
        dst.write(clipped_bathy.astype(np.float32), 1)

In [3]:
def normalize(array):
    return (array - np.nanmin(array)) / (np.nanmax(array) - np.nanmin(array))

def extract_raster_data(paths):
    images_data = []

    bathy_path = paths[0]
    reflectance_path = paths[1]
    tur_spm_path = paths[2]

    # --- Step 1: Open Bathymetry Raster ---
    with rasterio.open(bathy_path) as bathy:
        bathy_data = bathy.read(1)  # Bathymetry data (band 1)
        bathy_nodata = bathy.nodata  # NoData value
        bathy_transform = bathy.transform
        bathy_shape = bathy.shape

    # --- Step 2: Open Reflectnace Raster ---
    with rasterio.open(reflectance_path) as reflect_src:    
        if reflect_src.shape != bathy_shape or reflect_src.transform != bathy_transform:
            raise ValueError(
                f"Inconsistent shapes or transforms:\n"
                f"Bathymetry Shape: {bathy_shape}, Sentinel-2 Shape: {reflect_src.shape}.\n"
                f"Bathymetry Transform: {bathy_transform}, Sentinel-2 Transform: {reflect_src.transform}.\n"
                f"Ensure rasters have identical extents and resolutions."
            )
        
        # Read Sentinel-2 reflect_bands
        reflect_bands = {
            "1610_norm": normalize(reflect_src.read(1)),       # SWIR1, B11
            "2186_norm": normalize(reflect_src.read(2)),       # SWIR2, B12
            "442_norm": normalize(reflect_src.read(3)),        # Aerosols, B1
            "492_norm": normalize(reflect_src.read(4)),        # Blue, B2
            "559_norm": normalize(reflect_src.read(5)),        # Green, B3
            "665_norm": normalize(reflect_src.read(6)),        # Red, B4
            "704_norm": normalize(reflect_src.read(7)),        # Red Edge 1, B5
            "739_norm": normalize(reflect_src.read(8)),        # Red Edge 2, B6
            "780_norm": normalize(reflect_src.read(9)),        # Red Edge 3, B7
            "833_norm": normalize(reflect_src.read(10)),       # NIR, B8
            "864_norm": normalize(reflect_src.read(11)),       # Red Edge 4, B8A
            "stumpf": normalize(np.log(reflect_src.read(4)) / np.log(reflect_src.read(5))),
            "nsmi": normalize((reflect_src.read(6) + reflect_src.read(5) - reflect_src.read(4)) / (reflect_src.read(6) + reflect_src.read(5) + reflect_src.read(4))),    # normalized suspended material index, Red+Green-Blue / Red+Green+Blue
            'ndssi': normalize((reflect_src.read(4) - reflect_src.read(10))/ (reflect_src.read(4) + reflect_src.read(10))),      # normalized difference suspended sediment index, Blue-NIR / Blue+NIR
            "ndti": normalize((reflect_src.read(6) - reflect_src.read(5)) / (reflect_src.read(6) + reflect_src.read(5)))          # normalized differnce tubridity index, Red-Green / Red + Green
            }
        Rrs_nodata = reflect_src.nodata  # Sentinel-2 NoData value

    with rasterio.open(tur_spm_path) as tur_spm_src:    
        if tur_spm_src.shape != bathy_shape or tur_spm_src.transform != bathy_transform:
            raise ValueError(
                f"Inconsistent shapes or transforms:\n"
                f"Bathymetry Shape: {bathy_shape}, Sentinel-2 Shape: {tur_spm_src.shape}.\n"
                f"Bathymetry Transform: {bathy_transform}, Sentinel-2 Transform: {tur_spm_src.transform}.\n"
                f"Ensure rasters have identical extents and resolutions."
            )
        
        tur_spm_bands = {
            "SPM": normalize(tur_spm_src.read(1)),       
            "TUR": normalize(tur_spm_src.read(2))
            }
        tur_spm_nodata = tur_spm_src.nodata  # tur_spm NoData value

        # --- Step 3: Flatten Bands ---
        flat_bathy = bathy_data.flatten()
        flat_bands = {**{key: band.flatten() for key, band in reflect_bands.items()},
                  **{key: band.flatten() for key, band in tur_spm_bands.items()}}
        # flat_bands = {key: band.flatten() for key, band in reflect_bands.items()}


        # --- Step 4: Mask NoData Values ---
        valid_mask = (
            ~np.isnan(flat_bathy) &  # Valid bathy pixels
            (flat_bathy != bathy_nodata)  # Exclude bathy NoData
        )

        for band in flat_bands.values():
            valid_mask &= (band != Rrs_nodata) & (band != tur_spm_nodata)  # Exclude Sentinel-2 and tur_spm NoData
        
        # Apply the mask
        valid_bathy = flat_bathy[valid_mask].reshape(-1, 1)  # Reshape bathy to (n_pixels, 1)
        valid_features = np.column_stack([band[valid_mask] for band in flat_bands.values()])

        # --- Step 5: Combine Features and Targets ---
        # combined_features = np.concatenate((valid_bathy, valid_features), axis=1)  # Combine bathy and S2
        images_data.append((valid_features, valid_bathy.flatten()))  # Flatten bathy for targets

        return images_data
    
def get_pixel_positions(raster_path):
    with rasterio.open(raster_path) as src:
        # Get the affine transformation of the raster
        transform = src.transform
        
        # Read the first band to determine valid (non-NaN) pixels
        band_data = src.read(1, masked=True)  # Read the first band as a masked array
        valid_mask = ~band_data.mask          # Valid pixels where mask is False

        # Get raster dimensions
        height, width = src.height, src.width

        # Create arrays of pixel indices
        row_indices, col_indices = np.meshgrid(np.arange(height), np.arange(width), indexing="ij")

        # Compute x, y positions using the affine transform
        xs, ys = rasterio.transform.xy(transform, row_indices, col_indices, offset='center')
        xs = np.array(xs).flatten()
        ys = np.array(ys).flatten()

        # Filter x, y positions to include only valid pixels
        valid_positions = np.column_stack((xs[valid_mask.flatten()], ys[valid_mask.flatten()]))

    return valid_positions

def prepare_train_data(surveyinfo):
    images_data = [extract_raster_data(paths) for paths in list(surveyinfo.values())]
    ncf_channels = [name[:-9] for name in list(surveyinfo.keys())]
    pixel_positions = [get_pixel_positions(paths[0]) for paths in list(surveyinfo.values())]

    data = {}
    for i, name in enumerate(list(surveyinfo.keys())):
        # Extract data for the current iteration
        bands = images_data[i]               # Shape (n_pixels, 11)
        positions = pixel_positions[i]     # Shape (n_pixels, 2)
    
        data[name] = pd.DataFrame({
                "1610_norm": bands[0][0][:, 0],
                "2186_norm": bands[0][0][:, 1],
                "442_norm": bands[0][0][:, 2],
                "492_norm": bands[0][0][:, 3],
                "559_norm": bands[0][0][:, 4],
                "665_norm": bands[0][0][:, 5],
                "704_norm": bands[0][0][:, 6],
                "739_norm": bands[0][0][:, 7],
                "780_norm": bands[0][0][:, 8],
                "833_norm": bands[0][0][:, 9],
                "864_norm": bands[0][0][:, 10],
                "stumpf": bands[0][0][:, 11],
                "nsmi": bands[0][0][:, 12],
                'ndssi': bands[0][0][:, 13],
                "ndti": bands[0][0][:, 14],
                "SPM": bands[0][0][:, 15],
                "TUR": bands[0][0][:, 16],
                "X": positions[:, 0],
                "Y": positions[:, 1],
                "Channel_Name": [ncf_channels[i]] * len(bands[0][0]),  # Repeating value directly
                "Bathymetry": bands[0][1]
            })

    combined_df = pd.concat(data.values(), ignore_index=True)

    encoder = LabelEncoder()
    combined_df['Channel_Name_Encoded'] = encoder.fit_transform(combined_df['Channel_Name'])

    output = open(os.path.join(WORK_DIR, 'Channel_Name_label_encoders.pkl'), 'wb')
    pickle.dump(encoder, output)
    output.close()
    
    # Drop original categorical columns
    combined_df.drop(columns=['Channel_Name'], inplace=True)

    combined_df.to_parquet(os.path.join(WORK_DIR,'SDB_data.parquet'), engine='pyarrow', index=False)
    print(f"Pixelwise training data saved to {os.path.join(WORK_DIR,'SDB_data.parquet')}")

    return combined_df

# Establish working directories

In [4]:
S2_PATH = '/mnt/Crucial/SDB/CESWG/stacked_acolite'
BATHY_PATH = '/mnt/Crucial/SDB/CESWG/bathy_rasters'
S2_MASK = S2_PATH.replace('stacked', 'masked')
S2_PROJ = S2_PATH.replace('stacked', 'projected')
BATHY_PROJ = BATHY_PATH.replace('rasters', 'proj')

FINAL_PATH = '/home/clay/Documents/SDB/CESWG/processed'
S2_FINAL = os.path.join(FINAL_PATH, 'S2')
BATHY_FINAL = os.path.join(FINAL_PATH, 'Bathy')

os.makedirs(S2_MASK, exist_ok=True)
os.makedirs(S2_PROJ, exist_ok=True)
os.makedirs(BATHY_PROJ, exist_ok=True)
os.makedirs(FINAL_PATH, exist_ok=True)
os.makedirs(S2_FINAL, exist_ok=True)
os.makedirs(BATHY_FINAL, exist_ok=True)

PermissionError: [Errno 13] Permission denied: '/mnt/Crucial'

In [None]:
surveynames = [f[:-4] for f in os.listdir(BATHY_PATH) if f.endswith('.tif')]
surveyinfo = {}
for f in surveynames:
    hydro_tif = os.path.join(BATHY_PATH, f"{f}.tif")
    acolite_path = os.path.join(S2_PATH, f)
    
    if os.path.exists(acolite_path):
        rhow_paths = [os.path.join(acolite_path, file) for file in os.listdir(acolite_path) if 'rhow' in file if file.endswith('.tif')]
        Rrs_paths = [os.path.join(acolite_path, file) for file in os.listdir(acolite_path) if 'Rrs' in file if file.endswith('.tif')]
        spm_tur_paths = [os.path.join(acolite_path, file) for file in os.listdir(acolite_path) if 'TUR_SPM' in file if file.endswith('.tif')]

        surveyinfo[f] = [hydro_tif, rhow_paths, Rrs_paths, spm_tur_paths]

# Adjust the acolite tif Nan values

In [None]:
# in case I already ran it
masked_surveyinfo = {}

for f, paths in surveyinfo.items():
    files = [file for file in os.listdir(os.path.join(S2_MASK, f)) if file.endswith('.tif')]
    rhow_paths = []
    rrs_paths = []
    spm_tur_paths = []

    for file in files:
        if file.startswith('rhow'):
            rhow_paths.append(os.path.join(S2_MASK, f, file))
        elif file.startswith('Rrs'):
            rrs_paths.append(os.path.join(S2_MASK, f, file))
        else:
            spm_tur_paths.append(os.path.join(S2_MASK, f, file))
    
    masked_surveyinfo[f] = rhow_paths, rrs_paths, spm_tur_paths

In [None]:
masked_surveyinfo = {}

for f, paths in surveyinfo.items():
    hydro_tif, rhow_paths, Rrs_paths, spm_tur_paths = paths
    masked_surveyinfo[f] = mask_acolite_images(rhow_paths, Rrs_paths, spm_tur_paths)

# Remove the ACOLITE tifs that have no data (only Nans)

In [None]:
non_nan_masked_surveyinfo = {}

for survey, file_tuple in masked_surveyinfo.items():
    filtered_groups = []
    for group in file_tuple:
        filtered_files = []
        for file_path in group:
            with rasterio.open(file_path) as src:
                data = src.read()  # Read all bands
            if not np.isnan(data).all():
                filtered_files.append(file_path)
        filtered_groups.append(filtered_files)
    # Add survey only if at least one file has data in any group.
    if any(filtered_groups):
        non_nan_masked_surveyinfo[survey] = tuple(filtered_groups)

In [None]:
# compare number of rasters with available data and total rasters
display(len(non_nan_masked_surveyinfo))
display(len(masked_surveyinfo))

In [None]:
# visualize the masked results alongside the ehydro survey
i = -3

non_nan_masked_surveys = list(non_nan_masked_surveyinfo.keys())
survey = non_nan_masked_surveys[i]

rhow_paths, rrs_paths, spm_tur_paths = non_nan_masked_surveyinfo[survey]
hydro_tif = surveyinfo[survey][0]

visualize_rasters(rhow_paths[0], hydro_tif, spm_tur_paths[0])

# Reproject ACOLITE tif products to the eHydro CRS
- need to work on preserving the spatial resolution of the original ACOLITE products (10 meter)
- current methods resample to 10ft resolution
- can do a crude one where I convert 10 meter to ft and directly pass?

In [None]:
# in case I already ran it
reprojected_surveyinfo = {}

for f, paths in non_nan_masked_surveyinfo.items():
    files = [file for file in os.listdir(os.path.join(S2_PROJ, f)) if file.endswith('.tif')]
    rhow_paths = []
    rrs_paths = []
    spm_tur_paths = []

    for file in files:
        if file.startswith('rhow'):
            rhow_paths.append(os.path.join(S2_PROJ, f, file))
        elif file.startswith('Rrs'):
            rrs_paths.append(os.path.join(S2_PROJ, f, file))
        else:
            spm_tur_paths.append(os.path.join(S2_PROJ, f, file))
    
    reprojected_surveyinfo[f] = rhow_paths, rrs_paths, spm_tur_paths

In [None]:
reprojected_surveyinfo = {}

for f, results in non_nan_masked_surveyinfo.items():
    hydro_tif = surveyinfo[f][0]
    rhow_results, Rrs_results, spm_tur_results = results
    paths = []
    for grp in [rhow_results, Rrs_results, spm_tur_results]:
        paths.append(reproject_acolite(hydro_tif, grp, target_resolution_m=10))
    reprojected_surveyinfo[f] = paths

# Get median raster from the surveys that have multiple S2 images

In [None]:
for f, results in reprojected_surveyinfo.items():
    rhow_results, Rrs_results, spm_tur_results = results
    groups = [(rhow_results, "rhow"), (Rrs_results, "Rrs"), (spm_tur_results, "TUR_SPM")]
    
    for group_files, prod in groups:
        if len(group_files) > 1:
            # Read all files into a stack
            arrays = []
            meta = None
            band_descriptions = None
            for path in group_files:
                with rasterio.open(path) as src:
                    if meta is None:
                        meta = src.meta.copy()
                        band_descriptions = src.descriptions
                    arrays.append(src.read())  # shape: (bands, height, width)
            arr_stack = np.stack(arrays, axis=0)  # shape: (n_files, bands, height, width)
            median_arr = np.nanmedian(arr_stack, axis=0)  # shape: (bands, height, width)
            
            # Prepare output path: replace 'projected_acolite' with 'median_acolite'
            # and change the product prefix in the filename.
            out_path = group_files[0].replace("projected_acolite", "median_acolite")
            out_path = out_path.replace(prod, f"median_{prod}")
            out_str = re.sub(r'_\d{2}_\d{2}_\d{2}(?=\.tif$)', '', out_path)
            os.makedirs(os.path.dirname(out_str), exist_ok=True)
            
            # Update metadata if needed (ensure dtype and band count)
            meta.update(dtype="float32", count=median_arr.shape[0])
            # Write the median raster keeping band descriptions
            with rasterio.open(out_str, "w", **meta) as dst:
                dst.write(median_arr.astype(np.float32))
                for i, desc in enumerate(band_descriptions, start=1):
                    dst.set_band_description(i, desc)
            print(f"Median {prod} raster for survey {f} saved to: {out_str}")
        else:
            print(f"Only one {prod} file for survey {f}; skipping median calculation.")

# Resample bathy rasters from 10 ft resolution to same resolution as S2 rasters

In [None]:
# collect median rasters and singular rasters for resampling the bathy rasters
# use set notation to remove the medians from the reprojected rasters
proj_names = os.listdir(S2_PROJ)
median_names = os.listdir(S2_PROJ.replace('projected', 'median'))
names_only_proj = list(set(proj_names) - set(median_names))
acolite_names = median_names + names_only_proj

hydro_paths = [os.path.join(BATHY_PATH, f'{f}.tif') for f in acolite_names]
median_paths = [os.path.join(S2_PROJ.replace('projected', 'median'), f) for f in median_names]
proj_paths = [os.path.join(S2_PROJ, f) for f in names_only_proj]
acolite_paths = median_paths + proj_paths

resample_hydroinfo = {}
for name, hydro, apath in zip(acolite_names, hydro_paths, acolite_paths):
    resample_hydroinfo[name] = [hydro] + [os.path.join(apath, f) for f in os.listdir(apath) if f.endswith('tif')]

In [None]:
resampled_tifs = {}

for f, paths in resample_hydroinfo.items():
    resample_bathy_to_acolite(paths[0], paths[1], paths[0].replace('bathy_rasters', 'bathy_proj'))

    resampled_tifs[f] = [paths[0].replace('bathy_rasters', 'bathy_proj')] + [paths[1], paths[2], paths[3]]

# Clip the S2 rasters by the valid bathymetry pixels
- need bounds of non np.nan pixels for clipping

In [None]:
for name, rasters in resampled_tifs.items():
    clip_acolite_by_bathy(rasters)

# Clip the bathymetry rasters by the valid cloud-masked S2 pixels
- need bounds of valid pixels, seems like these will be values above 0.0 since no nan-value is applied in GEE

In [None]:
for name, rasters in resampled_tifs.items():
    outpath = rasters[0].replace('proj', 'clipped')
    os.makedirs(os.path.dirname(outpath), exist_ok=True)

    if 'median_acolite' in rasters[1]:
        rhow_path = rasters[1].replace("median_acolite", "clipped_acolite")
        rhow_str = re.sub(r'_\d{2}_\d{2}_\d{2}(?=\.tif$)', '', rhow_path)

    elif 'projected_acolite' in rasters[1]:
        rhow_path = rasters[1].replace("projected_acolite", "clipped_acolite")
        rhow_str = re.sub(r'_\d{2}_\d{2}_\d{2}(?=\.tif$)', '', rhow_path)

    clip_bathy_with_acolite_nan(rasters[0], rhow_str, outpath)

Should be done with needed preprocessing, can now move to training the model on the data
- will try traditional ML regression models, as well as CNN
- May try majority voting of multiple training set models like in Tan et al. 2022

# Create dataframe for training ML models on per-pixel basis, export to parquet file

In [None]:
REDE_BATHY = '/mnt/Crucial/SDB/CESWG/bathy_clipped'
REDE_ACOLITE = '/mnt/Crucial/SDB/CESWG/clipped_acolite'
WORK_DIR = '/mnt/Crucial/SDB/CESWG'

In [5]:
REDE_BATHY = '/home/clay/Documents/SDB/CESWG/bathy_clipped'
REDE_ACOLITE = '/home/clay/Documents/SDB/CESWG/clipped_acolite'
WORK_DIR = '/home/clay/Documents/SDB/CESWG'

In [6]:
names = os.listdir(REDE_BATHY)

rhow_info = {}              # create a dictionary storing the data for rhow and tur_spm
Rrs_info = {}               # create a dictionary storign the data for Rrs and tur_spm. May not use since it could be redundant, but nice to have
for name in names:
    bathy_path = os.path.join(REDE_BATHY, name)
    rhow_path = [os.path.join(REDE_ACOLITE, name[:-4], f) for f in os.listdir(os.path.join(REDE_ACOLITE, name[:-4])) if 'rhow' in f][0]
    rrs_path = [os.path.join(REDE_ACOLITE, name[:-4], f) for f in os.listdir(os.path.join(REDE_ACOLITE, name[:-4])) if 'Rrs' in f][0]
    tur_spm_path = [os.path.join(REDE_ACOLITE, name[:-4], f) for f in os.listdir(os.path.join(REDE_ACOLITE, name[:-4])) if 'TUR_SPM' in f][0]

    rhow_info[name[:-4]] = [bathy_path, rhow_path, tur_spm_path]
    Rrs_info[name[:-4]] = [bathy_path, rrs_path, tur_spm_path]

In [7]:
combined_data = prepare_train_data(Rrs_info)
combined_data

  return (array - np.nanmin(array)) / (np.nanmax(array) - np.nanmin(array))
  return (array - np.nanmin(array)) / (np.nanmax(array) - np.nanmin(array))


Pixelwise training data saved to /home/clay/Documents/SDB/CESWG/SDB_data.parquet


Unnamed: 0,1610_norm,2186_norm,442_norm,492_norm,559_norm,665_norm,704_norm,739_norm,780_norm,833_norm,...,stumpf,nsmi,ndssi,ndti,SPM,TUR,X,Y,Bathymetry,Channel_Name_Encoded
0,0.897389,0.967754,0.760320,0.630682,0.722982,0.505699,0.972862,0.966278,1.000000,0.871044,...,0.397993,0.371699,0.133565,0.430897,0.491689,0.491689,3.236274e+06,1.381865e+07,14.372604,16
1,0.897389,0.967754,0.760320,0.509405,0.688707,0.608875,0.972862,0.966278,1.000000,0.883636,...,0.518798,0.547963,0.087404,0.569220,0.595376,0.595376,3.236306e+06,1.381865e+07,14.801007,16
2,0.947371,0.737859,0.884240,0.354741,0.582226,0.640859,0.736661,0.944633,0.947024,0.933258,...,0.646988,0.717626,0.000000,0.659826,0.627736,0.627737,3.236339e+06,1.381865e+07,15.323374,16
3,0.943454,0.731772,0.887762,0.377076,0.427083,0.592018,0.723197,0.937943,0.929376,0.873307,...,0.527334,0.617087,0.051151,0.682958,0.578378,0.578378,3.236372e+06,1.381865e+07,14.946900,16
4,0.897389,0.763548,0.760320,0.561001,0.331673,0.641276,0.743230,0.904535,0.706734,0.837527,...,0.251321,0.371542,0.136936,0.792293,0.628158,0.628158,3.236241e+06,1.381862e+07,16.153040,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2124295,,,,,,,,,,,...,,,,,0.383646,0.383646,2.834474e+06,1.337846e+07,11.000000,120
2124296,,,,,,,,,,,...,,,,,0.330307,0.330307,2.834507e+06,1.337846e+07,11.000000,120
2124297,,,,,,,,,,,...,,,,,0.499987,0.499987,2.834539e+06,1.337846e+07,11.000000,120
2124298,,,,,,,,,,,...,,,,,0.430120,0.430120,2.834572e+06,1.337846e+07,11.000000,120
