# Notebook for Downloading and Formatting Bathymetry Data

In this notebook, we will:

* Download a CSV file containing all session information.
* Manually format this file to match the structure of campagne.csv.
* Search for all relevant files locally and copy them into a dedicated folder.
* Group and merge the files that are closest in time (by year).
* Apply a Bash script to assign colors to depth values, making the data visualizable in a web browser.

In [1]:
# Base import and global path

import shutil
import numpy as np
import pandas as pd
import geopandas as gpd
from pathlib import Path
from pyproj import Transformer
from shapely.geometry import box
from sklearn.cluster import DBSCAN

import rasterio
from rasterio.merge import merge
from rasterio.shutil import copy
from rasterio.enums import Resampling


RAW_BATHY_FOLDER = Path("./data/raw_bathy_files")
MERGED_RASTERS = Path("./data/merged_rasters")

## Gather all bathy rasters in one place

In [2]:
# Download the file from https://docs.google.com/spreadsheets/d/1blCznrPE0fuGqqTTMyzl7-GvqGByyAQGtQSecNV4DHY/edit?usp=sharing
# in csv format and name it campagne.csv
# In the top left header, add the verification column to match all 'v'
# Remove all lines without data.

df = pd.read_csv("campagne.csv")

df2 = df[df["verification"] == "v1"]
df2 = df2[df2["bathy (yes/no)"] == "yes"]

print(f"From {len(df)} sessions, only {len(df2)} have bathymetry data")

From 349 sessions, only 182 have bathymetry data


In [None]:
# The get_path function try to match the session_name with a session on your computer.
# Please, change the path to your local folder.

def get_path(row):
    name = row["session name"]

    date = name.split("_")[0][0:6]


    letter_disk = ""
    if date in ["202312", "202311"]:
        letter_disk = "D"
    elif date in ["202211", "202309", "202310"]:
        letter_disk = "E"
    else:
        letter_disk = "F"

    if 1 <= int(date[5]) <= 7 and int(date[3]) == 3 and int(date[4]) == 0 and "MDG" not in name:
        date = "202301-07"

    return f"/media/bioeos/{letter_disk}/{date}_plancha_session"

df2["root_path"] = df2.apply(lambda row: get_path(row), axis=1)

df3 = df2[["root_path", "session name"]]

In [None]:
# Now, we copy each raster file in the folder to convert them after.

if RAW_BATHY_FOLDER.exists():
    shutil.rmtree(RAW_BATHY_FOLDER)
RAW_BATHY_FOLDER.mkdir(parents=True)

for i, row in df3.iterrows():

    bathy_raster = Path(row["root_path"], row["session name"], "PROCESSED_DATA", "BATHY", f"{row['session name']}_bathy_raster-linear.tif")

    if not bathy_raster.exists():
        print(f"We don't found the bathy raster at path {bathy_raster}")
    else:
        shutil.copy(bathy_raster, Path(RAW_BATHY_FOLDER, bathy_raster.name))

## Regroup and merge rasters by year and zone to get less rasters

In [None]:
# This cell gather all functions necessary to produce the merged raster.

def get_raster_bbox(raster_path):
    """Extract the bounding box of a raster file in EPSG:3857 (meters)."""
    with rasterio.open(raster_path) as dataset:
        bounds = dataset.bounds  # (left, bottom, right, top)

        # Convert coordinates to meters (EPSG:4326 to EPSG:3857)
        transformer = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True)
        min_x, min_y = transformer.transform(bounds.left, bounds.bottom)
        max_x, max_y = transformer.transform(bounds.right, bounds.top)

        return min_x, min_y, max_x, max_y, raster_path

def bbox_distance(bbox1, bbox2):
    """Compute the minimum distance between two bounding boxes."""
    min_x1, min_y1, max_x1, max_y1, _ = bbox1
    min_x2, min_y2, max_x2, max_y2, _ = bbox2

    # Compute horizontal and vertical distances
    dx = max(min_x2 - max_x1, min_x1 - max_x2, 0)  # Distance along x-axis
    dy = max(min_y2 - max_y1, min_y1 - max_y2, 0)  # Distance along y-axis

    return np.hypot(dx, dy)  # Euclidean distance

def group_rasters_by_bbox(raster_paths, distance_threshold=10):
    """Groups raster files based on bounding box proximity."""
    bboxes = [get_raster_bbox(path) for path in raster_paths]

    # Compute pairwise distance matrix
    distance_matrix = np.zeros((len(bboxes), len(bboxes)))
    for i in range(len(bboxes)):
        for j in range(len(bboxes)):
            if i != j:
                distance_matrix[i, j] = bbox_distance(bboxes[i], bboxes[j])

    # Use DBSCAN clustering
    clustering = DBSCAN(eps=distance_threshold, min_samples=1, metric="precomputed").fit(distance_matrix)

    # Organize rasters by cluster labels
    grouped_rasters = {}
    for bbox, label in zip(bboxes, clustering.labels_):
        if label not in grouped_rasters:
            grouped_rasters[label] = []
        grouped_rasters[label].append(bbox)

    return list(grouped_rasters.values())

def merge_rasters_to_cog(groups, output_folder):
    """Merge each group into a single COG file."""
    output_folder = Path(output_folder)
    output_folder.mkdir(exist_ok=True)

    for i, group in enumerate(groups):
        raster_paths = [bbox[4] for bbox in group]  # Extract file paths
        print(f"Generate {i}")

        # Open all rasters
        datasets = [rasterio.open(path) for path in raster_paths]

        # Merge the rasters
        mosaic, out_transform = merge(datasets, resampling=Resampling.bilinear)

        # Use metadata from first raster
        out_meta = datasets[0].meta.copy()
        out_meta.update({
            "driver": "COG",
            "height": mosaic.shape[1],
            "width": mosaic.shape[2],
            "transform": out_transform,
            "compress": "LZW",
            "BIGTIFF": "IF_SAFER"
        })

        merged_path = Path(output_folder, f"group_{i}.tif")

        # Save the merged raster
        with rasterio.open(merged_path, "w", **out_meta) as dest:
            dest.write(mosaic)

        # Convert to COG
        cog_path = Path(output_folder, f"bathy_group_{i}.tif")
        copy(merged_path, cog_path, driver="COG", compress="LZW")

        merged_path.unlink()

        # Close datasets
        for dataset in datasets:
            dataset.close()

In [6]:
years = list(set([a.name[0:4] for a in Path(RAW_BATHY_FOLDER).iterdir()]))

print(f"Years found {sorted(years)}")

for year in years:
    print(f"Working with year {year}")
    raster_files = sorted([a for a in Path(RAW_BATHY_FOLDER).iterdir() if year in a.name])

    groups = group_rasters_by_bbox(raster_files, distance_threshold=10)

    output_folder = Path(MERGED_RASTERS, year)
    if output_folder.exists():
        shutil.rmtree(output_folder)
    output_folder.mkdir(parents=True)

    cog_files = merge_rasters_to_cog(groups, output_folder)

Years found ['2022', '2023', '2024', '2025']
Working with year 2024
Generate 0
Generate 1
Generate 2
Generate 3
Generate 4
Generate 5
Working with year 2022
Generate 0
Generate 1
Generate 2
Generate 3
Generate 4
Generate 5
Generate 6
Generate 7
Generate 8
Generate 9
Working with year 2023
Generate 0
Generate 1
Generate 2
Generate 3
Generate 4
Generate 5
Generate 6
Generate 7
Generate 8
Generate 9
Generate 10
Generate 11
Generate 12
Generate 13
Generate 14
Generate 15
Generate 16
Generate 17
Generate 18
Generate 19
Generate 20
Generate 21
Generate 22
Working with year 2025
Generate 0
Generate 1
Generate 2
Generate 3
Generate 4


## Create web optimized raster with internal tiling

You need to execute the script 1.create_web_optimize_cog.sh

In [None]:
#!/bin/bash

source /home/bioeos/miniconda3/etc/profile.d/conda.sh
conda activate titiler_env

BATHY_FOLDER=./data/bathy_cogs
MERGED_RASTER_FOLDER=./data/merged_rasters

rm -rf $BATHY_FOLDER


for YEAR_FOLDER in $MERGED_RASTER_FOLDER/*;
do
    echo $YEAR_FOLDER;
    YEAR=$(basename "$YEAR_FOLDER" .tif)

    BATHY_FOLDER_YEAR=$BATHY_FOLDER/$YEAR
    mkdir -p $BATHY_FOLDER_YEAR

    for FILE in $YEAR_FOLDER/*.tif;
    do
        echo $FILE;
        BASENAME=$(basename "$FILE" .tif)

        # 1. COLOR COG (RGBA)
        COLOR_TIF="${BATHY_FOLDER}/${BASENAME}_color.tif"
        gdaldem color-relief $FILE color.txt $COLOR_TIF -alpha

        COLOR_COG="${BATHY_FOLDER_YEAR}/${BASENAME}_color_cog.tif"
        rio cogeo create \
            --cog-profile webp \
            --web-optimized \
            --overview-level 8 \
            $COLOR_TIF $COLOR_COG
        
        # 2. BATHY COG (Float32)
        BATHY_COG="${BATHY_FOLDER_YEAR}/${BASENAME}_depth_cog.tif"
        rio cogeo create \
            --cog-profile deflate \
            --web-optimized \
            --overview-resampling average \
            $FILE $BATHY_COG
            
        rm $COLOR_TIF
    done
done