In [None]:
import glob
import os
import time
from io import StringIO
from pathlib import Path

import geopandas as gpd
import pandas as pd
import requests
from shapely.geometry import Polygon

import seabeepy as sb
from seabeepy.config import SETTINGS

In [None]:
# Authenticate against MinIO with the credentials held in the seabeepy settings.
# The client is only needed by the (commented-out) delete calls further down.
minio_client = sb.storage.minio_login(
    user=SETTINGS.MINIO_ACCESS_ID,
    password=SETTINGS.MINIO_SECRET_KEY,
)

# Identify missions that have failed to publish

This notebook gets a list of all datasets on the GeoNode and compares it to the list of missions that have valid config files and `publish: true`. Missions that have not been published successfully can be deleted and reprocessed, if necessary.

In [None]:
def get_geonode_data(geonode_url, timeout=60, pause=0.5):
    """Download basic metadata for every dataset listed by a GeoNode instance.

    Pages through the '/api/v2/datasets/' endpoint, following the 'next' link
    in each response until it is empty.

    Args
        geonode_url: Str. Base URL of the GeoNode instance. A trailing slash is
                     tolerated.
        timeout:     Float. Seconds to wait for each HTTP response before giving
                     up. Default 60.
        pause:       Float. Seconds to sleep between successive page requests.
                     Default 0.5.

    Returns
        GeoDataFrame in EPSG:4326 with one row per dataset and columns 'name',
        'type', 'abstract', 'geometry' (the dataset's lat/lon bounding box) and
        'area_km2'.

    Raises
        requests.HTTPError if a page request returns an error status.
        requests.Timeout if the server does not respond within 'timeout'.
    """
    # Accumulates one record per dataset across all pages
    datasets = []

    # URL for the first page of results
    url = geonode_url.rstrip("/") + "/api/v2/datasets/"

    while url:
        # Get a page of results. Fail loudly on HTTP errors instead of trying to
        # decode an error page as JSON
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        for dataset in data["datasets"]:
            # The outer ring of the lat/lon bounding box becomes the geometry
            bbox = dataset["ll_bbox_polygon"]["coordinates"][0]
            polygon = Polygon(bbox)

            datasets.append(
                {
                    "name": dataset["name"],
                    "type": dataset["subtype"],
                    "abstract": dataset["abstract"],
                    "geometry": polygon,
                }
            )

        # URL for the next page of results; falsy on the last page
        url = data["links"]["next"]

        # Pause between requests, but not after the final page
        if url:
            time.sleep(pause)

    # Convert the list into a GeoDataFrame
    gdf = gpd.GeoDataFrame(datasets, crs="EPSG:4326")

    # Reproject to EPSG:3035 (ETRS89-extended / LAEA Europe) to calculate area in km^2
    gdf["area_km2"] = gdf.to_crs("EPSG:3035")["geometry"].area / 10**6

    return gdf


def _extract_abstract_field(x, label):
    """Read one value from the two-column HTML table embedded in an 'abstract'.

    The first table found in the HTML is parsed with its first column as the
    index, and the value in column 1 of the row named 'label' is returned.

    Args
        x:     Str or null. HTML text of the dataset abstract.
        label: Str. Row label to look up in the first column of the table.

    Returns
        The cell value for 'label', or an empty string if 'x' is null, cannot be
        parsed, or does not contain a row with that label.
    """
    try:
        if not pd.notnull(x):
            return ""
        table = pd.read_html(StringIO(x), index_col=0)[0]
        return table.loc[label, 1]
    except Exception:
        # Deliberately best-effort: abstracts are free text, so any parsing or
        # lookup failure just means the field is unavailable for this dataset
        return ""


def extract_minio_path(x):
    """Parse the MinIO path from the 'abstract', where available.

    Returns an empty string if the abstract is null or has no 'MinIO path' row.
    """
    return _extract_abstract_field(x, "MinIO path")


def extract_geopackage_path(x):
    """Parse the gpkg path from the 'abstract', where available.

    Returns an empty string if the abstract is null or has no 'Geopackage Path'
    row.
    """
    return _extract_abstract_field(x, "Geopackage Path")


def assign_theme(x):
    """Parse the SeaBee 'theme' from the 'abstract', where available.

    Returns an empty string if the abstract is null or has no 'Theme' row.
    """
    return _extract_abstract_field(x, "Theme")

## 1. Directories to scan

In [None]:
# Folders that are searched recursively for mission config files. Uncomment an
# entry to include it in the scan; only active entries end up in 'base_dirs'.
_shared_root = "/home/notebook/shared-seabee-ns9879k"
_active_subdirs = [
    # "seabirds/2017",
    # "seabirds/2018",
    # "seabirds/2019",
    # "seabirds/2020",
    # "seabirds/2021",
    # "seabirds/2022",
    # "seabirds/2023",
    # "seabirds/2024",
    "seabirds/2025",
    # "niva-tidy/2022",
    # "niva-tidy/2023",
    # "niva-tidy/2024",
    # "niva-tidy/2025",
    # "dmc",
    # "imr",
    # "ntnu",
    # "obama-next",
    # "massimal",
]
base_dirs = [f"{_shared_root}/{subdir}" for subdir in _active_subdirs]

## 2. Get all GeoNode data

In [None]:
# Fetch the metadata of every dataset currently listed on the SeaBee GeoNode
url = "https://geonode.seabee.sigma2.no"
gdf = get_geonode_data(geonode_url=url)

## 3. Identify unpublished orthomosaics

In [None]:
# Raster layers only (the 'coastline' layer is excluded), with the MinIO path
# and theme parsed out of each layer's abstract
ras_gdf = gdf.query("(name != 'coastline') and (type == 'raster')").assign(
    minio_path=lambda df: df["abstract"].apply(extract_minio_path),
    theme=lambda df: df["abstract"].apply(assign_theme),
)

# Names of the raster layers that are already on the GeoNode
published_list = ras_gdf["name"].tolist()
ras_gdf.head()

In [None]:
# Every folder under 'base_dirs' that contains a mission config file
config_files = (
    cfg for root in base_dirs for cfg in Path(root).rglob("config.seabee.yaml")
)
candidate_dirs = [cfg.parent for cfg in config_files]

# Keep missions with a valid config first, then those flagged for publishing
valid_dirs = list(filter(sb.ortho.check_config_valid, candidate_dirs))
mission_list = [
    mission_dir
    for mission_dir in valid_dirs
    if sb.ortho.parse_config(mission_dir)["publish"]
]
print(len(mission_list))

In [None]:
# Report missions whose standardised orthomosaic exists on disk but whose layer
# is missing from the GeoNode
for mission_fold in mission_list:
    layer_name = sb.ortho.get_layer_name(mission_fold)
    stan_path = os.path.join(mission_fold, "orthophoto", f"{layer_name}.tif")

    if layer_name in published_list:
        continue
    if not os.path.exists(stan_path):
        continue

    print(mission_fold)
    # sb.storage.delete_file(stan_path, minio_client)

## 4. Identify unpublished `detections` geopackages

In [None]:
# Get vectors (the 'coastline' layer is excluded)
vec_gdf = gdf.query("(name != 'coastline') and (type == 'vector')").copy()

# Parse the geopackage path from the abstracts of the vector layers only. Using
# 'vec_gdf' rather than the full 'gdf' avoids parsing every dataset's abstract
# and relying on index alignment to discard the non-vector rows
vec_gdf["minio_path"] = vec_gdf["abstract"].apply(extract_geopackage_path)

# Names of the vector layers that are already on the GeoNode
published_list = vec_gdf["name"].tolist()
vec_gdf.head()

In [None]:
# Mission folders are the parents of every config file found under 'base_dirs'
mission_list = [
    f.parent
    for base_dir in base_dirs
    for f in Path(base_dir).rglob("config.seabee.yaml")
]
mission_list = [f for f in mission_list if sb.ortho.check_config_valid(f)]

# Parse each config once, then keep the missions flagged for both publishing
# and classification
parsed_configs = [(f, sb.ortho.parse_config(f)) for f in mission_list]
mission_list = [
    f for f, config in parsed_configs if config["publish"] and config["classify"]
]
print(len(mission_list))

In [None]:
# Report missions that have a detections geopackage on disk but no matching
# layer on the GeoNode
for mission_fold in mission_list:
    layer_name = sb.ortho.get_layer_name(mission_fold) + "_detections"

    # Escape the literal parts of the pattern so that glob characters in folder
    # or layer names are not treated as wildcards; only the "*" (one subfolder
    # of results/detection) is a real wildcard
    search_pattern = os.path.join(
        glob.escape(str(mission_fold)),
        "results",
        "detection",
        "*",
        glob.escape(layer_name + ".gpkg"),
    )
    matching_files = glob.glob(search_pattern)

    # Missions without results on disk, or already published, are not reported
    if not matching_files or layer_name in published_list:
        continue

    print(mission_fold)
    if len(matching_files) == 1:
        gpkg_path = matching_files[0]
        print(gpkg_path)
        # sb.storage.delete_file(gpkg_path, minio_client)
    else:
        # More than one subfolder holds a geopackage for this layer
        print("Multiple detection results.")
    print("")