In [None]:
import fnmatch
import os
import time
from io import StringIO

import pandas as pd
import requests

import seabeepy as sb
from seabeepy.config import SETTINGS

In [None]:
# Authenticate against MinIO using the credentials held in the seabeepy settings
minio_client = sb.storage.minio_login(
    user=SETTINGS.MINIO_ACCESS_ID,
    password=SETTINGS.MINIO_SECRET_KEY,
)

# Base URL of the GeoNode API. The GEONODE_URL environment variable, when set,
# takes precedence over the default below.
GEONODE_URL = os.getenv("GEONODE_URL", r"https://geonode.seabee.sigma2.no/api/v2")

# Fix datasets with no abstract

Updating metadata via the GeoNode API occasionally times out and fails. This leads to datasets being published correctly, but without any abstract information. This notebook attempts to identify datasets where this has happened and deletes them so that they can be republished.

## 1. Identify datasets with no abstract

In [None]:
def get_geonode_data(geonode_url, timeout=60):
    """Fetch basic metadata for every dataset listed by the GeoNode API.

    Follows the paginated '/datasets/' endpoint, page by page, until the API
    reports that there is no further page.

    Args:
        geonode_url: Str. Base URL of the GeoNode API. A trailing slash is
                     tolerated.
        timeout:     Int or float. Seconds to wait for each HTTP response
                     before giving up. Default 60.

    Returns:
        DataFrame with one row per dataset and columns 'name', 'id', 'type'
        and 'abstract'. The columns are present even if no datasets are
        returned.

    Raises:
        requests.HTTPError: If any page request returns an error status.
        requests.Timeout:   If a response is not received within 'timeout'.
    """
    columns = ["name", "id", "type", "abstract"]

    # Accumulate one record per dataset across all pages
    datasets = []

    # URL for the first page of results
    base_url = geonode_url.rstrip("/")
    url = f"{base_url}/datasets/"

    while url:
        # Get a page of results. Fail loudly on HTTP errors rather than trying
        # to parse an error page as if it were a list of datasets
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        for dataset in data["datasets"]:
            datasets.append(
                {
                    "name": dataset["name"],
                    "id": dataset["pk"],
                    "type": dataset["subtype"],
                    "abstract": dataset["abstract"],
                }
            )

        # URL for the next page of results, or None if this is the last page
        url = data["links"]["next"]

        # Brief pause between requests; pointless after the final page
        if url:
            time.sleep(0.2)

    # Passing 'columns' explicitly guarantees the expected columns exist even
    # when 'datasets' is empty
    return pd.DataFrame(datasets, columns=columns)


def find_file(directory, pattern):
    """Search a directory tree for the first file whose name matches a pattern.

    Args:
        directory: Str. Root folder to search (recursively, via os.walk).
        pattern:   Str. Shell-style pattern (fnmatch syntax) to match against
                   file names.

    Returns:
        Str. Full path to the first matching file in os.walk order, or None
        if nothing matches.
    """
    for folder, _subfolders, file_names in os.walk(directory):
        matches = fnmatch.filter(file_names, pattern)
        if matches:
            return os.path.join(folder, matches[0])
    return None

In [None]:
# Get all datasets published on GeoNode
df = get_geonode_data(GEONODE_URL)

# Datasets with no abstract, identified by the literal text "No abstract" in
# the abstract field. 'na=False' treats null abstracts as non-matching; without
# it, a single null value makes the boolean mask invalid and the filter raises
# a ValueError.
has_no_abstract = df["abstract"].str.contains("No abstract", na=False, regex=False)
noabs_df = df[has_no_abstract].reset_index(drop=True)
print(len(noabs_df), "datasets without an abstract.")
noabs_df.head()

## 2. Delete datasets with no abstract

Datasets are deleted from both MinIO and GeoNode. Nothing is deleted unless `dry_run` is set to `False`.

In [None]:
# Safety switch for the deletion loop below: datasets are only deleted when
# 'dry_run' is False. While it is True, the loop just prints what it would delete.
dry_run = True

In [None]:
def delete_dataset(ds_id, file_path, file_name):
    """Delete a dataset from GeoNode and then remove its file from MinIO.

    Relies on the module-level names 'GEONODE_URL', 'auth' and 'minio_client',
    which must be defined before this function is called.

    Args:
        ds_id:     ID of the GeoNode resource to delete.
        file_path: Path of the file to remove, passed to
                   sb.storage.delete_file.
        file_name: Str. Name used only in the confirmation message.

    Returns:
        None. Prints a confirmation once both deletions have been issued.

    Raises:
        requests.HTTPError: If the GeoNode DELETE request returns an error
            status. In that case the file is not deleted from MinIO.
    """
    resource_url = f"{GEONODE_URL}/resources/{ds_id}"
    requests.delete(resource_url, auth=auth).raise_for_status()
    sb.storage.delete_file(file_path, minio_client)
    print(f"Deleted: {file_name}")


# Delete datasets without an abstract (on both GeoNode and MinIO).
# NOTE: 'find_file' walks the whole of 'base_fold' once per dataset, so this
# loop can be slow on a large file system.
base_fold = r"/home/notebook/shared-seabee-ns9879k"
auth = (SETTINGS.GEONODE_USER, SETTINGS.GEONODE_PASSWORD)

cnt = 0  # Datasets deleted (or that would be deleted during a dry run)
skip = 0  # Datasets this notebook does not know how to handle
not_found = 0  # Datasets whose file could not be located under 'base_fold'

for _, row in noabs_df.iterrows():
    name = row["name"]
    ds_id = row["id"]
    ds_type = row["type"]

    # For vector datasets, the last underscore-separated token of the name
    # identifies the kind of ML product
    ml_type = name.split("_")[-1] if ds_type == "vector" else None

    # Decide which file extension backs this dataset
    if ds_type == "raster":
        file_ext = "tif"
    elif ml_type == "detections":
        # Seabirds or mammals
        file_ext = "gpkg"
    elif ml_type == "classifications":
        # Habitats
        skip += 1
        print(f"Fixing classifications not yet implemented: {name}")
        continue
    else:
        # Unknown vector product or unknown dataset type
        skip += 1
        print(f"Skipping: {name}")
        continue

    file_name = f"{name}.{file_ext}"
    file_path = find_file(base_fold, file_name)
    if not file_path:
        not_found += 1
        print(f"Not found: {file_name}")
        continue

    if dry_run:
        print(f"Would delete: {file_name}")
    else:
        delete_dataset(ds_id, file_path, file_name)
    cnt += 1

# Report what actually happened: nothing is deleted during a dry run
action = "would be deleted" if dry_run else "deleted"
print(
    f"\n{skip} datasets skipped; {not_found} datasets not found; "
    f"{cnt} datasets {action}."
)