In [1]:
import datetime as dt
import os

from config import SETTINGS

import seabeepy as sb

# Upload NIVA datasets

Files within the `niva` bucket are arranged in a fairly complex hierarchy. This can probably be simplified, but it's going to take a while.

This notebook searches a "mission" folder for orthomosaics with user-specified names. For example, Hege sent me a list of mosaic names for the Kelpmap project (which I hadn't previously found because they're quite deeply buried). The original file names are generally not very helpful, so in this notebook the user must specify more suitable names and metadata for each file. This is done manually. The code then performs the following operations:

 1. Finds the files on MinIO
 2. Builds 3-band COGs for all datasets
 3. Uploads the files to GeoServer
 4. Publishes them to GeoNode
 5. Updates the basic metadata by extracting information from the user-specified file names
 
## 1. User input

In [2]:
# Mission folder to search. It is walked recursively, so the input files may
# sit at any depth below it
base_dir = (
    r"/home/notebook/shared-seabee-ns9879k/niva/2022/2022-08-31_RUNDE/2022-08-31_Remoy/"
)

# Input files to search for, mapped to more helpful output file names:
#     region_area_org_spec_date-time.tif
# Keys are the original file names (matched exactly); values are the names
# given to the processed files. The underscore-separated parts of each output
# name are later used to build the dataset metadata, so each part must not
# itself contain an underscore.
# Commented-out entries are examples from the Kelpmap project.
fnames_dict = {
    "rgb_total_composite.tif": "Runde_Remoy_NTNU_HSI_20220831-0800.tif",
    # "20220819_1230_RGB_120m_transparent_mosaic_group1.tif": "KELPMAP_Vega-North_Spectrofly_RGB_20220819-1230.tif",
    # "KelpMap_S_20220818_MS_v2.tif": "KELPMAP_Vega-South_Spectrofly_MS_20220818-0000.tif",
    # "1055rgb120_transparent_mosaic_group1.tif": "KELPMAP_Vega-North_NIVA_RGB_20220818-1055.tif",
    # "0944_ms_120_composite.tif": "KELPMAP_Vega-North_NIVA_MS_20220819-0944.tif",
    # "0814_rgb_115_transparent_mosaic_group1.tif": "KELPMAP_Vega-South_NIVA_RGB_20220819-0814.tif",
    # "0814_ms_115_composite.tif": "KELPMAP_Vega-South_NIVA_MS_20220819-0814.tif",
    # "1012_rgb_60_transparent_mosaic_group1.tif": "KELPMAP_Vega-South_NIVA_RGB_20220819-1012.tif",
}

# Temp folder with 'write' access. Each COG is written here and deleted again
# once it has been copied to MinIO
cog_fold = r"/home/notebook/cogs/"

# Raster properties passed to the COG conversion: the no-data value and the
# band numbers to use for red, green and blue
no_data = 0
red_band = 1
green_band = 2
blue_band = 3

In [3]:
# Check output names are unique. On failure, the message lists the names that
# occur more than once so the offending entries in 'fnames_dict' are easy to find
_out_names = list(fnames_dict.values())
_dup_names = sorted({name for name in _out_names if _out_names.count(name) > 1})
assert not _dup_names, f"Duplicate output file names in fnames_dict: {_dup_names}"

In [4]:
def find_file_by_name(fname, parent_folder):
    """Recursively search a folder tree for files with an exact name.

    Args:
        fname: File name to match (exact match against the names in each
            folder; no wildcards).
        parent_folder: Root of the folder tree to search.

    Returns:
        List with one path per folder that contains a file called 'fname',
        in the order the folders are visited by os.walk. Empty if there is
        no match.
    """
    return [
        os.path.join(folder, fname)
        for folder, _subfolders, names in os.walk(parent_folder)
        if fname in names
    ]

In [5]:
# Resolve each requested input name to a single path below 'base_dir'.
# Names without a match are reported; for names with several matches the
# candidate paths are printed. In both cases the name is left out of 'flist'.
flist = []
for fname in fnames_dict:
    fpaths = find_file_by_name(fname, base_dir)
    n_found = len(fpaths)
    if n_found == 1:
        flist.append(fpaths[0])
    elif n_found == 0:
        print("Could not find:", fname)
    else:
        print(fpaths)
flist

['/home/notebook/shared-seabee-ns9879k/niva/2022/2022-08-31_RUNDE/2022-08-31_Remoy/1_drone/1_HSI/2022-08-31_0800_HSI/results/rgb_total_composite.tif']

## 2. Process files

In [6]:
# Log in to MinIO with the credentials held in the local config module
minio_client = sb.storage.minio_login(
    user=SETTINGS.MINIO_ACCESS_ID,
    password=SETTINGS.MINIO_SECRET_KEY,
)

In [7]:
for fpath in flist:
    # The processed file takes the user-specified name. It is first written to
    # the temp folder and then copied next to the original mosaic
    mission_fold, fname = os.path.split(fpath)
    out_name = fnames_dict[fname]
    cog_path = os.path.join(cog_fold, out_name)
    minio_path = os.path.join(mission_fold, out_name)

    print("Processing:", out_name)

    # Step 1: build the COG in the temp folder
    print("  Converting to COG...")
    sb.geo.standardise_orthophoto(
        fpath,
        cog_path,
        red_band=red_band,
        green_band=green_band,
        blue_band=blue_band,
        nodata=no_data,
    )

    # Step 2: copy the COG to its final location, then delete the temp copy
    print("  Copying COG to MinIO...")
    sb.storage.copy_file(cog_path, minio_path, minio_client, overwrite=False)
    os.remove(cog_path)

    # Step 3: register the raster with GeoServer. The returned layer name is
    # used to identify the dataset in all the following steps
    print("  Uploading to GeoServer...")
    layer_name = sb.geo.upload_raster_to_geoserver(
        minio_path,
        SETTINGS.GEOSERVER_USER,
        SETTINGS.GEOSERVER_PASSWORD,
        workspace="geonode",
    )

    # Step 4: publish the layer to GeoNode
    print("  Publishing to GeoNode...")
    geonode_auth = (SETTINGS.GEONODE_USER, SETTINGS.GEONODE_PASSWORD)
    sb.geo.publish_to_geonode(layer_name, *geonode_auth, workspace="geonode")

    # Step 5: derive basic metadata from the layer name, which is expected to
    # have the format:
    #    region_area_org_spec_date-time
    # (unpacking fails with ValueError if there are not exactly five parts)
    print("  Updating metadata...")
    region, area, org, spec, stamp = layer_name.split("_")
    date = dt.datetime.strptime(stamp, "%Y%m%d-%H%M")

    summary = (
        f"{spec} mosaic collected by {org} for the {region} survey at {area} on {date}."
    )
    source_note = f"<br><br><b>MinIO file name:</b> {minio_path}."
    abstract = summary + source_note

    metadata = {
        "abstract": abstract,
        "date": date.isoformat(),
        "date_type": "creation",
        "attribution": "SeaBee",
    }
    sb.geo.update_geonode_metadata(layer_name, *geonode_auth, metadata)

    print("  Done.")

Processing: Runde_Remoy_NTNU_HSI_20220831-0800.tif
  Converting to COG...
Input file size is 6953, 5319
0...10...20...30...40...50...60...70...80...90...100 - done.
  Copying COG to MinIO...
  Uploading to GeoServer...
  Publishing to GeoNode...
  Updating metadata...
  Done.


## Update keywords and ISO fields

GeoNode has a REST API for datasets at `/api/v2/datasets`, but updating `keywords` and `tkeywords` through it does not seem to work. One (temporary) workaround is to:

1. Get the dataset using the geonode api
2. Get the full iso `MD_Metadata` using the csw endpoint `/catalogue/csw`
    - note pycsw does not support transactions for geonode
3. Modify the iso record using seabeepy's `gmd` package (lxml would also work)
4. Login with a csrf token and upload the iso file on `/datasets/upload`


In [None]:
# pip3 install git+https://github.com/SeaBee-no/seabeepy.git
# ignore owslib future warning
import warnings
from typing import Dict, List, Tuple

import requests

# used to marshall the metadata xml
from xsdata.formats.dataclass import serializers

from seabeepy.metadata import gmd, utils

# Silence every FutureWarning from here on (added because of the owslib
# warning mentioned at the top of this cell). Note the filter is not limited
# to owslib: it applies to FutureWarning from any module
warnings.simplefilter(action="ignore", category=FutureWarning)

In [None]:
def upload_iso_metadata(
    ds_metadata: List[Tuple[str, gmd.MdMetadata]], geonode_url
) -> List[requests.Response]:
    """Upload ISO metadata records for a list of datasets.

    Logs in to GeoNode via its login form to obtain an authenticated session
    and a CSRF token, then posts each record as an XML file to
    `datasets/upload`.

    Args:
        ds_metadata: List of (title, record) pairs. The title is sent as the
            'dataset_title' form field and the record is serialised to XML
            and sent as the uploaded file.
        geonode_url: Base URL of the GeoNode instance (no trailing slash).

    Returns:
        One `requests.Response` per item in 'ds_metadata', in the same order.
        The upload responses themselves are NOT checked here; inspect them
        in the caller.

    Raises:
        requests.HTTPError: If the login request ends with an HTTP error status.
        KeyError: If no 'csrftoken' cookie was set by GeoNode.
    """
    serializer = serializers.XmlSerializer()
    login_url = f"{geonode_url}/account/login/"

    def post_metadata(client, title: str, ds_meta: gmd.MdMetadata, csrftoken: str):
        """Post a single metadata record using the logged-in session."""
        return client.post(
            f"{geonode_url}/datasets/upload",
            files={
                "base_file": (
                    "sample.xml",
                    serializer.render(ds_meta),
                    "text/xml",
                ),
            },
            data={
                "permissions": "{}",
                "charset": "undefined",
                "metadata_upload_form": "true",
                "dataset_title": title,
            },
            headers={
                # https://geonode.seabee.sigma2.no/datasets/geonode:test_workshop_Kunstcafe_transparent_mosaic_group1/metadata_upload
                "Referer": login_url,
                "X-CSRFToken": csrftoken,
                "X-Requested-With": "XMLHttpRequest",
            },
            cookies={"csrftoken": csrftoken},
        )

    # The context manager closes the session (and its pooled connections) even
    # if one of the requests raises. The returned responses remain usable
    # because their bodies are read in full before the session is closed.
    with requests.Session() as client:
        # First request only serves to receive the initial CSRF cookie
        client.get(login_url)
        # Django would like the csrf token passed with the data, so we do need
        # to save it off separately.
        csrftoken = client.cookies["csrftoken"]
        login_resp = client.post(
            login_url,
            headers=dict(Referer=login_url),
            data={
                "login": SETTINGS.GEONODE_USER,
                "password": SETTINGS.GEONODE_PASSWORD,
                "csrfmiddlewaretoken": csrftoken,
            },
        )
        # Fail here rather than letting every upload fail later on.
        # NOTE(review): a rejected password may still come back as HTTP 200
        # (the login form re-rendered), which this check cannot detect.
        login_resp.raise_for_status()
        # For some reason, we are issued a new csrf token after logging in, so
        # update the local copy.
        csrftoken = client.cookies["csrftoken"]

        return [
            post_metadata(client, title, ds_meta, csrftoken)
            for title, ds_meta in ds_metadata
        ]

In [None]:
def get_datasets(url) -> List[Dict]:
    """Collect the dataset records from a paginated listing.

    Requests 'url', takes the items under the "datasets" key of the JSON
    response and keeps following the `links.next` URL until it is None.

    Args:
        url: URL of the first page of the listing.

    Returns:
        The dataset records from all pages, in page order.
    """
    collected: List[Dict] = []
    page_url = url
    while True:
        page = requests.get(page_url).json()
        collected.extend(page["datasets"])
        page_url = page["links"]["next"]
        if page_url is None:
            break

    return collected

In [None]:
# Base URL of the GeoNode instance. Note that some of the routes used below
# (e.g. `/datasets/upload`) are older, irregular routes that are not part of
# the v2 API
geonode_url = "https://geonode.seabee.sigma2.no"

# XML serializer used to marshal the metadata records (pretty-printed)
serializer = serializers.XmlSerializer(
    config=serializers.config.SerializerConfig(pretty_print=True)
)

# Fetch all dataset records, following the pagination links. (The first page
# is requested inside get_datasets, so no separate request is needed here.)
datasets = get_datasets(f"{geonode_url}/api/v2/datasets")

In [None]:
# Keep only the records whose subtype is "raster", then show how many remain
datasets = [entry for entry in datasets if entry["subtype"] == "raster"]
len(datasets)

In [None]:
# Collect (title, ISO record) pairs for all datasets. Keeping the records in
# a list also allows us to marshall all of them to disk. The title is taken
# from the dataset's `alternate` field.
ds_metadata_list = []

for ds in datasets:
    ds_meta = utils.fetch_dataset_iso(ds["uuid"], geonode_url)

    # Show the current title and abstract of the record
    data_ident = ds_meta.identification_info[0].md_data_identification
    abstract = data_ident.abstract.character_string
    print(data_ident.citation.ci_citation.title.character_string)
    print(abstract)

    # Start from a record without keywords.
    # Could also just remove norwegian keywords if keeping custom keywords:
    # ds_meta = utils.remove_norwegian_thesarus(ds_meta)
    ds_meta = utils.remove_all_keywords(ds_meta)

    # Add norwegian keywords
    # See https://register.geonorge.no/metadata-kodelister/inspiretema
    ds_meta = utils.add_norwegian_thesarus_keywords(
        ds_meta, ["Ortofoto", "Habitater og biotoper"]
    )

    # Add custom seabee keywords: always "SeaBee", plus every known
    # organisation/project name that is mentioned in the abstract
    keywords = ["SeaBee"] + [
        tag for tag in ("NIVA", "NINA", "Spectrofly", "KELPMAP") if tag in abstract
    ]
    ds_meta = utils.add_seabee_keywords(ds_meta, keywords)

    ds_metadata_list.append((ds["alternate"], ds_meta))

### We can marshall the python object to get the xml

In [None]:
# Write the first record to disk as an example. The encoding is set explicitly
# so that non-ASCII characters in the metadata are written the same way on
# every platform, rather than depending on the locale's default encoding.
# NOTE(review): assumes the serializer declares UTF-8 in the XML header
# (believed to be the xsdata default) - confirm
with open("./data/sample.xml", "w", encoding="utf-8") as f:
    f.write(serializer.render(ds_metadata_list[0][1]))

In [None]:
# Log in to GeoNode and upload the ISO record of every dataset in the list.
# The result holds one response per uploaded record
resp_list = upload_iso_metadata(ds_metadata_list, geonode_url)

In [None]:
# Show the outcome of each upload: the full JSON reply, its status and the
# absolute URL built from the relative 'url' in the reply
for response in resp_list:
    payload = response.json()
    print(payload)
    print(payload["status"])
    print(f"{geonode_url}{payload['url']}")

In [None]:
# Number of upload responses (one per record that was posted)
len(resp_list)