In [1]:
import datetime as dt
import json
import os
import subprocess
import time
from glob import glob

import requests
from config import SETTINGS
from geo.Geoserver import Geoserver
from tqdm.notebook import tqdm


# Upload NIVA datasets

Files within the `niva` bucket are arranged in a fairly complex hierarchy. This can probably be simplified, but it's going to take a while.

This notebook searches a "mission" folder for orthomosaics with user-specified names. For example, Hege sent me a list of mosaic names for the Kelpmap project (which I hadn't previously found because they're quite deeply buried). The original file names are generally not very helpful, so in this notebook the user must specify more suitable names for each file. This is done manually. The code then performs the following operations:

 1. Finds the files on MinIO
 2. Builds 3-band COGs for all datasets
 3. Uploads the files to GeoServer
 4. Publishes them to GeoNode
 5. Updates the basic metadata by extracting information from the user-specified file names
 
## 1. Find files

In [None]:
def find_file_by_name(fname, parent_folder):
    """Recursively search a folder tree for files with an exact name.

    Args:
        fname: File name to look for (including extension, without any
            directory component).
        parent_folder: Root of the folder tree to walk.

    Returns:
        List with one full path per folder that contains a file named
        'fname', in the order the folders are visited by os.walk. The list
        is empty if nothing matches.
    """
    return [
        os.path.join(folder, fname)
        for folder, _subfolders, names in os.walk(parent_folder)
        if fname in names
    ]

In [None]:
# Mission folder to search
base_dir = r"/home/notebook/shared-seabee-ns9879k/niva/2022/2022-08-18_KELPMAP_Vega/"

# Input files to search for, mapped to more helpful output file names:
#     project_region_area_org_spec_date-time.tif
# The output names must follow this pattern exactly: the metadata step splits
# the name (without extension) on "_" into six fields and parses the last one
# with the format "%Y%m%d-%H%M".
fnames_dict = {
    "KelpMap_N_20220819_MS_comp.tif": "KELPMAP_Vega_North_Spectrofly_MS_20220819-0000.tif",
    "20220819_1230_RGB_120m_transparent_mosaic_group1.tif": "KELPMAP_Vega_North_Spectrofly_RGB_20220819-1230.tif",
    "KelpMap_S_20220818_MS_v2.tif": "KELPMAP_Vega_South_Spectrofly_MS_20220818-0000.tif",
    "1055rgb120_transparent_mosaic_group1.tif": "KELPMAP_Vega_North_NIVA_RGB_20220818-1055.tif",
    "0944_ms_120_composite.tif": "KELPMAP_Vega_North_NIVA_MS_20220819-0944.tif",
    "0814_rgb_115_transparent_mosaic_group1.tif": "KELPMAP_Vega_South_NIVA_RGB_20220819-0814.tif",
    "0814_ms_115_composite.tif": "KELPMAP_Vega_South_NIVA_MS_20220819-0814.tif",
    "1012_rgb_60_transparent_mosaic_group1.tif": "KELPMAP_Vega_South_NIVA_RGB_20220819-1012.tif",
}

# Folder the COGs are written to; later cells glob "*.tif" in this folder
cog_fold = r"/home/notebook/cogs/"
# Passed to gdal_translate as the NUM_THREADS creation option
n_threads = 4

In [None]:
# Check output names are unique. Two inputs mapped to the same output name
# would be written to the same COG path, so fail early and say which names clash.
out_names = list(fnames_dict.values())
duplicates = sorted({name for name in out_names if out_names.count(name) > 1})
assert not duplicates, f"Duplicate output file names in 'fnames_dict': {duplicates}"

In [None]:
# Locate each input file below 'base_dir'. Only names with exactly one match
# are added to 'flist'; ambiguous or missing names are reported and skipped so
# the user can fix 'fnames_dict' or 'base_dir' before converting.
flist = []
for fname in fnames_dict:
    fpaths = find_file_by_name(fname, base_dir)
    if len(fpaths) == 1:
        flist.append(fpaths[0])
    elif fpaths:
        # Say which input name is ambiguous, not just the candidate paths
        print(f"Multiple matches for '{fname}' (skipped):", fpaths)
    else:
        print("Could not find:", fname)

## 2. Convert to COGs

In [None]:
# gdal_translate does not create missing output folders, so make sure the
# COG folder exists before converting anything.
os.makedirs(cog_fold, exist_ok=True)

for fpath in flist:
    # Output path uses the user-specified name for this input file
    fname = os.path.basename(fpath)
    cog_path = os.path.join(cog_fold, fnames_dict[fname])
    cmd = [
        "gdal_translate",
        # Keep only bands 1-3
        "-b", "1",
        "-b", "2",
        "-b", "3",
        # Write an 8-bit Cloud Optimized GeoTIFF
        "-of", "COG",
        "-ot", "Byte",
        "-co", "COMPRESS=LZW",
        "-co", "PREDICTOR=2",
        "-co", f"NUM_THREADS={n_threads}",
        "-co", "OVERVIEWS=IGNORE_EXISTING",
        "-co", "BIGTIFF=YES",
        # Rescale the input values to the output (Byte) range
        "-scale",
        # Flag 0 as the NoData value in the output
        "-a_nodata", "0",
        fpath,
        cog_path,
    ]
    # Raises CalledProcessError if gdal_translate exits with a non-zero status
    subprocess.check_call(cmd)

## 3. Upload to GeoServer

In [None]:
# Authernticate with GeoServer
geo = Geoserver(
    "https://geonode.seabee.sigma2.no/geoserver",
    username=SETTINGS.GEOSERVER_USER,
    password=SETTINGS.GEOSERVER_PASSWORD,
)

In [None]:
# Push every GeoTIFF found in 'cog_fold' to GeoServer as a coverage store
workspace = "geonode"

flist = glob(os.path.join(cog_fold, "*.tif"))
for fpath in tqdm(flist):
    # Layer name is the file name without folder or extension
    layer_name, _ext = os.path.splitext(os.path.basename(fpath))

    # Add to GeoServer. Note: Will overwrite layer if it exists
    status = geo.create_coveragestore(
        path=fpath, workspace=workspace, layer_name=layer_name
    )

## 4. Update GeoNode

Trigger the `updatelayers` command for each new layer via the GeoNode API.

Alternatively, this can be done manually by logging in to the GeoNode administration panel and navigating to

    Home > Management Commands Over HTTP > Management command jobs
    
Choose `Add management command job` and set the **Command** to `updatelayers`. Check the **Autostart** box and click **Save**. If you have added a lot of data, the update process may take a while. When it is finished, the status should be updated and the new image datasets should be visible in GeoNode.

In [2]:
base_url = "https://geonode.seabee.sigma2.no/api/v2/"
cmd_url = base_url + r"management/commands/"
status_url = base_url + r"management/jobs/"

headers = {"Content-Type": "application/json"}
# NOTE(review): this pairs GEOSERVER_USER with GEONODE_PASSWORD, while the ISO
# upload function logs in with GEONODE_USER. Confirm the two user names are
# meant to be the same account.
auth = (SETTINGS.GEOSERVER_USER, SETTINGS.GEONODE_PASSWORD)

wait = 10  # seconds between status polls
max_wait = 3600  # seconds to wait for a single job before giving up

search_path = os.path.join(cog_fold, "*.tif")
flist = glob(search_path)
for fpath in tqdm(flist):
    fname = os.path.basename(fpath)
    layer_name = os.path.splitext(fname)[0]

    # Queue an 'updatelayers' job restricted to this layer/store
    command = "updatelayers"
    kwargs = {"filter": layer_name, "store": layer_name, "workspace": "geonode"}
    response = requests.post(
        cmd_url,
        headers=headers,
        auth=auth,
        data=json.dumps({"command": command, "kwargs": kwargs}),
    )
    response.raise_for_status()

    # Optional: Wait for completion. Comment out this block to queue all
    # update operations. You can then just wait for the batch to finish
    # (e.g. check via the admin. panel)
    job_id = response.json()["data"]["id"]
    job_url = status_url + f"{job_id}/status/"
    deadline = time.monotonic() + max_wait
    while True:
        status_response = requests.get(job_url, auth=auth)
        # Surface HTTP errors instead of failing later on a missing "status" key
        status_response.raise_for_status()
        job_status = status_response.json()["status"]
        if job_status == "FINISHED":
            break
        # Guard against polling forever if the job never reports FINISHED
        if time.monotonic() > deadline:
            raise TimeoutError(
                f"Job {job_id} for layer '{layer_name}' did not finish within "
                f"{max_wait} s (last status: {job_status})."
            )
        time.sleep(wait)

NameError: name 'cog_fold' is not defined

## 5. Update metadata

In [None]:
base_url = "https://geonode.seabee.sigma2.no/api/v2/"
# The resource search below uses the bearer token; the PATCH uses basic auth
headers = {"Authorization": f"Bearer {SETTINGS.GEONODE_TOKEN}"}
# NOTE(review): this pairs GEOSERVER_USER with GEONODE_PASSWORD; confirm the
# GeoServer and GeoNode user names are meant to be the same account.
auth = (SETTINGS.GEOSERVER_USER, SETTINGS.GEONODE_PASSWORD)

# Map output (COG) file names back to the original MinIO file names
inv_fnames_dict = {y: x for x, y in fnames_dict.items()}
search_path = os.path.join(cog_fold, "*.tif")
flist = glob(search_path)
for fpath in tqdm(flist):
    fname = os.path.basename(fpath)
    layer_name = os.path.splitext(fname)[0]

    # Find resource ID. Passing the query via 'params' lets requests handle
    # the URL encoding of the layer name.
    response = requests.get(
        base_url + "resources",
        params={"search": layer_name, "search_fields": "title"},
        headers=headers,
    )
    response.raise_for_status()
    search_result = response.json()
    n_found = search_result["total"]
    # Fails both when nothing matches and when the title is ambiguous
    assert (
        n_found == 1
    ), f"Expected exactly one dataset with title '{layer_name}', found {n_found}."
    dataset_id = search_result["resources"][0]["pk"]

    # Extract metadata from names with format:
    #    project_region_area_org_spec_date-time.tif
    project, area, ns, org, bands, date = layer_name.split("_")
    date = dt.datetime.strptime(date, "%Y%m%d-%H%M")
    # Raises KeyError for any .tif in 'cog_fold' that is not listed in 'fnames_dict'
    orig_fname = inv_fnames_dict[fname]
    abstract = (
        f"{bands} mosaic collected by {org} for the {project} project at {area} {ns} on {date}."
        f"<br><br><b>MinIO file name:</b> {orig_fname}."
    )

    # Update metadata
    payload = {
        "abstract": abstract,
        "date": date.isoformat(),
        "date_type": "creation",
        "attribution": "SeaBee",
    }
    update_url = base_url + f"datasets/{dataset_id}"
    response = requests.patch(update_url, auth=auth, json=payload)
    response.raise_for_status()

## Update keywords and ISO fields

GeoNode has a REST API for datasets at `/api/v2/datasets`, but updating `keywords` and `tkeywords` through it does not seem to work. One (temporary) workaround is to:

1. Get the dataset using the geonode api
2. Get the full iso `MD_Metadata` using the csw endpoint `/catalogue/csw`
    - note pycsw does not support transactions for geonode
3. Modify the ISO record with seabeepy's `gmd` package (lxml would also work)
4. Login with a csrf token and upload the iso file on `/datasets/upload`


In [2]:

from seabeepy.metadata import utils, templates, gmd
from typing import List, Dict, Tuple
# used to marshall the metadata xml
from xsdata.formats.dataclass import serializers
# ignore owslib future warning
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
def upload_iso_metadata(ds_metadata: List[Tuple[str,gmd.MdMetadata]], geonode_url) -> List[requests.Response]:
    """Upload ISO metadata records for a list of datasets.

    Logs in to GeoNode through the login form to obtain a session and a CSRF
    token, then posts each record as an XML file to `datasets/upload`.

    Args:
        ds_metadata: (dataset title, metadata object) pairs. Each metadata
            object is rendered to XML with an xsdata XmlSerializer.
        geonode_url: Base URL of the GeoNode site, without a trailing slash.

    Returns:
        One `requests.Response` per entry in 'ds_metadata', in the same
        order. The upload responses themselves are not status-checked here;
        callers inspect them (e.g. with `.json()`).

    Raises:
        requests.HTTPError: If fetching the login page or posting the login
            form returns an HTTP error status.
        KeyError: If the server does not set a `csrftoken` cookie.
    """
    serializer = serializers.XmlSerializer()
    login_url = f"{geonode_url}/account/login/"

    def post_metadata(client, title: str, ds_meta: gmd.MdMetadata, csrftoken: str):
        # Post a single record as a metadata-only upload for the dataset 'title'
        return client.post(
            f"{geonode_url}/datasets/upload",
            files={
                "base_file": (
                    "sample.xml",
                    serializer.render(ds_meta),
                    "text/xml",
                ),
            },
            data={
                "permissions": "{}",
                "charset": "undefined",
                "metadata_upload_form": "true",
                "dataset_title": title,
            },
            headers={
                "X-CSRFToken": csrftoken,
                "X-Requested-With": "XMLHttpRequest",
            },
            cookies={"csrftoken": csrftoken},
        )

    # The context manager closes the session's connections when we are done.
    # The responses are fully read by then, so they remain usable afterwards.
    with requests.Session() as client:
        # Fetching the login page sets the initial CSRF cookie
        client.get(login_url).raise_for_status()
        # Django would like the csrf token passed with the data, so we do need to save it off separately.
        csrftoken = client.cookies["csrftoken"]
        login_response = client.post(
            login_url,
            data={
                "login": SETTINGS.GEONODE_USER,
                "password": SETTINGS.GEONODE_PASSWORD,
                "csrfmiddlewaretoken": csrftoken,
            },
        )
        # Fail early on HTTP-level login errors (e.g. a rejected CSRF token)
        login_response.raise_for_status()
        # For some reason, we are issued a new csrf token after logging in, so update the local copy.
        csrftoken = client.cookies["csrftoken"]

        return [
            post_metadata(client, title, ds_meta, csrftoken)
            for title, ds_meta in ds_metadata
        ]

In [4]:
# The metadata upload uses some older, non-standard routes that are not part
# of the v2 API, so the bare site URL (without "/api/v2/") is used here.
# NOTE(review): this points at a local GeoNode instance, whereas the earlier
# sections use https://geonode.seabee.sigma2.no - confirm the intended target.
geonode_url = "http://localhost"
# Serializer used to render metadata objects as pretty-printed XML
serializer = serializers.XmlSerializer(config=serializers.config.SerializerConfig(pretty_print=True))

# List the datasets and keep only the rasters
response = requests.get(f"{geonode_url}/api/v2/datasets")
# Fail on HTTP errors rather than on a confusing JSON/KeyError further down
response.raise_for_status()
resp = response.json()
# NOTE(review): only the datasets in this single response are considered; if
# the endpoint paginates, later pages are not fetched - confirm.
datasets = [ds for ds in resp["datasets"] if ds["subtype"] == "raster"]



In [5]:
# Keep the list of metadata instances.
# This also allows us to marshal all of them to disk.
# We add the title (`alternate`) from the dataset along with the ISO record.
ds_metadata_list = []

for ds in datasets:
    # Fetch the ISO record of *this* dataset. Indexing datasets[0] here would
    # pair every title with the first dataset's record.
    ds_meta = utils.fetch_dataset_iso(ds["uuid"], geonode_url)
    print(ds_meta.identification_info[0].md_data_identification.citation.ci_citation.title.character_string)
    print(ds_meta.identification_info[0].md_data_identification.abstract.character_string)
    ds_meta = utils.remove_all_keywords(ds_meta)
    # Could also just remove norwegian keywords if keeping custom keywords
    # ds_meta = utils.remove_norwegian_thesarus(ds_meta)
    # Add norwegian keywords
    # See https://register.geonorge.no/metadata-kodelister/inspiretema
    ds_meta = utils.add_norwegian_thesarus_keywords(ds_meta, ["Ortofoto", "Habitater og biotoper"])
    # Add custom seabee keywords
    ds_meta = utils.add_seabee_keywords(ds_meta, ["SeaBee", "NIVA"])
    ds_metadata_list.append((ds["alternate"], ds_meta))



KIMS Title
NEW NEW ABSTRACT


### We can marshall the python object to get the xml

In [6]:
# Write one record to disk as an example. 'ds_meta' is whatever record the
# preceding loop processed last. The encoding is set explicitly so the bytes
# on disk do not depend on the platform default (xsdata declares UTF-8 in the
# XML header by default).
with open("../data/sample.xml", "w", encoding="utf-8") as f:
    f.write(serializer.render(ds_meta))

In [28]:
# Upload every (title, ISO record) pair collected in 'ds_metadata_list';
# 'resp_list' holds one response per uploaded record.
resp_list = upload_iso_metadata(ds_metadata_list, geonode_url)

In [29]:
# Show the reported status and the catalogue link for each uploaded record
for upload_response in resp_list:
    result = upload_response.json()
    print(result["status"], f"{geonode_url}{result['url']}", sep="\n")

['finished']
http://localhost/catalogue/#/dataset/4
