## PAVICS Web Processing Services using OGC-API integration with Weaver

When the [Weaver component](https://github.com/bird-house/birdhouse-deploy/tree/master/birdhouse/components#weaver)
is enabled, all WPS *birds* registered as process *providers* will be automatically accessible using the
[OGC-API - Processes][ogcapi-proc] interface from the endpoint where [Weaver][weaver] is defined.

[weaver]: https://github.com/crim-ca/weaver
[ogcapi-proc]: https://github.com/opengeospatial/ogcapi-processes/


In [1]:
import json
import os
import time

import requests
import urllib3

# Test configuration. Every value can be overridden through an environment variable of the
# same name, except WEAVER_TEST_WPS_OUTPUTS which is always derived from WEAVER_TEST_FQDN.
WEAVER_TEST_FQDN = os.getenv(
    "WEAVER_TEST_FQDN", os.getenv("PAVICS_HOST", "pavics.ouranos.ca")
)
WEAVER_TEST_URL = os.getenv("WEAVER_TEST_URL", f"https://{WEAVER_TEST_FQDN}/weaver")
# Any value other than the listed truthy strings (case-insensitive) disables SSL verification.
WEAVER_TEST_SSL_VERIFY = str(os.getenv("WEAVER_TEST_SSL_VERIFY", "true")).lower() in [
    "true",
    "1",
    "on",
    "yes",
]
WEAVER_TEST_DEFAULT_BIRDS = "finch, flyingpigeon, hummingbird, raven"
WEAVER_TEST_KNOWN_BIRDS = os.getenv(
    "WEAVER_TEST_KNOWN_BIRDS", WEAVER_TEST_DEFAULT_BIRDS
)
# Comma-separated list of provider names. Empty items (empty variable, trailing or duplicated
# commas) are dropped: 'str.split' always returns at least one element, so without this filter
# the "at least one provider" assertion below could never fail.
WEAVER_TEST_KNOWN_BIRDS = [
    bird.strip() for bird in WEAVER_TEST_KNOWN_BIRDS.split(",") if bird.strip()
]
WEAVER_TEST_DEFAULT_FILE = "/twitcher/ows/proxy/thredds/dodsC/birdhouse/testdata/ta_Amon_MRI-CGCM3_decadal1980_r1i1p1_199101-200012.nc"
WEAVER_TEST_FILE = os.getenv(
    "WEAVER_TEST_FILE",
    f"https://{WEAVER_TEST_FQDN}{WEAVER_TEST_DEFAULT_FILE}",
)
WEAVER_TEST_WPS_OUTPUTS = f"https://{WEAVER_TEST_FQDN}/wpsoutputs"  # for validation

# Common keyword arguments passed to every 'requests' call made against Weaver.
WEAVER_TEST_REQUEST_HEADERS = {
    "Accept": "application/json",
    "Content-Type": "application/json",
}
WEAVER_TEST_REQUEST_XARGS = dict(
    headers=WEAVER_TEST_REQUEST_HEADERS, verify=WEAVER_TEST_SSL_VERIFY, timeout=5
)

# Avoid flooding cell outputs with insecure-request warnings when verification is disabled on purpose.
if not WEAVER_TEST_SSL_VERIFY:
    urllib3.disable_warnings()

# Display the resolved configuration, with values aligned in a column after the longest name.
print("Variables:")
variables = [
    ("WEAVER_TEST_FQDN", WEAVER_TEST_FQDN),
    ("WEAVER_TEST_URL", WEAVER_TEST_URL),
    ("WEAVER_TEST_WPS_OUTPUTS", WEAVER_TEST_WPS_OUTPUTS),
    ("WEAVER_TEST_SSL_VERIFY", WEAVER_TEST_SSL_VERIFY),
    ("WEAVER_TEST_FILE", WEAVER_TEST_FILE),
    ("WEAVER_TEST_KNOWN_BIRDS", WEAVER_TEST_KNOWN_BIRDS),
    ("WEAVER_TEST_REQUEST_XARGS", WEAVER_TEST_REQUEST_XARGS),
]
max_len = max(len(var[0]) for var in variables) + 2
msg = f"  {{:{max_len}}}{{}}"
for var, val in variables:
    print(msg.format(var, val))


assert (
    len(WEAVER_TEST_KNOWN_BIRDS) >= 1
), "No test WPS provider provided in 'WEAVER_TEST_KNOWN_BIRDS'."

Variables:
  WEAVER_TEST_FQDN           pavics.ouranos.ca
  WEAVER_TEST_URL            https://pavics.ouranos.ca/weaver
  WEAVER_TEST_WPS_OUTPUTS    https://pavics.ouranos.ca/wpsoutputs
  WEAVER_TEST_SSL_VERIFY     True
  WEAVER_TEST_FILE           https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/testdata/ta_Amon_MRI-CGCM3_decadal1980_r1i1p1_199101-200012.nc
  WEAVER_TEST_KNOWN_BIRDS    ['finch', 'flyingpigeon', 'hummingbird', 'raven']
  WEAVER_TEST_REQUEST_XARGS  {'headers': {'Accept': 'application/json', 'Content-Type': 'application/json'}, 'verify': True, 'timeout': 5}


### Define some utility functions for displaying test results

In [2]:
def json_dump(_json):
    """
    Render a value as indented JSON text for display.

    A string input is first parsed as JSON so that raw response bodies get pretty-printed.
    This is a best-effort helper: if parsing or serialization fails for any reason,
    the plain ``str()`` of the value at hand is returned instead of raising.
    """
    content = _json
    try:
        if isinstance(content, str):
            content = json.loads(content)
        formatted = json.dumps(content, indent=2, ensure_ascii=False)
    except Exception:
        # not valid JSON (or not serializable): fall back to the plain text form
        formatted = str(content)
    return formatted


def json_print(_json):
    """Print the value formatted by ``json_dump`` to the cell output."""
    formatted = json_dump(_json)
    print(formatted)

### Start with simple listing of registered WPS providers in Weaver


In [3]:
print("Listing WPS providers registered under Weaver...\n")

# Only the provider names are needed here. Details and the pre-fetch check are disabled
# to obtain results quickly (every provider is queried individually in following cells).
path = f"{WEAVER_TEST_URL}/providers"
query = dict(detail=False, check=False)
resp = requests.get(path, params=query, **WEAVER_TEST_REQUEST_XARGS)
assert resp.status_code == 200, (
    f"Error during WPS bird providers listing from [{path}]:\n{json_dump(resp.text)}"
)
body = resp.json()
json_print(body)

assert "providers" in body and len(body["providers"]), "Could not find Weaver WPS providers"
bird_ids = body["providers"]

# Every bird expected by the test configuration must be registered as a provider.
missing = [bird for bird in sorted(WEAVER_TEST_KNOWN_BIRDS) if bird not in bird_ids]
assert not missing, (
    f"Could not find all expected Weaver WPS providers.\nMissing: [{missing}]\nExpected: [{WEAVER_TEST_KNOWN_BIRDS}]"
)

Listing WPS providers registered under Weaver...

{
  "checked": false,
  "providers": [
    "finch",
    "flyingpigeon",
    "hummingbird",
    "raven"
  ]
}


### Obtain OGC-API converted WPS processes by Weaver from original WPS providers endpoints

For each registered provider, Weaver sends a *GetCapabilities* WPS request to the remote endpoint and parses
the XML result in order to form the corresponding OGC-API JSON content.

In [4]:
print("Listing WPS provider processes converted to OGC-API interface by Weaver:\n")

# Collect the description URL of every process, provider by provider, in alphabetical order.
process_locations = []
for bird in sorted(WEAVER_TEST_KNOWN_BIRDS):
    path = f"{WEAVER_TEST_URL}/providers/{bird}/processes"
    resp = requests.get(path, **WEAVER_TEST_REQUEST_XARGS)
    assert resp.status_code == 200, (
        f"Error during WPS bird processes retrieval on: [{path}]\n[{json_dump(resp.text)}]"
    )
    body = resp.json()
    assert len(body["processes"]), f"WPS bird [{bird}] did not list any process!"

    # Build this provider's URLs first, then record and display them.
    bird_process_urls = [
        f"{path}/{process['id']}"
        for process in sorted(body["processes"], key=lambda p: p["id"])
    ]
    process_locations.extend(bird_process_urls)
    for process_desc_url in bird_process_urls:
        print(" -", process_desc_url)

Listing WPS provider processes converted to OGC-API interface by Weaver:

 - https://pavics.ouranos.ca/weaver/providers/catalog/processes/getpoint
 - https://pavics.ouranos.ca/weaver/providers/catalog/processes/ncplotly
 - https://pavics.ouranos.ca/weaver/providers/catalog/processes/pavicrawler
 - https://pavics.ouranos.ca/weaver/providers/catalog/processes/pavicsearch
 - https://pavics.ouranos.ca/weaver/providers/catalog/processes/pavicstestdocs
 - https://pavics.ouranos.ca/weaver/providers/catalog/processes/pavicsupdate
 - https://pavics.ouranos.ca/weaver/providers/catalog/processes/pavicsvalidate
 - https://pavics.ouranos.ca/weaver/providers/catalog/processes/period2indices
 - https://pavics.ouranos.ca/weaver/providers/finch/processes/average_polygon
 - https://pavics.ouranos.ca/weaver/providers/finch/processes/base_flow_index
 - https://pavics.ouranos.ca/weaver/providers/finch/processes/biologically_effective_degree_days
 - https://pavics.ouranos.ca/weaver/providers/finch/processes

### Dispatched execution of Hummingbird WPS process

Here, we attempt running the same process defined in [WPS_example Notebook](../notebooks/WPS_example.ipynb), but
through the OGC-API interface provided by Weaver.

The process execution received by Weaver gets dispatched to the real WPS location. Weaver then
monitors the process until completion and, once completed, returns the location where results can be retrieved.

In [5]:
# The dispatched execution test targets the 'ncdump' process of the Hummingbird provider,
# so that provider must be part of the configured known birds.
assert (
    "hummingbird" in WEAVER_TEST_KNOWN_BIRDS
), "Hummingbird not specified within known WPS provider birds by Weaver. Cannot test dispatched process execution..."

WEAVER_BIRD_URL = f"{WEAVER_TEST_URL}/providers/hummingbird"
WEAVER_BIRD_PROCESS_URL = f"{WEAVER_BIRD_URL}/processes/ncdump"
# 'process_locations' holds the process description URLs collected by the processes listing cell.
assert (
    WEAVER_BIRD_PROCESS_URL in process_locations
), f"Could not find WPS bird process URL to test execution [{WEAVER_BIRD_PROCESS_URL}]."

print(f"Will run process: [{WEAVER_BIRD_PROCESS_URL}]")

Will run process: [https://pavics.ouranos.ca/weaver/providers/hummingbird/processes/ncdump]


#### First let's obtain the specific description of the test WPS process

This request will tell us the explicit details of the process such as its inputs, outputs, and other metadata.
Weaver parses the results retrieved from the original WPS provider using *DescribeProcess* request to
generate the corresponding outputs. Weaver also adds additional metadata when it can infer some missing
details from returned description fields.

In [8]:
# NBVAL_IGNORE_OUTPUT
# ignore detailed description prone to changes, instead run a few basic manual validations

print("Getting WPS process description...\n")

# Request the process description from the process URL selected in the previous cell.
resp = requests.get(WEAVER_BIRD_PROCESS_URL, **WEAVER_TEST_REQUEST_XARGS)
assert (
    resp.status_code == 200
), f"Error getting WPS process description:\n[{json_dump(resp.text)}]"
body = resp.json()
json_print(body)

# Minimal validations of the description: provider keyword, remote WPS keyword and process ID.
assert "hummingbird" in body["keywords"]
assert "wps-remote" in body["keywords"]
assert body["id"] == "ncdump"

Getting WPS process description...

{
  "id": "ncdump",
  "title": "NCDump",
  "version": "4.4.1.1",
  "mutable": true,
  "description": "Run ncdump to retrieve NetCDF header metadata.",
  "keywords": [
    "hummingbird",
    "Hummingbird",
    "wps-remote"
  ],
  "metadata": [
    {
      "title": "Birdhouse",
      "href": "http://bird-house.github.io/",
      "rel": "birdhouse"
    },
    {
      "title": "User Guide",
      "href": "http://birdhouse-hummingbird.readthedocs.io/en/latest/",
      "rel": "user-guide"
    }
  ],
  "inputs": {
    "dataset": {
      "title": "Dataset",
      "description": "Enter a URL pointing to a NetCDF file (optional)",
      "minOccurs": 0,
      "maxOccurs": 100,
      "schema": {
        "type": "array",
        "items": {
          "type": "string"
        },
        "minItems": 0,
        "maxItems": 100
      },
      "formats": [
        {
          "default": true,
          "mediaType": "application/x-netcdf"
        }
      ]
    },
    "d

#### Submit the new process execution

Using the OGC-API interface, a WPS process execution is accomplished using a *Job*. That job will tell us the status
location where we can monitor the process execution.

From the previous response, we can see that the process accepts many inputs and format variations.
In this case, we are interested in the input named `dataset_opendap` to submit the file defined by `WEAVER_TEST_FILE`.

Following execution of the process, we expect to obtain a raw text data dump of the test file content.
The location of the raw text file is expected to be provided by the output named `output` according to the process description.

In [17]:
print(
    "Submitting process job with:",
    f"  File:     [{WEAVER_TEST_FILE}]",
    f"  Process:  [{WEAVER_BIRD_PROCESS_URL}]",
    sep="\n",
)

# Job execution body:
#   mode:      'async' tells Weaver to run the process asynchronously, such that we get
#              a non-blocking status location
#   response:  type of status response (only 'document' supported for the time being)
#   inputs:    'dataset_opendap' is the target input of the process
#              Note: even though this is an URL, the expected type is a 'string' (not a 'File')
#                    therefore, 'data' (or 'value') must be used instead of 'href'
#   outputs:   'output' is the target output we want to retrieve, and 'reference' asks
#              to provide the result as HTTP reference
data = dict(
    mode="async",
    response="document",
    inputs=[dict(id="dataset_opendap", data=WEAVER_TEST_FILE)],
    outputs=[dict(id="output", transmissionMode="reference")],
)


# define a function to allow re-submitting later in case of error
def submit_job() -> str:
    """
    Submit the job execution request for the test process and return its status location.

    Posts the module-level ``data`` payload to the ``jobs`` endpoint of ``WEAVER_BIRD_PROCESS_URL``.

    :returns: URL provided by the ``Location`` response header, where the job status can be monitored.
    :raises AssertionError: if the submission is not answered with 200/201 or provides no location.
    """
    _path = f"{WEAVER_BIRD_PROCESS_URL}/jobs"
    _resp = requests.post(_path, json=data, **WEAVER_TEST_REQUEST_XARGS)
    # report the body of THIS response (not the global 'resp' left over from a previous cell)
    assert _resp.status_code in [
        200,
        201,
    ], f"Error during WPS job submission:\n{json_dump(_resp.text)}"
    loc = _resp.headers.get("Location")
    assert loc, "Could not find status location URL"
    return loc


# Submit the job once; 'status_location' is the URL used afterwards to query this job.
status_location = submit_job()
print(f"Job Status Location: [{status_location}]")

Submitting process job with:
  File:     [https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/testdata/ta_Amon_MRI-CGCM3_decadal1980_r1i1p1_199101-200012.nc]
  Process:  [https://pavics.ouranos.ca/weaver/providers/hummingbird/processes/ncdump]
Job Status Location: [https://pavics.ouranos.ca/weaver/providers/hummingbird/processes/ncdump/jobs/fe9753e3-f0d8-457e-9171-7a0b732fc02c]


#### Monitor execution until completion

Now, we wait until the process completes by periodically verifying the provided status location of the job.
The job will be running asynchronously and will be gradually updated with progression and logging details.

Following job submission request, the `status` can be either `accepted` if it is still in queue pending execution, or
already be `running`. Once the job completes, the `status` should indicate it was either `succeeded` or `failed`.


In [18]:
# NBVAL_IGNORE_OUTPUT
# ignore status updates of job monitoring

print("Waiting for job completion with pooling monitoring of its status...")

# Define a timeout to abandon this monitoring. Process is relatively quick and shouldn't last too long.
# The process will be retried if failed to handle possible sporadic errors from the WPS remote provider.
# Stops on first maximum timeout/retry reached, whichever happens first.
timeout = 60  # remaining monitoring budget (seconds), reduced by 'delta' on every status request
retries = 10  # maximum number of job re-submissions after a 'failed' status
attempt = retries  # remaining re-submissions
delta = 5  # delay (seconds) between status requests of a pending job
body = {}
# Explicit completion flag: the remaining 'timeout' cannot tell a real timeout apart from
# a job that reached its final status on the very last allowed status request.
finished = False
while timeout >= 0:
    resp = requests.get(status_location, **WEAVER_TEST_REQUEST_XARGS)
    assert (
        resp.status_code == 200
    ), f"Failed retrieving job status at location [{status_location}]"
    body = resp.json()
    timeout -= delta
    status = body["status"]
    if status in ["accepted", "running"]:
        print(f"Delay: {delta}s, Duration: {body['duration']}, Status: {status}")
        time.sleep(delta)
        continue
    if status in ["failed", "succeeded"]:
        print(f"Final job status: [{status}]")
        if status == "failed":
            if attempt > 0:
                # re-submit a new job and monitor its new status location instead
                attempt -= 1
                retry_msg = f"{retries - attempt}/{retries}"
                print(f"Retrying execution... ({retry_msg})")
                status_location = submit_job()
                print(f"Job Status Location: [{status_location}] (retry: {retry_msg})")
                continue
            else:
                print(f"Final retry attempt reached ({retries}). Aborting.")
        finished = True
        break
    raise ValueError(f"Unhandled job status during monitoring: [{status}]")
assert finished, "Timeout reached. Process job submission never finished."

# note: don't assert the process success/failure yet, to retrieve more details in case it failed
assert body and "status" in body, f"Could not retrieve job status [{status_location}]"
status = body["status"]

Waiting for job completion with pooling monitoring of its status...
Delay: 5s, Duration: 00:00:00, Status: running
Final job status: [failed]
Retrying execution... (1/10)
Job Status Location: [https://pavics.ouranos.ca/weaver/providers/hummingbird/processes/ncdump/jobs/6e23a817-e202-497e-841b-f1d5b2d6eb42] (retry: 1/10)
Delay: 5s, Duration: 00:00:00, Status: accepted
Final job status: [succeeded]


#### Obtain job execution logs

Retrieve job logs listing execution steps accomplished by Weaver and the underlying process if it provided
status messages. During job execution, Weaver attempts to collect any output the original WPS produces and
integrates it within its own job logs in order to generate a sequential chain of log events for each executed step.

In case the job execution `failed`, this log will help us identify the cause of the problem.
Otherwise, we will have a summary of processing steps.

**NOTE**:

> Job logs is a feature specific to Weaver that is not necessarily implemented by other implementations
  of [OGC-API - Processes](https://github.com/opengeospatial/ogcapi-processes/).


In [19]:
# NBVAL_IGNORE_OUTPUT
# ignore variable logs values that could easily change, only informative

print("Obtaining job logs from execution...")

# Logs are served under the status location of the last monitored job.
path = f"{status_location}/logs"
resp = requests.get(path, **WEAVER_TEST_REQUEST_XARGS)
assert resp.status_code == 200, f"Failed to retrieve job logs [{path}]"
logs = resp.json()

# 'logs' is joined line by line, so it is expected to be a list of text entries.
log_lines = "\n".join(logs)
assert len(logs) > 1
# The job status obtained during monitoring is only validated here, so that the full logs
# can be included in the failure message when the job did not succeed.
assert (
    status == "succeeded"
), f"Job execution was not successful. Status: [{status}]\nFull Logs:\n\n{log_lines}"
# The last log entry must report the completed progress and the successful status.
assert (
    "100%" in logs[-1] and "succeeded" in logs[-1]
), f"Log entry: [{logs[-1]}]\nFull Logs:\n\n{log_lines}"
print(f"Job logs retrieved from [{path}]:\n\n{log_lines}")

Obtaining job logs from execution...
Job logs retrieved from [https://pavics.ouranos.ca/weaver/providers/hummingbird/processes/ncdump/jobs/6e23a817-e202-497e-841b-f1d5b2d6eb42/logs]:

[2022-08-24 15:25:05] INFO     [weaver.datatype.Job] 00:00:00   0% accepted   Job task submitted for execution.
[2022-08-24 15:25:05] INFO     [weaver.datatype.Job] 00:00:00   0% running    Job started.
[2022-08-24 15:25:05] INFO     [weaver.datatype.Job] 00:00:00   0% running    Job task setup initiated.
[2022-08-24 15:25:05] INFO     [weaver.datatype.Job] 00:00:00   1% running    Job task setup completed.
[2022-08-24 15:25:05] DEBUG    [weaver.datatype.Job] 00:00:00   2% running    Employed WPS URL: [https://pavics.ouranos.ca/twitcher/ows/proxy/hummingbird]
[2022-08-24 15:25:05] INFO     [weaver.datatype.Job] 00:00:00   2% running    Execute WPS request for process [ncdump]
[2022-08-24 15:25:05] INFO     [weaver.datatype.Job] 00:00:00   3% running    Fetching job input definitions.
[2022-08-24 15:25:05]

#### Obtain the result location and output the data

When the job has `succeeded`, the results endpoint under the corresponding job will provide the downloadable file references
for each of the available output IDs defined by the WPS process.

Since the sample NetCDF file provided as input is expected to be converted to raw text data, it can be displayed below.

In [22]:
# If execution succeeded, the results endpoint will return 200 with corresponding references.
# Otherwise, 400 occurs because results were not produced due to failing job, and requesting its outputs is an invalid request.
path = f"{status_location}/results"
resp = requests.get(path, **WEAVER_TEST_REQUEST_XARGS)
assert (
    resp.status_code == 200
), f"Failed to retrieve job results location [{path}]. Code: [{resp.status_code}]."
print("\nJob was successful! Retrieving result location...")
body = resp.json()

# Here, our target output ID is named 'output' according to the process description
output = body.get("output")
assert isinstance(
    output, dict
), f"Could not find result matching ID 'output' within:\n{json_dump(body)}"
href = output["href"]
assert isinstance(href, str) and href.startswith(
    WEAVER_TEST_WPS_OUTPUTS
), f"Output result location does not have expected reference format: [{href}]"
print(f"Result is located at: [{href}]\n")
assert href.endswith(".txt")

print("Fetching output contents...")
# Apply the same SSL verification and timeout settings as every other request of this notebook.
# The JSON headers are intentionally not sent since the output is a plain text file.
resp = requests.get(
    href,
    verify=WEAVER_TEST_SSL_VERIFY,
    timeout=WEAVER_TEST_REQUEST_XARGS["timeout"],
)
# Make sure an error page is never displayed as if it was the process result.
assert (
    resp.status_code == 200
), f"Failed to fetch output contents from [{href}]. Code: [{resp.status_code}]."
print(f"\nNCDUMP 'output' result content:\n\n{resp.text}")


Job was successful! Retrieving result location...
Result is located at: [https://pavics.ouranos.ca/wpsoutputs/weaver/public/54ce8f3f-8fe4-4081-8063-24fb8ea52250/nc_dump_6zwZ5z.txt]

Fetching output contents...

NCDUMP 'output' result content:

netcdf ta_Amon_MRI-CGCM3_decadal1980_r1i1p1_199101-200012.nc {
dimensions:
	time = UNLIMITED ; // (120 currently)
	bnds = 2 ;
	lat = 160 ;
	lon = 320 ;
	plev = 23 ;
variables:
	double time(time) ;
		time:bounds = "time_bnds" ;
		time:units = "days since 1981-01-01" ;
		time:calendar = "standard" ;
		time:axis = "T" ;
		time:long_name = "time" ;
		time:standard_name = "time" ;
	double time_bnds(time, bnds) ;
	double plev(plev) ;
		plev:units = "Pa" ;
		plev:axis = "Z" ;
		plev:positive = "down" ;
		plev:long_name = "pressure" ;
		plev:standard_name = "air_pressure" ;
	double lat(lat) ;
		lat:bounds = "lat_bnds" ;
		lat:units = "degrees_north" ;
		lat:axis = "Y" ;
		lat:long_name = "latitude" ;
		lat:standard_name = "latitude" ;
	double lat_bnds(l