Install the needed libraries

In [None]:
# boto3 is used further down in this notebook to inspect the S3 buckets.
!pip install boto3
# Install the rs_client_libraries wheel(s) found in the directory named by $RSPY_WHL_DIR.
!(cd $RSPY_WHL_DIR && pip install rs_client_libraries-*.whl )

In [None]:
# We'll use boto3 to monitor the s3 bucket.
# Note: the S3_ACCESSKEY, S3_SECRETKEY and S3_ENDPOINT are given in the docker-compose.yml file.
# S3_REGION is read from the environment as well.
import boto3
import os

s3_session = boto3.session.Session()
s3_client = s3_session.client(
    service_name="s3",
    aws_access_key_id=os.environ["S3_ACCESSKEY"],
    aws_secret_access_key=os.environ["S3_SECRETKEY"],
    endpoint_url=os.environ["S3_ENDPOINT"],
    region_name=os.environ["S3_REGION"],
)

buckets = ["tmp-download", "catalog-bucket"]
bucket_dir = "stations"
bucket_url = f"s3://{buckets[0]}/{bucket_dir}"

# If a bucket is already created, clear all files in order to start fresh for each demo.
# The existing bucket names are fetched once, and the object listing is paginated because a
# single list_objects call returns at most 1000 keys (larger buckets would stay partly full).
existing_buckets = {bucket["Name"] for bucket in s3_client.list_buckets()["Buckets"]}
for b in buckets:
    if b in existing_buckets:
        for page in s3_client.get_paginator("list_objects").paginate(Bucket=b):
            # A page without 'Contents' means there is nothing (left) to delete.
            for obj in page.get("Contents", []):
                s3_client.delete_object(Bucket=b, Key=obj["Key"])
    else:
        s3_client.create_bucket(Bucket=b)
for b in buckets:
    print(f"Is {b} empty ?: ", 'Contents' not in s3_client.list_objects(Bucket=b))


A bucket "tmp-download" is created for the purpose of this demo. The cadip and adgs prefect flows will ask the rs-server endpoints to download the files from the CADIP and ADGS stations and to upload them to s3://tmp-download/stations/<station_name>
After a successful upload to the s3 bucket, the stac catalog is updated with the info related to that file.

In [None]:
# Create the user's collection first (this has to be done on client side)

from dataclasses import dataclass
import requests

@dataclass
class Collection:
    """Test collection described by a user and a collection name.

    The ``properties`` dict it produces declares ``stac_version`` 1.0.0 and is what
    gets posted to the catalog when the collection is created.
    """

    user: str
    name: str

    @property
    def id_(self) -> str:
        """Identifier of the collection, formatted as ``<user>_<name>``."""
        return f"{self.user}_{self.name}"

    @property
    def properties(self):
        """Build the collection description as a plain, JSON-serialisable dict.

        Only ``name`` is interpolated (into ``id`` and into the ``items``/``self``
        links); every other value is a fixed test fixture.
        """
        # Link section: items / parent / root / self / license, in that order.
        links = [
            {
                "rel": "items",
                "type": "application/geo+json",
                "href": f"http://localhost:8082/collections/{self.name}/items",
            },
            {"rel": "parent", "type": "application/json", "href": "http://localhost:8082/"},
            {"rel": "root", "type": "application/json", "href": "http://localhost:8082/"},
            {
                "rel": "self",
                "type": "application/json",
                "href": f"http://localhost:8082/collections/{self.name}",
            },
            {
                "rel": "license",
                "href": "https://creativecommons.org/licenses/publicdomain/",
                "title": "public domain",
            },
        ]

        # Fixed spatial and temporal extent used by the demo.
        extent = {
            "spatial": {"bbox": [[-94.6911621, 37.0332547, -94.402771, 37.1077651]]},
            "temporal": {"interval": [["2000-02-01T00:00:00Z", "2000-02-12T00:00:00Z"]]},
        }

        return {
            "id": self.name,
            "type": "Collection",
            "links": links,
            "extent": extent,
            "license": "public-domain",
            "description": "Some description",
            "stac_version": "1.0.0",
        }
    
user = "DemoUser"
mission = "s1"
url_catalog = "http://rs-server-catalog:8000"

# Create the collection for DemoUser
collection_type = Collection(user, f"{mission}_aux")
response = requests.post(url_catalog + f"/catalog/{user}/collections", json=collection_type.properties)
# Show the outcome instead of dropping it silently, so a failed creation is visible here
# rather than later in the download flows. The status is only reported, not raised:
# a re-run of the demo may hit an already existing collection (TODO confirm what the
# catalog answers in that case).
print(f"Create collection {collection_type.name}: HTTP {response.status_code}")

In [None]:

import threading
from datetime import datetime
from rs_workflows.common import (
    PrefectFlowConfig,
    download_flow,
)

def run_flow(user, url, url_catalog, station, mission, tmp_local_download, bucket_url, no_of_tasks, start_date, stop_date):
    """Start the prefect download flow for one station.

    ``start_date`` and ``stop_date`` are strings in the ``%Y-%m-%dT%H:%M:%SZ`` format;
    they are parsed into ``datetime`` objects before being passed on. All the other
    arguments are forwarded unchanged, in the same order, to ``PrefectFlowConfig``.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    interval_start = datetime.strptime(start_date, timestamp_format)
    interval_stop = datetime.strptime(stop_date, timestamp_format)

    flow_config = PrefectFlowConfig(
        user,
        url,
        url_catalog,
        station,
        mission,
        tmp_local_download,
        bucket_url,
        no_of_tasks,
        interval_start,
        interval_stop,
    )
    # start the prefect flow
    download_flow(flow_config)

# Demo parameters shared by all the station flows
user = "DemoUser"
mission = "s1"
stations = ["CADIP", "ADGS"]
url = "http://rs-server-{}:8000"
url_catalog = "http://rs-server-catalog:8000"
tmp_local_download = "/tmp/{}_tmp"
no_of_tasks = 1
threads = []

# Run the download flow for each station, one after the other.
for station in stations:
    # Per-station values derived from the templates above
    station_url = url.format(station.lower())
    station_tmp_dir = tmp_local_download.format(station)
    station_bucket_url = f"{bucket_url}/{station}"
    run_flow(
        user,
        station_url,
        url_catalog,
        station,
        mission,
        station_tmp_dir,
        station_bucket_url,
        no_of_tasks,
        "2014-01-01T12:00:00Z",
        "2024-02-20T12:00:00Z",
    )
    

In [None]:
# Define some functions for a later use in generating the yaml file as input for the DPR mockup processor
def _label_inputs(s3_content):
    """Yield ``(input_id, key)`` for every entry of an S3 object listing.

    An entry whose ``"Key"`` contains ``"AUX"`` is labelled ``AUX0``, ``AUX1``, ...;
    any other entry is labelled ``CADU0``, ``CADU1``, ... The two counters are
    independent and follow the order of ``s3_content``.
    """
    counters = {"AUX": 0, "CADU": 0}
    for entry in s3_content:
        key = entry["Key"]
        kind = "AUX" if "AUX" in key else "CADU"
        yield f"{kind}{counters[kind]}", key
        counters[kind] += 1


def gen_payload_inputs(s3_content):
    """Build the ``inputs_products`` entries of the payload.

    Args:
        s3_content: iterable of dicts with a ``"Key"`` entry (e.g. the ``Contents``
            list of an S3 listing).

    Returns:
        One dict per entry with the keys ``id`` (AUXn / CADUn), ``path`` (the object
        key), ``store_type`` ("zarr") and ``store_params`` (a fresh empty dict).
        An empty input gives an empty list.
    """
    return [
        {"id": input_id, "path": key, "store_type": "zarr", "store_params": {}}
        for input_id, key in _label_inputs(s3_content)
    ]


def gen_inputs_list(s3_content):
    """Build the workflow ``inputs`` entries of the payload.

    Args:
        s3_content: iterable of dicts with a ``"Key"`` entry.

    Returns:
        One single-item dict per entry, mapping ``in<position>`` to the same
        AUXn / CADUn identifier that ``gen_payload_inputs`` assigns to that entry.
        An empty input gives an empty list.
    """
    return [
        {f"in{position}": input_id}
        for position, (input_id, _key) in enumerate(_label_inputs(s3_content))
    ]

RSPY DPR Processor mockup demo:


The DPRProcessor is a class that simulates the processing part performed by the eopf-cpm triggering module. The input of the processor is a yaml config file listing all the input files and the expected output locations (local or s3).

The implemented mockup performs the following actions:
1. Checks the validity of the input yaml file (chunks/aux existence / naming convention)
2. Downloads the zarr input from public s3 ovh based on product type required in payload yaml.
3. Updates the .zattrs with our processor name (RSPY_DprMockupProcessor) and timestamp (if product is zipped, our processor updates zattrs inside .zip without extracting files)
4. Computes the CRC of updated .zattrs
5. Updates the product name VVV (as per EOPF-CPM PSD) with the computed CRC, so that the processor can be called multiple times with the same input and generate different outputs.
6. Uploads the products to s3 server (minio for this demo).
7. Removes the local downloaded products (if a flag is set).
8. Retrieves the .zattrs into a serialisable format (dict) in order to upload catalog in the future step of our processing chain.

In [None]:
# YAML template of the payload sent to the DPR mockup processor. The empty `inputs` and
# `inputs_products` entries are placeholders: they are replaced further down in this cell
# with entries generated from the content of the s3 bucket.
yaml_payload = """
general_configuration:
  logging:
    level: DEBUG
  triggering__validate_run: true
  triggering__use_default_filename: true
  triggering__use_basic_logging: true
  triggering__load_default_logging: false
breakpoints:
workflow:
- step: 1
  active: true
  module: rs.dpr.mockup # string corresponding to the python path of the module
  processing_unit: DprMockupProcessor # EOProcessingUnit class name
  name: DprMockupProcessor # identifier for the processing unit
  inputs:
  -
  outputs:
    out: outputs
  parameters:
    product_types: # List of EOPF product types we want to generate. In this example we simulate S1L0 processor that generates 4 products
      - S1SSMOCN
I/O:
  inputs_products:
   -
  output_products:
  - id: outputs
    path: s3://test-processed-data/zarr/dpr_processor_output/ # output folder or S3 bucket
    type: folder
    store_type: zarr
    store_params: {}
dask_context: {}
logging: {}
config: {}
"""

# Update the yaml template with files downloaded from stations
import yaml
# Parse the yaml template into a dict
yaml_payload = yaml.safe_load(yaml_payload)
# List the bucket once and reuse the answer for both the workflow inputs and the I/O section,
# so that the two sections are built from the same snapshot of the bucket.
catalog_listing = s3_client.list_objects(Bucket=buckets[1])
print(catalog_listing)
# 'Contents' is absent from the answer when the bucket is empty.
catalog_objects = catalog_listing.get("Contents", [])
if not catalog_objects:
    print(f"Warning: bucket {buckets[1]} is empty, the payload will have no inputs")
# Update I/O and inputs
yaml_payload["workflow"][0]['inputs'] = gen_inputs_list(catalog_objects)
yaml_payload["I/O"]['inputs_products'] = gen_payload_inputs(catalog_objects)
print(yaml.dump(yaml_payload))

In [None]:
# We'll use boto3 to monitor the s3 bucket.
# Note: the S3_ACCESSKEY, S3_SECRETKEY and S3_ENDPOINT are given in the docker-compose.yml file.
s3_session = boto3.session.Session()
s3_client = s3_session.client(
    service_name="s3",
    aws_access_key_id=os.environ["S3_ACCESSKEY"],
    aws_secret_access_key=os.environ["S3_SECRETKEY"],
    endpoint_url=os.environ["S3_ENDPOINT"],
    region_name=os.environ["S3_REGION"],
)
# NOTE: bucket_dir and bucket_url are rebound here; the download-flow cell reads bucket_url,
# so re-running it after this cell would make it use the values below.
bucket_name = "test-processed-data"
bucket_dir = "zarr/dpr_processor_output"
bucket_url = f"s3://{bucket_name}/{bucket_dir}"

# If bucket is already created, clear all files in order to start fresh for each demo.
# The listing is paginated because a single list_objects call returns at most 1000 keys,
# which would leave a larger bucket partly full.
if bucket_name in [bucket["Name"] for bucket in s3_client.list_buckets()["Buckets"]]:
    for page in s3_client.get_paginator("list_objects").paginate(Bucket=bucket_name):
        # A page without 'Contents' means there is nothing (left) to delete.
        for obj in page.get("Contents", []):
            s3_client.delete_object(Bucket=bucket_name, Key=obj["Key"])
else:
    s3_client.create_bucket(Bucket=bucket_name)

print("Is bucket empty now ?: ", 'Contents' not in s3_client.list_objects(Bucket=bucket_name))

Convert yaml to json in order to post it over HTTP and call the simulator webserver endpoint.
The output of the run() method is a list of all the STAC-compatible .zattrs.

In [None]:
import requests
import json
import pprint

dpr_simulator_endpoint = "http://dpr-simulator:8000/run" # rs-server host = the container name
response = requests.post(dpr_simulator_endpoint, json=yaml_payload)
# Stop here with the HTTP status if the simulator rejected the request; otherwise the loop
# below would iterate over an error body instead of the list of .zattrs.
response.raise_for_status()

# Pretty-print each returned attribute set
pp = pprint.PrettyPrinter(indent=4)
for attr in response.json():
    pp.pprint(attr)

In [None]:
# Objects now present in the output bucket. 'Contents' is absent from the answer when the
# bucket is empty, so fall back to an empty list instead of raising a KeyError.
s3_client.list_objects(Bucket=bucket_name).get('Contents', [])