This demo uses the following user stories:
- RSPY-25
- RSPY-85
- RSPY-100
- RSPY-115
- RSPY-139

In short, this demo ingests all the files from the CADIP and ADGS stations into the catalog database. The collections in the catalog will be named:
- for ADGS -> DemoUser_s1_aux
- for CADIP -> DemoUser_s1_chunk

NOTE: The demo removes these two collections from the catalog database before starting the ingestion, so be careful when launching it inside the cluster.
NOTE: To run it inside the cluster, this demo and the rs-client-libraries wheel have to be uploaded to the Jupyter notebook that runs as a pod in the cluster. After the upload, rs-client-libraries has to be installed and the kernel restarted before running this demo. To install rs-client-libraries, go to the directory where the wheel has been uploaded (the wheel can be built on a local PC with 'poetry build --format wheel') and run the following two commands, chained here on a single line:
#: pip uninstall -y rs_client-libraries && pip install rs_client_libraries-x.y.z-py3-none-any.whl 

## Configuration

In [None]:
import getpass
import os
 
# S3 access
# Each S3 credential that is missing (or empty) in the environment is asked for
# interactively. The prompt is formatted only when it is needed, so S3_ENDPOINT
# is read only if a credential actually has to be entered.
_s3_prompts = {
    "S3_ACCESSKEY": "Enter S3 access key for {!r}:",
    "S3_SECRETKEY": "Enter S3 secret key for {!r}:",
}
for _env_name, _prompt in _s3_prompts.items():
    if os.getenv(_env_name):
        continue
    os.environ[_env_name] = getpass.getpass(_prompt.format(os.environ["S3_ENDPOINT"]))

# API key authentication (not on local mode)
_is_local = os.getenv("RSPY_LOCAL_MODE") == "1"
if not _is_local and not os.getenv("RSPY_APIKEY"):
    os.environ["RSPY_APIKEY"] = getpass.getpass(f"Enter your API key from {os.environ['RSPY_WEBSITE']!r}:")

In [None]:
# Set local or cluster configuration
import os

# RSPY_LOCAL_MODE == "1" selects the local configuration; anything else is cluster mode.
local_mode = os.getenv("RSPY_LOCAL_MODE") == "1"
if local_mode:
    # Local mode: fixed service URLs and no extra request headers.
    # The "{}" placeholder in `url` is filled in later with the lower-cased station name.
    url_catalog = "http://rs-server-catalog:8000"
    url = "http://rs-server-{}:8000"
    HEADERS = {}
else:
    # Cluster mode: catalog and rs-server share the website URL,
    # and every request carries the API key in the x-api-key header.
    url_catalog = url = os.environ["RSPY_WEBSITE"]
    HEADERS = {"headers": {"x-api-key": os.environ["RSPY_APIKEY"]}}

print(f"Using url for catalog: {url_catalog}")
print(f"Using url for rs-server: {url}")

import requests
import json

Install the needed libraries

In [None]:
# Lines starting with "!" are shell commands executed by the notebook.
# boto3 is installed unconditionally; it is used below to access the S3 buckets.
!pip install boto3
# In local mode only: install the rs-client wheel when rs_workflows cannot be imported.
if local_mode:
    try:
        import rs_workflows
    except ModuleNotFoundError:
        # The wheel is looked up in the directory given by the RSPY_WHL_DIR environment variable.
        !(cd $RSPY_WHL_DIR && pip install rs_client_libraries-*.whl) # install rs-client if missing
        !opentelemetry-bootstrap -a install # install opentelemetry instrumentation for deps

In [None]:
# Instrument these notebook cells with OpenTelemetry (optional)
from opentelemetry import trace
# Tracer shared by the cells below; each cell opens its own "nb.*" span from it.
otel_tracer = trace.get_tracer("my.notebook.trace")

In [None]:
with otel_tracer.start_as_current_span("nb.init-buckets") as span:

    # We'll use boto3 to monitor the s3 bucket.
    # Note: the S3_ACCESSKEY, S3_SECRETKEY and S3_ENDPOINT are given in the docker-compose.yml file.
    import boto3
    import os

    s3_session = boto3.session.Session()
    s3_client = s3_session.client(
        service_name="s3",
        aws_access_key_id=os.environ["S3_ACCESSKEY"],
        aws_secret_access_key=os.environ["S3_SECRETKEY"],
        endpoint_url=os.environ["S3_ENDPOINT"],
        region_name=os.environ["S3_REGION"],
    )

    buckets = ["rs-cluster-temp", "rs-cluster-catalog"] # bucket names under S3_ENDPOINT
    bucket_dir = "stations"
    # Destination prefix inside the first (temporary) bucket
    bucket_url = f"s3://{buckets[0]}/{bucket_dir}"

    # Only in local mode
    if local_mode:

        # Ask for the list of existing buckets once instead of once per bucket.
        existing_buckets = {bucket["Name"] for bucket in s3_client.list_buckets()["Buckets"]}
        # A single list_objects call returns at most one page of keys (1000),
        # so walk every page to really empty the bucket.
        paginator = s3_client.get_paginator("list_objects")

        # If the bucket is already created, clear all files to start fresh for each demo.
        for b in buckets:
            if b in existing_buckets:
                for page in paginator.paginate(Bucket=b):
                    # An empty bucket yields a page without the 'Contents' key
                    for obj in page.get("Contents", []):
                        # clear up the bucket
                        s3_client.delete_object(Bucket=b, Key=obj["Key"])
            else:
                s3_client.create_bucket(Bucket=b)
        for b in buckets:
            print(f"Is {b} empty ?: ", 'Contents' not in s3_client.list_objects(Bucket=b))

        # Truncate the items table from catalog also if this is not the first run !
        #docker exec -it catalog-db psql -U postgres -d catalog -c "TRUNCATE items"

The bucket "rs-cluster-temp" is used for this demo: the CADIP and ADGS prefect flows call the rs-server endpoints to download the files from the CADIP and ADGS stations and to upload them to "s3://rs-cluster-temp/stations/".
After a successful upload to the s3 bucket, the update stac catalog service is called to update the catalog and to move the files from the temp bucket to the "rs-cluster-catalog" bucket.
Two collections will be created in the catalog to publish the files:
- ADGS: DemoUser_s1_aux
- CADIP: DemoUser_s1_chunk

In [None]:
# Clean the previous executions. Be advised about this step when running the demo in the cluster !
with otel_tracer.start_as_current_span("nb.clean-previous") as span:
    # Drop both demo collections (ADGS first, then CADIP). The responses are not
    # inspected, so an error status returned by the catalog does not stop the notebook.
    for stale_collection in ("DemoUser:s1_aux", "DemoUser:s1_chunk"):
        requests.delete(f"{url_catalog}/catalog/collections/{stale_collection}", **HEADERS)

In [None]:
# Create the user's collection first (this has to be done on client side)
with otel_tracer.start_as_current_span("nb.create-collection") as span:
    from dataclasses import dataclass
    import requests
    
    @dataclass
    class Collection:
        """A collection for test purpose.

        Attributes:
            user: owner of the collection.
            name: collection name, used as the "id" of the posted payload.
        """

        user: str
        name: str

        @property
        def id_(self) -> str:
            """Returns the id, built as "<user>_<name>"."""
            return f"{self.user}_{self.name}"

        @property
        def properties(self) -> dict:
            """Returns the collection description as a JSON-serializable dict.

            The "owner" entry is taken from this instance, so the payload does
            not depend on any variable defined outside the class.
            """
            return {
                "id": self.name,
                "type": "Collection",
                "links": [
                    {
                        "rel": "items",
                        "type": "application/geo+json",
                        "href": f"http://localhost:8082/collections/{self.name}/items",
                    },
                    {"rel": "parent", "type": "application/json", "href": "http://localhost:8082/"},
                    {"rel": "root", "type": "application/json", "href": "http://localhost:8082/"},
                    {
                        "rel": "self",
                        "type": "application/json",
                        "href": f"""http://localhost:8082/collections/{self.name}""",
                    },
                    {
                        "rel": "license",
                        "href": "https://creativecommons.org/licenses/publicdomain/",
                        "title": "public domain",
                    },
                ],
                "extent": {
                    "spatial": {"bbox": [[-94.6911621, 37.0332547, -94.402771, 37.1077651]]},
                    "temporal": {"interval": [["2000-02-01T00:00:00Z", "2000-02-12T00:00:00Z"]]},
                },
                "license": "public-domain",
                "description": "Some description",
                "stac_version": "1.0.0",
                # Use the instance's own user rather than a notebook-level global
                "owner": self.user,
            }
        
    user = "DemoUser"
    mission = "s1"

    # Create the collections for DemoUser
    # For ADGS station
    collection_type = Collection(user, f"{mission}_aux")
    response = requests.post(url_catalog + "/catalog/collections", json=collection_type.properties, **HEADERS)
    # Fail right here if the catalog rejected the request, same as for CADIP below
    response.raise_for_status()
    print(json.loads(response.content))
    # For CADIP station
    collection_type = Collection(user, f"{mission}_chunk")
    response = requests.post(url_catalog + "/catalog/collections", json=collection_type.properties, **HEADERS)
    response.raise_for_status()
    print(json.loads(response.content))

In [None]:
with otel_tracer.start_as_current_span("nb.run-flow-ingestion") as span:
    
    from datetime import datetime
    
    from rs_workflows.common import (
        PrefectFlowConfig,
        download_flow,
    )
    
    def run_flow(user, url, url_catalog, station, mission, tmp_local_download, bucket_url, api_key, no_of_tasks, start_date, stop_date, limit):
        """Start the prefect download flow with the given settings.

        start_date and stop_date are strings in the "%Y-%m-%dT%H:%M:%SZ" format;
        they are parsed into datetime objects here. Every other argument is
        passed on unchanged, in the same order, to PrefectFlowConfig.
        """
        date_format = "%Y-%m-%dT%H:%M:%SZ"
        flow_config = PrefectFlowConfig(
            user,
            url,
            url_catalog,
            station,
            mission,
            tmp_local_download,
            bucket_url,
            api_key,
            no_of_tasks,
            datetime.strptime(start_date, date_format),
            datetime.strptime(stop_date, date_format),
            limit,
        )
        # start the prefect flow
        download_flow(flow_config)
    
    stations = ["CADIP", "ADGS"]

    # Template of the local temporary download directory; "{}" receives the station name
    tmp_local_download = "/tmp/{}_tmp"
    # Number of tasks to be run in parallel. The maximum number of tasks in parallel is given by the number of tasks requested in rs-client-libraries prefect flow
    # download_flow from rs_workflows.common package
    no_of_tasks = 15
    # Use the limit parameter to download a maximum number of files. In this demo case, all files are downloaded, thus the limit should be None
    limit = None

    for station in stations:
        run_flow(user,
                 # In local mode the "{}" in url is replaced by the lower-cased station name
                 url.format(station.lower()),
                 url_catalog,
                 station,
                 mission,
                 tmp_local_download.format(station),
                 bucket_url + f"/{station}",
                 # None when RSPY_APIKEY is not set
                 os.environ.get("RSPY_APIKEY"),
                 no_of_tasks,
                 "2014-01-01T12:00:00Z",
                 "2024-02-20T12:00:00Z",
                 # Pass the variable defined above so that editing it takes effect
                 limit
                 )
        

In [None]:
# Links where the files can be downloaded. A maximum of 20 links will be created
with otel_tracer.start_as_current_span("nb.print-results") as span:
    import json

    # Both requests below target the items of the "<user>:<mission>_aux" collection
    items_url = url_catalog.rstrip("/") + f"/catalog/collections/{user}:{mission}_aux/items"

    # Fetch at most 20 items and decode the JSON answer
    listing = requests.get(items_url + "?limit=20", **HEADERS)
    catalog_data = json.loads(listing.content.decode())

    # Print the raw answer of the download endpoint for each listed item
    for feature in catalog_data['features']:
        download = requests.get(items_url + f"/{feature['id']}/download/file", **HEADERS)
        print(download.content)