# CADU endpoints demo

In this demo we will call the rs-server CADU HTTP endpoints:

  * List available CADU products
  * Download some products into local storage and an S3 bucket
  * Monitor the download status from the database.

In [1]:
# Define some variables
# endpoint: base URL of the CADU routes; "/list" and "/status" are appended to it by the cells below.
endpoint="http://rs-server:8000/cadip/CADIP/cadu" # rs-server host = the container name
# start / stop: bounds sent as "start_date" / "stop_date" when listing the products.
start="2014-01-01T12:00:00.000Z"
stop="2023-12-30T12:00:00.000Z"

In [2]:
# From a terminal, to list the available CADU products, we would use the curl command.
# "set -x" makes the shell echo the expanded command; {endpoint}, {start} and {stop}
# are replaced by IPython with the values of the Python variables of the same name.
!set -x && curl -X GET "{endpoint}/list?start_date={start}&stop_date={stop}" -H "accept: application/json"

+ curl -X GET 'http://rs-server:8000/cadip/CADIP/cadu/list?start_date=2014-01-01T12:00:00.000Z&stop_date=2023-12-30T12:00:00.000Z' -H 'accept: application/json'
{"CADIP":[["2b17b57d-fff4-4645-b539-91f305c27c69","DCS_04_S1A_20231121072204051312_ch1_DSDB_00001.raw"],["2b17b57d-fff4-4645-b539-91f305c27c60","DCS_04_S1A_20231121072204051312_ch1_DSDB_00002.raw"],["2b17b57d-fff4-4645-b539-91f305c27c61","DCS_04_S1A_20231121072204051312_ch1_DSDB_00003.raw"],["2b17b57d-fff4-4645-b539-91f305c27c62","DCS_04_S1A_20231121072204051312_ch1_DSDB_00004.raw"],["2b17b57d-fff4-4645-b539-91f305c27c63","DCS_04_S1A_20231121072204051312_ch1_DSDB_00005.raw"],["2b17b57d-fff4-4645-b539-91f305c27c64","DCS_04_S1A_20231121072204051312_ch1_DSDB_00006.raw"],["2b17b57d-fff4-4645-b539-91f305c27c65","DCS_04_S1A_20231121072204051312_ch1_DSDB_00007.raw"],["some_id_2","DCS_04_S1A_20231121072204051312_ch1_DSDB_00060.raw"],["some_id_3","DCS_04_S1A_20231121072204051312_ch2_DSDB_00046.raw"],["some_id_4","DCS_04_S1A_202311210722

In [3]:
# But let's do it in python so it's easier to parse results
import requests
import pprint 

# Query the "list" endpoint for the start/stop date range
data = requests.get(f"{endpoint}/list", params={"start_date": start, "stop_date": stop})
assert data.status_code == 200

# The "CADIP" entry of the JSON answer holds the products as (id, name) pairs
products = data.json()["CADIP"]
assert len(products) == 10

# Show only the first few products to keep the output short
pprint.pprint(products[:3], indent=4)
print("...")

# The following cells only need the product names
product_names = [product_name for _product_id, product_name in products]

[   [   '2b17b57d-fff4-4645-b539-91f305c27c69',
        'DCS_04_S1A_20231121072204051312_ch1_DSDB_00001.raw'],
    [   '2b17b57d-fff4-4645-b539-91f305c27c60',
        'DCS_04_S1A_20231121072204051312_ch1_DSDB_00002.raw'],
    [   '2b17b57d-fff4-4645-b539-91f305c27c61',
        'DCS_04_S1A_20231121072204051312_ch1_DSDB_00003.raw']]
...


In [4]:
# The "list" endpoint has initialised the database with the products info.
# Ask the "status" endpoint for the record of each product, looked up by name.
all_status = []
for name in product_names:
    data = requests.get(f"{endpoint}/status", params={"name": name})
    assert data.status_code == 200
    all_status += [data.json()]

# Show only the first records to keep the output short
pprint.pprint(all_status[:2], indent=4)
print("...")

[   {   'available_at_station': '2023-11-26T17:01:39.528000',
        'cadu_id': '2b17b57d-fff4-4645-b539-91f305c27c69',
        'db_id': 1,
        'download_start': None,
        'download_stop': None,
        'name': 'DCS_04_S1A_20231121072204051312_ch1_DSDB_00001.raw',
        'status': 'NOT_STARTED',
        'status_fail_message': None},
    {   'available_at_station': '2023-11-26T17:01:39.528000',
        'cadu_id': '2b17b57d-fff4-4645-b539-91f305c27c60',
        'db_id': 2,
        'download_start': None,
        'download_stop': None,
        'name': 'DCS_04_S1A_20231121072204051312_ch1_DSDB_00002.raw',
        'status': 'NOT_STARTED',
        'status_fail_message': None}]
...


---
**NOTE**

You can also monitor the database using pgAdmin.

---

In [1]:
# We'll use boto3 to monitor the s3 bucket.
# Note: the S3_ACCESSKEY, S3_SECRETKEY and S3_ENDPOINT are given in the docker-compose.yml file.
!pip install boto3
import boto3
import os

# Create the S3 client from a dedicated boto3 session.
# Credentials and endpoint URL are read from the environment, never hard-coded;
# a missing variable raises KeyError here rather than failing later on first use.
s3_session = boto3.session.Session()
s3_client = s3_session.client(
    "s3",
    aws_access_key_id=os.environ["S3_ACCESSKEY"],
    aws_secret_access_key=os.environ["S3_SECRETKEY"],
    endpoint_url=os.environ["S3_ENDPOINT"],
)



In [7]:
# S3 bucket name
bucket_name = "test-data"

# Create the bucket when it is missing, otherwise remove the products
# of this demo from it so that every run starts from the same state.
existing_buckets = [bucket["Name"] for bucket in s3_client.list_buckets()["Buckets"]]
if bucket_name not in existing_buckets:
    s3_client.create_bucket(Bucket=bucket_name)
else:
    for name in product_names:
        s3_client.delete_object(Bucket=bucket_name, Key=name)

# The local download directory is passed as an environment variable
local_download_dir = os.environ["RSPY_LOCAL_DOWNLOAD"]

# Remove the local copy of each product, if there is one
from pathlib import Path
for name in product_names:
    file = Path(local_download_dir).joinpath(name)
    if file.is_file():
        file.unlink()

In [8]:
import asyncio

# Call the CADIP endpoint to download one product in background 
# and upload it (optional) to the S3 bucket.
async def download_one(name: str, save_to_s3: bool):
    """Ask the CADU endpoint to download one product.

    Args:
        name: name of the product to download.
        save_to_s3: when True, the bucket URL is also sent (as "obs") so that
            the product is uploaded to the S3 bucket.

    Raises:
        AssertionError: if the endpoint does not answer with HTTP 200.
    """
    params = {"name": name, "local": local_download_dir}
    # obs = the bucket URL, if requested
    if save_to_s3:
        params["obs"] = f"s3://{bucket_name}"

    # requests.get is a blocking call: run it in a worker thread so that the
    # event loop stays free and the requests of all products really overlap.
    data = await asyncio.to_thread(requests.get, endpoint, params)
    assert data.status_code == 200

# In parallel, call the "status" endpoint to get and print the download status.
async def print_status():
    """Poll the "status" endpoint once per second and print the status counts.

    Returns once every product of ``product_names`` is reported as DONE.

    Raises:
        AssertionError: if the "status" endpoint does not answer with HTTP 200.
        RuntimeError: if no product is pending or in progress any more but not
            all of them are DONE (some failed), instead of polling forever.
    """

    # Wait a second in case the status needs to be passed
    # from DONE to NOT_STARTED when we download several times.
    await asyncio.sleep(1)

    while True:

        # Count the number of products not started, in progress etc ...
        all_status = {"NOT_STARTED": 0, "IN_PROGRESS": 0, "FAILED": 0, "DONE": 0}
        for name in product_names:

            # Call the "status" endpoint
            data = requests.get(f"{endpoint}/status", {"name": name})
            assert data.status_code == 200
            all_status[(data.json())["status"]] += 1

        # Print result
        print (" / ".join ([f"{status}:{count}" for status, count in all_status.items()]))

        if all_status["DONE"] == len(product_names):
            return

        # Nothing left to wait for, yet not everything is DONE: the remaining
        # products are FAILED and waiting longer would never end.
        # NOTE(review): assumes FAILED is a final status - confirm against the server.
        if all_status["NOT_STARTED"] == 0 and all_status["IN_PROGRESS"] == 0:
            raise RuntimeError(f"{all_status['FAILED']} product download(s) failed")

        await asyncio.sleep(1)

# Call everything in parallel
async def download_all(save_to_s3: bool):
    """Run the status monitor and one download request per product in a task group.

    Args:
        save_to_s3: forwarded to download_one for every product.
    """
    async with asyncio.TaskGroup() as task_group:
        # The status monitor is scheduled first, then one request per product
        task_group.create_task(print_status())
        for product_name in product_names:
            task_group.create_task(download_one(product_name, save_to_s3))

print ("Download everything to the local directory, not s3:")
await (download_all(save_to_s3=False))

# Check that the local files exist. 
# Wait 1 second before that or sometimes it bugs.
await asyncio.sleep(1)
for name in product_names:
    file = Path (local_download_dir) / name    
    if not file.is_file():
        raise RuntimeException (f"{file} is missing locally")
    print (f"{file} exists")

print ("\nDownload everything again, but this time upload to S3:")
await (download_all(save_to_s3=True))

# This time the local files are not kept locally, 
# but they should be uploaded into the S3 bucket.
await asyncio.sleep(1)
all_s3_filenames = [key["Key"] for key in s3_client.list_objects(Bucket=bucket_name)['Contents']]
for name in product_names:    
    if not name in all_s3_filenames:
        raise RuntimeException (f"{file} is missing from the S3 bucket")
    print (f"s3://{bucket_name}/{name} exists")

Download everything to the local directory, not s3:
NOT_STARTED:0 / IN_PROGRESS:1 / FAILED:0 / DONE:9
NOT_STARTED:0 / IN_PROGRESS:0 / FAILED:0 / DONE:10
/local/download/DCS_04_S1A_20231121072204051312_ch1_DSDB_00001.raw exists
/local/download/DCS_04_S1A_20231121072204051312_ch1_DSDB_00002.raw exists
/local/download/DCS_04_S1A_20231121072204051312_ch1_DSDB_00003.raw exists
/local/download/DCS_04_S1A_20231121072204051312_ch1_DSDB_00004.raw exists
/local/download/DCS_04_S1A_20231121072204051312_ch1_DSDB_00005.raw exists
/local/download/DCS_04_S1A_20231121072204051312_ch1_DSDB_00006.raw exists
/local/download/DCS_04_S1A_20231121072204051312_ch1_DSDB_00007.raw exists
/local/download/DCS_04_S1A_20231121072204051312_ch1_DSDB_00060.raw exists
/local/download/DCS_04_S1A_20231121072204051312_ch2_DSDB_00046.raw exists
/local/download/DCS_04_S1A_20231121072204051312_ch2_DSDB_00060.raw exists

Download everything again, but this time upload to S3:
NOT_STARTED:0 / IN_PROGRESS:1 / FAILED:0 / DONE:9
N

---
**NOTE**

You can also monitor the s3 bucket using the minio console: http://127.0.0.1:9001/browser with:

  * Username: _minio_
  * Password: _Strong#Pass#1234_

---

In [9]:
from datetime import datetime

dt_format = "%Y-%m-%dT%H:%M:%S.%f" # %z

# Check timeliness by subtracting the publishing date from the download stop date.
# Call the "status" endpoint.
print ("Timeliness for:")
for name in product_names:
    data = requests.get(f"{endpoint}/status", {"name": name})
    assert data.status_code == 200
    values = data.json()

    # "download_stop" is None until a download has finished: nothing to compute yet
    if values["download_stop"] is None:
        print (f"  - {name}: no download stop date (status: {values['status']})")
        continue

    publication = datetime.strptime (values["available_at_station"], dt_format)
    # Use a dedicated name: "stop" is the query stop date used by the "list" call
    # and must not be replaced by a datetime object.
    download_stop = datetime.strptime (values["download_stop"], dt_format)
    timeliness = download_stop - publication
    print (f"  - {name}: {timeliness}")

Timeliness for:
  - DCS_04_S1A_20231121072204051312_ch1_DSDB_00001.raw: 58 days, 20:14:59.914655
  - DCS_04_S1A_20231121072204051312_ch1_DSDB_00002.raw: 58 days, 20:14:59.912772
  - DCS_04_S1A_20231121072204051312_ch1_DSDB_00003.raw: 58 days, 20:14:59.912088
  - DCS_04_S1A_20231121072204051312_ch1_DSDB_00004.raw: 58 days, 20:15:00.031535
  - DCS_04_S1A_20231121072204051312_ch1_DSDB_00005.raw: 58 days, 20:15:00.304013
  - DCS_04_S1A_20231121072204051312_ch1_DSDB_00006.raw: 58 days, 20:15:00.375393
  - DCS_04_S1A_20231121072204051312_ch1_DSDB_00007.raw: 58 days, 20:14:59.915409
  - DCS_04_S1A_20231121072204051312_ch1_DSDB_00060.raw: 58 days, 20:15:00.157170
  - DCS_04_S1A_20231121072204051312_ch2_DSDB_00046.raw: 58 days, 21:56:18.531328
  - DCS_04_S1A_20231121072204051312_ch2_DSDB_00060.raw: 58 days, 21:51:20.854928


In [1]:
!pip install prefect

Collecting prefect
  Downloading prefect-2.14.16-py3-none-any.whl.metadata (10 kB)
Collecting aiosqlite>=0.17.0 (from prefect)
  Downloading aiosqlite-0.19.0-py3-none-any.whl (15 kB)
Collecting apprise<2.0.0,>=1.1.0 (from prefect)
  Downloading apprise-1.7.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m878.5 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting asyncpg>=0.23 (from prefect)
  Downloading asyncpg-0.29.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting click<8.2,>=8.0 (from prefect)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting dateparser<2.0.0,>=1.1.1 (from prefect)
  Downloading dateparser-1.2.0-py2.py3-none-any.whl.metadata (28 kB)
Collecting docker<7.0,>=4.0 (from prefect)
  Downloading docker-6.1.3-py3-none-any.whl.metadata (3.5 kB)
Collecting graphviz>=0.20.1 (from prefect)
  Downloading graphviz-0.20.1-py3-none-any.whl

In [19]:
import httpx
from prefect import flow


@flow(retries=3, retry_delay_seconds=5, log_prints=True)
def get_repo_info(repo_name: str = "PrefectHQ/prefect"):
    """Print the star and fork counts of a GitHub repository.

    Args:
        repo_name: repository in "owner/name" form, inserted in the GitHub API URL.

    An HTTP error status raises through ``raise_for_status``.
    """
    reply = httpx.get(f"https://api.github.com/repos/{repo_name}")
    reply.raise_for_status()
    stats = reply.json()
    stars, forks = stats['stargazers_count'], stats['forks_count']
    print(f"{repo_name} repository statistics 🤓:")
    print(f"Stars 🌠 : {stars}")
    print(f"Forks 🍴 : {forks}")

# Run the flow with its default repository when this code is executed as the main module.
if __name__ == "__main__":
    get_repo_info()



 `@flow(name='my_unique_name', ...)`


In [40]:
from prefect import flow, task
import requests
import asyncio
from pathlib import Path

# Define some variables
# NOTE(review): "os" is not imported by this cell; it relies on the "import os"
# of the earlier boto3 cell having been executed.
endpoint="http://rs-server:8000/cadip/CADIP/cadu" # rs-server host = the container name
# Date range passed to the flow at the end of this cell.
start="2014-01-01T12:00:00.000Z"
stop="2023-12-30T12:00:00.000Z"
# Bucket receiving the products when the upload to S3 is requested.
bucket_name = "test-data"
# The local download directory is passed as an environment variable.
local_download_dir = os.environ["RSPY_LOCAL_DOWNLOAD"]

@task(name='search_cadu_task')
def search_cadu(date_start: str, date_end: str):
    """List the CADU products between two dates and return their names.

    Args:
        date_start: sent to the "list" endpoint as "start_date".
        date_end: sent to the "list" endpoint as "stop_date".

    Returns:
        The list of product names found.

    Raises:
        AssertionError: if the endpoint does not answer with HTTP 200.
    """
    print(f"Searching products between {date_start} and {date_end}")
    # Query with the task arguments, not with the notebook-level start/stop variables
    data = requests.get(f"{endpoint}/list", {"start_date": date_start, "stop_date": date_end})
    assert data.status_code == 200
    products = data.json()["CADIP"]
    print(f"Here is the list of products found: {products}")
    # Each product is an (id, name) pair: keep only the names
    return [name for _cadu_id, name in products]

@task(name='download_one_task')
async def download_one(name: str, save_to_s3: bool):
    """Ask the CADU endpoint to download one product.

    Args:
        name: name of the product to download.
        save_to_s3: when True, the bucket URL (with the "Cadip_products" prefix)
            is also sent as "obs" so that the product is uploaded to the bucket.

    Raises:
        AssertionError: if the endpoint does not answer with HTTP 200.
    """
    params = {"name": name, "local": local_download_dir}
    # obs = the bucket URL, if requested
    if save_to_s3:
        print(f"pushing {name} to the bucket {bucket_name} ...")
        params["obs"] = f"s3://{bucket_name}/Cadip_products"
    # requests.get is a blocking call: run it in a worker thread so that the
    # event loop stays free and the requests of all products really overlap.
    data = await asyncio.to_thread(requests.get, endpoint, params)
    assert data.status_code == 200

@task(name='print_status_task')
async def print_status(product_names: list):
    """Poll the "status" endpoint once per second and print the status counts.

    Args:
        product_names: names of the products to monitor.

    Returns once every product is reported as DONE.

    Raises:
        AssertionError: if the "status" endpoint does not answer with HTTP 200.
        RuntimeError: if no product is pending or in progress any more but not
            all of them are DONE (some failed), instead of polling forever.
    """
    # Wait a second in case the status needs to be passed
    # from DONE to NOT_STARTED when we download several times.
    await asyncio.sleep(1)
    while True:
        # Count the number of products not started, in progress etc ...
        all_status = {"NOT_STARTED": 0, "IN_PROGRESS": 0, "FAILED": 0, "DONE": 0}
        for name in product_names:
            # Call the "status" endpoint
            data = requests.get(f"{endpoint}/status", {"name": name})
            assert data.status_code == 200
            all_status[(data.json())["status"]] += 1
        # Print result
        print (" / ".join ([f"{status}:{count}" for status, count in all_status.items()]))
        if all_status["DONE"] == len(product_names):
            return
        # Nothing left to wait for, yet not everything is DONE: the remaining
        # products are FAILED and waiting longer would never end.
        # NOTE(review): assumes FAILED is a final status - confirm against the server.
        if all_status["NOT_STARTED"] == 0 and all_status["IN_PROGRESS"] == 0:
            raise RuntimeError(f"{all_status['FAILED']} product download(s) failed")
        await asyncio.sleep(1)

    
@task(name='download_all_task')
async def download_all(save_to_s3: bool, product_names: list):
    """Run the status monitor and one download request per product in a task group.

    Args:
        save_to_s3: forwarded to download_one for every product.
        product_names: names of the products to download.
    """
    async with asyncio.TaskGroup() as group:
        group.create_task (print_status.fn(product_names))
        for name in product_names:
            print(f"Downloading {name}")
            group.create_task(download_one.fn(name, save_to_s3))
    # Creating a task only schedules it, so completion is reported here: the group
    # exits normally only after print_status has returned, which it does once it
    # has seen every product reported as DONE.
    for name in product_names:
        print(f"{name} has been downloaded !")

@task(name='download_cadu_task')
async def download_cadu(save_to_s3: bool, product_names: list):
    """Reset the storage, download every product and check the result.

    The bucket is emptied (or created), the local copies of the products are
    removed, then all products are downloaded and their presence is verified.

    Args:
        save_to_s3: when True the products are expected in the S3 bucket after
            the download, otherwise in the local download directory.
        product_names: names of the products to download.

    Raises:
        RuntimeError: if a product is missing after the download.
    """
    # The notebook-level bucket_name is used on purpose, so that the bucket checked
    # here is always the one download_one uploads to.
    existing_buckets = [bucket["Name"] for bucket in s3_client.list_buckets()["Buckets"]]
    if bucket_name in existing_buckets:
        print(f"The bucket {bucket_name} already exists, removing the existing products from it ...")
        # An empty bucket has no "Contents" entry in the listing
        bucket_content = s3_client.list_objects(Bucket=bucket_name)
        for s3_object in bucket_content.get("Contents", []):
            s3_client.delete_object(Bucket=bucket_name, Key=s3_object["Key"])
        # Only report the bucket as clear once the objects have been deleted
        print(f"Bucket {bucket_name} is clear !")
    # Else create the bucket
    else:
        print(f"The bucket {bucket_name} does not exist, creating the bucket {bucket_name} ...")
        s3_client.create_bucket(Bucket=bucket_name)
        print(f"The bucket {bucket_name} has beeen created !")

    # Remove all local files if they exist
    print("Removing all local files if they exist ...")
    for name in product_names:
        file = Path (local_download_dir) / name
        if file.is_file():
            file.unlink()
    print("local download directory is clear !")

    await download_all.fn(save_to_s3, product_names)
    await asyncio.sleep(1)

    # If save_to_s3 is True, the products must now be in the S3 bucket
    if save_to_s3:
        await asyncio.sleep(1)
        bucket_content = s3_client.list_objects(Bucket=bucket_name)
        # Compare on the last path component of each key (keys carry a folder prefix),
        # so that a product name cannot match by being a substring of another key.
        uploaded_names = {
            s3_object["Key"].rsplit("/", 1)[-1]
            for s3_object in bucket_content.get("Contents", [])
        }
        for name in product_names:
            if name not in uploaded_names:
                raise RuntimeError (f"{name} is missing from the S3 bucket")
            print (f"s3://{bucket_name}/{name} exists")
    # If save_to_s3 is False, the products must now be in the local directory
    else:
        for name in product_names:
            file = Path (local_download_dir) / name
            if not file.is_file():
                raise RuntimeError (f"{name} is missing locally")
            print (f"{file} exists")

@flow(name='main_flow', log_prints=True)
def working(save_to_s3: bool, date_start: str, date_end: str):
    """Search the products of a date range, then download and check them.

    Args:
        save_to_s3: forwarded to download_cadu.
        date_start: forwarded to search_cadu.
        date_end: forwarded to search_cadu.
    """
    print(f"Save to S3: {save_to_s3}.")
    found_names = search_cadu(date_start, date_end)
    download_cadu(save_to_s3, found_names)

# Run the flow without S3 upload: the products are checked in the local download directory.
working(False, start, stop)


 `@task(name='my_unique_name', ...)`

 `@task(name='my_unique_name', ...)`

 `@task(name='my_unique_name', ...)`

 `@task(name='my_unique_name', ...)`

 `@task(name='my_unique_name', ...)`

 `@flow(name='my_unique_name', ...)`


[Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `list`')),
 Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `NoneType`'))]

In [39]:
# Start the Prefect server (API and dashboard on http://127.0.0.1:4200 per its banner).
# NOTE(review): the recorded output ends with "address already in use" - a server was
# already listening on port 4200 when this cell was run.
!prefect server start


 ___ ___ ___ ___ ___ ___ _____ 
| _ \ _ \ __| __| __/ __|_   _| 
|  _/   / _|| _|| _| (__  | |  
|_| |_|_\___|_| |___\___| |_|  

Configure Prefect to communicate with the server with:

    prefect config set PREFECT_API_URL=http://127.0.0.1:4200/api

View the API reference documentation at http://127.0.0.1:4200/docs

Check out the dashboard at http://127.0.0.1:4200



[Errno 98] error while attempting to bind on address ('127.0.0.1', 4200): address already in use
Server stopped!


In [29]:
# List the keys stored in the bucket. The listing has no "Contents" entry
# when the bucket is empty, so fall back to an empty list instead of a KeyError.
all_s3_filenames = [key["Key"] for key in s3_client.list_objects(Bucket=bucket_name).get("Contents", [])]
all_s3_filenames

['Cadip_products/DCS_04_S1A_20231121072204051312_ch1_DSDB_00001.raw',
 'Cadip_products/DCS_04_S1A_20231121072204051312_ch1_DSDB_00002.raw',
 'Cadip_products/DCS_04_S1A_20231121072204051312_ch1_DSDB_00003.raw',
 'Cadip_products/DCS_04_S1A_20231121072204051312_ch1_DSDB_00004.raw',
 'Cadip_products/DCS_04_S1A_20231121072204051312_ch1_DSDB_00005.raw',
 'Cadip_products/DCS_04_S1A_20231121072204051312_ch1_DSDB_00006.raw',
 'Cadip_products/DCS_04_S1A_20231121072204051312_ch1_DSDB_00007.raw',
 'Cadip_products/DCS_04_S1A_20231121072204051312_ch1_DSDB_00060.raw',
 'Cadip_products/DCS_04_S1A_20231121072204051312_ch2_DSDB_00046.raw',
 'Cadip_products/DCS_04_S1A_20231121072204051312_ch2_DSDB_00060.raw']

In [19]:
# Despite its name, "count" holds the whole list_objects response, not a number.
count = s3_client.list_objects(Bucket=bucket_name)
# True when the response has a "Contents" entry.
# NOTE(review): presumably absent when the bucket holds no object - confirm in the boto3 docs.
is_content = "Contents" in count
is_content

False