# CADU endpoints demo

In this demo we will call the rs-server CADU HTTP endpoints:

  * List available CADU products
  * Download some products into local storage and into an S3 bucket
  * Monitor the download status from the database.

In [None]:
# Settings shared by the cells below: the CADU endpoint root and the
# time window passed to the "list" endpoint.
# The host name "rs-server" is the name of the rs-server container.
endpoint = "http://rs-server:8000/cadip/CADIP/cadu"
start = "2014-01-01T12:00:00.000Z"
stop = "2023-12-30T12:00:00.000Z"

In [None]:
# From a terminal, to list the available CADU products, we would use the curl command:
!set -x && curl -X GET "{endpoint}/list?start_date={start}&stop_date={stop}" -H "accept: application/json"

In [None]:
# But let's do it in python so it's easier to parse results
import requests
import pprint 

# Query the "list" endpoint over the configured time window
data = requests.get(
    f"{endpoint}/list",
    params={"start_date": start, "stop_date": stop},
)
assert data.status_code == 200

# The products are returned under the "CADIP" key, as (id, name) pairs
products = data.json()["CADIP"]
assert len(products) == 10

# Show only the first few products to keep the output short
pprint.pprint(products[:3], indent=4)
print("...")

# The following cells only need the product names
product_names = [product_name for _product_id, product_name in products]

In [None]:
# The "list" endpoint has initialised the database with the products info,
# so every product can now be looked up by name through the "status" endpoint.
all_status = []
for name in product_names:
    data = requests.get(f"{endpoint}/status", params={"name": name})
    assert data.status_code == 200
    all_status += [data.json()]

# Show only the first few statuses to keep the output short
pprint.pprint(all_status[:2], indent=4)
print("...")

---
**NOTE**

You can also monitor the database using pgAdmin.

---

In [None]:
# We'll use boto3 to monitor the s3 bucket.
# Note: the S3_ACCESSKEY, S3_SECRETKEY and S3_ENDPOINT are given in the docker-compose.yml file.
!pip install boto3
import boto3
import os

# Create the S3 client; the credentials and the endpoint URL are read
# from the environment rather than written in the notebook.
s3_session = boto3.session.Session()
s3_connection = dict(
    aws_access_key_id=os.environ["S3_ACCESSKEY"],
    aws_secret_access_key=os.environ["S3_SECRETKEY"],
    endpoint_url=os.environ["S3_ENDPOINT"],
)
s3_client = s3_session.client(service_name="s3", **s3_connection)

In [None]:
# S3 bucket name
bucket_name = "test-data"

# Start from a clean bucket: create it when it is missing, otherwise
# remove the products it may already hold
existing_buckets = [bucket["Name"] for bucket in s3_client.list_buckets()["Buckets"]]
if bucket_name not in existing_buckets:
    s3_client.create_bucket(Bucket=bucket_name)
else:
    for name in product_names:
        s3_client.delete_object(Bucket=bucket_name, Key=name)

# The local download directory is passed as an environment variable
local_download_dir = os.environ["RSPY_LOCAL_DOWNLOAD"]

# Remove the local copy of each product, if there is one
from pathlib import Path
for name in product_names:
    file = Path(local_download_dir, name)
    if file.is_file():
        file.unlink()

In [None]:
import asyncio

# Call the CADIP endpoint to download one product in background
# and upload it (optional) to the S3 bucket.
async def download_one(name: str, save_to_s3: bool):
    """Send the download request for one product to the CADU endpoint.

    Args:
        name: name of the product to download.
        save_to_s3: when True, the bucket URL is added to the request
            as the "obs" parameter.

    Raises:
        AssertionError: if the endpoint does not answer with HTTP 200.
    """
    query = {"name": name, "local": local_download_dir}
    if save_to_s3:
        # obs = the bucket URL
        query["obs"] = f"s3://{bucket_name}"

    # NOTE: requests.get is a blocking call, so this coroutine does not
    # yield to the event loop while the request is in flight.
    response = requests.get(endpoint, query)
    assert response.status_code == 200

# In parallel, call the "status" endpoint to get and print the download status.
async def print_status():
    """Poll and print the download status of every product until all are finished.

    One summary line (count per status) is printed per polling round, with
    one second between rounds.

    Raises:
        AssertionError: if the "status" endpoint does not answer with HTTP 200.
        RuntimeError: if every product is either DONE or FAILED and at least
            one of them is FAILED. Without this check the loop would never
            end, since DONE could never reach the number of products.
    """
    # Wait a second in case the status needs to go back
    # from DONE to NOT_STARTED when we download several times.
    await asyncio.sleep(1)

    total = len(product_names)
    while True:

        # Count the number of products not started, in progress etc ...
        all_status = {"NOT_STARTED": 0, "IN_PROGRESS": 0, "FAILED": 0, "DONE": 0}
        for name in product_names:

            # Call the "status" endpoint
            data = requests.get(f"{endpoint}/status", {"name": name})
            assert data.status_code == 200

            # Count unexpected status values too instead of failing with a KeyError
            current = data.json()["status"]
            all_status[current] = all_status.get(current, 0) + 1

        # Print result
        print(" / ".join(f"{status}:{count}" for status, count in all_status.items()))

        if all_status["DONE"] == total:
            return

        # Nothing is pending anymore but some downloads failed: stop polling
        if all_status["DONE"] + all_status["FAILED"] == total:
            raise RuntimeError(f"{all_status['FAILED']} product download(s) failed")

        await asyncio.sleep(1)

# Call everything in parallel
async def download_all(save_to_s3: bool):
    """Run the status printer and one download request per product in a task group.

    Args:
        save_to_s3: forwarded to download_one for every product.
    """
    async with asyncio.TaskGroup() as tasks:
        # The status printer is scheduled first, then one task per product
        tasks.create_task(print_status())
        for product_name in product_names:
            tasks.create_task(download_one(product_name, save_to_s3))

print ("Download everything to the local directory, not s3:")
await (download_all(save_to_s3=False))

# Check that the local files exist. 
# Wait 1 second before that or sometimes it bugs.
await asyncio.sleep(1)
for name in product_names:
    file = Path (local_download_dir) / name    
    if not file.is_file():
        raise RuntimeException (f"{file} is missing locally")
    print (f"{file} exists")

print ("\nDownload everything again, but this time upload to S3:")
await (download_all(save_to_s3=True))

# This time the local files are not kept locally, 
# but they should be uploaded into the S3 bucket.
await asyncio.sleep(1)
all_s3_filenames = [key["Key"] for key in s3_client.list_objects(Bucket=bucket_name)['Contents']]
for name in product_names:    
    if not name in all_s3_filenames:
        raise RuntimeException (f"{file} is missing from the S3 bucket")
    print (f"s3://{bucket_name}/{name} exists")

---
**NOTE**

You can also monitor the s3 bucket using the minio console: http://127.0.0.1:9001/browser with:

  * Username: _minio_
  * Password: _Strong#Pass#1234_

---

In [None]:
from datetime import datetime

dt_format = "%Y-%m-%dT%H:%M:%S.%f" # %z

# Check timeliness by subtracting the publishing date from the download stop date.
# Call the "status" endpoint.
print("Timeliness for:")
for name in product_names:
    data = requests.get(f"{endpoint}/status", {"name": name})
    assert data.status_code == 200
    values = data.json()
    publication = datetime.strptime(values["available_at_station"], dt_format)
    # Not named "stop": that would overwrite the search window bound defined
    # in the first cell and break a re-run of the "list" cell.
    download_stop = datetime.strptime(values["download_stop"], dt_format)
    timeliness = download_stop - publication
    print(f"  - {name}: {timeliness}")

In [None]:
!pip install prefect

In [None]:
import httpx
from prefect import flow


@flow(retries=3, retry_delay_seconds=5, log_prints=True)
def get_repo_info(repo_name: str = "PrefectHQ/prefect"):
    """Fetch a GitHub repository description and print its star and fork counts.

    Args:
        repo_name: repository in "owner/name" form, inserted into the
            GitHub API URL.

    Raises:
        httpx.HTTPStatusError: raised by raise_for_status() on an error response.
    """
    reply = httpx.get(f"https://api.github.com/repos/{repo_name}")
    reply.raise_for_status()
    repo = reply.json()

    print(f"{repo_name} repository statistics 🤓:")
    print(f"Stars 🌠 : {repo['stargazers_count']}")
    print(f"Forks 🍴 : {repo['forks_count']}")


# Run the flow once with its default repository
if __name__ == "__main__":
    get_repo_info()


In [None]:
from prefect import flow, task
import requests
import asyncio
from pathlib import Path

# Settings used by the tasks below: the CADU endpoint root, the search
# time window, the S3 bucket name and the local download directory.
# The host name "rs-server" is the name of the rs-server container.
endpoint = "http://rs-server:8000/cadip/CADIP/cadu"
start = "2014-01-01T12:00:00.000Z"
stop = "2023-12-30T12:00:00.000Z"
bucket_name = "test-data"
# The local download directory is read from the environment
local_download_dir = os.environ["RSPY_LOCAL_DOWNLOAD"]

@task(name='search_cadu_task')
def search_cadu(date_start: str, date_end: str):
    """List the CADU products available in a time window.

    Args:
        date_start: start of the search window, sent as "start_date".
        date_end: end of the search window, sent as "stop_date".

    Returns:
        The list of product names returned by the "list" endpoint.

    Raises:
        AssertionError: if the endpoint does not answer with HTTP 200.
    """
    print(f"Searching products between {date_start} and {date_end}")
    # Query with the dates received as arguments, not with the module-level
    # start/stop values, so the printed window is the one actually searched.
    data = requests.get(f"{endpoint}/list", {"start_date": date_start, "stop_date": date_end})
    # Fail here rather than on a confusing JSON/KeyError further down
    assert data.status_code == 200
    products = data.json()["CADIP"]
    print(f"Here is the list of products found: {products}")
    # The products come as (id, name) pairs: keep only the names
    return [product_name for _product_id, product_name in products]

@task(name='download_one_task')
async def download_one(name: str, save_to_s3: bool):
    """Send the download request for one product to the CADU endpoint.

    Args:
        name: name of the product to download.
        save_to_s3: when True, the bucket URL is added to the request
            as the "obs" parameter.

    Raises:
        AssertionError: if the endpoint does not answer with HTTP 200.
    """
    query = {"name": name, "local": local_download_dir}
    if save_to_s3:
        print(f"pushing {name} to the bucket {bucket_name} ...")
        # obs = the bucket URL
        query["obs"] = f"s3://{bucket_name}/Cadip_products"

    # NOTE: requests.get is a blocking call, so this coroutine does not
    # yield to the event loop while the request is in flight.
    response = requests.get(endpoint, query)
    assert response.status_code == 200

@task(name='print_status_task')
async def print_status(product_names: list):
    """Poll and print the download status of the given products until all are finished.

    One summary line (count per status) is printed per polling round, with
    one second between rounds.

    Args:
        product_names: names of the products to monitor.

    Raises:
        AssertionError: if the "status" endpoint does not answer with HTTP 200.
        RuntimeError: if every product is either DONE or FAILED and at least
            one of them is FAILED. Without this check the loop would never
            end, since DONE could never reach the number of products.
    """
    # Wait a second in case the status needs to go back
    # from DONE to NOT_STARTED when we download several times.
    await asyncio.sleep(1)

    total = len(product_names)
    while True:
        # Count the number of products not started, in progress etc ...
        all_status = {"NOT_STARTED": 0, "IN_PROGRESS": 0, "FAILED": 0, "DONE": 0}
        for name in product_names:
            # Call the "status" endpoint
            data = requests.get(f"{endpoint}/status", {"name": name})
            assert data.status_code == 200
            # Count unexpected status values too instead of failing with a KeyError
            current = data.json()["status"]
            all_status[current] = all_status.get(current, 0) + 1

        # Print result
        print(" / ".join(f"{status}:{count}" for status, count in all_status.items()))

        if all_status["DONE"] == total:
            return
        # Nothing is pending anymore but some downloads failed: stop polling
        if all_status["DONE"] + all_status["FAILED"] == total:
            raise RuntimeError(f"{all_status['FAILED']} product download(s) failed")
        await asyncio.sleep(1)

    
@task(name='download_all_task')
async def download_all(save_to_s3: bool, product_names: list):
    """Run the status printer and one download request per product in a task group.

    Args:
        save_to_s3: forwarded to download_one for every product.
        product_names: names of the products to download.
    """
    async with asyncio.TaskGroup() as group:
        group.create_task(print_status.fn(product_names))
        for name in product_names:
            print(f"Downloading {name}")
            group.create_task(download_one.fn(name, save_to_s3))

    # Report completion only here: inside the loop above the tasks have merely
    # been scheduled. The group exits once every task has finished, and
    # print_status only returns when all the products are reported DONE.
    for name in product_names:
        print(f"{name} has been downloaded !")

@task(name='download_cadu_task')
async def download_cadu(save_to_s3: bool, product_names: list):
    """Clean the storage, download the products, then check they arrived.

    The bucket is emptied (or created when missing) and the local copies of
    the products are removed before the download. Afterwards the products
    are looked up in the S3 bucket when save_to_s3 is True, in the local
    download directory otherwise.

    Args:
        save_to_s3: whether the products are uploaded to the S3 bucket.
        product_names: names of the products to download.

    Raises:
        RuntimeError: if a product is missing from the checked location.
    """
    # The module-level bucket_name is used on purpose: it is the one
    # download_one puts in the upload URL, so both always target the same bucket.

    # Check if the s3 bucket already exist
    if bucket_name in [bucket["Name"] for bucket in s3_client.list_buckets()["Buckets"]]:
        print(f"The bucket {bucket_name} already exists, removing the existing products from it ...")
        # The "Contents" key is only present when the bucket is not empty
        bucket_content = s3_client.list_objects(Bucket=bucket_name)
        for s3_object in bucket_content.get("Contents", []):
            s3_client.delete_object(Bucket=bucket_name, Key=s3_object["Key"])
        # Only announce it once the objects have actually been deleted
        print(f"Bucket {bucket_name} is clear !")
    # Else create the bucket
    else:
        print(f"The bucket {bucket_name} does not exist, creating the bucket {bucket_name} ...")
        s3_client.create_bucket(Bucket=bucket_name)
        print(f"The bucket {bucket_name} has beeen created !")

    # Remove all local files if they exist
    print("Removing all local files if they exist ...")
    for name in product_names:
        file = Path(local_download_dir) / name
        if file.is_file():
            file.unlink()
    print("local download directory is clear !")

    await download_all.fn(save_to_s3, product_names)
    await asyncio.sleep(1)

    # With save_to_s3, check that every product was uploaded to the bucket
    if save_to_s3:
        await asyncio.sleep(1)
        # An empty bucket has no "Contents" key: fall back to an empty list so
        # the missing product is reported below instead of raising a KeyError
        bucket_content = s3_client.list_objects(Bucket=bucket_name)
        all_s3_filenames = [s3_object["Key"] for s3_object in bucket_content.get("Contents", [])]
        for name in product_names:
            # Substring match: presumably the keys carry the upload prefix
            # given in the "obs" URL - TODO confirm
            if not any(name in filename for filename in all_s3_filenames):
                raise RuntimeError(f"{name} is missing from the S3 bucket")
            print(f"s3://{bucket_name}/{name} exists")
    # Without save_to_s3, check that every product exists in the local directory
    else:
        for name in product_names:
            file = Path(local_download_dir) / name
            if not file.is_file():
                raise RuntimeError(f"{name} is missing locally")
            print(f"{file} exists")
@flow(name='main_flow', log_prints=True)
def working(save_to_s3: bool, date_start: str, date_end: str):
    """Search the products of a time window, then download them.

    Args:
        save_to_s3: forwarded to download_cadu.
        date_start: start of the search window, forwarded to search_cadu.
        date_end: end of the search window, forwarded to search_cadu.
    """
    print(f"Save to S3: {save_to_s3}.")
    found_names = search_cadu(date_start, date_end)
    download_cadu(save_to_s3, found_names)


# Run the flow once over the configured time window, without S3 upload
working(False, start, stop)

In [None]:
!prefect server start

In [None]:
# List the keys currently stored in the bucket.
# The "Contents" key is missing from the response when the bucket is empty,
# so fall back to an empty list instead of raising a KeyError.
all_s3_filenames = [
    key["Key"]
    for key in s3_client.list_objects(Bucket=bucket_name).get("Contents", [])
]
all_s3_filenames

In [None]:
# List the bucket and show whether the response has a "Contents" key.
# Other cells of this notebook use that key's presence as the
# "bucket is not empty" test.
count = s3_client.list_objects(Bucket=bucket_name)
is_content = "Contents" in count
is_content