In [None]:
import os
import requests
import boto3
import botocore
import numpy as np
import dask
from dask.distributed import Client, Semaphore
import pandas as pd
from pathlib import Path

In [None]:
# Start a Dask distributed client with 64 workers, one thread per worker.
# NOTE(review): memory_limit is presumably applied per worker (64 x 20GB in
# total) — confirm the host actually has that much memory available.
dask_client = Client(n_workers=64, threads_per_worker=1, memory_limit='20GB')
# Bare last expression so the notebook renders the client summary.
dask_client

Set up anonymous (unsigned) S3 access using boto3

In [None]:
# Name of the S3 bucket the ERA5 files are read from.
era5_bucket = 'era5-pds'

# S3 client configured with botocore.UNSIGNED, i.e. requests are not signed
# with AWS credentials.
client = boto3.client(
    's3',
    config=botocore.client.Config(signature_version=botocore.UNSIGNED),
)

In [None]:
# List the top-level prefixes of the bucket. With Delimiter='/', S3 groups keys
# by their first path segment and reports the groups under 'CommonPrefixes'.
paginator = client.get_paginator('list_objects')
result = paginator.paginate(Bucket=era5_bucket, Delimiter='/')
# search() evaluates the JMESPath expression against each page of the listing.
for prefix in result.search('CommonPrefixes'):
    print(prefix.get('Prefix'))


In [None]:
@dask.delayed
def download_era5_file(bucket, aws_obj, dst_fp, sem):
    """Download one object from an S3 bucket to a local file.

    Wrapped with ``dask.delayed``: calling this function builds a lazy task
    whose computed result is the single integer status described below.

    Args:
        bucket (str): S3 bucket name to download from
        aws_obj (str): Key of the object inside the bucket
        dst_fp (Path | str): Local file path the object is written to
        sem (Semaphore): Semaphore to limit concurrent downloads

    Returns:
        int: Download status (0 = success, 1 = failure)
    """
    with sem:
        # A fresh unsigned (no AWS credentials) client is created for every
        # call rather than being passed in as an argument.
        client = boto3.client('s3', config=botocore.client.Config(signature_version=botocore.UNSIGNED))
        try:
            with open(dst_fp, 'wb') as f:
                client.download_fileobj(bucket, aws_obj, f)
        except botocore.exceptions.ClientError:
            # open(..., 'wb') has already created/truncated the destination, so
            # a failed download would otherwise leave an empty or partial file
            # behind that looks like valid data to later steps.
            try:
                os.remove(dst_fp)
            except OSError:
                pass
            return 1

    return 0

def retrieve_era5_precip(start_date, end_date, bucket, dst_dir, max_concurrent=4):
    """Retrieve monthly ERA5 precipitation files from an S3 bucket.

    One object, ``<YYYY>/<MM>/data/precipitation_amount_1hour_Accumulation.nc``,
    is requested for every month start between ``start_date`` and ``end_date``
    (``pd.date_range(..., freq='MS')``) and saved as ``<dst_dir>/<YYYYMMDD>.nc``.
    Downloads run as Dask delayed tasks, throttled by a distributed semaphore.

    Args:
        start_date: Start of the range; any value accepted by ``pd.date_range``
        end_date: End of the range; any value accepted by ``pd.date_range``
        bucket (str): S3 bucket name to download from
        dst_dir (Path | str): Directory to save files to (created if missing)
        max_concurrent (int): Maximum number of simultaneous downloads

    Returns:
        pd.DataFrame: One row per month with columns ``date``,
        ``download_status`` (0 = success, 1 = failure), ``aws_obj`` and
        ``filename``.
    """
    dst_dir = Path(dst_dir)
    # Every destination file lives directly in dst_dir, so create it once
    # instead of once per month.
    dst_dir.mkdir(parents=True, exist_ok=True)
    date_range = pd.date_range(start_date, end_date, freq='MS')

    # dask.config.set takes a mapping (or keyword arguments); a positional
    # key/value pair is not a supported call form.
    # NOTE(review): this is applied after the cluster was started elsewhere in
    # the notebook — confirm the scheduler actually picks up the new timeout.
    dask.config.set({'distributed.scheduler.locks.lease-timeout': '1800s'})

    # Limit concurrent downloads using a semaphore
    sem = Semaphore(max_concurrent, name='aws-era5')

    aws_objs = [
        f"{date.strftime('%Y/%m/')}data/" + 'precipitation_amount_1hour_Accumulation.nc'
        for date in date_range
    ]
    filenames = [dst_dir / (date.strftime('%Y%m%d') + '.nc') for date in date_range]

    # Build all lazy download tasks first, then run them in a single compute
    # call; results come back in the same order as the tasks.
    tasks = [
        download_era5_file(bucket, aws_obj, dst_fp, sem)
        for aws_obj, dst_fp in zip(aws_objs, filenames)
    ]
    statuses = list(dask.compute(*tasks))

    return pd.DataFrame({'date': date_range, 'download_status': statuses, 'aws_obj': aws_objs, 'filename': filenames})

In [None]:
# Date range to retrieve; retrieve_era5_precip requests one file per month start.
start_date = '2010-01-01'
end_date = '2023-04-01'

# Run the retrieval with up to 12 simultaneous downloads.
era5_data_retrieval = retrieve_era5_precip(
    start_date,
    end_date,
    era5_bucket,
    Path("../../data-precip-analysis/era5"),
    max_concurrent=12,
)

In [None]:
# Display the download summary table returned by retrieve_era5_precip.
era5_data_retrieval