In [None]:
%matplotlib inline

from dask.distributed import Client
import dask
from map.generate_map_weights import *
from datetime import datetime, timedelta
import grids.config as config
from grids.utils import get_cache_dir, make_parent_dir, profile

In [None]:
# Start a dask.distributed client. No scheduler address is passed here;
# presumably this launches a local cluster with default settings — TODO confirm
# against the environment this notebook is meant to run in.
client = Client()
# Bare expression as the last line of the cell so the notebook displays the client.
client

In [None]:
# Ingest configuration used by the forecast loop.

# Spacing between consecutive forecast reference times, in hours, and the
# matching timedelta used to step from one reference time to the next.
forecast_interval_hrs = 6
td = timedelta(hours=forecast_interval_hrs)

# Reference time of the first forecast to process (hour 18 on 2023-01-01).
# NOTE(review): a previous comment here mentioned 00Z as the first cycle of a
# date; confirm that hour 18 is the intended starting point.
start_dt = datetime(year=2023, month=1, day=1, hour=18)

# Number of days intended to be ingested. It is currently not used to derive
# number_of_forecasts, which is pinned to a single forecast; the full-range
# value would be int(ingest_days * 24/forecast_interval_hrs).
ingest_days = 1
number_of_forecasts = 1

In [None]:
# Loop though forecasts, fetch and insert
for f in range(number_of_forecasts):
    reference_time = start_dt + td * f
    ref_time_str = reference_time.strftime("%Y%m%dT%HZ")

    print(f"Start download of {ref_time_str}")

    blob_list = list_blobs_forcing(
        configuration = "forcing_medium_range",
        reference_time = ref_time_str,
        must_contain = "forcing"
    )

    # This can be used to run serial
    dfs = []
    for blob_name in blob_list:
        df = dask.delayed(mp.calculate_map_forcing)(blob_name, use_cache=True)
        dfs.append(df)
    
    # Join all timesteps into single pd.DataFrame
    %time 
    results = dask.compute(*dfs)
    df = pd.concat(results)

    # Save as parquet file
    parquet_filepath = os.path.join(config.MEDIUM_RANGE_PARQUET, f"{ref_time_str}.parquet")
    make_parent_dir(parquet_filepath)
    df.to_parquet(parquet_filepath)

    # Print out some DataFrame stats
    print(df.info(verbose=True, memory_usage='deep'))
    print(df.memory_usage(index=True, deep=True))
