In [None]:
%matplotlib inline

# Add the parent directory to the import path so project code may be
# referenced from the notebook.
# NOTE(review): '..' is resolved against the kernel's working directory,
# presumably the notebook's own folder -- confirm.
import sys

# Guard keeps the cell idempotent: re-running it must not stack duplicate
# '..' entries at the front of sys.path.
if '..' not in sys.path:
    sys.path.insert(0, '..')

In [None]:
import gc
import os
import config
import utils
import importlib
import dask

import pandas as pd

from datetime import datetime, timedelta
from dask.distributed import Client, LocalCluster

# Import the project's grid_to_parquet module and reload it immediately so
# that re-running this cell re-executes the module's code (useful while the
# module is being edited) without restarting the kernel.
import importlib
import grid_to_parquet
importlib.reload(grid_to_parquet)

In [None]:
# Start a local Dask cluster (12 workers, one thread each) and attach a
# client to it so that subsequent dask computations run on this cluster.
cluster = LocalCluster(
    threads_per_worker=1,
    n_workers=12,
)
client = Client(cluster)

# Leave the cluster as the cell's last expression so the notebook displays it
cluster

# Fetch Medium-Range Forcing Data

In [None]:
# Criteria controlling which forcing forecasts are ingested

# Length of the ingest window, in days, and the spacing between
# consecutive forecast reference times, in hours
ingest_forecast_days = 9
forecast_interval_hrs = 6

# Reference time of the first forecast: midnight (00Z) of this date
start_dt = datetime(year=2023, month=1, day=10)

# Step added to start_dt to move from one forecast to the next
td = timedelta(hours=forecast_interval_hrs)

# Number of forecasts that fit in the ingest window
number_of_forecasts = int((24 * ingest_forecast_days) / forecast_interval_hrs)

In [None]:
# Fetch every medium-range forcing forecast in the configured window and
# write one parquet file per forecast reference time. The wall-clock time is
# printed before and after the loop as a coarse measure of total run time.
print(datetime.now())
# Loop through forecasts, fetch and save each one as a parquet file
for f in range(number_of_forecasts):
    reference_time = start_dt + td * f
    # Reference time formatted as e.g. "20230110T00Z"
    ref_time_str = reference_time.strftime("%Y%m%dT%HZ")

    print(f"Start download of {ref_time_str}")

    blob_list = grid_to_parquet.list_blobs_forcing(
        configuration = "forcing_medium_range",
        reference_time = ref_time_str,
        must_contain = "forcing"
    )

    # One lazy task per blob; nothing is computed until dask.compute below
    delayed_dfs = [
        dask.delayed(grid_to_parquet.calculate_map_forcing)(
            blob_name,
            use_cache=False,
            weights_filepath=config.HUC10_MEDIUM_RANGE_WEIGHTS_FILEPATH
        )
        for blob_name in blob_list
    ]

    # pd.concat raises ValueError on an empty sequence, which would abort the
    # remaining forecasts; report the gap and move on instead.
    if not delayed_dfs:
        print(f"No blobs found for {ref_time_str}, skipping")
        continue

    # Join all timesteps into single pd.DataFrame
    results = dask.compute(*delayed_dfs)
    df = pd.concat(results)

    # Save as parquet file
    parquet_filepath = os.path.join(config.MEDIUM_RANGE_FORCING_PARQUET, f"{ref_time_str}.parquet")
    utils.make_parent_dir(parquet_filepath)
    df.to_parquet(parquet_filepath)

    # Print out some DataFrame stats (must run before df is deleted below)
    # print(df.info(verbose=True, memory_usage='deep'))
    # print(df.memory_usage(index=True, deep=True))

    # Drop every reference to this forecast's data before collecting;
    # `results` still holds the per-timestep frames after the concat.
    del results, df
    gc.collect()
print(datetime.now())

# Fetch Analysis Assimilation Forcing Data

In [None]:
# Setup some criteria
start_dt = datetime(2022, 12, 18)
number_of_days = 40

# Loop though forecasts, fetch and insert
for f in range(number_of_days):
    issue_date = start_dt + timedelta(days=f)
    issue_date_str = issue_date.strftime("%Y%m%d")

    print(f"Start download of {issue_date_str}")

    blob_list = grid_to_parquet.list_blobs_assim(
        configuration = "forcing_analysis_assim",
        issue_date = issue_date_str,
        must_contain = "tm00.conus"
    )

    dfs = []
    for blob_name in blob_list:
        df = dask.delayed(grid_to_parquet.calculate_map_assim)(
            blob_name,
            use_cache=False, 
            weights_filepath=config.HUC10_MEDIUM_RANGE_WEIGHTS_FILEPATH
        )
        dfs.append(df)
    
    # Join all timesteps into single pd.DataFrame
    %time 
    results = dask.compute(*dfs)
    df = pd.concat(results)

    # Save as parquet file
    parquet_filepath = os.path.join(config.FORCING_ANALYSIS_ASSIM_PARQUET, f"{issue_date_str}.parquet")
    utils.make_parent_dir(parquet_filepath)
    df.to_parquet(parquet_filepath)
    
    del df
    gc.collect()

    # Print out some DataFrame stats
    # print(df.info(verbose=True, memory_usage='deep'))
    # print(df.memory_usage(index=True, deep=True))