In [1]:
%matplotlib inline

# Put the parent directory at the front of the module search path so that
# code located one level above this notebook can be imported from it.
import sys
sys.path.insert(0, '..')

In [2]:
import gc
import os
import config
import utils
import importlib
import dask

import pandas as pd

from datetime import datetime, timedelta
from dask.distributed import Client

import grid_to_parquet

# Fetch Medium-Range Forcing Data

In [3]:
# Ingest window configuration; these names are read by the download loop.
forecast_interval_hrs = 6
ingest_days = 1

# Spacing between consecutive forecast reference times.
td = forecast_interval_hrs * timedelta(hours=1)

# Midnight of the start date, i.e. the first (00Z) forecast of that day.
start_dt = datetime(year=2023, month=1, day=1)

# Only a single forecast is ingested for now. Covering the whole window
# would be int(ingest_days * 24 / forecast_interval_hrs).
number_of_forecasts = 1

In [4]:
# Loop though forecasts, fetch and insert
for f in range(number_of_forecasts):
    reference_time = start_dt + td * f
    ref_time_str = reference_time.strftime("%Y%m%dT%HZ")

    print(f"Start download of {ref_time_str}")

    blob_list = grid_to_parquet.list_blobs_forcing(
        configuration = "forcing_medium_range",
        reference_time = ref_time_str,
        must_contain = "forcing"
    )

    dfs = []
    for blob_name in blob_list:
        df = dask.delayed(grid_to_parquet.calculate_map_forcing)(blob_name, use_cache=True, weights_filepath=config.HUC10_MEDIUM_RANGE_WEIGHTS_FILEPATH)
        dfs.append(df)
    
    # Join all timesteps into single pd.DataFrame
    %time 
    results = dask.compute(*dfs)
    df = pd.concat(results)

    # Save as parquet file
    parquet_filepath = os.path.join(config.MEDIUM_RANGE_FORCING_PARQUET, f"{ref_time_str}.parquet")
    utils.make_parent_dir(parquet_filepath)
    df.to_parquet(parquet_filepath)

    # Print out some DataFrame stats
    # print(df.info(verbose=True, memory_usage='deep'))
    # print(df.memory_usage(index=True, deep=True))

Start download of 20230101T00Z
CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.34 µs


# Fetch Analysis Assimilation Forcing Data

In [6]:
# Setup some criteria
start_dt = datetime(2023, 1, 1)
number_of_days = 11

# Loop though forecasts, fetch and insert
for f in range(number_of_days):
    issue_date = start_dt + timedelta(days=f)
    issue_date_str = issue_date.strftime("%Y%m%d")

    print(f"Start download of {issue_date_str}")

    blob_list = grid_to_parquet.list_blobs_assim(
        configuration = "forcing_analysis_assim",
        issue_date = issue_date_str,
        must_contain = "tm00.conus"
    )

    dfs = []
    for blob_name in blob_list:
        df = dask.delayed(grid_to_parquet.calculate_map_assim)(blob_name, use_cache=True, weights_filepath=config.HUC10_MEDIUM_RANGE_WEIGHTS_FILEPATH)
        dfs.append(df)
    
    # Join all timesteps into single pd.DataFrame
    %time 
    results = dask.compute(*dfs)
    df = pd.concat(results)

    # Save as parquet file
    parquet_filepath = os.path.join(config.FORCING_ANALYSIS_ASSIM_PARQUET, f"{issue_date_str}.parquet")
    utils.make_parent_dir(parquet_filepath)
    df.to_parquet(parquet_filepath)

    # Print out some DataFrame stats
    # print(df.info(verbose=True, memory_usage='deep'))
    # print(df.memory_usage(index=True, deep=True))

Start download of 20230101
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.53 µs
Start download of 20230102
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.53 µs
Start download of 20230103
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs
Start download of 20230104
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.58 µs
Start download of 20230105
CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.58 µs
Start download of 20230106
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.29 µs
Start download of 20230107
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.25 µs
Start download of 20230108
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.58 µs
Start download of 20230109
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
Start download of 20230110
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
Start download of 20230111
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: