In [1]:
%matplotlib inline

# adding project dirs to path so code may be referenced from the notebook
import sys
sys.path.insert(0, '..')

In [2]:
import gc
import os
import config
import utils
import importlib
import dask

import pandas as pd

from datetime import datetime, timedelta
from dask.distributed import Client, LocalCluster

import grid_to_parquet

In [3]:
# Start a local Dask cluster and attach a Client to it.
# Creating the Client is what registers the cluster as the default scheduler;
# without it, dask.compute() in the cells below falls back to Dask's local
# scheduler and the cluster's worker processes are never used.
# NOTE(review): the worker processes must be able to import grid_to_parquet
# (the notebook relies on a relative sys.path entry) - confirm on a fresh kernel.
cluster = LocalCluster()
client = Client(cluster)
client

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 15.63 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:37539,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 15.63 GiB

0,1
Comm: tcp://127.0.0.1:34109,Total threads: 2
Dashboard: http://127.0.0.1:32855/status,Memory: 3.91 GiB
Nanny: tcp://127.0.0.1:34509,
Local directory: /tmp/dask-worker-space/worker-h64q7kzu,Local directory: /tmp/dask-worker-space/worker-h64q7kzu

0,1
Comm: tcp://127.0.0.1:44865,Total threads: 2
Dashboard: http://127.0.0.1:35503/status,Memory: 3.91 GiB
Nanny: tcp://127.0.0.1:33967,
Local directory: /tmp/dask-worker-space/worker-iimhu1g0,Local directory: /tmp/dask-worker-space/worker-iimhu1g0

0,1
Comm: tcp://127.0.0.1:44125,Total threads: 2
Dashboard: http://127.0.0.1:43707/status,Memory: 3.91 GiB
Nanny: tcp://127.0.0.1:37311,
Local directory: /tmp/dask-worker-space/worker-uzq8l_xf,Local directory: /tmp/dask-worker-space/worker-uzq8l_xf

0,1
Comm: tcp://127.0.0.1:43941,Total threads: 2
Dashboard: http://127.0.0.1:43721/status,Memory: 3.91 GiB
Nanny: tcp://127.0.0.1:39053,
Local directory: /tmp/dask-worker-space/worker-_59orlz_,Local directory: /tmp/dask-worker-space/worker-_59orlz_


# Fetch Forcing Data 

In [7]:
# Ingest window: number of days to pull and the spacing between forecasts
forecast_interval_hrs = 6
ingest_days = 4

# Midnight of the first day, so the first forecast fetched is the 00Z one
start_dt = datetime(2023, 1, 3)

# Step between consecutive forecast reference times
td = timedelta(hours=forecast_interval_hrs)

# Total number of forecasts issued over the ingest window
number_of_forecasts = int(24 * ingest_days / forecast_interval_hrs)

In [8]:
# Loop though forecasts, fetch and insert
for f in range(number_of_forecasts):
    reference_time = start_dt + td * f
    ref_time_str = reference_time.strftime("%Y%m%dT%HZ")

    print(f"Start download of {ref_time_str}")

    blob_list = grid_to_parquet.list_blobs_forcing(
        configuration = "forcing_medium_range",
        reference_time = ref_time_str,
        must_contain = "forcing"
    )

    dfs = []
    for blob_name in blob_list:
        df = dask.delayed(grid_to_parquet.calculate_map_forcing)(blob_name, use_cache=True, weights_filepath=config.HUC10_MEDIUM_RANGE_WEIGHTS_FILEPATH)
        dfs.append(df)
    
    # Join all timesteps into single pd.DataFrame
    %time 
    results = dask.compute(*dfs)
    df = pd.concat(results)

    # Save as parquet file
    parquet_filepath = os.path.join(config.MEDIUM_RANGE_FORCING_PARQUET, f"{ref_time_str}.parquet")
    utils.make_parent_dir(parquet_filepath)
    df.to_parquet(parquet_filepath)
    
    del df
    gc.collect()

    # Print out some DataFrame stats
    # print(df.info(verbose=True, memory_usage='deep'))
    # print(df.memory_usage(index=True, deep=True))

Start download of 20230103T00Z
CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.01 µs
Start download of 20230103T06Z
CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.81 µs




Start download of 20230103T12Z
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs




Start download of 20230103T18Z
CPU times: user 1 µs, sys: 1e+03 ns, total: 2 µs
Wall time: 4.29 µs
Start download of 20230104T00Z
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.77 µs
Start download of 20230104T06Z
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
Start download of 20230104T12Z
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs
Start download of 20230104T18Z
CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.81 µs
Start download of 20230105T00Z
CPU times: user 1 µs, sys: 1 µs, total: 2 µs
Wall time: 3.81 µs
Start download of 20230105T06Z
CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs
Start download of 20230105T12Z
CPU times: user 1 µs, sys: 1e+03 ns, total: 2 µs
Wall time: 3.81 µs
Start download of 20230105T18Z
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.29 µs




Start download of 20230106T00Z
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs
Start download of 20230106T06Z
CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.29 µs
Start download of 20230106T12Z
CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.81 µs
Start download of 20230106T18Z
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.29 µs


# Fetch Assim Data

In [4]:
# Setup some criteria
start_dt = datetime(2023, 1, 16)
number_of_days = 1

# Loop though forecasts, fetch and insert
for f in range(number_of_days):
    issue_date = start_dt + timedelta(days=f)
    issue_date_str = issue_date.strftime("%Y%m%d")

    print(f"Start download of {issue_date_str}")

    blob_list = grid_to_parquet.list_blobs_assim(
        configuration = "forcing_analysis_assim",
        issue_date = issue_date_str,
        must_contain = "tm00.conus"
    )

    dfs = []
    for blob_name in blob_list:
        df = dask.delayed(grid_to_parquet.calculate_map_assim)(blob_name, use_cache=True, weights_filepath=config.HUC10_MEDIUM_RANGE_WEIGHTS_FILEPATH)
        dfs.append(df)
    
    # Join all timesteps into single pd.DataFrame
    %time 
    results = dask.compute(*dfs)
    df = pd.concat(results)

    # Save as parquet file
    parquet_filepath = os.path.join(config.FORCING_ANALYSIS_ASSIM_PARQUET, f"{issue_date_str}.parquet")
    utils.make_parent_dir(parquet_filepath)
    df.to_parquet(parquet_filepath)
    
    del df
    gc.collect()

    # Print out some DataFrame stats
    # print(df.info(verbose=True, memory_usage='deep'))
    # print(df.memory_usage(index=True, deep=True))

Start download of 20230116
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.34 µs
