In [None]:
%%capture
!pip install spatialpandas easydev colormap colorcet duckdb dask_geopandas nb_black

In [None]:
%matplotlib inline

# Make modules in the parent directory importable from this notebook by
# putting that directory first on the module search path.
import sys

parent_dir = '..'
sys.path.insert(0, parent_dir)

In [None]:
import duckdb
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import config
import utils

# Explore the MAP data
Let's first find out how many forecasts we have downloaded and saved as Parquet files, as well as the start and end dates.  We will use DuckDB for this.

In [None]:
%%time
query = f"""
    SELECT 
    count(distinct(reference_time)) as forcast_count,
    min(reference_time) as start_time,
    max(reference_time) as end_time,
    FROM read_parquet('{config.MEDIUM_RANGE_FORCING_PARQUET}/*.parquet')
;"""
print(query)
df = duckdb.query(query).to_df()
df

Now look at the assim data.  Let's query the Parquet files to make sure they are all complete.  Each `reference_time` should have 240 `value_times`.

In [None]:
%%time
query = f"""
    SELECT count(distinct(value_time)) as count,
    min(value_time) as start_time,
    max(value_time) as end_time
    FROM read_parquet('{config.FORCING_ANALYSIS_ASSIM_PARQUET}/*.parquet')
;"""
print(query)
df = duckdb.query(query).to_df()
df

In [None]:
%%time
# Load the HUC10 basin Parquet file through the project helper. The result is
# presumably a GeoDataFrame (it is plotted and merged below) -- confirm in utils.
basins_gdf = utils.parquet_to_gdf(config.HUC10_PARQUET_FILEPATH)

In [None]:
# Inspect the loaded basins table; its `huc10` column is the join key used
# when the query results are merged onto the basins further down.
basins_gdf

In [None]:
# Quick-look map of the basin geometries. Keeping the Axes lets us title the
# figure; the trailing semicolon suppresses the text repr in the cell output.
ax = basins_gdf.plot()
ax.set_title("HUC10 basins");

In [None]:
# Query some forecast data from the Parquet files.
# The SQL builders come from the project's `queries` module; it is reloaded
# here so the already-imported module is re-executed and recent edits to it
# are picked up without restarting the kernel.
import importlib
import queries
importlib.reload(queries)

In [None]:
# Forecast reference time and catchment-id pattern to summarise. To look at a
# different forecast, change REFERENCE_TIME (e.g. "2023-01-03 12:00:00").
REFERENCE_TIME = "2022-12-25 00:00:00"
CATCHMENT_ID_PATTERN = "18%"  # SQL LIKE pattern: catchment ids starting with "18"

# Build the per-catchment metrics SQL with the project's query builder,
# grouped by catchment and ordered by the observed average.
query = queries.calculate_catchment_metrics(
    config.MEDIUM_RANGE_FORCING_PARQUET,
    config.FORCING_ANALYSIS_ASSIM_PARQUET,
    group_by=["catchment_id"],
    order_by=["observed_average"],
    filters=[
        {
            "column": "catchment_id",
            "operator": "like",
            "value": CATCHMENT_ID_PATTERN
        },
        {
            "column": "reference_time",
            "operator": "=",
            "value": REFERENCE_TIME
        },
    ]
)
# Echo the generated SQL, then run it; `df` is merged onto the basins below.
print(query)
df = duckdb.query(query).to_df()
df

In [None]:
# Join the query results onto the basin polygons. With the default inner
# merge, only basins whose `huc10` matches a `catchment_id` in `df` are kept.
gdf_map = basins_gdf.merge(df, left_on="huc10", right_on="catchment_id")

# Optional extra filter to CA (catchment ids starting with "18"):
# gdf_map = gdf_map.loc[gdf_map["catchment_id"].str.startswith("18")]

# Choropleth of the `max_forecast_delta` column. The figure is titled so it
# can be read on its own, and the trailing semicolon hides the text repr.
ax = gdf_map.plot(column="max_forecast_delta", legend=True)
ax.set_title("max_forecast_delta by basin");

In [None]:
# Ask the project's query builder for the joined-timeseries SQL, restricted to
# a single forecast reference time and a single catchment, then run it.
query = queries.get_joined_catchment_timeseries(
    config.MEDIUM_RANGE_FORCING_PARQUET,
    config.FORCING_ANALYSIS_ASSIM_PARQUET,
    filters=[
        dict(column="reference_time", operator="=", value="2023-01-03 12:00:00"),
        dict(column="catchment_id", operator="=", value="1802000502"),
    ],
)

# `df` is reused by the plotting cell that follows.
df = duckdb.query(query).to_df()
df

In [None]:
# Compare forecast and observed values per `value_time` as grouped bars.
# Both series are drawn in a single call so the bars sit side by side; two
# separate `plot.bar` calls on the same axes would paint the second series
# over the first and hide part of it.
fig, ax = plt.subplots(figsize=(20, 10))
df.plot.bar(
    x="value_time",
    y=["forecast_value", "observed_value"],
    color=["blue", "orange"],
    ax=ax,
)
ax.set_xlabel("value_time")
ax.set_ylabel("value")
ax.set_title("Forecast vs. observed values");