## Overview

This notebook is used to retrieve USGS gauge data for the Delaware River Basin (DRB). The HyRiver suite is used to query and retrieve the data.

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd

# From the HyRiver suite (pygeohydro and pynhd), import data acquisition tools
from pygeohydro import NWIS
import pynhd as pynhd

In [2]:
## Specifications
# When True, gauges located outside the DRB boundary polygon are discarded.
filter_drb = True

# Bounding box sent to the NWIS site query as the "bBox" parameter.
# NOTE(review): values look like (min long, min lat, max long, max lat) -- confirm.
bbox = (
    -77.8, 37.5,
    -74.0, 44.0,
)

# (start, end) dates. NOTE(review): this tuple is reassigned in the streamflow
# retrieval cell before it is first used.
dates = (
    "1980-01-01",
    "2022-12-31",
)

def filter_drb_sites(x, sdir = '../Pywr-DRB/DRB_spatial/DRB_shapefiles', crs = 4326):
    """Filters USGS gauge data to remove gauges outside the DRB boundary.

    Args:
        x (pd.DataFrame): A dataframe with gauges including columns "long" and "lat" with location data.
        sdir (str, optional): The location of the folder containing the DRB shapefile
            (`drb_bnd_polygon.shp`).
        crs (int, optional): EPSG code describing the "long"/"lat" values in `x`. The DRB
            boundary is reprojected to this CRS before clipping. Defaults to 4326
            (WGS 84 longitude/latitude).
    Returns:
        gpd.GeoDataFrame: The rows of `x` whose point lies within the DRB boundary, with a
            "geometry" column holding the gauge location.
    """
    # Read the basin outline and express it in the same CRS as the gauge coordinates.
    drb_border = gpd.read_file(f'{sdir}/drb_bnd_polygon.shp').to_crs(crs)

    # Build point geometries from the decimal-degree coordinates. The CRS given here only
    # labels the coordinates, so it must be the CRS the values are actually expressed in.
    gauge_points = gpd.points_from_xy(x.long, x.lat, crs = crs)
    x_all = gpd.GeoDataFrame(x, geometry = gauge_points)

    # Keep only the gauges that fall inside the basin polygon.
    return gpd.clip(x_all, drb_border)

In [3]:
# Open a client for the USGS National Water Information System (NWIS)
nwis = NWIS()
print("Initialized")

# Build the site query: every site inside the bounding box that has daily-value ("dv") data
query_request = {
    "bBox": ",".join(f"{b:.06f}" for b in bbox),
    "hasDataTypeCd": "dv",
    "outputDataTypeCd": "dv",
}

# Fetch the site table, then keep only stream-type sites ('ST', 'ST-TS') that report
# parameter code 00060  # https://help.waterdata.usgs.gov/parameter_cd?group_cd=PHY
query_result = (
    nwis.get_info(query_request, expanded=False, nhd_info=False)
    .query("site_tp_cd in ('ST','ST-TS')")
    .loc[lambda sites: sites.parm_cd == '00060']
    .reset_index(drop=True)
)

# Unique site numbers among the remaining rows.
# NOTE(review): no date filter is applied in this cell, although the message below
# mentions a date range.
stations = list(set(query_result["site_no"].tolist()))
print(f"Gage data gathered, {len(stations)} USGS streamflow gauges found in date range.")


Initialized
Gage data gathered, 1391 USGS streamflow gauges found in date range.


In [4]:
### Location data (long,lat)
# Keep the site number, coordinates and period of record; shorten the coordinate
# column names and index the table by site number.
gage_data = (
    query_result[['site_no', 'dec_long_va', 'dec_lat_va', 'begin_date', 'end_date']]
    .rename(columns = {'dec_long_va': 'long', 'dec_lat_va': 'lat'})
    .set_index('site_no')
)

# Take just locations in the DRB
if filter_drb:
    gage_data = filter_drb_sites(gage_data)

# A site number can occur on several rows; keep the first row for each site.
gage_data = gage_data.loc[~gage_data.index.duplicated(keep = 'first')]
stations = gage_data.index.tolist()
print(f'{len(stations)} streamflow gauges after filtering.')


366 streamflow gauges after filtering.


### NLDI Basin Characteristic Retrieval

When estimating flows at ungauged locations, we want to use only un-managed streamflow data as reference data.  Thus, we want to remove observations from locations which are impacted by dams.  We can use the `pynhd` package to retrieve NLDI basin characteristics which tell us if a particular gauge contains dams in the basin.  

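There are four catchment characteristics in the NLDI data that provide indication of reservoir operations. These are listed in the code cell below (`reservoir_characteristics`). We can then use the `pynhd.NLDI()` module to access these characteristics for all DRB sites and remove any sites which have dams upstream. Note that the characteristics are requested with `char_type="local"`, i.e. for the local catchment of each gauge's COMID; if dams anywhere in the upstream drainage area should be excluded, the total-accumulated (`"tot"`) characteristics may be the more appropriate choice.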

In [5]:
# Client for the NLDI service, plus the characteristics it reports as valid
nldi = pynhd.NLDI()
all_characteristics = nldi.valid_characteristics

# Catchment characteristics used below as indicators of reservoirs / dams
reservoir_characteristics = [
    'CAT_NID_STORAGE2013',
    'CAT_NDAMS2013',
    'CAT_MAJOR2013',
    'CAT_NORM_STORAGE2013',
]


In [8]:
# Look up, for every gauge, the COMID and reach code returned by NLDI for the gauge's
# coordinates, together with the coordinates of the returned feature.
gage_comid = pd.DataFrame(index = gage_data.index, columns=['comid', 'reachcode', 'comid-long', 'comid-lat'])
for st in gage_data.index:
    coords = (gage_data.loc[st, 'long'], gage_data.loc[st, 'lat'])
    try:
        found = nldi.comid_byloc(coords)
        # Collect all four values before writing any of them, so that a failure part-way
        # through leaves the whole row empty rather than partially filled.
        # Positional access (.values[0] / .iloc[0]) takes the first returned feature
        # regardless of how the result is indexed.
        row = {
            'comid': found.comid.values[0],
            'reachcode': found.reachcode.values[0],
            'comid-long': found.geometry.x.iloc[0],
            'comid-lat': found.geometry.y.iloc[0],
        }
    except Exception:
        # Best effort: report the site and move on. `Exception` (rather than a bare
        # `except`) lets KeyboardInterrupt / SystemExit stop the loop.
        print(f'Error getting COMID for site {st}')
        continue

    for column, value in row.items():
        gage_comid.loc[st, column] = value


Error getting COMID for site 01478185
Error getting COMID for site 01412080
Error getting COMID for site 01453500
Error getting COMID for site 01455200
Error getting COMID for site 01420500
Error getting COMID for site 01418000


In [10]:
# Attach the COMID lookup columns to the gauge table. COMID columns left over from an
# earlier run of this cell are dropped first, so re-running the cell does not create
# duplicate columns (which would break the integer cast below).
gage_data = pd.concat(
    [gage_data.drop(columns = gage_comid.columns, errors = 'ignore'), gage_comid],
    axis=1,
)

# Remove every row with a missing value in any column; this discards the sites for
# which no COMID was found.
gage_data = gage_data.dropna(axis=0)
gage_data["comid"] = gage_data["comid"].astype('int')
gage_data.head()

Unnamed: 0_level_0,long,lat,begin_date,end_date,geometry,comid,reachcode,comid-long,comid-lat
site_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
14806318,-75.833278,39.90733,1997-08-15,1998-10-29,POINT (-75.83328 39.90733),4651858,2040205000224,-75.833032,39.907284
1480629,-75.823,39.929552,2020-02-14,2023-05-17,POINT (-75.82300 39.92955),4651854,2040205000095,-75.822734,39.929434
1480617,-75.801334,39.961773,1970-01-01,2023-05-17,POINT (-75.80133 39.96177),4650580,2040205000082,-75.80083,39.961744
1480500,-75.827447,39.985661,1943-10-01,2023-05-17,POINT (-75.82745 39.98566),932040162,2040205000083,-75.827471,39.985257
1480400,-75.844945,40.027326,1995-02-01,2023-05-17,POINT (-75.84494 40.02733),4648544,2040205000769,-75.844522,40.027197


In [11]:
## Retrieve the reservoir-related characteristics for the COMID of every gauge
cat_chars = nldi.getcharacteristic_byid(
    gage_data["comid"],
    fsource = 'comid',
    char_type = "local",
    char_ids = reservoir_characteristics,
)



In [13]:
# Move the index of the characteristics table into a column and label all columns.
# NOTE(review): the labels are assigned by position, so this assumes the service
# returned the four characteristics in exactly this order -- confirm.
cat = cat_chars.reset_index().set_axis(
    [
        'comid',
        'CAT_MAJOR2013',
        'CAT_NDAMS2013',
        'CAT_NID_STORAGE2013',
        'CAT_NORM_STORAGE2013',
    ],
    axis = 1,
)


In [15]:
# Attach the catchment characteristics to each gauge by COMID.
# Joining on the "comid" column keeps gage_data's own index on every output row, so the
# site labels stay attached to the right rows even if some COMID has no characteristics
# (an inner join drops such rows) instead of being re-assigned by position afterwards.
gage_with_cat_chars = gage_data.join(cat.set_index("comid"), on = "comid", how = "inner")

In [17]:
## Filter sites that have reservoirs upstream
# A site counts as managed when its reservoir characteristics sum to more than zero.
managed_sites = [
    st
    for st in gage_data.index
    if gage_with_cat_chars.loc[st, reservoir_characteristics].sum() > 0
]
gage_data = gage_data.drop(managed_sites)
print(f'{gage_data.shape[0]} gauges kept after removing managed streamflows.')

307 gauges kept after removing managed streamflows.


In [19]:
# Write the filtered gauge metadata table to a comma-separated file
gage_data.to_csv(
    './data/drb_usgs_metadata.csv',
    sep = ',',
)

In [47]:
# Display the gauge metadata table as it stands after the filtering steps above
gage_data

Unnamed: 0_level_0,long,lat,begin_date,end_date,geometry,comid,reachcode,comid-long,comid-lat
site_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
014806318,-75.833278,39.907330,1997-08-15,1998-10-29,POINT (-75.83328 39.90733),4651858,02040205000224,-75.833032,39.907284
01480629,-75.823000,39.929552,2020-02-14,2023-05-17,POINT (-75.82300 39.92955),4651854,02040205000095,-75.822734,39.929434
01480617,-75.801334,39.961773,1970-01-01,2023-05-17,POINT (-75.80133 39.96177),4650580,02040205000082,-75.80083,39.961744
01480500,-75.827447,39.985661,1943-10-01,2023-05-17,POINT (-75.82745 39.98566),932040162,02040205000083,-75.827471,39.985257
01480300,-75.860774,40.072879,1960-06-01,2023-05-17,POINT (-75.86077 40.07288),4648694,02040205000089,-75.860697,40.072277
...,...,...,...,...,...,...,...,...,...
01425500,-75.392778,42.161111,1934-10-01,1968-09-29,POINT (-75.39278 42.16111),2613640,02040101000652,-75.392961,42.160811
0142400103,-75.279444,42.173583,1952-10-01,2023-05-17,POINT (-75.27944 42.17358),2614018,02040101000152,-75.279189,42.173134
01425675,-75.440000,42.174444,1969-10-01,1981-09-29,POINT (-75.44000 42.17444),2613472,02040101000185,-75.440499,42.17453
01424000,-75.278785,42.177862,1952-10-01,1967-06-30,POINT (-75.27878 42.17786),2614018,02040101000152,-75.278995,42.177498


### Retrieve streamflow



In [37]:
# Period requested from NWIS.
# NOTE(review): this replaces the `dates` tuple set in the specifications cell.
dates = ('1900-01-01', '2022-12-31')

# Request streamflow for every gauge that survived the filtering steps
stations = gage_data.index
nwis = NWIS()
Q = nwis.get_streamflow(stations, dates)

# Export all data to CSV; the file name carries the start and end year of `dates`
Q.to_csv(
    f'./data/historic_unmanaged_streamflow_{dates[0][:4]}_{dates[1][:4]}_cms.csv',
    sep = ',',
)

In [50]:
# Inspect the metadata and catchment characteristics of a single site
gage_with_cat_chars.loc["01436000"]

long                                   -74.635556
lat                                         41.82
begin_date                    1941-10-01 00:00:00
end_date                      2023-05-17 00:00:00
geometry                POINT (-74.6355556 41.82)
comid                                     4147432
reachcode                          02040104000138
comid-long                             -74.635177
comid-lat                               41.820199
CAT_MAJOR2013                                 0.0
CAT_NDAMS2013                                 0.0
CAT_NID_STORAGE2013                           0.0
CAT_NORM_STORAGE2013                          0.0
Name: 01436000, dtype: object