In [1]:
from dataretrieval import nwis
import geopandas as gpd
from pathlib import Path
import numpy as np
import pandas as pd
import json



In [2]:
# Project root, given relative to the notebook's working directory.
proj_dir = Path("../../../..")

# Directory that holds the in-situ temperature data; create it (and any
# missing parent directories) so a fresh checkout does not fail here.
data_dir = proj_dir / "Data/InSituTemperature"
data_dir.mkdir(parents=True, exist_ok=True)

# load metadata (the context manager closes the file handle after reading)
with Path(data_dir, "processed", "metadata.json").open("r") as metadata_file:
    metadata = json.load(metadata_file)

# Columns of the downloaded records that are kept, mapped to the column
# names used in the output files.
target_parameters = {
    "00010_Maximum": "max water temperature (C)",
    "00010_Minimum": "min water temperature (C)",
    "00010_Mean": "avg water temperature (C)",
    "00060_Mean": "avg discharge (cfs)",
}

# Human-readable description stored in the metadata for each output column.
parameter_codes = {
    "max water temperature (C)": "Maximum water temperature, degrees Celsius",
    "min water temperature (C)": "Minimum water temperature, degrees Celsius",
    "avg water temperature (C)": "Mean water temperature, degrees Celsius",
    "avg discharge (cfs)": "Discharge, cubic feet per second",
    "avg discharge (m3/d)": "Discharge, cubic meters per day",
}

# Period requested from the web service (used for the site search and the
# data download).
startDt = "1999-01-01"
endDt = "2023-10-30"

In [3]:
# Read the basin boundary from the shapefile
fn = Path(proj_dir, "Data/GIS/shapefiles/CRBSingle.shp")
gdf = gpd.read_file(fn)

# Bounding box of the first feature, cut into a 4 x 4 grid (5 edges per axis)
xmin, ymin, xmax, ymax = gdf.bounds.values[0]
x = np.linspace(xmin, xmax, 5)
y = np.linspace(ymin, ymax, 5)

# One [xmin, ymin, xmax, ymax] list per grid cell, rounded to 6 decimals.
# Cells are ordered with x as the outer index and y as the inner index.
bb = [
    list(np.array([x_low, y_low, x_high, y_high]).round(6))
    for x_low, x_high in zip(x[:-1], x[1:])
    for y_low, y_high in zip(y[:-1], y[1:])
]

In [4]:
# Query the sites inside each bounding box (parameter code "00010") and
# collect one DataFrame per box.
siteList = []
for box_index, box in enumerate(bb):
    try:
        box_sites = nwis.what_sites(
            bBox=box, startDt=startDt, endDt=endDt, parameterCd='00010'
        )[0]
    except Exception as err:
        # Best effort: a box whose query fails is skipped, but the failure is
        # reported instead of being silently discarded. Catching Exception
        # (not a bare except) lets KeyboardInterrupt stop the cell.
        print(
            f"Skipping bounding box {box_index} "
            f"{[float(v) for v in box]}: {err!r}"
        )
        continue
    siteList.append(box_sites)

In [5]:
# Stack the per-box site tables into a single table with a fresh index
siteList_df = pd.concat(siteList, ignore_index=True)

# Turn the decimal longitude/latitude columns into point geometries
site_points = gpd.points_from_xy(
    siteList_df['dec_long_va'], siteList_df['dec_lat_va']
)
filtered_sites = gpd.GeoDataFrame(
    siteList_df, geometry=site_points, crs='epsg:4326'
)

# Keep only the sites that fall within the first geometry of the shapefile
basin_geometry = gdf.geometry[0]
is_within_basin = filtered_sites.within(basin_geometry)
sites_within_basin = filtered_sites[is_within_basin]

In [6]:
# # save sites_within_basin to a csv file
# sites_within_basin.to_csv(Path(data_dir, "processed", "sites_within_basin2.csv"), index=False)

# Download the data for the filtered sites


In [7]:
# Discharge conversion factors: cubic feet -> cubic metres, and seconds -> days.
M3_PER_FT3 = 0.0283168
SECONDS_PER_DAY = 86400

processed_dir = Path(data_dir, "processed")

# Station entries live under metadata["stations"]; create the container if
# the loaded metadata does not have one yet.
stations = metadata.setdefault("stations", {})

# Download every site once, even if it appears more than once in the table,
# and read each site's descriptive columns in a single pass instead of
# re-filtering the table for every field.
site_columns = ["site_no", "station_nm", "dec_lat_va", "dec_long_va"]
site_table = sites_within_basin.drop_duplicates(subset="site_no")[site_columns]

for site, station_name, latitude, longitude in site_table.itertuples(
    index=False, name=None
):
    # Any download or processing error propagates and stops the cell.
    site_data = nwis.get_record(sites=site, service="dv", start=startDt, end=endDt)

    # Map the raw column names that are present to the output column names.
    column_dict = {
        col: target_parameters[col]
        for col in site_data.columns
        if col in target_parameters
    }
    site_data = site_data.rename_axis("date").rename(columns=column_dict)

    # Add discharge in cubic metres per day alongside the cfs column.
    if "avg discharge (cfs)" in site_data.columns:
        site_data["avg discharge (m3/d)"] = (
            site_data["avg discharge (cfs)"] * M3_PER_FT3 * SECONDS_PER_DAY
        )
        column_dict["avg discharge (m3/d)"] = "avg discharge (m3/d)"

    # Sites without any of the target columns produce no file and no metadata.
    if not column_dict:
        continue

    output_columns = list(column_dict.values())
    site_data[output_columns].to_csv(
        processed_dir / f"USGS_{site}.csv", index=True
    )

    # update metadata (existing entries are updated in place)
    site_key = "USGS_" + site
    station = stations.setdefault(site_key, {})

    # Cast coordinates to built-in floats so the JSON encoder never sees
    # numpy scalar types.
    latitude = float(latitude)
    longitude = float(longitude)

    station["source"] = "USGS"
    station["id"] = site
    station["description"] = station_name
    station["latitude"] = latitude
    station["longitude"] = longitude

    # Add parameters that are not listed yet; entries already present for
    # this station are left untouched.
    parameters = station.setdefault("parameters", {})
    for key in output_columns:
        parameters.setdefault(key, parameter_codes[key])

    station["geometry"] = {
        "type": "Point",
        "coordinates": [longitude, latitude],
    }

# add last updated date and last updated by
metadata["last_updated"] = pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
metadata["last_updated_by"] = "George Darkwah"
metadata["last_updated_by_email"] = "gdarkwah@uw.edu"

# save metadata; serialise before opening the file so a serialisation error
# cannot leave a truncated metadata.json behind
metadata_text = json.dumps(metadata, indent=4)
with open(processed_dir / "metadata.json", "w") as f:
    f.write(metadata_text)