# Data from [Columbia Basin Research](https://www.cbr.washington.edu/)


## River Environment Data

Outflow (kcfs), Spill (kcfs), Spill Percent (%), Inflow (kcfs), Temp (Scroll Case) (C), Temperature (C), Barometric Pressure (mmHg), Dissolved Gas (mmHg), Dissolved Gas Percent (%), Turbidity (ft), Elevation (ft)


In [1]:
import os
import csv
import requests
from datetime import datetime
import pandas as pd
import json
from pathlib import Path
import numpy as np



In [29]:
# Project root, given relative to the notebook's working directory.
proj_dir = Path("../../../..")

# Directory that holds the raw and processed in-situ temperature data.
data_dir = proj_dir / "Data/InSituTemperature"

# load metadata
# The context manager closes the file handle as soon as parsing finishes
# instead of leaving it open until the garbage collector gets to it.
with Path(data_dir, "processed", "metadata copy.json").open("r") as metadata_file:
    metadata = json.load(metadata_file)

# Table of DART stations read from the project's data-retrieval folder.
dart_stations_metadata = pd.read_csv(
    Path(proj_dir, "Methods/data_retrieval/cbr_dart/dart_metadata.csv")
)

In [3]:
# dart_stations_metadata["Abbrev"]

In [13]:
# format the url for the request
def format_url(proj: str, startDate: str, endDate: str):
    """Build the DART daily river-data CSV request url for one project.

    Only the year of ``startDate`` is placed in the query; ``endDate``
    contributes just its month and day, so both dates are expected to fall
    within the same calendar year.

    Args:
        proj (str): abbreviated form of the project name
        startDate (str): start date of the query [YYYY-MM-DD]
        endDate (str): end date of the query [YYYY-MM-DD]
    Returns:
        url (str): formatted url for the request
    """
    # parse both ends of the range; malformed dates raise ValueError here
    begin = datetime.strptime(startDate, "%Y-%m-%d")
    finish = datetime.strptime(endDate, "%Y-%m-%d")

    # placeholders, in order: year, project, start month/day, end month/day
    # (month and day are sent unpadded, joined by a url-encoded "/")
    template = "https://www.cbr.washington.edu/dart/cs/php/rpt/river_daily.php?sc=1&outputFormat=csv&year={}&proj={}&span=no&startdate={}%2F{}&enddate={}%2F{}"

    return template.format(
        begin.year, proj, begin.month, begin.day, finish.month, finish.day
    )

In [14]:
# get the data from the url and convert it to a csv
def get_data(proj: str, startDate: str, endDate: str, path: str):
    """Download the daily DART data of one project into a single csv file.

    The date range is split into one request per calendar year, because the
    url built by ``format_url`` carries a single year. The rows of all years
    are written to ``<path>/raw/dart/DART_<PROJ>.csv`` (project name
    upper-cased) and the header row is written only once. A year is skipped
    when its response is an HTTP error, is empty, or is an HTML page instead
    of csv text.

    Args:
        proj (str): abbreviated form of the project name
        startDate (str): start date of the query [YYYY-MM-DD]
        endDate (str): end date of the query [YYYY-MM-DD]
        path (str): path to the directory where the csv file will be saved
    Returns:
        None
    Raises:
        requests.exceptions.RequestException: if a request cannot be
            completed (connection failure or timeout)
    """

    # capitalize the project name
    proj = proj.upper()

    # get start year and end year
    startYear = datetime.strptime(startDate, "%Y-%m-%d").year
    endYear = datetime.strptime(endDate, "%Y-%m-%d").year

    header_written = False

    # create a csv file for the data by adding all the data from each year
    with open(
        os.path.join(path, "raw/dart", "DART_{}.csv".format(proj)), "w", newline=""
    ) as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        for year in range(startYear, endYear + 1):
            # clip the first and the last year to the requested dates; every
            # year in between is requested in full
            year_start = startDate if year == startYear else "{}-01-01".format(year)
            year_end = endDate if year == endYear else "{}-12-31".format(year)
            url = format_url(proj, year_start, year_end)

            # a timeout keeps one unresponsive request from hanging the run
            response = requests.get(url, timeout=120)
            if not response.ok:
                # HTTP error status: there is no csv payload for this year
                continue

            data = response.text.splitlines()

            # an empty body or an HTML page means no data for this year
            if not data or data[0].lstrip().lower().startswith(
                ("<!doctype", "<html")
            ):
                continue

            # take off the lines from the one that begins with 'Notes:' onwards
            for i, line in enumerate(data):
                if line.startswith("Notes:"):
                    data = data[:i]
                    break
            if not data:
                # nothing but notes: do not count this as a written header
                continue

            # write the data to the csv file but don't repeat the header row
            if header_written:
                data = data[1:]
            writer.writerows(csv.reader(data))
            header_written = True

In [15]:
# postprocess the downloaded data
def postprocess_data(proj: str, path: str, grand_id: str = None):
    """De-duplicate a raw DART download and save a unit-converted copy.

    Reads ``<path>/raw/dart/DART_<PROJ>.csv`` (project name upper-cased),
    drops exact duplicate rows and rewrites that raw file. It then writes
    ``<path>/processed/DART_<proj>.csv`` with flows multiplied from kcfs to
    m3/d and elevation from ft to m. Columns that hold no values at all are
    left out of the processed file.

    Args:
        proj (str): abbreviated form of the project name
        path (str): directory that contains the ``raw/dart`` and
            ``processed`` folders
        grand_id (str, optional): currently unused
    Returns:
        pandas.Index: column names of the processed table
    Raises:
        KeyError: if the raw file lacks the date, outflow, inflow, spill or
            temperature column
    """
    raw_file = os.path.join(path, "raw/dart", "DART_{}.csv".format(proj.upper()))

    # remove duplicate rows and store the cleaned table back in the raw file
    df = pd.read_csv(raw_file).drop_duplicates()
    df.to_csv(raw_file, index=False)
    # re-read so the frame used below is exactly what is now on disk
    df = pd.read_csv(raw_file)

    new_df = pd.DataFrame()
    new_df["date"] = df["Date"]

    # kcfs -> m3/d: 0.0283168 m3 per ft3, 86400 s per day, 1000 cfs per kcfs
    flow_columns = (
        ("outflow (m3/d)", "Outflow (kcfs)"),
        ("inflow (m3/d)", "Inflow (kcfs)"),
        ("spill (m3/d)", "Spill (kcfs)"),
    )
    for target, source in flow_columns:
        new_df[target] = df[source] * 0.0283168 * 86400 * 1000

    new_df["avg water temperature (C)"] = df["Temperature (C)"]

    # the elevation column is optional; the tailwater elevation is used only
    # when the plain elevation column is absent (ft -> m)
    if "Elevation (ft)" in df.columns:
        new_df["elevation (m)"] = df["Elevation (ft)"] * 0.3048
    elif "Tailwater Elevation (ft)" in df.columns:
        new_df["tailwater elevation (m)"] = df["Tailwater Elevation (ft)"] * 0.3048

    # drop null columns
    new_df = new_df.dropna(axis=1, how="all")

    # save the data
    # NOTE(review): the processed file name uses ``proj`` as passed in, while
    # the raw file name is upper-cased; confirm callers always pass upper case.
    new_df.to_csv(
        os.path.join(path, "processed", "DART_{}.csv".format(proj)), index=False
    )

    return new_df.columns

In [18]:
# Maps each column name produced by postprocess_data to the description that
# is stored for that parameter in the station metadata.
parameter_dict = {
    "outflow (m3/d)": "Average outflow",
    "inflow (m3/d)": "Average inflow",
    "spill (m3/d)": "Average spill",
    "avg water temperature (C)": "Average water temperature",
    "elevation (m)": "Reservoir elevation",
    # NOTE(review): this description uses an underscore while the others use
    # spaces - confirm whether that is intentional.
    "tailwater elevation (m)": "Tailwater_elevation",
}

In [19]:
# get the data for each project
proj = dart_stations_metadata[
    "Abbrev"
]  # list of projects https://www.cbr.washington.edu/dart/metadata/river
# (assign a short list such as ['CCIW'] instead to process selected projects only)
startDate = "1999-01-01"
endDate = "2023-10-30"

# make sure the output folders exist before anything is written
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, "raw/dart"), exist_ok=True)
os.makedirs(os.path.join(data_dir, "processed"), exist_ok=True)

# download, post-process and register each project in the metadata
for p in proj:
    get_data(p, startDate, endDate, data_dir)
    parameters = postprocess_data(p, data_dir)

    station_id = p.upper()
    site_key = "DART_" + station_id

    # filter the station table once; every field below is taken from the
    # first matching row
    station_rows = dart_stations_metadata[
        dart_stations_metadata["Abbrev"] == station_id
    ]

    # update the metadata, creating the station entry on first sight
    station = metadata["stations"].setdefault(site_key, {})
    station["source"] = "CBR DART"
    station["id"] = station_id
    station["description"] = station_rows["Project Name"].values[0]

    # add descriptions for new parameters only; existing entries are kept.
    # The first column is the date, which is not a parameter.
    station_parameters = station.setdefault("parameters", {})
    for parameter in parameters[1:]:
        if parameter not in station_parameters:
            station_parameters[parameter] = parameter_dict[parameter]

    latitude = station_rows["Latitude"].values[0]
    longitude = station_rows["Longitude"].values[0]
    station["latitude"] = latitude
    station["longitude"] = longitude

    # point geometry with coordinates ordered [longitude, latitude]
    station["geometry"] = {
        "type": "Point",
        "coordinates": [longitude, latitude],
    }

In [30]:
# stamp the metadata with the time of this update and its author
metadata.update(
    {
        "last_updated": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
        "last_updated_by": "George Darkwah",
        "last_updated_by_email": "gdarkwah@uw.edu",
    }
)

# write the updated metadata into the processed-data folder
metadata_path = Path(data_dir, "processed", "metadata.json")
with metadata_path.open("w") as metadata_out:
    json.dump(metadata, metadata_out, indent=4)