# Data from [Columbia Basin Research](https://www.cbr.washington.edu/)


## River Environment Data

Outflow (kcfs), Spill (kcfs), Spill Percent (%), Inflow (kcfs), Temp (Scroll Case) (C), Temperature (C), Barometric Pressure (mmHg), Dissolved Gas (mmHg), Dissolved Gas Percent (%), Turbidity (ft), Elevation (ft)


In [1]:
import os
import csv
import requests
from datetime import datetime
import pandas as pd
import json
from pathlib import Path
import numpy as np
import time
import urllib.request



In [2]:
# project root (relative to the working directory) and the in-situ data locations
proj_dir = Path("../../../..")
data_dir = Path(proj_dir, "data/insitu/conditions")

# both metadata files live under the same metadata directory
metadata_dir = proj_dir / "data/insitu/metadata"
stations_metadata_path = metadata_dir / "stations.csv"
stations_attributes_path = metadata_dir / "dictionaries/stations_attributes.csv"

# attribute dictionary for the stations metadata table
stations_attributes = pd.read_csv(stations_attributes_path)

In [3]:
# DART station listing, read from the working directory
dart_stations_metadata = pd.read_csv(Path("dart_metadata.csv"))

# on first run, create an empty stations metadata file whose columns are the
# attribute names listed in the attribute dictionary
if not os.path.exists(stations_metadata_path):
    pd.DataFrame(columns=stations_attributes["Attribute_name"]).to_csv(
        stations_metadata_path, index=False
    )

# always work from the on-disk copy
stations_metadata = pd.read_csv(stations_metadata_path)

In [4]:
# format the url for the request
def format_url(proj: str, startDate: str, endDate: str):
    """Build the DART river-daily CSV request URL for one project within a single year.

    Args:
        proj (str): abbreviated form of the project name
        startDate (str): first day of the query [YYYY-MM-DD]
        endDate (str): last day of the query [YYYY-MM-DD]
    Returns:
        url (str): request URL; the year is taken from startDate only, and the
            span is encoded as month/day pairs (the "/" is sent as %2F)
    """
    first_day = datetime.strptime(startDate, "%Y-%m-%d")
    last_day = datetime.strptime(endDate, "%Y-%m-%d")

    endpoint = "https://www.cbr.washington.edu/dart/cs/php/rpt/river_daily.php"
    query = (
        f"sc=1&outputFormat=csv&year={first_day.year}&proj={proj}&span=no"
        f"&startdate={first_day.month}%2F{first_day.day}"
        f"&enddate={last_day.month}%2F{last_day.day}"
    )

    return f"{endpoint}?{query}"

In [5]:
# get the data from the url and convert it to a csv
def get_data(proj: str, startDate: str, endDate: str, path: str):
    """Download daily DART river data for one project into a single raw CSV.

    The date range is queried one calendar year at a time (the URL built by
    ``format_url`` carries a single year). The yearly responses are
    concatenated into ``<path>/raw/dart/DART_<PROJ>.csv`` with one header row.
    The file is opened in write mode, so an existing file is overwritten.
    Years whose response is empty, an error status, or an HTML page are skipped.

    Args:
        proj (str): abbreviated form of the project name (upper-cased here)
        startDate (str): start date of the query [YYYY-MM-DD]
        endDate (str): end date of the query [YYYY-MM-DD]
        path (str): directory that contains the ``raw/dart`` output folder
    Returns:
        None
    """
    proj = proj.upper()

    startYear = datetime.strptime(startDate, "%Y-%m-%d").year
    endYear = datetime.strptime(endDate, "%Y-%m-%d").year

    # True until the first usable response has been written (with its header)
    first_data = True

    out_path = os.path.join(path, "raw/dart", "DART_{}.csv".format(proj))
    with open(out_path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        for year in range(startYear, endYear + 1):
            # clip the first and last year to the requested dates; years in
            # between are requested in full
            year_start = startDate if year == startYear else "{}-01-01".format(year)
            year_end = endDate if year == endYear else "{}-12-31".format(year)
            url = format_url(proj, year_start, year_end)

            try:
                response = requests.get(url)
            except requests.ConnectionError:
                # wait 20-59 s and retry once; a second failure propagates
                time.sleep(np.random.randint(20, 60))
                response = requests.get(url)

            # an error status carries no usable data for this year
            if not response.ok:
                continue

            data = response.text.splitlines()
            # an empty body would otherwise fail on data[0]
            if not data:
                continue

            # the service answers with an HTML page instead of CSV for some
            # requests; such a response carries no rows
            first_line = data[0].lstrip().lower()
            if first_line.startswith(("<!doctype html", "<html")):
                continue

            # keep only the lines before the trailing 'Notes:' section
            notes_at = next(
                (i for i, line in enumerate(data) if line.startswith("Notes:")),
                len(data),
            )
            data = data[:notes_at]
            if not data:
                continue

            # write the header row only once, with the first usable response
            rows = data if first_data else data[1:]
            writer.writerows(csv.reader(rows))
            first_data = False

In [6]:
# postprocess the downloaded data
def postprocess_data(proj: str, path: str, grand_id: str = None):
    """Convert a raw DART download to metric units and merge it into the processed CSV.

    Reads ``<path>/raw/dart/DART_<PROJ>.csv`` (project upper-cased), removes
    duplicate rows and writes the de-duplicated table back to the same raw
    file. Flow columns are converted from kcfs to m3/d and elevation from ft
    to m. If ``<path>/processed/DART_<proj>.csv`` already exists, its rows are
    kept and only dates not yet present are added. Columns that are entirely
    null are dropped before saving.

    Args:
        proj (str): abbreviated form of the project name
        path (str): directory that contains the ``raw/dart`` and ``processed`` folders
        grand_id (str): currently unused; kept for backward compatibility
    Returns:
        pandas.Index: column names of the saved processed table
    """
    # NOTE: the processed file name uses `proj` as given, while the raw file
    # name uses the upper-cased abbreviation
    processed_path = os.path.join(path, "processed", "DART_{}.csv".format(proj))
    raw_path = os.path.join(path, "raw/dart", "DART_{}.csv".format(proj.upper()))

    # if processed data exists, read it in
    if os.path.exists(processed_path):
        df_existing = pd.read_csv(processed_path)
        df_existing["date"] = pd.to_datetime(df_existing["date"])
    else:
        df_existing = pd.DataFrame()

    # read the raw data, drop repeated rows and persist the cleaned raw file
    df = pd.read_csv(raw_path).drop_duplicates().reset_index(drop=True)
    df.to_csv(raw_path, index=False)

    new_df = pd.DataFrame()
    new_df["date"] = pd.to_datetime(df["Date"])

    # kcfs -> m3/d: 0.0283168 m3 per ft3, 86400 s per day, 1000 cfs per kcfs
    flow_columns = (
        ("outflow(m3/d)", "Outflow (kcfs)"),
        ("inflow(m3/d)", "Inflow (kcfs)"),
        ("spill(m3/d)", "Spill (kcfs)"),
    )
    for target, source in flow_columns:
        new_df[target] = df[source] * 0.0283168 * 86400 * 1000

    new_df["avg_temp(C)"] = df["Temperature (C)"]

    # elevation is optional: prefer the 'Elevation (ft)' column, otherwise
    # fall back to the tailwater elevation; ft -> m
    if "Elevation (ft)" in df.columns:
        new_df["wse(m)"] = df["Elevation (ft)"] * 0.3048
    elif "Tailwater Elevation (ft)" in df.columns:
        new_df["wse_tail(m)"] = df["Tailwater Elevation (ft)"] * 0.3048

    # release the raw table before the merge
    del df

    # merge with existing data; for a repeated date the existing row is kept
    if not df_existing.empty:
        new_df = pd.concat([df_existing, new_df], ignore_index=True)
        new_df.drop_duplicates(subset=["date"], inplace=True)

    # drop null columns
    new_df = new_df.dropna(axis=1, how="all")

    # save the data
    new_df.to_csv(processed_path, index=False)

    return new_df.columns

In [7]:
# function to break the dates into 5 year intervals
def date_breaks(start: str, end: str, interval: int = 5):
    """Break a date range into consecutive, non-overlapping blocks of whole years.

    Blocks are aligned to calendar years: the first block starts on January 1
    of the start year and the last block ends on December 31 of the end year,
    regardless of the month and day given in ``start`` and ``end``. Every
    block spans ``interval`` years except possibly the last, which is cut off
    at the end year.

    Args:
        start (str): start date of the date range [YYYY-MM-DD]
        end (str): end date of the date range [YYYY-MM-DD]
        interval (int): number of years per block (default 5)
    Returns:
        date_ranges (list): list of (block_start, block_end) tuples of
            [YYYY-MM-DD] strings; empty if the end year precedes the start year
    Raises:
        ValueError: if ``interval`` is smaller than 1 or a date is malformed
    """
    if interval < 1:
        raise ValueError("interval must be a positive number of years")

    startYear = datetime.strptime(start, "%Y-%m-%d").year
    endYear = datetime.strptime(end, "%Y-%m-%d").year

    # endYear + 1 so the final year is always covered; each block ends
    # interval - 1 years after it starts, capped at the end year
    return [
        (
            "{}-01-01".format(year),
            "{}-12-31".format(min(year + interval - 1, endYear)),
        )
        for year in range(startYear, endYear + 1, interval)
    ]

In [8]:
# projects to download; list of projects https://www.cbr.washington.edu/dart/metadata/river
proj = dart_stations_metadata["Abbrev"]
# proj = ["BON", 'IHR', 'JDA']
# grand_id = [297,  338, '338_tail', 299, '299_tail' ]
startDate = "1982-01-01"
# endDate = "2024-09-18"
endDate = datetime.today().strftime("%Y-%m-%d")

# date ranges built with a 1-year step (not used by the loop below, which
# requests the full span for every project)
date_ranges = date_breaks(startDate, endDate, 1)

# make sure the output directories exist
for out_dir in (
    data_dir,
    os.path.join(data_dir, "raw/dart"),
    os.path.join(data_dir, "processed"),
):
    os.makedirs(out_dir, exist_ok=True)

# URL recorded in the metadata as the source of the downloaded data
dart_query_url = "https://www.cbr.washington.edu/dart/query/river_daily"

# download, post-process and register every project
for p in proj:
    print(f"Processing {p} data")
    get_data(p, startDate, endDate, data_dir)
    parameters = postprocess_data(p, data_dir)

    station_ID = "DART_" + p.upper()

    # register the station the first time it is seen
    if station_ID not in stations_metadata["station_ID"].values:
        dart_row = dart_stations_metadata[
            dart_stations_metadata["Abbrev"] == p.upper()
        ]
        new_station = pd.DataFrame(
            {
                "station_ID": [station_ID],
                "id_at_source": [p.upper()],
                "available_data": ["{}"],
                "source_URL": ['{"url" : []}'],
                "description": [dart_row["Project Name"].values[0]],
                "latitude": [dart_row["Latitude"].values[0]],
                "longitude": [dart_row["Longitude"].values[0]],
                "site_params": ["{}"],
            }
        )
        stations_metadata = pd.concat(
            [stations_metadata, new_station], ignore_index=True
        )

    # row selector for this station (computed after a possible append)
    is_station = stations_metadata["station_ID"] == station_ID

    # record the source url once
    source_url = json.loads(
        stations_metadata.loc[is_station, "source_URL"].values[0]
    )
    if dart_query_url not in source_url["url"]:
        source_url["url"].append(dart_query_url)
        stations_metadata.loc[is_station, "source_URL"] = json.dumps(source_url)

    # add the processed columns (all but the leading date column) to the
    # station's "conditions" list, without repeating entries
    available_data = json.loads(
        stations_metadata.loc[is_station, "available_data"].values[0]
    )
    conditions = available_data.setdefault("conditions", [])
    for column in parameters[1:]:
        if column not in conditions:
            conditions.append(column)
    stations_metadata.loc[is_station, "available_data"] = json.dumps(available_data)

    print(f"Finished processing {p} data")
    # # sleep for a random time between 30 to 60 seconds
    # time.sleep(np.random.randint(30, 60))

    # save the metadata after every project so progress is kept
    stations_metadata.to_csv(stations_metadata_path, index=False)

Processing ALF data
Finished processing ALF data
Processing BON data
Finished processing BON data
Processing CCIW data
Finished processing CCIW data
Processing CIBW data
Finished processing CIBW data
Processing CHJ data
Finished processing CHJ data
Processing CHQW data
Finished processing CHQW data
Processing CWMW data
Finished processing CWMW data
Processing DWR data
Finished processing DWR data
Processing DWQI data
Finished processing DWQI data
Processing GCL data
Finished processing GCL data
Processing GCGW data
Finished processing GCGW data
Processing HGH data
Finished processing HGH data
Processing HGHM data
Finished processing HGHM data
Processing IHR data
Finished processing IHR data
Processing IDSW data
Finished processing IDSW data
Processing JDA data
Finished processing JDA data
Processing JHAW data
Finished processing JHAW data
Processing LEWI data
Finished processing LEWI data
Processing LIB data
Finished processing LIB data
Processing LGS data
Finished processing LGS data


In [9]:
# record when, why and by whom the metadata was last updated
metadata_status = dict(
    last_updated=pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
    update_message="Updated the metadata to include the new DART stations",
    last_updated_by="George Darkwah",
    last_updated_by_email="gdarkwah@uw.edu",
)

# save the status
# NOTE(review): the content is written as JSON although the file name ends in .csv
metadata_status_path = proj_dir / "data/insitu/metadata/metadata_status.csv"
with metadata_status_path.open("w") as f:
    json.dump(metadata_status, f, indent=4)