In [1]:
import csv
import io
import json
import math
import os
import time
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import requests



In [2]:
# Project root, given relative to the notebook's working directory
proj_dir = Path("../..")

# Folder holding the in-situ data, plus its "processed" subfolder;
# both are created when missing (their parent must already exist)
data_dir = proj_dir / "insitu_data"
processed_data_dir = data_dir / "processed"
data_dir.mkdir(exist_ok=True)
processed_data_dir.mkdir(exist_ok=True)

# Where the stations table and the dictionary of its attributes live
stations_metadata_path = proj_dir / "insitu_data/metadata/stations.csv"
stations_attributes_path = proj_dir / "insitu_data/metadata/dictionaries/stations_attributes.csv"

# Dictionary tables; both are later read through their "Attribute_name" column
stations_attributes = pd.read_csv(stations_attributes_path)
conditions_data = pd.read_csv(proj_dir / "insitu_data/metadata/dictionaries/conditions_data.csv")

In [3]:
# Load the stations metadata table. On the first run the file does not exist yet,
# so an empty table (header only, one column per entry of the "Attribute_name"
# dictionary column) is written first and then read back like any other run.
if not os.path.exists(stations_metadata_path):
    pd.DataFrame(columns=stations_attributes["Attribute_name"]).to_csv(
        stations_metadata_path, index=False
    )

stations_metadata = pd.read_csv(stations_metadata_path)

# Load the USBR station descriptions; the context manager guarantees the file
# handle is closed once the JSON has been parsed.
with open("stations.json", "r") as f:
    usbr_stations_metadata = json.load(f)

# Per-pcode conversion table stored under the "pcode_keys" entry of stations.json
pcode_keys = usbr_stations_metadata["pcode_keys"]

In [4]:
# build the query url for the USBR PN daily-data endpoint
def format_url(station_name: str, pcodes: list, start: datetime, end: datetime):
    """Build the USBR PN daily-data CSV request url for one station.

    The url carries the year/month/day triple twice: first for ``start``,
    then for ``end``. One ``&pcode=`` parameter is appended per entry of
    ``pcodes``, in order.

    Args:
        station_name (str): The station name; lower-cased in the url.
        pcodes (list): The list of pcodes; each is stripped of surrounding
            spaces and lower-cased.
        start (datetime): The start date.
        end (datetime): The end date.
    Returns:
        url (str): The formatted url.
    """
    # one "&pcode=<code>" fragment per requested parameter
    pcode_query = ""
    for pcode in pcodes:
        pcode_query += "&pcode=" + pcode.strip(" ").lower()

    base_url = f"https://www.usbr.gov/pn-bin/daily.pl?station={station_name.lower()}&format=csv&year={start.year}&month={start.month}&day={start.day}&year={end.year}&month={end.month}&day={end.day}"

    return base_url + pcode_query

In [5]:
# define a function to download the data for a station
def download_data(
    station_name: str,
    pcodes: list,
    start: datetime,
    end: datetime,
    path: str,
    timeout: float = 120,
):
    """Downloads the data for a station and stores it as ``<path>/raw/usbr/<station_name>.csv``.

    The first row of the response is discarded and replaced by the header
    ``["date"] + pcodes``; all remaining rows are written unchanged.

    Args:
        station_name (str): The station name (also used as the file name).
        pcodes (list): The list of pcodes.
        start (datetime): The start date.
        end (datetime): The end date.
        path (str): The data directory; the file goes into its ``raw/usbr`` subfolder,
            which is created if missing.
        timeout (float): Seconds to wait for the server before giving up on a request.
    Returns:
        None
    Raises:
        requests.HTTPError: If the server answers with an error status, so that an
            error page is never saved as station data.
        requests.RequestException: If the request fails again after the single retry.
    """
    # format the url
    url = format_url(station_name, pcodes, start, end)

    # download the data; without a timeout a stalled connection would block forever
    try:
        r = requests.get(url, timeout=timeout)
    except (requests.ConnectionError, requests.Timeout):
        # back off for a random 20-59 s and try once more; a second failure propagates
        time.sleep(np.random.randint(20, 60))
        r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # parse the response in memory instead of round-tripping through the output file
    rows = list(csv.reader(io.StringIO(r.text, newline="")))

    # define the column names that replace the header row of the response
    column_names = ["date", *pcodes]

    out_dir = os.path.join(path, "raw/usbr")
    os.makedirs(out_dir, exist_ok=True)

    # newline="" lets the csv module control line endings (no blank rows on Windows)
    with open(os.path.join(out_dir, station_name + ".csv"), "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(column_names)
        writer.writerows(rows[1:])

    return None

In [6]:
# convert from deg min sec to decimal degrees
def dms2dd(degrees, minutes=0, seconds=0, direction=None):
    """Convert degrees / minutes / seconds to signed decimal degrees.

    Args:
        degrees: Whole degrees (number or numeric string). A negative value marks
            the result as negative; minutes and seconds are then added to the
            magnitude, so ``dms2dd(-120, 30)`` is ``-120.5``.
        minutes: Arc minutes (number or numeric string).
        seconds: Arc seconds (number or numeric string).
        direction: Optional hemisphere letter. "S" or "W" (any case, surrounding
            whitespace ignored) makes the result negative; anything else leaves
            the sign to ``degrees``.
    Returns:
        float: The coordinate in decimal degrees.
    """
    deg = float(degrees)
    magnitude = abs(deg) + float(minutes) / 60 + float(seconds) / (60 * 60)

    south_or_west = isinstance(direction, str) and direction.strip().upper() in ("S", "W")
    # copysign also detects a negative zero such as "-0", which `deg < 0` would miss
    if south_or_west or math.copysign(1.0, deg) < 0:
        return -magnitude
    return magnitude

In [7]:
# function to process the downloaded data
def postprocess_data(
    station_name: str,
    path: str,
    grand_id: str = None,
    pcodes: list = None,
    pcode_keys: dict = None,
):
    """Convert a station's raw download into the processed format.

    Reads ``<path>/raw/usbr/<STATION_NAME>.csv`` (station name upper-cased),
    keeps the ``date`` column, and for every pcode that has an entry in
    ``pcode_keys`` adds a column named ``column_name`` holding
    ``raw * prod(conversion_factors) + constant`` (the constant is optional).
    The result is written to ``<path>/processed/USBR_<station_name>.csv``.

    Args:
        station_name (str): The station name.
        path (str): The data directory containing ``raw/usbr`` and ``processed``.
        grand_id (str): Accepted for backward compatibility; currently unused
            (the output file is named after ``station_name``).
        pcodes (list): The pcodes to convert; ``None`` is treated as empty.
        pcode_keys (dict): Maps a pcode to a dict with ``column_name``,
            ``conversion_factors`` and optionally ``constant``; ``None`` is
            treated as empty. The dict is not modified.
    Returns:
        list: The column names of the processed table, starting with ``"date"``.
    """
    # read in the data
    df = pd.read_csv(
        os.path.join(path, "raw/usbr", "{}.csv".format(station_name.upper()))
    )

    new_df = pd.DataFrame()
    new_df["date"] = df["date"]

    pcode_keys = pcode_keys or {}

    # convert the data to the correct units
    for pcode in pcodes or []:
        # pcodes without a conversion entry are left out of the processed file
        if pcode not in pcode_keys:
            continue

        spec = pcode_keys[pcode]
        converted = df[pcode] * np.prod(spec["conversion_factors"])

        # "constant" is optional; a missing, None or zero value adds nothing
        constant = spec.get("constant")
        if constant:
            converted = converted + constant

        new_df[spec["column_name"]] = converted

    # save the data
    new_df.to_csv(
        os.path.join(path, "processed", "USBR_{}.csv".format(station_name)), index=False
    )

    return new_df.columns.tolist()

In [8]:
# download window: from 1 January 1982 up to the moment this cell is run
start_date = datetime.strptime("1982-01-01", "%Y-%m-%d")
end_date = pd.Timestamp.now()

# stations to process: the first column of the header-less pcodes.csv
station_names = pd.read_csv("pcodes.csv", header=None).iloc[:, 0]

In [9]:
# read the stations json file (one entry per station plus the shared "pcode_keys" table)
with open("stations.json", "r") as f:
    stations_dict = json.load(f)

os.makedirs(os.path.join(data_dir, "raw/usbr"), exist_ok=True)
os.makedirs(os.path.join(data_dir, "processed"), exist_ok=True)

# loop-invariant lookups: the conversion table and the set of recognised condition names
pcode_keys = stations_dict["pcode_keys"]
known_conditions = set(conditions_data["Attribute_name"].to_list())

# download, post-process and register every station
for station_name in station_names:
    # stations.json is keyed by the upper-cased station name
    station_key = station_name.upper()

    # only stations that list pcodes have anything to download
    if "pcodes" in stations_dict[station_key]:
        pcodes = stations_dict[station_key]["pcodes"]

        # download the data
        download_data(station_key, pcodes, start_date, end_date, data_dir)
        # postprocess the data; returns the columns of the processed file
        station_conditions = postprocess_data(
            station_key, data_dir, pcodes=pcodes, pcode_keys=pcode_keys
        )

        station_ID = f"USBR_{station_name}"
        inventory_url = f"https://www.usbr.gov/pn-bin/inventory.pl?site={station_key}&ui=true&interval=daily"

        # register the station once, before any of its parameters are recorded
        if station_ID not in stations_metadata["station_ID"].values:
            site = usbr_stations_metadata[station_key]
            new_stations_metadata = pd.DataFrame(
                {
                    "station_ID": [station_ID],
                    "id_at_source": [station_key],
                    "available_data": ["{}"],
                    "source_URL": ['{"url" : []}'],
                    "description": [site["description"]],
                    # coordinates are stored as dash-separated parts that are handed
                    # to dms2dd; the hemisphere is hard-coded to N / W here
                    "latitude": [
                        dms2dd(*site["latitude"].strip("-").split("-"), direction="N")
                    ],
                    "longitude": [
                        dms2dd(*site["longitude"].strip("-").split("-"), direction="W")
                    ],
                    "site_params": ["{}"],
                }
            )
            stations_metadata = pd.concat(
                [stations_metadata, new_stations_metadata],
                ignore_index=True,
            )

        # row selector for this station (computed after the possible append above)
        is_station = stations_metadata["station_ID"] == station_ID

        # update source url
        source_url = json.loads(
            stations_metadata.loc[is_station, "source_URL"].values[0]
        )
        if inventory_url not in source_url["url"]:
            source_url["url"].append(inventory_url)
            stations_metadata.loc[is_station, "source_URL"] = json.dumps(source_url)

        # update the available data
        availble_data = json.loads(
            stations_metadata.loc[is_station, "available_data"].values[0]
        )

        # Create the "conditions" list only when the KEY is missing. Testing
        # against .values() was always true, which reset the list for every
        # parameter so that only the last one was kept.
        conditions = availble_data.setdefault("conditions", [])

        conditions_changed = False
        for param in station_conditions:
            # record only recognised condition names, each at most once
            if param in known_conditions and param not in conditions:
                conditions.append(param)
                conditions_changed = True

        if conditions_changed:
            stations_metadata.loc[is_station, "available_data"] = json.dumps(
                availble_data
            )

        # save the metadata once per station so new rows and url updates are
        # persisted even when no new condition was added
        stations_metadata.to_csv(stations_metadata_path, index=False)

    print("processed data for {}".format(station_name))

processed data for AFCI
processed data for AGA
processed data for ALNO
processed data for ALPY
processed data for ALTO
processed data for AMF
processed data for AMFI
processed data for ANCI
processed data for AND
processed data for ANDI
processed data for ANTI
processed data for ANTO
processed data for ARK
processed data for ARNO
processed data for ASCI
processed data for AUCI
processed data for BASO
processed data for BCAO
processed data for BCMO
processed data for BCSO
processed data for BCTO
processed data for BDDI
processed data for BENO
processed data for BEU
processed data for BEUO
processed data for BFCI
processed data for BFKY
processed data for BFTI
processed data for BIGI
processed data for BILI
processed data for BIRO
processed data for BJBO
processed data for BKPI
processed data for BMCI
processed data for BOOI
processed data for BPPI
processed data for BRFI
processed data for BSEI
processed data for BTSI
processed data for BUL
processed data for BUM
processed data for BURI

In [10]:
# record when, why and by whom the metadata was last updated
metadata_status = {
    "last_updated": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
    # this notebook processes USBR stations; the previous message named USGS
    "update_message": "Updated the metadata for USBR stations",
    "last_updated_by": "George Darkwah",
    "last_updated_by_email": "gdarkwah@uw.edu",
}

# save metadata
# NOTE(review): the file has a .csv extension but its content is JSON; the name is
# kept as-is because other code may read this exact path - confirm before renaming.
with open(Path(data_dir, "metadata/metadata_status.csv"), "w") as f:
    json.dump(metadata_status, f, indent=4)