In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

### Setup

In [2]:
# Input locations: every preprocessed PSU file lives in one folder under the home directory
inputDir = Path.home() / 'temp' / 'PSU_data' / 'preprocessed_inputs'
gage_path = inputDir / 'gage_info.csv'
ensemble_path = inputDir / 'ensemble_streamflow.csv'
hbv_path = inputDir / 'hbv_streamflow.csv'
prms_path = inputDir / 'prms_streamflow.csv'
sacsma_path = inputDir / 'sacsma_streamflow.csv'

# Export location: created (with any missing parents) the first time the notebook runs
outputDir = Path.home() / 'temp' / 'PSU_data' / 'teehr_inputs'
if not outputDir.exists():
    outputDir.mkdir(parents=True, exist_ok=True)

### Ingest preprocessed inputs

In [3]:
# read in gage data and fix leading 0's
# STAID is cast to str and any 7-character id gets its leading zero restored
# (presumably the zero is lost when read_csv parses the column as a number -- confirm).
# A comprehension is used so that no loop variable leaks into the notebook namespace;
# the previous loop variable was named `id`, which shadowed the builtin id().
gage_df = pd.read_csv(gage_path)
gage_df['STAID'] = [
    f'0{staid}' if len(staid) == 7 else staid
    for staid in gage_df['STAID'].astype(str)
]


# read in streamflow data
# The first CSV column has no header, so pandas labels it 'Unnamed: 0'; rename it to 'Date'.
# rename() returns a new frame, so no in-place mutation is needed.
ensemble_df = pd.read_csv(ensemble_path).rename(columns={'Unnamed: 0': 'Date'})
hbv_df = pd.read_csv(hbv_path).rename(columns={'Unnamed: 0': 'Date'})
prms_df = pd.read_csv(prms_path).rename(columns={'Unnamed: 0': 'Date'})
# sacsma must come from its own file (this previously re-read prms_path by mistake)
sacsma_df = pd.read_csv(sacsma_path).rename(columns={'Unnamed: 0': 'Date'})

In [4]:
# PSU uses usgs ids for their model ids
ensemble_df.columns = [c.replace('d_ensemble-', 'usgs-') for c in ensemble_df.columns]
hbv_df.columns = [c.replace('d_hbv-', 'usgs-') for c in hbv_df.columns]
prms_df.columns = [c.replace('d_prms-', 'usgs-') for c in prms_df.columns]
# sacsma preprocessed file had prms heading; 'd_sacsma-' is accepted as well so the
# location ids still come out as 'usgs-...' if the file carries its own heading
sacsma_df.columns = [
    c.replace('d_sacsma-', 'usgs-').replace('d_prms-', 'usgs-')
    for c in sacsma_df.columns
]

### Create parquet for each configuration in teehr format

In [5]:
# convert the streamflow units
def convert_streamflow_units(input_df, gage_df):
    '''Converts units on streamflow from mm/day to cms.

    Every column other than 'Date' is expected to be named '<prefix>-<gage id>'.
    The gage id is matched against gage_df['STAID'] to look up the drainage
    area (DRAIN_SQKM), which scales the depth into a discharge.

    Parameters
    ----------
    input_df : pandas.DataFrame
        A 'Date' column plus one streamflow column per gage (mm/day).
    gage_df : pandas.DataFrame
        Gage metadata with 'STAID' and 'DRAIN_SQKM' columns.

    Returns
    -------
    pandas.DataFrame
        A converted copy of input_df; the frame passed in is not modified.

    Raises
    ------
    IndexError
        If a column name has no '-'-separated gage id, or if the gage id has
        no matching row in gage_df.
    '''
    # work on a copy so re-running a cell never converts an already-converted frame
    converted_df = input_df.copy()

    for column in converted_df.columns:
        if column == 'Date':
            continue

        # get gage id
        gage_id = column.split('-')[1]

        # get area in km (first matching row wins)
        gage_row = gage_df[gage_df['STAID'] == gage_id]
        if gage_row.empty:
            raise IndexError(f"no gage_df row with STAID '{gage_id}' (column '{column}')")
        area_km = gage_row['DRAIN_SQKM'].values[0]

        # get value in cms:
        # 1 mm/day over 1 km^2 = (1e-3 m * 1e6 m^2) / 86400 s = 1/86.4 m^3/s
        converted_df[column] = (converted_df[column]*area_km)/86.4

    return converted_df



In [6]:
# Run the mm/day -> cms conversion on each configuration's frame and rebind the same names
ensemble_df, hbv_df, prms_df, sacsma_df = (
    convert_streamflow_units(input_df=streamflow_frame, gage_df=gage_df)
    for streamflow_frame in (ensemble_df, hbv_df, prms_df, sacsma_df)
)

In [7]:
def teehrify(input_df, configuration_name, outputDir):
    '''Converts the PSU DM data to teehr format.

    Writes one parquet file per non-'Date' column of input_df into
    <outputDir>/secondary_timeseries_TEST, named
    '<configuration_name>_<n>.parquet' where n is the 1-based position of the
    column among the gage columns. Rows whose value is NaN are dropped before
    writing, and a message is printed when that happens.

    Parameters
    ----------
    input_df : pandas.DataFrame
        A 'Date' column plus one streamflow column per gage; each column name
        is written out as the location_id.
    configuration_name : str
        Value stored in the configuration_name field and used as the file prefix.
    outputDir : str or pathlib.Path
        Parent directory for the 'secondary_timeseries_TEST' folder.

    Returns
    -------
    None
    '''
    # establish output folder
    # this will be our test folder to compare normal loading to
    secondary_ts_dir = Path(outputDir, 'secondary_timeseries_TEST')
    secondary_ts_dir.mkdir(parents=True, exist_ok=True)

    # establish counter
    gage_columns = [column for column in input_df.columns if column != 'Date']
    len_process = len(gage_columns)
    counter = 0

    # loop-invariant pieces: the row count and the parsed dates are the same for
    # every gage, so compute them once instead of re-parsing the dates per column
    num_rows = len(input_df)
    value_times = pd.to_datetime(input_df['Date'].values) if gage_columns else None

    # deconstruct input_df
    for counter, column in enumerate(gage_columns, start=1):
        # print progress every 100 gages
        if counter%100 == 0:
            print(f'starting routine for gage #{counter} ({np.round(((counter/len_process)*100),2)}%)')

        # assemble data to add rows for a given gage
        working_df = pd.DataFrame({
            'reference_time': [None]*num_rows,
            'value_time': value_times,
            'value': input_df[column].values,
            'variable_name': ['streamflow_daily_mean']*num_rows,
            'configuration_name': [configuration_name]*num_rows,
            'unit_name': ['m^3/s']*num_rows,
            'location_id': [column]*num_rows,
            # the literal string 'None' is deliberate: this is what we are testing
            # for the nullable error around the member field
            'member': ['None']*num_rows,
        })

        # drop rows with nan values in the value column
        working_df_validated = working_df.dropna(subset=['value'])
        if len(working_df_validated) != len(working_df):
            print(f'Invalid NaN values found for gage #{counter} -- removed {len(working_df)-len(working_df_validated)} rows')

        # Export
        outPath = Path(secondary_ts_dir, f'{configuration_name}_{counter}.parquet')
        working_df_validated.to_parquet(outPath)

    print(f'finished processing {counter} gages!')

    return

In [8]:
# Export the ensemble frame: one '<configuration>_<n>.parquet' file per gage column
teehrify(input_df=ensemble_df, configuration_name='d_ensemble', outputDir=outputDir)

starting routine for gage #100 (3.7%)
starting routine for gage #200 (7.4%)
starting routine for gage #300 (11.1%)
starting routine for gage #400 (14.8%)
starting routine for gage #500 (18.5%)
starting routine for gage #600 (22.2%)
starting routine for gage #700 (25.9%)
starting routine for gage #800 (29.6%)
starting routine for gage #900 (33.3%)
starting routine for gage #1000 (37.0%)
starting routine for gage #1100 (40.7%)
starting routine for gage #1200 (44.4%)
starting routine for gage #1300 (48.09%)
starting routine for gage #1400 (51.79%)
starting routine for gage #1500 (55.49%)
starting routine for gage #1600 (59.19%)
starting routine for gage #1700 (62.89%)
starting routine for gage #1800 (66.59%)
starting routine for gage #1900 (70.29%)
starting routine for gage #2000 (73.99%)
starting routine for gage #2100 (77.69%)
starting routine for gage #2200 (81.39%)
starting routine for gage #2300 (85.09%)
starting routine for gage #2400 (88.79%)
starting routine for gage #2500 (92.49%

In [9]:
# Export the hbv frame: one '<configuration>_<n>.parquet' file per gage column
teehrify(input_df=hbv_df, configuration_name='d_hbv', outputDir=outputDir)

starting routine for gage #100 (3.7%)
starting routine for gage #200 (7.4%)
starting routine for gage #300 (11.1%)
starting routine for gage #400 (14.8%)
starting routine for gage #500 (18.5%)
starting routine for gage #600 (22.2%)
starting routine for gage #700 (25.9%)
starting routine for gage #800 (29.6%)
starting routine for gage #900 (33.3%)
starting routine for gage #1000 (37.0%)
starting routine for gage #1100 (40.7%)
starting routine for gage #1200 (44.4%)
starting routine for gage #1300 (48.09%)
starting routine for gage #1400 (51.79%)
starting routine for gage #1500 (55.49%)
starting routine for gage #1600 (59.19%)
starting routine for gage #1700 (62.89%)
starting routine for gage #1800 (66.59%)
starting routine for gage #1900 (70.29%)
starting routine for gage #2000 (73.99%)
starting routine for gage #2100 (77.69%)
starting routine for gage #2200 (81.39%)
starting routine for gage #2300 (85.09%)
starting routine for gage #2400 (88.79%)
starting routine for gage #2500 (92.49%

In [10]:
# Export the prms frame: one '<configuration>_<n>.parquet' file per gage column
teehrify(input_df=prms_df, configuration_name='d_prms', outputDir=outputDir)

starting routine for gage #100 (3.7%)
starting routine for gage #200 (7.4%)
starting routine for gage #300 (11.1%)
starting routine for gage #400 (14.8%)
starting routine for gage #500 (18.5%)
starting routine for gage #600 (22.2%)
starting routine for gage #700 (25.9%)
starting routine for gage #800 (29.6%)
starting routine for gage #900 (33.3%)
starting routine for gage #1000 (37.0%)
starting routine for gage #1100 (40.7%)
starting routine for gage #1200 (44.4%)
starting routine for gage #1300 (48.09%)
starting routine for gage #1400 (51.79%)
starting routine for gage #1500 (55.49%)
starting routine for gage #1600 (59.19%)
starting routine for gage #1700 (62.89%)
starting routine for gage #1800 (66.59%)
starting routine for gage #1900 (70.29%)
starting routine for gage #2000 (73.99%)
starting routine for gage #2100 (77.69%)
starting routine for gage #2200 (81.39%)
starting routine for gage #2300 (85.09%)
starting routine for gage #2400 (88.79%)
starting routine for gage #2500 (92.49%

In [11]:
# Export the sacsma frame: one '<configuration>_<n>.parquet' file per gage column
teehrify(input_df=sacsma_df, configuration_name='d_sacsma', outputDir=outputDir)

starting routine for gage #100 (3.7%)
starting routine for gage #200 (7.4%)
starting routine for gage #300 (11.1%)
starting routine for gage #400 (14.8%)
starting routine for gage #500 (18.5%)
starting routine for gage #600 (22.2%)
starting routine for gage #700 (25.9%)
starting routine for gage #800 (29.6%)
starting routine for gage #900 (33.3%)
starting routine for gage #1000 (37.0%)
starting routine for gage #1100 (40.7%)
starting routine for gage #1200 (44.4%)
starting routine for gage #1300 (48.09%)
starting routine for gage #1400 (51.79%)
starting routine for gage #1500 (55.49%)
starting routine for gage #1600 (59.19%)
starting routine for gage #1700 (62.89%)
starting routine for gage #1800 (66.59%)
starting routine for gage #1900 (70.29%)
starting routine for gage #2000 (73.99%)
starting routine for gage #2100 (77.69%)
starting routine for gage #2200 (81.39%)
starting routine for gage #2300 (85.09%)
starting routine for gage #2400 (88.79%)
starting routine for gage #2500 (92.49%