# Preprocessing Utah Reservoir and Gage data for WaDE

In [None]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # raise the display limit to 999 columns so wide DataFrames are not truncated in the notebook

In [None]:
# Working Directory
# Every relative file name read or written below resolves against this folder.
workingDir = "G:/Shared drives/WaDE Data/Utah/SS_ReservoirsObservationSites/RawInputData"
os.chdir(workingDir)

## Data: "Distribution Stations" site data

In [None]:
# Load the "Distribution Stations" site table and cast STATION_ID to integer.
fileInput = "Distribution_Stations.csv"
dfds = pd.read_csv(fileInput).astype({'STATION_ID': int})
print(dfds.shape[0])
dfds.head()

## Data: get timeseries data via API service per site

In [None]:
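# Already done: the download below was run once (run time = 35min 30s) and its output saved to url_timeseries.zip, so the code is left commented out.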

# %%time

# # Get list of STATION_ID
# stationIDList = dfds['STATION_ID'].astype(int).astype(str).tolist()   

# # issue with SSL verification for this data. Use this to ignore
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context

# # create empty url dataframe for timeseries data
# dfurl = pd.DataFrame()

# slength = len(stationIDList)
# for i in range(slength):
#     fileInputURL = "https://www.waterrights.utah.gov/dvrtdb/DailyCommaData.asp?BYEAR=1900&EYEAR=2023&StationId=" + str(stationIDList[i])
#     print(fileInputURL)
#     try:
#         # get metadata
#         dfmetadata = pd.read_csv(fileInputURL, nrows=5) # read in file
#         dfmetadata_T = dfmetadata.transpose() # tranpose the dataframe
#         new_header = dfmetadata_T.iloc[0] #grab the first row for the header
#         dfmetadata_T = dfmetadata_T[1:] #take the data less the header row
#         dfmetadata_T.columns = new_header #set the header row as the df header
#         unitsString = dfmetadata_T['Units'].astype(str).to_string() # convert value to string
        
#         # get timeseries
#         dftemp = pd.read_csv(fileInputURL, skiprows=5)
#         dftemp['Units'] = unitsString
#         dftemp['timeseriesID'] =  str(stationIDList[i])
#         dftemp['url'] = fileInputURL
#         dfurl = pd.concat([dfurl, dftemp])
#     except:
#         dftemp = pd.DataFrame()
#         dftemp['Units'] = ""
#         dftemp['timeseriesID'] =  str(stationIDList[i])
#         dftemp['url'] = fileInputURL
#         dfurl = pd.concat([dfurl, dftemp])
#         print("Error, issue with API return.")

        
# dfurl.to_csv('url_timeseries.zip', compression=dict(method='zip', archive_name='url_timeseries.csv'), index=False)
# print(len(dfurl))
# dfurl.head(1)

In [None]:
# Input file - url_timeseries.zip
df_timeseries = pd.read_csv('url_timeseries.zip', compression='zip')

# WaDE UUID tracker for data assessment.
# When the column is missing, build it from the row index and write the table
# back to the same zip so the IDs are stored with the data.
if 'WaDEUUID' not in df_timeseries.columns:
    df_timeseries['WaDEUUID'] = "utssro" + df_timeseries.index.astype(str)
    df_timeseries.to_csv(
        'url_timeseries.zip',
        index=False,
        compression={'method': 'zip', 'archive_name': 'url_timeseries.csv'},
    )

print(df_timeseries.shape[0])
df_timeseries.head(1)

In [None]:
# Attach the station attributes to each timeseries row. The left join keeps
# every timeseries row, including those with no matching STATION_ID.
df_timeseries_ds = df_timeseries.merge(
    dfds,
    how='left',
    left_on='timeseriesID',
    right_on='STATION_ID',
)
print(df_timeseries_ds.shape[0])
df_timeseries_ds.head(1)

In [None]:
# just checking units

# # Assign Units

# def checkUnitsFunc(valA):
#     valA = str(valA).strip().lower()
#     if 'cfs' in valA:
#         outString = "CFS"
#     if 'acft' in valA:
#         outString = "AF"
#     if 'feet' in valA:
#         outString = "FT"
#     return outString

# df_timeseries_ds['timeseriesUnits'] = df_timeseries_ds.apply(lambda row: checkUnitsFunc(row['Units']), axis=1)
# df_timeseries_ds['timeseriesUnits'].unique()

In [None]:
# Assign VariableCV

def checkVariableCVFunc(valA):
    """Map a raw units description to a VariableCV label.

    The input is converted to text, stripped and lower-cased, then tested
    against a list of keywords by substring match. When several keywords
    match, the one listed last wins (e.g. 'evaporation in cfs' overrides the
    generic 'cfs').

    Args:
        valA: Raw units value; any type, converted with ``str()``.

    Returns:
        The matching label, or "WaDE Unspecified" when no keyword matches
        (for example a NaN or blank units value).
    """
    unitsText = str(valA).strip().lower()

    # Generic keywords first, specific ones last: later matches overwrite earlier ones.
    keywordLabels = (
        ('cfs', "Discharge"),
        ('discharge in cfs', "Discharge"),
        ('height in feet', "Stage"),
        ('storage in acft', "Storage"),
        ('discharge in acft', "Discharge AF"),
        ('diversion in acft', "Diversion"),
        ('evaporation in cfs', "Evaporation"),
    )

    # Start from a defined fallback so unmatched text no longer raises
    # UnboundLocalError at the return statement.
    outString = "WaDE Unspecified"
    for keyword, label in keywordLabels:
        if keyword in unitsText:
            outString = label

    return outString

# Classify every row from its 'Units' text. Mapping the column directly avoids
# building a Series for each row (as DataFrame.apply with axis=1 does) and
# returns an index-aligned Series even when the frame has no rows.
df_timeseries_ds['in_VariableCV'] = df_timeseries_ds['Units'].map(checkVariableCVFunc)
df_timeseries_ds['in_VariableCV'].unique()

In [None]:
# Assign SiteTypeCV

def checkSiteTypeCVFunc(valA):
    """Map a raw units description to a SiteTypeCV label.

    The input is converted to text, stripped and lower-cased, then tested
    against a list of keywords by substring match. When several keywords
    match, the one listed last wins.

    Args:
        valA: Raw units value; any type, converted with ``str()``.

    Returns:
        "Reservoir" or "Stream Gage" according to the matching keyword, or
        "WaDE Unspecified" when no keyword matches (for example a NaN or
        blank units value).
    """
    unitsText = str(valA).strip().lower()

    # Same keyword order as the original checks: later matches overwrite earlier ones.
    keywordLabels = (
        ('cfs', "Stream Gage"),
        ('discharge in cfs', "Stream Gage"),
        ('height in feet', "Stream Gage"),
        ('storage in acft', "Reservoir"),
        ('discharge in acft', "Stream Gage"),
        ('diversion in acft', "Stream Gage"),
        ('evaporation in cfs', "Stream Gage"),
    )

    # Start from a defined fallback so unmatched text no longer raises
    # UnboundLocalError at the return statement.
    outString = "WaDE Unspecified"
    for keyword, label in keywordLabels:
        if keyword in unitsText:
            outString = label

    return outString

# Classify every row from its 'Units' text. Mapping the column directly avoids
# building a Series for each row (as DataFrame.apply with axis=1 does) and
# returns an index-aligned Series even when the frame has no rows.
df_timeseries_ds['in_SiteTypeCV'] = df_timeseries_ds['Units'].map(checkSiteTypeCVFunc)
df_timeseries_ds['in_SiteTypeCV'].unique()

In [None]:
# Assign BeneficialUseCategory

def checkBeneficialUseCategoryFunc(valA):
    """Map a raw units description to a BeneficialUseCategory label.

    The input is converted to text, stripped and lower-cased, then tested
    against a list of keywords by substring match. When several keywords
    match, the one listed last wins (e.g. 'evaporation in cfs' overrides the
    generic 'cfs').

    Args:
        valA: Raw units value; any type, converted with ``str()``.

    Returns:
        The matching label, or "WaDE Unspecified" when no keyword matches
        (for example a NaN or blank units value).
    """
    unitsText = str(valA).strip().lower()

    # Generic keywords first, specific ones last: later matches overwrite earlier ones.
    keywordLabels = (
        ('cfs', "Discharge"),
        ('discharge in cfs', "Discharge"),
        ('height in feet', "Stage"),
        ('storage in acft', "Storage"),
        ('discharge in acft', "Discharge"),
        ('diversion in acft', "Diversion"),
        ('evaporation in cfs', "Evaporation"),
    )

    # Start from a defined fallback so unmatched text no longer raises
    # UnboundLocalError at the return statement.
    outString = "WaDE Unspecified"
    for keyword, label in keywordLabels:
        if keyword in unitsText:
            outString = label

    return outString

# Classify every row from its 'Units' text. Mapping the column directly avoids
# building a Series for each row (as DataFrame.apply with axis=1 does) and
# returns an index-aligned Series even when the frame has no rows.
df_timeseries_ds['in_BeneficialUseCategory'] = df_timeseries_ds['Units'].map(checkBeneficialUseCategoryFunc)
df_timeseries_ds['in_BeneficialUseCategory'].unique()

In [None]:
# WaDE Fields

# Build the output frame on the merged table's index. The keyword order below
# is the output column order; each value is either a column copied from the
# merged table or a constant applied to every row.
df1 = pd.DataFrame(index=df_timeseries_ds.index).assign(
    # data assessment
    WaDEUUID=df_timeseries_ds['WaDEUUID'],

    # variable info
    in_VariableCV=df_timeseries_ds['in_VariableCV'],

    # water source info
    in_WaterSourceName=df_timeseries_ds['SYSTEM'],
    in_WaterSourceNativeID="",  # placeholder, populated in a later cell
    in_WaterSourceTypeCV="Surface Water",

    # site info
    in_CoordinateAccuracy="WaDE Unspecified",
    in_CoordinateMethodCV="WaDE Unspecified",
    in_County="WaDE Unspecified",
    in_HUC12="WaDE Unspecified",
    in_HUC8="WaDE Unspecified",
    in_Latitude=df_timeseries_ds['Latitude'],
    in_Longitude=df_timeseries_ds['Longitude'],
    in_PODorPOUSite=df_timeseries_ds['in_SiteTypeCV'],  # same value as in_SiteTypeCV
    in_SiteNativeID=df_timeseries_ds['STATION_ID'],
    in_SiteName=df_timeseries_ds['NAME'],
    in_SiteTypeCV=df_timeseries_ds['in_SiteTypeCV'],
    in_StateCV='UT',

    # site variable amounts info
    in_Amount=df_timeseries_ds['Flow'],
    in_BeneficialUseCategory=df_timeseries_ds['in_BeneficialUseCategory'],
    in_ReportYearCV="",  # placeholder, populated in a later cell
    in_TimeframeEnd=df_timeseries_ds['Date'],
    in_TimeframeStart=df_timeseries_ds['Date'],
)

# NOTE(review): WaDEUUID looks to be generated per input row upstream, so this
# may never drop anything - confirm that is intended.
df1 = df1.drop_duplicates().reset_index(drop=True)

print(df1.shape[0])
df1.head(1)

In [None]:
# Work on a copy so the cleanup cells below modify dfout and leave df1 as built above.
dfout = df1.copy()
print(len(dfout))

In [None]:
# Print column dtypes and non-null counts for review.
dfout.info()

## Fixing a few errors

In [None]:
 # Create VariableSpecificCV field

# Compose "<in_VariableCV>_Daily_<in_BeneficialUseCategory>_<in_WaterSourceTypeCV>" for each row.
dfout['in_VariableSpecificCV'] = (
    dfout['in_VariableCV'].astype(str)
    + "_Daily_"
    + dfout['in_BeneficialUseCategory'].astype(str)
    + "_"
    + dfout['in_WaterSourceTypeCV'].astype(str)
)
dfout['in_VariableSpecificCV'].unique()

In [None]:
# Parse both timeframe columns into pandas datetimes.
for timeframeCol in ('in_TimeframeEnd', 'in_TimeframeStart'):
    dfout[timeframeCol] = pd.to_datetime(dfout[timeframeCol])

# The report year is the yearly period that contains the start date.
dfout['in_ReportYearCV'] = dfout['in_TimeframeStart'].dt.to_period('Y')
dfout.head(1)

In [None]:
# in_Latitude: values that cannot be parsed as numbers become NaN, then every NaN becomes 0
dfout['in_Latitude'] = pd.to_numeric(dfout['in_Latitude'], errors='coerce')
dfout['in_Latitude'] = dfout['in_Latitude'].fillna(0)
dfout['in_Latitude'].unique()

In [None]:
# in_Longitude: values that cannot be parsed as numbers become NaN, then every NaN becomes 0
dfout['in_Longitude'] = pd.to_numeric(dfout['in_Longitude'], errors='coerce')
dfout['in_Longitude'] = dfout['in_Longitude'].fillna(0)
dfout['in_Longitude'].unique()

In [None]:
# in_Amount: values that cannot be parsed as numbers become NaN, then every NaN becomes 0
dfout['in_Amount'] = pd.to_numeric(dfout['in_Amount'], errors='coerce')
dfout['in_Amount'] = dfout['in_Amount'].fillna(0)
dfout['in_Amount'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source native ID for ``colrowValue``.

    The ID is the literal prefix "WaDEUT_WS" followed by ``str(colrowValue)``.
    """
    return "WaDEUT_WS" + str(colrowValue)

# One row per distinct water source name, keeping the index label of its first occurrence.
dfWaterSourceNativeID = dfout[['in_WaterSourceName']].drop_duplicates()

# Number the distinct names 1..N on that same index, then turn each number into a native ID.
dftemp = pd.DataFrame(
    {"Count": range(1, len(dfWaterSourceNativeID.index) + 1)},
    index=dfWaterSourceNativeID.index,
)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(
    lambda row: assignWaterSourceNativeID(row['Count']),
    axis=1,
)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the custom native ID assigned to water source name ``A``.

    Reads the module-level ``dfWaterSourceNativeID`` table.

    Returns:
        '' when ``A`` is the empty string or null, or when no row of the
        table has that name; otherwise the ID of the first matching row.
    """
    if A == '' or pd.isnull(A):
        return ''

    nameMatches = dfWaterSourceNativeID['in_WaterSourceName'] == A
    matchedIDs = dfWaterSourceNativeID.loc[nameMatches, 'in_WaterSourceNativeID']
    if matchedIDs.empty:
        return ''
    return matchedIDs.iloc[0]

# Look up the native ID for each row's water source name. Mapping the column
# directly avoids building a Series for each row (as DataFrame.apply with
# axis=1 does) and returns an index-aligned Series even when the frame has no rows.
dfout['in_WaterSourceNativeID'] = dfout['in_WaterSourceName'].map(retrieveWaterSourceNativeID)
dfout['in_WaterSourceNativeID'].unique()

## Review and Export

In [None]:
# Final check of column dtypes and non-null counts before export.
dfout.info()

In [None]:
# Exporting to Finished File
# Name the archive member explicitly, as the url_timeseries.zip export in this
# notebook does, so the CSV inside the zip gets a .csv name rather than one
# derived from the zip file's own name.
dfout.to_csv('P_utSSROMain.zip', index=False, compression=dict(method='zip', archive_name='P_utSSROMain.csv'))  # The output, save as a zip