# Preprocessing Wyoming Reservoir and Gage data for WaDE

In [None]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
from datetime import datetime
# Show up to 999 columns when a DataFrame is displayed in the notebook.
pd.options.display.max_columns = 999

In [None]:
# Working Directory
# Every relative path used in the cells below is resolved against this folder.
# The shared-drive location stays the default; set the WADE_WY_RAW_INPUT_DIR
# environment variable to run the notebook from a machine with a different layout.
workingDir = os.environ.get(
    "WADE_WY_RAW_INPUT_DIR",
    "G:/Shared drives/WaDE Data/Wyoming/SS_ReservoirsObservationSites/RawInputData",
)
os.chdir(workingDir)

## Data: Total Lake Reservoir Storage Volume

In [None]:
# Build the reservoir site table keyed on 'Data Set Id': start from the data set
# label export, then left-join each attribute export's 'Value' column and store
# it under the attribute's own name.
tlrsvAttributeFiles = {
    'location identifier': "total_lake_reservoir_storage_volume/location identifier.csv",
    'location name': "total_lake_reservoir_storage_volume/location name.csv",
    'location type': "total_lake_reservoir_storage_volume/location type.csv",
    'latitude': "total_lake_reservoir_storage_volume/latitude.csv",
    'longitude': "total_lake_reservoir_storage_volume/longitude.csv",
}

dftlrsv_site = pd.read_csv("total_lake_reservoir_storage_volume/data set label.csv")
dftlrsv_site = dftlrsv_site.rename(columns={'Value': 'data set label'})

for tlrsvAttribute, tlrsvAttributeFile in tlrsvAttributeFiles.items():
    # Only the join key and the attribute value are taken from each export.
    tlrsvAttributeValues = pd.read_csv(tlrsvAttributeFile)[['Data Set Id', 'Value']]
    dftlrsv_site = dftlrsv_site.merge(tlrsvAttributeValues, on='Data Set Id', how='left')
    dftlrsv_site = dftlrsv_site.rename(columns={'Value': tlrsvAttribute})

print(len(dftlrsv_site))
dftlrsv_site.head(3)

In [None]:
# %%time

# datasetidList = dftlrsv_site['Data Set Id'].astype(str).str.replace(" ", "%20").str.replace("@", "%40").tolist()  

# # SSL certificate verification fails for this host. The two lines below disable verification (process-wide).
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context

# # create empty url dataframe for timeseries data
# dftlrsv_timeseries = pd.DataFrame()

# slength = len(datasetidList)
# for i in range(slength):
#     fileInputURL = "https://seoflow.wyo.gov/Export/BulkExport?DateRange=EntirePeriodOfRecord&TimeZone=0&Calendar=CALENDARYEAR&Interval=Daily&Step=1&ExportFormat=csv&TimeAligned=True&RoundData=False&IncludeGradeCodes=False&IncludeApprovalLevels=False&IncludeQualifiers=undefined&IncludeInterpolationTypes=False&Datasets[0].Calculation=Aggregate&Datasets[0].UnitId=198&_=1679336048953&Datasets[0].DatasetName=" + str(datasetidList[i])
#     print(fileInputURL)
#     try:
#         dftemp = pd.read_csv(fileInputURL, skiprows=4)
#         dftemp['timeseriesID'] =  str(datasetidList[i])
#         dftemp['url'] = fileInputURL
#         dftlrsv_timeseries = pd.concat([dftlrsv_timeseries, dftemp])
#     except:
#         dftemp = pd.DataFrame()
#         dftemp['timeseriesID'] =  str(datasetidList[i])
#         dftemp['url'] = fileInputURL
#         dftlrsv_timeseries = pd.concat([dftlrsv_timeseries, dftemp])
#         print("Error, issue with API return.")

        
# dftlrsv_timeseries.to_csv('total_lake_reservoir_storage_volume/tlrsv_timeseries.zip', compression=dict(method='zip', archive_name='tlrsv_timeseries.csv'), index=False)
# print(len(dftlrsv_timeseries))
# dftlrsv_timeseries.head(1)

In [None]:
# Input File - reservoir storage timeseries archive (tlrsv_timeseries.zip)
tlrsvArchive = 'total_lake_reservoir_storage_volume/tlrsv_timeseries.zip'
dftlrsv_timeseries = pd.read_csv(tlrsvArchive, compression='zip')

# WaDE UUID tracker for data assessment.
# IDs are generated only when the archive does not carry them yet, and the
# archive is rewritten right away so later runs load the same IDs.
# NOTE(review): the "tx_" prefix looks carried over from another state's notebook - confirm before changing.
if 'WaDEUUID' not in dftlrsv_timeseries.columns:
    tlrsvRowLabels = dftlrsv_timeseries.index.astype(str)
    dftlrsv_timeseries['WaDEUUID'] = "tx_tlrsv" + tlrsvRowLabels
    dftlrsv_timeseries.to_csv(tlrsvArchive, compression=dict(method='zip', archive_name='tlrsv_timeseries.csv'), index=False)

# Turn the URL escapes for space and "@" back into plain characters so the IDs
# can be matched against 'Data Set Id' in the site table.
tlrsvTimeseriesIds = dftlrsv_timeseries['timeseriesID'].astype(str)
for escapedText, plainText in (("%20", " "), ("%40", "@")):
    tlrsvTimeseriesIds = tlrsvTimeseriesIds.str.replace(escapedText, plainText, regex=False)
dftlrsv_timeseries['timeseriesID'] = tlrsvTimeseriesIds

print(len(dftlrsv_timeseries))
dftlrsv_timeseries.head(1)

In [None]:
# Attach site metadata to every timeseries row. The left join keeps timeseries
# rows even when their ID has no match in the site table.
dftlrsv = dftlrsv_timeseries.merge(
    dftlrsv_site,
    how='left',
    left_on='timeseriesID',
    right_on='Data Set Id',
)
print(len(dftlrsv))
dftlrsv.head(2)

In [None]:
# WaDE Fields
# Map the merged reservoir frame onto the WaDE input columns. The mapping is
# ordered: its key order is the column order of the output frame. Values are
# either a source column of dftlrsv or a constant applied to every row.
tlrsvWadeFields = {
    # data assessment
    'WaDEUUID': dftlrsv['WaDEUUID'],

    # variable info
    'in_VariableCV': dftlrsv['WaDE Interpretation'],

    # water source info
    'in_WaterSourceName': "WaDE Unspecified",
    'in_WaterSourceNativeID': "WaDEID_WYws1",
    'in_WaterSourceTypeCV': "Surface Water",

    # Site Info
    'in_CoordinateAccuracy': "WaDE Unspecified",
    'in_CoordinateMethodCV': "WaDE Unspecified",
    'in_County': "WaDE Unspecified",
    'in_HUC12': "WaDE Unspecified",
    'in_HUC8': "WaDE Unspecified",
    'in_Latitude': dftlrsv['latitude'],
    'in_Longitude': dftlrsv['longitude'],
    'in_PODorPOUSite': "Reservoir",
    'in_SiteNativeID': dftlrsv['location identifier'],
    'in_SiteName': dftlrsv['location name'],
    'in_SiteTypeCV': "Reservoir/Lake",
    'in_StateCV': 'WY',

    # Site VariableAmounts Info
    'in_Amount': dftlrsv['Average (Acre-ft)'],  # change here
    'in_BeneficialUseCategory': "Storage",
    'in_ReportYearCV': "",  # will fill in below
    'in_TimeframeEnd': dftlrsv['End of Interval (UTC)'],
    'in_TimeframeStart': dftlrsv['Start of Interval (UTC)'],
}

# output dataframe, sharing the merged frame's index so source columns align row by row
df1 = pd.DataFrame(index=dftlrsv.index)
for tlrsvFieldName, tlrsvFieldValue in tlrsvWadeFields.items():
    df1[tlrsvFieldName] = tlrsvFieldValue

df1 = df1.drop_duplicates()
df1 = df1.reset_index(drop=True)

print(len(df1))
df1.head(1)

## Data: Discharge

In [None]:
# Build the discharge site table keyed on 'Data Set Id': start from the data set
# label export, then left-join each attribute export's 'Value' column and store
# it under the attribute's own name.
disAttributeFiles = {
    'location identifier': "discharge/location identifier.csv",
    'location name': "discharge/location name.csv",
    'location type': "discharge/location type.csv",
    'latitude': "discharge/latitude.csv",
    'longitude': "discharge/longitude.csv",
}

dfdis_site = pd.read_csv("discharge/data set label.csv")
dfdis_site = dfdis_site.rename(columns={'Value': 'data set label'})

for disAttribute, disAttributeFile in disAttributeFiles.items():
    # Only the join key and the attribute value are taken from each export.
    disAttributeValues = pd.read_csv(disAttributeFile)[['Data Set Id', 'Value']]
    dfdis_site = dfdis_site.merge(disAttributeValues, on='Data Set Id', how='left')
    dfdis_site = dfdis_site.rename(columns={'Value': disAttribute})

print(len(dfdis_site))
dfdis_site.head(3)

In [None]:
# %%time

# datasetidList = dfdis_site['Data Set Id'].astype(str).str.replace(" ", "%20").str.replace("@", "%40").tolist()  

# # SSL certificate verification fails for this host. The two lines below disable verification (process-wide).
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context

# # create empty url dataframe for timeseries data
# dfdis_timeseries = pd.DataFrame()

# slength = len(datasetidList)
# for i in range(slength):
#     fileInputURL = "https://seoflow.wyo.gov/Export/BulkExport?DateRange=EntirePeriodOfRecord&TimeZone=0&Calendar=CALENDARYEAR&Interval=Daily&Step=1&ExportFormat=csv&TimeAligned=True&RoundData=False&IncludeGradeCodes=False&IncludeApprovalLevels=False&IncludeQualifiers=undefined&IncludeInterpolationTypes=False&Datasets[0].Calculation=Instantaneous&Datasets[0].UnitId=208&_=1679418181067&Datasets[0].DatasetName=" + str(datasetidList[i])
#     print(fileInputURL)
#     try:
#         dftemp = pd.read_csv(fileInputURL, skiprows=4)
#         dftemp['timeseriesID'] =  str(datasetidList[i])
#         dftemp['url'] = fileInputURL
#         dfdis_timeseries = pd.concat([dfdis_timeseries, dftemp])
#     except:
#         dftemp = pd.DataFrame()
#         dftemp['timeseriesID'] =  str(datasetidList[i])
#         dftemp['url'] = fileInputURL
#         dfdis_timeseries = pd.concat([dfdis_timeseries, dftemp])
#         print("Error, issue with API return.")

        
# dfdis_timeseries.to_csv('discharge/dis_timeseries.zip', compression=dict(method='zip', archive_name='dis_timeseries.csv'), index=False)
# print(len(dfdis_timeseries))
# dfdis_timeseries.head(1)

In [None]:
# Input File - discharge timeseries archive (dis_timeseries.zip)
disArchive = 'discharge/dis_timeseries.zip'
dfdis_timeseries = pd.read_csv(disArchive, compression='zip')

# WaDE UUID tracker for data assessment.
# IDs are generated only when the archive does not carry them yet, and the
# archive is rewritten right away so later runs load the same IDs.
# NOTE(review): the "tx_" prefix looks carried over from another state's notebook - confirm before changing.
if 'WaDEUUID' not in dfdis_timeseries.columns:
    disRowLabels = dfdis_timeseries.index.astype(str)
    dfdis_timeseries['WaDEUUID'] = "tx_dis" + disRowLabels
    dfdis_timeseries.to_csv(disArchive, compression=dict(method='zip', archive_name='dis_timeseries.csv'), index=False)

# Turn the URL escapes for space and "@" back into plain characters so the IDs
# can be matched against 'Data Set Id' in the site table.
disTimeseriesIds = dfdis_timeseries['timeseriesID'].astype(str)
for escapedText, plainText in (("%20", " "), ("%40", "@")):
    disTimeseriesIds = disTimeseriesIds.str.replace(escapedText, plainText, regex=False)
dfdis_timeseries['timeseriesID'] = disTimeseriesIds

print(len(dfdis_timeseries))
dfdis_timeseries.head(1)

In [None]:
# Attach site metadata to every discharge timeseries row. The left join keeps
# timeseries rows even when their ID has no match in the site table.
dfdis = dfdis_timeseries.merge(
    dfdis_site,
    how='left',
    left_on='timeseriesID',
    right_on='Data Set Id',
)
print(len(dfdis))
dfdis.head(2)

In [None]:
# WaDE Fields
# Map the merged discharge frame onto the WaDE input columns. The mapping is
# ordered: its key order is the column order of the output frame. Values are
# either a source column of dfdis or a constant applied to every row.
disWadeFields = {
    # data assessment
    'WaDEUUID': dfdis['WaDEUUID'],

    # variable info
    'in_VariableCV': dfdis['WaDE Interpretation'],

    # water source info
    'in_WaterSourceName': "WaDE Unspecified",
    'in_WaterSourceNativeID': "WaDEID_WYws1",
    'in_WaterSourceTypeCV': "Surface Water",

    # Site Info
    'in_CoordinateAccuracy': "WaDE Unspecified",
    'in_CoordinateMethodCV': "WaDE Unspecified",
    'in_County': "WaDE Unspecified",
    'in_HUC12': "WaDE Unspecified",
    'in_HUC8': "WaDE Unspecified",
    'in_Latitude': dfdis['latitude'],
    'in_Longitude': dfdis['longitude'],
    'in_PODorPOUSite': "Stream Gage",
    'in_SiteNativeID': dfdis['location identifier'],
    'in_SiteName': dfdis['location name'],
    'in_SiteTypeCV': "Hydrology Station",
    'in_StateCV': 'WY',

    # Site VariableAmounts Info
    'in_Amount': dfdis['Value at End of Interval (ft^3/s)'],  # change here
    'in_BeneficialUseCategory': "Discharge",
    'in_ReportYearCV': "",  # will fill in below
    'in_TimeframeEnd': dfdis['End of Interval (UTC)'],
    'in_TimeframeStart': dfdis['Start of Interval (UTC)'],
}

# output dataframe, sharing the merged frame's index so source columns align row by row
df2 = pd.DataFrame(index=dfdis.index)
for disFieldName, disFieldValue in disWadeFields.items():
    df2[disFieldName] = disFieldValue

df2 = df2.drop_duplicates()
df2 = df2.reset_index(drop=True)

print(len(df2))
df2.head(1)

## Concatenate Together. Output Dataframe.

In [None]:
# Concatenate Produced Data Together
# Stack the reservoir storage rows (df1) on top of the discharge rows (df2) and
# renumber the result with a fresh 0..n-1 index.
dfout = pd.concat([df1, df2], ignore_index=True)
print(len(dfout))

In [None]:
# Summary of the combined frame (columns, non-null counts, dtypes) before the cleanup cells below.
dfout.info()

## Fixing a few errors

In [None]:
 # Create VariableSpecificCV field

# Pattern: <in_VariableCV>_Daily_<in_BeneficialUseCategory>_<in_WaterSourceTypeCV>
variableText = dfout['in_VariableCV'].astype(str)
beneficialUseText = dfout['in_BeneficialUseCategory'].astype(str)
waterSourceTypeText = dfout['in_WaterSourceTypeCV'].astype(str)
dfout['in_VariableSpecificCV'] = variableText + "_Daily_" + beneficialUseText + "_" + waterSourceTypeText

# List the distinct values produced, as a quick check.
dfout['in_VariableSpecificCV'].unique()

In [None]:
# Parse both timeframe columns into datetimes, then take the report year from
# the start of each interval (stored as a yearly period).
for timeframeColumn in ('in_TimeframeEnd', 'in_TimeframeStart'):
    dfout[timeframeColumn] = pd.to_datetime(dfout[timeframeColumn])

timeframeStart = dfout['in_TimeframeStart']
dfout['in_ReportYearCV'] = timeframeStart.dt.to_period('Y')
dfout.head(1)

In [None]:
# in_Latitude: entries that cannot be parsed as numbers become NaN, then are stored as 0
latitudeNumeric = pd.to_numeric(dfout['in_Latitude'], errors='coerce')
dfout['in_Latitude'] = latitudeNumeric.fillna(0)
dfout['in_Latitude'].unique()

In [None]:
# in_Longitude: entries that cannot be parsed as numbers become NaN, then are stored as 0
longitudeNumeric = pd.to_numeric(dfout['in_Longitude'], errors='coerce')
dfout['in_Longitude'] = longitudeNumeric.fillna(0)
dfout['in_Longitude'].unique()

In [None]:
# Fixing Water Amount datatype: non-numeric entries become NaN, then are stored as 0
# NOTE(review): this makes a missing reading indistinguishable from a measured 0 - confirm that is intended.
amountNumeric = pd.to_numeric(dfout['in_Amount'], errors='coerce')
dfout['in_Amount'] = amountNumeric.fillna(0)
dfout.head(1)

## Review and Export

In [None]:
# Final check of columns, non-null counts and dtypes before export.
dfout.info()

In [None]:
# Exporting to Finished File
# Name the CSV inside the archive explicitly, the same way the timeseries
# archives are written earlier in this notebook. Without archive_name pandas
# derives the member name from the zip's own file name, so the member would
# not carry a .csv extension.
dfout.to_csv('P_wySSROMain.zip', compression=dict(method='zip', archive_name='P_wySSROMain.csv'), index=False)  # The output, save as a zip