# Preprocessing Nebraska Reservoir and Observation Site data for WaDE

In [None]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [None]:
# Working Directory
# The process's current directory is switched to this folder, so the relative
# file names used by the CSV reads and writes below resolve against it.
workingDir = "G:/Shared drives/WaDE Data/Nebraska/SS_ReservoirsObservationSites/RawInputData"
os.chdir(workingDir)

## API Site Data

In [None]:
# done already

# %%time
# # Retrieve list of only NeDNR stream gage stations
# #######################################

# # API retrieval
# url = "https://nednr.nebraska.gov/IwipApi/api/v1/StreamGage/GetStationList"
# responseD = json.loads(requests.get(url).text)
# DtL = responseD['Results']
# length = len(DtL)

# # create dataframe and store
# df = pd.DataFrame()
# for i in range(length):
#     row = pd.DataFrame([DtL[i]])
#     df = pd.concat([df, row])

# # Use only NeDNR Active provided sites
# df = df[df['SourceName'] == 'NeDNR']

# # Exporting output files.
# df.to_csv('StreamGageGetStationList.csv', index=False)  # The output.

In [None]:
# the one we want to test out

# Input File - StreamGageGetStationList.csv
sgInput = "StreamGageGetStationList.csv"
dfsg = pd.read_csv(sgInput)

# WaDE UUID tracker for data assessment.
# When the column is absent, tag every row with "nebRG" + its row index and
# write the file back so the IDs are already present on the next load.
if 'WaDEUUID' not in dfsg.columns:
    dfsg['WaDEUUID'] = "nebRG" + dfsg.index.astype(str)
    dfsg.to_csv(sgInput, index=False)

dfsg.head(1)

In [None]:
# Update StationNumber values: each must have 8 digits, padded with leading 0s

def updateStationNumber(x):
    """Return the station number as a string left-padded with zeros to 8 characters.

    The value is converted with ``str`` and surrounding whitespace is removed
    before padding. Values that are already 8 characters or longer are
    returned unchanged (apart from the strip), and an empty value stays empty.

    Parameters
    ----------
    x : any
        Raw station number (for example an int or a string read from a CSV).

    Returns
    -------
    str
        The stripped value, zero-padded on the left to a width of 8.
    """
    x = str(x).strip()
    if not x:
        # Nothing to pad: do not invent an all-zero station ID.
        return x
    # rjust covers every short length (1-7), not just the 4-7 cases.
    return x.rjust(8, "0")

# Pad the station IDs column-wise. Only the StationNumber column is needed, so
# apply the helper to that Series directly instead of materialising every row
# of the frame (axis=1), which is slower and yields a DataFrame on an empty frame.
dfsg['StationNumber'] = dfsg['StationNumber'].apply(updateStationNumber)
dfsg.head()

In [None]:
# Collect the StationNumber column into a plain Python list
streamgageIdList = dfsg['StationNumber'].to_list()
print(len(streamgageIdList))
streamgageIdList

## Timeseries Data

### DailyMeanByYear

In [None]:
# get timeseries without using the year list

# %%time
# # get timeseries results
# # use StationNumber in url

# # create empty dataframe
# dfts = pd.DataFrame()

# sglength = len(streamgageIdList)
# for i in range(sglength):
#     serviceStr = "DailyMeanByYear" # change here
#     url = "https://nednr.nebraska.gov/IwipApi/api/v1/StreamGage/" + serviceStr + "?StationNumber=" + str(streamgageIdList[i])
#     try:
#         responseD = json.loads(requests.get(url).text)
#         DtL = responseD['Results']

#         # store in dataframe
#         dftemp = pd.DataFrame()
#         length = len(DtL)
#         for x in range(length):
#             row = pd.DataFrame([DtL[x]])
#             row['url'] = url
#             row['service'] = serviceStr
#             dftemp = pd.concat([dftemp, row])

#         dfts = pd.concat([dfts, dftemp])
    
#     except:
#         dftemp = pd.DataFrame()
#         dftemp['url'] = url
#         dfts = pd.concat([dfts, dftemp])
#         print("Error, issue with API return.")

# dfts.to_csv('DailyMeanByYear.csv', index=False)  # The output.
# print(len(dfts))
# dfts.head()

In [None]:
# Year list: every year from 2000 through 2022, as strings
yearList = [str(year) for year in range(2000, 2023)]

In [None]:
%%time
# get timeseries results
# use StationNumber in url
#
# One request is made per (station, year) pair. Each successful response's
# 'Results' list becomes a DataFrame tagged with the request url and service
# name. The per-request frames are collected in a list and concatenated once
# at the end, which avoids re-copying the growing table on every row.
# A failed request is reported and contributes no rows to dfts.

serviceStr = "DailyMeanByYear"  # change here
baseUrl = "https://nednr.nebraska.gov/IwipApi/api/v1/StreamGage/"
requestTimeout = 120  # seconds; keeps one stalled request from hanging the whole run

frames = []
for stationId in streamgageIdList:
    for year in yearList:
        url = baseUrl + serviceStr + "?StationNumber=" + str(stationId) + "&MeanYear=" + str(year)
        print(url)
        try:
            response = requests.get(url, timeout=requestTimeout)
            response.raise_for_status()
            records = response.json()['Results']
            dftemp = pd.DataFrame(records)
        except (requests.RequestException, ValueError, KeyError, TypeError) as err:
            # Network/HTTP failure, non-JSON body, or a payload without a usable
            # 'Results' entry: report it and move on to the next request.
            print("Error, issue with API return.", err)
            continue

        if len(dftemp.index) == 0:
            # No records for this station/year.
            continue

        dftemp['url'] = url
        dftemp['service'] = serviceStr
        frames.append(dftemp)

# create the combined dataframe (empty when nothing was retrieved)
dfts = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(len(dfts))
dfts.head()

In [None]:
# Update StationNumber values: each must have 8 digits, padded with leading 0s

def updateStationNumber(x):
    """Return the station number as a string left-padded with zeros to 8 characters.

    The value is converted with ``str`` and surrounding whitespace is removed
    before padding. Values that are already 8 characters or longer are
    returned unchanged (apart from the strip), and an empty value stays empty.

    Parameters
    ----------
    x : any
        Raw station number (for example an int or a string from the API).

    Returns
    -------
    str
        The stripped value, zero-padded on the left to a width of 8.
    """
    x = str(x).strip()
    if not x:
        # Nothing to pad: do not invent an all-zero station ID.
        return x
    # rjust covers every short length (1-7), not just the 4-7 cases.
    return x.rjust(8, "0")

# Pad the station IDs column-wise. Only the StationNumber column is needed, so
# apply the helper to that Series directly instead of materialising every row
# of the timeseries frame (axis=1), which is slower and yields a DataFrame on
# an empty frame.
dfts['StationNumber'] = dfts['StationNumber'].apply(updateStationNumber)
dfts.head()

## Output Dataframe

In [None]:
# Merging dataframes into one, using left-join.
# Keep every timeseries row and attach the matching station attributes
df = dfts.merge(dfsg, how='left', on='StationNumber')
print(len(df))
df.head(1)

In [None]:
# output dataframe
# Each entry is (output column, value): either a column taken from the merged
# frame or a constant applied to every row. Order here is the output column order.
outputFields = [
    # Site Info
    ('in_Latitude', df['Latitude']),
    ('in_Longitude', df['Longitude']),
    ('in_PODorPOUSite', "Observation Site"),
    ('in_SiteName', df['StationName']),
    ('in_SiteNativeID', df['StationNumber']),
    ('in_SiteTypeCV', df['StationTypeDescription']),
    # Site VariableAmounts Info
    ('in_Amount', df['Value']),
    ('in_BeneficialUseCategory', "Unspecified"),
    ('in_ReportYearCV', df['Date']),
    ('in_TimeframeEnd', df['Date']),
    ('in_TimeframeStart', df['Date']),
]

dfout = pd.DataFrame(index=df.index)
for outName, outValue in outputFields:
    dfout[outName] = outValue

print(len(dfout))
dfout.head(1)

## Cleaning Output

In [None]:
# Print the column summary for the output frame, then preview its first row
dfout.info()
dfout.head(1)

In [None]:
# Convert the timeframe columns to date-only values.
# Each column is parsed as UTC, formatted to a date-only string, and parsed
# again, which drops the time-of-day and the timezone.
for timeframeCol in ('in_TimeframeEnd', 'in_TimeframeStart'):
    utcStamps = pd.to_datetime(dfout[timeframeCol], utc=True)
    dfout[timeframeCol] = pd.to_datetime(utcStamps.dt.strftime('%m/%d/%Y'))

dfout.head(1)

In [None]:
# extract year out
# Same UTC parse and date-only round trip as the timeframe columns, after
# which only the calendar year is kept.
reportDates = pd.to_datetime(dfout['in_ReportYearCV'], utc=True)
reportDates = pd.to_datetime(reportDates.dt.strftime('%m/%d/%Y'))
dfout['in_ReportYearCV'] = reportDates.dt.year
dfout.head(1)

In [None]:
# in_Latitude & in_Longitude
# Coerce to numbers; anything unparseable or missing becomes 0.
for coordCol in ('in_Latitude', 'in_Longitude'):
    coordValues = pd.to_numeric(dfout[coordCol], errors='coerce')
    dfout[coordCol] = coordValues.fillna(0)
dfout.head(1)

## Export Outputs

In [None]:
# Print the dtype of every output column, with pandas' row and column display
# limits lifted for this statement only.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfout.dtypes)

In [None]:
# Exporting to Finished File
# The path is relative, so the file lands in the current working directory;
# the DataFrame index is not written.
dfout.to_csv('P_neSSRGMain.csv', index=False)  # The output