# Preprocessing Oregon Reservoir and Gage data for WaDE

In [None]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [None]:
# Working Directory
# Switch into the raw-input folder so the relative file names used by the
# read/write calls in this notebook resolve against it.
workingDir = "G:/Shared drives/WaDE Data/Oregon/SS_ReservoirsGages/RawInputData"
os.chdir(workingDir)

## Site Data

In [None]:
# Input File - OWRD_gages.csv
sgInput = "OWRD_gages.csv"
dfsg = pd.read_csv(sgInput)

# WaDE UUID tracker for data assessment.
# The IDs are built from the row position, so they are written back to the
# input file the first time and picked up from the file on later runs.
if 'WaDEUUID' not in dfsg.columns:
    dfsg['WaDEUUID'] = "orSRG" + dfsg.index.astype(str)
    # Write to the same path that was read, rather than repeating the file name.
    dfsg.to_csv(sgInput, index=False)

print(len(dfsg))
dfsg.head(1)

In [None]:
# Discard gages that are missing either of the two period-of-record dates.

hasBothDates = dfsg[['period_of_', 'period_of1']].notna().all(axis=1)
dfsg = dfsg[hasBothDates].reset_index(drop=True)
print(len(dfsg))
dfsg.head(1)

In [None]:
# Restrict to stations flagged 'Active', then remove exact duplicate rows.

isActive = dfsg['station__1'].eq('Active')
dfsg = dfsg.loc[isActive].drop_duplicates().reset_index(drop=True)
print(len(dfsg))
dfsg.head(1)

In [None]:
# Normalise the period-of-record start and end columns to timezone-naive
# timestamps at midnight: values are parsed as UTC, the timezone is dropped and
# the time of day is discarded, leaving only the (UTC) calendar date.
# This replaces a datetime -> 'MM/DD/YYYY' string -> datetime round trip, which
# produced the same values but re-parsed the strings without an explicit format.

for dateCol in ('period_of_', 'period_of1'):
    utcStamps = pd.to_datetime(dfsg[dateCol], utc=True)
    dfsg[dateCol] = utcStamps.dt.tz_localize(None).dt.normalize()

dfsg.head(1)

In [None]:
# Station numbers as a plain Python list, one entry per row of dfsg.
streamgageIdList = dfsg['station_nb'].to_list()
print(len(streamgageIdList))
streamgageIdList

In [None]:
# Period start dates rendered as strings, one entry per row of dfsg.
startDateList = dfsg['period_of_'].astype(str).to_list()
print(len(startDateList))
startDateList

In [None]:
# Period end dates rendered as strings, one entry per row of dfsg.
endDateList = dfsg['period_of1'].astype(str).to_list()
print(len(endDateList))
endDateList

## Timeseries Data

In [None]:
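# Already done: the download loop below was run previously and its result saved
# (see the to_csv call at the end), so it is kept commented out for reference.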

# %%time
# # get timeseries results

# # create empty dataframe
# dfts = pd.DataFrame()

# sglength = len(streamgageIdList)
# for i in range(sglength):
#     url = "https://apps.wrd.state.or.us/apps/sw/hydro_near_real_time/hydro_download.aspx?station_nbr=" + str(streamgageIdList[i]) + "&start_date=" + str(startDateList[i]) + "&end_date=" + str(endDateList[i]) + "&dataset=MDF&format=html"
#     print(url)
#     try:
#         # store in dataframe
#         dftemp = pd.DataFrame()
#         dftemp = pd.read_csv(url, sep="\t")
#         dftemp['url'] = url
#         dfts = pd.concat([dfts, dftemp])
    
#     except:
#         dftemp = pd.DataFrame()
#         dftemp['url'] = url
#         dfts = pd.concat([dfts, dftemp])
#         print("Error, issue with API return.")

# dfts.to_csv('timeseriesData.csv', index=False)  # The output.
# print(len(dfts))
# dfts.head()

In [None]:
# Input File - timeseriesData.zip
# The timeseries table is stored as a zip archive (presumably the zipped
# timeseriesData.csv - TODO confirm); pandas reads the CSV inside it directly.
fileInput = "timeseriesData.zip"
dfts = pd.read_csv(fileInput, compression='zip')

print(len(dfts))
dfts.head(1)

In [None]:
# Keep rows whose status is 'Published', then remove exact duplicate rows.

isPublished = dfts['published_status'].eq('Published')
dfts = dfts.loc[isPublished].drop_duplicates().reset_index(drop=True)
print(len(dfts))
dfts.head(1)

## Output Dataframe

In [None]:
# Attach the site attributes to every timeseries row (left join on station number,
# so timeseries rows without a matching site are kept with empty site columns).
# NOTE(review): the '<pre>' prefix in the timeseries key column looks like a leftover
# from the HTML-formatted download - confirm against the raw file.
df = dfts.merge(
    dfsg,
    how='left',
    left_on='<pre>station_nbr',
    right_on='station_nb',
)
print(len(df))
df.head(1)

In [None]:
# Assemble the output frame in one step. Keys are the output column names, in
# output order; values are either a column of the merged frame or a constant
# that is repeated on every row.
dfout = pd.DataFrame(
    {
        # Water Source Info
        'in_WaterSourceTypeCV': df['streamfl_1'],

        # Site Info
        'in_County': df['county_nam'],
        'in_Latitude': df['latitude_d'],
        'in_Longitude': df['longitude_'],
        'in_PODorPOUSite': "Gage",
        'in_SiteName': df['station_na'],
        'in_SiteNativeID': df['station_nb'],
        'in_SiteTypeCV': df['source_t_1'],

        # Site VariableAmounts Info
        'in_Amount': df['mean_daily_flow_cfs'],
        'in_BeneficialUseCategory': "Unspecified",
        # The record date feeds all three time fields; they are reformatted
        # separately further down.
        'in_ReportYearCV': df['record_date'],
        'in_TimeframeEnd': df['record_date'],
        'in_TimeframeStart': df['record_date'],
    },
    index=df.index,
)

print(len(dfout))
dfout.head(1)

In [None]:
# Remove exact duplicate rows and renumber the index from 0.
dfout = dfout.drop_duplicates(ignore_index=True)
print(len(dfout))

## Cleaning Output

In [None]:
# Review the column dtypes and non-null counts, plus one sample row, before cleaning.
dfout.info()
dfout.head(1)

In [None]:
# Fixing empty water source types

def fixWaterSourceTypeCV(val):
    """Return the trimmed water source type, or "Unspecified" when it is empty.

    val: raw cell value of any type.
    Returns "Unspecified" for missing values (as recognised by pd.isnull, e.g.
    None or NaN), for blank / whitespace-only text and for the literal text
    "nan"; otherwise returns str(val) with surrounding whitespace removed.
    """
    # Test for missing values BEFORE converting to text: once stringified,
    # None would become the literal "None" and no longer be detected.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return "Unspecified"
    cleaned = str(val).strip()
    if cleaned in ("", "nan"):
        return "Unspecified"
    return cleaned

# Clean the column element by element. Series.map calls the function once per
# value without building a full row object for every record, which the
# row-wise DataFrame.apply did.
dfout['in_WaterSourceTypeCV'] = dfout['in_WaterSourceTypeCV'].map(fixWaterSourceTypeCV)
dfout['in_WaterSourceTypeCV'].unique()

In [None]:
# Fixing empty site type names

def fixSiteTypeCV(val):
    """Return the trimmed site type, or "Unspecified" when it is empty.

    val: raw cell value of any type.
    Returns "Unspecified" for missing values (as recognised by pd.isnull, e.g.
    None or NaN), for blank / whitespace-only text and for the literal text
    "nan"; otherwise returns str(val) with surrounding whitespace removed.
    """
    # Test for missing values BEFORE converting to text: once stringified,
    # None would become the literal "None" and no longer be detected.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return "Unspecified"
    cleaned = str(val).strip()
    if cleaned in ("", "nan"):
        return "Unspecified"
    return cleaned

# Clean the column element by element. Series.map calls the function once per
# value without building a full row object for every record, which the
# row-wise DataFrame.apply did.
dfout['in_SiteTypeCV'] = dfout['in_SiteTypeCV'].map(fixSiteTypeCV)
dfout['in_SiteTypeCV'].unique()

In [None]:
# Normalise the timeframe start/end columns to timezone-naive timestamps at
# midnight: values are parsed as UTC, the timezone is dropped and the time of
# day is discarded, leaving only the (UTC) calendar date.
# This replaces a datetime -> 'MM/DD/YYYY' string -> datetime round trip, which
# produced the same values but re-parsed the strings without an explicit format.
for timeCol in ('in_TimeframeEnd', 'in_TimeframeStart'):
    utcStamps = pd.to_datetime(dfout[timeCol], utc=True)
    dfout[timeCol] = utcStamps.dt.tz_localize(None).dt.normalize()

dfout.head(1)

In [None]:
# Reduce the record date to its calendar year (taken from the UTC timestamp).
# The year is read straight from the parsed datetime; formatting it to a
# string and parsing it again first does not change the year.
dfout['in_ReportYearCV'] = pd.to_datetime(dfout['in_ReportYearCV'], utc=True).dt.year
dfout.head(1)

## WaDE Custom Elements (due to missing info)

In [None]:
%%time
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Build a WaDE custom water source ID: "WaDEOR_WS" followed by str(colrowValue)."""
    return f"WaDEOR_WS{colrowValue!s}"

# Lookup table: one row per distinct water source type (first occurrence kept,
# with its dfout index label), numbered 1..N in order of first appearance.
dfWaterSourceNativeID = dfout[['in_WaterSourceTypeCV']].drop_duplicates().copy()
dfWaterSourceNativeID['in_WaterSourceNativeID'] = [
    assignWaterSourceNativeID(position)
    for position in range(1, len(dfWaterSourceNativeID) + 1)
]

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the WaDE custom native ID for water source type ``A``.

    Reads the notebook-level ``dfWaterSourceNativeID`` table and returns the
    'in_WaterSourceNativeID' value of the first row whose
    'in_WaterSourceTypeCV' equals ``A``, or '' when no row matches.
    """
    isMatch = dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A
    matches = dfWaterSourceNativeID.loc[isMatch, 'in_WaterSourceNativeID']
    if matches.empty:
        return ''
    return matches.iloc[0]

# Resolve the native ID once per distinct water source type and then map it
# onto the column, instead of filtering the lookup table again for every row.
wsTypeColumn = dfout['in_WaterSourceTypeCV']
wsNativeIdByType = {wsType: retrieveWaterSourceNativeID(wsType) for wsType in wsTypeColumn.unique()}
dfout['in_WaterSourceNativeID'] = wsTypeColumn.map(wsNativeIdByType)
dfout['in_WaterSourceNativeID'].unique()

## Export Outputs

In [None]:
# Print the dtype of every output column; the option context lifts pandas'
# row/column display limits for this one print so the listing is not truncated.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfout.dtypes)

In [None]:
# Display the finished frame for a visual check before exporting.
dfout

In [None]:
# Export the finished output as a plain CSV in the working directory (row index omitted).
dfout.to_csv('P_orSSRGMain.csv', index=False)  # The output

In [None]:
# Zipped copy of the output. index=False keeps its columns identical to
# P_orSSRGMain.csv; without it the row index is written as an extra unnamed
# first column.
# NOTE(review): the file inside the archive is named 'sample.csv' - confirm
# whether it should be 'P_orSSRGMain.csv' instead.
dfout.to_csv('P_orSSRGMain.zip', index=False, compression={'method': 'zip', 'archive_name': 'sample.csv'})

In [None]:
# Show the current working directory, i.e. where the relative output paths above were written.
print(os.getcwd())