# Preprocessing Texas Reservoir and Observation Site data for WaDE

In [None]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # raise the column display limit to 999 so wide DataFrames are not truncated in notebook output

In [None]:
# Working Directory
# The relative file names used below (input CSV/zip and the exported zip) resolve against this folder.
workingDir = "G:/Shared drives/WaDE Data/Texas/SS_ReservoirsObservationSites/RawInputData"
os.chdir(workingDir)

## Site Data
- use csv file

In [None]:
# Load the site table from the local CSV and show its row count plus a sample row.
fileInput = "recent-conditions.csv"
dfs = pd.read_csv(filepath_or_buffer=fileInput)
print(dfs.shape[0])
dfs.head(1)

In [None]:
# Create the unique site name (URL slug) used by the API service:
# trim, lowercase, then turn spaces into hyphens.
# Whitespace must be stripped BEFORE the replace; stripping afterwards is a no-op
# because any leading/trailing space has already become a '-'.
dfs['apiSiteName'] = (
    dfs['short_name']
    .str.strip()
    .str.lower()
    .str.replace(' ', '-', regex=False)  # literal space, not a regex pattern
)
dfs.head(3)

## Timeseries Data
- use API

In [None]:
# Collect every site slug into a plain Python list, report its size, and display it.
apiSiteNameList = dfs['apiSiteName'].to_list()
print(len(apiSiteNameList))
apiSiteNameList

In [None]:
# done already, use zip file

# %%time
# # get timeseries results. this took about 23 minutes.
# # use StationNumber in url

# # issue with SSL verification for this TX data. Use this to ignore
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context

# # create empty dataframe
# columnsList = ["date",
#                "water_level",
#                "surface_area",
#                "reservoir_storage",
#                "conservation_storage",
#                "percent_full",
#                "conservation_capacity",
#                "dead_pool_capacity",
#                "apiSiteName"]
# dfts = pd.DataFrame(columns=columnsList)

# sglength = len(apiSiteNameList)
# for i in range(sglength):
#     fileInputURL = "https://www.waterdatafortexas.org/reservoirs/individual/" + str(apiSiteNameList[i]) + ".csv"
#     print(fileInputURL)
#     try:
#         dftemp = pd.read_csv(fileInputURL, comment="#", skip_blank_lines=True) # skip comment lines with a #
#         dftemp['apiSiteName'] = str(apiSiteNameList[i])
#         dftemp.columns = dfts.columns
#         dfts = pd.concat([dfts, dftemp])   
#     except:
#         dftemp = pd.DataFrame()
#         dftemp['apiSiteName'] = str(apiSiteNameList[i])
#         dfts = pd.concat([dfts, dftemp], ignore_index=False, sort=False) 
#         print("Error, issue with API return.")

# dfts.to_csv('timeSeriesData.zip', index=False, compression="zip")  # The output, save as a zip
# print(len(dfts))
# dfts.head()

In [None]:
# Input File - timeSeriesData.zip (saved copy of the time series results)
dfts = pd.read_csv('timeSeriesData.zip', compression='zip')

# WaDE UUID tracker for data assessment.
# The column is added only when missing and then written back to the same file,
# so later runs find it already present and skip this step.
if 'WaDEUUID' not in dfts.columns:
    dfts['WaDEUUID'] = "txssro" + dfts.index.astype(str)
    # Write with explicit zip compression so the file always matches the
    # read_csv(..., compression='zip') call above instead of relying on
    # pandas inferring the compression from the file extension.
    dfts.to_csv('timeSeriesData.zip', index=False, compression="zip")

print(len(dfts))
dfts.head(1)

## Output Dataframe

In [None]:
 #Merging dataframes into one, using left-join.
# Every dfts row is kept; site columns are NaN where apiSiteName has no match in dfs.
dfts = dfts.merge(dfs, how='left', on='apiSiteName')
print(dfts.shape[0])
dfts.head(1)

#### water_level

In [None]:
# output dataframe: one row per dfts row, columns declared in output order
dfwl = pd.DataFrame(
    {
        # data assessment
        'WaDEUUID': dfts['WaDEUUID'],

        # variable info
        'in_VariableCV': 'Reservoir Level',  # change here

        # water source info
        'in_WaterSourceName': "WaDE Unspecified",
        'in_WaterSourceNativeID': "WaDEID_TXws1",
        'in_WaterSourceTypeCV': "Surface Water",

        # Site Info
        'in_CoordinateAccuracy': "WaDE Unspecified",
        'in_CoordinateMethodCV': "WaDE Unspecified",
        'in_County': "WaDE Unspecified",
        'in_HUC12': "WaDE Unspecified",
        'in_HUC8': "WaDE Unspecified",
        'in_Latitude': dfts['Latitude'],
        'in_Longitude': dfts['Longitude'],
        'in_PODorPOUSite': "Observation Site",
        'in_SiteNativeID': "",  # will fill in below
        'in_SiteName': dfts['short_name'],
        'in_SiteTypeCV': 'Reservoir',
        'in_StateCV': 'TX',

        # Site VariableAmounts Info
        'in_Amount': dfts['water_level'],  # change here
        'in_BeneficialUseCategory': "Storage",
        # the raw date feeds all three time columns; they are reformatted later
        'in_ReportYearCV': dfts['date'],
        'in_TimeframeEnd': dfts['date'],
        'in_TimeframeStart': dfts['date'],
    },
    index=dfts.index,
)

print(dfwl.shape[0])
dfwl.head(1)

#### reservoir_storage

In [None]:
# output dataframe: one row per dfts row, columns declared in output order
dfrs = pd.DataFrame(
    {
        # data assessment
        'WaDEUUID': dfts['WaDEUUID'],

        # variable info
        'in_VariableCV': 'Reservoir Storage',  # change here

        # water source info
        'in_WaterSourceName': "WaDE Unspecified",
        'in_WaterSourceNativeID': "WaDEID_TXws1",
        'in_WaterSourceTypeCV': "Surface Water",

        # Site Info
        'in_CoordinateAccuracy': "WaDE Unspecified",
        'in_CoordinateMethodCV': "WaDE Unspecified",
        'in_County': "WaDE Unspecified",
        'in_HUC12': "WaDE Unspecified",
        'in_HUC8': "WaDE Unspecified",
        'in_Latitude': dfts['Latitude'],
        'in_Longitude': dfts['Longitude'],
        'in_PODorPOUSite': "Observation Site",
        'in_SiteNativeID': "",  # will fill in below
        'in_SiteName': dfts['short_name'],
        'in_SiteTypeCV': 'Reservoir',
        'in_StateCV': 'TX',

        # Site VariableAmounts Info
        'in_Amount': dfts['reservoir_storage'],  # change here
        'in_BeneficialUseCategory': "Storage",
        # the raw date feeds all three time columns; they are reformatted later
        'in_ReportYearCV': dfts['date'],
        'in_TimeframeEnd': dfts['date'],
        'in_TimeframeStart': dfts['date'],
    },
    index=dfts.index,
)

print(dfrs.shape[0])
dfrs.head(1)

#### Concatenate

In [None]:
# Stack the per-variable frames, drop exact duplicate rows, then renumber the index from 0.
frames = [dfwl, dfrs]
dfout = pd.concat(frames)
dfout = dfout.drop_duplicates()
dfout = dfout.reset_index(drop=True)
print(dfout.shape[0])
dfout.head(1)

In [None]:
%%time
# Creating WaDE Custom site native ID for easy site identification
# ----------------------------------------------------------------------------------------------------

# Create temp SiteNativeID dataframe of unique site.
def assignSiteUUID(colrowValue):
    """Return the WaDE site native ID for a counter value.

    The value is converted with ``str`` and prefixed with ``"WaDETX_S"``.
    """
    return f"WaDETX_S{colrowValue!s}"

# Lookup table with one row per distinct site name (first occurrence kept,
# original row labels retained as the index).
dfSiteNativeID = pd.DataFrame({'in_SiteName': dfout['in_SiteName']}).drop_duplicates()

# Number the distinct sites 1..N in order of first appearance and convert each
# counter to a WaDE native ID. The list is assigned positionally, so no separate
# counter DataFrame or row-wise apply is needed.
dfSiteNativeID['in_SiteNativeID'] = [
    assignSiteUUID(siteNumber) for siteNumber in range(1, len(dfSiteNativeID) + 1)
]

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveSiteNativeID(val):
    """Look up the WaDE site native ID for a site name.

    Reads the module-level ``dfSiteNativeID`` table. Returns an empty string
    when ``val`` is an empty string or null, or when no row of the table has
    that ``in_SiteName``; otherwise returns the first matching
    ``in_SiteNativeID`` value.
    """
    # Nothing to look up for blank or missing names.
    if val == "" or pd.isnull(val):
        return ""

    nameMatches = dfSiteNativeID['in_SiteName'] == val
    matchedIDs = dfSiteNativeID.loc[nameMatches, 'in_SiteNativeID']
    return "" if matchedIDs.empty else matchedIDs.iloc[0]

# Attach each row's site native ID with one vectorized lookup instead of calling
# retrieveSiteNativeID once per row, which rescans dfSiteNativeID for every row.
siteIdBySiteName = (
    dfSiteNativeID
    .drop_duplicates(subset='in_SiteName')  # first match wins, and map() needs a unique index
    .set_index('in_SiteName')['in_SiteNativeID']
)
siteNames = dfout['in_SiteName']
# Same contract as retrieveSiteNativeID: blank, null, or unknown names get "".
hasSiteName = siteNames.notna() & (siteNames != "")
dfout['in_SiteNativeID'] = siteNames.map(siteIdBySiteName).where(hasSiteName, "").fillna("")
dfout['in_SiteNativeID'].unique()

## Cleaning Output

In [None]:
# Print the column summary (dtypes and non-null counts), then show a sample row.
dfout.info()
dfout.head(1)

In [None]:
# Normalize the timeframe columns to timezone-naive, midnight (date-only) timestamps.
# Parsing once as UTC, then dropping the timezone and the time of day, gives the same
# values as a datetime -> 'MM/DD/YYYY' string -> datetime round trip, without
# formatting and re-parsing every row.
for timeframeCol in ('in_TimeframeEnd', 'in_TimeframeStart'):
    parsedUTC = pd.to_datetime(dfout[timeframeCol], utc=True)
    dfout[timeframeCol] = parsedUTC.dt.tz_convert(None).dt.normalize()

dfout.head(1)

In [None]:
# Extract the report year.
# The year is read straight from the UTC-parsed timestamp; formatting it to a date
# string and re-parsing it first would not change the year.
dfout['in_ReportYearCV'] = pd.to_datetime(dfout['in_ReportYearCV'], utc=True).dt.year
dfout.head(1)

In [None]:
# in_Latitude & in_Longitude
# Force both coordinate columns to numeric; values that cannot be parsed become 0.
for coordCol in ('in_Latitude', 'in_Longitude'):
    numericCoords = pd.to_numeric(dfout[coordCol], errors='coerce')
    dfout[coordCol] = numericCoords.fillna(0)
dfout.head(1)

## Export Outputs

In [None]:
# Print every column's dtype; the row/column display limits are lifted only inside this block.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfout.dtypes)

In [None]:
# Export the finished DataFrame as a zip-compressed CSV, without the index column.
dfout.to_csv('P_txSSROMain.zip', index=False, compression="zip")  # The output, save as a zip