# Preprocessing Texas Reservoir and Observation Site data for WaDE

In [1]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [2]:
# Working Directory
# The shared-drive location is only the default: set the WADE_TX_SSRO_DIR
# environment variable to run the notebook against a copy of the raw input
# data stored somewhere else, without editing this cell.
workingDir = os.environ.get(
    "WADE_TX_SSRO_DIR",
    "G:/Shared drives/WaDE Data/Texas/SS_ReservoirsObservationSites/RawInputData",
)
os.chdir(workingDir)

## Site Data
- use csv file

In [3]:
# Load the site table from recent-conditions.csv in the working directory.
fileInput = "recent-conditions.csv"
dfs = pd.read_csv(filepath_or_buffer=fileInput)
print(dfs.shape[0])  # number of site rows
dfs.head(n=1)

122


Unnamed: 0,condensed_name,short_name,full_name,timestamp,tags,gauge_location,Longitude,Latitude,volume,elevation,area,percent_full,conservation_capacity,conservation_storage,conservation_pool_elevation,dead_pool_elevation,volume_under_conservation_pool_elevation
0,Abilene,Abilene,Lake Abilene,12/19/2022,"['climate_low_rolling_plains', 'monitored', 'm...","OrderedDict([('type', 'Point'), ('coordinates'...",-99.888977,32.234577,2827.0,2001.52,326.6,35.8,7900.0,2827.0,2012.3,1968.8,7900.0


In [4]:
# Create a unique, URL-friendly site name to use for the API service.
# Surrounding whitespace is stripped BEFORE spaces become hyphens: stripping
# afterwards is a no-op, because any padding has already been turned into "-"
# and would leave names such as "-abilene-" in the request URL.
dfs['apiSiteName'] = (
    dfs['short_name']
    .str.strip()
    .str.lower()
    .str.replace(' ', '-', regex=False)  # literal replacement, not a regex
)
dfs.head(3)

Unnamed: 0,condensed_name,short_name,full_name,timestamp,tags,gauge_location,Longitude,Latitude,volume,elevation,area,percent_full,conservation_capacity,conservation_storage,conservation_pool_elevation,dead_pool_elevation,volume_under_conservation_pool_elevation,apiSiteName
0,Abilene,Abilene,Lake Abilene,12/19/2022,"['climate_low_rolling_plains', 'monitored', 'm...","OrderedDict([('type', 'Point'), ('coordinates'...",-99.888977,32.234577,2827.0,2001.52,326.6,35.8,7900.0,2827.0,2012.3,1968.8,7900.0,abilene
1,Addicks,Addicks,Addicks Reservoir,12/19/2022,"['climate_upper_coast', 'monitored', 'municipa...","OrderedDict([('type', 'Point'), ('coordinates'...",-95.623558,29.79134,,76.43,,,,,67.5,67.5,,addicks
2,AlanHenry,Alan Henry,Lake Alan Henry,12/19/2022,"['climate_low_rolling_plains', 'monitored', 'm...","OrderedDict([('type', 'Point'), ('coordinates'...",-101.04763,33.062878,72120.0,2210.27,2199.78,75.0,96207.0,72120.0,2220.0,0.0,96206.79587,alan-henry


## Timeseries Data
- use API

In [5]:
# Collect the apiSiteName column into a plain Python list.
apiSiteNameList = dfs['apiSiteName'].to_list()
print(len(apiSiteNameList))
apiSiteNameList

122


['abilene',
 'addicks',
 'alan-henry',
 'amistad',
 'amon-g-carter',
 'aquilla',
 'arlington',
 'arrowhead',
 'athens',
 'austin',
 'b-a-steinhagen',
 'bardwell',
 'barker',
 'belton',
 'benbrook',
 'bob-sandlin',
 "bois-d'arc",
 'bonham',
 'brady-creek',
 'bridgeport',
 'brownwood',
 'buchanan',
 'caddo',
 'canyon',
 'cedar-creek',
 'champion-creek',
 'cherokee',
 'choke-canyon',
 'cisco',
 'coleman',
 'coleto-creek',
 'colorado-city',
 'conroe',
 'corpus-christi',
 'crook',
 'cypress-springs',
 'e-v-spence',
 'eagle-mountain',
 'elephant-butte',
 'falcon',
 'fork',
 'fort-phantom-hill',
 'georgetown',
 'gibbons-creek',
 'graham',
 'granbury',
 'granger',
 'grapevine',
 'greenbelt',
 'halbert',
 'hords-creek',
 'houston',
 'houston-county',
 'hubbard-creek',
 'hubert-h-moss',
 'inks',
 'j-b-thomas',
 'jacksonville',
 'jim-chapman',
 'joe-pool',
 'kemp',
 'kickapoo',
 "lake-o'-the-pines",
 'lavon',
 'leon',
 'lewisville',
 'limestone',
 'livingston',
 'lost-creek',
 'lyndon-b-johnson',

In [6]:
# Already run once; the results are cached in timeSeriesData.zip, which the next cell loads.

# %%time
# # get timeseries results. this took about 23 minutes.
# # use apiSiteName in url

# # issue with SSL verification for this TX data. Use this to ignore
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context

# # create empty dataframe
# columnsList = ["date",
#                "water_level",
#                "surface_area",
#                "reservoir_storage",
#                "conservation_storage",
#                "percent_full",
#                "conservation_capacity",
#                "dead_pool_capacity",
#                "apiSiteName"]
# dfts = pd.DataFrame(columns=columnsList)

# sglength = len(apiSiteNameList)
# for i in range(sglength):
#     fileInputURL = "https://www.waterdatafortexas.org/reservoirs/individual/" + str(apiSiteNameList[i]) + ".csv"
#     print(fileInputURL)
#     try:
#         dftemp = pd.read_csv(fileInputURL, comment="#", skip_blank_lines=True) # skip comment lines with a #
#         dftemp['apiSiteName'] = str(apiSiteNameList[i])
#         dftemp.columns = dfts.columns
#         dfts = pd.concat([dfts, dftemp])   
#     except:
#         dftemp = pd.DataFrame()
#         dftemp['apiSiteName'] = str(apiSiteNameList[i])
#         dfts = pd.concat([dfts, dftemp], ignore_index=False, sort=False) 
#         print("Error, issue with API return.")

# dfts.to_csv('timeSeriesData.zip', index=False, compression="zip")  # The output, save as a zip
# print(len(dfts))
# dfts.head()

In [7]:
# Input File - timeSeriesData.zip
dfts = pd.read_csv('timeSeriesData.zip', compression='zip')

# WaDE UUID tracker for data assessment.
# Assigned once from the row position and written back to the zip file, so
# later runs find the column already present and skip this branch.
if 'WaDEUUID' not in dfts.columns:
    dfts['WaDEUUID'] = "txssro" + dfts.index.astype(str)
    # State the compression explicitly, matching how the file is read above,
    # instead of relying on pandas inferring it from the file extension.
    dfts.to_csv('timeSeriesData.zip', index=False, compression='zip')

print(len(dfts))
dfts.head(1)

2011670


Unnamed: 0,date,water_level,surface_area,reservoir_storage,conservation_storage,percent_full,conservation_capacity,dead_pool_capacity,apiSiteName,WaDEUUID
0,1999-03-05,1999.87,280.1,2314.0,2314.0,29.3,7900.0,0,abilene,txssro0


## Output Dataframe

In [8]:
 #Merging dataframes into one, using left-join.
# validate='many_to_one' makes pandas raise a MergeError if apiSiteName is ever
# duplicated in the site table, instead of silently multiplying time-series rows.
# Columns present in both frames come back with pandas' default suffixes:
# _x for the time-series values and _y for the site-table values.
dfts = pd.merge(dfts, dfs, on='apiSiteName', how='left', validate='many_to_one')
print(len(dfts))
dfts.head(1)

2011670


Unnamed: 0,date,water_level,surface_area,reservoir_storage,conservation_storage_x,percent_full_x,conservation_capacity_x,dead_pool_capacity,apiSiteName,WaDEUUID,condensed_name,short_name,full_name,timestamp,tags,gauge_location,Longitude,Latitude,volume,elevation,area,percent_full_y,conservation_capacity_y,conservation_storage_y,conservation_pool_elevation,dead_pool_elevation,volume_under_conservation_pool_elevation
0,1999-03-05,1999.87,280.1,2314.0,2314.0,29.3,7900.0,0,abilene,txssro0,Abilene,Abilene,Lake Abilene,12/19/2022,"['climate_low_rolling_plains', 'monitored', 'm...","OrderedDict([('type', 'Point'), ('coordinates'...",-99.888977,32.234577,2827.0,2001.52,326.6,35.8,7900.0,2827.0,2012.3,1968.8,7900.0


#### water_level

In [9]:
# Output dataframe for the water-level variable: one output row per row of dfts,
# sharing dfts' index. Constant strings are broadcast to every row.
dfwl = pd.DataFrame(
    {
        # data assessment
        'WaDEUUID': dfts['WaDEUUID'],

        # variable info
        'in_VariableCV': 'Reservoir Level',  # change here

        # Site Info
        'in_Latitude': dfts['Latitude'],
        'in_Longitude': dfts['Longitude'],
        'in_PODorPOUSite': "Observation Site",
        'in_SiteName': dfts['short_name'],
        'in_SiteTypeCV': 'Reservoir',

        # Site VariableAmounts Info
        'in_Amount': dfts['water_level'],  # change here
        'in_BeneficialUseCategory': "Storage",
        'in_ReportYearCV': dfts['date'],
        'in_TimeframeEnd': dfts['date'],
        'in_TimeframeStart': dfts['date'],
    },
    index=dfts.index,
)

print(dfwl.shape[0])
dfwl.head(1)

2011670


Unnamed: 0,WaDEUUID,in_VariableCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart
0,txssro0,Reservoir Level,32.234577,-99.888977,Observation Site,Abilene,Reservoir,1999.87,Storage,1999-03-05,1999-03-05,1999-03-05


#### reservoir_storage

In [10]:
# Output dataframe for the reservoir-storage variable: one output row per row of
# dfts, sharing dfts' index. Constant strings are broadcast to every row.
dfrs = pd.DataFrame(
    {
        # data assessment
        'WaDEUUID': dfts['WaDEUUID'],

        # variable info
        'in_VariableCV': 'Reservoir Storage',  # change here

        # Site Info
        'in_Latitude': dfts['Latitude'],
        'in_Longitude': dfts['Longitude'],
        'in_PODorPOUSite': "Observation Site",
        'in_SiteName': dfts['short_name'],
        'in_SiteTypeCV': 'Reservoir',

        # Site VariableAmounts Info
        'in_Amount': dfts['reservoir_storage'],  # change here
        'in_BeneficialUseCategory': "Storage",
        'in_ReportYearCV': dfts['date'],
        'in_TimeframeEnd': dfts['date'],
        'in_TimeframeStart': dfts['date'],
    },
    index=dfts.index,
)

print(dfrs.shape[0])
dfrs.head(1)

2011670


Unnamed: 0,WaDEUUID,in_VariableCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart
0,txssro0,Reservoir Storage,32.234577,-99.888977,Observation Site,Abilene,Reservoir,2314.0,Storage,1999-03-05,1999-03-05,1999-03-05


#### Concatenate

In [11]:
# Stack the level and storage records, drop exact duplicate rows,
# and renumber the index from 0.
frames = [dfwl, dfrs]
dfout = (
    pd.concat(frames)
    .drop_duplicates()
    .reset_index(drop=True)
)
print(dfout.shape[0])
dfout.head(1)

4023340


Unnamed: 0,WaDEUUID,in_VariableCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart
0,txssro0,Reservoir Level,32.234577,-99.888977,Observation Site,Abilene,Reservoir,1999.87,Storage,1999-03-05,1999-03-05,1999-03-05


In [12]:
%%time
# Creating WaDE Custom site native ID for easy site identification
# ----------------------------------------------------------------------------------------------------

# Create temp SiteNativeID dataframe of unique site.
def assignSiteUUID(colrowValue):
    """Return the WaDE custom site native ID for a counter value.

    The value is converted with str() and appended to the "WaDETX_S" prefix,
    e.g. 1 -> "WaDETX_S1".
    """
    return "WaDETX_S" + str(colrowValue)

# One row per distinct site name, keeping the index label of its first
# occurrence in dfout.
dfSiteNativeID = dfout[['in_SiteName']].drop_duplicates()

# Number the distinct sites 1..N in order of first appearance, then turn each
# number into a site native ID.
dftemp = pd.DataFrame(
    {"Count": range(1, len(dfSiteNativeID.index) + 1)},
    index=dfSiteNativeID.index,
)
dfSiteNativeID['in_SiteNativeID'] = dftemp["Count"].map(assignSiteUUID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveSiteNativeID(val):
    """Look up the WaDE custom site native ID for a site name.

    Returns "" when val is an empty string or null, or when the name has no
    row in dfSiteNativeID; otherwise returns the in_SiteNativeID of the first
    matching row.
    """
    # Nothing to look up for blank or missing names.
    if val == "" or pd.isnull(val):
        return ""

    matches = dfSiteNativeID.loc[dfSiteNativeID['in_SiteName'] == val, 'in_SiteNativeID']
    return "" if matches.empty else matches.iloc[0]

# Resolve each distinct site name once, then broadcast the result with a
# vectorised map. The previous row-wise apply ran the masked lookup inside
# retrieveSiteNativeID separately for every row of dfout.
siteNativeIDByName = {
    siteName: retrieveSiteNativeID(siteName)
    for siteName in dfout['in_SiteName'].dropna().unique()
}
# Null names are not in the lookup and come back from map() as NaN; fillna("")
# gives them the same "" that retrieveSiteNativeID returns for a null name.
dfout['in_SiteNativeID'] = dfout['in_SiteName'].map(siteNativeIDByName).fillna("")
dfout

Unnamed: 0,WaDEUUID,in_VariableCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_SiteNativeID
0,txssro0,Reservoir Level,32.234577,-99.888977,Observation Site,Abilene,Reservoir,1999.87,Storage,1999-03-05,1999-03-05,1999-03-05,WaDETX_S1
1,txssro1,Reservoir Level,32.234577,-99.888977,Observation Site,Abilene,Reservoir,1999.85,Storage,1999-03-06,1999-03-06,1999-03-06,WaDETX_S1
2,txssro2,Reservoir Level,32.234577,-99.888977,Observation Site,Abilene,Reservoir,1999.82,Storage,1999-03-07,1999-03-07,1999-03-07,WaDETX_S1
3,txssro3,Reservoir Level,32.234577,-99.888977,Observation Site,Abilene,Reservoir,1999.82,Storage,1999-03-08,1999-03-08,1999-03-08,WaDETX_S1
4,txssro4,Reservoir Level,32.234577,-99.888977,Observation Site,Abilene,Reservoir,1999.83,Storage,1999-03-09,1999-03-09,1999-03-09,WaDETX_S1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4023335,txssro2011665,Reservoir Storage,33.304569,-94.160744,Observation Site,Wright Patman,Reservoir,369555.00,Storage,2022-12-16,2022-12-16,2022-12-16,WaDETX_S122
4023336,txssro2011666,Reservoir Storage,33.304569,-94.160744,Observation Site,Wright Patman,Reservoir,362515.00,Storage,2022-12-17,2022-12-17,2022-12-17,WaDETX_S122
4023337,txssro2011667,Reservoir Storage,33.304569,-94.160744,Observation Site,Wright Patman,Reservoir,354452.00,Storage,2022-12-18,2022-12-18,2022-12-18,WaDETX_S122
4023338,txssro2011668,Reservoir Storage,33.304569,-94.160744,Observation Site,Wright Patman,Reservoir,347564.00,Storage,2022-12-19,2022-12-19,2022-12-19,WaDETX_S122


## Cleaning Output

In [13]:
# Review column dtypes and memory use before cleaning.
dfout.info()
dfout.head(n=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4023340 entries, 0 to 4023339
Data columns (total 13 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   WaDEUUID                  object 
 1   in_VariableCV             object 
 2   in_Latitude               float64
 3   in_Longitude              float64
 4   in_PODorPOUSite           object 
 5   in_SiteName               object 
 6   in_SiteTypeCV             object 
 7   in_Amount                 float64
 8   in_BeneficialUseCategory  object 
 9   in_ReportYearCV           object 
 10  in_TimeframeEnd           object 
 11  in_TimeframeStart         object 
 12  in_SiteNativeID           object 
dtypes: float64(3), object(10)
memory usage: 399.0+ MB


Unnamed: 0,WaDEUUID,in_VariableCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_SiteNativeID
0,txssro0,Reservoir Level,32.234577,-99.888977,Observation Site,Abilene,Reservoir,1999.87,Storage,1999-03-05,1999-03-05,1999-03-05,WaDETX_S1


In [14]:
# Convert the timeframe columns to timezone-naive dates (YYYY-MM-DD at midnight).
# The values are parsed as UTC, then the timezone is dropped and the time of day
# truncated directly on the datetime values, rather than formatting every row to
# text with strftime and parsing that text a second time.
for timeframeCol in ('in_TimeframeEnd', 'in_TimeframeStart'):
    dfout[timeframeCol] = (
        pd.to_datetime(dfout[timeframeCol], utc=True)
        .dt.tz_localize(None)   # remove the UTC marker, keep the UTC wall time
        .dt.normalize()         # set the time of day to 00:00:00
    )

dfout.head(1)

Unnamed: 0,WaDEUUID,in_VariableCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_SiteNativeID
0,txssro0,Reservoir Level,32.234577,-99.888977,Observation Site,Abilene,Reservoir,1999.87,Storage,1999-03-05,1999-03-05,1999-03-05,WaDETX_S1


In [15]:
# Extract the report year.
# .dt.year is read straight from the UTC-parsed dates; formatting the dates to
# text and parsing them again first does not change the year.
# NOTE: this overwrites the date strings with year integers, so run the cell
# once per freshly built dfout - re-running it on its own output would parse
# those integers as timestamps rather than as years.
dfout['in_ReportYearCV'] = pd.to_datetime(dfout['in_ReportYearCV'], utc=True).dt.year
dfout.head(1)

Unnamed: 0,WaDEUUID,in_VariableCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_SiteNativeID
0,txssro0,Reservoir Level,32.234577,-99.888977,Observation Site,Abilene,Reservoir,1999.87,Storage,1999,1999-03-05,1999-03-05,WaDETX_S1


In [16]:
# in_Latitude & in_Longitude: coerce to numbers; values that cannot be parsed
# or are missing become 0.
for coordCol in ('in_Latitude', 'in_Longitude'):
    numericCoord = pd.to_numeric(dfout[coordCol], errors='coerce')
    dfout[coordCol] = numericCoord.fillna(0)
dfout.head(1)

Unnamed: 0,WaDEUUID,in_VariableCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_SiteNativeID
0,txssro0,Reservoir Level,32.234577,-99.888977,Observation Site,Abilene,Reservoir,1999.87,Storage,1999,1999-03-05,1999-03-05,WaDETX_S1


## Export Outputs

In [17]:
# Print the dtype of every column with the row/column display limits lifted.
displayAll = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*displayAll):
    print(dfout.dtypes)

WaDEUUID                            object
in_VariableCV                       object
in_Latitude                        float64
in_Longitude                       float64
in_PODorPOUSite                     object
in_SiteName                         object
in_SiteTypeCV                       object
in_Amount                          float64
in_BeneficialUseCategory            object
in_ReportYearCV                      int64
in_TimeframeEnd             datetime64[ns]
in_TimeframeStart           datetime64[ns]
in_SiteNativeID                     object
dtype: object


In [19]:
# Export the finished file as a zip-compressed CSV
# (the plain-CSV variant is kept commented out below).
#dfout.to_csv('P_txSSROMain.csv', index=False)  # The output
outputFile = 'P_txSSROMain.zip'
dfout.to_csv(outputFile, compression="zip", index=False)  # The output, save as a zip