# Preprocessing Water Supply Site Time Series data for WaDE

In [None]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse
from bs4 import BeautifulSoup # text parser

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [None]:
# Working Directory
# Every relative path used below ("RawInputData/...") is resolved against this
# folder, because the process working directory is switched to it here.
workingDir = "G:/Shared drives/WaDE Data/WaDE Data Folder/California/WaterSupply_SiteSpecific"  # change here
os.chdir(workingDir)

## Input Files
- site & timeseries info for reservoirs
- site & timeseries info for streamgages
- site & timeseries info for active snow depth sensors

In [None]:
# Input File: Reservoirs
# Load the reservoir site table; missing cells are turned into empty strings.
fileInput = "RawInputData/Reservoirs.zip"
dfr = pd.read_csv(fileInput)
dfr = dfr.replace(np.nan, "")

# WaDE UUID tracker for data assessment
# When the column is absent, tag every row "in1<row index>" and write the
# tagged table back over the input archive.
if 'WaDEUUID' not in dfr.columns:
    dfr['WaDEUUID'] = "in1" + dfr.index.astype(str)
    dfr.to_csv(
        'RawInputData/Reservoirs.zip',
        compression={'method': 'zip', 'archive_name': 'Reservoirs.csv'},
        index=False,
    )

# Strip trailing whitespace from the column headers.
dfr.rename(columns=str.rstrip, inplace=True)
print(dfr.shape[0])
dfr.head(1)

In [None]:
# Input File: StreamGages shp file
# Load the stream gage shapefile; missing cells are turned into empty strings.
fileInput = "RawInputData/shapefiles/StreamGages.zip"
dfsg = gpd.read_file(fileInput)
dfsg = dfsg.replace(np.nan, "")

# WaDE UUID tracker for data assessment
# When the column is absent, tag every row "in2<row index>" and save a CSV copy.
# NOTE(review): the copy goes to RawInputData/StreamGages.zip, not back to the
# shapefile archive read above -- confirm that this is intended.
if 'WaDEUUID' not in dfsg.columns:
    dfsg['WaDEUUID'] = "in2" + dfsg.index.astype(str)
    dfsg.to_csv(
        'RawInputData/StreamGages.zip',
        compression={'method': 'zip', 'archive_name': 'StreamGages.csv'},
        index=False,
    )

print(dfsg.shape[0])
dfsg.head(1)

In [None]:
# we only want to work with stream gages whose sites are considered "Active"

# Keep rows whose 'sitestatus' is exactly 'Active', then renumber the index.
dfsg = dfsg.loc[dfsg['sitestatus'].eq('Active')].reset_index(drop=True)
print(dfsg.shape[0])

In [None]:
# StreamGage input data DOES contain some duplicate reservoirs, but does not clearly indicate which are reservoirs and which are stream gages
# if a site is already in the reservoir site data, remove it from the stream gages

# Distinct reservoir ids (order is irrelevant; only membership is tested).
dfr_idList = list(set(dfr['ID'].tolist()))

# Drop every stream gage row whose 'siteid' matches a reservoir 'ID'.
dfsg = dfsg.loc[~dfsg['siteid'].isin(dfr_idList)].reset_index(drop=True)
print(dfsg.shape[0])
dfsg.head(1)

In [None]:
# Input File: Active Snow Depth Sensors
# Load the snow depth sensor site table; missing cells become empty strings.
fileInput = "RawInputData/ActiveSnowDepthSesnsors.zip"
dfsd = pd.read_csv(fileInput)
dfsd = dfsd.replace(np.nan, "")

# WaDE UUID tracker for data assessment
# When the column is absent, tag every row "in3<row index>" and write the
# tagged table back over the input archive.
if 'WaDEUUID' not in dfsd.columns:
    dfsd['WaDEUUID'] = "in3" + dfsd.index.astype(str)
    dfsd.to_csv(
        'RawInputData/ActiveSnowDepthSesnsors.zip',
        compression={'method': 'zip', 'archive_name': 'ActiveSnowDepthSesnsors.csv'},
        index=False,
    )

# Strip trailing whitespace from the column headers.
dfsd.rename(columns=str.rstrip, inplace=True)
print(dfsd.shape[0])
dfsd.head(1)

In [None]:
# Input File: Reservoirs_timeseries

# Previously downloaded reservoir timeseries; missing cells become "".
fileInput = "RawInputData/Reservoirs_timeseries.zip"
dfr_ts = pd.read_csv(fileInput)
dfr_ts = dfr_ts.replace(np.nan, "")
print(dfr_ts.shape[0])
dfr_ts.head(1)

In [None]:
# Input File: StreamGages_timeseries

# Previously downloaded stream gage timeseries; missing cells become "".
fileInput = "RawInputData/StreamGages_timeseries.zip"
dfsg_ts = pd.read_csv(fileInput)
dfsg_ts = dfsg_ts.replace(np.nan, "")
print(dfsg_ts.shape[0])
dfsg_ts.head(1)

In [None]:
# Input File: ActiveSnowDepthSesnsors_timeseries

# Previously downloaded snow depth timeseries; missing cells become "".
fileInput = "RawInputData/ActiveSnowDepthSesnsors_timeseries.zip"
dfsd_ts = pd.read_csv(fileInput)
dfsd_ts = dfsd_ts.replace(np.nan, "")
print(dfsd_ts.shape[0])
dfsd_ts.head(1)

In [None]:
# inner-join reservoir data to timeseries

# Site 'ID' is matched to timeseries 'STATION_ID'; rows without a partner on
# either side are dropped by the inner join.
dfin1 = pd.merge(
    dfr,
    dfr_ts,
    how='inner',
    left_on='ID',
    right_on='STATION_ID',
)
dfin1 = dfin1.reset_index(drop=True)
print(dfin1.shape[0])
dfin1.head(1)

In [None]:
# inner-join streamgage data to timeseries

# Site 'siteid' is matched to timeseries 'STATION_ID'; rows without a partner
# on either side are dropped by the inner join.
dfin2 = pd.merge(
    dfsg,
    dfsg_ts,
    how='inner',
    left_on='siteid',
    right_on='STATION_ID',
)
dfin2 = dfin2.reset_index(drop=True)
print(dfin2.shape[0])
dfin2.head(1)

In [None]:
# inner-join snow depth data to timeseries

# Site 'ID' is matched to timeseries 'STATION_ID'; rows without a partner on
# either side are dropped by the inner join.
dfin3 = pd.merge(
    dfsd,
    dfsd_ts,
    how='inner',
    left_on='ID',
    right_on='STATION_ID',
)
dfin3 = dfin3.reset_index(drop=True)
print(dfin3.shape[0])
dfin3.head(1)

## Get metadata & timeseries data
- https://cdec.water.ca.gov/dynamicapp/staMeta
- This section is out of order. Essentially, the steps are: 1) use the site info to get site IDs; 2) use the site IDs with the metadata API to determine which timeseries are available; 3) retrieve timeseries data for the sites based on the available metadata.
- Metadata and timeseries data have already been retrieved; use the saved hard copies as inputs instead.

#### Reservoir

In [None]:
# already done

# %%time
# # get Reservoirs metadata

# tempList = dfr['ID'].tolist()
# dftemp = pd.DataFrame()

# for i in range(len(tempList)):
#     idString = str(tempList[i]).strip()   
#     url = "https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=" + idString
#     try:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.content, 'html.parser')
#         table = soup.find_all('table')
#         rawData = pd.read_html(str(table))[1]
#         rawData["ID"] = idString
#         dftemp = pd.concat([dftemp, rawData])
#     except:
#         print(f' did not work, {url}')

# dftemp.to_csv('RawInputData/Reservoirs_Metadata.zip', compression=dict(method='zip', archive_name='Reservoirs_Metadata.csv'), index=False)

# print(len(dftemp))
# dftemp.head()

In [None]:
# # Input File: Reservoirs_Metadata

# fileInput = "RawInputData/Reservoirs_Metadata.zip"
# dfr_m = pd.read_csv(fileInput).replace(np.nan, "")
# print(len(dfr_m))
# dfr_m.head(1)

In [None]:
# # Clean up reservoir metadata

# dfr_m = dfr_m.rename(columns={"0": "Sensor Description",  "1": "SensorNums", "2" : "Duration", "3" : "Plot", "4" : "Data Collection", "5" : "Data Available"})
# dfr_m[['Start Date', 'End Date']] = dfr_m['Data Available'].str.split('to', n=1, expand=True)
# dfr_m['Start Date'] = dfr_m['Start Date'].str.strip()
# dfr_m['End Date'] = dfr_m['End Date'].str.replace('present','01/01/2025').str.strip()
# dfr_m.head()

# dfr_m.head(1)

In [None]:
# # left-join by reservoir metadata to reservoir site data

# dfr = pd.merge(dfr, dfr_m, left_on='ID', right_on='ID', how='left')
# print(len(dfr))
# dfr.head(1)

#### StreamGages

In [None]:
# already done

# %%time
# # get StreamGages metadata

# tempList = dfsg['siteid'].unique().tolist()
# dftemp = pd.DataFrame()

# for i in range(len(tempList)):
#     idString = str(tempList[i]).strip()   
#     url = "https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=" + idString
#     try:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.content, 'html.parser')
#         table = soup.find_all('table')
#         rawData = pd.read_html(str(table))[1]
#         rawData["siteid"] = idString
#         dftemp = pd.concat([dftemp, rawData])
#     except:
#         print(f' did not work, {url}')

# dftemp.to_csv('RawInputData/StreamGages_Metadata.zip', compression=dict(method='zip', archive_name='StreamGages_Metadata.csv'), index=False)

# print(len(dftemp))
# dftemp.head()

In [None]:
# # Input File: StreamGages_Metadata

# fileInput = "RawInputData/StreamGages_Metadata.zip"
# dfsg_m = pd.read_csv(fileInput).replace(np.nan, "")
# print(len(dfsg_m))
# dfsg_m.head(1)

In [None]:
# # Clean up streamgage metadata

# dfsg_m = dfsg_m.rename(columns={"0": "Sensor Description",  "1": "SensorNums", "2" : "Duration", "3" : "Plot", "4" : "Data Collection", "5" : "Data Available"})
# dfsg_m[['Start Date', 'End Date']] = dfsg_m['Data Available'].str.split('to', n=1, expand=True)
# dfsg_m['Start Date'] = dfsg_m['Start Date'].str.strip()
# dfsg_m['End Date'] = dfsg_m['End Date'].str.replace('present','01/01/2025').str.strip()
# dfsg_m.head(1)

In [None]:
# # left-join by streamgage metadata to streamgage site data

# dfsg = pd.merge(dfsg, dfsg_m, left_on='siteid', right_on='siteid', how='left')
# print(len(dfsg))
# dfsg.head(1)

#### Active Snow Depth Sensors

In [None]:
# already done

# %%time
# # get Active Snow Depth Sensors metadata

# tempList = dfsd['ID'].tolist()
# dftemp = pd.DataFrame()

# for i in range(len(tempList)):
#     idString = str(tempList[i]).strip()   
#     url = "https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=" + idString
#     try:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.content, 'html.parser')
#         table = soup.find_all('table')
#         rawData = pd.read_html(str(table))[1]
#         rawData["ID"] = idString
#         dftemp = pd.concat([dftemp, rawData])
#     except:
#         print(f' did not work, {url}')

# dftemp.to_csv('RawInputData/ActiveSnowDepthSesnsors_Metadata.zip', compression=dict(method='zip', archive_name='ActiveSnowDepthSesnsors_Metadata.csv'), index=False)

# print(len(dftemp))
# dftemp.head()

In [None]:
# # Input File: ActiveSnowDepthSesnsors_Metadata

# fileInput = "RawInputData/ActiveSnowDepthSesnsors_Metadata.zip"
# dfsd_m = pd.read_csv(fileInput).replace(np.nan, "")
# print(len(dfsd_m))
# dfsd_m.head(1)

In [None]:
# # Clean up ActiveSnowDepthSesnsors_Metadata

# dfsd_m = dfsd_m.rename(columns={"0": "Sensor Description",  "1": "SensorNums", "2" : "Duration", "3" : "Plot", "4" : "Data Collection", "5" : "Data Available"})
# dfsd_m[['Start Date', 'End Date']] = dfsd_m['Data Available'].str.split('to', n=1, expand=True)
# dfsd_m['Start Date'] = dfsd_m['Start Date'].str.strip()
# dfsd_m['End Date'] = dfsd_m['End Date'].str.replace('present','01/01/2025').str.strip()
# dfsd_m.head(1)

In [None]:
# # left-join by ActiveSnowDepthSesnsors_Metadata to site data

# dfsd = pd.merge(dfsd, dfsd_m, left_on='ID', right_on='ID', how='left')
# print(len(dfsd))
# dfsd.head(1)

#### Get Timeseries

In [None]:
# # abbreviate Duration 

# durationDict = {
# "(daily)" : "d",
# "(monthly)" : "m",
# "(event)" : "e", 
# "(hourly)" : "h"
# }

# def CreateDurationAPIValueFunc(val):
#     val = str(val).strip()
#     try:
#         outString = durationDict[val]
#     except:
#         outString = ""
#     return outString

# dfr['Duration_abb'] = dfr.apply(lambda row: CreateDurationAPIValueFunc(row['Duration']), axis=1)
# dfsg['Duration_abb'] = dfsg.apply(lambda row: CreateDurationAPIValueFunc(row['Duration']), axis=1)
# dfsd['Duration_abb'] = dfsd.apply(lambda row: CreateDurationAPIValueFunc(row['Duration']), axis=1)

In [None]:
# # drop rows that do not contain monthly (m) or dailiy (d) data.

# timestep = ['m', 'd']
# dfr = dfr[dfr['Duration_abb'].isin(timestep)]
# dfsg = dfsg[dfsg['Duration_abb'].isin(timestep)]
# dfsd = dfsd[dfsd['Duration_abb'].isin(timestep)]

In [None]:
# # we only want the following Reservoir SesnorNums data...
# # RESERVOIR ELEVATION, FEET, 6, (daily)
# # RESERVOIR STORAGE, AF, 15, (daily), (monthly)
# # RESERVOIR OUTFLOW, CFS, 23, (daily)
# # RESERVOIR INFLOW, CFS, 76, (daily)

# dfr['SensorNums'] = dfr['SensorNums'].fillna(0).astype(np.int64).astype(str)
# dfr = dfr[dfr['SensorNums'].astype(str).isin(['6', '15', '23', '76'])]
# dfr['SensorNums'].value_counts()

In [None]:
# # we only want the following streamgage SesnorNums data...
# # FLOW, MEAN DAILY, CFS, 41, (daily)
# # FLOW, FULL NATURAL, AF, 65, (monthly)
# # FLOW, MONTHLY VOLUME, AF, 66, (monthly)

# dfsg['SensorNums'] = dfsg['SensorNums'].fillna(0).astype(np.int64).astype(str)
# dfsg = dfsg[dfsg['SensorNums'].astype(str).isin(['41', '65', '66'])]
# dfsg['SensorNums'].value_counts()

In [None]:
# # we only want the following snow depth SesnorNums data...
# # SNOW DEPTH, INCHES, 18, (daily)

# dfsd['SensorNums'] = dfsd['SensorNums'].fillna(0).astype(np.int64).astype(str)
# dfsd = dfsd[dfsd['SensorNums'].astype(str).isin(['18'])]
# dfsd['SensorNums'].value_counts()

In [None]:
# %%time
# # get timeseries for reservoirs

# stationsList = dfr['ID'].tolist()
# sensorNumsList = dfr['SensorNums'].tolist()
# dur_codeList =  dfr['Duration_abb'].tolist()
# plotList = dfr['Plot'].tolist()
# datacollectionList = dfr['Data Collection'].tolist()
# startList = dfr['Start Date'].tolist()
# endList = dfr['End Date'].tolist()

# # Time Series Dataframe
# dfr_ts = pd.DataFrame()

# for i in range(len(stationsList)):
#     stationStr = str(stationsList[i]).strip()
#     sensorNumsStr = str(sensorNumsList[i]).strip()
#     dur_codeStr = str(dur_codeList[i]).strip()
#     plotStr = str(plotList[i]).strip()
#     datacollectionStr = str(datacollectionList[i]).strip()
#     startStr = str(startList[i]).strip()
#     endStr = str(endList[i]).strip()   
#     urlInput = "https://cdec.water.ca.gov/dynamicapp/req/CSVDataServlet?Stations=" + stationStr + "&SensorNums=" + sensorNumsStr + "&dur_code=" + dur_codeStr + "&Start=" + startStr +"&End=" + endStr
#     try:
#         tempdf = pd.read_csv(urlInput).replace(np.nan, "")
#         dfr_ts = pd.concat([dfr_ts, tempdf])
#         dfr_ts['Duration_abbe'] = dur_codeStr
#         dfr_ts['Plot'] = plotStr
#         dfr_ts['Data Collection'] = datacollectionStr
#         dfr_ts['Start Date'] = startStr
#         dfr_ts['End Date'] = endStr
              
#     except:
#         print("...bad response")

# dfr_ts.to_csv('RawInputData/Reservoirs_timeseries.zip', compression=dict(method='zip', archive_name='Reservoirs_timeseries.csv'), index=False)
# print(len(dfr_ts))
# dfr_ts.head()

In [None]:
# %%time
# # get timeseries for streamgages

# dfsg['SensorNums'] = dfsg['SensorNums'].astype(int).astype(str)

# stationsList = dfsg['siteid'].tolist()
# sensorNumsList = dfsg['SensorNums'].tolist()
# dur_codeList =  dfsg['Duration_abb'].tolist()
# plotList = dfsg['Plot'].tolist()
# datacollectionList = dfsg['Data Collection'].tolist()
# startList = dfsg['Start Date'].tolist()
# endList = dfsg['End Date'].tolist()

# # Time Series Dataframe
# dfsg_ts = pd.DataFrame()

# for i in range(len(stationsList)):
#     stationStr = str(stationsList[i]).strip()
#     sensorNumsStr = str(sensorNumsList[i]).strip()
#     dur_codeStr = str(dur_codeList[i]).strip()
#     plotStr = str(plotList[i]).strip()
#     datacollectionStr = str(datacollectionList[i]).strip()
#     startStr = str(startList[i]).strip()
#     endStr = str(endList[i]).strip()   
#     urlInput = "https://cdec.water.ca.gov/dynamicapp/req/CSVDataServlet?Stations=" + stationStr + "&SensorNums=" + sensorNumsStr + "&dur_code=" + dur_codeStr + "&Start=" + startStr +"&End=" + endStr
#     try:
#         tempdf = pd.read_csv(urlInput).replace(np.nan, "")
#         dfsg_ts = pd.concat([dfsg_ts, tempdf])
#         dfsg_ts['Duration_abbe'] = dur_codeStr
#         dfsg_ts['Plot'] = plotStr
#         dfsg_ts['Data Collection'] = datacollectionStr
#         dfsg_ts['Start Date'] = startStr
#         dfsg_ts['End Date'] = endStr
              
#     except:
#         print("...bad response")

# dfsg_ts.to_csv('RawInputData/StreamGages_timeseries.zip', compression=dict(method='zip', archive_name='StreamGages_timeseries.csv'), index=False)
# print(len(dfsg_ts))
# dfsg_ts.head()

In [None]:
# %%time
# # get timeseries for ActiveSnowDepthSesnsors

# dfsd['SensorNums'] = dfsd['SensorNums'].astype(int).astype(str)

# stationsList = dfsd['ID'].tolist()
# sensorNumsList = dfsd['SensorNums'].tolist()
# dur_codeList =  dfsd['Duration_abb'].tolist()
# plotList = dfsd['Plot'].tolist()
# datacollectionList = dfsd['Data Collection'].tolist()
# startList = dfsd['Start Date'].tolist()
# endList = dfsd['End Date'].tolist()

# # Time Series Dataframe
# dfsd_ts = pd.DataFrame()

# for i in range(len(stationsList)):
#     stationStr = str(stationsList[i]).strip()
#     sensorNumsStr = str(sensorNumsList[i]).strip()
#     dur_codeStr = str(dur_codeList[i]).strip()
#     plotStr = str(plotList[i]).strip()
#     datacollectionStr = str(datacollectionList[i]).strip()
#     startStr = str(startList[i]).strip()
#     endStr = str(endList[i]).strip()   
#     urlInput = "https://cdec.water.ca.gov/dynamicapp/req/CSVDataServlet?Stations=" + stationStr + "&SensorNums=" + sensorNumsStr + "&dur_code=" + dur_codeStr + "&Start=" + startStr +"&End=" + endStr
#     try:
#         tempdf = pd.read_csv(urlInput).replace(np.nan, "")
#         dfsd_ts = pd.concat([dfsd_ts, tempdf])
#         dfsd_ts['Duration_abbe'] = dur_codeStr
#         dfsd_ts['Plot'] = plotStr
#         dfsd_ts['Data Collection'] = datacollectionStr
#         dfsd_ts['Start Date'] = startStr
#         dfsd_ts['End Date'] = endStr
              
#     except:
#         print("...bad response")

# dfsd_ts.to_csv('RawInputData/ActiveSnowDepthSesnsors_timeseries.zip', compression=dict(method='zip', archive_name='ActiveSnowDepthSesnsors_timeseries.csv'), index=False)
# print(len(dfsd_ts))
# dfsd_ts.head()

## WaDE Data

In [None]:
# reservoir data
# create output POD dataframe
# Maps the joined reservoir site + timeseries table (dfin1) onto the WaDE
# "in_*" input columns. Columns assigned "" have no source value here.
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin1['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "CAwsss_M1"

# Variable Info
df['in_AggregationIntervalUnitCV'] = dfin1['Duration_abbe']
df['in_AmountUnitCV'] = dfin1['UNITS']
df['in_VariableCV'] = "Water Supply"

# Organization Info
df['in_OrganizationUUID'] = "CAwsss_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = "WaDE Blank" # need this for auto fill below
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = "Surface Water" # need this for auto fill below

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = dfin1['County']
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfin1['Latitude']
df['in_Longitude'] = dfin1['Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = ""
df['in_SiteName'] = dfin1['Station']
df['in_SiteNativeID'] = dfin1['STATION_ID']
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "Reservoir"
df['in_StateCV'] = "CA"
df['in_USGSSiteID'] = ""

# Site VariableAmounts Info
# The non-numeric codes '---', 'BRT' and 'ART' in VALUE are mapped to 0.
# Anything else that cannot be parsed (e.g. a blank cell) becomes NaN rather
# than aborting the cell with a ValueError, as a plain astype(float) would.
df['in_Amount'] = pd.to_numeric(
    dfin1['VALUE'].replace(['---', 'BRT', 'ART'], 0), errors='coerce'
).astype(float)
df['in_AllocationCropDutyAmount'] = ""
df['in_AssociatedNativeAllocationIDs'] = ""
df['in_BeneficialUseCategory'] = dfin1['SENSOR_TYPE']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_Geometry'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerGeneratedGWh'] = ""
df['in_PowerType'] = ""
df['in_PrimaryUseCategory'] = dfin1['SENSOR_TYPE']
# NOTE(review): receives the full 'OBS DATE' value, not a year -- confirm it is
# reduced to a year later in the notebook.
df['in_ReportYearCV'] = dfin1['OBS DATE']
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = dfin1['OBS DATE']
df['in_TimeframeStart'] = dfin1['OBS DATE']

# drop_duplicates() already returns a new frame, so no explicit copy is needed.
outdf1 = df.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

In [None]:
# stream gage data
# create output POD dataframe
# Maps the joined stream gage site + timeseries table (dfin2) onto the WaDE
# "in_*" input columns. Columns assigned "" have no source value here.
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin2['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "CAwsss_M1"

# Variable Info
df['in_AggregationIntervalUnitCV'] = dfin2['Duration_abbe']
df['in_AmountUnitCV'] = dfin2['UNITS']
df['in_VariableCV'] = "Water Supply"

# Organization Info
df['in_OrganizationUUID'] = "CAwsss_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = "WaDE Blank" # need this for auto fill below
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = "Surface Water" # need this for auto fill below

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = dfin2['huc12']
df['in_HUC8'] = dfin2['huc8']
df['in_Latitude'] = dfin2['wade_Latit']
df['in_Longitude'] = dfin2['wade_Longi']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = ""
df['in_SiteName'] = dfin2['sitename']
df['in_SiteNativeID'] = dfin2['siteid']
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "Stream Gage"
df['in_StateCV'] = "CA"
df['in_USGSSiteID'] = ""

# Site VariableAmounts Info
# The non-numeric codes '---', 'BRT' and 'ART' in VALUE are mapped to 0.
# Anything else that cannot be parsed (e.g. a blank cell) becomes NaN rather
# than aborting the cell with a ValueError, as a plain astype(float) would.
df['in_Amount'] = pd.to_numeric(
    dfin2['VALUE'].replace(['---', 'BRT', 'ART'], 0), errors='coerce'
).astype(float)
df['in_AllocationCropDutyAmount'] = ""
df['in_AssociatedNativeAllocationIDs'] = ""
df['in_BeneficialUseCategory'] = dfin2['SENSOR_TYPE']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_Geometry'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerGeneratedGWh'] = ""
df['in_PowerType'] = ""
df['in_PrimaryUseCategory'] = dfin2['SENSOR_TYPE']
# NOTE(review): receives the full 'OBS DATE' value, not a year -- confirm it is
# reduced to a year later in the notebook.
df['in_ReportYearCV'] = dfin2['OBS DATE']
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = dfin2['OBS DATE']
df['in_TimeframeStart'] = dfin2['OBS DATE']

# drop_duplicates() already returns a new frame, so no explicit copy is needed.
outdf2 = df.drop_duplicates().reset_index(drop=True)
print(len(outdf2))
outdf2.head()

In [None]:
# Snow depth data
# create output POD dataframe
# Maps the joined snow depth site + timeseries table (dfin3) onto the WaDE
# "in_*" input columns. Columns assigned "" have no source value here.
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin3['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "CAwsss_M1"

# Variable Info
df['in_AggregationIntervalUnitCV'] = dfin3['Duration_abbe']
df['in_AmountUnitCV'] = dfin3['UNITS']
df['in_VariableCV'] = "Water Supply"

# Organization Info
df['in_OrganizationUUID'] = "CAwsss_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = "WaDE Blank" # need this for auto fill below
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = "Surface Water" # need this for auto fill below

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfin3['LATITUDE']
df['in_Longitude'] = dfin3['LONGITUDE']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = ""
df['in_SiteName'] = dfin3['STATION']
df['in_SiteNativeID'] = dfin3['STATION_ID']
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "Snow Depth"
df['in_StateCV'] = "CA"
df['in_USGSSiteID'] = ""

# Site VariableAmounts Info
# The non-numeric codes '---', 'BRT' and 'ART' in VALUE are mapped to 0.
# Anything else that cannot be parsed (e.g. a blank cell) becomes NaN rather
# than aborting the cell with a ValueError, as a plain astype(float) would.
df['in_Amount'] = pd.to_numeric(
    dfin3['VALUE'].replace(['---', 'BRT', 'ART'], 0), errors='coerce'
).astype(float)
df['in_AllocationCropDutyAmount'] = ""
df['in_AssociatedNativeAllocationIDs'] = ""
df['in_BeneficialUseCategory'] = dfin3['SENSOR_TYPE']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_Geometry'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerGeneratedGWh'] = ""
df['in_PowerType'] = ""
df['in_PrimaryUseCategory'] = dfin3['SENSOR_TYPE']
# NOTE(review): receives the full 'OBS DATE' value, not a year -- confirm it is
# reduced to a year later in the notebook.
df['in_ReportYearCV'] = dfin3['OBS DATE']
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = dfin3['OBS DATE']
df['in_TimeframeStart'] = dfin3['OBS DATE']

# drop_duplicates() already returns a new frame, so no explicit copy is needed.
outdf3 = df.drop_duplicates().reset_index(drop=True)
print(len(outdf3))
outdf3.head()

In [None]:
# Concatenate dataframes
# Stack the reservoir, stream gage and snow depth tables, drop exact duplicate
# rows, renumber the index and blank out any NaN left by the concatenation.
frames = [outdf1, outdf2, outdf3]
outdf = (
    pd.concat(frames)
    .drop_duplicates()
    .reset_index(drop=True)
    .replace(np.nan, "")
)
print(outdf.shape[0])

## Clean Data / data types

In [None]:
# Fix AggregationIntervalUnit input

# Lookup from the single-letter duration code to the interval label.
intervalDict = {
    "d": "Daily",
    "m": "Monthly",
}


def FixAggregationIntervalUnit(val):
    """Translate a duration code into its aggregation-interval label.

    The value is converted with ``str()`` and stripped of surrounding
    whitespace before the (case-sensitive) lookup in ``intervalDict``.

    Returns the mapped label, or the literal string "NULL" when the code is
    not in ``intervalDict``.
    """
    # dict.get replaces the former bare ``except:``, which would also have
    # hidden unrelated errors (and KeyboardInterrupt).
    return intervalDict.get(str(val).strip(), "NULL")

# Element-wise map over the one column: avoids materialising a Series per row
# (DataFrame.apply with axis=1), which is slow on timeseries-sized tables and
# does not return a Series for an empty frame.
outdf['in_AggregationIntervalUnitCV'] = outdf['in_AggregationIntervalUnitCV'].map(FixAggregationIntervalUnit)
outdf['in_AggregationIntervalUnitCV'].unique()

In [None]:
# Fix PrimaryUseCategoryCV input
# This dictionary is an estimated translation of the metadata from CA's systems

# Sensor short code -> descriptive label.
# NOTE: "DIVERSN" used to be listed twice ("Diversion", then "Diversion Flow").
# In a dict literal the last duplicate wins, so only the effective entry is kept.
PrimaryUseCategoryCVDict = {
    "% UP 01": "Percentage of Upper 01 (possibly related to a specific upper layer or zone)",
    "% UP 02": "Percentage of Upper 02",
    "% UP 03": "Percentage of Upper 03",
    "% UP 04": "Percentage of Upper 04",
    "% UP 05": "Percentage of Upper 05",
    "% UP 06": "Percentage of Upper 06",
    "% UP 07": "Percentage of Upper 07",
    "% UP 08": "Percentage of Upper 08",
    "% UP 09": "Percentage of Upper 09",
    "% UP 10": "Percentage of Upper 10",
    "%Q": "Percentage of Discharge Flow",
    "10day%Q": "10-day Percentage of Flow Discharge",
    "10dayQ": "10-day Flow Discharge",
    "1day%Q": "1-day Percentage of Flow Discharge",
    "1DAYQ": "1-day Flow Discharge",
    "ABV TOC": "Above Top of Conservation",
    "AJ 10%": "Adjusted 10% Exceedance",
    "AJ 50%": "Adjusted 50% Exceedance",
    "AJ 90%": "Adjusted 90% Exceedance",
    "AUXFLOW": "Auxiliary Flow",
    "AVG INF": "Average Inflow",
    "BAR PRE": "Barometric Pressure",
    "BAT VOL": "Battery Voltage",
    "BAT VOLA": "Battery Voltage A",
    "CHLORPH": "Chlorophyll",
    "CONTROL": "Control Flow or Parameter",
    "D ORGCO": "Dissolved Organic Carbon",
    "D ORGCZ": "Dissolved Organic Carbon Zone",
    "DC PUMP": "Direct Current Pump",
    "DEW PT": "Dew Point",
    "DIS OXY": "Dissolved Oxygen",
    "DIS PWR": "Discharge Power",
    "Diss Br": "Dissolved Bromide",
    "Diss Cl": "Dissolved Chloride",
    "Diss F": "Dissolved Fluoride",
    "DissNO3": "Dissolved Nitrate",
    "DissPO4": "Dissolved Phosphate",
    "DissSO4": "Dissolved Sulfate",
    "DIVERSN": "Diversion Flow",
    "DO MAX": "Maximum Dissolved Oxygen",
    "DO MDN": "Median Dissolved Oxygen",
    "DO MIN": "Minimum Dissolved Oxygen",
    "E T": "Evapotranspiration",
    "EC MAX": "Maximum Electrical Conductivity",
    "EC MDN": "Median Electrical Conductivity",
    "EC MIN": "Minimum Electrical Conductivity",
    "EL CND": "Electrical Conductivity (Generic)",
    "EL COND": "Electrical Conductivity",
    "EL CONDB": "Electrical Conductivity (Backup)",
    "EVAP": "Evaporation",
    "EVP PAN": "Evaporation Pan",
    "FDOM": "Fluorescent Dissolved Organic Matter",
    "FGAMRVL": "Flow Gauge Manual River Level",
    "FLOW": "Flow Rate",
    "FLOW.XX": "Flow Rate (specific sensor or parameter)",
    "FNF ACC": "Forecast Natural Flow Accumulation",
    "FNF": "Forecasted Natural Flow",
    "FOUTFLW": "Forecast Outflow",
    "FTEMPVL": "Forecast Temperature Value",
    "FTOCSTO": "Forecast to Storage",
    "HEAD HT": "Head Height",
    "INFLOW": "Inflow",
    "IRR&CNS": "Irrigation and Conservation",
    "LK EVAP": "Lake Evaporation",
    "M FLOW": "Mean Flow",
    "MON FLO": "Monthly Flow",
    "MON FNF": "Monthly Forecasted Natural Flow",
    "NSLR AV": "Net Solar Radiation Average",
    "NSLR MN": "Net Solar Radiation Minimum",
    "NSLR MX": "Net Solar Radiation Maximum",
    "OUTFLOW": "Outflow",
    "OUTFLWV": "Outflow Volume",
    "PEAK WD": "Peak Wind Direction",
    "PEAK WS": "Peak Wind Speed",
    "PH VAL": "pH Value",
    "PPT INC": "Precipitation Increment",
    "PPTINC4": "Precipitation Increment (4-hour)",
    "RAIN": "Rainfall",
    "RAINTIP": "Rain Tip Gauge",
    "REL HUM": "Relative Humidity",
    "REL SCH": "Release Schedule",
    "RES CHG": "Reservoir Change",
    "RES ELE": "Reservoir Elevation",
    "RGAMRVL": "River Gauge Manual River Level",
    "RIV REL": "River Release",
    "RIV STG": "River Stage",
    "RIVST29": "River Stage at Station 29",
    "RIVST88": "River Stage at Station 88",
    "RIVSTGA": "River Stage Gauge",
    "RTEMPVL": "Real-Time Temperature Value",
    "SLRR AV": "Solar Radiation Average",
    "SLRR IN": "Solar Radiation Incoming",
    "SLRR MN": "Solar Radiation Minimum",
    "SLRR MX": "Solar Radiation Maximum",
    "SLRRREF": "Solar Radiation Reference",
    "SNO ADJ": "Snow Adjustment",
    "SNOW DP": "Snow Depth",
    "SNOW WC": "Snow Water Content",
    "SOIL TP": "Soil Temperature",
    "SOILMD1": "Soil Moisture Depth 1",
    "SOILMD2": "Soil Moisture Depth 2",
    "SOILMD3": "Soil Moisture Depth 3",
    "SOILTD1": "Soil Temperature Depth 1",
    "SOILTD2": "Soil Temperature Depth 2",
    "SOILTD3": "Soil Temperature Depth 3",
    "SOLAR R": "Solar Radiation",
    "SPILL": "Spill Rate",
    "STAGE F": "Stage Flow",
    "STORAGE": "Storage Volume",
    "T ORG C": "Total Organic Carbon",
    "T ORGCZ": "Total Organic Carbon Zone",
    "TEMP AV": "Average Temperature",
    "TEMP MN": "Minimum Temperature",
    "TEMP MX": "Maximum Temperature",
    "TEMP W": "Water Temperature",
    "TEMP": "Air Temperature",
    "TEMPIDX": "Temperature Index",
    "TEMPW C": "Water Temperature in Celsius",
    "TMPW MAX": "Maximum Water Temperature",
    "TMPW MDN": "Median Water Temperature",
    "TMPW MIN": "Minimum Water Temperature",
    "TOC STO": "Top of Conservation Storage",
    "TURB W": "Turbidity in Water",
    "TURB WF": "Turbidity Flow Rate",
    "TURBVAR": "Turbidity Variance",
    "VLOCITY": "Velocity (Flow Speed)",
    "WIND DR": "Wind Direction",
    "WIND SP": "Wind Speed",
    "WINDLEN": "Wind Length",
    "WY 10%": "Water Year 10% Exceedance",
    "WY 50%": "Water Year 50% Exceedance",
    "WY 90%": "Water Year 90% Exceedance",
}


def FixPrimaryUseCategoryCVFunc(val):
    """Expand a sensor short code into its descriptive label.

    The value is converted with ``str()`` and stripped of surrounding
    whitespace, then looked up (case-sensitively) in
    ``PrimaryUseCategoryCVDict``.

    Returns:
        The mapped label when the code is known, the literal string "NULL"
        when the stripped value is empty, otherwise the stripped value itself.
    """
    val = str(val).strip()
    if val == "":
        return "NULL"
    # dict.get replaces the former bare ``except:`` around the lookup;
    # unknown codes pass through unchanged.
    return PrimaryUseCategoryCVDict.get(val, val)


# Element-wise map over each column: avoids materialising a Series per row
# (DataFrame.apply with axis=1), which is slow on timeseries-sized tables and
# does not return a Series for an empty frame.
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(FixPrimaryUseCategoryCVFunc)
outdf['in_PrimaryUseCategory'] = outdf['in_PrimaryUseCategory'].map(FixPrimaryUseCategoryCVFunc)
outdf['in_PrimaryUseCategory'].value_counts()

In [None]:
# Clean name entries of special characters
def removeSpecialCharsFunc(Val):
    """Return *Val* as a cleaned, title-cased name string.

    Steps, in order: convert to ``str``; delete every ``$ @ & . ; / ) ( -``
    character; title-case; replace each double space with a single space
    (one pass); strip surrounding whitespace; drop trailing commas.
    """
    Val = str(Val)
    # Raw string: the former plain literal contained "\)", an invalid string
    # escape that newer Python versions warn about. The pattern is unchanged.
    Val = re.sub(r"[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip().rstrip(',')
    return Val

In [None]:
# Element-wise map over the one column instead of a row-wise
# DataFrame.apply(axis=1): same values, far less per-row overhead.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(removeSpecialCharsFunc)
outdf['in_WaterSourceName'].unique()

In [None]:
# Element-wise map over the one column instead of a row-wise
# DataFrame.apply(axis=1): same values, far less per-row overhead.
outdf['in_SiteName'] = outdf['in_SiteName'].map(removeSpecialCharsFunc)
outdf['in_SiteName'].unique()

In [None]:
# Element-wise map over the one column instead of a row-wise
# DataFrame.apply(axis=1): same values, far less per-row overhead.
outdf['in_County'] = outdf['in_County'].map(removeSpecialCharsFunc)
outdf['in_County'].unique()

In [None]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return *val* as a stripped string, with missing values mapped to "".

    Missing scalars (None, NaN, NaT, pd.NA), blank or whitespace-only strings
    and the literal text "nan" all yield "". Any other value is returned as
    ``str(val).strip()``.
    """
    # Test for missing values BEFORE str(): str(None) is "None" and
    # str(pd.NaT) is "NaT", so a null check made afterwards can never succeed.
    # is_scalar guards pd.isnull, which returns an array for list-like input.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return ""
    val = str(val).strip()
    if val == "" or val == "nan":
        return ""
    return val

In [None]:
# Normalise missing water source names to "" (element-wise; also safe on an empty frame)
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(ensureEmptyString)
outdf['in_WaterSourceName'].unique()

In [None]:
# Normalise missing water source types to "" (element-wise; also safe on an empty frame)
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].map(ensureEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# Normalise missing site types to "" (element-wise; also safe on an empty frame)
outdf['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].map(ensureEmptyString)
outdf['in_SiteTypeCV'].unique()

In [None]:
# Normalise missing site names to "" (element-wise; also safe on an empty frame)
outdf['in_SiteName'] = outdf['in_SiteName'].map(ensureEmptyString)
outdf['in_SiteName'].unique()

In [None]:
# Normalise missing county names to "" (element-wise; also safe on an empty frame)
outdf['in_County'] = outdf['in_County'].map(ensureEmptyString)
outdf['in_County'].unique()

In [None]:
# Normalise missing beneficial-use entries to "" (element-wise; also safe on an empty frame)
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(ensureEmptyString)
# Distinct, sorted values found across all comma-separated entries of the column
uniqueList = sorted({i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')})
uniqueList

In [None]:
# Ensure Latitude entry is numeric, replace '0' values for removal
outdf['in_Latitude'] = (
    pd.to_numeric(outdf['in_Latitude'], errors='coerce')  # non-numeric entries become NaN
    .replace(0, "")  # zero values are blanked
    .fillna("")  # NaN -> blank string
)
outdf['in_Latitude'].unique()

In [None]:
# Ensure Longitude entry is numeric, replace '0' values for removal
outdf['in_Longitude'] = (
    pd.to_numeric(outdf['in_Longitude'], errors='coerce')  # non-numeric entries become NaN
    .replace(0, "")  # zero values are blanked
    .fillna("")  # NaN -> blank string
)
outdf['in_Longitude'].unique()

In [None]:
# Ensure Amount entry is either numeric (2 decimal places) or blank, no 0 entries
outdf['in_Amount'] = (
    pd.to_numeric(outdf['in_Amount'], errors='coerce')  # non-numeric entries become NaN
    .round(2)
    .replace(0, "")  # zero amounts are blanked
    .fillna("")  # NaN -> blank string
)
outdf['in_Amount'].unique()

In [None]:
# PopulationServed: whole numbers; non-numeric, missing and 0 entries all end up as blank strings.
# NOTE(review): this cell was previously described as keeping 0 entries (no blank strings),
# but the chain blanks them - confirm which behaviour is intended.
outdf['in_PopulationServed'] = (
    pd.to_numeric(outdf['in_PopulationServed'], errors='coerce')  # non-numeric entries become NaN
    .round()
    .fillna(0)  # NaN must be filled before the integer cast
    .astype(int)
    .replace(0, "")
)
outdf['in_PopulationServed'].unique()

In [None]:
# Convert TimeframeEnd to a timezone-naive date (parsed as UTC, time of day dropped).
# Unparseable entries stay NaT: filling them with "" would drop the datetime
# dtype whenever a NaT is present, and the .dt accessor below would then fail.
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors='coerce')
outdf['in_TimeframeEnd'] = outdf['in_TimeframeEnd'].dt.tz_localize(None).dt.normalize()
outdf['in_TimeframeEnd'].unique()

In [None]:
# Convert TimeframeStart to a timezone-naive date (parsed as UTC, time of day dropped).
# Unparseable entries stay NaT: filling them with "" would drop the datetime
# dtype whenever a NaT is present, and the .dt accessor below would then fail.
outdf['in_TimeframeStart'] = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors='coerce')
outdf['in_TimeframeStart'] = outdf['in_TimeframeStart'].dt.tz_localize(None).dt.normalize()
outdf['in_TimeframeStart'].unique()

In [None]:
# Extract the (UTC) year; entries that cannot be parsed as a date become 0.
# The year is read straight from the parsed datetimes: filling NaT with ""
# first would drop the datetime dtype and make the .dt accessor fail.
outdf['in_ReportYearCV'] = pd.to_datetime(outdf['in_ReportYearCV'], utc=True, errors='coerce').dt.year
outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].fillna(0).astype(int)
outdf['in_ReportYearCV'].unique()

In [None]:
# # Assign Primary Use Category

# import sys
# sys.path.append("C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory")
# import AssignPrimaryUseCategoryFile # Use Custom import file

# outdf['in_PrimaryUseCategory'] = outdf.apply(lambda row: AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory(row['in_BeneficialUseCategory']), axis=1)
# outdf['in_PrimaryUseCategory'].unique()

In [None]:
# change in_TimeframeEnd to be months end if a monthly value
# NOTE(review): `import datetime` rebinds the name `datetime` (imported as the
# class at the top of the notebook) to the module; nothing in this cell uses it.
import datetime
import calendar

def MonthsEndFunc(date, timestep):
    """Move ``date`` to the last day of its month when ``timestep`` is "Monthly".

    Args:
        date: a date-like value exposing ``year``, ``month`` and ``replace``.
        timestep: aggregation interval label; compared after str() and strip().

    Returns:
        ``date`` moved to its month's last day for "Monthly" timesteps,
        otherwise ``date`` unchanged. Missing dates (None / NaT) are returned
        as-is.
    """
    # NaT has no usable year/month, so calendar.monthrange would raise on it.
    if pd.isnull(date):
        return date
    if str(timestep).strip() != "Monthly":
        return date
    last_day_of_month = calendar.monthrange(date.year, date.month)[1]
    return date.replace(day=last_day_of_month)

# Per record: push TimeframeEnd to the month's end when the aggregation interval is monthly
outdf['in_TimeframeEnd'] = outdf.apply(
    lambda rec: MonthsEndFunc(rec['in_TimeframeEnd'], rec['in_AggregationIntervalUnitCV']),
    axis=1,
)
outdf['in_TimeframeEnd'].unique()

In [None]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    """Join the four inputs into one underscore-separated label.

    Every input is converted to str and stripped; the third one (inPU) is
    additionally title-cased. Order in the result: inV, inAIU, inPU, inWST.
    """
    parts = (
        str(inV).strip(),
        str(inAIU).strip(),
        str(inPU).strip().title(),
        str(inWST).strip(),
    )
    return "_".join(parts)

# Build the VariableSpecificCV label for every record from its four component columns
outdf['in_VariableSpecificCV'] = outdf.apply(
    lambda rec: createVariableSpecificCV(
        rec['in_VariableCV'],
        rec['in_AggregationIntervalUnitCV'],
        rec['in_PrimaryUseCategory'],
        rec['in_WaterSourceTypeCV'],
    ),
    axis=1,
)
outdf['in_VariableSpecificCV'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Helper used while building the temp in_WaterSourceNativeID dataframe of unique water sources.
def assignIdValueFunc(colRowValue):
    """Return the identifier text "wadeId" followed by str(colRowValue)."""
    return f"wadeId{colRowValue!s}"

# One row per distinct (stripped) water source name / type combination
dfTempID = pd.DataFrame({
    sourceCol: outdf[sourceCol].astype(str).str.strip()
    for sourceCol in ('in_WaterSourceName', 'in_WaterSourceTypeCV')
}).drop_duplicates()

# Running count 1..n over those rows, turned into "wadeId<n>" identifiers
dfTempCount = pd.DataFrame({"Count": range(1, len(dfTempID.index) + 1)}, index=dfTempID.index)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda rec: assignIdValueFunc(rec['Count']), axis=1)

# Lookup key = name immediately followed by type.
# NOTE(review): the parts are joined without a separator, so two different
# name/type pairs could in principle produce the same key - confirm acceptable.
dfTempID['linkKey'] = (
    dfTempID['in_WaterSourceName'].astype(str)
    + dfTempID['in_WaterSourceTypeCV'].astype(str)
)
IdDict = dict(zip(dfTempID['linkKey'].astype(str), dfTempID['in_WaterSourceNativeID'].values))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    """Return the existing ID, or look one up in IdDict when it is blank.

    checkVal is converted to str and stripped; a non-blank result is returned
    as-is. Otherwise the stripped string forms of valA and valB are
    concatenated and used as the IdDict key.
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    return IdDict[str(valA).strip() + str(valB).strip()]

# Keep any existing water source native ID; fill blank ones from the name/type lookup
outdf['in_WaterSourceNativeID'] = outdf.apply(
    lambda rec: retrieveIdValueFunc(
        rec['in_WaterSourceNativeID'],
        rec['in_WaterSourceName'],
        rec['in_WaterSourceTypeCV'],
    ),
    axis=1,
)
outdf['in_WaterSourceNativeID'].unique()

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Helper used while building the temp in_SiteNativeID dataframe of unique sites.
def assignIdValueFunc(colRowValue):
    """Return the identifier text "wadeId" followed by str(colRowValue)."""
    return f"wadeId{colRowValue!s}"

# One row per distinct (stripped) latitude / longitude / site name / site type combination
dfTempID = pd.DataFrame({
    siteCol: outdf[siteCol].astype(str).str.strip()
    for siteCol in ('in_Latitude', 'in_Longitude', 'in_SiteName', 'in_SiteTypeCV')
}).drop_duplicates()

# Running count 1..n over those rows, turned into "wadeId<n>" identifiers
dfTempCount = pd.DataFrame({"Count": range(1, len(dfTempID.index) + 1)}, index=dfTempID.index)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda rec: assignIdValueFunc(rec['Count']), axis=1)

# Lookup key = latitude, longitude, site name and site type back to back.
# NOTE(review): the parts are joined without a separator, so two different
# combinations could in principle produce the same key - confirm acceptable.
dfTempID['linkKey'] = (
    dfTempID['in_Latitude'].astype(str)
    + dfTempID['in_Longitude'].astype(str)
    + dfTempID['in_SiteName'].astype(str)
    + dfTempID['in_SiteTypeCV'].astype(str)
)
IdDict = dict(zip(dfTempID['linkKey'].astype(str), dfTempID['in_SiteNativeID'].values))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    """Return the existing ID, or look one up in IdDict when it is blank.

    checkVal is converted to str and stripped; a non-blank result is returned
    as-is. Otherwise the stripped string forms of valA, valB, valC and valD
    are concatenated (in that order) and used as the IdDict key.
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    linkKeyVal = "".join(str(part).strip() for part in (valA, valB, valC, valD))
    return IdDict[linkKeyVal]

# Keep any existing site native ID; fill blank ones from the lat/lon/name/type lookup
outdf['in_SiteNativeID'] = outdf.apply(
    lambda rec: retrieveIdValueFunc(
        rec['in_SiteNativeID'],
        rec['in_Latitude'],
        rec['in_Longitude'],
        rec['in_SiteName'],
        rec['in_SiteTypeCV'],
    ),
    axis=1,
)
outdf['in_SiteNativeID'].unique()

## Export Outputs

In [None]:
# Print a summary of the output dataframe (columns, non-null counts, dtypes) before exporting
outdf.info()

In [None]:
# Display the output dataframe for a final visual check
outdf

In [None]:
# Export the output dataframe as a zipped CSV (Pwsss_caMain.csv inside Pwsss_caMain.zip)
outdf.to_csv(
    'RawInputData/Pwsss_caMain.zip',
    compression={'method': 'zip', 'archive_name': 'Pwsss_caMain.csv'},
    index=False,
)