# Preprocessing Water Supply Site Time Series data for WaDE

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse
from bs4 import BeautifulSoup # text parser

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# Working Directory
workingDir = "G:/Shared drives/WaDE Data/WaDE Data Folder/California/WaterSupply_SiteSpecific"  # change here
os.chdir(workingDir)

## Input Files
- site & timeseries info for reservoirs
- site & timeseries info for streamgages
- site & timeseries info for active snow depth sensors

In [3]:
# Input File: Reservoirs
fileInput = "RawInputData/Reservoirs.zip"
dfr = pd.read_csv(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfr:
    dfr['WaDEUUID'] = "in1" + dfr.index.astype(str)
    dfr.to_csv('RawInputData/Reservoirs.zip', compression=dict(method='zip', archive_name='Reservoirs.csv'), index=False)

dfr.rename(columns=lambda x: x.rstrip(), inplace=True)
print(len(dfr))
dfr.head(1)

180


Unnamed: 0,Station,ID,Elev,Latitude,Longitude,County,Operating Agency,WaDEUUID
0,LAKE JENNINGS,JNN,707,32.854,-116.892,SAN DIEGO,None Specified,in10


In [4]:
# Input File: StreamGages shp file
fileInput = "RawInputData/shapefiles/StreamGages.zip"
dfsg = gpd.read_file(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfsg:
    dfsg['WaDEUUID'] = "in2" + dfsg.index.astype(str)
    dfsg.to_csv('RawInputData/StreamGages.zip', compression=dict(method='zip', archive_name='StreamGages.csv'), index=False)

print(len(dfsg))
dfsg.head(1)

2597


Unnamed: 0,siteid,sitename,gage_statu,operator,datasource,sitestatus,stage_yn,stage_por,stage_stat,stage_real,flow_yn,flow_por,flow_statu,flow_realt,watqual_yn,watqual_po,watqual_st,watqual_re,temp_yn,temp_por,temp_statu,temp_realt,strmorder,ucdstrmcla,streamtype,totdasqkm,totdasqmi,weblink,gnisid_med,rchcd_medr,comid_medr,wtrshdnm_h,huc8,wtrshdnm_1,huc10,wtrshdnm_2,huc12,gagegap_st,reactivate,gage_histo,addflow_2s,addflow_2w,addtelemet,addtemp_2f,infrastruc,waterbody,tier,primary_be,sb19_actio,cnrfc,reference_,refpotenti,ecosysmgmt,wtrsupply,wtrquality,pubsafety,wade_Latit,wade_Longi,geometry,WaDEUUID
0,ACZ,ALHAMBRA CREEK AT D STREET,Active-Limited Use,OTHER,CDEC,Active,Y,1454,Active,Y,N,0,,N,N,0,,N,N,0,,N,2,Rain and seasonal groundwater (RGW),Stream/River - Intermittent,42.8499,16.54443,http://cdec.water.ca.gov/cgi-progs/staMeta?sta...,,18050001006347,948050078,Suisun Bay,18050001,Mount Diablo Creek-Frontal Suisun Bay Estuaries,1805000103,Arroyo del Hambre-Frontal Suisun Bay Estuaries,180500010303,AWG,N,0,Y,N,N,Y,,,3,ecosystem,Upgrade,,,,B,,,,38.00331,-122.12981,POINT Z (-122.12981 38.00331 0.00000),in20


In [5]:
# we only want to work with stream gages whose sites are considered "Acitve"

dfsg = dfsg[dfsg['sitestatus'] == 'Active'].reset_index(drop=True)
print(len(dfsg))

1089


In [6]:
# StreamGage input data DOES contain some duplicate reservoirs, but does not cleary indicate which are reservoirs and which are stream gages
# if site already in resevoir site data, remove from stream gages

dfr_idList = dfr['ID'].tolist()
dfr_idList = list(set(dfr_idList))

dfsg = dfsg[~dfsg['siteid'].isin(dfr_idList)].reset_index(drop=True)
print(len(dfsg))
dfsg.head(1)

1070


Unnamed: 0,siteid,sitename,gage_statu,operator,datasource,sitestatus,stage_yn,stage_por,stage_stat,stage_real,flow_yn,flow_por,flow_statu,flow_realt,watqual_yn,watqual_po,watqual_st,watqual_re,temp_yn,temp_por,temp_statu,temp_realt,strmorder,ucdstrmcla,streamtype,totdasqkm,totdasqmi,weblink,gnisid_med,rchcd_medr,comid_medr,wtrshdnm_h,huc8,wtrshdnm_1,huc10,wtrshdnm_2,huc12,gagegap_st,reactivate,gage_histo,addflow_2s,addflow_2w,addtelemet,addtemp_2f,infrastruc,waterbody,tier,primary_be,sb19_actio,cnrfc,reference_,refpotenti,ecosysmgmt,wtrsupply,wtrquality,pubsafety,wade_Latit,wade_Longi,geometry,WaDEUUID
0,ACZ,ALHAMBRA CREEK AT D STREET,Active-Limited Use,OTHER,CDEC,Active,Y,1454,Active,Y,N,0,,N,N,0,,N,N,0,,N,2,Rain and seasonal groundwater (RGW),Stream/River - Intermittent,42.8499,16.54443,http://cdec.water.ca.gov/cgi-progs/staMeta?sta...,,18050001006347,948050078,Suisun Bay,18050001,Mount Diablo Creek-Frontal Suisun Bay Estuaries,1805000103,Arroyo del Hambre-Frontal Suisun Bay Estuaries,180500010303,AWG,N,0,Y,N,N,Y,,,3,ecosystem,Upgrade,,,,B,,,,38.00331,-122.12981,POINT Z (-122.12981 38.00331 0.00000),in20


In [7]:
# Input File: Active Snow Depth Sensors
fileInput = "RawInputData/ActiveSnowDepthSesnsors.zip"
dfsd = pd.read_csv(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfsd:
    dfsd['WaDEUUID'] = "in3" + dfsd.index.astype(str)
    dfsd.to_csv('RawInputData/ActiveSnowDepthSesnsors.zip', compression=dict(method='zip', archive_name='ActiveSnowDepthSesnsors.csv'), index=False)

dfsd.rename(columns=lambda x: x.rstrip(), inplace=True)
print(len(dfsd))
dfsd.head(1)

165


Unnamed: 0,ID,STATION,ELEV_FEET,LATITUDE,LONGITUDE,OPERATOR AGENCY,WaDEUUID
0,ADM,ADIN MOUNTAIN,6200,41.237,-120.792,Natural Resources Conservation Service,in30


In [8]:
# Input File: Reservoirs_timeseries

fileInput = "RawInputData/Reservoirs_timeseries.zip"
dfr_ts = pd.read_csv(fileInput).replace(np.nan, "")
print(len(dfr_ts))
dfr_ts.head(1)

2657875


Unnamed: 0,STATION_ID,DURATION,SENSOR_NUMBER,SENSOR_TYPE,DATE TIME,OBS DATE,VALUE,DATA_FLAG,UNITS,Duration_abbe,Plot,Data Collection,Start Date,End Date
0,JNN,D,15,STORAGE,20211001 0000,20211001 0000,---,,AF,m,(STORAGE),MANUAL ENTRY,10/01/1968,01/01/2025


In [9]:
# Input File: StreamGages_timeseries

fileInput = "RawInputData/StreamGages_timeseries.zip"
dfsg_ts = pd.read_csv(fileInput).replace(np.nan, "")
print(len(dfsg_ts))
dfsg_ts.head(1)

297634


Unnamed: 0,STATION_ID,DURATION,SENSOR_NUMBER,SENSOR_TYPE,DATE TIME,OBS DATE,VALUE,DATA_FLAG,UNITS,Duration_abbe,Plot,Data Collection,Start Date,End Date
0,GLC,D,41,M FLOW,20080701 0000,20080701 0000,93,,CFS,d,(M FLOW),COMPUTED,07/21/1999,01/01/2025


In [10]:
# Input File: ActiveSnowDepthSesnsors_timeseries

fileInput = "RawInputData/ActiveSnowDepthSesnsors_timeseries.zip"
dfsd_ts = pd.read_csv(fileInput).replace(np.nan, "")
print(len(dfsd_ts))
dfsd_ts.head(1)

713359


Unnamed: 0,STATION_ID,DURATION,SENSOR_NUMBER,SENSOR_TYPE,DATE TIME,OBS DATE,VALUE,DATA_FLAG,UNITS,Duration_abbe,Plot,Data Collection,Start Date,End Date
0,BCB,D,18,SNOW DP,20041001 0000,20041001 0600,0,,INCHES,m,(SNOW DP),MANUAL ENTRY,04/01/1972,01/01/2025


In [11]:
# inner-join reservoir data to timeseries

dfin1 = pd.merge(dfr, dfr_ts, left_on='ID', right_on='STATION_ID', how='inner').reset_index(drop=True)
print(len(dfin1))
dfin1.head(1)

2657875


Unnamed: 0,Station,ID,Elev,Latitude,Longitude,County,Operating Agency,WaDEUUID,STATION_ID,DURATION,SENSOR_NUMBER,SENSOR_TYPE,DATE TIME,OBS DATE,VALUE,DATA_FLAG,UNITS,Duration_abbe,Plot,Data Collection,Start Date,End Date
0,LAKE JENNINGS,JNN,707,32.854,-116.892,SAN DIEGO,None Specified,in10,JNN,D,15,STORAGE,20211001 0000,20211001 0000,---,,AF,m,(STORAGE),MANUAL ENTRY,10/01/1968,01/01/2025


In [12]:
# inner-join streamgage data to timeseries

dfin2 = pd.merge(dfsg, dfsg_ts, left_on='siteid', right_on='STATION_ID', how='inner').reset_index(drop=True)
print(len(dfin2))
dfin2.head(1)

297634


Unnamed: 0,siteid,sitename,gage_statu,operator,datasource,sitestatus,stage_yn,stage_por,stage_stat,stage_real,flow_yn,flow_por,flow_statu,flow_realt,watqual_yn,watqual_po,watqual_st,watqual_re,temp_yn,temp_por,temp_statu,temp_realt,strmorder,ucdstrmcla,streamtype,totdasqkm,totdasqmi,weblink,gnisid_med,rchcd_medr,comid_medr,wtrshdnm_h,huc8,wtrshdnm_1,huc10,wtrshdnm_2,huc12,gagegap_st,reactivate,gage_histo,addflow_2s,addflow_2w,addtelemet,addtemp_2f,infrastruc,waterbody,tier,primary_be,sb19_actio,cnrfc,reference_,refpotenti,ecosysmgmt,wtrsupply,wtrquality,pubsafety,wade_Latit,wade_Longi,geometry,WaDEUUID,STATION_ID,DURATION,SENSOR_NUMBER,SENSOR_TYPE,DATE TIME,OBS DATE,VALUE,DATA_FLAG,UNITS,Duration_abbe,Plot,Data Collection,Start Date,End Date
0,GLC,GRANTLINE CANAL (USGS),Active-High Quality,DWR,CDEC,Active,Y,5634,Active,Y,Y,6294,Active,Y,Y,4928,Active,Y,Y,4928,Active,Y,1,Rain and seasonal groundwater (RGW),Artificial Path,3.9447,1.52306,https://cdec.water.ca.gov/dynamicapp/staMeta?s...,,18040003004612,1897448,San Joaquin Delta,18040003,Old River,1804000306,Lower Old River,180400030605,WG,N,0,N,N,N,N,Y,,0,water quality,High Priority - Keep Gage Active,,,,,B,B,,37.81986,-121.5484,POINT Z (-121.54840 37.81986 0.00000),in261,GLC,D,41,M FLOW,20080701 0000,20080701 0000,93,,CFS,d,(M FLOW),COMPUTED,07/21/1999,01/01/2025


In [13]:
# inner-join snow depth data to timeseries

dfin3 = pd.merge(dfsd, dfsd_ts, left_on='ID', right_on='STATION_ID', how='inner').reset_index(drop=True)
print(len(dfin3))
dfin3.head(1)

713298


Unnamed: 0,ID,STATION,ELEV_FEET,LATITUDE,LONGITUDE,OPERATOR AGENCY,WaDEUUID,STATION_ID,DURATION,SENSOR_NUMBER,SENSOR_TYPE,DATE TIME,OBS DATE,VALUE,DATA_FLAG,UNITS,Duration_abbe,Plot,Data Collection,Start Date,End Date
0,ADM,ADIN MOUNTAIN,6200,41.237,-120.792,Natural Resources Conservation Service,in30,ADM,D,18,SNOW DP,20041001 0000,20041001 0700,0,,INCHES,m,(SNOW DP),MANUAL ENTRY,04/01/1972,01/01/2025


## Get metadata & timeseries data
- https://cdec.water.ca.gov/dynamicapp/staMeta
- this is out of order. But essnetialy steps include 1) use site info to get site ids; 2) use site ids with metadata api to determine what timeseries is available; 3) retreive timeseries data for sites based on available metadata.
- metadata and timeseries data already retreived, use hard copies for inputs instead now.

#### Reservoir

In [14]:
# already done

# %%time
# # get Reservoirs metadata

# tempList = dfr['ID'].tolist()
# dftemp = pd.DataFrame()

# for i in range(len(tempList)):
#     idString = str(tempList[i]).strip()   
#     url = "https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=" + idString
#     try:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.content, 'html.parser')
#         table = soup.find_all('table')
#         rawData = pd.read_html(str(table))[1]
#         rawData["ID"] = idString
#         dftemp = pd.concat([dftemp, rawData])
#     except:
#         print(f' did not work, {url}')

# dftemp.to_csv('RawInputData/Reservoirs_Metadata.zip', compression=dict(method='zip', archive_name='Reservoirs_Metadata.csv'), index=False)

# print(len(dftemp))
# dftemp.head()

In [15]:
# # Input File: Reservoirs_Metadata

# fileInput = "RawInputData/Reservoirs_Metadata.zip"
# dfr_m = pd.read_csv(fileInput).replace(np.nan, "")
# print(len(dfr_m))
# dfr_m.head(1)

In [16]:
# # Clean up reservoir metadata

# dfr_m = dfr_m.rename(columns={"0": "Sensor Description",  "1": "SensorNums", "2" : "Duration", "3" : "Plot", "4" : "Data Collection", "5" : "Data Available"})
# dfr_m[['Start Date', 'End Date']] = dfr_m['Data Available'].str.split('to', n=1, expand=True)
# dfr_m['Start Date'] = dfr_m['Start Date'].str.strip()
# dfr_m['End Date'] = dfr_m['End Date'].str.replace('present','01/01/2025').str.strip()
# dfr_m.head()

# dfr_m.head(1)

In [17]:
# # left-join by reservoir metadata to reservoir site data

# dfr = pd.merge(dfr, dfr_m, left_on='ID', right_on='ID', how='left')
# print(len(dfr))
# dfr.head(1)

#### StreamGages

In [18]:
# already done

# %%time
# # get StreamGages metadata

# tempList = dfsg['siteid'].unique().tolist()
# dftemp = pd.DataFrame()

# for i in range(len(tempList)):
#     idString = str(tempList[i]).strip()   
#     url = "https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=" + idString
#     try:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.content, 'html.parser')
#         table = soup.find_all('table')
#         rawData = pd.read_html(str(table))[1]
#         rawData["siteid"] = idString
#         dftemp = pd.concat([dftemp, rawData])
#     except:
#         print(f' did not work, {url}')

# dftemp.to_csv('RawInputData/StreamGages_Metadata.zip', compression=dict(method='zip', archive_name='StreamGages_Metadata.csv'), index=False)

# print(len(dftemp))
# dftemp.head()

In [19]:
# # Input File: StreamGages_Metadata

# fileInput = "RawInputData/StreamGages_Metadata.zip"
# dfsg_m = pd.read_csv(fileInput).replace(np.nan, "")
# print(len(dfsg_m))
# dfsg_m.head(1)

In [20]:
# # Clean up streamgage metadata

# dfsg_m = dfsg_m.rename(columns={"0": "Sensor Description",  "1": "SensorNums", "2" : "Duration", "3" : "Plot", "4" : "Data Collection", "5" : "Data Available"})
# dfsg_m[['Start Date', 'End Date']] = dfsg_m['Data Available'].str.split('to', n=1, expand=True)
# dfsg_m['Start Date'] = dfsg_m['Start Date'].str.strip()
# dfsg_m['End Date'] = dfsg_m['End Date'].str.replace('present','01/01/2025').str.strip()
# dfsg_m.head(1)

In [21]:
# # left-join by streamgage metadata to streamgage site data

# dfsg = pd.merge(dfsg, dfsg_m, left_on='siteid', right_on='siteid', how='left')
# print(len(dfsg))
# dfsg.head(1)

#### Active Snow Depth Sensors

In [22]:
# already done

# %%time
# # get Active Snow Depth Sensors metadata

# tempList = dfsd['ID'].tolist()
# dftemp = pd.DataFrame()

# for i in range(len(tempList)):
#     idString = str(tempList[i]).strip()   
#     url = "https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=" + idString
#     try:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.content, 'html.parser')
#         table = soup.find_all('table')
#         rawData = pd.read_html(str(table))[1]
#         rawData["ID"] = idString
#         dftemp = pd.concat([dftemp, rawData])
#     except:
#         print(f' did not work, {url}')

# dftemp.to_csv('RawInputData/ActiveSnowDepthSesnsors_Metadata.zip', compression=dict(method='zip', archive_name='ActiveSnowDepthSesnsors_Metadata.csv'), index=False)

# print(len(dftemp))
# dftemp.head()

In [23]:
# # Input File: ActiveSnowDepthSesnsors_Metadata

# fileInput = "RawInputData/ActiveSnowDepthSesnsors_Metadata.zip"
# dfsd_m = pd.read_csv(fileInput).replace(np.nan, "")
# print(len(dfsd_m))
# dfsd_m.head(1)

In [24]:
# # Clean up ActiveSnowDepthSesnsors_Metadata

# dfsd_m = dfsd_m.rename(columns={"0": "Sensor Description",  "1": "SensorNums", "2" : "Duration", "3" : "Plot", "4" : "Data Collection", "5" : "Data Available"})
# dfsd_m[['Start Date', 'End Date']] = dfsd_m['Data Available'].str.split('to', n=1, expand=True)
# dfsd_m['Start Date'] = dfsd_m['Start Date'].str.strip()
# dfsd_m['End Date'] = dfsd_m['End Date'].str.replace('present','01/01/2025').str.strip()
# dfsd_m.head(1)

In [25]:
# # left-join by ActiveSnowDepthSesnsors_Metadata to site data

# dfsd = pd.merge(dfsd, dfsd_m, left_on='ID', right_on='ID', how='left')
# print(len(dfsd))
# dfsd.head(1)

#### Get Timeseries

In [26]:
# # abbreviate Duration 

# durationDict = {
# "(daily)" : "d",
# "(monthly)" : "m",
# "(event)" : "e", 
# "(hourly)" : "h"
# }

# def CreateDurationAPIValueFunc(val):
#     val = str(val).strip()
#     try:
#         outString = durationDict[val]
#     except:
#         outString = ""
#     return outString

# dfr['Duration_abb'] = dfr.apply(lambda row: CreateDurationAPIValueFunc(row['Duration']), axis=1)
# dfsg['Duration_abb'] = dfsg.apply(lambda row: CreateDurationAPIValueFunc(row['Duration']), axis=1)
# dfsd['Duration_abb'] = dfsd.apply(lambda row: CreateDurationAPIValueFunc(row['Duration']), axis=1)

In [27]:
# # drop rows that do not contain monthly (m) or dailiy (d) data.

# timestep = ['m', 'd']
# dfr = dfr[dfr['Duration_abb'].isin(timestep)]
# dfsg = dfsg[dfsg['Duration_abb'].isin(timestep)]
# dfsd = dfsd[dfsd['Duration_abb'].isin(timestep)]

In [28]:
# # we only want the following Reservoir SesnorNums data...
# # RESERVOIR ELEVATION, FEET, 6, (daily)
# # RESERVOIR STORAGE, AF, 15, (daily), (monthly)
# # RESERVOIR OUTFLOW, CFS, 23, (daily)
# # RESERVOIR INFLOW, CFS, 76, (daily)

# dfr['SensorNums'] = dfr['SensorNums'].fillna(0).astype(np.int64).astype(str)
# dfr = dfr[dfr['SensorNums'].astype(str).isin(['6', '15', '23', '76'])]
# dfr['SensorNums'].value_counts()

In [29]:
# # we only want the following streamgage SesnorNums data...
# # FLOW, MEAN DAILY, CFS, 41, (daily)
# # FLOW, FULL NATURAL, AF, 65, (monthly)
# # FLOW, MONTHLY VOLUME, AF, 66, (monthly)

# dfsg['SensorNums'] = dfsg['SensorNums'].fillna(0).astype(np.int64).astype(str)
# dfsg = dfsg[dfsg['SensorNums'].astype(str).isin(['41', '65', '66'])]
# dfsg['SensorNums'].value_counts()

In [30]:
# # we only want the following snow depth SesnorNums data...
# # SNOW DEPTH, INCHES, 18, (daily)

# dfsd['SensorNums'] = dfsd['SensorNums'].fillna(0).astype(np.int64).astype(str)
# dfsd = dfsd[dfsd['SensorNums'].astype(str).isin(['18'])]
# dfsd['SensorNums'].value_counts()

In [31]:
# %%time
# # get timeseries for reservoirs

# stationsList = dfr['ID'].tolist()
# sensorNumsList = dfr['SensorNums'].tolist()
# dur_codeList =  dfr['Duration_abb'].tolist()
# plotList = dfr['Plot'].tolist()
# datacollectionList = dfr['Data Collection'].tolist()
# startList = dfr['Start Date'].tolist()
# endList = dfr['End Date'].tolist()

# # Time Series Dataframe
# dfr_ts = pd.DataFrame()

# for i in range(len(stationsList)):
#     stationStr = str(stationsList[i]).strip()
#     sensorNumsStr = str(sensorNumsList[i]).strip()
#     dur_codeStr = str(dur_codeList[i]).strip()
#     plotStr = str(plotList[i]).strip()
#     datacollectionStr = str(datacollectionList[i]).strip()
#     startStr = str(startList[i]).strip()
#     endStr = str(endList[i]).strip()   
#     urlInput = "https://cdec.water.ca.gov/dynamicapp/req/CSVDataServlet?Stations=" + stationStr + "&SensorNums=" + sensorNumsStr + "&dur_code=" + dur_codeStr + "&Start=" + startStr +"&End=" + endStr
#     try:
#         tempdf = pd.read_csv(urlInput).replace(np.nan, "")
#         dfr_ts = pd.concat([dfr_ts, tempdf])
#         dfr_ts['Duration_abbe'] = dur_codeStr
#         dfr_ts['Plot'] = plotStr
#         dfr_ts['Data Collection'] = datacollectionStr
#         dfr_ts['Start Date'] = startStr
#         dfr_ts['End Date'] = endStr
              
#     except:
#         print("...bad response")

# dfr_ts.to_csv('RawInputData/Reservoirs_timeseries.zip', compression=dict(method='zip', archive_name='Reservoirs_timeseries.csv'), index=False)
# print(len(dfr_ts))
# dfr_ts.head()

In [32]:
# %%time
# # get timeseries for streamgages

# dfsg['SensorNums'] = dfsg['SensorNums'].astype(int).astype(str)

# stationsList = dfsg['siteid'].tolist()
# sensorNumsList = dfsg['SensorNums'].tolist()
# dur_codeList =  dfsg['Duration_abb'].tolist()
# plotList = dfsg['Plot'].tolist()
# datacollectionList = dfsg['Data Collection'].tolist()
# startList = dfsg['Start Date'].tolist()
# endList = dfsg['End Date'].tolist()

# # Time Series Dataframe
# dfsg_ts = pd.DataFrame()

# for i in range(len(stationsList)):
#     stationStr = str(stationsList[i]).strip()
#     sensorNumsStr = str(sensorNumsList[i]).strip()
#     dur_codeStr = str(dur_codeList[i]).strip()
#     plotStr = str(plotList[i]).strip()
#     datacollectionStr = str(datacollectionList[i]).strip()
#     startStr = str(startList[i]).strip()
#     endStr = str(endList[i]).strip()   
#     urlInput = "https://cdec.water.ca.gov/dynamicapp/req/CSVDataServlet?Stations=" + stationStr + "&SensorNums=" + sensorNumsStr + "&dur_code=" + dur_codeStr + "&Start=" + startStr +"&End=" + endStr
#     try:
#         tempdf = pd.read_csv(urlInput).replace(np.nan, "")
#         dfsg_ts = pd.concat([dfsg_ts, tempdf])
#         dfsg_ts['Duration_abbe'] = dur_codeStr
#         dfsg_ts['Plot'] = plotStr
#         dfsg_ts['Data Collection'] = datacollectionStr
#         dfsg_ts['Start Date'] = startStr
#         dfsg_ts['End Date'] = endStr
              
#     except:
#         print("...bad response")

# dfsg_ts.to_csv('RawInputData/StreamGages_timeseries.zip', compression=dict(method='zip', archive_name='StreamGages_timeseries.csv'), index=False)
# print(len(dfsg_ts))
# dfsg_ts.head()

In [33]:
# %%time
# # get timeseries for ActiveSnowDepthSesnsors

# dfsd['SensorNums'] = dfsd['SensorNums'].astype(int).astype(str)

# stationsList = dfsd['ID'].tolist()
# sensorNumsList = dfsd['SensorNums'].tolist()
# dur_codeList =  dfsd['Duration_abb'].tolist()
# plotList = dfsd['Plot'].tolist()
# datacollectionList = dfsd['Data Collection'].tolist()
# startList = dfsd['Start Date'].tolist()
# endList = dfsd['End Date'].tolist()

# # Time Series Dataframe
# dfsd_ts = pd.DataFrame()

# for i in range(len(stationsList)):
#     stationStr = str(stationsList[i]).strip()
#     sensorNumsStr = str(sensorNumsList[i]).strip()
#     dur_codeStr = str(dur_codeList[i]).strip()
#     plotStr = str(plotList[i]).strip()
#     datacollectionStr = str(datacollectionList[i]).strip()
#     startStr = str(startList[i]).strip()
#     endStr = str(endList[i]).strip()   
#     urlInput = "https://cdec.water.ca.gov/dynamicapp/req/CSVDataServlet?Stations=" + stationStr + "&SensorNums=" + sensorNumsStr + "&dur_code=" + dur_codeStr + "&Start=" + startStr +"&End=" + endStr
#     try:
#         tempdf = pd.read_csv(urlInput).replace(np.nan, "")
#         dfsd_ts = pd.concat([dfsd_ts, tempdf])
#         dfsd_ts['Duration_abbe'] = dur_codeStr
#         dfsd_ts['Plot'] = plotStr
#         dfsd_ts['Data Collection'] = datacollectionStr
#         dfsd_ts['Start Date'] = startStr
#         dfsd_ts['End Date'] = endStr
              
#     except:
#         print("...bad response")

# dfsd_ts.to_csv('RawInputData/ActiveSnowDepthSesnsors_timeseries.zip', compression=dict(method='zip', archive_name='ActiveSnowDepthSesnsors_timeseries.csv'), index=False)
# print(len(dfsd_ts))
# dfsd_ts.head()

## WaDE Data

In [34]:
# reservoir data
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin1['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "CAwsss_M1"

# Variable Info
df['in_AggregationIntervalUnitCV'] = dfin1['Duration_abbe']
df['in_AmountUnitCV'] = dfin1['UNITS']
df['in_VariableCV'] = "Water Supply"

# Organization Info
df['in_OrganizationUUID'] = "CAwsss_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = "WaDE Blank" # need this for auto fill below
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = "Surface Water" # need this for auto fill below

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = dfin1['County']
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] =dfin1['Latitude']
df['in_Longitude'] = dfin1['Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = ""
df['in_SiteName'] = dfin1['Station']
df['in_SiteNativeID'] = dfin1['STATION_ID']
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "Reservoir"
df['in_StateCV'] = "CA"
df['in_USGSSiteID'] = ""

# Site VariableAmounts Info
df['in_Amount'] = dfin1.VALUE.replace('---',0).replace('BRT',0).replace('ART',0).astype(float)
df['in_AllocationCropDutyAmount'] = ""
df['in_AssociatedNativeAllocationIDs'] = ""
df['in_BeneficialUseCategory'] = dfin1['SENSOR_TYPE']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_Geometry'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerGeneratedGWh'] = ""
df['in_PowerType'] = ""
df['in_PrimaryUseCategory'] = dfin1['SENSOR_TYPE']
df['in_ReportYearCV'] =  dfin1['OBS DATE']
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = dfin1['OBS DATE']
df['in_TimeframeStart'] = dfin1['OBS DATE']

outdf1 = df.copy()
outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

2654305


Unnamed: 0,WaDEUUID,in_MethodUUID,in_AggregationIntervalUnitCV,in_AmountUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_Amount,in_AllocationCropDutyAmount,in_AssociatedNativeAllocationIDs,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart
0,in10,CAwsss_M1,m,AF,Water Supply,CAwsss_O1,,,,WaDE Blank,,Surface Water,,,SAN DIEGO,4326,,,,32.854,-116.892,,,,LAKE JENNINGS,JNN,,Reservoir,CA,,0.0,,,STORAGE,,,,,,,,,,,STORAGE,20211001 0000,,20211001 0000,20211001 0000
1,in10,CAwsss_M1,m,AF,Water Supply,CAwsss_O1,,,,WaDE Blank,,Surface Water,,,SAN DIEGO,4326,,,,32.854,-116.892,,,,LAKE JENNINGS,JNN,,Reservoir,CA,,0.0,,,STORAGE,,,,,,,,,,,STORAGE,20211002 0000,,20211002 0000,20211002 0000
2,in10,CAwsss_M1,m,AF,Water Supply,CAwsss_O1,,,,WaDE Blank,,Surface Water,,,SAN DIEGO,4326,,,,32.854,-116.892,,,,LAKE JENNINGS,JNN,,Reservoir,CA,,0.0,,,STORAGE,,,,,,,,,,,STORAGE,20211003 0000,,20211003 0000,20211003 0000
3,in10,CAwsss_M1,m,AF,Water Supply,CAwsss_O1,,,,WaDE Blank,,Surface Water,,,SAN DIEGO,4326,,,,32.854,-116.892,,,,LAKE JENNINGS,JNN,,Reservoir,CA,,0.0,,,STORAGE,,,,,,,,,,,STORAGE,20211004 0000,,20211004 0000,20211004 0000
4,in10,CAwsss_M1,m,AF,Water Supply,CAwsss_O1,,,,WaDE Blank,,Surface Water,,,SAN DIEGO,4326,,,,32.854,-116.892,,,,LAKE JENNINGS,JNN,,Reservoir,CA,,0.0,,,STORAGE,,,,,,,,,,,STORAGE,20211005 0000,,20211005 0000,20211005 0000


In [35]:
# stream gage data
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin2['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "CAwsss_M1"

# Variable Info
df['in_AggregationIntervalUnitCV'] = dfin2['Duration_abbe']
df['in_AmountUnitCV'] = dfin2['UNITS']
df['in_VariableCV'] = "Water Supply"

# Organization Info
df['in_OrganizationUUID'] = "CAwsss_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = "WaDE Blank" # need this for auto fill below
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = "Surface Water" # need this for auto fill below

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = dfin2['huc12']
df['in_HUC8'] = dfin2['huc8']
df['in_Latitude'] = dfin2['wade_Latit']
df['in_Longitude'] = dfin2['wade_Longi']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = ""
df['in_SiteName'] = dfin2['sitename']
df['in_SiteNativeID'] = dfin2['siteid']
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "Stream Gage"
df['in_StateCV'] = "CA"
df['in_USGSSiteID'] = ""

# Site VariableAmounts Info
df['in_Amount'] = dfin2.VALUE.replace('---',0).replace('BRT',0).replace('ART',0).astype(float)
df['in_AllocationCropDutyAmount'] = ""
df['in_AssociatedNativeAllocationIDs'] = ""
df['in_BeneficialUseCategory'] = dfin2['SENSOR_TYPE']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_Geometry'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerGeneratedGWh'] = ""
df['in_PowerType'] = ""
df['in_PrimaryUseCategory'] = dfin2['SENSOR_TYPE']
df['in_ReportYearCV'] =  dfin2['OBS DATE']
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = dfin2['OBS DATE']
df['in_TimeframeStart'] = dfin2['OBS DATE']

outdf2 = df.copy()
outdf2 = outdf2.drop_duplicates().reset_index(drop=True)
print(len(outdf2))
outdf2.head()

297610


Unnamed: 0,WaDEUUID,in_MethodUUID,in_AggregationIntervalUnitCV,in_AmountUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_Amount,in_AllocationCropDutyAmount,in_AssociatedNativeAllocationIDs,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart
0,in261,CAwsss_M1,d,CFS,Water Supply,CAwsss_O1,,,,WaDE Blank,,Surface Water,,,,4326,,180400030605,18040003,37.81986,-121.5484,,,,GRANTLINE CANAL (USGS),GLC,,Stream Gage,CA,,93.0,,,M FLOW,,,,,,,,,,,M FLOW,20080701 0000,,20080701 0000,20080701 0000
1,in261,CAwsss_M1,d,CFS,Water Supply,CAwsss_O1,,,,WaDE Blank,,Surface Water,,,,4326,,180400030605,18040003,37.81986,-121.5484,,,,GRANTLINE CANAL (USGS),GLC,,Stream Gage,CA,,-250.0,,,M FLOW,,,,,,,,,,,M FLOW,20080702 0000,,20080702 0000,20080702 0000
2,in261,CAwsss_M1,d,CFS,Water Supply,CAwsss_O1,,,,WaDE Blank,,Surface Water,,,,4326,,180400030605,18040003,37.81986,-121.5484,,,,GRANTLINE CANAL (USGS),GLC,,Stream Gage,CA,,-10.0,,,M FLOW,,,,,,,,,,,M FLOW,20080703 0000,,20080703 0000,20080703 0000
3,in261,CAwsss_M1,d,CFS,Water Supply,CAwsss_O1,,,,WaDE Blank,,Surface Water,,,,4326,,180400030605,18040003,37.81986,-121.5484,,,,GRANTLINE CANAL (USGS),GLC,,Stream Gage,CA,,-19.0,,,M FLOW,,,,,,,,,,,M FLOW,20080704 0000,,20080704 0000,20080704 0000
4,in261,CAwsss_M1,d,CFS,Water Supply,CAwsss_O1,,,,WaDE Blank,,Surface Water,,,,4326,,180400030605,18040003,37.81986,-121.5484,,,,GRANTLINE CANAL (USGS),GLC,,Stream Gage,CA,,41.0,,,M FLOW,,,,,,,,,,,M FLOW,20080705 0000,,20080705 0000,20080705 0000


In [36]:
# Snow depth data
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin3['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "CAwsss_M1"

# Variable Info
df['in_AggregationIntervalUnitCV'] = dfin3['Duration_abbe']
df['in_AmountUnitCV'] = dfin3['UNITS']
df['in_VariableCV'] = "Water Supply"

# Organization Info
df['in_OrganizationUUID'] = "CAwsss_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = "WaDE Blank" # need this for auto fill below
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = "Surface Water" # need this for auto fill below

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfin3['LATITUDE']
df['in_Longitude'] = dfin3['LONGITUDE']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = ""
df['in_SiteName'] = dfin3['STATION']
df['in_SiteNativeID'] = dfin3['STATION_ID']
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "Snow Depth"
df['in_StateCV'] = "CA"
df['in_USGSSiteID'] = ""

# Site VariableAmounts Info
df['in_Amount'] = dfin3.VALUE.replace('---',0).replace('BRT',0).replace('ART',0).astype(float)
df['in_AllocationCropDutyAmount'] = ""
df['in_AssociatedNativeAllocationIDs'] = ""
df['in_BeneficialUseCategory'] = dfin3['SENSOR_TYPE']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_Geometry'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerGeneratedGWh'] = ""
df['in_PowerType'] = ""
df['in_PrimaryUseCategory'] = dfin3['SENSOR_TYPE']
df['in_ReportYearCV'] =  dfin3['OBS DATE']
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = dfin3['OBS DATE']
df['in_TimeframeStart'] = dfin3['OBS DATE']

outdf3 = df.copy()
outdf3 = outdf3.drop_duplicates().reset_index(drop=True)
print(len(outdf3))
outdf3.head()

712795


Unnamed: 0,WaDEUUID,in_MethodUUID,in_AggregationIntervalUnitCV,in_AmountUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_Amount,in_AllocationCropDutyAmount,in_AssociatedNativeAllocationIDs,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart
0,in30,CAwsss_M1,m,INCHES,Water Supply,CAwsss_O1,,,,WaDE Blank,,Surface Water,,,,4326,,,,41.237,-120.792,,,,ADIN MOUNTAIN,ADM,,Snow Depth,CA,,0.0,,,SNOW DP,,,,,,,,,,,SNOW DP,20041001 0700,,20041001 0700,20041001 0700
1,in30,CAwsss_M1,m,INCHES,Water Supply,CAwsss_O1,,,,WaDE Blank,,Surface Water,,,,4326,,,,41.237,-120.792,,,,ADIN MOUNTAIN,ADM,,Snow Depth,CA,,0.0,,,SNOW DP,,,,,,,,,,,SNOW DP,20041002 0700,,20041002 0700,20041002 0700
2,in30,CAwsss_M1,m,INCHES,Water Supply,CAwsss_O1,,,,WaDE Blank,,Surface Water,,,,4326,,,,41.237,-120.792,,,,ADIN MOUNTAIN,ADM,,Snow Depth,CA,,-0.0,,,SNOW DP,,,,,,,,,,,SNOW DP,20041003 0700,,20041003 0700,20041003 0700
3,in30,CAwsss_M1,m,INCHES,Water Supply,CAwsss_O1,,,,WaDE Blank,,Surface Water,,,,4326,,,,41.237,-120.792,,,,ADIN MOUNTAIN,ADM,,Snow Depth,CA,,-0.0,,,SNOW DP,,,,,,,,,,,SNOW DP,20041004 0700,,20041004 0700,20041004 0700
4,in30,CAwsss_M1,m,INCHES,Water Supply,CAwsss_O1,,,,WaDE Blank,,Surface Water,,,,4326,,,,41.237,-120.792,,,,ADIN MOUNTAIN,ADM,,Snow Depth,CA,,0.0,,,SNOW DP,,,,,,,,,,,SNOW DP,20041005 0700,,20041005 0700,20041005 0700


In [37]:
# Concatenate dataframes
frames = [outdf1, outdf2, outdf3]
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

3664710


## Clean Data / data types

In [38]:
# Fix AggregationIntervalUnit input

intervalDict = {
    "d" : "Daily",
    "m" : "Monthly",
}
def FixAggregationIntervalUnit(val):
    val = str(val).strip()
    try:
        outString = intervalDict[val]
    except:
        outString = "NULL"
    return outString

outdf['in_AggregationIntervalUnitCV'] = outdf.apply(lambda row: FixAggregationIntervalUnit(row['in_AggregationIntervalUnitCV']), axis=1)
outdf['in_AggregationIntervalUnitCV'].unique()

array(['Monthly', 'Daily'], dtype=object)

In [39]:
# Fix PrimaryUseCategoryCV input
# This dictionary is an estimated translation of the metadata from CA's systems

PrimaryUseCategoryCVDict = {
"% UP 01" : "Percentage of Upper 01 (possibly related to a specific upper layer or zone)",
"% UP 02" : "Percentage of Upper 02",
"% UP 03" : "Percentage of Upper 03",
"% UP 04" : "Percentage of Upper 04",
"% UP 05" : "Percentage of Upper 05",
"% UP 06" : "Percentage of Upper 06",
"% UP 07" : "Percentage of Upper 07",
"% UP 08" : "Percentage of Upper 08",
"% UP 09" : "Percentage of Upper 09",
"% UP 10" : "Percentage of Upper 10",
"%Q" : "Percentage of Discharge Flow",
"10day%Q" : "10-day Percentage of Flow Discharge",
"10dayQ" : "10-day Flow Discharge",
"1day%Q" : "1-day Percentage of Flow Discharge",
"1DAYQ" : "1-day Flow Discharge",
"ABV TOC" : "Above Top of Conservation",
"AJ 10%" : "Adjusted 10% Exceedance",
"AJ 50%" : "Adjusted 50% Exceedance",
"AJ 90%" : "Adjusted 90% Exceedance",
"AUXFLOW" : "Auxiliary Flow",
"AVG INF" : "Average Inflow",
"BAR PRE" : "Barometric Pressure",
"BAT VOL" : "Battery Voltage",
"BAT VOLA" : "Battery Voltage A",
"CHLORPH" : "Chlorophyll",
"CONTROL" : "Control Flow or Parameter",
"D ORGCO" : "Dissolved Organic Carbon",
"D ORGCZ" : "Dissolved Organic Carbon Zone",
"DC PUMP" : "Direct Current Pump",
"DEW PT" : "Dew Point",
"DIS OXY" : "Dissolved Oxygen",
"DIS PWR" : "Discharge Power",
"Diss Br" : "Dissolved Bromide",
"Diss Cl" : "Dissolved Chloride",
"Diss F" : "Dissolved Fluoride",
"DissNO3" : "Dissolved Nitrate",
"DissPO4" : "Dissolved Phosphate",
"DissSO4" : "Dissolved Sulfate",
"DIVERSN" : "Diversion",
"DIVERSN" : "Diversion Flow",
"DO MAX" : "Maximum Dissolved Oxygen",
"DO MDN" : "Median Dissolved Oxygen",
"DO MIN" : "Minimum Dissolved Oxygen",
"E T" : "Evapotranspiration",
"EC MAX" : "Maximum Electrical Conductivity",
"EC MDN" : "Median Electrical Conductivity",
"EC MIN" : "Minimum Electrical Conductivity",
"EL CND" : "Electrical Conductivity (Generic)",
"EL COND" : "Electrical Conductivity",
"EL CONDB" : "Electrical Conductivity (Backup)",
"EVAP" : "Evaporation",
"EVP PAN" : "Evaporation Pan",
"FDOM" : "Fluorescent Dissolved Organic Matter",
"FGAMRVL" : "Flow Gauge Manual River Level",
"FLOW" : "Flow Rate",
"FLOW.XX" : "Flow Rate (specific sensor or parameter)",
"FNF ACC" : "Forecast Natural Flow Accumulation",
"FNF" : "Forecasted Natural Flow",
"FOUTFLW" : "Forecast Outflow",
"FTEMPVL" : "Forecast Temperature Value",
"FTOCSTO" : "Forecast to Storage",
"HEAD HT" : "Head Height",
"INFLOW" : "Inflow",
"IRR&CNS" : "Irrigation and Conservation",
"LK EVAP" : "Lake Evaporation",
"M FLOW" : "Mean Flow",
"MON FLO" : "Monthly Flow",
"MON FNF" : "Monthly Forecasted Natural Flow",
"NSLR AV" : "Net Solar Radiation Average",
"NSLR MN" : "Net Solar Radiation Minimum",
"NSLR MX" : "Net Solar Radiation Maximum",
"OUTFLOW" : "Outflow",
"OUTFLWV" : "Outflow Volume",
"PEAK WD" : "Peak Wind Direction",
"PEAK WS" : "Peak Wind Speed",
"PH VAL" : "pH Value",
"PPT INC" : "Precipitation Increment",
"PPTINC4" : "Precipitation Increment (4-hour)",
"RAIN" : "Rainfall",
"RAINTIP" : "Rain Tip Gauge",
"REL HUM" : "Relative Humidity",
"REL SCH" : "Release Schedule",
"RES CHG" : "Reservoir Change",
"RES ELE" : "Reservoir Elevation",
"RGAMRVL" : "River Gauge Manual River Level",
"RIV REL" : "River Release",
"RIV STG" : "River Stage",
"RIVST29" : "River Stage at Station 29",
"RIVST88" : "River Stage at Station 88",
"RIVSTGA" : "River Stage Gauge",
"RTEMPVL" : "Real-Time Temperature Value",
"SLRR AV" : "Solar Radiation Average",
"SLRR IN" : "Solar Radiation Incoming",
"SLRR MN" : "Solar Radiation Minimum",
"SLRR MX" : "Solar Radiation Maximum",
"SLRRREF" : "Solar Radiation Reference",
"SNO ADJ" : "Snow Adjustment",
"SNOW DP" : "Snow Depth",
"SNOW WC" : "Snow Water Content",
"SOIL TP" : "Soil Temperature",
"SOILMD1" : "Soil Moisture Depth 1",
"SOILMD2" : "Soil Moisture Depth 2",
"SOILMD3" : "Soil Moisture Depth 3",
"SOILTD1" : "Soil Temperature Depth 1",
"SOILTD2" : "Soil Temperature Depth 2",
"SOILTD3" : "Soil Temperature Depth 3",
"SOLAR R" : "Solar Radiation",
"SPILL" : "Spill Rate",
"STAGE F" : "Stage Flow",
"STORAGE" : "Storage Volume",
"T ORG C" : "Total Organic Carbon",
"T ORGCZ" : "Total Organic Carbon Zone",
"TEMP AV" : "Average Temperature",
"TEMP MN" : "Minimum Temperature",
"TEMP MX" : "Maximum Temperature",
"TEMP W" : "Water Temperature",
"TEMP" : "Air Temperature",
"TEMPIDX" : "Temperature Index",
"TEMPW C" : "Water Temperature in Celsius",
"TMPW MAX" : "Maximum Water Temperature",
"TMPW MDN" : "Median Water Temperature",
"TMPW MIN" : "Minimum Water Temperature",
"TOC STO" : "Top of Conservation Storage",
"TURB W" : "Turbidity in Water",
"TURB WF" : "Turbidity Flow Rate",
"TURBVAR" : "Turbidity Variance",
"VLOCITY" : "Velocity (Flow Speed)",
"WIND DR" : "Wind Direction",
"WIND SP" : "Wind Speed",
"WINDLEN" : "Wind Length",
"WY 10%" : "Water Year 10% Exceedance",
"WY 50%" : "Water Year 50% Exceedance",
"WY 90%" : "Water Year 90% Exceedance"
}

def FixPrimaryUseCategoryCVFunc(val):
    val = str(val).strip()
    try:
        outString = PrimaryUseCategoryCVDict[val]
    except:
        if val == "":
            outString = "NULL"
        else:
            outString = val
    return outString


outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: FixPrimaryUseCategoryCVFunc(row['in_BeneficialUseCategory']), axis=1)
outdf['in_PrimaryUseCategory'] = outdf.apply(lambda row: FixPrimaryUseCategoryCVFunc(row['in_PrimaryUseCategory']), axis=1)
outdf['in_PrimaryUseCategory'].value_counts()

in_PrimaryUseCategory
Storage Volume                     1197026
Reservoir Elevation                 756959
Snow Depth                          712795
Outflow                             374725
Inflow                              325595
Mean Flow                           241148
Monthly Forecasted Natural Flow      36327
Monthly Flow                         20135
Name: count, dtype: int64

In [40]:
# Clean name entries of spcial characters
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip().rstrip(',')
    return Val

  Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip().rstrip(',')


In [41]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['Wade Blank'], dtype=object)

In [42]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['Lake Jennings', 'Bear Valley Dam', 'Thermalito Divers Pool',
       'Thermalito Forebay', 'Thermalito Total',
       'San Luis Reservoir State', 'San Luis Reservoir Federal',
       'Bear River At Camp Far West Dam', 'Antelope Lake',
       'Lake Davis Dwr', 'Frenchman Dam', 'Oroville Dam',
       'Oneill Forebay', 'San Luis Reservoir', 'Pyramid', 'Castaic',
       'Perris', 'Del Valle', 'Thermalito Afterbay', 'Lake Silverwood',
       'Spicer Meadows', 'Grant Lake', 'Lake Crowley',
       'Pleasant Valley Reservoir', 'Tinemaha Reservoir', 'Haiwee',
       'Fairmont Reservoir', 'San Vicente', 'Lower Otay', 'Gibraltar Dam',
       'Los Vaqueros Reservoir', 'Briones Dam', 'U San Leandro', 'Chabot',
       'San Pablo', 'Pardee', 'Camanche Reservoir', 'Caples Lake Eid',
       'Silver Lake Reservoir', 'Jenkinson Lake', 'Railroad Canyon',
       'Stumpy Meadows Reservoirmark Edson Dam', 'Lake Eleanor',
       'Hetch Hetchy', 'Ruth Dam', 'Lake Hemet', 'Bouquet Canyon',
       'Cogswe

In [43]:
outdf['in_County'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['San Diego', 'San Bernardino', 'Butte', 'Merced', 'Yuba', 'Plumas',
       'Los Angeles', 'Riverside', 'Alameda', 'Tuolumne', 'Mono', 'Inyo',
       'Santa Barbara', 'Contra Costa', 'Calaveras', 'San Joaquin',
       'Alpine', 'Amador', 'El Dorado', 'Trinity', 'Marin', 'Mariposa',
       'Stanislaus', 'Monterey', 'San Luis Obispo', 'Napa', 'Nevada',
       'Placer', 'Shasta', 'Lassen', 'Madera', 'Fresno', 'San Mateo',
       'Santa Clara', 'Orange', 'Ventura', 'Sonoma', 'Tehama', 'Tulare',
       'Kern', 'Siskiyou', 'Modoc', 'Glenn', 'Colusa', 'Sacramento',
       'State Of Arizona', 'State Of Nevada', 'Sierra', 'Lake', ''],
      dtype=object)

In [44]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [45]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['Wade Blank'], dtype=object)

In [46]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Surface Water'], dtype=object)

In [47]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array(['Reservoir', 'Stream Gage', 'Snow Depth'], dtype=object)

In [48]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['Lake Jennings', 'Bear Valley Dam', 'Thermalito Divers Pool',
       'Thermalito Forebay', 'Thermalito Total',
       'San Luis Reservoir State', 'San Luis Reservoir Federal',
       'Bear River At Camp Far West Dam', 'Antelope Lake',
       'Lake Davis Dwr', 'Frenchman Dam', 'Oroville Dam',
       'Oneill Forebay', 'San Luis Reservoir', 'Pyramid', 'Castaic',
       'Perris', 'Del Valle', 'Thermalito Afterbay', 'Lake Silverwood',
       'Spicer Meadows', 'Grant Lake', 'Lake Crowley',
       'Pleasant Valley Reservoir', 'Tinemaha Reservoir', 'Haiwee',
       'Fairmont Reservoir', 'San Vicente', 'Lower Otay', 'Gibraltar Dam',
       'Los Vaqueros Reservoir', 'Briones Dam', 'U San Leandro', 'Chabot',
       'San Pablo', 'Pardee', 'Camanche Reservoir', 'Caples Lake Eid',
       'Silver Lake Reservoir', 'Jenkinson Lake', 'Railroad Canyon',
       'Stumpy Meadows Reservoirmark Edson Dam', 'Lake Eleanor',
       'Hetch Hetchy', 'Ruth Dam', 'Lake Hemet', 'Bouquet Canyon',
       'Cogswe

In [49]:
outdf['in_County'] = outdf.apply(lambda row: ensureEmptyString(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['San Diego', 'San Bernardino', 'Butte', 'Merced', 'Yuba', 'Plumas',
       'Los Angeles', 'Riverside', 'Alameda', 'Tuolumne', 'Mono', 'Inyo',
       'Santa Barbara', 'Contra Costa', 'Calaveras', 'San Joaquin',
       'Alpine', 'Amador', 'El Dorado', 'Trinity', 'Marin', 'Mariposa',
       'Stanislaus', 'Monterey', 'San Luis Obispo', 'Napa', 'Nevada',
       'Placer', 'Shasta', 'Lassen', 'Madera', 'Fresno', 'San Mateo',
       'Santa Clara', 'Orange', 'Ventura', 'Sonoma', 'Tehama', 'Tulare',
       'Kern', 'Siskiyou', 'Modoc', 'Glenn', 'Colusa', 'Sacramento',
       'State Of Arizona', 'State Of Nevada', 'Sierra', 'Lake', ''],
      dtype=object)

In [50]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
uniqueList = list(set([i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')]))
uniqueList.sort()
uniqueList

['Inflow',
 'Mean Flow',
 'Monthly Flow',
 'Monthly Forecasted Natural Flow',
 'Outflow',
 'Reservoir Elevation',
 'Snow Depth',
 'Storage Volume']

In [51]:
# Ensure Latitude entry is numireic, replace '0' values for removal
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

array([32.854     , 34.242     , 39.528     , 39.519     , 39.458     ,
       37.001     , 39.05      , 40.18      , 39.883     , 39.54      ,
       37.067     , 37.033     , 34.644153  , 34.5152    , 33.858     ,
       37.614     , 39.45      , 34.304     , 38.393     , 37.862     ,
       37.604     , 37.423578  , 37.058     , 36.137     , 34.711777  ,
       32.912     , 32.609     , 34.526     , 37.838     , 37.9151    ,
       37.7644    , 37.73      , 37.9423    , 38.25      , 38.225     ,
       38.707     , 38.669     , 38.713     , 33.675     , 38.903     ,
       37.974     , 37.95      , 40.367     , 33.666     , 34.586714  ,
       34.245     , 34.207     , 38.153     , 38.075     , 37.997     ,
       37.94      , 37.52      , 33.833     , 33.67      , 33.583     ,
       37.657     , 35.798     , 35.758     , 38.482     , 32.99      ,
       39.509     , 39.44      , 39.135     , 39.274     , 38.203     ,
       34.1178    , 41.135     , 41.045     , 41.022     , 40.84

In [52]:
# Ensure Longitude entry is numireic, replace '0' values for removal
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

array([-116.892     , -116.978     , -121.543     , -121.629     ,
       -121.638     , -121.002     , -121.317     , -120.607     ,
       -120.467     , -120.183     , -121.493     , -121.067     ,
       -121.133     , -118.764528  , -118.6101    , -117.183     ,
       -121.745     , -121.633     , -117.318     , -119.998     ,
       -119.102     , -118.707     , -118.583576  , -118.225     ,
       -117.948     , -118.433462  , -116.924     , -116.927     ,
       -119.686     , -121.726     , -122.207     , -122.1016    ,
       -122.122     , -122.2587    , -120.85      , -121.021     ,
       -120.048     , -120.121     , -120.56      , -117.272     ,
       -120.603     , -119.88      , -119.783     , -123.433     ,
       -116.705     , -118.395314  , -117.965     , -117.858     ,
       -122.782     , -122.757     , -122.702     , -122.637     ,
       -120.309     , -117.46      , -117.066     , -117.122     ,
       -120.675     , -120.883     , -120.884     , -122.372  

In [53]:
# Ensure Amount entry is either numireic or blank, no 0 entries
outdf['in_Amount'] = pd.to_numeric(outdf['in_Amount'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_Amount'].unique()

array(['', 820.0, 1910.0, ..., 19217.0, -458.0, -6364.0], dtype=object)

In [54]:
# Ensure PopulationServed entry is numireic WITH 0 entries (no blank strings)
outdf['in_PopulationServed'] = pd.to_numeric(outdf['in_PopulationServed'], errors='coerce').round().replace("",0).fillna(0).astype(int).replace(0,"").fillna("")
outdf['in_PopulationServed'].unique()

array([''], dtype=object)

In [55]:
# Convert TimeframeEnd to YYYY-MM-DD format.
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeEnd'].unique()

<DatetimeArray>
['2021-10-01 00:00:00', '2021-10-02 00:00:00', '2021-10-03 00:00:00',
 '2021-10-04 00:00:00', '2021-10-05 00:00:00', '2021-10-06 00:00:00',
 '2021-10-07 00:00:00', '2021-10-08 00:00:00', '2021-10-09 00:00:00',
 '2021-10-10 00:00:00',
 ...
 '1939-03-26 00:00:00', '1945-04-05 00:00:00', '1958-03-05 00:00:00',
 '1958-04-11 00:00:00', '1946-04-08 00:00:00', '1951-01-06 00:00:00',
 '1953-01-20 00:00:00', '1955-01-23 00:00:00', '1958-04-05 00:00:00',
 '1960-02-06 00:00:00']
Length: 25086, dtype: datetime64[ns]

In [56]:
# Convert TimeframeStart to YYYY-MM-DD format.
outdf['in_TimeframeStart'] = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeStart'] = pd.to_datetime(outdf["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeStart'].unique()

<DatetimeArray>
['2021-10-01 00:00:00', '2021-10-02 00:00:00', '2021-10-03 00:00:00',
 '2021-10-04 00:00:00', '2021-10-05 00:00:00', '2021-10-06 00:00:00',
 '2021-10-07 00:00:00', '2021-10-08 00:00:00', '2021-10-09 00:00:00',
 '2021-10-10 00:00:00',
 ...
 '1939-03-26 00:00:00', '1945-04-05 00:00:00', '1958-03-05 00:00:00',
 '1958-04-11 00:00:00', '1946-04-08 00:00:00', '1951-01-06 00:00:00',
 '1953-01-20 00:00:00', '1955-01-23 00:00:00', '1958-04-05 00:00:00',
 '1960-02-06 00:00:00']
Length: 25086, dtype: datetime64[ns]

In [57]:
# extract year out
outdf['in_ReportYearCV'] = pd.to_datetime(outdf['in_ReportYearCV'], utc=True, errors = 'coerce').fillna("")
outdf['in_ReportYearCV'] = pd.to_datetime(outdf["in_ReportYearCV"].dt.strftime('%m/%d/%Y'))
outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].dt.year
outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].fillna(0).astype(int)
outdf['in_ReportYearCV'].unique()

array([2021, 2022, 2023, 2024, 2025, 1962, 1963, 1964, 1965, 1966, 1967,
       1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978,
       1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
       1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
       2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 1955, 1956,
       1957, 1958, 1959, 1960, 1961, 1945, 1946, 1947, 1948, 1949, 1950,
       1951, 1952, 1953, 1954, 1934, 1935, 1936, 1937, 1938, 1939, 1940,
       1941, 1942, 1943, 1944, 1925, 1926, 1927, 1928, 1929, 1930, 1931,
       1932, 1933, 1905, 1906, 1907, 1908, 1909, 1910, 1911, 1912, 1913,
       1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924,
       1904])

In [58]:
# # Assign Primary Use Category

# import sys
# sys.path.append("C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory")
# import AssignPrimaryUseCategoryFile # Use Custom import file

# outdf['in_PrimaryUseCategory'] = outdf.apply(lambda row: AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory(row['in_BeneficialUseCategory']), axis=1)
# outdf['in_PrimaryUseCategory'].unique()

In [59]:
# change in_TimeframeEnd to be months end if a monthly value
import datetime
import calendar

def MonthsEndFunc(date, timestep):
    timestep = str(timestep).strip()
    if timestep == "Monthly":
        last_day_of_month  = calendar.monthrange(date.year, date.month)[1]
        outString = date.replace(day=last_day_of_month)
    else:
        outString = date
    return outString

outdf['in_TimeframeEnd'] = outdf.apply(lambda row: MonthsEndFunc(row['in_TimeframeEnd'], row['in_AggregationIntervalUnitCV']), axis=1)
outdf['in_TimeframeEnd'].unique()

<DatetimeArray>
['2021-10-31 00:00:00', '2021-11-30 00:00:00', '2021-12-31 00:00:00',
 '2022-01-31 00:00:00', '2022-02-28 00:00:00', '2022-03-31 00:00:00',
 '2022-04-30 00:00:00', '2022-05-31 00:00:00', '2022-06-30 00:00:00',
 '2022-07-31 00:00:00',
 ...
 '1974-08-29 00:00:00', '1974-09-28 00:00:00', '1974-10-29 00:00:00',
 '1974-11-28 00:00:00', '1974-12-29 00:00:00', '1975-05-29 00:00:00',
 '1975-06-28 00:00:00', '1975-07-29 00:00:00', '1975-08-29 00:00:00',
 '1975-09-28 00:00:00']
Length: 13991, dtype: datetime64[ns]

In [60]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    inV = str(inV).strip()
    inAIU = str(inAIU).strip()
    inPU = str(inPU).strip().title()
    inWST = str(inWST).strip()
    outString = inV + "_" + inAIU + "_" + inPU + "_" + inWST
    return outString

outdf['in_VariableSpecificCV'] = outdf.apply(lambda row: createVariableSpecificCV(row['in_VariableCV'], 
                                                                                  row['in_AggregationIntervalUnitCV'],
                                                                                  row['in_PrimaryUseCategory'],
                                                                                  row['in_WaterSourceTypeCV']), axis=1)
outdf['in_VariableSpecificCV'].unique()

array(['Water Supply_Monthly_Storage Volume_Surface Water',
       'Water Supply_Monthly_Reservoir Elevation_Surface Water',
       'Water Supply_Monthly_Inflow_Surface Water',
       'Water Supply_Monthly_Outflow_Surface Water',
       'Water Supply_Daily_Mean Flow_Surface Water',
       'Water Supply_Daily_Monthly Forecasted Natural Flow_Surface Water',
       'Water Supply_Daily_Monthly Flow_Surface Water',
       'Water Supply_Monthly_Snow Depth_Surface Water'], dtype=object)

In [61]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_WaterSourceNativeID'], 
                                                                              row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wadeId1'], dtype=object)

In [62]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_Latitude'].astype(str) + dfTempID['in_Longitude'].astype(str) + dfTempID['in_SiteName'].astype(str)+ dfTempID['in_SiteTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_SiteNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip() + str(valC).strip() + str(valD).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_SiteNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_SiteNativeID'], 
                                                                       row['in_Latitude'], row['in_Longitude'],
                                                                       row['in_SiteName'], row['in_SiteTypeCV']), axis=1)
outdf['in_SiteNativeID'].unique()

array(['JNN', 'BRV', 'THD', 'TFR', 'TMT', 'LUS', 'SLF', 'CFW', 'ANT',
       'DAV', 'FRD', 'ORO', 'ONF', 'SNL', 'PYM', 'CAS', 'PRR', 'DLV',
       'TAB', 'SLW', 'SPM', 'GNT', 'CRW', 'PVR', 'TNM', 'HWE', 'FMT',
       'SVT', 'LOT', 'GBL', 'LVQ', 'BIO', 'USL', 'CHB', 'SPB', 'PAR',
       'CMN', 'CPL', 'SIV', 'JNK', 'RLC', 'EDN', 'ENR', 'HTH', 'RTD',
       'HMT', 'BQC', 'CGS', 'SGB', 'SLJ', 'NCA', 'KNT', 'APN', 'MCS',
       'MHW', 'DMV', 'SKN', 'MDO', 'ATN', 'NCM', 'HNN', 'CUY', 'JCK',
       'BWN', 'FRL', 'RLL', 'SFL', 'BRD', 'SVO', 'MCO', 'IRC', 'BIT',
       'PT7', 'PT6', 'MMW', 'ALM', 'BTV', 'BCL', 'BWS', 'LVY', 'LWB',
       'SLS', 'RLF', 'SWB', 'LYS', 'CNV', 'CTG', 'WSN', 'FMD', 'HHL',
       'VIL', 'LON', 'UNV', 'ICH', 'SLB', 'STD', 'HDG', 'MMR', 'ELC',
       'MRR', 'MOR', 'BRT', 'SAT', 'CRY', 'SNN', 'CVE', 'WHR', 'LNG',
       'LRA', 'CYC', 'SGC', 'LVD', 'SW3', 'LGV', 'SLC', 'MPL', 'RDN',
       'TAE', 'FLR', 'SDB', 'GLK', 'TLC', 'PRU', 'WRS', 'SLN', 'BLB',
       'ENG', 'NHG',

## Export Outputs

In [63]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3664710 entries, 0 to 3664709
Data columns (total 50 columns):
 #   Column                            Dtype         
---  ------                            -----         
 0   WaDEUUID                          object        
 1   in_MethodUUID                     object        
 2   in_AggregationIntervalUnitCV      object        
 3   in_AmountUnitCV                   object        
 4   in_VariableCV                     object        
 5   in_OrganizationUUID               object        
 6   in_Geometry                       object        
 7   in_GNISFeatureNameCV              object        
 8   in_WaterQualityIndicatorCV        object        
 9   in_WaterSourceName                object        
 10  in_WaterSourceNativeID            object        
 11  in_WaterSourceTypeCV              object        
 12  in_CoordinateAccuracy             object        
 13  in_CoordinateMethodCV             object        
 14  in_County         

In [64]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_AggregationIntervalUnitCV,in_AmountUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_Amount,in_AllocationCropDutyAmount,in_AssociatedNativeAllocationIDs,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart,in_VariableSpecificCV
0,in10,CAwsss_M1,Monthly,AF,Water Supply,CAwsss_O1,,,,Wade Blank,wadeId1,Surface Water,,,San Diego,4326,,,,32.85400,-116.89200,,,,Lake Jennings,JNN,,Reservoir,CA,,,,,Storage Volume,,,,,,,,,,,Storage Volume,2021,,2021-10-31,2021-10-01,Water Supply_Monthly_Storage Volume_Surface Water
1,in10,CAwsss_M1,Monthly,AF,Water Supply,CAwsss_O1,,,,Wade Blank,wadeId1,Surface Water,,,San Diego,4326,,,,32.85400,-116.89200,,,,Lake Jennings,JNN,,Reservoir,CA,,,,,Storage Volume,,,,,,,,,,,Storage Volume,2021,,2021-10-31,2021-10-02,Water Supply_Monthly_Storage Volume_Surface Water
2,in10,CAwsss_M1,Monthly,AF,Water Supply,CAwsss_O1,,,,Wade Blank,wadeId1,Surface Water,,,San Diego,4326,,,,32.85400,-116.89200,,,,Lake Jennings,JNN,,Reservoir,CA,,,,,Storage Volume,,,,,,,,,,,Storage Volume,2021,,2021-10-31,2021-10-03,Water Supply_Monthly_Storage Volume_Surface Water
3,in10,CAwsss_M1,Monthly,AF,Water Supply,CAwsss_O1,,,,Wade Blank,wadeId1,Surface Water,,,San Diego,4326,,,,32.85400,-116.89200,,,,Lake Jennings,JNN,,Reservoir,CA,,,,,Storage Volume,,,,,,,,,,,Storage Volume,2021,,2021-10-31,2021-10-04,Water Supply_Monthly_Storage Volume_Surface Water
4,in10,CAwsss_M1,Monthly,AF,Water Supply,CAwsss_O1,,,,Wade Blank,wadeId1,Surface Water,,,San Diego,4326,,,,32.85400,-116.89200,,,,Lake Jennings,JNN,,Reservoir,CA,,,,,Storage Volume,,,,,,,,,,,Storage Volume,2021,,2021-10-31,2021-10-05,Water Supply_Monthly_Storage Volume_Surface Water
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3664705,in3164,CAwsss_M1,Monthly,INCHES,Water Supply,CAwsss_O1,,,,Wade Blank,wadeId1,Surface Water,,,,4326,,,,37.03103,-118.91478,,,,West Woodchuck Meadow,WWC,,Snow Depth,CA,,18.00000,,,Snow Depth,,,,,,,,,,,Snow Depth,2024,,2024-12-31,2024-12-28,Water Supply_Monthly_Snow Depth_Surface Water
3664706,in3164,CAwsss_M1,Monthly,INCHES,Water Supply,CAwsss_O1,,,,Wade Blank,wadeId1,Surface Water,,,,4326,,,,37.03103,-118.91478,,,,West Woodchuck Meadow,WWC,,Snow Depth,CA,,17.00000,,,Snow Depth,,,,,,,,,,,Snow Depth,2024,,2024-12-31,2024-12-29,Water Supply_Monthly_Snow Depth_Surface Water
3664707,in3164,CAwsss_M1,Monthly,INCHES,Water Supply,CAwsss_O1,,,,Wade Blank,wadeId1,Surface Water,,,,4326,,,,37.03103,-118.91478,,,,West Woodchuck Meadow,WWC,,Snow Depth,CA,,15.00000,,,Snow Depth,,,,,,,,,,,Snow Depth,2024,,2024-12-31,2024-12-30,Water Supply_Monthly_Snow Depth_Surface Water
3664708,in3164,CAwsss_M1,Monthly,INCHES,Water Supply,CAwsss_O1,,,,Wade Blank,wadeId1,Surface Water,,,,4326,,,,37.03103,-118.91478,,,,West Woodchuck Meadow,WWC,,Snow Depth,CA,,16.00000,,,Snow Depth,,,,,,,,,,,Snow Depth,2024,,2024-12-31,2024-12-31,Water Supply_Monthly_Snow Depth_Surface Water


In [65]:
# Export the output dataframe
outdf.to_csv('RawInputData/Pwsss_caMain.zip', compression=dict(method='zip', archive_name='Pwsss_caMain.csv'), index=False)  # The output, save as a zip