# Preprocessing "XX" Reservoir and Observation Site Time Series data for WaDE

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse
from io import StringIO  # in-memory text buffer (file-like input for pd.read_html)
from bs4 import BeautifulSoup # text parser

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
# Notebook display settings: show up to 999 columns of a DataFrame
pd.set_option("display.max_columns", 999)
# Render floats with five fixed decimals instead of scientific notation
pd.set_option("display.float_format", "{:.5f}".format)

In [2]:
# Working Directory
# Every relative path used below ("RawInputData/...") is resolved against this folder,
# because the process working directory is switched to it here.
workingDir = "G:/Shared drives/WaDE Data/California/SS_ReservoirsObservationSites"  # change here
os.chdir(workingDir)

## Input Files

In [3]:
# Input File: Reservoirs
# Read the raw reservoir table; missing values are turned into empty strings.
fileInput = "RawInputData/Reservoirs.zip"
dfin1 = pd.read_csv(fileInput)
dfin1 = dfin1.replace(np.nan, "")

# WaDE UUID tracker for data assessment
# When the column is absent, tag every row with "in1<row index>" and write the
# tagged table back into the zip so the identifiers are kept with the raw data.
if not ('WaDEUUID' in dfin1):
    dfin1['WaDEUUID'] = "in1" + dfin1.index.astype(str)
    zipOptions = dict(method='zip', archive_name='Reservoirs.csv')
    dfin1.to_csv('RawInputData/Reservoirs.zip', compression=zipOptions, index=False)

print(len(dfin1))
dfin1.head()

180


Unnamed: 0,Station,ID,Elev,Latitude,Longitude,County,Operating Agency,WaDEUUID
0,LAKE JENNINGS,JNN,707,32.854,-116.892,SAN DIEGO,None Specified,in10
1,BEAR VALLEY DAM,BRV,6743,34.242,-116.978,SAN BERNARDINO,Big Bear Municipal Water District,in11
2,THERMALITO DIVERS POOL,THD,230,39.528,-121.543,BUTTE,CA Dept of Water Resources,in12
3,THERMALITO FOREBAY,TFR,230,39.519,-121.629,BUTTE,CA Dept of Water Resources,in13
4,THERMALITO TOTAL,TMT,140,39.458,-121.638,BUTTE,CA Dept of Water Resources,in14


In [4]:
# Input File: StreamGages shp file
# Read the stream gage shapefile; missing values are turned into empty strings.
fileInput = "RawInputData/shapefiles/StreamGages.zip"
dfin2 = gpd.read_file(fileInput)
dfin2 = dfin2.replace(np.nan, "")

# WaDE UUID tracker for data assessment
# When the column is absent, tag every row with "in2<row index>" and save a CSV copy.
# The copy goes to RawInputData/StreamGages.zip, not back into the shapefile that was read.
if not ('WaDEUUID' in dfin2):
    dfin2['WaDEUUID'] = "in2" + dfin2.index.astype(str)
    zipOptions = dict(method='zip', archive_name='StreamGages.csv')
    dfin2.to_csv('RawInputData/StreamGages.zip', compression=zipOptions, index=False)

print(len(dfin2))
dfin2.head()

2597


Unnamed: 0,siteid,sitename,gage_statu,operator,datasource,sitestatus,stage_yn,stage_por,stage_stat,stage_real,flow_yn,flow_por,flow_statu,flow_realt,watqual_yn,watqual_po,watqual_st,watqual_re,temp_yn,temp_por,temp_statu,temp_realt,strmorder,ucdstrmcla,streamtype,totdasqkm,totdasqmi,weblink,gnisid_med,rchcd_medr,comid_medr,wtrshdnm_h,huc8,wtrshdnm_1,huc10,wtrshdnm_2,huc12,gagegap_st,reactivate,gage_histo,addflow_2s,addflow_2w,addtelemet,addtemp_2f,infrastruc,waterbody,tier,primary_be,sb19_actio,cnrfc,reference_,refpotenti,ecosysmgmt,wtrsupply,wtrquality,pubsafety,wade_Latit,wade_Longi,geometry,WaDEUUID
0,ACZ,ALHAMBRA CREEK AT D STREET,Active-Limited Use,OTHER,CDEC,Active,Y,1454,Active,Y,N,0,,N,N,0,,N,N,0,,N,2,Rain and seasonal groundwater (RGW),Stream/River - Intermittent,42.8499,16.54443,http://cdec.water.ca.gov/cgi-progs/staMeta?sta...,,18050001006347,948050078,Suisun Bay,18050001,Mount Diablo Creek-Frontal Suisun Bay Estuaries,1805000103,Arroyo del Hambre-Frontal Suisun Bay Estuaries,180500010303,AWG,N,0,Y,N,N,Y,,,3,ecosystem,Upgrade,,,,B,,,,38.00331,-122.12981,POINT Z (-122.12981 38.00331 0.00000),in20
1,AMC,ARCADE CREEK AT WINDING WAY,Active-Limited Use,OTHER,CDEC,Active,Y,9223,Active,Y,N,0,,N,N,0,,N,N,0,,N,3,Winter storms (WS),Stream/River - Perennial,76.1904,29.41727,http://cdec.water.ca.gov/cgi-progs/staMeta?sta...,218341.0,18020111000048,15022615,Lower American,18020111,Steelhead Creek,1802011103,Arcade Creek,180201110302,AWG,N,0,Y,N,N,Y,,,0,,None - Needs Water Temperature,,,,,,,,38.64545,-121.34741,POINT Z (-121.34741 38.64545 0.00000),in21
2,ANH,SAN JOAQUIN RIVER AT ANTIOCH,Active-Limited Use,DWR,CDEC,Active,Y,13514,Active,Y,N,0,,N,Y,8752,Active,Y,Y,9496,Active,Y,7,Groundwater (GW),Artificial Path,107113.5387,41356.75152,https://waterdata.usgs.gov/nwis/inventory/?sit...,273488.0,18040003000002,1889652,San Joaquin Delta,18040003,Middle River-San Joaquin River,1804000309,Markley Canyon-San Joaquin River,180400030907,AWG,N,0,Y,Y,N,N,,,3,multi-benefit,Upgrade,Model,,,B,,B,Y,38.01784,-121.80298,POINT Z (-121.80298 38.01784 0.00000),in22
3,ARD,ARCADE CREEK AT SUNRISE BLVD,Active-Limited Use,OTHER,CDEC,Active,Y,8739,Active,Y,N,0,,N,N,0,,N,N,0,,N,2,Winter storms (WS),Stream/River - Perennial,13.2111,5.10083,http://cdec.water.ca.gov/cgi-progs/staMeta?sta...,218341.0,18020111000058,15022465,Lower American,18020111,Steelhead Creek,1802011103,Arcade Creek,180201110302,AWG,N,0,Y,N,N,Y,,,0,,None - Needs Water Temperature,,,,,,,,38.68436,-121.2715,POINT Z (-121.27150 38.68436 0.00000),in23
4,ARW,ARDEN WAY,Active-Limited Use,OTHER,CDEC,Active,Y,9223,Active,Y,N,0,,N,N,0,,N,N,0,,N,1,,Stream/River - Perennial,14.0544,5.42643,http://cdec.water.ca.gov/cgi-progs/staMeta?sta...,233663.0,18020111000066,15022723,Lower American,18020111,American River,1802011102,Lower American River,180201110202,AWG,N,0,Y,N,N,Y,,,3,multi-benefit,Upgrade,,,,B,B,B,,38.59601,-121.41314,POINT Z (-121.41314 38.59601 0.00000),in24


## Get Metadata
- https://cdec.water.ca.gov/dynamicapp/staMeta

In [5]:
%%time

# get Reservoirs metadata
# For every reservoir station ID, request its CDEC station-metadata page and keep the
# second HTML table found on the page, tagged with the station ID in an "ID" column.
# Stations whose page cannot be fetched or parsed are reported and skipped (best effort).
tempList = dfin1['ID'].tolist()
metadataFrames = []  # one DataFrame per successfully scraped station

for stationId in tempList:
    idString = str(stationId).strip()
    url = "https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=" + idString
    try:
        # timeout so a single stalled connection cannot hang the whole loop
        response = requests.get(url, timeout=60)
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find_all('table')
        # read_html is given a file-like object; passing literal HTML text is deprecated
        rawData = pd.read_html(StringIO(str(table)))[1]
        rawData["ID"] = idString
        metadataFrames.append(rawData)
    except Exception as err:
        # Exception (not a bare except) so that Ctrl-C / kernel interrupt still stops the loop
        print(f' did not work, {url} ({type(err).__name__}: {err})')

# Concatenate once at the end instead of re-copying the growing frame on every iteration.
dftemp = pd.concat(metadataFrames) if metadataFrames else pd.DataFrame()

dftemp.to_csv('RawInputData/Reservoirs_Metadata.zip', compression=dict(method='zip', archive_name='Reservoirs_Metadata.csv'), index=False)

print(len(dftemp))
dftemp.head()



 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=CHY




 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=LSB




 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=PLL




 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=DNL




 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=MNC




 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=DWN




 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=TAH




1280
CPU times: total: 1min 7s
Wall time: 2min 35s




Unnamed: 0,0,1,2,3,4,5,ID
0,"RESERVOIR STORAGE, AF",15,(daily),(STORAGE),MANUAL ENTRY,10/01/2021 to present,JNN
1,"RESERVOIR STORAGE, AF",15,(monthly),(STORAGE),MANUAL ENTRY,10/01/1962 to present,JNN
0,"RESERVOIR STORAGE, AF",15,(daily),(STORAGE),MANUAL ENTRY,10/01/2020 to present,BRV
1,"RESERVOIR STORAGE, AF",15,(monthly),(STORAGE),MANUAL ENTRY,10/01/1955 to present,BRV
0,"RESERVOIR STORAGE, AF",15,(daily),(STORAGE),DATA XCHG-DWR O&M,12/26/2019 to present,THD


In [6]:
# left-join by "ID" to reservoir data
# Every reservoir row is kept; a reservoir with several metadata rows is repeated once per row.
dfin1 = dfin1.merge(dftemp, on='ID', how='left')
print(len(dfin1))
dfin1.head(1)

1287


Unnamed: 0,Station,ID,Elev,Latitude,Longitude,County,Operating Agency,WaDEUUID,0,1,2,3,4,5
0,LAKE JENNINGS,JNN,707,32.854,-116.892,SAN DIEGO,None Specified,in10,"RESERVOIR STORAGE, AF",15,(daily),(STORAGE),MANUAL ENTRY,10/01/2021 to present


In [7]:
%%time

# get StreamGages metadata
# For every distinct stream gage site ID, request its CDEC station-metadata page and keep the
# second HTML table found on the page, tagged with the site ID in a "siteid" column.
# Sites whose page cannot be fetched or parsed are reported and skipped (best effort).
tempList = dfin2['siteid'].unique().tolist()
metadataFrames = []  # one DataFrame per successfully scraped site

for stationId in tempList:
    idString = str(stationId).strip()
    url = "https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=" + idString
    try:
        # timeout so a single stalled connection cannot hang the whole loop
        response = requests.get(url, timeout=60)
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find_all('table')
        # read_html is given a file-like object; passing literal HTML text is deprecated
        rawData = pd.read_html(StringIO(str(table)))[1]
        rawData["siteid"] = idString
        metadataFrames.append(rawData)
    except Exception as err:
        # Exception (not a bare except) so that Ctrl-C / kernel interrupt still stops the loop
        print(f' did not work, {url} ({type(err).__name__}: {err})')

# Concatenate once at the end instead of re-copying the growing frame on every iteration.
dftemp = pd.concat(metadataFrames) if metadataFrames else pd.DataFrame()

dftemp.to_csv('RawInputData/StreamGages_Metadata.zip', compression=dict(method='zip', archive_name='StreamGages_Metadata.csv'), index=False)

print(len(dftemp))
dftemp.head()



 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=LRS




 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=MJB




 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=09429490
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=103087865
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=103087889
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=103087891
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11049250
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11159500
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11250110
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11252975
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11293372
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11293462
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11295340
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11295910
 did not work, https://cdec.water.ca.



 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10254050
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10254730
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10254970
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10255550
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10255810
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10255895
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10255897
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10256500
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10257500
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10257548
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10257549
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10257600
 did not work, https://cdec.water.ca.gov



 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11042900
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11043000
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11044000
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11044250
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11044300
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11044350
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11044800
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11045300
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11045600
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11045700
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11046000
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11046050
 did not work, https://cdec.water.ca.gov



 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11150500
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11151300
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11151700
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11151870
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11152000
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11152050
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11152300
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11152500
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11152650
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11153000
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11153650
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11156500
 did not work, https://cdec.water.ca.gov



 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=VNO




 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=09429160
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=09429170
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=09429190
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=09527000
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=09527500
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10256000
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11049500
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11372350




 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=BNC




 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=CJC




 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=SJW
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=SKF




 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10264510




 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=WCM




 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10250800
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10254670
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10256400
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10260470
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10260480
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10260865
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10263630
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10270940
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10296700
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10296750
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=103366097
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=10336775
 did not work, https://cdec.water.ca.go



 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=9524700
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=9526200
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11136040
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=9527590
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11525630




 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11253130
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11336680
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11455167
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11455350
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11459150
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11475610
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=11526500
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=381148122024801
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=9429000
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=9523000
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=9523200
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=9523400
 did not work, https://cdec.water.ca.



 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=A00268
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=A00647
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=A00910
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=A00928
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=A02380
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=A02926
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=A02933
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=A02963
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=A02965
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=A02971
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=A02980
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=A04120
 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?stat



 did not work, https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=MNM
2850
CPU times: total: 15min 41s
Wall time: 41min 55s




Unnamed: 0,0,1,2,3,4,5,siteid,Zero Datum,Adj To NGVD,Peak of Record,Monitor Stage,Flood Stage,Guidance Plots,Danger Stage,Top of Levee
0,"RIVER STAGE, FEET",1.0,(event),(RIV STG),DATA XCHG-CCC,01/07/2017 to present,ACZ,,,,,,,,
0,"PRECIPITATION, TIPPING BUCKET, INCHES",16.0,(event),(RAINTIP),DATA XCHG-NWS,10/01/1995 to present,AMC,,,,,,,,
1,"RIVER STAGE, FEET",1.0,(event),(RIV STG),DATA XCHG-NWS,10/01/1995 to present,AMC,,,,,,,,
0,"ELECTRICAL CONDUCTIVTY MICRO S, uS/cm",100.0,(daily),(EL COND),COMPUTED,01/18/2008 to present,ANH,,,,,,,,
1,"TEMPERATURE, WATER, DEG F",25.0,(daily),(TEMP W),COMPUTED,01/01/2002 to present,ANH,,,,,,,,


In [8]:
# left-join by "siteid" to StreamGages data
# Every gage row is kept; a gage with several metadata rows is repeated once per row.
dfin2 = dfin2.merge(dftemp, on='siteid', how='left')
print(len(dfin2))
dfin2.head(1)

4960


Unnamed: 0,siteid,sitename,gage_statu,operator,datasource,sitestatus,stage_yn,stage_por,stage_stat,stage_real,flow_yn,flow_por,flow_statu,flow_realt,watqual_yn,watqual_po,watqual_st,watqual_re,temp_yn,temp_por,temp_statu,temp_realt,strmorder,ucdstrmcla,streamtype,totdasqkm,totdasqmi,weblink,gnisid_med,rchcd_medr,comid_medr,wtrshdnm_h,huc8,wtrshdnm_1,huc10,wtrshdnm_2,huc12,gagegap_st,reactivate,gage_histo,addflow_2s,addflow_2w,addtelemet,addtemp_2f,infrastruc,waterbody,tier,primary_be,sb19_actio,cnrfc,reference_,refpotenti,ecosysmgmt,wtrsupply,wtrquality,pubsafety,wade_Latit,wade_Longi,geometry,WaDEUUID,0,1,2,3,4,5,Zero Datum,Adj To NGVD,Peak of Record,Monitor Stage,Flood Stage,Guidance Plots,Danger Stage,Top of Levee
0,ACZ,ALHAMBRA CREEK AT D STREET,Active-Limited Use,OTHER,CDEC,Active,Y,1454,Active,Y,N,0,,N,N,0,,N,N,0,,N,2,Rain and seasonal groundwater (RGW),Stream/River - Intermittent,42.8499,16.54443,http://cdec.water.ca.gov/cgi-progs/staMeta?sta...,,18050001006347,948050078,Suisun Bay,18050001,Mount Diablo Creek-Frontal Suisun Bay Estuaries,1805000103,Arroyo del Hambre-Frontal Suisun Bay Estuaries,180500010303,AWG,N,0,Y,N,N,Y,,,3,ecosystem,Upgrade,,,,B,,,,38.00331,-122.12981,POINT Z (-122.12981 38.00331 0.00000),in20,"RIVER STAGE, FEET",1.0,(event),(RIV STG),DATA XCHG-CCC,01/07/2017 to present,,,,,,,,


In [None]:
# create output POD dataframe
# Template cell: one WaDE input column per line, filled from dfin1 (reservoirs).
# Columns left as "" are placeholders to be mapped to source fields.
# NOTE(review): only dfin1 is mapped here; dfin2 (stream gages) is not used yet - confirm intent.
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin1['WaDEUUID']

# Method Info
df['in_MethodUUID'] = ""

# Variable Info
df['in_AggregationIntervalUnitCV'] = ""
df['in_VariableCV'] = ""

# Organization Info
df['in_OrganizationUUID'] = ""

# WaterSource Info
# in_Geometry is a single shared column: it is assigned once, here. Re-assigning it
# in the sections below would overwrite whatever value is filled in on this line.
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = "" # need this for auto fill below
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = "" # need this for auto fill below

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = ""
df['in_Longitude'] = ""
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = ""
df['in_SiteName'] = ""
df['in_SiteNativeID'] = ""
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = ""
df['in_StateCV'] = ""
df['in_USGSSiteID'] = ""

# Site VariableAmounts Info
df['in_Amount'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AssociatedNativeAllocationIDs'] = ""
df['in_BeneficialUseCategory'] = ""
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerGeneratedGWh'] = ""
df['in_PowerType'] = ""
df['in_PrimaryUseCategory'] = ""
df['in_ReportYearCV'] =  ""
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = ""
df['in_TimeframeStart'] = ""

# Keep an independent, de-duplicated copy with a fresh 0..N-1 index.
outdf1 = df.copy()
outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

In [None]:
# Concatenate dataframes
# Stack every per-source output frame, drop exact duplicate rows, renumber the index,
# and turn any NaN into an empty string.
frames = [outdf1]
outdf = pd.concat(frames).drop_duplicates()
outdf = outdf.reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

## Clean Data / data types

In [None]:
# Clean name entries of special characters
def removeSpecialCharsFunc(Val):
    """Return Val as a cleaned, title-cased name string.

    Val is converted with str(); the characters $ @ & . ; / ) ( - are removed; the text is
    title-cased; every run of whitespace is collapsed to a single space; and leading
    whitespace plus trailing whitespace/commas are stripped.
    """
    Val = str(Val)
    # raw string: "\)" in a normal string literal is an invalid escape sequence
    Val = re.sub(r"[$@&.;/\)(-]", "", Val).title()
    # split/join collapses runs of any length (removing characters can leave 3+ spaces)
    Val = " ".join(Val.split())
    # strip trailing commas and the spaces around them together, e.g. "Name ," -> "Name"
    return Val.rstrip(", ")

In [None]:
# Clean water source names; element-wise map on the column avoids building a Series per row
# and also works when outdf has no rows.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(removeSpecialCharsFunc)
outdf['in_WaterSourceName'].unique()

In [None]:
# Clean site names; element-wise map on the column avoids building a Series per row
# and also works when outdf has no rows.
outdf['in_SiteName'] = outdf['in_SiteName'].map(removeSpecialCharsFunc)
outdf['in_SiteName'].unique()

In [None]:
# Clean county names; element-wise map on the column avoids building a Series per row
# and also works when outdf has no rows.
outdf['in_County'] = outdf['in_County'].map(removeSpecialCharsFunc)
outdf['in_County'].unique()

In [None]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return val as a stripped string, mapping missing values to "".

    Missing scalars (None, NaN, NaT, pd.NA) and the text "nan" in any letter case
    (e.g. the "Nan" left behind by title-casing) give "". Anything else is returned
    as str(val) with surrounding whitespace removed.
    """
    # The null check must run on the raw value: after str() nothing is null any more.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return ""
    text = str(val).strip()
    if text == "" or text.lower() == "nan":
        return ""
    return text

In [None]:
# Normalize blank / "nan" water source names to ""; column-wise map also works on an empty outdf.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(ensureEmptyString)
outdf['in_WaterSourceName'].unique()

In [None]:
# Normalize blank / "nan" water source types to ""; column-wise map also works on an empty outdf.
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].map(ensureEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# Normalize blank / "nan" site types to ""; column-wise map also works on an empty outdf.
outdf['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].map(ensureEmptyString)
outdf['in_SiteTypeCV'].unique()

In [None]:
# Normalize blank / "nan" site names to ""; column-wise map also works on an empty outdf.
outdf['in_SiteName'] = outdf['in_SiteName'].map(ensureEmptyString)
outdf['in_SiteName'].unique()

In [None]:
# Normalize blank / "nan" county names to ""; column-wise map also works on an empty outdf.
outdf['in_County'] = outdf['in_County'].map(ensureEmptyString)
outdf['in_County'].unique()

In [None]:
# Normalize blank / "nan" beneficial use entries to ""; column-wise map also works on an empty outdf.
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(ensureEmptyString)

# Entries may hold several comma-separated uses: list each distinct use once, sorted.
uniqueList = sorted({use.strip() for use in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')})
uniqueList

In [None]:
# Ensure Latitude entry is numeric, replace '0' values for removal
# Unparseable entries become NaN; both 0 and NaN end up as "".
outdf['in_Latitude'] = (
    pd.to_numeric(outdf['in_Latitude'], errors='coerce')
    .replace(0, "")
    .fillna("")
)
outdf['in_Latitude'].unique()

In [None]:
# Ensure Longitude entry is numeric, replace '0' values for removal
# Unparseable entries become NaN; both 0 and NaN end up as "".
outdf['in_Longitude'] = (
    pd.to_numeric(outdf['in_Longitude'], errors='coerce')
    .replace(0, "")
    .fillna("")
)
outdf['in_Longitude'].unique()

In [None]:
# Ensure Amount entry is either numeric or blank, no 0 entries
# Values are rounded to two decimals; unparseable entries and zeros end up as "".
outdf['in_Amount'] = (
    pd.to_numeric(outdf['in_Amount'], errors='coerce')
    .round(2)
    .replace(0, "")
    .fillna("")
)
outdf['in_Amount'].unique()

In [None]:
# Ensure PopulationServed entry is a whole number, or blank when missing / zero
# NOTE(review): the earlier header said zeros are kept ("WITH 0 entries"), but the chain
# has always ended by turning 0 into "" - confirm which behavior is wanted.
outdf['in_PopulationServed'] = (
    pd.to_numeric(outdf['in_PopulationServed'], errors='coerce')
    .round()
    .fillna(0)       # missing -> 0 so the integer cast below cannot fail
    .astype(int)
    .replace(0, "")  # zeros (including the filled-in ones) -> blank
)
outdf['in_PopulationServed'].unique()

In [None]:
# Convert TimeframeEnd to a timezone-naive, date-only (midnight) timestamp.
# Values are parsed as UTC; unparseable or blank entries become NaT.
# The column is kept as datetime throughout: filling NaT with "" would turn it into an
# object column, on which the .dt accessor raises.
outdf['in_TimeframeEnd'] = (
    pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors='coerce')
    .dt.tz_localize(None)  # drop the UTC marker, keep the UTC wall time
    .dt.normalize()        # discard the time of day
)
outdf['in_TimeframeEnd'].unique()

In [None]:
# Convert TimeframeStart to a timezone-naive, date-only (midnight) timestamp.
# Values are parsed as UTC; unparseable or blank entries become NaT.
# The column is kept as datetime throughout: filling NaT with "" would turn it into an
# object column, on which the .dt accessor raises.
outdf['in_TimeframeStart'] = (
    pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors='coerce')
    .dt.tz_localize(None)  # drop the UTC marker, keep the UTC wall time
    .dt.normalize()        # discard the time of day
)
outdf['in_TimeframeStart'].unique()

In [None]:
# extract year out
# The conversion below is disabled; as written this cell only lists the distinct
# in_ReportYearCV values currently in outdf. Un-comment the four lines to derive the
# year from a date-like value instead.
# outdf['in_ReportYearCV'] = pd.to_datetime(outdf['in_ReportYearCV'], utc=True, errors = 'coerce').fillna("")
# outdf['in_ReportYearCV'] = pd.to_datetime(outdf["in_ReportYearCV"].dt.strftime('%m/%d/%Y'))
# outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].dt.year
# outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].fillna(0).astype(int)
outdf['in_ReportYearCV'].unique()

In [None]:
# Assign Primary Use Category
# The helper module lives outside this notebook's folder, so its directory has to be on
# sys.path before the import. NOTE(review): machine-specific absolute path - change here.

import sys
customFunctionDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory"
if customFunctionDir not in sys.path:  # re-running the cell must not append duplicates
    sys.path.append(customFunctionDir)
import AssignPrimaryUseCategoryFile # Use Custom import file

# Element-wise map on the column: no per-row Series, and it also works when outdf has no rows.
outdf['in_PrimaryUseCategory'] = outdf['in_BeneficialUseCategory'].map(AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory)
outdf['in_PrimaryUseCategory'].unique()

In [None]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    """Build "<inV>_<inAIU>_<inPU>_<inWST>" from the four inputs.

    Each input is converted with str() and stripped of surrounding whitespace;
    inPU is additionally title-cased.
    """
    pieces = (
        str(inV).strip(),
        str(inAIU).strip(),
        str(inPU).strip().title(),
        str(inWST).strip(),
    )
    return "_".join(pieces)

# Build the VariableSpecificCV value for every row from its four component columns.
# Iterating the columns in parallel avoids a per-row Series and also works when outdf has no rows.
outdf['in_VariableSpecificCV'] = [
    createVariableSpecificCV(variableCV, intervalUnit, primaryUse, sourceType)
    for variableCV, intervalUnit, primaryUse, sourceType in zip(
        outdf['in_VariableCV'],
        outdf['in_AggregationIntervalUnitCV'],
        outdf['in_PrimaryUseCategory'],
        outdf['in_WaterSourceTypeCV'],
    )
]
outdf['in_VariableSpecificCV'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

def assignIdValueFunc(colRowValue):
    """Return the custom identifier "wadeId" followed by str(colRowValue)."""
    return "wadeId" + str(colRowValue)

# Create temp in_WaterSourceNativeID dataframe of unique water sources.
dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# Number the unique (name, type) combinations 1..N in order of first appearance.
dfTempID['in_WaterSourceNativeID'] = [assignIdValueFunc(count) for count in range(1, len(dfTempID) + 1)]

# Lookup keyed by the (name, type) tuple. Gluing the two strings together would let
# different pairs share a key, e.g. ("AB", "C") and ("A", "BC") both give "ABC".
IdDict = dict(zip(
    zip(dfTempID['in_WaterSourceName'], dfTempID['in_WaterSourceTypeCV']),
    dfTempID['in_WaterSourceNativeID'],
))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    """Return checkVal when it is non-blank, otherwise the custom ID for (valA, valB).

    checkVal: existing native ID (any type; compared after str() and strip()).
    valA, valB: water source name and water source type used to look up IdDict.
    Raises KeyError if the stripped (valA, valB) pair is not a key of IdDict.
    """
    checkVal = str(checkVal).strip()
    if checkVal != "":
        return checkVal
    return IdDict[(str(valA).strip(), str(valB).strip())]

# Iterating the columns in parallel avoids a per-row Series and also works when outdf has no rows.
outdf['in_WaterSourceNativeID'] = [
    retrieveIdValueFunc(nativeId, sourceName, sourceType)
    for nativeId, sourceName, sourceType in zip(
        outdf['in_WaterSourceNativeID'], outdf['in_WaterSourceName'], outdf['in_WaterSourceTypeCV']
    )
]
outdf['in_WaterSourceNativeID'].unique()

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

def assignIdValueFunc(colRowValue):
    """Return the custom identifier "wadeId" followed by str(colRowValue)."""
    return "wadeId" + str(colRowValue)

# Create temp in_SiteNativeID dataframe of unique sites.
dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# Number the unique (lat, lon, name, type) combinations 1..N in order of first appearance.
dfTempID['in_SiteNativeID'] = [assignIdValueFunc(count) for count in range(1, len(dfTempID) + 1)]

# Lookup keyed by the 4-tuple. Gluing the strings together would let different sites
# share a key, e.g. lat "1.5" + lon "23.0" and lat "1.52" + lon "3.0" both give "1.523.0".
IdDict = dict(zip(
    zip(dfTempID['in_Latitude'], dfTempID['in_Longitude'], dfTempID['in_SiteName'], dfTempID['in_SiteTypeCV']),
    dfTempID['in_SiteNativeID'],
))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    """Return checkVal when it is non-blank, otherwise the custom ID for the site.

    checkVal: existing native ID (any type; compared after str() and strip()).
    valA, valB, valC, valD: latitude, longitude, site name and site type used to look up IdDict.
    Raises KeyError if the stripped 4-tuple is not a key of IdDict.
    """
    checkVal = str(checkVal).strip()
    if checkVal != "":
        return checkVal
    return IdDict[(str(valA).strip(), str(valB).strip(), str(valC).strip(), str(valD).strip())]

# Iterating the columns in parallel avoids a per-row Series and also works when outdf has no rows.
outdf['in_SiteNativeID'] = [
    retrieveIdValueFunc(nativeId, latitude, longitude, siteName, siteType)
    for nativeId, latitude, longitude, siteName, siteType in zip(
        outdf['in_SiteNativeID'], outdf['in_Latitude'], outdf['in_Longitude'],
        outdf['in_SiteName'], outdf['in_SiteTypeCV']
    )
]
outdf['in_SiteNativeID'].unique()

## Export Outputs

In [None]:
# Final check before export: column names, non-null counts and dtypes of the output frame.
outdf.info()

In [None]:
# Display the output frame for a last visual inspection before it is written out.
outdf

In [None]:
# Export the output dataframe
# Written as a single CSV inside a zip archive, without the index column.
zipOptions = dict(method='zip', archive_name='Pssro_xxMain.csv')
outdf.to_csv(
    'RawInputData/Pssro_xxMain.zip',
    compression=zipOptions,
    index=False,
)  # The output, save as a zip