# Preprocessing Water Supply Site Time Series data for WaDE

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse
from bs4 import BeautifulSoup # text parser

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# Working Directory
workingDir = "G:/Shared drives/WaDE Data/WaDE Data Folder/California/WaterSupply_SiteSpecific"  # change here
os.chdir(workingDir)

## Input Files
- site info for reservoirs
- site info for streamgages

In [3]:
# Input File: Reservoirs
# Read the reservoir site list and blank out missing values.
fileInput = "RawInputData/Reservoirs.zip"
dfr = pd.read_csv(fileInput)
dfr = dfr.replace(np.nan, "")

# WaDE UUID tracker for data assessment
# Only created (and written back to the input zip) when the column is not already present.
if 'WaDEUUID' not in dfr.columns:
    dfr['WaDEUUID'] = "in1" + dfr.index.astype(str)
    zipSettings = dict(method='zip', archive_name='Reservoirs.csv')
    dfr.to_csv('RawInputData/Reservoirs.zip', compression=zipSettings, index=False)

print(len(dfr))
dfr.head(1)

180


Unnamed: 0,Station,ID,Elev,Latitude,Longitude,County,Operating Agency,WaDEUUID
0,LAKE JENNINGS,JNN,707,32.854,-116.892,SAN DIEGO,None Specified,in10


In [4]:
# Input File: StreamGages shp file
# Read the stream gage shapefile and blank out missing values.
fileInput = "RawInputData/shapefiles/StreamGages.zip"
dfsg = gpd.read_file(fileInput)
dfsg = dfsg.replace(np.nan, "")

# WaDE UUID tracker for data assessment
# Only created when the column is not already present; a CSV copy is then saved
# under RawInputData (a different path from the shapefile that was read).
if 'WaDEUUID' not in dfsg.columns:
    dfsg['WaDEUUID'] = "in2" + dfsg.index.astype(str)
    zipSettings = dict(method='zip', archive_name='StreamGages.csv')
    dfsg.to_csv('RawInputData/StreamGages.zip', compression=zipSettings, index=False)

print(len(dfsg))
dfsg.head(1)

2597


Unnamed: 0,siteid,sitename,gage_statu,operator,datasource,sitestatus,stage_yn,stage_por,stage_stat,stage_real,flow_yn,flow_por,flow_statu,flow_realt,watqual_yn,watqual_po,watqual_st,watqual_re,temp_yn,temp_por,temp_statu,temp_realt,strmorder,ucdstrmcla,streamtype,totdasqkm,totdasqmi,weblink,gnisid_med,rchcd_medr,comid_medr,wtrshdnm_h,huc8,wtrshdnm_1,huc10,wtrshdnm_2,huc12,gagegap_st,reactivate,gage_histo,addflow_2s,addflow_2w,addtelemet,addtemp_2f,infrastruc,waterbody,tier,primary_be,sb19_actio,cnrfc,reference_,refpotenti,ecosysmgmt,wtrsupply,wtrquality,pubsafety,wade_Latit,wade_Longi,geometry,WaDEUUID
0,ACZ,ALHAMBRA CREEK AT D STREET,Active-Limited Use,OTHER,CDEC,Active,Y,1454,Active,Y,N,0,,N,N,0,,N,N,0,,N,2,Rain and seasonal groundwater (RGW),Stream/River - Intermittent,42.8499,16.54443,http://cdec.water.ca.gov/cgi-progs/staMeta?sta...,,18050001006347,948050078,Suisun Bay,18050001,Mount Diablo Creek-Frontal Suisun Bay Estuaries,1805000103,Arroyo del Hambre-Frontal Suisun Bay Estuaries,180500010303,AWG,N,0,Y,N,N,Y,,,3,ecosystem,Upgrade,,,,B,,,,38.00331,-122.12981,POINT Z (-122.12981 38.00331 0.00000),in20


In [None]:
# Input File: Reservoirs_timeseries

fileInput = "RawInputData/Reservoirs_timeseries.zip"
dfr_ts = pd.read_csv(fileInput).replace(np.nan, "")
print(len(dfr_ts))
dfr_ts.head(1)

In [None]:
# Input File: StreamGages_timeseries

fileInput = "RawInputData/StreamGages_timeseries.zip"
dfsg_ts = pd.read_csv(fileInput).replace(np.nan, "")
print(len(dfsg_ts))
dfsg_ts.head(1)

In [None]:
# left-join by reservoir metadata to reservoir site data

dfin1 = pd.merge(dfr, dfr_ts, left_on='ID', right_on='STATION_ID', how='left')
print(len(dfin1))
dfin1.head(1)

In [None]:
# left-join by streamgage metadata to streamgage site data

dfin2 = pd.merge(dfsg, dfsg_ts, left_on='siteid', right_on='STATION_ID', how='left')
print(len(dfin2))
dfin2.head(1)

## Get metadata & timeseries data
- https://cdec.water.ca.gov/dynamicapp/staMeta
- Note: the cells in this section are out of order. The essential steps are: 1) use the site info to get site IDs; 2) use the site IDs with the metadata API to determine which time series are available; 3) retrieve the time series data for the sites based on the available metadata.
- The metadata and time series data have already been retrieved; use the saved hard copies as inputs instead.

In [None]:
# already done

# %%time
# # get Reservoirs metadata

# tempList = dfin1['ID'].tolist()
# dftemp = pd.DataFrame()

# for i in range(len(tempList)):
#     idString = str(tempList[i]).strip()   
#     url = "https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=" + idString
#     try:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.content, 'html.parser')
#         table = soup.find_all('table')
#         rawData = pd.read_html(str(table))[1]
#         rawData["ID"] = idString
#         dftemp = pd.concat([dftemp, rawData])
#     except:
#         print(f' did not work, {url}')

# dftemp.to_csv('RawInputData/Reservoirs_Metadata.zip', compression=dict(method='zip', archive_name='Reservoirs_Metadata.csv'), index=False)

# print(len(dftemp))
# dftemp.head()

In [None]:
# already done

# %%time
# # get StreamGages metadata

# tempList = dfin2['siteid'].unique().tolist()
# dftemp = pd.DataFrame()

# for i in range(len(tempList)):
#     idString = str(tempList[i]).strip()   
#     url = "https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=" + idString
#     try:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.content, 'html.parser')
#         table = soup.find_all('table')
#         rawData = pd.read_html(str(table))[1]
#         rawData["siteid"] = idString
#         dftemp = pd.concat([dftemp, rawData])
#     except:
#         print(f' did not work, {url}')

# dftemp.to_csv('RawInputData/StreamGages_Metadata.zip', compression=dict(method='zip', archive_name='StreamGages_Metadata.csv'), index=False)

# print(len(dftemp))
# dftemp.head()

In [5]:
# Input File: Reservoirs_Metadata

fileInput = "RawInputData/Reservoirs_Metadata.zip"
dfr_m = pd.read_csv(fileInput).replace(np.nan, "")
print(len(dfr_m))
dfr_m.head(1)

1279


Unnamed: 0,0,1,2,3,4,5,ID
0,"RESERVOIR STORAGE, AF",15,(daily),(STORAGE),MANUAL ENTRY,10/01/2021 to present,JNN


In [6]:
# Input File: StreamGages_Metadata

fileInput = "RawInputData/StreamGages_Metadata.zip"
dfsg_m = pd.read_csv(fileInput).replace(np.nan, "")
print(len(dfsg_m))
dfsg_m.head(1)

2850


Unnamed: 0,0,1,2,3,4,5,siteid,Zero Datum,Adj To NGVD,Peak of Record,Monitor Stage,Flood Stage,Guidance Plots,Danger Stage,Top of Levee
0,"RIVER STAGE, FEET",1.0,(event),(RIV STG),DATA XCHG-CCC,01/07/2017 to present,ACZ,,,,,,,,


In [7]:
# Clean up reservoir metadata

# The metadata table columns are labelled "0".."5"; give them descriptive names.
sensorColumnNames = {
    "0": "Sensor Description",
    "1": "SensorNums",
    "2" : "Duration",
    "3" : "Plot",
    "4" : "Data Collection",
    "5" : "Data Available",
}
dfr_m = dfr_m.rename(columns=sensorColumnNames)

# "Data Available" holds "<start> to <end>"; split once on "to" into the two dates.
availabilityParts = dfr_m['Data Available'].str.split('to', n=1, expand=True)
dfr_m[['Start Date', 'End Date']] = availabilityParts
dfr_m['Start Date'] = dfr_m['Start Date'].str.strip()

# Records that are still being collected ("present") get a fixed end date.
endDates = dfr_m['End Date'].str.replace('present','09/01/2024')
dfr_m['End Date'] = endDates.str.strip()

dfr_m.head(1)

Unnamed: 0,Sensor Description,SensorNums,Duration,Plot,Data Collection,Data Available,ID,Start Date,End Date
0,"RESERVOIR STORAGE, AF",15,(daily),(STORAGE),MANUAL ENTRY,10/01/2021 to present,JNN,10/01/2021,09/01/2024


In [8]:
# Clean up streamgage metadata

# The metadata table columns are labelled "0".."5"; give them descriptive names.
sensorColumnNames = {
    "0": "Sensor Description",
    "1": "SensorNums",
    "2" : "Duration",
    "3" : "Plot",
    "4" : "Data Collection",
    "5" : "Data Available",
}
dfsg_m = dfsg_m.rename(columns=sensorColumnNames)

# "Data Available" holds "<start> to <end>"; split once on "to" into the two dates.
availabilityParts = dfsg_m['Data Available'].str.split('to', n=1, expand=True)
dfsg_m[['Start Date', 'End Date']] = availabilityParts
dfsg_m['Start Date'] = dfsg_m['Start Date'].str.strip()

# Records that are still being collected ("present") get a fixed end date.
endDates = dfsg_m['End Date'].str.replace('present','09/01/2024')
dfsg_m['End Date'] = endDates.str.strip()

dfsg_m.head(1)

Unnamed: 0,Sensor Description,SensorNums,Duration,Plot,Data Collection,Data Available,siteid,Zero Datum,Adj To NGVD,Peak of Record,Monitor Stage,Flood Stage,Guidance Plots,Danger Stage,Top of Levee,Start Date,End Date
0,"RIVER STAGE, FEET",1.0,(event),(RIV STG),DATA XCHG-CCC,01/07/2017 to present,ACZ,,,,,,,,,01/07/2017,09/01/2024


In [9]:
# abbreviate Duration 

# Maps the "Duration" text found in the station metadata to the one-letter code
# that is later passed as the dur_code parameter of the time series request.
durationDict = {
    "(daily)": "d",
    "(monthly)": "m",
    "(event)": "e",
    "(hourly)": "h",
}

def CreateDurationAPIValueFunc(val):
    """Return the one-letter duration code for a metadata "Duration" value.

    Parameters
    ----------
    val : any
        Raw duration entry, e.g. "(daily)". It is converted to a string and
        stripped of surrounding whitespace before the lookup.

    Returns
    -------
    str
        The matching code from ``durationDict``, or "" when the value is not
        one of the known durations (including missing values).
    """
    # dict.get gives the empty-string fallback without a catch-all except.
    return durationDict.get(str(val).strip(), "")

# Add the one-letter duration code to both metadata tables.
dfr_m['Duration_abb'] = [CreateDurationAPIValueFunc(durationText) for durationText in dfr_m['Duration']]
dfsg_m['Duration_abb'] = [CreateDurationAPIValueFunc(durationText) for durationText in dfsg_m['Duration']]

In [12]:
# left-join by reservoir metadata to reservoir site data

dfr = pd.merge(dfr, dfr_m, left_on='ID', right_on='ID', how='left')
print(len(dfr))
dfr.head(1)

1286


Unnamed: 0,Station,ID,Elev,Latitude,Longitude,County,Operating Agency,WaDEUUID,Sensor Description,SensorNums,Duration,Plot,Data Collection,Data Available,Start Date,End Date,Duration_abb
0,LAKE JENNINGS,JNN,707,32.854,-116.892,SAN DIEGO,None Specified,in10,"RESERVOIR STORAGE, AF",15,(daily),(STORAGE),MANUAL ENTRY,10/01/2021 to present,10/01/2021,09/01/2024,d


In [10]:
# left-join by streamgage metadata to streamgage site data

dfsg = pd.merge(dfsg, dfsg_m, left_on='siteid', right_on='siteid', how='left')
print(len(dfsg))
dfsg.head(1)

4960


Unnamed: 0,siteid,sitename,gage_statu,operator,datasource,sitestatus,stage_yn,stage_por,stage_stat,stage_real,flow_yn,flow_por,flow_statu,flow_realt,watqual_yn,watqual_po,watqual_st,watqual_re,temp_yn,temp_por,temp_statu,temp_realt,strmorder,ucdstrmcla,streamtype,totdasqkm,totdasqmi,weblink,gnisid_med,rchcd_medr,comid_medr,wtrshdnm_h,huc8,wtrshdnm_1,huc10,wtrshdnm_2,huc12,gagegap_st,reactivate,gage_histo,addflow_2s,addflow_2w,addtelemet,addtemp_2f,infrastruc,waterbody,tier,primary_be,sb19_actio,cnrfc,reference_,refpotenti,ecosysmgmt,wtrsupply,wtrquality,pubsafety,wade_Latit,wade_Longi,geometry,WaDEUUID,Sensor Description,SensorNums,Duration,Plot,Data Collection,Data Available,Zero Datum,Adj To NGVD,Peak of Record,Monitor Stage,Flood Stage,Guidance Plots,Danger Stage,Top of Levee,Start Date,End Date,Duration_abb
0,ACZ,ALHAMBRA CREEK AT D STREET,Active-Limited Use,OTHER,CDEC,Active,Y,1454,Active,Y,N,0,,N,N,0,,N,N,0,,N,2,Rain and seasonal groundwater (RGW),Stream/River - Intermittent,42.8499,16.54443,http://cdec.water.ca.gov/cgi-progs/staMeta?sta...,,18050001006347,948050078,Suisun Bay,18050001,Mount Diablo Creek-Frontal Suisun Bay Estuaries,1805000103,Arroyo del Hambre-Frontal Suisun Bay Estuaries,180500010303,AWG,N,0,Y,N,N,Y,,,3,ecosystem,Upgrade,,,,B,,,,38.00331,-122.12981,POINT Z (-122.12981 38.00331 0.00000),in20,"RIVER STAGE, FEET",1.0,(event),(RIV STG),DATA XCHG-CCC,01/07/2017 to present,,,,,,,,,01/07/2017,09/01/2024,e


In [13]:
# drop rows that do not contain monthly (m) or dailiy (d) data.
timestep = ['m', 'd']

dfr = dfr[dfr['Duration_abb'].isin(timestep)]
dfsg = dfsg[dfsg['Duration_abb'].isin(timestep)]

In [14]:
%%time
# get timeseries for reservoirs

stationsList = dfr['ID'].tolist()
sensorNumsList = dfr['SensorNums'].tolist()
dur_codeList =  dfr['Duration_abb'].tolist()
plotList = dfr['Plot'].tolist()
datacollectionList = dfr['Data Collection'].tolist()
startList = dfr['Start Date'].tolist()
endList = dfr['End Date'].tolist()

# Time Series Dataframe
dfr_ts = pd.DataFrame()

for i in range(len(stationsList)):
    stationStr = str(stationsList[i]).strip()
    sensorNumsStr = str(sensorNumsList[i]).strip()
    dur_codeStr = str(dur_codeList[i]).strip()
    plotStr = str(plotList[i]).strip()
    datacollectionStr = str(datacollectionList[i]).strip()
    startStr = str(startList[i]).strip()
    endStr = str(endList[i]).strip()   
    urlInput = "https://cdec.water.ca.gov/dynamicapp/req/CSVDataServlet?Stations=" + stationStr + "&SensorNums=" + sensorNumsStr + "&dur_code=" + dur_codeStr + "&Start=" + startStr +"&End=" + endStr
    try:
        tempdf = pd.read_csv(urlInput).replace(np.nan, "")
        dfr_ts = pd.concat([dfr_ts, tempdf])
        dfr_ts['Duration_abbe'] = dur_codeStr
        dfr_ts['Plot'] = plotStr
        dfr_ts['Data Collection'] = datacollectionStr
        dfr_ts['Start Date'] = startStr
        dfr_ts['End Date'] = endStr
              
    except:
        print("...bad reponse")

dfr_ts.to_csv('RawInputData/Reservoirs_timeseries.zip', compression=dict(method='zip', archive_name='Reservoirs_timeseries.csv'), index=False)
print(len(dfr_ts))
dfr_ts.head()

5082260
CPU times: total: 8min 4s
Wall time: 17min 25s


Unnamed: 0,STATION_ID,DURATION,SENSOR_NUMBER,SENSOR_TYPE,DATE TIME,OBS DATE,VALUE,DATA_FLAG,UNITS,Duration_abbe,Plot,Data Collection,Start Date,End Date
0,JNN,D,15,STORAGE,20211001 0000,20211001 0000,---,,AF,d,(PPT INC),DATA XCHG-USACE SAC,01/23/1989,04/23/1997
1,JNN,D,15,STORAGE,20211002 0000,20211002 0000,---,,AF,d,(PPT INC),DATA XCHG-USACE SAC,01/23/1989,04/23/1997
2,JNN,D,15,STORAGE,20211003 0000,20211003 0000,---,,AF,d,(PPT INC),DATA XCHG-USACE SAC,01/23/1989,04/23/1997
3,JNN,D,15,STORAGE,20211004 0000,20211004 0000,---,,AF,d,(PPT INC),DATA XCHG-USACE SAC,01/23/1989,04/23/1997
4,JNN,D,15,STORAGE,20211005 0000,20211005 0000,---,,AF,d,(PPT INC),DATA XCHG-USACE SAC,01/23/1989,04/23/1997


In [16]:
%%time
# get timeseries for streamgages

dfsg['SensorNums'] = dfsg['SensorNums'].astype(int).astype(str)

stationsList = dfsg['siteid'].tolist()
sensorNumsList = dfsg['SensorNums'].tolist()
dur_codeList =  dfsg['Duration_abb'].tolist()
plotList = dfsg['Plot'].tolist()
datacollectionList = dfsg['Data Collection'].tolist()
startList = dfsg['Start Date'].tolist()
endList = dfsg['End Date'].tolist()

# Time Series Dataframe
dfsg_ts = pd.DataFrame()

for i in range(len(stationsList)):
    stationStr = str(stationsList[i]).strip()
    sensorNumsStr = str(sensorNumsList[i]).strip()
    dur_codeStr = str(dur_codeList[i]).strip()
    plotStr = str(plotList[i]).strip()
    datacollectionStr = str(datacollectionList[i]).strip()
    startStr = str(startList[i]).strip()
    endStr = str(endList[i]).strip()   
    urlInput = "https://cdec.water.ca.gov/dynamicapp/req/CSVDataServlet?Stations=" + stationStr + "&SensorNums=" + sensorNumsStr + "&dur_code=" + dur_codeStr + "&Start=" + startStr +"&End=" + endStr
    try:
        tempdf = pd.read_csv(urlInput).replace(np.nan, "")
        dfsg_ts = pd.concat([dfsg_ts, tempdf])
        dfsg_ts['Duration_abbe'] = dur_codeStr
        dfsg_ts['Plot'] = plotStr
        dfsg_ts['Data Collection'] = datacollectionStr
        dfsg_ts['Start Date'] = startStr
        dfsg_ts['End Date'] = endStr
              
    except:
        print("...bad reponse")

dfsg_ts.to_csv('RawInputData/StreamGages_timeseries.zip', compression=dict(method='zip', archive_name='StreamGages_timeseries.csv'), index=False)
print(len(dfsg_ts))
dfsg_ts.head()

3250992
CPU times: total: 3min 54s
Wall time: 9min 16s


Unnamed: 0,STATION_ID,DURATION,SENSOR_NUMBER,SENSOR_TYPE,DATE TIME,OBS DATE,VALUE,DATA_FLAG,UNITS,Duration_abbe,Plot,Data Collection,Start Date,End Date
0,ANH,D,100,EL COND,20080118 0000,20080118 0000,---,,uS/cm,m,(STORAGE),MANUAL ENTRY,01/01/1956,09/01/2024
1,ANH,D,100,EL COND,20080119 0000,20080119 0000,526,,uS/cm,m,(STORAGE),MANUAL ENTRY,01/01/1956,09/01/2024
2,ANH,D,100,EL COND,20080120 0000,20080120 0000,628,,uS/cm,m,(STORAGE),MANUAL ENTRY,01/01/1956,09/01/2024
3,ANH,D,100,EL COND,20080121 0000,20080121 0000,658,,uS/cm,m,(STORAGE),MANUAL ENTRY,01/01/1956,09/01/2024
4,ANH,D,100,EL COND,20080122 0000,20080122 0000,603,,uS/cm,m,(STORAGE),MANUAL ENTRY,01/01/1956,09/01/2024


## WaDE Data

In [None]:
dfin1.head(1)

In [None]:
dfin1['SENSOR_TYPE'].value_counts()

In [None]:
# reservoir data
# create output POD dataframe
# Builds the output table column by column; the column order of the result
# follows the order of first assignment below.
# Only WaDEUUID is filled from the merged reservoir input (dfin1) and
# in_EPSGCodeCV is set to 4326; every other in_* field is an empty-string placeholder.
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin1['WaDEUUID']

# Method Info
df['in_MethodUUID'] = ""

# Variable Info
df['in_AggregationIntervalUnitCV'] = ""
df['in_AmountUnitCV'] = ""
df['in_VariableCV'] = ""

# Organization Info
df['in_OrganizationUUID'] = ""

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = "" # need this for auto fill below
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = "" # need this for auto fill below

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""  # NOTE(review): already assigned under WaterSource Info; this overwrites the same column
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = ""
df['in_Longitude'] = ""
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = ""
df['in_SiteName'] = ""
df['in_SiteNativeID'] = ""
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = ""
df['in_StateCV'] = ""
df['in_USGSSiteID'] = ""

# Site VariableAmounts Info
df['in_Amount'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AssociatedNativeAllocationIDs'] = ""
df['in_BeneficialUseCategory'] = ""
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_Geometry'] = ""  # NOTE(review): third assignment of in_Geometry; overwrites the same column again
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerGeneratedGWh'] = ""
df['in_PowerType'] = ""
df['in_PrimaryUseCategory'] = ""
df['in_ReportYearCV'] =  ""
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = ""
df['in_TimeframeStart'] = ""

# Keep one row per distinct combination of values and renumber the index.
outdf1 = df.copy()
outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

In [None]:
# Concatenate dataframes
frames = [outdf1]
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

## Clean Data / data types

In [None]:
# Clean name entries of special characters
def removeSpecialCharsFunc(Val):
    """Return a cleaned, title-cased version of a name entry.

    The value is converted to a string, the characters ``$ @ & . ; / ) ( -``
    are removed, the text is title-cased, runs of spaces are collapsed to a
    single space, surrounding whitespace is stripped and trailing commas are
    removed.

    Parameters
    ----------
    Val : any
        Raw name value (converted with ``str``).

    Returns
    -------
    str
        The cleaned name.
    """
    Val = str(Val)
    # Raw string: "\)" in a normal string literal is an invalid escape sequence.
    Val = re.sub(r"[$@&.;/\)(-]", "", Val).title()
    # Collapse any run of spaces; a single replace("  ", " ") leaves a double
    # space behind when three or more spaces are adjacent.
    Val = re.sub(r" {2,}", " ", Val).strip().rstrip(',')
    return Val

In [None]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

In [None]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

In [None]:
outdf['in_County'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_County']), axis=1)
outdf['in_County'].unique()

In [None]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return ``val`` as a stripped string, with missing values mapped to "".

    Parameters
    ----------
    val : any
        Cell value to normalise.

    Returns
    -------
    str
        "" when ``val`` is a missing scalar (None, NaN, NaT, NA), is blank,
        or is the text "nan"; otherwise ``str(val)`` with surrounding
        whitespace removed.
    """
    # The null check has to run on the raw value: once converted with str(),
    # a missing value is ordinary text ("None", "NaT", ...) and is never null.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return ""
    outString = str(val).strip()
    if outString == "nan":
        return ""
    return outString

In [None]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

In [None]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

In [None]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

In [None]:
outdf['in_County'] = outdf.apply(lambda row: ensureEmptyString(row['in_County']), axis=1)
outdf['in_County'].unique()

In [None]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
uniqueList = list(set([i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')]))
uniqueList.sort()
uniqueList

In [None]:
# Ensure Latitude entry is numireic, replace '0' values for removal
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

In [None]:
# Ensure Longitude entry is numireic, replace '0' values for removal
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

In [None]:
# Ensure Amount entry is either numireic or blank, no 0 entries
outdf['in_Amount'] = pd.to_numeric(outdf['in_Amount'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_Amount'].unique()

In [None]:
# Ensure PopulationServed entry is a whole number.
# Non-numeric entries are coerced to NaN, then filled with 0 and cast to int.
# NOTE(review): the chain ends with .replace(0, ""), so zero and non-numeric entries
# end up as blank strings. That contradicts the earlier wording of this comment
# ("WITH 0 entries (no blank strings)") - confirm which behavior is wanted.
outdf['in_PopulationServed'] = pd.to_numeric(outdf['in_PopulationServed'], errors='coerce').round().replace("",0).fillna(0).astype(int).replace(0,"").fillna("")
outdf['in_PopulationServed'].unique()

In [None]:
# Convert TimeframeEnd to YYYY-MM-DD format.
# Blank or unparseable entries become NaT. They are NOT filled with "" here:
# doing so turns the column into plain objects and the .dt accessor then fails.
outdf['in_TimeframeEnd'] = (
    pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors='coerce')
    .dt.tz_localize(None)  # drop the UTC marker, keep the same clock value
    .dt.normalize()  # keep the date only (time set to midnight)
)
outdf['in_TimeframeEnd'].unique()

In [None]:
# Convert TimeframeStart to YYYY-MM-DD format.
# Blank or unparseable entries become NaT. They are NOT filled with "" here:
# doing so turns the column into plain objects and the .dt accessor then fails.
outdf['in_TimeframeStart'] = (
    pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors='coerce')
    .dt.tz_localize(None)  # drop the UTC marker, keep the same clock value
    .dt.normalize()  # keep the date only (time set to midnight)
)
outdf['in_TimeframeStart'].unique()

In [None]:
# extract year out
# outdf['in_ReportYearCV'] = pd.to_datetime(outdf['in_ReportYearCV'], utc=True, errors = 'coerce').fillna("")
# outdf['in_ReportYearCV'] = pd.to_datetime(outdf["in_ReportYearCV"].dt.strftime('%m/%d/%Y'))
# outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].dt.year
# outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].fillna(0).astype(int)
outdf['in_ReportYearCV'].unique()

In [None]:
# Assign Primary Use Category
# Fills in_PrimaryUseCategory from in_BeneficialUseCategory using a helper module
# that lives outside this notebook.

import sys
# NOTE(review): absolute path on one user's machine; it has to be changed
# before this cell can run anywhere else.
sys.path.append("C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory")
import AssignPrimaryUseCategoryFile # Use Custom import file

# retrievePrimaryUseCategory is called once per row with that row's beneficial use text.
# Its mapping rules are defined in the external module and are not visible here.
outdf['in_PrimaryUseCategory'] = outdf.apply(lambda row: AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory(row['in_BeneficialUseCategory']), axis=1)
outdf['in_PrimaryUseCategory'].unique()

In [None]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    """Build the underscore-joined VariableSpecificCV label.

    Each argument is converted to a string and stripped; the third one
    (primary use) is additionally title-cased. The four parts are joined in
    argument order with "_".
    """
    labelParts = [
        str(inV).strip(),
        str(inAIU).strip(),
        str(inPU).strip().title(),
        str(inWST).strip(),
    ]
    return "_".join(labelParts)

outdf['in_VariableSpecificCV'] = outdf.apply(lambda row: createVariableSpecificCV(row['in_VariableCV'], 
                                                                                  row['in_AggregationIntervalUnitCV'],
                                                                                  row['in_PrimaryUseCategory'],
                                                                                  row['in_WaterSourceTypeCV']), axis=1)
outdf['in_VariableSpecificCV'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    """Return the custom WaDE id: the text "wadeId" followed by the given value."""
    return "wadeId" + str(colRowValue)

dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    """Return the existing native id, or look one up when it is blank.

    A non-blank ``checkVal`` is returned stripped. Otherwise the stripped
    string forms of ``valA`` and ``valB`` are concatenated and used as the key
    into ``IdDict``.
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    return IdDict[str(valA).strip() + str(valB).strip()]

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_WaterSourceNativeID'], 
                                                                              row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    """Return the custom WaDE id: the text "wadeId" followed by the given value."""
    return "wadeId" + str(colRowValue)

dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_Latitude'].astype(str) + dfTempID['in_Longitude'].astype(str) + dfTempID['in_SiteName'].astype(str)+ dfTempID['in_SiteTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_SiteNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    """Return the existing native id, or look one up when it is blank.

    A non-blank ``checkVal`` is returned stripped. Otherwise the stripped
    string forms of ``valA`` .. ``valD`` are concatenated in order and used as
    the key into ``IdDict``.
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    keyParts = (valA, valB, valC, valD)
    return IdDict["".join(str(part).strip() for part in keyParts)]

outdf['in_SiteNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_SiteNativeID'], 
                                                                       row['in_Latitude'], row['in_Longitude'],
                                                                       row['in_SiteName'], row['in_SiteTypeCV']), axis=1)
outdf['in_SiteNativeID'].unique()

## Export Outputs

In [None]:
outdf.info()

In [None]:
outdf

In [None]:
# Export the output dataframe
# The CSV inside the archive is named after the zip file itself, so the extracted
# file is identifiable (the earlier 'Pssro_xxMain.csv' member name did not match).
outdf.to_csv('RawInputData/Pwsss_caMain.zip', compression=dict(method='zip', archive_name='Pwsss_caMain.csv'), index=False)  # The output, save as a zip