# Pre-processing "XX" Water Right and Time Series Water Use data for WaDE Upload
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# ---- working directory ----
workingDirString = "G:/Shared drives/WaDE Data/Utah/SS_PublicSupplyWaterUse" # set working directory folder string here
os.chdir(workingDirString)
print(f'The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/Utah/SS_PublicSupplyWaterUse


## Data Input

In [3]:
# Input File - water_use_data_Source
fileInput = "RawInputData/water_use_data_Source.zip"
dfin1 = pd.read_csv(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin1:
    dfin1['WaDEUUID'] = "in1" + dfin1.index.astype(str)
    dfin1.to_csv("RawInputData/water_use_data_Source.zip", compression=dict(method='zip', archive_name="water_use_data_Source.csv"), index=False)

print(len(dfin1))
dfin1.head(1)

8836


Unnamed: 0,System ID,System Name,Source ID,Source Name,Source Status,Lat NAD83,Lon NAD83,Source Type,Diversion Type,Use Type,Year,Units,Method of Measurement,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Total,Source Comments,WaDEUUID
0,1000,Leeds Domestic Water Users Association,10000001,Oak Grove Spring (WS001),Active,37.30895,-113.42877,Spring,Withdrawal,Water Supplier,,,,,,,,,,,,,,,,,*Source 1 springs and Source 2 well are combin...,in10


In [4]:
# remove special characters from 'Source Name', including commas
def cleanSourceNameFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.,;/\)(-]", "", Val).replace("  ", " ").strip()
    return Val

dfin1['Source Name'] = dfin1.apply(lambda row: cleanSourceNameFunc(row['Source Name']), axis=1)
dfin1['Source Name'].unique()

array(['Oak Grove Spring WS001', 'Leeds Well No 1 WS002',
       'Leeds Well No4 WS006', ..., 'Tropic Well',
       'Zion Under Canvas Well WS001',
       'Bryce Canyon Under Canvas Well WS001'], dtype=object)

In [5]:
# Input File - water_use_data_System
fileInput = "RawInputData/water_use_data_System.zip"
dfin2 = pd.read_csv(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin2:
    dfin2['WaDEUUID'] = "in2" + dfin2.index.astype(str)
    dfin2.to_csv("RawInputData/water_use_data_System.zip", compression=dict(method='zip', archive_name="water_use_data_System.csv"), index=False)

print(len(dfin2))
dfin2.head(1)

22514


  dfin2 = pd.read_csv(fileInput).replace(np.nan, "")


Unnamed: 0,System ID,System Name,System Type,System Status,History Year,Date Received,County,Use Cooling Percent,Use Process Percent,Use Domestic Percent,Use Miscellaneous Percent,Irrigation (Lawn and Garden) Percent,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,Total Use,Method of Measurement,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections,Peak Date,Peak Demand,Peak Demand Units,Peak Use Include,Peak Measurement Type,Peak Wholesale Volume,Peak Wholesale Volume Units,ERC Yes/No,ERC Value,Secondary Domestic Percent,Secondary Industrial Percent,Secondary Commercial Percent,Secondary Institutional Percent,Secondary Agriculture Percent,Secondary Domestic Connections,Secondary Industrial Connections,Secondary Commercial Connections,Secondary Institutional Connections,Secondary Agriculture Connections,Secondary Irrigation (Lawn and Garden) Acres,Secondary Irrigation (Agriculture) Acres,Secondary Metered Domestic Connections,Secondary Metered Industrial Connections,Secondary Metered Commercial Connections,Secondary Metered Institutional Connections,Secondary Metered Agriculture Connections,Secondary Metered Domestic Use,Secondary Metered Commercial Use,Secondary Metered Industrial Use,Secondary Metered Institutional Use,Secondary Metered Agriculture Use,System Comments,WaDEUUID
0,1000,Leeds Domestic Water Users Association,Public,Active,1960,12/31/1960,Washington,,,,,,,,,27010.0,0.0,0.0,0.0,0.0,,18301270.92,,0.0,0.0,0.0,,52.0,,,,,,,,Y,Not Avail,,,,,,,,,,,,,,,,,,,,,,,(1988)1. Spring totals do not include overflow...,in20


In [6]:
# remove special characters from 'System Name', including commas
def cleanSystemNameFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.,;/\)(-]", "", Val).replace("  ", " ").strip()
    return Val

dfin2['System Name'] = dfin2.apply(lambda row: cleanSystemNameFunc(row['System Name']), axis=1)
dfin2['System Name'].unique()

array(['Leeds Domestic Water Users Association', 'Riverdale City',
       'Fillmore Municipal Water System', ..., 'Tropic Town Misc',
       'Zion Under Canvas', 'Under Canvas Bryce Canyon'], dtype=object)

In [7]:
# Input File - CulinaryWaterServiceArea.shp
shapefileInput = "RawInputData/shapefiles/CulinaryWaterServiceArea.zip" # ziped folder of the shp file
dfPoUshapetemp = gpd.read_file(shapefileInput)
dfPoUshapetemp['geometry'] = dfPoUshapetemp['geometry'].to_crs(epsg=4326) # Realign Geometry Projection
dfPoUshapetemp.head(1)

Unnamed: 0,FID_1,WRENAME,WRNAME,DWNAME,SYSTEMTYPE,WATERRESID,WRID,DWSYSNUM,WRLINK,WHOLESALER,LABEL,STATE,COUNTY,BASIN,SUBAREA,SUBAREANAM,LANUM,LANAME,ENDYEAR,DATASOURCE,SOURCEDATE,EDITOR,EDITDATE,STATUS,wade_cent_,wade_cent1,Shape_Leng,SHAPE_Le_1,Shape_Le_2,Shape_Area,geometry
0,1,Irontown,Irontown,Irontown,C,564,11358,UTAH11070,https://www.waterrights.utah.gov/asp_apps/view...,,,Utah,Iron,Cedar/Beaver,06-03-01,Escalante Desert,06-03-01a,Escalante Desert,2022,DWRe/Supplier,2021-09-08,ADAMCLARK,2021-11-19,Active,-113.44596,37.60243,6132.9725,6132.9725,0.06338,7e-05,"POLYGON ((-113.45270 37.60395, -113.45335 37.6..."


## Work with Data

### Source POD Data

In [8]:
# left merge
dfin1_2 = dfin1.merge(dfin2[['System ID', 'System Name', 'County']], left_on='System ID', right_on='System ID', how='left').replace(np.nan, "")
print(len(dfin1_2))
dfin1_2.head(1)

187006


Unnamed: 0,System ID,System Name_x,Source ID,Source Name,Source Status,Lat NAD83,Lon NAD83,Source Type,Diversion Type,Use Type,Year,Units,Method of Measurement,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Total,Source Comments,WaDEUUID,System Name_y,County
0,1000,Leeds Domestic Water Users Association,10000001,Oak Grove Spring WS001,Active,37.30895,-113.42877,Spring,Withdrawal,Water Supplier,,,,,,,,,,,,,,,,,*Source 1 springs and Source 2 well are combin...,in10,Leeds Domestic Water Users Association,Washington


In [9]:
# fix blank / null in_MethodTypeCV

def fixMethodTypeCV(valA):
    valA = str(valA).strip().title()
    if valA == "" or pd.isnull(valA):
        outString = "WaDE Blank"
    else:
        outString = valA
    return outString

dfin1_2['in_MethodTypeCV'] = dfin1_2.apply(lambda row: fixMethodTypeCV(row['Method of Measurement']), axis=1)
dfin1_2['in_MethodTypeCV'].unique()

array(['WaDE Blank', 'Weir', 'Estimated', 'Calculated'], dtype=object)

In [10]:
# fix blank / null WaterSourcetypeCV
# simplify to WaDE specific categories

wsTypeDict = {
    "Well" : "Groundwater",
    "Well/Spring" : "Groundwater",
    "Well Field" : "Groundwater",
    "Well/Stream" : "Groundwater",
    "Tunnel" : "Groundwater",
    "Drain" : "Groundwater",
    "Stream" : "Surface Water",
    "Spring" : "Surface Water",
    "Reservoir" : "Surface Water",
    "Lake" : "Surface Water"}

def fixWaterSourceTypeCV(valA):
    valA = str(valA).strip()
    if valA == "" or pd.isnull(valA):
        outString = "WaDE Blank"
    else:
        try:
            outString = wsTypeDict[valA]
        except:
            outString = "WaDE Blank"
    return outString

dfin1_2['in_WaterSourceTypeCV'] = dfin1_2.apply(lambda row: fixWaterSourceTypeCV(row['Source Type']), axis=1)
dfin1_2['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Groundwater'], dtype=object)

In [11]:
# Loop data list.  Use this to search for specific fields.
amountList   =  ['Jan',     'Feb',     'Mar',     'Apr',     'May',    'Jun',      'Jul',     'Aug',     'Sep',     'Oct',     'Nov',     'Dec',     'Total']
aggregatList =  ["Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "Annual"]
startDateList = ["01/01/",  "02/01/",  "03/01/",  "04/01/",  "05/01/",  "06/01/",  "07/01/",  "08/01/",  "09/01/",  "10/01/",  "11/01/",  "12/01/",  "01/01/"]
endDateList =   ["01/31/",  "02/28/",  "03/31/",  "04/30/",  "05/31/",  "06/30/",  "07/31/",  "08/31/",  "09/30/",  "10/31/",  "11/30/",  "12/31/",  "12/31/"]

In [12]:
# create output POD dataframe
outdf1 = pd.DataFrame()

# for each value in variableTypeList
for x in range(len(amountList)):

    df = pd.DataFrame()
   
    # Data Assessment UUID
    df['WaDEUUID'] = dfin1_2['WaDEUUID']
    
    # Method Info
    df['in_MethodUUID'] = "" # will create sa portion below
    df['in_MethodTypeCV'] = dfin1_2['in_MethodTypeCV'] # see above
    
    # Variable Info
    df['in_VariableSpecificUUID'] = "" # will create sa portion below
    df['in_AggregationIntervalUnitCV'] = aggregatList[x]
    df['in_VariableCV'] = dfin1_2['Diversion Type']
    
    # Organization Info
    df['in_OrganizationUUID'] = "UTssps_O1"
    
    # WaterSource Info
    df['in_Geometry'] = ""
    df['in_GNISFeatureNameCV'] = ""
    df['in_WaterQualityIndicatorCV'] = "Fresh"
    df['in_WaterSourceName'] = "WaDE Blank" # need this for auto fill below
    df['in_WaterSourceNativeID'] = "" # auto fill in below
    df['in_WaterSourceTypeCV'] = dfin1_2['in_WaterSourceTypeCV'] # see above
    
    # Site Info
    df['in_CoordinateAccuracy'] = "WaDE Blank"
    df['in_CoordinateMethodCV'] = "WaDE Blank"
    df['in_County'] = dfin1_2['County']
    df['in_EPSGCodeCV'] = 4326
    df['in_Geometry'] = ""
    df['in_GNISCodeCV'] = ""
    df['in_HUC12'] = ""
    df['in_HUC8'] = ""
    df['in_Latitude'] = dfin1_2['Lat NAD83']
    df['in_Longitude'] = dfin1_2['Lon NAD83']
    df['in_NHDNetworkStatusCV'] = ""
    df['in_NHDProductCV'] = ""
    df['in_PODorPOUSite'] = "POD"
    df['in_SiteName'] = dfin1_2['Source Name']
    df['in_SiteNativeID'] = "d" + dfin1_2['Source ID'].astype(str)
    df['in_SitePoint'] = ""
    df['in_SiteTypeCV'] = dfin1_2['Source Type']
    df['in_StateCV'] = "UT"
    df['in_USGSSiteID'] = ""
       
    # Site VariableAmounts Info
    df['in_Amount'] = dfin1_2[amountList[x]]
    df['in_AllocationCropDutyAmount'] = ""
    df['in_AssociatedNativeAllocationIDs'] = ""
    df['in_BeneficialUseCategory'] = dfin1_2['Use Type']
    df['in_CommunityWaterSupplySystem'] = dfin1_2['System Name_y']
    df['in_CropTypeCV'] = ""
    df['in_CustomerTypeCV'] = ""
    df['in_DataPublicationDate'] = ""
    df['in_DataPublicationDOI'] = ""
    df['in_Geometry'] = ""
    df['in_IrrigatedAcreage'] = ""
    df['in_IrrigationMethodCV'] = ""
    df['in_PopulationServed'] = ""
    df['in_PowerGeneratedGWh'] = ""
    df['in_PowerType'] = ""
    df['in_PrimaryUseCategory'] = "" # auto fill in below
    df['in_ReportYearCV'] =  dfin1_2['Year'].replace("", 0).fillna(0).astype(int).astype(str)
    df['in_SDWISIdentifier'] = ""
    df['in_TimeframeEnd'] = endDateList[x] + dfin1_2['Year'].replace("", 0).fillna(0).astype(int).astype(str)
    df['in_TimeframeStart'] = startDateList[x] + dfin1_2['Year'].replace("", 0).fillna(0).astype(int).astype(str)

    outdf1 = pd.concat([outdf1, df])

outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

114868


Unnamed: 0,WaDEUUID,in_MethodUUID,in_MethodTypeCV,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_Amount,in_AllocationCropDutyAmount,in_AssociatedNativeAllocationIDs,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart
0,in10,,WaDE Blank,,Monthly,Withdrawal,UTssps_O1,,,Fresh,WaDE Blank,,Surface Water,WaDE Blank,WaDE Blank,Washington,4326,,,,37.30895,-113.42877,,,POD,Oak Grove Spring WS001,d10000001,,Spring,UT,,,,,Water Supplier,Leeds Domestic Water Users Association,,,,,,,,,,,0,,01/31/0,01/01/0
1,in11,,WaDE Blank,,Monthly,Withdrawal,UTssps_O1,,,Fresh,WaDE Blank,,Groundwater,WaDE Blank,WaDE Blank,Washington,4326,,,,37.26708,-113.36336,,,POD,Leeds Well No 1 WS002,d10000002,,Well,UT,,,,,Water Supplier,Leeds Domestic Water Users Association,,,,,,,,,,,0,,01/31/0,01/01/0
2,in12,,WaDE Blank,,Monthly,Withdrawal,UTssps_O1,,,Fresh,WaDE Blank,,Groundwater,WaDE Blank,WaDE Blank,Washington,4326,,,,37.26668,-113.36341,,,POD,Leeds Well No4 WS006,d108547239,,Well,UT,,,,,Water Supplier,Leeds Domestic Water Users Association,,,,,,,,,,,0,,01/31/0,01/01/0
3,in13,,WaDE Blank,,Monthly,Withdrawal,UTssps_O1,,,Fresh,WaDE Blank,,Groundwater,WaDE Blank,WaDE Blank,Weber,4326,,,,41.16996,-112.00563,,,POD,5190S 1050W #1 Well WS003,d10010001,,Well,UT,,,,,Water Supplier,Riverdale City,,,,,,,,,,,0,,01/31/0,01/01/0
4,in14,,WaDE Blank,,Monthly,Withdrawal,UTssps_O1,,,Fresh,WaDE Blank,,Groundwater,WaDE Blank,WaDE Blank,Millard,4326,,,,38.96439,-112.38104,,,POD,FC Well #2 WS003,d10030001,,Well,UT,,,,,Water Supplier,Fillmore Municipal Water System,,,,,,,,,,,0,,01/31/0,01/01/0


### System POU Data

In [13]:
# left merge dfPoUshapetemp shp info for lat & long info
dfin2_2 = dfin2.merge(dfPoUshapetemp[['WRID', 'wade_cent1', 'wade_cent_']], left_on='System ID', right_on='WRID', how='left').replace(np.nan, "")
print(len(dfin2_2))
dfin2_2.head(1)

22902


Unnamed: 0,System ID,System Name,System Type,System Status,History Year,Date Received,County,Use Cooling Percent,Use Process Percent,Use Domestic Percent,Use Miscellaneous Percent,Irrigation (Lawn and Garden) Percent,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,Total Use,Method of Measurement,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections,Peak Date,Peak Demand,Peak Demand Units,Peak Use Include,Peak Measurement Type,Peak Wholesale Volume,Peak Wholesale Volume Units,ERC Yes/No,ERC Value,Secondary Domestic Percent,Secondary Industrial Percent,Secondary Commercial Percent,Secondary Institutional Percent,Secondary Agriculture Percent,Secondary Domestic Connections,Secondary Industrial Connections,Secondary Commercial Connections,Secondary Institutional Connections,Secondary Agriculture Connections,Secondary Irrigation (Lawn and Garden) Acres,Secondary Irrigation (Agriculture) Acres,Secondary Metered Domestic Connections,Secondary Metered Industrial Connections,Secondary Metered Commercial Connections,Secondary Metered Institutional Connections,Secondary Metered Agriculture Connections,Secondary Metered Domestic Use,Secondary Metered Commercial Use,Secondary Metered Industrial Use,Secondary Metered Institutional Use,Secondary Metered Agriculture Use,System Comments,WaDEUUID,WRID,wade_cent1,wade_cent_
0,1000,Leeds Domestic Water Users Association,Public,Active,1960,12/31/1960,Washington,,,,,,,,,27010.0,0.0,0.0,0.0,0.0,,18301270.92,,0.0,0.0,0.0,,52.0,,,,,,,,Y,Not Avail,,,,,,,,,,,,,,,,,,,,,,,(1988)1. Spring totals do not include overflow...,in20,1000.0,37.23727,-113.34673


In [14]:
# Loop data list.  Use this to search for specific fields.
benuseList = ["Domestic",     "Commercial",     "Industrial",     "Institutional"]
amountList = ["Domestic Use", "Commercial Use", "Industrial Use", "Institutional Use"]

In [15]:
# create output POD dataframe
outdf2 = pd.DataFrame()

# for each value in variableTypeList
for x in range(len(amountList)):

    df = pd.DataFrame()
   
    # Data Assessment UUID
    df['WaDEUUID'] = dfin2_2['WaDEUUID']
    
    # Method Info
    df['in_MethodUUID'] = "" # will create sa portion below
    df['in_MethodTypeCV'] = "Metered" # I think all POU system data is coming from a meter value
    
    # Variable Info
    df['in_VariableSpecificUUID'] = "" # will create sa portion below
    df['in_AggregationIntervalUnitCV'] = "Annual"
    df['in_VariableCV'] = "Consumptive Use"
    
    # Organization Info
    df['in_OrganizationUUID'] = "UTssps_O1"
    
    # WaterSource Info
    df['in_Geometry'] = ""
    df['in_GNISFeatureNameCV'] = ""
    df['in_WaterQualityIndicatorCV'] = "Fresh"
    df['in_WaterSourceName'] = "WaDE Blank" # need this for auto fill below
    df['in_WaterSourceNativeID'] = "" # auto fill in below
    df['in_WaterSourceTypeCV'] = "WaDE Blank"
    
    # Site Info
    df['in_CoordinateAccuracy'] = "WaDE Blank"
    df['in_CoordinateMethodCV'] = "WaDE Blank"
    df['in_County'] = dfin2_2['County']
    df['in_EPSGCodeCV'] = 4326
    df['in_Geometry'] = ""
    df['in_GNISCodeCV'] = ""
    df['in_HUC12'] = ""
    df['in_HUC8'] = ""
    df['in_Latitude'] = dfin2_2['wade_cent1']
    df['in_Longitude'] = dfin2_2['wade_cent_']
    df['in_NHDNetworkStatusCV'] = ""
    df['in_NHDProductCV'] = ""
    df['in_PODorPOUSite'] = "POU"
    df['in_SiteName'] = dfin2_2['System Name']
    df['in_SiteNativeID'] = "u" + dfin2_2['System ID'].astype(str)
    df['in_SitePoint'] = ""
    df['in_SiteTypeCV'] = dfin2_2['System Type']
    df['in_StateCV'] = "UT"
    df['in_USGSSiteID'] = ""
    
    # Site VariableAmounts Info
    df['in_Amount'] = dfin2_2[amountList[x]]
    df['in_AllocationCropDutyAmount'] = ""
    df['in_AssociatedNativeAllocationIDs'] = ""
    df['in_BeneficialUseCategory'] = benuseList[x]
    df['in_CommunityWaterSupplySystem'] = dfin2_2['System Name']
    df['in_CropTypeCV'] = ""
    df['in_CustomerTypeCV'] = ""
    df['in_DataPublicationDate'] = ""
    df['in_DataPublicationDOI'] = ""
    df['in_Geometry'] = ""
    df['in_IrrigatedAcreage'] = ""
    df['in_IrrigationMethodCV'] = ""
    df['in_PopulationServed'] = dfin2_2['Population']
    df['in_PowerGeneratedGWh'] = ""
    df['in_PowerType'] = ""
    df['in_PrimaryUseCategory'] = "" # auto fill in below
    df['in_ReportYearCV'] =  dfin2_2['History Year'].replace("", 0).fillna(0).astype(int).astype(str)
    df['in_SDWISIdentifier'] = ""
    df['in_TimeframeEnd'] = "12/31/" + dfin2_2['History Year'].replace("", 0).fillna(0).astype(int).astype(str)
    df['in_TimeframeStart'] = "01/01/" + dfin2_2['History Year'].replace("", 0).fillna(0).astype(int).astype(str)

    outdf2 = pd.concat([outdf2, df])

outdf2 = outdf2.drop_duplicates().reset_index(drop=True)
print(len(outdf2))
outdf2.head()

91608


Unnamed: 0,WaDEUUID,in_MethodUUID,in_MethodTypeCV,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_Amount,in_AllocationCropDutyAmount,in_AssociatedNativeAllocationIDs,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart
0,in20,,Metered,,Annual,Consumptive Use,UTssps_O1,,,Fresh,WaDE Blank,,WaDE Blank,WaDE Blank,WaDE Blank,Washington,4326,,,,37.23727,-113.34673,,,POU,Leeds Domestic Water Users Association,u1000,,Public,UT,,0.0,,,Domestic,Leeds Domestic Water Users Association,,,,,,,0.0,,,,1960,,12/31/1960,01/01/1960
1,in21,,Metered,,Annual,Consumptive Use,UTssps_O1,,,Fresh,WaDE Blank,,WaDE Blank,WaDE Blank,WaDE Blank,Washington,4326,,,,37.23727,-113.34673,,,POU,Leeds Domestic Water Users Association,u1000,,Public,UT,,0.0,,,Domestic,Leeds Domestic Water Users Association,,,,,,,0.0,,,,1962,,12/31/1962,01/01/1962
2,in22,,Metered,,Annual,Consumptive Use,UTssps_O1,,,Fresh,WaDE Blank,,WaDE Blank,WaDE Blank,WaDE Blank,Washington,4326,,,,37.23727,-113.34673,,,POU,Leeds Domestic Water Users Association,u1000,,Public,UT,,0.0,,,Domestic,Leeds Domestic Water Users Association,,,,,,,0.0,,,,1963,,12/31/1963,01/01/1963
3,in23,,Metered,,Annual,Consumptive Use,UTssps_O1,,,Fresh,WaDE Blank,,WaDE Blank,WaDE Blank,WaDE Blank,Washington,4326,,,,37.23727,-113.34673,,,POU,Leeds Domestic Water Users Association,u1000,,Public,UT,,0.0,,,Domestic,Leeds Domestic Water Users Association,,,,,,,0.0,,,,1964,,12/31/1964,01/01/1964
4,in24,,Metered,,Annual,Consumptive Use,UTssps_O1,,,Fresh,WaDE Blank,,WaDE Blank,WaDE Blank,WaDE Blank,Washington,4326,,,,37.23727,-113.34673,,,POU,Leeds Domestic Water Users Association,u1000,,Public,UT,,0.0,,,Domestic,Leeds Domestic Water Users Association,,,,,,,0.0,,,,1965,,12/31/1965,01/01/1965


## Concatenate POD and POU Data.  Make needed changes

In [16]:
# Concatenate dataframes
frames = [outdf1, outdf2]  # list all out dataframes here
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

206476


## Clean Data / data types

In [17]:
# Clean name entries of spcial characters
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [18]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['Wade Blank'], dtype=object)

In [19]:
outdf['in_County'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['Washington', 'Weber', 'Millard', 'Davis', 'Tooele', 'Iron',
       'Box Elder', 'Salt Lake', 'Utah', 'Morgan', 'Carbon', 'Piute',
       'Summit', 'Beaver', 'Duchesne', 'Cache', 'Kane', 'Emery', 'Wayne',
       'Juab', 'Uintah', 'Grand', 'Garfield', 'Sevier', 'Wasatch',
       'Sanpete', 'San Juan', 'Rich', 'Daggett'], dtype=object)

In [20]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['Oak Grove Spring Ws001', 'Leeds Well No 1 Ws002',
       'Leeds Well No4 Ws006', ..., 'Tropic Town Misc',
       'Zion Under Canvas', 'Under Canvas Bryce Canyon'], dtype=object)

In [21]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [22]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['Wade Blank'], dtype=object)

In [23]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Groundwater', 'WaDE Blank'], dtype=object)

In [24]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array(['Spring', 'Well', 'Reservoir', 'Stream', 'Drain', 'Tunnel', 'Lake',
       'Public', 'Industrial', 'Mgt Plan', 'Secondary'], dtype=object)

In [25]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['Oak Grove Spring Ws001', 'Leeds Well No 1 Ws002',
       'Leeds Well No4 Ws006', ..., 'Tropic Town Misc',
       'Zion Under Canvas', 'Under Canvas Bryce Canyon'], dtype=object)

In [26]:
outdf['in_County'] = outdf.apply(lambda row: ensureEmptyString(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['Washington', 'Weber', 'Millard', 'Davis', 'Tooele', 'Iron',
       'Box Elder', 'Salt Lake', 'Utah', 'Morgan', 'Carbon', 'Piute',
       'Summit', 'Beaver', 'Duchesne', 'Cache', 'Kane', 'Emery', 'Wayne',
       'Juab', 'Uintah', 'Grand', 'Garfield', 'Sevier', 'Wasatch',
       'Sanpete', 'San Juan', 'Rich', 'Daggett'], dtype=object)

In [27]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
uniqueList = list(set([i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')]))
uniqueList.sort()
uniqueList

['Agricultural',
 'Commercial',
 'Domestic',
 'Geothermal',
 'Industrial',
 'Institutional',
 'Irrigation',
 'Mining',
 'Power (Fossil-Fuel)',
 'Power (Geothermal)',
 'Power (Hydro-Elec)',
 'Water Supplier']

In [28]:
# Ensure Latitude entry is numireic, replace '0' values for removal
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

array([37.30895298, 37.26707725, 37.2666804, ..., 40.9540068992,
       40.1441288757, 40.8695941542], dtype=object)

In [29]:
# Ensure Longitude entry is numireic, replace '0' values for removal
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

array([-113.4287713, -113.363361, -113.3634141, ..., -110.815260029,
       -111.016678598, -111.585571449], dtype=object)

In [30]:
# Ensure Amount entry is either numireic or blank, no 0 entries
outdf['in_Amount'] = pd.to_numeric(outdf['in_Amount'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_Amount'].unique()

array(['', 1798000.0, 3441000.0, ..., 192249.0, 922159.54, 343257.5],
      dtype=object)

In [31]:
# Ensure PopulationServed entry is numireic WITH 0 entries (no blank strings)
outdf['in_PopulationServed'] = pd.to_numeric(outdf['in_PopulationServed'], errors='coerce').round().replace("",0).fillna(0).astype(int).astype(str)
outdf['in_PopulationServed'].unique()

array(['0', '255', '269', ..., '369362', '376117', '3514'], dtype=object)

In [32]:
# Convert TimeframeEnd to YYYY-MM-DD format.
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeEnd'].unique()

  outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors = 'coerce').fillna("")


<DatetimeArray>
['2000-01-31 00:00:00', '2019-01-31 00:00:00', '2020-01-31 00:00:00',
 '2021-01-31 00:00:00', '2022-01-31 00:00:00', '2017-01-31 00:00:00',
 '2018-01-31 00:00:00', '1996-01-31 00:00:00', '1997-01-31 00:00:00',
 '1998-01-31 00:00:00',
 ...
 '1991-12-31 00:00:00', '1992-12-31 00:00:00', '1993-12-31 00:00:00',
 '1994-12-31 00:00:00', '1959-12-31 00:00:00', '1970-12-31 00:00:00',
 '1974-12-31 00:00:00', '1961-12-31 00:00:00', '1957-12-31 00:00:00',
 '1958-12-31 00:00:00']
Length: 408, dtype: datetime64[ns]

In [33]:
# Convert TimeframeStart to YYYY-MM-DD format.
outdf['in_TimeframeStart'] = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeStart'] = pd.to_datetime(outdf["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeStart'].unique()

  outdf['in_TimeframeStart'] = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors = 'coerce').fillna("")


<DatetimeArray>
['2000-01-01 00:00:00', '2019-01-01 00:00:00', '2020-01-01 00:00:00',
 '2021-01-01 00:00:00', '2022-01-01 00:00:00', '2017-01-01 00:00:00',
 '2018-01-01 00:00:00', '1996-01-01 00:00:00', '1997-01-01 00:00:00',
 '1998-01-01 00:00:00',
 ...
 '1991-01-01 00:00:00', '1992-01-01 00:00:00', '1993-01-01 00:00:00',
 '1994-01-01 00:00:00', '1959-01-01 00:00:00', '1970-01-01 00:00:00',
 '1974-01-01 00:00:00', '1961-01-01 00:00:00', '1957-01-01 00:00:00',
 '1958-01-01 00:00:00']
Length: 408, dtype: datetime64[ns]

In [34]:
# extract year out
outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].replace("", 0).fillna(0).astype(int).astype(str)
outdf['in_ReportYearCV'].unique()

array(['0', '2019', '2020', '2021', '2022', '2017', '2018', '1996',
       '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2006',
       '2007', '2008', '2009', '2015', '2013', '2016', '2004', '2005',
       '2023', '2014', '2010', '2011', '2012', '1975', '1988', '1995',
       '1960', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1971', '1972', '1973', '1976', '1977', '1978', '1979',
       '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987',
       '1989', '1990', '1991', '1992', '1993', '1994', '1959', '1970',
       '1974', '1961', '1957', '1958'], dtype=object)

In [35]:
# Assign Primary Use Category

import sys
sys.path.append("C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory")
import AssignPrimaryUseCategoryFile # Use Custom import file

outdf['in_PrimaryUseCategory'] = outdf.apply(lambda row: AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory(row['in_BeneficialUseCategory']), axis=1)
outdf['in_PrimaryUseCategory'].unique()

array(['Public Supply', 'Domestic', 'Commercial/Industrial',
       'Agriculture Irrigation', 'Geothermal', 'Mining', 'Hydroelectric',
       'Municipal Irrigation'], dtype=object)

In [36]:
# Creating WaDE MethodUUID
# ----------------------------------------------------------------------------------------------------

mtDict = {"WaDE Blank" : "UTssps_M1",
          "Weir" : "UTssps_M2",
          "Estimated" : "UTssps_M3",
          "Calculated" : "UTssps_M4",
          "Metered" : "UTssps_M5"}

def fillInMethodUUID(valA):
    valA = str(valA).strip()
    outString = mtDict[valA]
    return outString

outdf['in_MethodUUID'] = outdf.apply(lambda row: fillInMethodUUID(row['in_MethodTypeCV']), axis=1)
outdf['in_MethodUUID'].unique()

array(['UTssps_M1', 'UTssps_M2', 'UTssps_M3', 'UTssps_M4', 'UTssps_M5'],
      dtype=object)

In [37]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    inV = str(inV).strip()
    inAIU = str(inAIU).strip()
    inPU = str(inPU).strip().title()
    inWST = str(inWST).strip()
    outString = inV + "_" + inAIU + "_" + inPU + "_" + inWST
    return outString

outdf['in_VariableSpecificCV'] = outdf.apply(lambda row: createVariableSpecificCV(row['in_VariableCV'], 
                                                                                  row['in_AggregationIntervalUnitCV'],
                                                                                  row['in_PrimaryUseCategory'],
                                                                                  row['in_WaterSourceTypeCV']), axis=1)
outdf['in_VariableSpecificCV'].unique()

array(['Withdrawal_Monthly_Public Supply_Surface Water',
       'Withdrawal_Monthly_Public Supply_Groundwater',
       'Withdrawal_Monthly_Domestic_Groundwater',
       'Withdrawal_Monthly_Commercial/Industrial_Groundwater',
       'Transfer Out_Monthly_Public Supply_Groundwater',
       'Transfer In_Monthly_Public Supply_Groundwater',
       'Withdrawal_Monthly_Domestic_Surface Water',
       'Transfer In_Monthly_Public Supply_Surface Water',
       'Withdrawal_Monthly_Agriculture Irrigation_Groundwater',
       'Transfer Out_Monthly_Public Supply_Surface Water',
       'Return_Monthly_Agriculture Irrigation_Surface Water',
       'Delivery_Monthly_Agriculture Irrigation_Surface Water',
       'Return_Monthly_Public Supply_Groundwater',
       'Delivery_Monthly_Public Supply_Surface Water',
       'Transfer Out_Monthly_Agriculture Irrigation_Surface Water',
       'Withdrawal_Monthly_Agriculture Irrigation_Surface Water',
       'Withdrawal_Monthly_Commercial/Industrial_Surface Water'

In [38]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_WaterSourceNativeID'], 
                                                                              row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wadeId1', 'wadeId2', 'wadeId3'], dtype=object)

In [39]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_Latitude'].astype(str) + dfTempID['in_Longitude'].astype(str) + dfTempID['in_SiteName'].astype(str)+ dfTempID['in_SiteTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_SiteNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip() + str(valC).strip() + str(valD).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_SiteNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_SiteNativeID'], 
                                                                       row['in_Latitude'], row['in_Longitude'],
                                                                       row['in_SiteName'], row['in_SiteTypeCV']), axis=1)
outdf['in_SiteNativeID'].unique()

array(['d10000001', 'd10000002', 'd108547239', ..., 'u12023', 'u12024',
       'u12026'], dtype=object)

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [40]:
# PoU Shapefile Data
# see above for input

print(len(dfPoUshapetemp))
dfPoUshapetemp.head()

1350


Unnamed: 0,FID_1,WRENAME,WRNAME,DWNAME,SYSTEMTYPE,WATERRESID,WRID,DWSYSNUM,WRLINK,WHOLESALER,LABEL,STATE,COUNTY,BASIN,SUBAREA,SUBAREANAM,LANUM,LANAME,ENDYEAR,DATASOURCE,SOURCEDATE,EDITOR,EDITDATE,STATUS,wade_cent_,wade_cent1,Shape_Leng,SHAPE_Le_1,Shape_Le_2,Shape_Area,geometry
0,1,Irontown,Irontown,Irontown,C,564,11358,UTAH11070,https://www.waterrights.utah.gov/asp_apps/view...,,,Utah,Iron,Cedar/Beaver,06-03-01,Escalante Desert,06-03-01a,Escalante Desert,2022,DWRe/Supplier,2021-09-08,ADAMCLARK,2021-11-19,Active,-113.44596,37.60243,6132.9725,6132.9725,0.06338,7e-05,"POLYGON ((-113.45270 37.60395, -113.45335 37.6..."
1,2,Mountain View SSD,Mountain View Special Service District,Mt. View Spec. Serv. Dist,C,533,11169,UTAH11037,https://www.waterrights.utah.gov/asp_apps/view...,,,Utah,Iron,Cedar/Beaver,06-02-02,Cedar,06-02-02a,Cedar,2022,,,ADAMCLARK,2021-11-19,Active,-113.18932,37.69904,3106.90342,3106.90342,0.03145,5e-05,"POLYGON ((-113.19375 37.70101, -113.19003 37.7..."
2,3,Park West Water Co.,Park West Culinary Water,Park West Water Company,NP,509,1195,UTAH11009,https://www.waterrights.utah.gov/asp_apps/view...,,,Utah,Iron,Cedar/Beaver,06-02-02,Cedar,06-02-02a,Cedar,2005,,,ADAMCLARK,2017-05-31,Inactive,-113.14798,37.7132,3769.21561,3769.21561,0.03821,9e-05,"POLYGON ((-113.15327 37.71748, -113.14286 37.7..."
3,4,Fifetown Water System,Central Iron County Water Conservancy District,Fifetown Water System,NP,559,11047,UTAH11065,https://www.waterrights.utah.gov/asp_apps/view...,,,Utah,Iron,Cedar/Beaver,06-02-02,Cedar,06-02-02a,Cedar,2005,,,ADAMCLARK,2017-05-31,Inactive,-113.10044,37.73981,2986.4397,2986.4397,0.02963,5e-05,"POLYGON ((-113.10386 37.74385, -113.09725 37.7..."
4,5,Eagle Valley Ranch,Eagle Valley Ranch,Eagle Valley Ranch,NP,551,11047,UTAH11056,https://www.waterrights.utah.gov/asp_apps/view...,,,Utah,Iron,Cedar/Beaver,06-02-02,Cedar,06-02-02a,Cedar,2005,,,ADAMCLARK,2017-06-01,Inactive,-113.08533,37.7776,6569.1712,6569.1712,0.06486,0.00013,"POLYGON ((-113.09064 37.76492, -113.09055 37.7..."


In [41]:
# create temp dataframe to hold native ID and geometry from shapefile input
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)

# assing values to temp dataframe based on shapefile input
# for in_SiteNativeID assure ID value is the same as that listed above for POU info.
dfPoUshape['in_SiteNativeID'] = "u" + dfPoUshapetemp['WRID'].astype(str)
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
print(len(dfPoUshape))
dfPoUshape.head()

1350


Unnamed: 0,in_SiteNativeID,geometry
0,u11358,"POLYGON ((-113.45270 37.60395, -113.45335 37.6..."
1,u11169,"POLYGON ((-113.19375 37.70101, -113.19003 37.7..."
2,u1195,"POLYGON ((-113.15327 37.71748, -113.14286 37.7..."
3,u11047,"POLYGON ((-113.10386 37.74385, -113.09725 37.7..."
4,u11047,"POLYGON ((-113.09064 37.76492, -113.09055 37.7..."


## Export Outputs

In [42]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206476 entries, 0 to 206475
Data columns (total 51 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   WaDEUUID                          206476 non-null  object        
 1   in_MethodUUID                     206476 non-null  object        
 2   in_MethodTypeCV                   206476 non-null  object        
 3   in_VariableSpecificUUID           206476 non-null  object        
 4   in_AggregationIntervalUnitCV      206476 non-null  object        
 5   in_VariableCV                     206476 non-null  object        
 6   in_OrganizationUUID               206476 non-null  object        
 7   in_Geometry                       206476 non-null  object        
 8   in_GNISFeatureNameCV              206476 non-null  object        
 9   in_WaterQualityIndicatorCV        206476 non-null  object        
 10  in_WaterSourceName              

In [43]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_MethodTypeCV,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_Amount,in_AllocationCropDutyAmount,in_AssociatedNativeAllocationIDs,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart,in_VariableSpecificCV
0,in10,UTssps_M1,WaDE Blank,,Monthly,Withdrawal,UTssps_O1,,,Fresh,Wade Blank,wadeId1,Surface Water,WaDE Blank,WaDE Blank,Washington,4326,,,,37.30895,-113.42877,,,POD,Oak Grove Spring Ws001,d10000001,,Spring,UT,,,,,Water Supplier,Leeds Domestic Water Users Association,,,,,,,0,,,Public Supply,0,,2000-01-31,2000-01-01,Withdrawal_Monthly_Public Supply_Surface Water
1,in11,UTssps_M1,WaDE Blank,,Monthly,Withdrawal,UTssps_O1,,,Fresh,Wade Blank,wadeId2,Groundwater,WaDE Blank,WaDE Blank,Washington,4326,,,,37.26708,-113.36336,,,POD,Leeds Well No 1 Ws002,d10000002,,Well,UT,,,,,Water Supplier,Leeds Domestic Water Users Association,,,,,,,0,,,Public Supply,0,,2000-01-31,2000-01-01,Withdrawal_Monthly_Public Supply_Groundwater
2,in12,UTssps_M1,WaDE Blank,,Monthly,Withdrawal,UTssps_O1,,,Fresh,Wade Blank,wadeId2,Groundwater,WaDE Blank,WaDE Blank,Washington,4326,,,,37.26668,-113.36341,,,POD,Leeds Well No4 Ws006,d108547239,,Well,UT,,,,,Water Supplier,Leeds Domestic Water Users Association,,,,,,,0,,,Public Supply,0,,2000-01-31,2000-01-01,Withdrawal_Monthly_Public Supply_Groundwater
3,in13,UTssps_M1,WaDE Blank,,Monthly,Withdrawal,UTssps_O1,,,Fresh,Wade Blank,wadeId2,Groundwater,WaDE Blank,WaDE Blank,Weber,4326,,,,41.16996,-112.00563,,,POD,5190S 1050W #1 Well Ws003,d10010001,,Well,UT,,,,,Water Supplier,Riverdale City,,,,,,,0,,,Public Supply,0,,2000-01-31,2000-01-01,Withdrawal_Monthly_Public Supply_Groundwater
4,in14,UTssps_M1,WaDE Blank,,Monthly,Withdrawal,UTssps_O1,,,Fresh,Wade Blank,wadeId2,Groundwater,WaDE Blank,WaDE Blank,Millard,4326,,,,38.96439,-112.38104,,,POD,Fc Well #2 Ws003,d10030001,,Well,UT,,,,,Water Supplier,Fillmore Municipal Water System,,,,,,,0,,,Public Supply,0,,2000-01-31,2000-01-01,Withdrawal_Monthly_Public Supply_Groundwater
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206471,in222509,UTssps_M5,Metered,,Annual,Consumptive Use,UTssps_O1,,,Fresh,Wade Blank,wadeId3,WaDE Blank,WaDE Blank,WaDE Blank,Garfield,4326,,,,,,,,POU,West Panguitch Irrigation And Reservoir Company,u12016,,Secondary,UT,,,,,Institutional,West Panguitch Irrigation and Reservoir Company,,,,,,,0,,,Municipal Irrigation,2023,,2023-12-31,2023-01-01,Consumptive Use_Annual_Municipal Irrigation_Wa...
206472,in222510,UTssps_M5,Metered,,Annual,Consumptive Use,UTssps_O1,,,Fresh,Wade Blank,wadeId3,WaDE Blank,WaDE Blank,WaDE Blank,Washington,4326,,,,,,,,POU,Virgin Irrigation Company,u12018,,Secondary,UT,,,,,Institutional,Virgin Irrigation Company,,,,,,,0,,,Municipal Irrigation,2023,,2023-12-31,2023-01-01,Consumptive Use_Annual_Municipal Irrigation_Wa...
206473,in222511,UTssps_M5,Metered,,Annual,Consumptive Use,UTssps_O1,,,Fresh,Wade Blank,wadeId3,WaDE Blank,WaDE Blank,WaDE Blank,Garfield,4326,,,,,,,,POU,Tropic Town Misc,u12023,,Mgt Plan,UT,,,,,Institutional,Tropic Town Misc,,,,,,,0,,,Municipal Irrigation,2023,,2023-12-31,2023-01-01,Consumptive Use_Annual_Municipal Irrigation_Wa...
206474,in222512,UTssps_M5,Metered,,Annual,Consumptive Use,UTssps_O1,,,Fresh,Wade Blank,wadeId3,WaDE Blank,WaDE Blank,WaDE Blank,Washington,4326,,,,,,,,POU,Zion Under Canvas,u12024,,Public,UT,,,,,Institutional,Zion Under Canvas,,,,,,,0,,,Municipal Irrigation,2023,,2023-12-31,2023-01-01,Consumptive Use_Annual_Municipal Irrigation_Wa...


In [44]:
# Export the output dataframe
outdf.to_csv('RawInputData/Pssps_Main.zip', compression=dict(method='zip', archive_name='Pssps_Main.csv'), index=False)  # The output, save as a zip
dfPoUshape.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.