# Preprocessing Utah Site Specific data for WaDEQA upload.
- Date Updated: 11/14/2021
- Purpose:  To preprocess the UDWRi and UDWRe data into one master file for simple DataFrame creation and extraction

Notes:
- Will treat UDWRi System + UDWRe data as POUs, and UDWRi Source data as PODs.
- For annual data, assume start = 01/01 & end =  12/31 for now.
- Seperate out water use System data data by customer type / benefical use (e.g. Domestic, Commerical, Industrial, Insitutional).  Do not use the Total fields.
- Seperate out the water use Source data by monthly and again by annual.
- create missing elements (water source type).

In [1]:
#Needed Libararies

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visulizaiton
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [2]:
# Working Directory
workingDir = "G:/Shared drives/WaDE Data/Utah/SS_PublicSupplyWaterUse/RawInputData"
os.chdir(workingDir)

## Place of Use Data

#### UDWRi_SystemData_PerUse_input

In [3]:
# Input File - UDWRi_SystemData_PerUse_input.csv
fileInput = "UDWRi_SystemData_PerUse_input.csv"
dfsyspu = pd.read_csv(fileInput)

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfsyspu:
    dfsyspu['WaDEUUID'] = "utsyspu" + dfsyspu.index.astype(str)
    dfsyspu.to_csv('UDWRi_SystemData_PerUse_input.csv', index=False)

print(len(dfsyspu))
dfsyspu.head(1)

11070


Unnamed: 0,System ID,System Name,System Type,System Status,History Year,Date Received,County,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,sumUse?,Total Use,Method of Measurement,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections,Use Cooling Percent,Use Process Percent,Use Domestic Percent,Use Miscellaneous Percent,Irrigation (Lawn and Garden) Percent,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,Peak Date,Peak Demand,Peak Demand Units,Peak Use Include,Peak Measurement Type,Peak Wholesale Volume,Peak Wholesale Volume Units,Secondary Domestic Percent,Secondary Industrial Percent,Secondary Commercial Percent,Secondary Institutional Percent,Secondary Agriculture Percent,Secondary Domestic Connections,Secondary Industrial Connections,Secondary Commercial Connections,Secondary Institutional Connections,Secondary Agriculture Connections,Secondary Irrigation (Lawn and Garden) Acres,Secondary Irrigation (Agriculture) Acres,Secondary Metered Domestic Connections,Secondary Metered Industrial Connections,Secondary Metered Commercial Connections,Secondary Metered Institutional Connections,Secondary Metered Agriculture Connections,Secondary Metered Domestic Use,Secondary Metered Commercial Use,Secondary Metered Industrial Use,Secondary Metered Institutional Use,Secondary Metered Agriculture Use,System Comments,WaDEUUID
0,11843,Pinecrest Pipeline Operating Company,Public,Active,2021,6/15/2022,Salt Lake,18129,30,4724846,-,-,-,4724846.0,4724846.0,estimate,28,0,0,0.0,28,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,utsys21599


In [4]:
# clean up input data
# ---------------------------

#active recods only
dfsyspu = dfsyspu[dfsyspu['System Status'] == "Active"]

# drop rows with a null Year value
dfsyspu = dfsyspu.dropna(subset=['History Year'])

# Adjust data type of fields
dfsyspu['Year'] = dfsyspu['History Year'].astype(int)

print(len(dfsyspu))

11070


In [5]:
useTypeList = ['Domestic', 'Commercial', 'Industrial', 'Institutional']

In [6]:
%%time

dfsyspuOut = pd.DataFrame()

for x in useTypeList:
    useTypeString = str(x)
    print(useTypeString)
    
    dftemp = pd.DataFrame(index=dfsyspu.index)
    # Variable Info
    dftemp['in_VariableCV'] = "Delivered Water Use"
    dftemp['in_VariableSpecificCV'] = "Delivered Water Use_Annual_" + useTypeString
    
    # SiteVariableAmounts_fact Info
    amountUseStr = useTypeString + " Use"
    dftemp['in_Amount'] = dfsyspu[amountUseStr]
    dftemp['in_PopulationServed'] = dfsyspu['Domestic Connections']
    dftemp['in_BenUse'] = useTypeString
#     dftemp['in_CustomerTypeCV'] = useTypeString
    dftemp['in_ReportYearCV'] = dfsyspu['History Year']
    dftemp['in_TimeframeEnd'] = '12/31/' + dfsyspu['History Year'].astype(str)
    dftemp['in_TimeframeStart'] = '01/01/' + dfsyspu['History Year'].astype(str)
    
    # Water Source Info
    dftemp['in_WaterSourceTypeCV'] = "Unspecified"
    
    # link to site data
    dftemp['linkKey'] = dfsyspu['System ID']
    
    dfsyspuOut = pd.concat([dfsyspuOut, dftemp])   

print(len(dfsyspuOut))
dfsyspuOut.head(1)

Domestic
Commercial
Industrial
Institutional
44280
Wall time: 54 ms


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_Amount,in_PopulationServed,in_BenUse,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceTypeCV,linkKey
0,Delivered Water Use,Delivered Water Use_Annual_Domestic,4724846,28,Domestic,2021,12/31/2021,01/01/2021,Unspecified,11843


#### UDWRi_SystemData_Total_no0Null_input

In [7]:
# Input File - UDWRi_SystemData_Total_no0Null_input.csv
fileInput = "UDWRi_SystemData_Total_no0Null_input.csv"
dfsyst = pd.read_csv(fileInput)

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfsyst:
    dfsyst['WaDEUUID'] = "utsyst" + dfsyst.index.astype(str)
    dfsyst.to_csv('UDWRi_SystemData_Total_no0Null_input.csv', index=False)

print(len(dfsyst))
dfsyst.head(1)

3650


Unnamed: 0,System ID,System Name,System Type,System Status,History Year,Date Received,County,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,sumUse?,Total Use,Method of Measurement,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections,Use Cooling Percent,Use Process Percent,Use Domestic Percent,Use Miscellaneous Percent,Irrigation (Lawn and Garden) Percent,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,Peak Date,Peak Demand,Peak Demand Units,Peak Use Include,Peak Measurement Type,Peak Wholesale Volume,Peak Wholesale Volume Units,Secondary Domestic Percent,Secondary Industrial Percent,Secondary Commercial Percent,Secondary Institutional Percent,Secondary Agriculture Percent,Secondary Domestic Connections,Secondary Industrial Connections,Secondary Commercial Connections,Secondary Institutional Connections,Secondary Agriculture Connections,Secondary Irrigation (Lawn and Garden) Acres,Secondary Irrigation (Agriculture) Acres,Secondary Metered Domestic Connections,Secondary Metered Industrial Connections,Secondary Metered Commercial Connections,Secondary Metered Institutional Connections,Secondary Metered Agriculture Connections,Secondary Metered Domestic Use,Secondary Metered Commercial Use,Secondary Metered Industrial Use,Secondary Metered Institutional Use,Secondary Metered Agriculture Use,System Comments,WaDEUUID
0,11157,Twin Oaks Local District,Public,Active,2011,1/30/2012,Sanpete,20014,101,0.0,0.0,0.0,0.0,0.0,4139312.0,,21,0,0,3.0,24,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Emailed David Asay with some questions on 1/30...,utsys18949


In [8]:
# clean up input data
# ---------------------------

#active recods only
dfsyst = dfsyst[dfsyst['System Status'] == "Active"]

# drop rows with a null Year value
dfsyst = dfsyst.dropna(subset=['History Year'])

# Adjust data type of fields
dfsyst['Year'] = dfsyst['History Year'].astype(int)

print(len(dfsyst))

3650


In [9]:
useTypeList = ['Total Use']

In [10]:
%%time

dfsystOut = pd.DataFrame()

for x in useTypeList:
    useTypeString = str(x)
    print(useTypeString)
    
    dftemp = pd.DataFrame(index=dfsyst.index)
    # Variable Info
    dftemp['in_VariableCV'] = "Delivered Water Use"
    dftemp['in_VariableSpecificCV'] = "Delivered Water Use_Annual_DCII"
    
    # SiteVariableAmounts_fact Info
    amountUseStr = useTypeString
    dftemp['in_Amount'] = dfsyst[amountUseStr]
    dftemp['in_PopulationServed'] = dfsyst['Domestic Connections']
    dftemp['in_BenUse'] = "DCII"
#     dftemp['in_CustomerTypeCV'] = "DCII"
    dftemp['in_ReportYearCV'] = dfsyst['History Year']
    dftemp['in_TimeframeEnd'] = '12/31/' + dfsyst['History Year'].astype(str)
    dftemp['in_TimeframeStart'] = '01/01/' + dfsyst['History Year'].astype(str)
    
    # Water Source Info
    dftemp['in_WaterSourceTypeCV'] = "Unspecified"
    
    # link to site data
    dftemp['linkKey'] = dfsyst['System ID']
    
    dfsystOut = pd.concat([dfsystOut, dftemp])   

print(len(dfsystOut))
dfsystOut.head(1)

Total Use
3650
Wall time: 8 ms


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_Amount,in_PopulationServed,in_BenUse,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceTypeCV,linkKey
0,Delivered Water Use,Delivered Water Use_Annual_DCII,4139312.0,21,DCII,2011,12/31/2011,01/01/2011,Unspecified,11157


#### UDWRe Culinary Service Area Data
- Need to tie the UDWRi system data to the UDWRe site info

In [11]:
# PoU Shapefile Data
# Shapefile input
ShapeFileInput = gpd.read_file('shapefile/CulinaryWaterServiceAreas.shp')
dfcsa = pd.DataFrame(ShapeFileInput)
dfcsa.head(3)

Unnamed: 0,FID_1,WRENAME,WRNAME,DWNAME,SYSTEMTYPE,WATERRESID,WRID,DWSYSNUM,WRLINK,WHOLESALER,LABEL,STATE,COUNTY,BASIN,SUBAREA,SUBAREANAM,LANUM,LANAME,ENDYEAR,DATASOURCE,SOURCEDATE,EDITOR,EDITDATE,STATUS,Shape_Leng,Shape_Area,Latitude,Longitude,geometry
0,1,Irontown,Irontown,Irontown,C,564,11358,UTAH11070,https://www.waterrights.utah.gov/asp_apps/view...,,,Utah,Iron,Cedar/Beaver,06-03-01,Escalante Desert,06-03-01a,Escalante Desert,2021,DWRe/Supplier,2021-09-08,ADAMCLARK,2021-11-19,Active,0.063384,7.1e-05,37.602435,-113.44596,"POLYGON ((-113.45270 37.60395, -113.45335 37.6..."
1,2,Mountain View SSD,Mountain View Special Service District,Mt. View Spec. Serv. Dist,C,533,11169,UTAH11037,https://www.waterrights.utah.gov/asp_apps/view...,,,Utah,Iron,Cedar/Beaver,06-02-02,Cedar,06-02-02a,Cedar,2021,,,ADAMCLARK,2021-11-19,Active,0.031452,4.9e-05,37.699043,-113.189322,"POLYGON ((-113.18669 37.70173, -113.18473 37.7..."
2,3,Park West Water Co.,Park West Culinary Water,Park West Water Company,NP,509,1195,UTAH11009,https://www.waterrights.utah.gov/asp_apps/view...,,,Utah,Iron,Cedar/Beaver,06-02-02,Cedar,06-02-02a,Cedar,2005,,,ADAMCLARK,2017-05-31,Inactive,0.038206,9.1e-05,37.713197,-113.147984,"POLYGON ((-113.15327 37.71748, -113.14286 37.7..."


In [12]:
# # Input File - StreamGageGetStationList.csv
# fileInput = "UDWRe_CulinaryWaterServiceAreas_input.csv"
# dfcsa = pd.read_csv(fileInput)

# # WaDE UUID tracker for data assessment
# if 'WaDEUUID' not in dfcsa:
#     dfcsa['WaDEUUID'] = "utcsa" + dfcsa.index.astype(str)
#     dfcsa.to_csv('UDWRe_CulinaryWaterServiceAreas_input.csv', index=False)

# print(len(dfcsa))
# dfcsa.head(1)

In [13]:
# Output dataframe for Culinary Service Area Data

dfcsa_Out = pd.DataFrame(index=dfcsa.index)
dfcsa_Out['linkKey'] = dfcsa['WRID']

# Site Info
dfcsa_Out['in_CoordinateMethodCV'] = "Centroid of Area"
dfcsa_Out['in_County'] = dfcsa['COUNTY']
dfcsa_Out['in_Latitude'] = dfcsa['Latitude'].astype(float)
dfcsa_Out['in_Longitude'] = dfcsa['Longitude'].astype(float)
dfcsa_Out['in_PODorPOUSite'] = "POU"
dfcsa_Out['in_SiteName'] = dfcsa['WRENAME']
dfcsa_Out['in_SiteNativeID'] = "POU" + dfcsa['WRID'].astype(str)
dfcsa_Out['in_SiteTypeCV'] = "Unspecified"

dfcsa_Out = dfcsa_Out.replace(np.nan, "").drop_duplicates().reset_index(drop=True)
print(len(dfcsa_Out))
dfcsa_Out.head(1)

1311


Unnamed: 0,linkKey,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV
0,11358,Centroid of Area,Iron,37.602435,-113.44596,POU,Irontown,POU11358,Unspecified


In [14]:
# Merge site data to System_Per Use data
dfsyspuOut = pd.merge(dfsyspuOut, dfcsa_Out, on='linkKey', how='left')
dfsyspuOut = dfsyspuOut.replace(np.nan, "").drop_duplicates().reset_index(drop=True)
print(len(dfsyspuOut))
dfsyspuOut.head()

45512


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_Amount,in_PopulationServed,in_BenUse,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceTypeCV,linkKey,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV
0,Delivered Water Use,Delivered Water Use_Annual_Domestic,4724846,28,Domestic,2021,12/31/2021,01/01/2021,Unspecified,11843,,,,,,,,
1,Delivered Water Use,Delivered Water Use_Annual_Domestic,1228800,14,Domestic,2021,12/31/2021,01/01/2021,Unspecified,11830,Centroid of Area,Millard,38.820959,-112.405148,POU,Kanosh Band Of Paiute Indians,POU11830,Unspecified
2,Delivered Water Use,Delivered Water Use_Annual_Domestic,2443886,15,Domestic,2021,12/31/2021,01/01/2021,Unspecified,11821,,,,,,,,
3,Delivered Water Use,Delivered Water Use_Annual_Domestic,3360055,24,Domestic,2021,12/31/2021,01/01/2021,Unspecified,11815,Centroid of Area,Iron,37.559081,-113.225921,POU,Harmony Mtn Ranches,POU11815,Unspecified
4,Delivered Water Use,Delivered Water Use_Annual_Domestic,-,0,Domestic,2021,12/31/2021,01/01/2021,Unspecified,11814,Centroid of Area,Weber,41.360983,-111.896901,POU,Camps Ben Lomond - Shawnee,POU11814,Unspecified


In [15]:
# Merge site data to System_Total data
dfsystOut = pd.merge(dfsystOut, dfcsa_Out, on='linkKey', how='left')
dfsystOut = dfsystOut.replace(np.nan, "").drop_duplicates().reset_index(drop=True)
print(len(dfsystOut))
dfsystOut.head()

3699


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_Amount,in_PopulationServed,in_BenUse,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceTypeCV,linkKey,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV
0,Delivered Water Use,Delivered Water Use_Annual_DCII,4139312.0,21,DCII,2011,12/31/2011,01/01/2011,Unspecified,11157,Centroid of Area,Sanpete,39.500479,-111.416612,POU,Twin Oaks Local District,POU11157,Unspecified
1,Delivered Water Use,Delivered Water Use_Annual_DCII,236815784.2,0,DCII,2012,12/31/2012,01/01/2012,Unspecified,11094,,,,,,,,
2,Delivered Water Use,Delivered Water Use_Annual_DCII,236492213.8,0,DCII,2014,12/31/2014,01/01/2014,Unspecified,11094,,,,,,,,
3,Delivered Water Use,Delivered Water Use_Annual_DCII,234893260.8,0,DCII,2011,12/31/2011,01/01/2011,Unspecified,11094,,,,,,,,
4,Delivered Water Use,Delivered Water Use_Annual_DCII,229070295.8,0,DCII,2010,12/31/2010,01/01/2010,Unspecified,11094,,,,,,,,


In [16]:
# Concatenate all System Data into one long dataframe.
frames = [dfsyspuOut, dfsystOut]
dfsysOut = pd.concat(frames)

dfsysOut = dfsysOut.replace(np.nan, "").drop_duplicates().reset_index(drop=True)
print(len(dfsysOut))
dfsysOut.head(1)

49211


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_Amount,in_PopulationServed,in_BenUse,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceTypeCV,linkKey,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV
0,Delivered Water Use,Delivered Water Use_Annual_Domestic,4724846,28,Domestic,2021,12/31/2021,01/01/2021,Unspecified,11843,,,,,,,,


## UDWRi Source (POD) Data

#### UDWRi_SourceData_Monthly_input

In [17]:
# Input File - UDWRi_SourceData_Monthly_input.csv
fileInput = "UDWRi_SourceData_Monthly_input.csv"
dfsourm = pd.read_csv(fileInput)

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfsourm:
    dfsourm['WaDEUUID'] = "utsourm" + dfsourm.index.astype(str)
    dfsourm.to_csv('UDWRi_SourceData_Monthly_input.csv', index=False)

print(len(dfsourm))
dfsourm.head(1)

44002


  dfsourm = pd.read_csv(fileInput)


Unnamed: 0,Source ID,Source Name,Source Status,System ID,System Name,Source Type,Diversion Type,Use Type,Year,Units,Method of Measurement,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,sumMonthly?,Total,Lat NAD83,Lon NAD83,WaDEUUID
0,108546750,"Delivered to Utah Iron, LLC Mine",Active,11047,Central Iron County Water Conservancy District,,Transfer Out,Water Supplier,2021,gallons,Individual Meter,7529860.0,8355560.0,11315000.0,9358720.0,11353730.0,11240990.0,12543470.0,11926400.0,12702750.0,11609960.0,11772120.0,12013090.0,131721650.0,131721650.0,40.783814,-111.796239,utsour52585


In [18]:
# clean up input data
# ---------------------------

#active recods only
dfsourm = dfsourm[dfsourm['Source Status'] == "Active"]

# drop rows with a null Year value
dfsourm = dfsourm.dropna(subset=['Year'])

# Adjust data type of fields
dfsourm['Year'] = dfsourm['Year'].astype(int)

print(len(dfsourm))

44002


In [19]:
# Loop data list.  Use this to search for specific fields.
variableTypeList = ['Withdrawal', 'Transfer In', 'Transfer Out', 'Delivery', 'Return']

useTypeList = ['Industrial', 'Irrigation', 'Domestic', 'Commercial', 'Geothermal', 'Agricultural', 'Mining', 
               'Power (Fossil-Fuel)', 'Power (Geothermal)', 'Power (Hydro-Elec)', 'Sewage Treatment', 'Water Supplier']

monthUseList = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
startDateList = ["01/01/", "02/01/", "03/01/", "04/01/", "05/01/", "06/01/", "07/01/", "08/01/", "09/01/", "10/01/", "11/01/", "12/01/"]
endDateList = ["01/31/", "02/28/", "03/31/", "04/30/", "05/31/", "06/30/", "07/31/", "08/31/", "09/30/", "10/31/", "11/30/", "12/31/"]

In [20]:
# create time series data

# the output dataframe
dfsourm_Out = pd.DataFrame()

# for each value in variableTypeList
for x in variableTypeList:
    
    # trim down input data to match variable Type
    variableTypeStr = str(x)
    print(variableTypeStr) 
    dfsourm2 = dfsourm.copy()
    dfsourm2 = dfsourm2[dfsourm2['Diversion Type'] == variableTypeStr].reset_index(drop=True)
    print(f"length of dfsour2 ", variableTypeStr, " is ", len(dfsourm2))
          
    # for each value in useTypeList
    for y in useTypeList:
        
        # trim down input data to match Use Type
        useTypeString = str(y)
        print(f"..", useTypeString)
        dfsourm3 = dfsourm2.copy()
        dfsourm3 = dfsourm3[dfsourm3['Use Type'] == useTypeString].reset_index(drop=True)
        print(f"..length of df ", useTypeString, " is ", len(dfsourm3))
    
        # Create Output for that Use Type
        dftemp2 = pd.DataFrame(index=dfsourm3.index)
    
        # Variable Info
        dftemp2['in_VariableCV'] = dfsourm3['Diversion Type'].astype(str)
        dftemp2['in_VariableSpecificCV'] = dfsourm3['Diversion Type'].astype(str) + "_Monthly_" + useTypeString
        
        # SiteVariableAmounts_fact Info
        dftemp2['in_PopulationServed'] = ""
        dftemp2['in_BenUse'] = dfsourm3['Use Type']
#         dftemp2['in_CustomerTypeCV'] = dfsourm3['Use Type']
        dftemp2['in_ReportYearCV'] = dfsourm3['Year']
    
        # Water Source Info
        dftemp2['in_WaterSourceTypeCV'] = dfsourm3['Source Type']
    
        # Site Info
        dftemp2['in_CoordinateMethodCV'] = "Representation Node"
        dftemp2['in_Latitude'] = dfsourm3['Lat NAD83'].astype(float)
        dftemp2['in_Longitude'] = dfsourm3['Lon NAD83'].astype(float)
        dftemp2['in_PODorPOUSite'] = "POD"
        dftemp2['in_SiteName'] = dfsourm3['Source Name']
        dftemp2['in_SiteNativeID'] = "POD" + dfsourm3['Source ID'].astype(str)
        dftemp2['in_SiteTypeCV'] = dfsourm3['Source Type']
        
        # link to other sites
        dftemp2['linkKey'] = dfsourm3['System ID']
        
        # for each value in monthUseList
        for z in range(len(monthUseList)):
            monthUseString = str(monthUseList[z])
            startDateString = str(startDateList[z])
            endDateString = str(endDateList[z])
            dftemp3 = dftemp2.copy()
            dftemp3['monthCheck'] = monthUseString
            dftemp3['in_Amount'] = dfsourm3[monthUseList[z]] # insert column name here
            dftemp3['in_TimeframeStart'] = startDateString + dfsourm3['Year'].astype(str)
            dftemp3['in_TimeframeEnd'] = endDateString + dfsourm3['Year'].astype(str)
            dfsourm_Out = pd.concat([dfsourm_Out, dftemp3])

print(len(dfsourm_Out))

Withdrawal
length of dfsour2  Withdrawal  is  37920
.. Industrial
..length of df  Industrial  is  3192
.. Irrigation
..length of df  Irrigation  is  3519
.. Domestic
..length of df  Domestic  is  867
.. Commercial
..length of df  Commercial  is  496
.. Geothermal
..length of df  Geothermal  is  113
.. Agricultural
..length of df  Agricultural  is  1627
.. Mining
..length of df  Mining  is  1840
.. Power (Fossil-Fuel)
..length of df  Power (Fossil-Fuel)  is  576
.. Power (Geothermal)
..length of df  Power (Geothermal)  is  241
.. Power (Hydro-Elec)
..length of df  Power (Hydro-Elec)  is  0
.. Sewage Treatment
..length of df  Sewage Treatment  is  0
.. Water Supplier
..length of df  Water Supplier  is  25449
Transfer In
length of dfsour2  Transfer In  is  3664
.. Industrial
..length of df  Industrial  is  201
.. Irrigation
..length of df  Irrigation  is  378
.. Domestic
..length of df  Domestic  is  70
.. Commercial
..length of df  Commercial  is  24
.. Geothermal
..length of df  Geother

#### UDWRi_SourceData_Annual_no0Null_input

In [21]:
# Input File - UDWRi_SourceData_Annual_no0Null_input.csv
fileInput = "UDWRi_SourceData_Annual_no0Null_input.csv"
dfsoura = pd.read_csv(fileInput)

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfsoura:
    dfsoura['WaDEUUID'] = "utsoura" + dfsoura.index.astype(str)
    dfsoura.to_csv('UDWRi_SourceData_Annual_no0Null_input.csv', index=False)

print(len(dfsoura))
dfsoura.head(1)

8581


Unnamed: 0,Source ID,Source Name,Source Status,System ID,System Name,Source Type,Diversion Type,Use Type,Year,Units,Method of Measurement,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,sumMonthly,Total,Lat NAD83,Lon NAD83,WaDEUUID
0,108546815,Delivered to Newton Town Sprinkler Co.,Active,1371,Newton Town Water,Spring,Transfer Out,Irrigation,,,,,,,,,,,,,,,,0,,40.783814,-111.796239,utsour52588


In [22]:
# clean up input data
# ---------------------------

#active recods only
dfsoura = dfsoura[dfsoura['Source Status'] == "Active"]

# drop rows with a null Year value
dfsoura = dfsoura.dropna(subset=['Year'])

# Adjust data type of fields
dfsoura['Year'] = dfsoura['Year'].astype(int)

print(len(dfsoura))

8484


In [23]:
# Loop data list.  Use this to search for specific fields.
variableTypeList = ['Withdrawal', 'Transfer In', 'Transfer Out', 'Delivery', 'Return']

useTypeList = ['Industrial', 'Irrigation', 'Domestic', 'Commercial', 'Geothermal', 'Agricultural', 'Mining', 
               'Power (Fossil-Fuel)', 'Power (Geothermal)', 'Power (Hydro-Elec)', 'Sewage Treatment', 'Water Supplier']

monthUseList = ['Total']

In [24]:
# create time series data

# the output dataframe
dfsoura_Out = pd.DataFrame()

# for each value in variableTypeList
for x in variableTypeList:
    
    # trim down input data to match variable Type
    variableTypeStr = str(x)
    print(variableTypeStr) 
    dfsoura2 = dfsoura.copy()
    dfsoura2 = dfsoura2[dfsoura2['Diversion Type'] == variableTypeStr].reset_index(drop=True)
    print(f"length of dfsour2 ", variableTypeStr, " is ", len(dfsoura2))
          
    # for each value in useTypeList
    for y in useTypeList:
        
        # trim down input data to match Use Type
        useTypeString = str(y)
        print(f"..", useTypeString)
        dfsoura3 = dfsoura2.copy()
        dfsoura3 = dfsoura3[dfsoura3['Use Type'] == useTypeString].reset_index(drop=True)
        print(f"..length of df ", useTypeString, " is ", len(dfsoura3))
    
        # Create Output for that Use Type
        dftemp2 = pd.DataFrame(index=dfsoura3.index)
    
        # Variable Info
        dftemp2['in_VariableCV'] = dfsoura3['Diversion Type'].astype(str)
        dftemp2['in_VariableSpecificCV'] = dfsoura3['Diversion Type'].astype(str) + "_Annual_" + useTypeString
        
        # SiteVariableAmounts_fact Info
        dftemp2['in_PopulationServed'] = ""
        dftemp2['in_BenUse'] = dfsoura3['Use Type']
#         dftemp2['in_CustomerTypeCV'] = dfsoura3['Use Type']
        dftemp2['in_ReportYearCV'] = dfsoura3['Year']
    
        # Water Source Info
        dftemp2['in_WaterSourceTypeCV'] = dfsoura3['Source Type']
    
        # Site Info
        dftemp2['in_CoordinateMethodCV'] = "Representation Node"
        dftemp2['in_Latitude'] = dfsoura3['Lat NAD83'].astype(float)
        dftemp2['in_Longitude'] = dfsoura3['Lon NAD83'].astype(float)
        dftemp2['in_PODorPOUSite'] = "POD"
        dftemp2['in_SiteName'] = dfsoura3['Source Name']
        dftemp2['in_SiteNativeID'] = "POD" + dfsoura3['Source ID'].astype(str)
        dftemp2['in_SiteTypeCV'] = dfsoura3['Source Type']
        
        # link to other sites
        dftemp2['linkKey'] = dfsoura3['System ID']
        
        # for each value in monthUseList
        for z in range(len(monthUseList)):
            monthUseString = str(monthUseList[z])
            startDateString = str(startDateList[z])
            endDateString = str(endDateList[z])
            dftemp3 = dftemp2.copy()
            dftemp3['monthCheck'] = monthUseString
            dftemp3['in_Amount'] = dfsoura3[monthUseList[z]] # insert column name here
            dftemp3['in_TimeframeEnd'] = '12/31/' + dfsoura3['Year'].astype(str)
            dftemp3['in_TimeframeStart'] = '01/01/' + dfsoura3['Year'].astype(str)
            dfsoura_Out = pd.concat([dfsoura_Out, dftemp3])

print(len(dfsoura_Out))

Withdrawal
length of dfsour2  Withdrawal  is  7363
.. Industrial
..length of df  Industrial  is  698
.. Irrigation
..length of df  Irrigation  is  758
.. Domestic
..length of df  Domestic  is  153
.. Commercial
..length of df  Commercial  is  127
.. Geothermal
..length of df  Geothermal  is  69
.. Agricultural
..length of df  Agricultural  is  245
.. Mining
..length of df  Mining  is  272
.. Power (Fossil-Fuel)
..length of df  Power (Fossil-Fuel)  is  53
.. Power (Geothermal)
..length of df  Power (Geothermal)  is  18
.. Power (Hydro-Elec)
..length of df  Power (Hydro-Elec)  is  0
.. Sewage Treatment
..length of df  Sewage Treatment  is  0
.. Water Supplier
..length of df  Water Supplier  is  4970
Transfer In
length of dfsour2  Transfer In  is  539
.. Industrial
..length of df  Industrial  is  15
.. Irrigation
..length of df  Irrigation  is  30
.. Domestic
..length of df  Domestic  is  10
.. Commercial
..length of df  Commercial  is  2
.. Geothermal
..length of df  Geothermal  is  0
..

In [25]:
# Concatenate all Source Data into one long dataframe.
frames = [dfsourm_Out, dfsoura_Out]
dfsour_Out = pd.concat(frames)

dfsour_Out = dfsour_Out.replace(np.nan, "").drop_duplicates().reset_index(drop=True)
print(len(dfsour_Out))
dfsour_Out.head(1)

536508


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_PopulationServed,in_BenUse,in_ReportYearCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,linkKey,monthCheck,in_Amount,in_TimeframeStart,in_TimeframeEnd
0,Withdrawal,Withdrawal_Monthly_Industrial,,Industrial,2021,Stream,Representation Node,40.132955,-110.001197,POD,Pleasant Valley Pump Station,POD108546294,Stream,11787,Jan,-,01/01/2021,01/31/2021


# Concatenate System Data (POUs) with Source Data (PODs).

In [26]:
# Concatenate Source Data into one long dataframe.
frames = [dfsysOut, dfsour_Out]
dfout = pd.concat(frames)

dfout = dfout.replace(np.nan, "").drop_duplicates().reset_index(drop=True)
print(len(dfout))
dfout.head(1)

585719


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_Amount,in_PopulationServed,in_BenUse,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceTypeCV,linkKey,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,monthCheck
0,Delivered Water Use,Delivered Water Use_Annual_Domestic,4724846,28,Domestic,2021,12/31/2021,01/01/2021,Unspecified,11843,,,,,,,,,


# Clean data

In [27]:
# fix blank / null WaterSourcetypeCV
# simplify to WaDE specific categories

wsTypeDict = {
    "Well" : "Groundwater",
    "Well/Spring" : "Groundwater",
    "Well Field" : "Groundwater",
    "Well/Stream" : "Groundwater",
    "Tunnel" : "Groundwater",
    "Drain" : "Groundwater",
    "Stream" : "Surface Water",
    "Spring" : "Surface Water",
    "Reservoir" : "Surface Water",
    "Lake" : "Surface Water"}

def fixWaterSourceTypeCV(valA):
    valA = str(valA).strip()
    if valA == "" or pd.isnull(valA):
        outString = "Unspecified"
    else:
        try:
            outString = wsTypeDict[valA]
        except:
            outString = "Unspecified"
    return outString

dfout['in_WaterSourceTypeCV'] = dfout.apply(lambda row: fixWaterSourceTypeCV(row['in_WaterSourceTypeCV']), axis=1)
dfout['in_WaterSourceTypeCV'].unique()

array(['Unspecified', 'Surface Water', 'Groundwater'], dtype=object)

In [28]:
# Convert History Year to string.

dfout['in_TimeframeEnd'] = pd.to_datetime(dfout['in_TimeframeEnd'], errors = 'coerce')
dfout['in_TimeframeEnd'] = pd.to_datetime(dfout["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))

dfout['in_TimeframeStart'] = pd.to_datetime(dfout['in_TimeframeStart'], errors = 'coerce')
dfout['in_TimeframeStart'] = pd.to_datetime(dfout["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))

dfout.tail(1)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_Amount,in_PopulationServed,in_BenUse,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceTypeCV,linkKey,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,monthCheck
585718,Return,Return_Annual_Power (Geothermal),-,,Power (Geothermal),2019,2019-12-31,2019-01-01,Groundwater,2715,Representation Node,,38.568977,-112.58701,POD,Geothermal Injection Well 61-12,POD108543012,Well,Total


In [29]:
# Converting in_PopulationServed to int value.
dfout['in_PopulationServed'] = dfout['in_PopulationServed'].replace(np.nan, 0, regex=True)
dfout['in_PopulationServed'] = dfout['in_PopulationServed'].replace('', 0, regex=True)
dfout['in_PopulationServed'] = dfout['in_PopulationServed'].astype(int)

dfout.head(1)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_Amount,in_PopulationServed,in_BenUse,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceTypeCV,linkKey,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,monthCheck
0,Delivered Water Use,Delivered Water Use_Annual_Domestic,4724846,28,Domestic,2021,2021-12-31,2021-01-01,Unspecified,11843,,,,,,,,,


In [30]:
# Converting Latitude & Longitude to float values
dfout['in_Latitude'] = dfout['in_Latitude'].replace(np.nan, 0, regex=True)
dfout['in_Latitude'] = dfout['in_Latitude'].replace('', 0, regex=True)
dfout['in_Latitude'] = dfout['in_Latitude'].astype(float)

dfout['in_Longitude'] = dfout['in_Longitude'].replace(np.nan, 0, regex=True)
dfout['in_Longitude'] = dfout['in_Longitude'].replace('', 0, regex=True)
dfout['in_Longitude'] = dfout['in_Longitude'].astype(float)

dfout.head(1)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_Amount,in_PopulationServed,in_BenUse,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceTypeCV,linkKey,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,monthCheck
0,Delivered Water Use,Delivered Water Use_Annual_Domestic,4724846,28,Domestic,2021,2021-12-31,2021-01-01,Unspecified,11843,,,0.0,0.0,,,,,


In [31]:
# Converting in_Amount to float values
# issue of some values are strings.
dfout['in_Amount'] = dfout['in_Amount'].replace('-','', regex=True)
dfout['in_Amount'] = dfout['in_Amount'].replace(',','', regex=True)
dfout['in_Amount'] = dfout['in_Amount'].replace('FALSE','', regex=True)
dfout['in_Amount'] = dfout['in_Amount'].str.strip()
dfout['in_Amount'] = pd.to_numeric(dfout['in_Amount'])

dfout['in_Amount'] = dfout['in_Amount'].astype(float)

dfout.head(1)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_Amount,in_PopulationServed,in_BenUse,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceTypeCV,linkKey,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,monthCheck
0,Delivered Water Use,Delivered Water Use_Annual_Domestic,4724846.0,28,Domestic,2021,2021-12-31,2021-01-01,Unspecified,11843,,,0.0,0.0,,,,,


## WaDE Custom Elements (due to missing info)

In [33]:
# updaing VariableSpecificCV to include a water source type

def updateVariableSpecificCV(valA, valB):
    valA = str(valA).strip()
    valB = str(valB).strip()       
    outString = valA + "_" + valB
    return outString

dfout['in_VariableSpecificCV'] = dfout.apply(lambda row: updateVariableSpecificCV(row['in_VariableSpecificCV'], row['in_WaterSourceTypeCV']), axis=1)
dfout['in_VariableSpecificCV'].unique()

array(['Delivered Water Use_Annual_Domestic_Unspecified',
       'Delivered Water Use_Annual_Commercial_Unspecified',
       'Delivered Water Use_Annual_Industrial_Unspecified',
       'Delivered Water Use_Annual_Institutional_Unspecified',
       'Delivered Water Use_Annual_DCII_Unspecified',
       'Withdrawal_Monthly_Industrial_Surface Water',
       'Withdrawal_Monthly_Industrial_Groundwater',
       'Withdrawal_Monthly_Industrial_Unspecified',
       'Withdrawal_Monthly_Irrigation_Surface Water',
       'Withdrawal_Monthly_Irrigation_Groundwater',
       'Withdrawal_Monthly_Irrigation_Unspecified',
       'Withdrawal_Monthly_Domestic_Groundwater',
       'Withdrawal_Monthly_Domestic_Surface Water',
       'Withdrawal_Monthly_Commercial_Groundwater',
       'Withdrawal_Monthly_Commercial_Surface Water',
       'Withdrawal_Monthly_Geothermal_Groundwater',
       'Withdrawal_Monthly_Agricultural_Groundwater',
       'Withdrawal_Monthly_Agricultural_Surface Water',
       'Withdrawal_

In [34]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "WaDEUT_WS" + string1
    return outstring

dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = dfout['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    if (A == '') or (pd.isnull(A)):
        outList = ''
    else:
        ml = dfWaterSourceNativeID.loc[(dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A), 'in_WaterSourceNativeID']
        if not (ml.empty):  # check if the series is empty
            outList = ml.iloc[0]
        else:
            outList = ''
    return outList

dfout['in_WaterSourceNativeID'] = dfout.apply(lambda row: retrieveWaterSourceNativeID(row['in_WaterSourceTypeCV']), axis=1)
dfout['in_WaterSourceNativeID'].unique()

array(['WaDEUT_WS1', 'WaDEUT_WS2', 'WaDEUT_WS3'], dtype=object)

## Shapefile Data
- For attaching gemetry to POU csv inputs.

In [35]:
# PoU Shapefile Data
# Shapefile input
ShapeFileInput = gpd.read_file('shapefile/CulinaryWaterServiceAreas.shp')
dfPoUshapetemp = pd.DataFrame(ShapeFileInput)
dfPoUshapetemp.head(3)

Unnamed: 0,FID_1,WRENAME,WRNAME,DWNAME,SYSTEMTYPE,WATERRESID,WRID,DWSYSNUM,WRLINK,WHOLESALER,LABEL,STATE,COUNTY,BASIN,SUBAREA,SUBAREANAM,LANUM,LANAME,ENDYEAR,DATASOURCE,SOURCEDATE,EDITOR,EDITDATE,STATUS,Shape_Leng,Shape_Area,Latitude,Longitude,geometry
0,1,Irontown,Irontown,Irontown,C,564,11358,UTAH11070,https://www.waterrights.utah.gov/asp_apps/view...,,,Utah,Iron,Cedar/Beaver,06-03-01,Escalante Desert,06-03-01a,Escalante Desert,2021,DWRe/Supplier,2021-09-08,ADAMCLARK,2021-11-19,Active,0.063384,7.1e-05,37.602435,-113.44596,"POLYGON ((-113.45270 37.60395, -113.45335 37.6..."
1,2,Mountain View SSD,Mountain View Special Service District,Mt. View Spec. Serv. Dist,C,533,11169,UTAH11037,https://www.waterrights.utah.gov/asp_apps/view...,,,Utah,Iron,Cedar/Beaver,06-02-02,Cedar,06-02-02a,Cedar,2021,,,ADAMCLARK,2021-11-19,Active,0.031452,4.9e-05,37.699043,-113.189322,"POLYGON ((-113.18669 37.70173, -113.18473 37.7..."
2,3,Park West Water Co.,Park West Culinary Water,Park West Water Company,NP,509,1195,UTAH11009,https://www.waterrights.utah.gov/asp_apps/view...,,,Utah,Iron,Cedar/Beaver,06-02-02,Cedar,06-02-02a,Cedar,2005,,,ADAMCLARK,2017-05-31,Inactive,0.038206,9.1e-05,37.713197,-113.147984,"POLYGON ((-113.15327 37.71748, -113.14286 37.7..."


In [36]:
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)
dfPoUshape['in_SiteNativeID'] = "POU" + dfPoUshapetemp['WRID'].astype(str)
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
dfPoUshape.head(3)

Unnamed: 0,in_SiteNativeID,geometry
0,POU11358,"POLYGON ((-113.45270 37.60395, -113.45335 37.6..."
1,POU11169,"POLYGON ((-113.18669 37.70173, -113.18473 37.7..."
2,POU1195,"POLYGON ((-113.15327 37.71748, -113.14286 37.7..."


## Export Data

In [37]:
#check datatype
print(len(dfout))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfout.dtypes)

585719
in_VariableCV                     object
in_VariableSpecificCV             object
in_Amount                        float64
in_PopulationServed                int32
in_BenUse                         object
in_ReportYearCV                    int64
in_TimeframeEnd           datetime64[ns]
in_TimeframeStart         datetime64[ns]
in_WaterSourceTypeCV              object
linkKey                            int64
in_CoordinateMethodCV             object
in_County                         object
in_Latitude                      float64
in_Longitude                     float64
in_PODorPOUSite                   object
in_SiteName                       object
in_SiteNativeID                   object
in_SiteTypeCV                     object
monthCheck                        object
in_WaterSourceNativeID            object
dtype: object


In [38]:
# Exporting output files.
dfout.to_csv('P_MasterUTSiteSpecific.csv', index=False)  # The output.
dfPoUshape.to_csv('P_utSSGeometry.csv', index=False) # The output geometry.