# Pre-processing Utah Site Specific data for WaDEQA upload.
Date Updated: 12/16/2020
Purpose:  To pre-process the UDWRi and UDWRe data into one master file for simple DataFrame creation and extraction

Notes:
1) Separate out water use data by customer type (e.g. Domestic, Commercial, Industrial, Institutional, Total).
2) Use UDWRi_SystemData, and left join with UDWRe_CulinaryWaterServiceArea for lat long location of centroid of area.

In [1]:
# Needed Libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
import geopandas as gpd # the library that lets us read in shapefiles
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [2]:
# Working Directory and Input File
# The raw-input folder can be overridden with the UDWRI_RAW_INPUT_DIR environment
# variable so the notebook is not tied to one machine's folder layout; when the
# variable is unset, the original hard-coded location is used unchanged.
workingDir = os.environ.get(
    "UDWRI_RAW_INPUT_DIR",
    "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Utah/SiteSpecificAmounts/UDWRi/RawInputData",
)
os.chdir(workingDir)  # bare file names used by the read/write cells resolve against this folder

In [3]:
# Location data: read the UDWRe culinary water service area table from the
# working directory.  The file is decoded as ISO-8859-1.
loc_Input = "UDWRe_CulinaryWaterServiceAreas_input.csv"
df_loc = pd.read_csv(filepath_or_buffer=loc_Input, encoding="ISO-8859-1")
df_loc.head(n=1)

Unnamed: 0,FID_1,WRENAME,WRNAME,DWNAME,SYSTEMTYPE,WATERRESID,WRID,DWSYSNUM,WRLINK,WHOLESALER,LABEL,STATE,COUNTY,BASIN,SUBAREA,SUBAREANAM,LANUM,LANAME,ENDYEAR,DATASOURCE,SOURCEDATE,EDITOR,EDITDATE,SHAPE_Leng,Shape__Are,Shape__Len,Shape_Length,Shape_Area,Longitude,Latitude
0,1,Irontown,Irontown,Irontown,C,564,11358,UTAH11070,http://waterrights.utah.gov/cgi-bin/wuseview.e...,,,Utah,Iron,Cedar/Beaver,6/3/2001,Escalante Desert,06-03-01a,Escalante Desert,2019,DWRe/Supplier,5/21/2019 0:00,ADAMCLARK,10/19/2020 0:00,5567.248025,656517.818,5567.248025,5567.248025,656517.8179,-113.44603,37.602583


In [4]:
# Remove the location columns that are not needed for the upload.
loc_cols_to_drop = [
    'FID_1', 'WRENAME', 'WRNAME', 'DWNAME', 'SYSTEMTYPE', 'WATERRESID',
    'DWSYSNUM', 'WRLINK', 'WHOLESALER', 'LABEL', 'STATE', 'COUNTY',
    'BASIN', 'SUBAREA', 'SUBAREANAM', 'LANUM', 'LANAME', 'ENDYEAR',
    'DATASOURCE', 'SOURCEDATE', 'EDITOR', 'EDITDATE', 'SHAPE_Leng',
    'Shape__Are', 'Shape__Len', 'Shape_Length', 'Shape_Area',
]
df_loc = df_loc.drop(columns=loc_cols_to_drop)
df_loc.head(n=1)

Unnamed: 0,WRID,Longitude,Latitude
0,11358,-113.44603,37.602583


In [5]:
# System data: read the UDWRi system table from the working directory.
# The file is decoded as ISO-8859-1.
sys_Input = "UDWRi_SystemData_input.csv"
df_sys = pd.read_csv(filepath_or_buffer=sys_Input, encoding="ISO-8859-1")
df_sys.head(n=1)

Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Use Cooling Percent,Use Process Percent,Use Domestic Percent,Use Miscellaneous Percent,Irrigation (Lawn and Garden) Percent,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,Total Use,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections,Peak Date,Peak Demand,Peak Demand Units,Peak Use Include,Peak Measurement Type,Peak Wholesale Volume,Peak Wholesale Volume Units
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,,,,,,27010,800.0,62166.25,14418.28,110.34,2749.36,79444.23,349.0,16.0,1.0,5.0,371.0,7/30/2019,309.3,kgallons,Indoor and Outdoor,,,


In [6]:
# Remove the system columns that are not needed for the upload
# (use-percentage breakdowns and peak-demand fields).
sys_cols_to_drop = [
    'Use Cooling Percent',
    'Use Process Percent',
    'Use Domestic Percent',
    'Use Miscellaneous Percent',
    'Irrigation (Lawn and Garden) Percent',
    'Peak Date',
    'Peak Demand',
    'Peak Demand Units',
    'Peak Use Include',
    'Peak Measurement Type',
    'Peak Wholesale Volume',
    'Peak Wholesale Volume Units',
]
df_sys = df_sys.drop(columns=sys_cols_to_drop)
df_sys.head(n=1)

Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,Total Use,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,14418.28,110.34,2749.36,79444.23,349.0,16.0,1.0,5.0,371.0


In [7]:
# Domestic
# Copy the Domestic use / connection values into the generic WaterUse and
# NumOfConnections columns and tag every row with BenUse = "Domestic".
# .assign() returns a new DataFrame, so df_sys itself is left untouched; a plain
# `= df_sys` only creates a second name for the same object, and column
# assignments made through it would leak into df_sys.
df_sys_Domestic = (
    df_sys
    .assign(
        WaterUse=df_sys['Domestic Use'],
        NumOfConnections=df_sys['Domestic Connections'],
        BenUse="Domestic",
    )
    # The per-type source columns are no longer needed once the generic ones exist.
    .drop(columns=['Domestic Use', 'Commercial Use', 'Industrial Use', 'Institutional Use', 'Total Use',
                   'Domestic Connections', 'Commercial Connections', 'Industrial Connections',
                   'Institutional Connections', 'Total Connections'])
)
df_sys_Domestic.head(1)

Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,349.0,Domestic


In [8]:
# Commercial
# Copy the Commercial use / connection values into the generic WaterUse and
# NumOfConnections columns and tag every row with BenUse = "Commercial".
# .assign() returns a new DataFrame, so df_sys itself is left untouched; a plain
# `= df_sys` only creates a second name for the same object, and column
# assignments made through it would leak into df_sys.
df_sys_Commercial = (
    df_sys
    .assign(
        WaterUse=df_sys['Commercial Use'],
        NumOfConnections=df_sys['Commercial Connections'],
        BenUse="Commercial",
    )
    # The per-type source columns are no longer needed once the generic ones exist.
    .drop(columns=['Domestic Use', 'Commercial Use', 'Industrial Use', 'Institutional Use', 'Total Use',
                   'Domestic Connections', 'Commercial Connections', 'Industrial Connections',
                   'Institutional Connections', 'Total Connections'])
)
df_sys_Commercial.head(1)

Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,14418.28,16.0,Commercial


In [9]:
# Industrial
# Copy the Industrial use / connection values into the generic WaterUse and
# NumOfConnections columns and tag every row with BenUse = "Industrial".
# .assign() returns a new DataFrame, so df_sys itself is left untouched; a plain
# `= df_sys` only creates a second name for the same object, and column
# assignments made through it would leak into df_sys.
df_sys_Industrial = (
    df_sys
    .assign(
        WaterUse=df_sys['Industrial Use'],
        NumOfConnections=df_sys['Industrial Connections'],
        BenUse="Industrial",
    )
    # The per-type source columns are no longer needed once the generic ones exist.
    .drop(columns=['Domestic Use', 'Commercial Use', 'Industrial Use', 'Institutional Use', 'Total Use',
                   'Domestic Connections', 'Commercial Connections', 'Industrial Connections',
                   'Institutional Connections', 'Total Connections'])
)
df_sys_Industrial.head(1)

Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,110.34,1.0,Industrial


In [10]:
# Institutional
# Copy the Institutional use / connection values into the generic WaterUse and
# NumOfConnections columns and tag every row with BenUse = "Institutional".
# .assign() returns a new DataFrame, so df_sys itself is left untouched; a plain
# `= df_sys` only creates a second name for the same object, and column
# assignments made through it would leak into df_sys.
df_sys_Institutional = (
    df_sys
    .assign(
        WaterUse=df_sys['Institutional Use'],
        NumOfConnections=df_sys['Institutional Connections'],
        BenUse="Institutional",
    )
    # The per-type source columns are no longer needed once the generic ones exist.
    .drop(columns=['Domestic Use', 'Commercial Use', 'Industrial Use', 'Institutional Use', 'Total Use',
                   'Domestic Connections', 'Commercial Connections', 'Industrial Connections',
                   'Institutional Connections', 'Total Connections'])
)
df_sys_Institutional.head(1)

Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,2749.36,5.0,Institutional


In [11]:
# Total
# Copy the Total use / connection values into the generic WaterUse and
# NumOfConnections columns and tag every row with BenUse = "Total".
# .assign() returns a new DataFrame, so df_sys itself is left untouched; a plain
# `= df_sys` only creates a second name for the same object, and column
# assignments made through it would leak into df_sys.
df_sys_Total = (
    df_sys
    .assign(
        WaterUse=df_sys['Total Use'],
        NumOfConnections=df_sys['Total Connections'],
        BenUse="Total",
    )
    # The per-type source columns are no longer needed once the generic ones exist.
    .drop(columns=['Domestic Use', 'Commercial Use', 'Industrial Use', 'Institutional Use', 'Total Use',
                   'Domestic Connections', 'Commercial Connections', 'Industrial Connections',
                   'Institutional Connections', 'Total Connections'])
)
df_sys_Total.head(1)

Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,79444.23,371.0,Total


In [12]:
# Stack the five per-use-type frames (one per BenUse value) into a single long table.
# ignore_index is not set, so each frame's own row labels are carried over as-is.
frames = [
    df_sys_Domestic,
    df_sys_Commercial,
    df_sys_Industrial,
    df_sys_Institutional,
    df_sys_Total,
]
df_sys_all = pd.concat(objs=frames, axis=0)
df_sys_all.head(n=1)

Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,349.0,Domestic


In [13]:
# Create the output dataframe: attach the service-area coordinates to each
# system record by matching 'System ID' against 'WRID'.
# NOTE(review): the notes at the top of the notebook describe a left join, but
# how='inner' keeps only the system rows that have a matching WRID -- confirm
# which behavior is intended.
dfout = df_sys_all.merge(df_loc, how='inner', left_on='System ID', right_on='WRID')
dfout.head(n=3)

Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse,WRID,Longitude,Latitude
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,349.0,Domestic,1000,-113.347061,37.237197
1,Leeds Domestic Water Users Association,1000,Public,Active,2018,4/30/2019,Washington,,,,27010,800.0,61670.86,335.0,Domestic,1000,-113.347061,37.237197
2,Leeds Domestic Water Users Association,1000,Public,Active,2017,4/5/2018,Washington,,,,27010,800.0,58510.0,333.0,Domestic,1000,-113.347061,37.237197


In [14]:
# Convert History Year to string.
# Create WaDE Timeframe start and end dates.  Assume start = 01/01 & end = 12/31 for now.
dfout['History Year'] = dfout['History Year'].astype(str)

# Parse once with an explicit format instead of relying on format inference;
# strings that do not match are coerced to NaT.  The result is already a
# datetime64 column, so no strftime / re-parse round trip is needed.
date_format = '%m/%d/%Y'
dfout['TimeframeEnd'] = pd.to_datetime('12/31/' + dfout['History Year'],
                                       format=date_format, errors='coerce')
dfout['TimeframeStart'] = pd.to_datetime('01/01/' + dfout['History Year'],
                                         format=date_format, errors='coerce')

dfout.head(1)

Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse,WRID,Longitude,Latitude,TimeframeEnd,TimeframeStart
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,349.0,Domestic,1000,-113.347061,37.237197,2019-12-31,2019-01-01


In [15]:
# Convert Population to an integer column.
# Missing values are filled with 0 first, because a NaN cannot be cast to int.
dfout['Population'] = dfout['Population'].astype(float).fillna(0).astype(int)
dfout['Population']

0        800
1        800
2        800
3        780
4        760
        ... 
87650     25
87651     25
87652     25
87653     25
87654     25
Name: Population, Length: 87655, dtype: int32

In [16]:
# Check the datatype of every column; the row/column display limits are lifted
# for this block only so that no entry is truncated.
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    print(dfout.dtypes)

System Name                         object
System ID                            int64
System Type                         object
System Status                       object
History Year                        object
Date Received                       object
County                              object
Acres Irrigated                    float64
Irrigation (Agriculture)           float64
Acres Irrigated.1                  float64
DEQ ID                              object
Population                           int32
WaterUse                           float64
NumOfConnections                   float64
BenUse                              object
WRID                                 int64
Longitude                          float64
Latitude                           float64
TimeframeEnd                datetime64[ns]
TimeframeStart              datetime64[ns]
dtype: object


In [17]:
# Export the merged table to CSV in the working directory (this is the notebook's output).
# index=False leaves the row index out of the file.
dfout.to_csv(path_or_buf='P_MasterUTSiteSpecific.csv', index=False)