# Pre-processing Utah Site Specific data for WaDEQA upload.
Date Updated: 11/03/2020
Purpose:  To pre-process the UDWRi and UDWRe data into one master file for simple DataFrame creation and extraction

Notes:
1) Need to left join the System Data and Source Data.  Made a custom link ID by hand for both System and Source Data by combining SystemID + Year: SystemID_Year_ID

2) Converted the UDWRe shapefile to WKT, will use it for the geometry field.  Will use a separate output file to save on space. Link via Source ID = WRID.

In [1]:
# Needed libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
import geopandas as gpd # the library that lets us read in shapefiles
pd.set_option('display.max_columns', 999)  # Raise the column display limit to 999 so wide DataFrames are shown without column truncation in the notebook

In [2]:
# Working Directory and Input File
# The raw-input folder can be overridden with the UDWRI_RAW_INPUT_DIR environment
# variable so the notebook can run on machines other than the original author's.
# When the variable is not set, the original hard-coded location is used unchanged.
workingDir = os.environ.get(
    "UDWRI_RAW_INPUT_DIR",
    "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Utah/SiteSpecificAmounts/UDWRi/RawInputData",
)
# All relative file names used for reading and writing below resolve against this folder.
os.chdir(workingDir)

In [3]:
# System Data: read the raw CSV (relative to the working directory) into df_sys.
System_Input = "SystemData_input.csv"
df_sys = pd.read_csv(
    System_Input,
    encoding="ISO-8859-1",  # the input file is read as Latin-1 rather than UTF-8
)
df_sys.head(3)

Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Use Cooling Percent,Use Process Percent,Use Domestic Percent,Use Miscellaneous Percent,Irrigation (Lawn and Garden) Percent,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,Total Use,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections,Peak Date,Peak Demand,Peak Demand Units,Peak Use Include,Peak Measurement Type,Peak Wholesale Volume,Peak Wholesale Volume Units
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,,,,,,27010,800.0,62166.25,14418.28,110.34,2749.36,79444.23,349.0,16.0,1.0,5.0,371.0,7/30/2019,309.3,kgallons,Indoor and Outdoor,,,
1,Leeds Domestic Water Users Association,1000,Public,Active,2018,4/30/2019,Washington,,,,,,,,,27010,800.0,61670.86,7050.8,350.1,1750.0,70821.76,335.0,22.0,1.0,5.0,363.0,,,,,,,
2,Leeds Domestic Water Users Association,1000,Public,Active,2017,4/5/2018,Washington,,,,,,,,,27010,800.0,58510.0,5021.73,301.084,1640.86,65473.674,333.0,22.0,1.0,5.0,361.0,,,,,,,


In [4]:
# Source Data: read the raw CSV (relative to the working directory) into df_sor.
Source_Input = "SourceData_input.csv"
df_sor = pd.read_csv(
    Source_Input,
    encoding="ISO-8859-1",  # the input file is read as Latin-1 rather than UTF-8
)
df_sor.head(3)

Unnamed: 0,System Name,System ID,Source Name,Source ID,Source Status,Lat NAD83,Lon NAD83,Source Type,Diversion Type,Use Type,Year,Units,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Total,Unnamed: 25,Unnamed: 26,Unnamed: 27
0,Leeds Domestic Water Users Association,1000,Oak Grove Spring (WS001),10000001,Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,,,,,,,,,,,,,,,,,,
1,Leeds Domestic Water Users Association,1000,Leeds Well (WS002),10000002,Active,37.267014,-113.36306,Well,Withdrawal,Water Supplier,,,,,,,,,,,,,,,,,,
2,Leeds Domestic Water Users Association,1000,"El Dorado Hills Well (8 in, 335 ft deep)",108540479,Inactive,37.265809,-113.351593,Well,Withdrawal,Water Supplier,,,,,,,,,,,,,,,,,,


In [5]:
# UDWRe culinary water service areas: read the shapefile with geopandas,
# then wrap the result in a plain pandas DataFrame for the steps below.
UDWRe_CulArea = gpd.read_file(
    'UDWRe_CulinaryWaterServiceAreas-shp/CulinaryWaterServiceAreas.shp'
)
dfshape = pd.DataFrame(UDWRe_CulArea)
dfshape.head(3)

Unnamed: 0,FID,WRENAME,WRNAME,DWNAME,SYSTEMTYPE,WATERRESID,WRID,DWSYSNUM,WRLINK,WHOLESALER,LABEL,STATE,COUNTY,BASIN,SUBAREA,SUBAREANAM,LANUM,LANAME,ENDYEAR,DATASOURCE,SOURCEDATE,EDITOR,EDITDATE,SHAPE_Leng,Shape__Are,Shape__Len,geometry
0,1,Irontown,Irontown,Irontown,C,564,11358,UTAH11070,http://waterrights.utah.gov/cgi-bin/wuseview.e...,,,Utah,Iron,Cedar/Beaver,06-03-01,Escalante Desert,06-03-01a,Escalante Desert,2019,DWRe/Supplier,2019-05-21,ADAMCLARK,2020-10-19,5567.248025,656517.817993,5567.248025,"POLYGON ((-113.44925 37.60357, -113.44945 37.6..."
1,2,Mountain View SSD,Mountain View Special Service District,Mt. View Spec. Serv. Dist,C,533,11169,UTAH11037,http://waterrights.utah.gov/cgi-bin/wuseview.e...,,,Utah,Iron,Cedar/Beaver,06-02-02,Cedar,06-02-02a,Cedar,2019,,1970-01-01,ADAMCLARK,2020-10-19,2818.216577,467752.984253,2818.216577,"POLYGON ((-113.19367 37.69586, -113.19364 37.7..."
2,3,Park West Water Co.,Park West Culinary Water,Park West Water Company,NP,509,1195,UTAH11009,http://waterrights.utah.gov/cgi-bin/wuseview.e...,,,Utah,Iron,Cedar/Beaver,06-02-02,Cedar,06-02-02a,Cedar,2005,,1970-01-01,ADAMCLARK,2017-05-31,3769.215614,886913.409912,3769.215614,"POLYGON ((-113.15327 37.71748, -113.14286 37.7..."


In [6]:
# Rename System Data columns: every listed column gets a '_Sys' suffix so it
# stays distinguishable from the Source Data columns after the merge.
# Also adds the SystemID_Year_ID link column (System ID followed by History Year).

sysColumnNames = [
    'System Name',
    'System ID',
    'System Type',
    'System Status',
    'History Year',
    'Date Received',
    'County',
    'Use Cooling Percent',
    'Use Process Percent',
    'Use Domestic Percent',
    'Use Miscellaneous Percent',
    'Irrigation (Lawn and Garden) Percent',
    'Acres Irrigated',
    'Irrigation (Agriculture)',
    'Acres Irrigated.1',
    'DEQ ID',
    'Population',
    'Domestic Use',
    'Commercial Use',
    'Industrial Use',
    'Institutional Use',
    'Total Use',
    'Domestic Connections',
    'Commercial Connections',
    'Industrial Connections',
    'Institutional Connections',
    'Total Connections',
    'Peak Date',
    'Peak Demand',
    'Peak Demand Units',
    'Peak Use Include',
    'Peak Measurement Type',
    'Peak Wholesale Volume',
    'Peak Wholesale Volume Units',
]

# Old name -> new name mapping, e.g. 'System ID' -> 'System ID_Sys'.
RenameSysColumnDict = {name: name + '_Sys' for name in sysColumnNames}

df_sys = (
    df_sys
    .rename(columns=RenameSysColumnDict)
    # Link key is the plain string concatenation of System ID and History Year.
    .assign(SystemID_Year_ID=lambda d: d['System ID_Sys'].astype(str) + d['History Year_Sys'].astype(str))
)
df_sys.head(3)

Unnamed: 0,System Name_Sys,System ID_Sys,System Type_Sys,System Status_Sys,History Year_Sys,Date Received_Sys,County_Sys,Use Cooling Percent_Sys,Use Process Percent_Sys,Use Domestic Percent_Sys,Use Miscellaneous Percent_Sys,Irrigation (Lawn and Garden) Percent_Sys,Acres Irrigated_Sys,Irrigation (Agriculture)_Sys,Acres Irrigated.1_Sys,DEQ ID_Sys,Population_Sys,Domestic Use_Sys,Commercial Use_Sys,Industrial Use_Sys,Institutional Use_Sys,Total Use_Sys,Domestic Connections_Sys,Commercial Connections_Sys,Industrial Connections_Sys,Institutional Connections_Sys,Total Connections_Sys,Peak Date_Sys,Peak Demand_Sys,Peak Demand Units_Sys,Peak Use Include_Sys,Peak Measurement Type_Sys,Peak Wholesale Volume_Sys,Peak Wholesale Volume Units_Sys,SystemID_Year_ID
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,,,,,,27010,800.0,62166.25,14418.28,110.34,2749.36,79444.23,349.0,16.0,1.0,5.0,371.0,7/30/2019,309.3,kgallons,Indoor and Outdoor,,,,10002019
1,Leeds Domestic Water Users Association,1000,Public,Active,2018,4/30/2019,Washington,,,,,,,,,27010,800.0,61670.86,7050.8,350.1,1750.0,70821.76,335.0,22.0,1.0,5.0,363.0,,,,,,,,10002018
2,Leeds Domestic Water Users Association,1000,Public,Active,2017,4/5/2018,Washington,,,,,,,,,27010,800.0,58510.0,5021.73,301.084,1640.86,65473.674,333.0,22.0,1.0,5.0,361.0,,,,,,,,10002017


In [7]:
# Rename Source Data columns: every listed column gets a '_Sour' suffix so it
# stays distinguishable from the System Data columns after the merge.
# Also adds the SystemID_Year_ID link column (System ID followed by Year).
# Rows with a NULL year have to be dropped first, since the key needs a year.

sourColumnNames = [
    'System Name',
    'System ID',
    'Year',
    'Source ID',
    'Source Name',
    'Source Status',
    'Lat NAD83',
    'Lon NAD83',
    'Source Type',
    'Diversion Type',
    'Use Type',
    'Units',
    'Jan',
    'Feb',
    'Mar',
    'Apr',
    'May',
    'Jun',
    'Jul',
    'Aug',
    'Sep',
    'Oct',
    'Nov',
    'Dec',
    'Total',
]

# Old name -> new name mapping, e.g. 'Year' -> 'Year_Sour'.
RenameSourColumnDict = {name: name + '_Sour' for name in sourColumnNames}

df_sor = df_sor.rename(columns=RenameSourColumnDict)

# Keep only rows that have a year. The explicit .copy() makes the filtered result an
# independent frame, so the column assignments below do not write into a slice of the
# original (which triggers pandas' SettingWithCopyWarning).
df_sor = df_sor[df_sor['Year_Sour'].notna()].copy()

# Cast the year to int so it is rendered as '2019' rather than '2019.0' inside the key.
df_sor['Year_Sour'] = df_sor['Year_Sour'].astype(int)
df_sor['SystemID_Year_ID'] = df_sor['System ID_Sour'].astype(str) + df_sor['Year_Sour'].astype(str)
df_sor.head(3)

Unnamed: 0,System Name_Sour,System ID_Sour,Source Name_Sour,Source ID_Sour,Source Status_Sour,Lat NAD83_Sour,Lon NAD83_Sour,Source Type_Sour,Diversion Type_Sour,Use Type_Sour,Year_Sour,Units_Sour,Jan_Sour,Feb_Sour,Mar_Sour,Apr_Sour,May_Sour,Jun_Sour,Jul_Sour,Aug_Sour,Sep_Sour,Oct_Sour,Nov_Sour,Dec_Sour,Total_Sour,Unnamed: 25,Unnamed: 26,Unnamed: 27,SystemID_Year_ID
94,Provo City Water Resources Division (Culinary),1010,City Center Well,10100010,Inactive,40.232769,-111.664217,Well,Withdrawal,Industrial,2002,kgallons,17270.12571,16292.57143,17270.12571,16618.42286,17270.12571,16618.42286,17270.12571,10753.09714,0.0,0.0,16618.42286,17270.12571,163251.5657,,,,10102002
95,Provo City Water Resources Division (Culinary),1010,City Center Well,10100010,Inactive,40.232769,-111.664217,Well,Withdrawal,Industrial,1999,kgallons,17270.12571,16292.57143,17270.12571,16618.42286,17270.12571,16618.42286,17270.12571,17270.12571,16618.42286,17270.12571,16618.42286,17270.12571,203657.1429,,,,10101999
96,Provo City Water Resources Division (Culinary),1010,City Center Well,10100010,Inactive,40.232769,-111.664217,Well,Withdrawal,Industrial,1998,kgallons,17270.12571,16292.57143,17270.12571,16618.42286,17270.12571,16618.42286,17270.12571,17270.12571,16618.42286,17270.12571,16618.42286,17270.12571,203657.1429,,,,10101998


In [8]:
# Create the output dataframe by joining System Data and Source Data on the shared
# SystemID_Year_ID key. Only keys present in both inputs are kept (inner join).
# NOTE(review): the notebook header mentions a left join, but how='inner' is what is
# used here -- confirm which one is intended.
df = df_sys.merge(df_sor, on='SystemID_Year_ID', how='inner')
df.head(3)

Unnamed: 0,System Name_Sys,System ID_Sys,System Type_Sys,System Status_Sys,History Year_Sys,Date Received_Sys,County_Sys,Use Cooling Percent_Sys,Use Process Percent_Sys,Use Domestic Percent_Sys,Use Miscellaneous Percent_Sys,Irrigation (Lawn and Garden) Percent_Sys,Acres Irrigated_Sys,Irrigation (Agriculture)_Sys,Acres Irrigated.1_Sys,DEQ ID_Sys,Population_Sys,Domestic Use_Sys,Commercial Use_Sys,Industrial Use_Sys,Institutional Use_Sys,Total Use_Sys,Domestic Connections_Sys,Commercial Connections_Sys,Industrial Connections_Sys,Institutional Connections_Sys,Total Connections_Sys,Peak Date_Sys,Peak Demand_Sys,Peak Demand Units_Sys,Peak Use Include_Sys,Peak Measurement Type_Sys,Peak Wholesale Volume_Sys,Peak Wholesale Volume Units_Sys,SystemID_Year_ID,System Name_Sour,System ID_Sour,Source Name_Sour,Source ID_Sour,Source Status_Sour,Lat NAD83_Sour,Lon NAD83_Sour,Source Type_Sour,Diversion Type_Sour,Use Type_Sour,Year_Sour,Units_Sour,Jan_Sour,Feb_Sour,Mar_Sour,Apr_Sour,May_Sour,Jun_Sour,Jul_Sour,Aug_Sour,Sep_Sour,Oct_Sour,Nov_Sour,Dec_Sour,Total_Sour,Unnamed: 25,Unnamed: 26,Unnamed: 27
0,Provo City Water Resources Division (Culinary),1010,Public,Active,2019,3/25/2020,Utah,,,,,,,,,25006,116713.0,4069883.0,2286247.0,9841.0,538437.0,6904408.0,17483.0,1936.0,17.0,171.0,19607.0,7/22/2019,49064.0,kgallons,Indoor and Outdoor,,,,10102019,Provo City Water Resources Division (Culinary),1010,Rock Canyon Springs (WS024),10100017,Active,40.265481,-111.623018,Spring,Withdrawal,Water Supplier,2019,kgallons,1798.0,1601.0,1788.0,11016.0,68077.0,82768.0,67610.0,30793.0,14832.0,11202.0,8524.0,1433.0,301442.0,,,
1,Provo City Water Resources Division (Culinary),1010,Public,Active,2019,3/25/2020,Utah,,,,,,,,,25006,116713.0,4069883.0,2286247.0,9841.0,538437.0,6904408.0,17483.0,1936.0,17.0,171.0,19607.0,7/22/2019,49064.0,kgallons,Indoor and Outdoor,,,,10102019,Provo City Water Resources Division (Culinary),1010,"Parkside,Nunn Springs (WS015), (WS016)",10100018,Active,40.332652,-111.617667,Spring,Withdrawal,Water Supplier,2019,kgallons,43823.0,40471.0,36798.0,63852.0,44892.0,92913.0,92650.0,130635.0,129498.0,96308.0,66844.0,62329.0,901013.0,,,
2,Provo City Water Resources Division (Culinary),1010,Public,Active,2019,3/25/2020,Utah,,,,,,,,,25006,116713.0,4069883.0,2286247.0,9841.0,538437.0,6904408.0,17483.0,1936.0,17.0,171.0,19607.0,7/22/2019,49064.0,kgallons,Indoor and Outdoor,,,,10102019,Provo City Water Resources Division (Culinary),1010,South Fork Springs (Group 6),10100019,Active,40.339736,-111.604856,Spring,Withdrawal,Water Supplier,2019,kgallons,189204.0,170938.0,193310.0,196831.0,206183.0,201780.0,223938.0,249656.0,237404.0,239098.0,176947.0,219224.0,2504513.0,,,


In [9]:
# Purge dataframe: an empty frame with the same columns as df plus a trailing
# 'ReasonRemoved' column, used to record rows removed for WaDE purposes.
columnslist = df.columns.tolist()
dfpurge = pd.DataFrame(columns=columnslist + ['ReasonRemoved'])
dfpurge

Unnamed: 0,System Name_Sys,System ID_Sys,System Type_Sys,System Status_Sys,History Year_Sys,Date Received_Sys,County_Sys,Use Cooling Percent_Sys,Use Process Percent_Sys,Use Domestic Percent_Sys,Use Miscellaneous Percent_Sys,Irrigation (Lawn and Garden) Percent_Sys,Acres Irrigated_Sys,Irrigation (Agriculture)_Sys,Acres Irrigated.1_Sys,DEQ ID_Sys,Population_Sys,Domestic Use_Sys,Commercial Use_Sys,Industrial Use_Sys,Institutional Use_Sys,Total Use_Sys,Domestic Connections_Sys,Commercial Connections_Sys,Industrial Connections_Sys,Institutional Connections_Sys,Total Connections_Sys,Peak Date_Sys,Peak Demand_Sys,Peak Demand Units_Sys,Peak Use Include_Sys,Peak Measurement Type_Sys,Peak Wholesale Volume_Sys,Peak Wholesale Volume Units_Sys,SystemID_Year_ID,System Name_Sour,System ID_Sour,Source Name_Sour,Source ID_Sour,Source Status_Sour,Lat NAD83_Sour,Lon NAD83_Sour,Source Type_Sour,Diversion Type_Sour,Use Type_Sour,Year_Sour,Units_Sour,Jan_Sour,Feb_Sour,Mar_Sour,Apr_Sour,May_Sour,Jun_Sour,Jul_Sour,Aug_Sour,Sep_Sour,Oct_Sour,Nov_Sour,Dec_Sour,Total_Sour,Unnamed: 25,Unnamed: 26,Unnamed: 27,ReasonRemoved


In [10]:
# Convert History Year to string.
# Create WaDE Timeframe start and end date.  Assume start = 01/01 & end = 12/31 for now.

df['History Year_Sys'] = df['History Year_Sys'].astype(str)

# Parse each date once with an explicit MM/DD/YYYY format. The previous
# datetime -> strftime -> datetime round trip produced the same values, so it is dropped.
# errors='coerce' turns any year that does not form a valid date into NaT.
df['TimeframeEnd'] = pd.to_datetime('12/31/' + df['History Year_Sys'], format='%m/%d/%Y', errors='coerce')
df['TimeframeStart'] = pd.to_datetime('01/01/' + df['History Year_Sys'], format='%m/%d/%Y', errors='coerce')

df.head(3)

Unnamed: 0,System Name_Sys,System ID_Sys,System Type_Sys,System Status_Sys,History Year_Sys,Date Received_Sys,County_Sys,Use Cooling Percent_Sys,Use Process Percent_Sys,Use Domestic Percent_Sys,Use Miscellaneous Percent_Sys,Irrigation (Lawn and Garden) Percent_Sys,Acres Irrigated_Sys,Irrigation (Agriculture)_Sys,Acres Irrigated.1_Sys,DEQ ID_Sys,Population_Sys,Domestic Use_Sys,Commercial Use_Sys,Industrial Use_Sys,Institutional Use_Sys,Total Use_Sys,Domestic Connections_Sys,Commercial Connections_Sys,Industrial Connections_Sys,Institutional Connections_Sys,Total Connections_Sys,Peak Date_Sys,Peak Demand_Sys,Peak Demand Units_Sys,Peak Use Include_Sys,Peak Measurement Type_Sys,Peak Wholesale Volume_Sys,Peak Wholesale Volume Units_Sys,SystemID_Year_ID,System Name_Sour,System ID_Sour,Source Name_Sour,Source ID_Sour,Source Status_Sour,Lat NAD83_Sour,Lon NAD83_Sour,Source Type_Sour,Diversion Type_Sour,Use Type_Sour,Year_Sour,Units_Sour,Jan_Sour,Feb_Sour,Mar_Sour,Apr_Sour,May_Sour,Jun_Sour,Jul_Sour,Aug_Sour,Sep_Sour,Oct_Sour,Nov_Sour,Dec_Sour,Total_Sour,Unnamed: 25,Unnamed: 26,Unnamed: 27,TimeframeEnd,TimeframeStart
0,Provo City Water Resources Division (Culinary),1010,Public,Active,2019,3/25/2020,Utah,,,,,,,,,25006,116713.0,4069883.0,2286247.0,9841.0,538437.0,6904408.0,17483.0,1936.0,17.0,171.0,19607.0,7/22/2019,49064.0,kgallons,Indoor and Outdoor,,,,10102019,Provo City Water Resources Division (Culinary),1010,Rock Canyon Springs (WS024),10100017,Active,40.265481,-111.623018,Spring,Withdrawal,Water Supplier,2019,kgallons,1798.0,1601.0,1788.0,11016.0,68077.0,82768.0,67610.0,30793.0,14832.0,11202.0,8524.0,1433.0,301442.0,,,,2019-12-31,2019-01-01
1,Provo City Water Resources Division (Culinary),1010,Public,Active,2019,3/25/2020,Utah,,,,,,,,,25006,116713.0,4069883.0,2286247.0,9841.0,538437.0,6904408.0,17483.0,1936.0,17.0,171.0,19607.0,7/22/2019,49064.0,kgallons,Indoor and Outdoor,,,,10102019,Provo City Water Resources Division (Culinary),1010,"Parkside,Nunn Springs (WS015), (WS016)",10100018,Active,40.332652,-111.617667,Spring,Withdrawal,Water Supplier,2019,kgallons,43823.0,40471.0,36798.0,63852.0,44892.0,92913.0,92650.0,130635.0,129498.0,96308.0,66844.0,62329.0,901013.0,,,,2019-12-31,2019-01-01
2,Provo City Water Resources Division (Culinary),1010,Public,Active,2019,3/25/2020,Utah,,,,,,,,,25006,116713.0,4069883.0,2286247.0,9841.0,538437.0,6904408.0,17483.0,1936.0,17.0,171.0,19607.0,7/22/2019,49064.0,kgallons,Indoor and Outdoor,,,,10102019,Provo City Water Resources Division (Culinary),1010,South Fork Springs (Group 6),10100019,Active,40.339736,-111.604856,Spring,Withdrawal,Water Supplier,2019,kgallons,189204.0,170938.0,193310.0,196831.0,206183.0,201780.0,223938.0,249656.0,237404.0,239098.0,176947.0,219224.0,2504513.0,,,,2019-12-31,2019-01-01


In [11]:
# Population as a float column: missing values are replaced with 0 before the cast.
df['Population_Sys'] = df['Population_Sys'].fillna(0).astype(float)
df['Population_Sys']

0       116713.0
1       116713.0
2       116713.0
3       116713.0
4       117331.0
          ...   
4398         0.0
4399         0.0
4400         0.0
4401         0.0
4402         0.0
Name: Population_Sys, Length: 4403, dtype: float64

In [12]:
# Removing empty 'Total Use_Sys' fields.  Need an amount value.
# Removed rows are recorded in dfpurge together with the reason for removal.
isNullTotalUse = df['Total Use_Sys'].isnull()

# reset_index() (without drop) keeps each removed row's original label in an 'index' column.
mask = df.loc[isNullTotalUse].assign(ReasonRemoved='Null Total Use_Sys').reset_index()
if len(mask.index) > 0:
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    dfpurge = pd.concat([dfpurge, mask])  # Append to purge DataFrame
    dropIndex = df.loc[isNullTotalUse].index
    df = df.drop(dropIndex)
    df = df.reset_index(drop=True)
df.head(3)

Unnamed: 0,System Name_Sys,System ID_Sys,System Type_Sys,System Status_Sys,History Year_Sys,Date Received_Sys,County_Sys,Use Cooling Percent_Sys,Use Process Percent_Sys,Use Domestic Percent_Sys,Use Miscellaneous Percent_Sys,Irrigation (Lawn and Garden) Percent_Sys,Acres Irrigated_Sys,Irrigation (Agriculture)_Sys,Acres Irrigated.1_Sys,DEQ ID_Sys,Population_Sys,Domestic Use_Sys,Commercial Use_Sys,Industrial Use_Sys,Institutional Use_Sys,Total Use_Sys,Domestic Connections_Sys,Commercial Connections_Sys,Industrial Connections_Sys,Institutional Connections_Sys,Total Connections_Sys,Peak Date_Sys,Peak Demand_Sys,Peak Demand Units_Sys,Peak Use Include_Sys,Peak Measurement Type_Sys,Peak Wholesale Volume_Sys,Peak Wholesale Volume Units_Sys,SystemID_Year_ID,System Name_Sour,System ID_Sour,Source Name_Sour,Source ID_Sour,Source Status_Sour,Lat NAD83_Sour,Lon NAD83_Sour,Source Type_Sour,Diversion Type_Sour,Use Type_Sour,Year_Sour,Units_Sour,Jan_Sour,Feb_Sour,Mar_Sour,Apr_Sour,May_Sour,Jun_Sour,Jul_Sour,Aug_Sour,Sep_Sour,Oct_Sour,Nov_Sour,Dec_Sour,Total_Sour,Unnamed: 25,Unnamed: 26,Unnamed: 27,TimeframeEnd,TimeframeStart
0,Provo City Water Resources Division (Culinary),1010,Public,Active,2019,3/25/2020,Utah,,,,,,,,,25006,116713.0,4069883.0,2286247.0,9841.0,538437.0,6904408.0,17483.0,1936.0,17.0,171.0,19607.0,7/22/2019,49064.0,kgallons,Indoor and Outdoor,,,,10102019,Provo City Water Resources Division (Culinary),1010,Rock Canyon Springs (WS024),10100017,Active,40.265481,-111.623018,Spring,Withdrawal,Water Supplier,2019,kgallons,1798.0,1601.0,1788.0,11016.0,68077.0,82768.0,67610.0,30793.0,14832.0,11202.0,8524.0,1433.0,301442.0,,,,2019-12-31,2019-01-01
1,Provo City Water Resources Division (Culinary),1010,Public,Active,2019,3/25/2020,Utah,,,,,,,,,25006,116713.0,4069883.0,2286247.0,9841.0,538437.0,6904408.0,17483.0,1936.0,17.0,171.0,19607.0,7/22/2019,49064.0,kgallons,Indoor and Outdoor,,,,10102019,Provo City Water Resources Division (Culinary),1010,"Parkside,Nunn Springs (WS015), (WS016)",10100018,Active,40.332652,-111.617667,Spring,Withdrawal,Water Supplier,2019,kgallons,43823.0,40471.0,36798.0,63852.0,44892.0,92913.0,92650.0,130635.0,129498.0,96308.0,66844.0,62329.0,901013.0,,,,2019-12-31,2019-01-01
2,Provo City Water Resources Division (Culinary),1010,Public,Active,2019,3/25/2020,Utah,,,,,,,,,25006,116713.0,4069883.0,2286247.0,9841.0,538437.0,6904408.0,17483.0,1936.0,17.0,171.0,19607.0,7/22/2019,49064.0,kgallons,Indoor and Outdoor,,,,10102019,Provo City Water Resources Division (Culinary),1010,South Fork Springs (Group 6),10100019,Active,40.339736,-111.604856,Spring,Withdrawal,Water Supplier,2019,kgallons,189204.0,170938.0,193310.0,196831.0,206183.0,201780.0,223938.0,249656.0,237404.0,239098.0,176947.0,219224.0,2504513.0,,,,2019-12-31,2019-01-01


In [13]:
# Removing empty 'Lat NAD83' fields (a latitude that is null or exactly 0).
# Removed rows are recorded in dfpurge together with the reason for removal.
isMissingLat = (df['Lat NAD83_Sour'] == 0) | (df['Lat NAD83_Sour'].isnull())

# reset_index() (without drop) keeps each removed row's original label in an 'index' column.
mask = df.loc[isMissingLat].assign(ReasonRemoved='Null Lat NAD83').reset_index()
if len(mask.index) > 0:
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    dfpurge = pd.concat([dfpurge, mask])  # Append to purge DataFrame
    dropIndex = df.loc[isMissingLat].index
    df = df.drop(dropIndex)
    df = df.reset_index(drop=True)
df.head(3)

Unnamed: 0,System Name_Sys,System ID_Sys,System Type_Sys,System Status_Sys,History Year_Sys,Date Received_Sys,County_Sys,Use Cooling Percent_Sys,Use Process Percent_Sys,Use Domestic Percent_Sys,Use Miscellaneous Percent_Sys,Irrigation (Lawn and Garden) Percent_Sys,Acres Irrigated_Sys,Irrigation (Agriculture)_Sys,Acres Irrigated.1_Sys,DEQ ID_Sys,Population_Sys,Domestic Use_Sys,Commercial Use_Sys,Industrial Use_Sys,Institutional Use_Sys,Total Use_Sys,Domestic Connections_Sys,Commercial Connections_Sys,Industrial Connections_Sys,Institutional Connections_Sys,Total Connections_Sys,Peak Date_Sys,Peak Demand_Sys,Peak Demand Units_Sys,Peak Use Include_Sys,Peak Measurement Type_Sys,Peak Wholesale Volume_Sys,Peak Wholesale Volume Units_Sys,SystemID_Year_ID,System Name_Sour,System ID_Sour,Source Name_Sour,Source ID_Sour,Source Status_Sour,Lat NAD83_Sour,Lon NAD83_Sour,Source Type_Sour,Diversion Type_Sour,Use Type_Sour,Year_Sour,Units_Sour,Jan_Sour,Feb_Sour,Mar_Sour,Apr_Sour,May_Sour,Jun_Sour,Jul_Sour,Aug_Sour,Sep_Sour,Oct_Sour,Nov_Sour,Dec_Sour,Total_Sour,Unnamed: 25,Unnamed: 26,Unnamed: 27,TimeframeEnd,TimeframeStart
0,Provo City Water Resources Division (Culinary),1010,Public,Active,2019,3/25/2020,Utah,,,,,,,,,25006,116713.0,4069883.0,2286247.0,9841.0,538437.0,6904408.0,17483.0,1936.0,17.0,171.0,19607.0,7/22/2019,49064.0,kgallons,Indoor and Outdoor,,,,10102019,Provo City Water Resources Division (Culinary),1010,Rock Canyon Springs (WS024),10100017,Active,40.265481,-111.623018,Spring,Withdrawal,Water Supplier,2019,kgallons,1798.0,1601.0,1788.0,11016.0,68077.0,82768.0,67610.0,30793.0,14832.0,11202.0,8524.0,1433.0,301442.0,,,,2019-12-31,2019-01-01
1,Provo City Water Resources Division (Culinary),1010,Public,Active,2019,3/25/2020,Utah,,,,,,,,,25006,116713.0,4069883.0,2286247.0,9841.0,538437.0,6904408.0,17483.0,1936.0,17.0,171.0,19607.0,7/22/2019,49064.0,kgallons,Indoor and Outdoor,,,,10102019,Provo City Water Resources Division (Culinary),1010,"Parkside,Nunn Springs (WS015), (WS016)",10100018,Active,40.332652,-111.617667,Spring,Withdrawal,Water Supplier,2019,kgallons,43823.0,40471.0,36798.0,63852.0,44892.0,92913.0,92650.0,130635.0,129498.0,96308.0,66844.0,62329.0,901013.0,,,,2019-12-31,2019-01-01
2,Provo City Water Resources Division (Culinary),1010,Public,Active,2019,3/25/2020,Utah,,,,,,,,,25006,116713.0,4069883.0,2286247.0,9841.0,538437.0,6904408.0,17483.0,1936.0,17.0,171.0,19607.0,7/22/2019,49064.0,kgallons,Indoor and Outdoor,,,,10102019,Provo City Water Resources Division (Culinary),1010,South Fork Springs (Group 6),10100019,Active,40.339736,-111.604856,Spring,Withdrawal,Water Supplier,2019,kgallons,189204.0,170938.0,193310.0,196831.0,206183.0,201780.0,223938.0,249656.0,237404.0,239098.0,176947.0,219224.0,2504513.0,,,,2019-12-31,2019-01-01


In [14]:
# Removing empty 'Lon NAD83' fields (a longitude that is null or exactly 0).
# Removed rows are recorded in dfpurge together with the reason for removal.
isMissingLon = (df['Lon NAD83_Sour'] == 0) | (df['Lon NAD83_Sour'].isnull())

# reset_index() (without drop) keeps each removed row's original label in an 'index' column.
mask = df.loc[isMissingLon].assign(ReasonRemoved='Null Lon NAD83').reset_index()
if len(mask.index) > 0:
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    dfpurge = pd.concat([dfpurge, mask])  # Append to purge DataFrame
    dropIndex = df.loc[isMissingLon].index
    df = df.drop(dropIndex)
    df = df.reset_index(drop=True)
df.head(3)

Unnamed: 0,System Name_Sys,System ID_Sys,System Type_Sys,System Status_Sys,History Year_Sys,Date Received_Sys,County_Sys,Use Cooling Percent_Sys,Use Process Percent_Sys,Use Domestic Percent_Sys,Use Miscellaneous Percent_Sys,Irrigation (Lawn and Garden) Percent_Sys,Acres Irrigated_Sys,Irrigation (Agriculture)_Sys,Acres Irrigated.1_Sys,DEQ ID_Sys,Population_Sys,Domestic Use_Sys,Commercial Use_Sys,Industrial Use_Sys,Institutional Use_Sys,Total Use_Sys,Domestic Connections_Sys,Commercial Connections_Sys,Industrial Connections_Sys,Institutional Connections_Sys,Total Connections_Sys,Peak Date_Sys,Peak Demand_Sys,Peak Demand Units_Sys,Peak Use Include_Sys,Peak Measurement Type_Sys,Peak Wholesale Volume_Sys,Peak Wholesale Volume Units_Sys,SystemID_Year_ID,System Name_Sour,System ID_Sour,Source Name_Sour,Source ID_Sour,Source Status_Sour,Lat NAD83_Sour,Lon NAD83_Sour,Source Type_Sour,Diversion Type_Sour,Use Type_Sour,Year_Sour,Units_Sour,Jan_Sour,Feb_Sour,Mar_Sour,Apr_Sour,May_Sour,Jun_Sour,Jul_Sour,Aug_Sour,Sep_Sour,Oct_Sour,Nov_Sour,Dec_Sour,Total_Sour,Unnamed: 25,Unnamed: 26,Unnamed: 27,TimeframeEnd,TimeframeStart
0,Provo City Water Resources Division (Culinary),1010,Public,Active,2019,3/25/2020,Utah,,,,,,,,,25006,116713.0,4069883.0,2286247.0,9841.0,538437.0,6904408.0,17483.0,1936.0,17.0,171.0,19607.0,7/22/2019,49064.0,kgallons,Indoor and Outdoor,,,,10102019,Provo City Water Resources Division (Culinary),1010,Rock Canyon Springs (WS024),10100017,Active,40.265481,-111.623018,Spring,Withdrawal,Water Supplier,2019,kgallons,1798.0,1601.0,1788.0,11016.0,68077.0,82768.0,67610.0,30793.0,14832.0,11202.0,8524.0,1433.0,301442.0,,,,2019-12-31,2019-01-01
1,Provo City Water Resources Division (Culinary),1010,Public,Active,2019,3/25/2020,Utah,,,,,,,,,25006,116713.0,4069883.0,2286247.0,9841.0,538437.0,6904408.0,17483.0,1936.0,17.0,171.0,19607.0,7/22/2019,49064.0,kgallons,Indoor and Outdoor,,,,10102019,Provo City Water Resources Division (Culinary),1010,"Parkside,Nunn Springs (WS015), (WS016)",10100018,Active,40.332652,-111.617667,Spring,Withdrawal,Water Supplier,2019,kgallons,43823.0,40471.0,36798.0,63852.0,44892.0,92913.0,92650.0,130635.0,129498.0,96308.0,66844.0,62329.0,901013.0,,,,2019-12-31,2019-01-01
2,Provo City Water Resources Division (Culinary),1010,Public,Active,2019,3/25/2020,Utah,,,,,,,,,25006,116713.0,4069883.0,2286247.0,9841.0,538437.0,6904408.0,17483.0,1936.0,17.0,171.0,19607.0,7/22/2019,49064.0,kgallons,Indoor and Outdoor,,,,10102019,Provo City Water Resources Division (Culinary),1010,South Fork Springs (Group 6),10100019,Active,40.339736,-111.604856,Spring,Withdrawal,Water Supplier,2019,kgallons,189204.0,170938.0,193310.0,196831.0,206183.0,201780.0,223938.0,249656.0,237404.0,239098.0,176947.0,219224.0,2504513.0,,,,2019-12-31,2019-01-01


In [15]:
# Transfer the service-area geometry from dfshape into a slim WRID -> geometry table.

dfgeometry = pd.DataFrame(columns=['WRID', 'culGeometry'])  # geometry lookup DataFrame

dfgeometry['WRID'] = dfshape['WRID']
dfgeometry['culGeometry'] = dfshape['geometry']

# drop_duplicates returns a new DataFrame rather than modifying in place, so the result
# has to be assigned back; otherwise the de-duplication is silently discarded.
dfgeometry = dfgeometry.drop_duplicates(subset=['WRID', 'culGeometry'], keep='first')
dfgeometry

Unnamed: 0,WRID,culGeometry
0,11358,"POLYGON ((-113.44925 37.60357, -113.44945 37.6..."
1,11169,"POLYGON ((-113.19367 37.69586, -113.19364 37.7..."
2,1195,"POLYGON ((-113.15327 37.71748, -113.14286 37.7..."
3,11047,"POLYGON ((-113.10386 37.74385, -113.09725 37.7..."
4,11047,"POLYGON ((-113.09064 37.76492, -113.09055 37.7..."
...,...,...
1287,11178,"POLYGON ((-111.86190 41.58295, -111.85898 41.5..."
1288,0,"POLYGON ((-112.01529 38.16489, -112.01501 38.1..."
1289,0,"POLYGON ((-113.21681 37.54165, -113.23514 37.5..."
1290,0,"POLYGON ((-111.96749 40.64362, -111.96746 40.6..."


In [16]:
# Check datatypes: print the dtype of every column in df. The row/column display
# limits are lifted only inside this block so the listing is not truncated.
column_dtypes = df.dtypes
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    print(column_dtypes)

System Name_Sys                                     object
System ID_Sys                                        int64
System Type_Sys                                     object
System Status_Sys                                   object
History Year_Sys                                    object
Date Received_Sys                                   object
County_Sys                                          object
Use Cooling Percent_Sys                            float64
Use Process Percent_Sys                            float64
Use Domestic Percent_Sys                           float64
Use Miscellaneous Percent_Sys                      float64
Irrigation (Lawn and Garden) Percent_Sys           float64
Acres Irrigated_Sys                                float64
Irrigation (Agriculture)_Sys                       float64
Acres Irrigated.1_Sys                              float64
DEQ ID_Sys                                          object
Population_Sys                                     float

In [17]:
# Exporting output files (written to the current working directory, without the index).
outputFiles = (
    (df, 'P_MasterUTSiteSpecific.csv'),        # the main output
    (dfgeometry, 'P_Geometry.csv'),            # the output geometry
    (dfpurge, 'inputUDWRiDataRemoved.csv'),    # error check for states to see why certain data was removed
)
for frame, fileName in outputFiles:
    frame.to_csv(fileName, index=False)