# Pre-processing Utah Site Specific data for WaDEQA upload.
Date Updated: 10/29/2020
Purpose:  To pre-process the UDWR and UDWRe data into one master file for simple DataFrame creation and extraction

Notes:
1) Need a left join of System Data and Source Data.  Made a custom link ID by hand for both System and Source Data by combining SystemID + Year: SystemID_Year_ID

2) Converted UDWRe shapefile to WKT, will use for geometry field.  Will use a separate output file to save on space. Link via Source ID = WRID.

In [1]:
# Needed Libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
import geopandas as gpd # the library that lets us read in shapefiles
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [2]:
# Working Directory and Input File
# The raw-input folder can be overridden with the WADE_UT_RAW_INPUT_DIR
# environment variable so the notebook can run on other machines; when the
# variable is unset the original hard-coded path is used unchanged.
workingDir = os.environ.get(
    "WADE_UT_RAW_INPUT_DIR",
    "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Utah/SiteSpecificAmounts/UDWR/RawInputData",
)
# The relative file names used by the read/write calls in later cells resolve against this folder.
os.chdir(workingDir)

In [3]:
# Dataframe Creation - System Data
# Read the system-level input CSV as ISO-8859-1 text and preview the first three rows.
System_Input = "SystemData_input.csv"
df_sys = pd.read_csv(filepath_or_buffer=System_Input, encoding="ISO-8859-1")
df_sys.head(n=3)

Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Use Cooling Percent,Use Process Percent,Use Domestic Percent,Use Miscellaneous Percent,Irrigation (Lawn and Garden) Percent,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,Total Use,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections,Peak Date,Peak Demand,Peak Demand Units,Peak Use Include,Peak Measurement Type,Peak Wholesale Volume,Peak Wholesale Volume Units
0,Leeds Domestic Water Users Association,1000,Public,Active,1960,12/31/1960,Washington,,,,,,,,,27010,0.0,0.0,0.0,0.0,,56.164464,0.0,0.0,0.0,,52.0,,,,,,,
1,Leeds Domestic Water Users Association,1000,Public,Active,1962,12/31/1962,Washington,,,,,,,,,27010,0.0,0.0,0.0,0.0,,49.130095,0.0,0.0,0.0,,54.0,,,,,,,
2,Leeds Domestic Water Users Association,1000,Public,Active,1963,12/31/1963,Washington,,,,,,,,,27010,0.0,0.0,0.0,0.0,,54.559327,0.0,0.0,0.0,,52.0,,,,,,,


In [4]:
# Dataframe Creation - Source Data
# Read the source-level input CSV as ISO-8859-1 text and preview the first three rows.
Source_Input = "SourceData_input.csv"
df_sor = pd.read_csv(filepath_or_buffer=Source_Input, encoding="ISO-8859-1")
df_sor.head(n=3)

Unnamed: 0,System Name,System ID,Year,Source ID,Source Name,Source Status,Lat NAD83,Lon NAD83,Source Type,Diversion Type,Use Type,Units,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Total,Unnamed: 25
0,Leeds Domestic Water Users Association,1000,1979.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,89.07225,0.0
1,Leeds Domestic Water Users Association,1000,1980.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,0.0,0.0,13.74872,8.11426,8.57208,13.19438,15.59258,10.55542,9.63224,7.81374,5.86971,9.09407,102.1872,102.1872
2,Leeds Domestic Water Users Association,1000,1981.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,5.78218,5.57348,5.39854,9.31164,9.8917,11.1623,11.38328,9.56637,7.68502,5.00263,4.02972,3.63995,88.42681,88.42681


In [5]:
# UDWRe shapefile
# Read the retail culinary water service-area shapefile, then wrap the result
# in a plain pandas DataFrame for the later geometry extraction.
shapefilePath = 'UtahRetailCulinaryWaterServiceAreasshp/RetailCulinaryWaterServiceAreas.shp'
UDWRe_CulArea = gpd.read_file(shapefilePath)
dfshape = pd.DataFrame(data=UDWRe_CulArea)
dfshape.head(n=3)

Unnamed: 0,OBJECTID,WRENAME,WRNAME,DWNAME,SYSTEMTYPE,WATERRESID,WRID,DWSYSNUM,WRLINK,WHOLESALER,LABEL,STATE,COUNTY,BASIN,SUBAREA,SUBAREANAM,LANUM,LANAME,ENDYEAR,DATASOURCE,SOURCEDATE,EDITOR,EDITDATE,COLOR4,SHAPE_Leng,SHAPE_Area,geometry
0,1,Irontown,Irontown,Irontown,C,564,11358,UTAH11070,http://waterrights.utah.gov/cgi-bin/wuseview.e...,,,Utah,Iron,Cedar/Beaver,06-03-01,Escalante Desert,06-03-01a,Escalante Desert,2018,DWRe/Supplier,2019-05-21,ADAMCLARK,2019-10-16,,0.057271,6.7e-05,"POLYGON ((-113.44926 37.60357, -113.44946 37.6..."
1,2,Mountain View SSD,Mountain View Special Service District,Mt. View Spec. Serv. Dist,C,533,11169,UTAH11037,http://waterrights.utah.gov/cgi-bin/wuseview.e...,,,Utah,Iron,Cedar/Beaver,06-02-02,Cedar,06-02-02a,Cedar,2018,,,ADAMCLARK,2019-10-16,,0.029013,4.8e-05,"POLYGON ((-113.19369 37.69587, -113.19365 37.7..."
2,3,Park West Water Co.,Park West Culinary Water,Park West Water Company,NP,509,1195,UTAH11009,http://waterrights.utah.gov/cgi-bin/wuseview.e...,,,Utah,Iron,Cedar/Beaver,06-02-02,Cedar,06-02-02a,Cedar,2005,,,ADAMCLARK,2017-05-31,,0.038206,9.1e-05,"POLYGON ((-113.15329 37.71748, -113.14287 37.7..."


In [6]:
# Rename System Data Columns
# Every listed system column gets a '_Sys' suffix so it stays distinguishable
# from the source columns after the two tables are merged.
# Also adds the link column SystemID_Year_ID.

sysColumnNames = [
    'System Name',
    'System ID',
    'System Type',
    'System Status',
    'History Year',
    'Date Received',
    'County',
    'Use Cooling Percent',
    'Use Process Percent',
    'Use Domestic Percent',
    'Use Miscellaneous Percent',
    'Irrigation (Lawn and Garden) Percent',
    'Acres Irrigated',
    'Irrigation (Agriculture)',
    'Acres Irrigated.1',
    'DEQ ID',
    'Population',
    'Domestic Use',
    'Commercial Use',
    'Industrial Use',
    'Institutional Use',
    'Total Use',
    'Domestic Connections',
    'Commercial Connections',
    'Industrial Connections',
    'Institutional Connections',
    'Total Connections',
    'Peak Date',
    'Peak Demand',
    'Peak Demand Units',
    'Peak Use Include',
    'Peak Measurement Type',
    'Peak Wholesale Volume',
    'Peak Wholesale Volume Units',
]
RenameSysColumnDict = {name: name + '_Sys' for name in sysColumnNames}

df_sys = df_sys.rename(columns=RenameSysColumnDict)

# Link ID = System ID immediately followed by History Year, both as text (e.g. 1000 + 1960 -> '10001960').
sysIdText = df_sys['System ID_Sys'].astype(str)
sysYearText = df_sys['History Year_Sys'].astype(str)
df_sys['SystemID_Year_ID'] = sysIdText + sysYearText
df_sys.head(3)

Unnamed: 0,System Name_Sys,System ID_Sys,System Type_Sys,System Status_Sys,History Year_Sys,Date Received_Sys,County_Sys,Use Cooling Percent_Sys,Use Process Percent_Sys,Use Domestic Percent_Sys,Use Miscellaneous Percent_Sys,Irrigation (Lawn and Garden) Percent_Sys,Acres Irrigated_Sys,Irrigation (Agriculture)_Sys,Acres Irrigated.1_Sys,DEQ ID_Sys,Population_Sys,Domestic Use_Sys,Commercial Use_Sys,Industrial Use_Sys,Institutional Use_Sys,Total Use_Sys,Domestic Connections_Sys,Commercial Connections_Sys,Industrial Connections_Sys,Institutional Connections_Sys,Total Connections_Sys,Peak Date_Sys,Peak Demand_Sys,Peak Demand Units_Sys,Peak Use Include_Sys,Peak Measurement Type_Sys,Peak Wholesale Volume_Sys,Peak Wholesale Volume Units_Sys,SystemID_Year_ID
0,Leeds Domestic Water Users Association,1000,Public,Active,1960,12/31/1960,Washington,,,,,,,,,27010,0.0,0.0,0.0,0.0,,56.164464,0.0,0.0,0.0,,52.0,,,,,,,,10001960
1,Leeds Domestic Water Users Association,1000,Public,Active,1962,12/31/1962,Washington,,,,,,,,,27010,0.0,0.0,0.0,0.0,,49.130095,0.0,0.0,0.0,,54.0,,,,,,,,10001962
2,Leeds Domestic Water Users Association,1000,Public,Active,1963,12/31/1963,Washington,,,,,,,,,27010,0.0,0.0,0.0,0.0,,54.559327,0.0,0.0,0.0,,52.0,,,,,,,,10001963


In [7]:
# Rename Source Data Columns
# Every listed source column gets a '_Sour' suffix so it stays distinguishable
# from the system columns after the two tables are merged.  Columns not listed
# here (e.g. 'Unnamed: 25') keep their original name.
# Also adds the link column SystemID_Year_ID.
# Rows with a NULL year have to be dropped because the link ID cannot be built for them.

RenameSourColumnDict = {
'System Name' : 'System Name_Sour',
'System ID' : 'System ID_Sour',
'Year' : 'Year_Sour',
'Source ID' : 'Source ID_Sour',
'Source Name' : 'Source Name_Sour',
'Source Status' : 'Source Status_Sour',
'Lat NAD83' : 'Lat NAD83_Sour',
'Lon NAD83' : 'Lon NAD83_Sour',
'Source Type' : 'Source Type_Sour',
'Diversion Type' : 'Diversion Type_Sour',
'Use Type' : 'Use Type_Sour',
'Units' : 'Units_Sour',
'Jan' : 'Jan_Sour',
'Feb' : 'Feb_Sour',
'Mar' : 'Mar_Sour',
'Apr' : 'Apr_Sour',
'May' : 'May_Sour',
'Jun' : 'Jun_Sour',
'Jul' : 'Jul_Sour',
'Aug' : 'Aug_Sour',
'Sep' : 'Sep_Sour',
'Oct' : 'Oct_Sour',
'Nov' : 'Nov_Sour',
'Dec' : 'Dec_Sour',
'Total' : 'Total_Sour'}

df_sor = df_sor.rename(columns=RenameSourColumnDict)

# Take an explicit copy of the filtered rows: assigning new columns onto a
# boolean-filtered slice triggers pandas' SettingWithCopyWarning.
df_sor = df_sor.loc[df_sor['Year_Sour'].notna()].copy()

# Cast the year to int so its text form is '1979' and not '1979.0';
# this keeps the link ID in the same format as the one built for the system data.
df_sor['Year_Sour'] = df_sor['Year_Sour'].astype(int)
df_sor['SystemID_Year_ID'] = df_sor['System ID_Sour'].astype(str) + df_sor['Year_Sour'].astype(str)
df_sor.head(3)

Unnamed: 0,System Name_Sour,System ID_Sour,Year_Sour,Source ID_Sour,Source Name_Sour,Source Status_Sour,Lat NAD83_Sour,Lon NAD83_Sour,Source Type_Sour,Diversion Type_Sour,Use Type_Sour,Units_Sour,Jan_Sour,Feb_Sour,Mar_Sour,Apr_Sour,May_Sour,Jun_Sour,Jul_Sour,Aug_Sour,Sep_Sour,Oct_Sour,Nov_Sour,Dec_Sour,Total_Sour,Unnamed: 25,SystemID_Year_ID
0,Leeds Domestic Water Users Association,1000,1979,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,89.07225,0.0,10001979
1,Leeds Domestic Water Users Association,1000,1980,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,0.0,0.0,13.74872,8.11426,8.57208,13.19438,15.59258,10.55542,9.63224,7.81374,5.86971,9.09407,102.1872,102.1872,10001980
2,Leeds Domestic Water Users Association,1000,1981,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,5.78218,5.57348,5.39854,9.31164,9.8917,11.1623,11.38328,9.56637,7.68502,5.00263,4.02972,3.63995,88.42681,88.42681,10001981


In [8]:
# Create output dataframe.  Merge the two dataframes into one.
# Rows are matched on the shared SystemID_Year_ID link column; only keys present
# in BOTH the system and the source data are kept (inner join).
# NOTE(review): the notebook's header notes mention a left join, but this merge
# uses how='inner' -- confirm which is intended.
df = df_sys.merge(df_sor, how='inner', on='SystemID_Year_ID')
df.head(3)

Unnamed: 0,System Name_Sys,System ID_Sys,System Type_Sys,System Status_Sys,History Year_Sys,Date Received_Sys,County_Sys,Use Cooling Percent_Sys,Use Process Percent_Sys,Use Domestic Percent_Sys,Use Miscellaneous Percent_Sys,Irrigation (Lawn and Garden) Percent_Sys,Acres Irrigated_Sys,Irrigation (Agriculture)_Sys,Acres Irrigated.1_Sys,DEQ ID_Sys,Population_Sys,Domestic Use_Sys,Commercial Use_Sys,Industrial Use_Sys,Institutional Use_Sys,Total Use_Sys,Domestic Connections_Sys,Commercial Connections_Sys,Industrial Connections_Sys,Institutional Connections_Sys,Total Connections_Sys,Peak Date_Sys,Peak Demand_Sys,Peak Demand Units_Sys,Peak Use Include_Sys,Peak Measurement Type_Sys,Peak Wholesale Volume_Sys,Peak Wholesale Volume Units_Sys,SystemID_Year_ID,System Name_Sour,System ID_Sour,Year_Sour,Source ID_Sour,Source Name_Sour,Source Status_Sour,Lat NAD83_Sour,Lon NAD83_Sour,Source Type_Sour,Diversion Type_Sour,Use Type_Sour,Units_Sour,Jan_Sour,Feb_Sour,Mar_Sour,Apr_Sour,May_Sour,Jun_Sour,Jul_Sour,Aug_Sour,Sep_Sour,Oct_Sour,Nov_Sour,Dec_Sour,Total_Sour,Unnamed: 25
0,Leeds Domestic Water Users Association,1000,Public,Active,1979,3/10/1980,Washington,,,,,,,,,27010,255.0,0.0,0.0,0.0,,89.071316,0.0,0.0,0.0,,101.0,,,,,,,,10001979,Leeds Domestic Water Users Association,1000,1979,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,89.07225,0.0
1,Leeds Domestic Water Users Association,1000,Public,Active,1980,3/9/1981,Washington,,,,,,,,,27010,269.0,0.0,0.0,0.0,,102.188634,95.0,7.0,0.0,,102.0,,,,,,,,10001980,Leeds Domestic Water Users Association,1000,1980,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,0.0,0.0,13.74872,8.11426,8.57208,13.19438,15.59258,10.55542,9.63224,7.81374,5.86971,9.09407,102.1872,102.1872
2,Leeds Domestic Water Users Association,1000,Public,Active,1981,1/25/1982,Washington,,,,,,,,,27010,269.0,0.0,0.0,0.0,,88.426806,103.0,6.0,0.0,,109.0,,,,,,,,10001981,Leeds Domestic Water Users Association,1000,1981,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,5.78218,5.57348,5.39854,9.31164,9.8917,11.1623,11.38328,9.56637,7.68502,5.00263,4.02972,3.63995,88.42681,88.42681


In [9]:
# Create Purge dataframe to note data that was removed for WaDE purposes.
# It starts empty, with the same columns as the merged dataframe plus a
# 'ReasonRemoved' column describing why a row was dropped.
columnslist = list(df.columns)
dfpurge = pd.DataFrame(columns=columnslist).assign(ReasonRemoved='')  # purge DataFrame
dfpurge

Unnamed: 0,System Name_Sys,System ID_Sys,System Type_Sys,System Status_Sys,History Year_Sys,Date Received_Sys,County_Sys,Use Cooling Percent_Sys,Use Process Percent_Sys,Use Domestic Percent_Sys,Use Miscellaneous Percent_Sys,Irrigation (Lawn and Garden) Percent_Sys,Acres Irrigated_Sys,Irrigation (Agriculture)_Sys,Acres Irrigated.1_Sys,DEQ ID_Sys,Population_Sys,Domestic Use_Sys,Commercial Use_Sys,Industrial Use_Sys,Institutional Use_Sys,Total Use_Sys,Domestic Connections_Sys,Commercial Connections_Sys,Industrial Connections_Sys,Institutional Connections_Sys,Total Connections_Sys,Peak Date_Sys,Peak Demand_Sys,Peak Demand Units_Sys,Peak Use Include_Sys,Peak Measurement Type_Sys,Peak Wholesale Volume_Sys,Peak Wholesale Volume Units_Sys,SystemID_Year_ID,System Name_Sour,System ID_Sour,Year_Sour,Source ID_Sour,Source Name_Sour,Source Status_Sour,Lat NAD83_Sour,Lon NAD83_Sour,Source Type_Sour,Diversion Type_Sour,Use Type_Sour,Units_Sour,Jan_Sour,Feb_Sour,Mar_Sour,Apr_Sour,May_Sour,Jun_Sour,Jul_Sour,Aug_Sour,Sep_Sour,Oct_Sour,Nov_Sour,Dec_Sour,Total_Sour,Unnamed: 25,ReasonRemoved


In [10]:
# Convert History Year to string.
# Create WaDE Timeframe start and end date.  Assume start = 01/01 & end =  12/31 for now.

df['History Year_Sys'] = df['History Year_Sys'].astype(str)

# Parse once with an explicit month/day/year format.  The previous
# datetime -> strftime -> to_datetime round trip produced the same values and
# is dropped.  Anything that is not a valid MM/DD/YYYY date (e.g. a year that
# is not four digits) becomes NaT because of errors='coerce'.
# TimeframeEnd is created before TimeframeStart to keep the column order.
df['TimeframeEnd'] = pd.to_datetime('12/31/' + df['History Year_Sys'], format='%m/%d/%Y', errors='coerce')
df['TimeframeStart'] = pd.to_datetime('01/01/' + df['History Year_Sys'], format='%m/%d/%Y', errors='coerce')

df.head(3)

Unnamed: 0,System Name_Sys,System ID_Sys,System Type_Sys,System Status_Sys,History Year_Sys,Date Received_Sys,County_Sys,Use Cooling Percent_Sys,Use Process Percent_Sys,Use Domestic Percent_Sys,Use Miscellaneous Percent_Sys,Irrigation (Lawn and Garden) Percent_Sys,Acres Irrigated_Sys,Irrigation (Agriculture)_Sys,Acres Irrigated.1_Sys,DEQ ID_Sys,Population_Sys,Domestic Use_Sys,Commercial Use_Sys,Industrial Use_Sys,Institutional Use_Sys,Total Use_Sys,Domestic Connections_Sys,Commercial Connections_Sys,Industrial Connections_Sys,Institutional Connections_Sys,Total Connections_Sys,Peak Date_Sys,Peak Demand_Sys,Peak Demand Units_Sys,Peak Use Include_Sys,Peak Measurement Type_Sys,Peak Wholesale Volume_Sys,Peak Wholesale Volume Units_Sys,SystemID_Year_ID,System Name_Sour,System ID_Sour,Year_Sour,Source ID_Sour,Source Name_Sour,Source Status_Sour,Lat NAD83_Sour,Lon NAD83_Sour,Source Type_Sour,Diversion Type_Sour,Use Type_Sour,Units_Sour,Jan_Sour,Feb_Sour,Mar_Sour,Apr_Sour,May_Sour,Jun_Sour,Jul_Sour,Aug_Sour,Sep_Sour,Oct_Sour,Nov_Sour,Dec_Sour,Total_Sour,Unnamed: 25,TimeframeEnd,TimeframeStart
0,Leeds Domestic Water Users Association,1000,Public,Active,1979,3/10/1980,Washington,,,,,,,,,27010,255.0,0.0,0.0,0.0,,89.071316,0.0,0.0,0.0,,101.0,,,,,,,,10001979,Leeds Domestic Water Users Association,1000,1979,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,89.07225,0.0,1979-12-31,1979-01-01
1,Leeds Domestic Water Users Association,1000,Public,Active,1980,3/9/1981,Washington,,,,,,,,,27010,269.0,0.0,0.0,0.0,,102.188634,95.0,7.0,0.0,,102.0,,,,,,,,10001980,Leeds Domestic Water Users Association,1000,1980,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,0.0,0.0,13.74872,8.11426,8.57208,13.19438,15.59258,10.55542,9.63224,7.81374,5.86971,9.09407,102.1872,102.1872,1980-12-31,1980-01-01
2,Leeds Domestic Water Users Association,1000,Public,Active,1981,1/25/1982,Washington,,,,,,,,,27010,269.0,0.0,0.0,0.0,,88.426806,103.0,6.0,0.0,,109.0,,,,,,,,10001981,Leeds Domestic Water Users Association,1000,1981,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,5.78218,5.57348,5.39854,9.31164,9.8917,11.1623,11.38328,9.56637,7.68502,5.00263,4.02972,3.63995,88.42681,88.42681,1981-12-31,1981-01-01


In [11]:
# Removing empty 'Lat NAD83' fields.
# A latitude of 0 or a missing latitude is treated as "no location".  Such rows
# are copied to the purge dataframe with a reason and then removed from df.
missingLat = (df['Lat NAD83_Sour'] == 0) | (df['Lat NAD83_Sour'].isnull())  # computed once, reused below

# reset_index() (without drop) keeps each removed row's position in df as an 'index' column.
mask = df.loc[missingLat].assign(ReasonRemoved='Null Lat NAD83').reset_index()
if len(mask.index) > 0:
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
    dfpurge = pd.concat([dfpurge, mask])  # Append to purge DataFrame
    dropIndex = df.loc[missingLat].index
    df = df.drop(dropIndex)
    df = df.reset_index(drop=True)
df.head(3)

Unnamed: 0,System Name_Sys,System ID_Sys,System Type_Sys,System Status_Sys,History Year_Sys,Date Received_Sys,County_Sys,Use Cooling Percent_Sys,Use Process Percent_Sys,Use Domestic Percent_Sys,Use Miscellaneous Percent_Sys,Irrigation (Lawn and Garden) Percent_Sys,Acres Irrigated_Sys,Irrigation (Agriculture)_Sys,Acres Irrigated.1_Sys,DEQ ID_Sys,Population_Sys,Domestic Use_Sys,Commercial Use_Sys,Industrial Use_Sys,Institutional Use_Sys,Total Use_Sys,Domestic Connections_Sys,Commercial Connections_Sys,Industrial Connections_Sys,Institutional Connections_Sys,Total Connections_Sys,Peak Date_Sys,Peak Demand_Sys,Peak Demand Units_Sys,Peak Use Include_Sys,Peak Measurement Type_Sys,Peak Wholesale Volume_Sys,Peak Wholesale Volume Units_Sys,SystemID_Year_ID,System Name_Sour,System ID_Sour,Year_Sour,Source ID_Sour,Source Name_Sour,Source Status_Sour,Lat NAD83_Sour,Lon NAD83_Sour,Source Type_Sour,Diversion Type_Sour,Use Type_Sour,Units_Sour,Jan_Sour,Feb_Sour,Mar_Sour,Apr_Sour,May_Sour,Jun_Sour,Jul_Sour,Aug_Sour,Sep_Sour,Oct_Sour,Nov_Sour,Dec_Sour,Total_Sour,Unnamed: 25,TimeframeEnd,TimeframeStart
0,Leeds Domestic Water Users Association,1000,Public,Active,1979,3/10/1980,Washington,,,,,,,,,27010,255.0,0.0,0.0,0.0,,89.071316,0.0,0.0,0.0,,101.0,,,,,,,,10001979,Leeds Domestic Water Users Association,1000,1979,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,89.07225,0.0,1979-12-31,1979-01-01
1,Leeds Domestic Water Users Association,1000,Public,Active,1980,3/9/1981,Washington,,,,,,,,,27010,269.0,0.0,0.0,0.0,,102.188634,95.0,7.0,0.0,,102.0,,,,,,,,10001980,Leeds Domestic Water Users Association,1000,1980,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,0.0,0.0,13.74872,8.11426,8.57208,13.19438,15.59258,10.55542,9.63224,7.81374,5.86971,9.09407,102.1872,102.1872,1980-12-31,1980-01-01
2,Leeds Domestic Water Users Association,1000,Public,Active,1981,1/25/1982,Washington,,,,,,,,,27010,269.0,0.0,0.0,0.0,,88.426806,103.0,6.0,0.0,,109.0,,,,,,,,10001981,Leeds Domestic Water Users Association,1000,1981,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,5.78218,5.57348,5.39854,9.31164,9.8917,11.1623,11.38328,9.56637,7.68502,5.00263,4.02972,3.63995,88.42681,88.42681,1981-12-31,1981-01-01


In [12]:
# Removing empty 'Lon NAD83' fields.
# A longitude of 0 or a missing longitude is treated as "no location".  Such rows
# are copied to the purge dataframe with a reason and then removed from df.
missingLon = (df['Lon NAD83_Sour'] == 0) | (df['Lon NAD83_Sour'].isnull())  # computed once, reused below

# reset_index() (without drop) keeps each removed row's position in df as an 'index' column.
mask = df.loc[missingLon].assign(ReasonRemoved='Null Lon NAD83').reset_index()
if len(mask.index) > 0:
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
    dfpurge = pd.concat([dfpurge, mask])  # Append to purge DataFrame
    dropIndex = df.loc[missingLon].index
    df = df.drop(dropIndex)
    df = df.reset_index(drop=True)
df.head(3)

Unnamed: 0,System Name_Sys,System ID_Sys,System Type_Sys,System Status_Sys,History Year_Sys,Date Received_Sys,County_Sys,Use Cooling Percent_Sys,Use Process Percent_Sys,Use Domestic Percent_Sys,Use Miscellaneous Percent_Sys,Irrigation (Lawn and Garden) Percent_Sys,Acres Irrigated_Sys,Irrigation (Agriculture)_Sys,Acres Irrigated.1_Sys,DEQ ID_Sys,Population_Sys,Domestic Use_Sys,Commercial Use_Sys,Industrial Use_Sys,Institutional Use_Sys,Total Use_Sys,Domestic Connections_Sys,Commercial Connections_Sys,Industrial Connections_Sys,Institutional Connections_Sys,Total Connections_Sys,Peak Date_Sys,Peak Demand_Sys,Peak Demand Units_Sys,Peak Use Include_Sys,Peak Measurement Type_Sys,Peak Wholesale Volume_Sys,Peak Wholesale Volume Units_Sys,SystemID_Year_ID,System Name_Sour,System ID_Sour,Year_Sour,Source ID_Sour,Source Name_Sour,Source Status_Sour,Lat NAD83_Sour,Lon NAD83_Sour,Source Type_Sour,Diversion Type_Sour,Use Type_Sour,Units_Sour,Jan_Sour,Feb_Sour,Mar_Sour,Apr_Sour,May_Sour,Jun_Sour,Jul_Sour,Aug_Sour,Sep_Sour,Oct_Sour,Nov_Sour,Dec_Sour,Total_Sour,Unnamed: 25,TimeframeEnd,TimeframeStart
0,Leeds Domestic Water Users Association,1000,Public,Active,1979,3/10/1980,Washington,,,,,,,,,27010,255.0,0.0,0.0,0.0,,89.071316,0.0,0.0,0.0,,101.0,,,,,,,,10001979,Leeds Domestic Water Users Association,1000,1979,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,89.07225,0.0,1979-12-31,1979-01-01
1,Leeds Domestic Water Users Association,1000,Public,Active,1980,3/9/1981,Washington,,,,,,,,,27010,269.0,0.0,0.0,0.0,,102.188634,95.0,7.0,0.0,,102.0,,,,,,,,10001980,Leeds Domestic Water Users Association,1000,1980,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,0.0,0.0,13.74872,8.11426,8.57208,13.19438,15.59258,10.55542,9.63224,7.81374,5.86971,9.09407,102.1872,102.1872,1980-12-31,1980-01-01
2,Leeds Domestic Water Users Association,1000,Public,Active,1981,1/25/1982,Washington,,,,,,,,,27010,269.0,0.0,0.0,0.0,,88.426806,103.0,6.0,0.0,,109.0,,,,,,,,10001981,Leeds Domestic Water Users Association,1000,1981,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,acre feet,5.78218,5.57348,5.39854,9.31164,9.8917,11.1623,11.38328,9.56637,7.68502,5.00263,4.02972,3.63995,88.42681,88.42681,1981-12-31,1981-01-01


In [13]:
# Transfer geometry from dfshape into its own lookup table, keyed by WRID.

dfgeometry = pd.DataFrame(columns=['WRID', 'culGeometry'])  # geometry DataFrame

dfgeometry['WRID'] = dfshape['WRID']
dfgeometry['culGeometry'] = dfshape['geometry']

# drop_duplicates returns a new DataFrame; the result must be assigned back,
# otherwise the duplicate (WRID, geometry) rows are silently kept.
dfgeometry = dfgeometry.drop_duplicates(subset=['WRID', 'culGeometry'], keep='first')
dfgeometry = dfgeometry.reset_index(drop=True)
dfgeometry

Unnamed: 0,WRID,culGeometry
0,11358,"POLYGON ((-113.44926 37.60357, -113.44946 37.6..."
1,11169,"POLYGON ((-113.19369 37.69587, -113.19365 37.7..."
2,1195,"POLYGON ((-113.15329 37.71748, -113.14287 37.7..."
3,11047,"POLYGON ((-113.10387 37.74386, -113.09726 37.7..."
4,11047,"POLYGON ((-113.09066 37.76493, -113.09056 37.7..."
...,...,...
1280,11624,"POLYGON ((-112.74802 37.90889, -112.74846 37.9..."
1281,11461,"POLYGON ((-112.14716 38.64066, -112.14851 38.6..."
1282,11283,"POLYGON ((-111.48326 40.22774, -111.48372 40.2..."
1283,1452,"POLYGON ((-113.24420 37.46066, -113.24569 37.4..."


In [14]:
# Check datatypes: lift the row/column display limits for this cell only so
# the dtype of every column of df is printed.
fullDisplay = pd.option_context('display.max_columns', None, 'display.max_rows', None)
with fullDisplay:
    print(df.dtypes)

System Name_Sys                                     object
System ID_Sys                                        int64
System Type_Sys                                     object
System Status_Sys                                   object
History Year_Sys                                    object
Date Received_Sys                                   object
County_Sys                                          object
Use Cooling Percent_Sys                            float64
Use Process Percent_Sys                            float64
Use Domestic Percent_Sys                           float64
Use Miscellaneous Percent_Sys                      float64
Irrigation (Lawn and Garden) Percent_Sys           float64
Acres Irrigated_Sys                                float64
Irrigation (Agriculture)_Sys                       float64
Acres Irrigated.1_Sys                              float64
DEQ ID_Sys                                          object
Population_Sys                                     float

In [15]:
# Exporting to Finished File
# Each entry is (output file name, dataframe); all are written without the index column.
outputFiles = (
    ('P_MasterUTSiteSpecific.csv', df),    # The output.
    ('P_Geometry.csv', dfgeometry),        # The output geometry.
    ('inputDataRemoved.csv', dfpurge),     # Error check for states to see why we removed certain data.
)
for outFileName, outFrame in outputFiles:
    outFrame.to_csv(outFileName, index=False)