# Pre-processing Utah Site Specific data for WaDEQA upload.
Date Updated: 07/30/2020
Purpose:  To pre-process the Utah data into one master file for simple DataFrame creation and extraction

Notes:
1) Need a left join of the System Data and the Source Data.
2) Cut down the number of columns in the Source Data to reduce redundancy (Source_temp).
3) Made a custom link ID by hand for both the System and Source Data by combining SystemID + Year: SystemID_Year_ID

In [1]:
#Needed Libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
# Let Jupyter render up to 999 DataFrame columns instead of eliding the middle ones.
pd.options.display.max_columns = 999

In [9]:
#Working Directory and Input Files
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Utah/SiteSpecificAmounts/UDWR/RawInputData"
os.chdir(workingDir)  # the relative file names below resolve against this folder
System_Input, Source_Input = "SystemData_temp.csv", "SourceData_temp.csv"

#Dataframe creation: both CSV files are read with the same encoding.
csv_encoding = "ISO-8859-1"
df_sys = pd.read_csv(System_Input, encoding=csv_encoding)
df_sor = pd.read_csv(Source_Input, encoding=csv_encoding)

In [10]:
# Preview the first three rows of the system table to sanity-check the load.
df_sys.head(3)

Unnamed: 0,SystemID_Year_ID,System Name,System ID,System Type,System Status,History Year,Date Received,County,Use Cooling Percent,Use Process Percent,Use Domestic Percent,Use Miscellaneous Percent,Irrigation (Lawn and Garden) Percent,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,Total Use,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections,Peak Date,Peak Demand,Peak Demand Units,Peak Use Include,Peak Measurement Type,Peak Wholesale Volume,Peak Wholesale Volume Units
0,10001960,Leeds Domestic Water Users Association,1000,Public,Active,1960,12/31/1960,Washington,,,,,,,,,27010,0.0,0.0,0.0,0.0,,56.164464,0.0,0.0,0.0,,52.0,,,,,,,
1,10001962,Leeds Domestic Water Users Association,1000,Public,Active,1962,12/31/1962,Washington,,,,,,,,,27010,0.0,0.0,0.0,0.0,,49.130095,0.0,0.0,0.0,,54.0,,,,,,,
2,10001963,Leeds Domestic Water Users Association,1000,Public,Active,1963,12/31/1963,Washington,,,,,,,,,27010,0.0,0.0,0.0,0.0,,54.559327,0.0,0.0,0.0,,52.0,,,,,,,


In [11]:
# Preview the first three rows of the source table to sanity-check the load.
df_sor.head(3)

Unnamed: 0,SystemID_Year_ID,System Name,System ID,Year,Source ID,Source Name,Source Status,Lat NAD83,Lon NAD83,Source Type,Diversion Type,Total
0,10001979,Leeds Domestic Water Users Association,1000,1979.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,89.07225
1,10001980,Leeds Domestic Water Users Association,1000,1980.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,102.1872
2,10001981,Leeds Domestic Water Users Association,1000,1981.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,88.42681


In [12]:
#Merge
# Inner join on the shared link ID: only SystemID_Year_ID values present in BOTH tables are kept.
# Columns that exist in both inputs receive pandas' default suffixes (_x = system, _y = source).
# NOTE(review): the notes at the top of this notebook mention a left join - confirm 'inner' is intended.
df = df_sys.merge(df_sor, on='SystemID_Year_ID', how='inner')
df.head(3)

Unnamed: 0,SystemID_Year_ID,System Name_x,System ID_x,System Type,System Status,History Year,Date Received,County,Use Cooling Percent,Use Process Percent,Use Domestic Percent,Use Miscellaneous Percent,Irrigation (Lawn and Garden) Percent,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,Total Use,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections,Peak Date,Peak Demand,Peak Demand Units,Peak Use Include,Peak Measurement Type,Peak Wholesale Volume,Peak Wholesale Volume Units,System Name_y,System ID_y,Year,Source ID,Source Name,Source Status,Lat NAD83,Lon NAD83,Source Type,Diversion Type,Total
0,10001979,Leeds Domestic Water Users Association,1000,Public,Active,1979,3/10/1980,Washington,,,,,,,,,27010,255.0,0.0,0.0,0.0,,89.071316,0.0,0.0,0.0,,101.0,,,,,,,,Leeds Domestic Water Users Association,1000,1979.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,89.07225
1,10001980,Leeds Domestic Water Users Association,1000,Public,Active,1980,3/9/1981,Washington,,,,,,,,,27010,269.0,0.0,0.0,0.0,,102.188634,95.0,7.0,0.0,,102.0,,,,,,,,Leeds Domestic Water Users Association,1000,1980.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,102.1872
2,10001981,Leeds Domestic Water Users Association,1000,Public,Active,1981,1/25/1982,Washington,,,,,,,,,27010,269.0,0.0,0.0,0.0,,88.426806,103.0,6.0,0.0,,109.0,,,,,,,,Leeds Domestic Water Users Association,1000,1981.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,88.42681


In [13]:
# Duplicate 'Diversion Type' under the name 'Var_DiversionType'.
df['Var_DiversionType'] = df['Diversion Type']

# 'History Year' is stored as text from here on so it can be concatenated into date strings.
df['History Year'] = df['History Year'].astype(str)

# Each record's timeframe spans the whole calendar year: Jan 1 through Dec 31 of 'History Year'.
# The strings are built in MM/DD/YYYY form, so parse them once with that explicit format
# (no format inference, no strftime round trip). Values that do not form a valid date
# become NaT because of errors='coerce'.
# TimeframeEnd is assigned before TimeframeStart to keep the existing column order.
timeframe_format = '%m/%d/%Y'
df['TimeframeEnd'] = pd.to_datetime('12/31/' + df['History Year'],
                                    format=timeframe_format, errors='coerce')
df['TimeframeStart'] = pd.to_datetime('01/01/' + df['History Year'],
                                      format=timeframe_format, errors='coerce')

df.head(3)

Unnamed: 0,SystemID_Year_ID,System Name_x,System ID_x,System Type,System Status,History Year,Date Received,County,Use Cooling Percent,Use Process Percent,Use Domestic Percent,Use Miscellaneous Percent,Irrigation (Lawn and Garden) Percent,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,Total Use,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections,Peak Date,Peak Demand,Peak Demand Units,Peak Use Include,Peak Measurement Type,Peak Wholesale Volume,Peak Wholesale Volume Units,System Name_y,System ID_y,Year,Source ID,Source Name,Source Status,Lat NAD83,Lon NAD83,Source Type,Diversion Type,Total,Var_DiversionType,TimeframeEnd,TimeframeStart
0,10001979,Leeds Domestic Water Users Association,1000,Public,Active,1979,3/10/1980,Washington,,,,,,,,,27010,255.0,0.0,0.0,0.0,,89.071316,0.0,0.0,0.0,,101.0,,,,,,,,Leeds Domestic Water Users Association,1000,1979.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,89.07225,Withdrawal,1979-12-31,1979-01-01
1,10001980,Leeds Domestic Water Users Association,1000,Public,Active,1980,3/9/1981,Washington,,,,,,,,,27010,269.0,0.0,0.0,0.0,,102.188634,95.0,7.0,0.0,,102.0,,,,,,,,Leeds Domestic Water Users Association,1000,1980.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,102.1872,Withdrawal,1980-12-31,1980-01-01
2,10001981,Leeds Domestic Water Users Association,1000,Public,Active,1981,1/25/1982,Washington,,,,,,,,,27010,269.0,0.0,0.0,0.0,,88.426806,103.0,6.0,0.0,,109.0,,,,,,,,Leeds Domestic Water Users Association,1000,1981.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,88.42681,Withdrawal,1981-12-31,1981-01-01


In [14]:
#Keep only rows that carry a usable 'Lat NAD83' value (neither null nor 0).
latitude = df['Lat NAD83']
has_latitude = latitude.notnull() & (latitude != 0)

# Only rebuild the frame (and renumber its index) when at least one row must go.
if not has_latitude.all():
    df = df[has_latitude].reset_index(drop=True)
df.head(3)

Unnamed: 0,SystemID_Year_ID,System Name_x,System ID_x,System Type,System Status,History Year,Date Received,County,Use Cooling Percent,Use Process Percent,Use Domestic Percent,Use Miscellaneous Percent,Irrigation (Lawn and Garden) Percent,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,Total Use,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections,Peak Date,Peak Demand,Peak Demand Units,Peak Use Include,Peak Measurement Type,Peak Wholesale Volume,Peak Wholesale Volume Units,System Name_y,System ID_y,Year,Source ID,Source Name,Source Status,Lat NAD83,Lon NAD83,Source Type,Diversion Type,Total,Var_DiversionType,TimeframeEnd,TimeframeStart
0,10001979,Leeds Domestic Water Users Association,1000,Public,Active,1979,3/10/1980,Washington,,,,,,,,,27010,255.0,0.0,0.0,0.0,,89.071316,0.0,0.0,0.0,,101.0,,,,,,,,Leeds Domestic Water Users Association,1000,1979.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,89.07225,Withdrawal,1979-12-31,1979-01-01
1,10001980,Leeds Domestic Water Users Association,1000,Public,Active,1980,3/9/1981,Washington,,,,,,,,,27010,269.0,0.0,0.0,0.0,,102.188634,95.0,7.0,0.0,,102.0,,,,,,,,Leeds Domestic Water Users Association,1000,1980.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,102.1872,Withdrawal,1980-12-31,1980-01-01
2,10001981,Leeds Domestic Water Users Association,1000,Public,Active,1981,1/25/1982,Washington,,,,,,,,,27010,269.0,0.0,0.0,0.0,,88.426806,103.0,6.0,0.0,,109.0,,,,,,,,Leeds Domestic Water Users Association,1000,1981.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,88.42681,Withdrawal,1981-12-31,1981-01-01


In [15]:
#Keep only rows that carry a usable 'Lon NAD83' value (neither null nor 0).
longitude = df['Lon NAD83']
has_longitude = longitude.notnull() & (longitude != 0)

# Only rebuild the frame (and renumber its index) when at least one row must go.
if not has_longitude.all():
    df = df[has_longitude].reset_index(drop=True)
df.head(3)

Unnamed: 0,SystemID_Year_ID,System Name_x,System ID_x,System Type,System Status,History Year,Date Received,County,Use Cooling Percent,Use Process Percent,Use Domestic Percent,Use Miscellaneous Percent,Irrigation (Lawn and Garden) Percent,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,Total Use,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections,Peak Date,Peak Demand,Peak Demand Units,Peak Use Include,Peak Measurement Type,Peak Wholesale Volume,Peak Wholesale Volume Units,System Name_y,System ID_y,Year,Source ID,Source Name,Source Status,Lat NAD83,Lon NAD83,Source Type,Diversion Type,Total,Var_DiversionType,TimeframeEnd,TimeframeStart
0,10001979,Leeds Domestic Water Users Association,1000,Public,Active,1979,3/10/1980,Washington,,,,,,,,,27010,255.0,0.0,0.0,0.0,,89.071316,0.0,0.0,0.0,,101.0,,,,,,,,Leeds Domestic Water Users Association,1000,1979.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,89.07225,Withdrawal,1979-12-31,1979-01-01
1,10001980,Leeds Domestic Water Users Association,1000,Public,Active,1980,3/9/1981,Washington,,,,,,,,,27010,269.0,0.0,0.0,0.0,,102.188634,95.0,7.0,0.0,,102.0,,,,,,,,Leeds Domestic Water Users Association,1000,1980.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,102.1872,Withdrawal,1980-12-31,1980-01-01
2,10001981,Leeds Domestic Water Users Association,1000,Public,Active,1981,1/25/1982,Washington,,,,,,,,,27010,269.0,0.0,0.0,0.0,,88.426806,103.0,6.0,0.0,,109.0,,,,,,,,Leeds Domestic Water Users Association,1000,1981.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal,88.42681,Withdrawal,1981-12-31,1981-01-01


In [16]:
#Check columns: list every column name in the merged frame, including the derived
#'Var_DiversionType', 'TimeframeEnd' and 'TimeframeStart' columns added above.
df.columns

Index(['SystemID_Year_ID', 'System Name_x', 'System ID_x', 'System Type',
       'System Status', 'History Year', 'Date Received', 'County',
       'Use Cooling Percent', 'Use Process Percent', 'Use Domestic Percent',
       'Use Miscellaneous Percent', 'Irrigation (Lawn and Garden) Percent',
       'Acres Irrigated', 'Irrigation (Agriculture)', 'Acres Irrigated.1',
       'DEQ ID', 'Population', 'Domestic Use', 'Commercial Use',
       'Industrial Use', 'Institutional Use', 'Total Use',
       'Domestic Connections', 'Commercial Connections',
       'Industrial Connections', 'Institutional Connections',
       'Total Connections', 'Peak Date', 'Peak Demand', 'Peak Demand Units',
       'Peak Use Include', 'Peak Measurement Type', 'Peak Wholesale Volume',
       'Peak Wholesale Volume Units', 'System Name_y', 'System ID_y', 'Year',
       'Source ID', 'Source Name', 'Source Status', 'Lat NAD83', 'Lon NAD83',
       'Source Type', 'Diversion Type', 'Total', 'Var_DiversionType',
       '

In [17]:
#Check datatypes: print the dtype of every column.
# The display limits are lifted only inside this 'with' block so the listing is not truncated;
# the previous pandas display options are restored when the block exits.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.dtypes)

SystemID_Year_ID                                 int64
System Name_x                                   object
System ID_x                                      int64
System Type                                     object
System Status                                   object
History Year                                    object
Date Received                                   object
County                                          object
Use Cooling Percent                            float64
Use Process Percent                            float64
Use Domestic Percent                           float64
Use Miscellaneous Percent                      float64
Irrigation (Lawn and Garden) Percent           float64
Acres Irrigated                                float64
Irrigation (Agriculture)                       float64
Acres Irrigated.1                              float64
DEQ ID                                          object
Population                                     float64
Domestic U

In [18]:
#Exporting to Finished File
# Written with a relative name, so the file lands in the current working directory
# (set by os.chdir earlier in this notebook). index=False leaves the row index out of the CSV.
df.to_csv('P_MasterUTSiteSpecific.csv', index=False)  # The output