# Pre-processing Utah Site Specific data for WaDEQA upload.
Date Updated: 07/30/2020
Purpose:  To pre-process the Utah data into one master file for simple DataFrame creation and extraction

Notes:
1) Need to left join the System Data and Source Data.
2) Cut down on the number of columns in Source Data to reduce redundancy (Source_temp).
3) Made custom link ID by hand for both System and Source Data by combining SystemID + Year: SystemID_Year_ID

In [10]:
#Needed Libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
# Raise pandas' column display limit to 999 so wide DataFrames are shown without column truncation in the notebook.
pd.options.display.max_columns = 999

In [11]:
# Raw-input folder plus the system-data and source-data CSV file names
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Utah/SiteSpecificAmounts/RawInputData"
System_Input, Source_Input = "SystemData_temp.csv", "SourceData_temp.csv"

# Switch the process working directory so the relative file names resolve inside workingDir
os.chdir(workingDir)

# Read both inputs with ISO-8859-1 decoding: the system file first, then the source file
df_sys, df_sor = (
    pd.read_csv(csv_name, encoding="ISO-8859-1")
    for csv_name in (System_Input, Source_Input)
)

In [12]:
# Join system records to source records on the shared SystemID_Year_ID key.
# NOTE(review): how='inner' keeps only keys present in both inputs, while the notebook
# notes mention a left join -- confirm which is intended.
df = df_sys.merge(df_sor, on='SystemID_Year_ID', how='inner')
df

Unnamed: 0,SystemID_Year_ID,System Name_x,System ID_x,System Type,System Status,History Year,Date Received,County,Use Cooling Percent,Use Process Percent,Use Domestic Percent,Use Miscellaneous Percent,Irrigation (Lawn and Garden) Percent,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,Total Use,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections,Peak Date,Peak Demand,Peak Demand Units,Peak Use Include,Peak Measurement Type,Peak Wholesale Volume,Peak Wholesale Volume Units,System Name_y,System ID_y,Year,Source ID,Source Name,Source Status,Lat NAD83,Lon NAD83,Source Type,Diversion Type
0,10001979,Leeds Domestic Water Users Association,1000,Public,Active,1979,3/10/1980,Washington,,,,,,,,,27010,255.0,0.000000,0.0,0.0,,89.071316,0.0,0.0,0.0,,101.0,,,,,,,,Leeds Domestic Water Users Association,1000,1979.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal
1,10001980,Leeds Domestic Water Users Association,1000,Public,Active,1980,3/9/1981,Washington,,,,,,,,,27010,269.0,0.000000,0.0,0.0,,102.188634,95.0,7.0,0.0,,102.0,,,,,,,,Leeds Domestic Water Users Association,1000,1980.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal
2,10001981,Leeds Domestic Water Users Association,1000,Public,Active,1981,1/25/1982,Washington,,,,,,,,,27010,269.0,0.000000,0.0,0.0,,88.426806,103.0,6.0,0.0,,109.0,,,,,,,,Leeds Domestic Water Users Association,1000,1981.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal
3,10001982,Leeds Domestic Water Users Association,1000,Public,Active,1982,3/24/1983,Washington,,,,,,,,,27010,218.0,0.000000,0.0,0.0,,80.223111,0.0,0.0,0.0,,90.0,,,,,,,,Leeds Domestic Water Users Association,1000,1982.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal
4,10001983,Leeds Domestic Water Users Association,1000,Public,Active,1983,2/21/1984,Washington,,,,,,,,,27010,250.0,0.000000,0.0,0.0,,69.705318,0.0,0.0,0.0,,99.0,,,,,,,,Leeds Domestic Water Users Association,1000,1983.0,10000001,Oak Grove Spring (WS001),Active,37.309077,-113.429412,Spring,Withdrawal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27299,12412018,"National Park Service, Glen Canyon Bullfrog Re...",1241,Public,Active,2018,7/8/2019,Kane,,,,,,,,,13014,220.0,58.189881,0.0,0.0,22.447347,80.637228,125.0,0.0,0.0,9.0,134.0,8/8/2018,1.592750,acre feet,Indoor and Outdoor,,,,"National Park Service, Glen Canyon Bullfrog Re...",1241,2018.0,12410002,Well No.2 (WS002),Active,37.531396,-110.711764,Well,Withdrawal
27300,12412018,"National Park Service, Glen Canyon Bullfrog Re...",1241,Public,Active,2018,7/8/2019,Kane,,,,,,,,,13014,220.0,58.189881,0.0,0.0,22.447347,80.637228,125.0,0.0,0.0,9.0,134.0,8/8/2018,1.592750,acre feet,Indoor and Outdoor,,,,"National Park Service, Glen Canyon Bullfrog Re...",1241,2018.0,108544946,Sold to Aramark,Active,,,,Transfer Out
27301,12412019,"National Park Service, Glen Canyon Bullfrog Re...",1241,Public,Active,2019,2/28/2020,Kane,,,,,,,,,13014,220.0,33.902260,0.0,0.0,23.076971,56.979232,125.0,0.0,0.0,9.0,134.0,7/7/2019,1.117074,acre feet,Indoor and Outdoor,,0.355211,acre feet,"National Park Service, Glen Canyon Bullfrog Re...",1241,2019.0,12410001,Well No.1 (WS001),Active,37.527684,-110.717070,Well,Withdrawal
27302,12412019,"National Park Service, Glen Canyon Bullfrog Re...",1241,Public,Active,2019,2/28/2020,Kane,,,,,,,,,13014,220.0,33.902260,0.0,0.0,23.076971,56.979232,125.0,0.0,0.0,9.0,134.0,7/7/2019,1.117074,acre feet,Indoor and Outdoor,,0.355211,acre feet,"National Park Service, Glen Canyon Bullfrog Re...",1241,2019.0,12410002,Well No.2 (WS002),Active,37.531396,-110.711764,Well,Withdrawal


In [13]:
# List every column label of the merged frame (names present in both inputs carry the _x / _y merge suffixes)
df.keys()

Index(['SystemID_Year_ID', 'System Name_x', 'System ID_x', 'System Type',
       'System Status', 'History Year', 'Date Received', 'County',
       'Use Cooling Percent', 'Use Process Percent', 'Use Domestic Percent',
       'Use Miscellaneous Percent', 'Irrigation (Lawn and Garden) Percent',
       'Acres Irrigated', 'Irrigation (Agriculture)', 'Acres Irrigated.1',
       'DEQ ID', 'Population', 'Domestic Use', 'Commercial Use',
       'Industrial Use', 'Institutional Use', 'Total Use',
       'Domestic Connections', 'Commercial Connections',
       'Industrial Connections', 'Institutional Connections',
       'Total Connections', 'Peak Date', 'Peak Demand', 'Peak Demand Units',
       'Peak Use Include', 'Peak Measurement Type', 'Peak Wholesale Volume',
       'Peak Wholesale Volume Units', 'System Name_y', 'System ID_y', 'Year',
       'Source ID', 'Source Name', 'Source Status', 'Lat NAD83', 'Lon NAD83',
       'Source Type', 'Diversion Type'],
      dtype='object')

In [14]:
# Print the dtype of every column; the row/column display limits are lifted only inside this block
display_overrides = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*display_overrides):
    column_types = df.dtypes
    print(column_types)

SystemID_Year_ID                          int64
System Name_x                            object
System ID_x                               int64
System Type                              object
System Status                            object
History Year                              int64
Date Received                            object
County                                   object
Use Cooling Percent                     float64
Use Process Percent                     float64
Use Domestic Percent                    float64
Use Miscellaneous Percent               float64
Irrigation (Lawn and Garden) Percent    float64
Acres Irrigated                         float64
Irrigation (Agriculture)                float64
Acres Irrigated.1                       float64
DEQ ID                                   object
Population                              float64
Domestic Use                            float64
Commercial Use                          float64
Industrial Use                          

In [15]:
# Write the merged frame to the finished master CSV in the current working directory, leaving out the row index
output_name = 'P_MasterUTSiteSpecific.csv'
df.to_csv(output_name, index=False)  # The output