# Preprocessing Utah Site Specific data for WaDEQA upload.
- Date Updated: 06/11/2021
- Purpose:  To preprocess the UDWRi and UDWRe data into one master file for simple DataFrame creation and extraction

Notes:
- Separate out UDWRi_SystemData water use data by customer type (e.g. Domestic, Commercial, Industrial, Institutional, Total).
- Extract site data from both UDWRi_SourceData & UDWRe_CulinaryWaterServiceAreas data.
- Left outer join system data to site data.
- Create missing elements (water source ID).

In [1]:
# Needed Libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
import geopandas as gpd # the library that lets us read in shapefiles
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [2]:
# Working Directory and Input File
# The raw-input folder can be overridden with the WADE_UT_RAW_INPUT_DIR environment
# variable so the notebook can run on another machine; the original path stays the default.
workingDir = os.environ.get(
    "WADE_UT_RAW_INPUT_DIR",
    "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Utah/SiteSpecificAmounts/RawInputData")
os.chdir(workingDir)  # the bare CSV file names used below resolve against this folder

### UDWRi System Data

In [3]:
# Load the UDWRi system data export into a DataFrame and preview the first record.
sys_Input = "UDWRi_SystemData_input.csv"
df_sys = pd.read_csv(filepath_or_buffer=sys_Input, encoding="ISO-8859-1")
df_sys.head(1)

Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Use Cooling Percent,Use Process Percent,Use Domestic Percent,Use Miscellaneous Percent,Irrigation (Lawn and Garden) Percent,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,Total Use,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections,Peak Date,Peak Demand,Peak Demand Units,Peak Use Include,Peak Measurement Type,Peak Wholesale Volume,Peak Wholesale Volume Units
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,,,,,,27010,800.0,62166.25,14418.28,110.34,2749.36,79444.23,349.0,16.0,1.0,5.0,371.0,7/30/2019,309.3,kgallons,Indoor and Outdoor,,,


In [4]:
# Remove the use-percent and peak-demand columns; they are not needed for the upload.
df_sys = df_sys.drop(columns=[
    'Use Cooling Percent',
    'Use Process Percent',
    'Use Domestic Percent',
    'Use Miscellaneous Percent',
    'Irrigation (Lawn and Garden) Percent',
    'Peak Date',
    'Peak Demand',
    'Peak Demand Units',
    'Peak Use Include',
    'Peak Measurement Type',
    'Peak Wholesale Volume',
    'Peak Wholesale Volume Units',
])
df_sys.head(1)

Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,Total Use,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,14418.28,110.34,2749.36,79444.23,349.0,16.0,1.0,5.0,371.0


In [5]:
# System Data - Domestic
# DataFrame.assign returns a new frame, so the shared df_sys is not modified
# as a side effect of this cell (a plain `= df_sys` would only alias it).
df_sys_Domestic = df_sys.assign(
    WaterUse=df_sys['Domestic Use'],
    NumOfConnections=df_sys['Domestic Connections'],
    BenUse="Domestic",
    in_VariableSpecificCV="Domestic",
    in_CustomerTypeCV="Domestic")

# The selected values now live in WaterUse / NumOfConnections; drop every per-customer-type column.
df_sys_Domestic = df_sys_Domestic.drop(columns=[
    'Domestic Use', 'Commercial Use', 'Industrial Use', 'Institutional Use', 'Total Use',
    'Domestic Connections', 'Commercial Connections', 'Industrial Connections',
    'Institutional Connections', 'Total Connections'])

print(len(df_sys_Domestic))
df_sys_Domestic.head(1)

21129


Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse,in_VariableSpecificCV,in_CustomerTypeCV
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,349.0,Domestic,Domestic,Domestic


In [6]:
# System Data - Commercial
# DataFrame.assign returns a new frame, so the shared df_sys is not modified
# as a side effect of this cell (a plain `= df_sys` would only alias it).
df_sys_Commercial = df_sys.assign(
    WaterUse=df_sys['Commercial Use'],
    NumOfConnections=df_sys['Commercial Connections'],
    BenUse="Commercial",
    in_VariableSpecificCV="Commercial",
    in_CustomerTypeCV="Commercial")

# The selected values now live in WaterUse / NumOfConnections; drop every per-customer-type column.
df_sys_Commercial = df_sys_Commercial.drop(columns=[
    'Domestic Use', 'Commercial Use', 'Industrial Use', 'Institutional Use', 'Total Use',
    'Domestic Connections', 'Commercial Connections', 'Industrial Connections',
    'Institutional Connections', 'Total Connections'])

print(len(df_sys_Commercial))
df_sys_Commercial.head(1)

21129


Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse,in_VariableSpecificCV,in_CustomerTypeCV
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,14418.28,16.0,Commercial,Commercial,Commercial


In [7]:
# System Data - Industrial
# DataFrame.assign returns a new frame, so the shared df_sys is not modified
# as a side effect of this cell (a plain `= df_sys` would only alias it).
df_sys_Industrial = df_sys.assign(
    WaterUse=df_sys['Industrial Use'],
    NumOfConnections=df_sys['Industrial Connections'],
    BenUse="Industrial",
    in_VariableSpecificCV="Industrial",
    in_CustomerTypeCV="Industrial")

# The selected values now live in WaterUse / NumOfConnections; drop every per-customer-type column.
df_sys_Industrial = df_sys_Industrial.drop(columns=[
    'Domestic Use', 'Commercial Use', 'Industrial Use', 'Institutional Use', 'Total Use',
    'Domestic Connections', 'Commercial Connections', 'Industrial Connections',
    'Institutional Connections', 'Total Connections'])

print(len(df_sys_Industrial))
df_sys_Industrial.head(1)

21129


Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse,in_VariableSpecificCV,in_CustomerTypeCV
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,110.34,1.0,Industrial,Industrial,Industrial


In [8]:
# System Data - Institutional
# DataFrame.assign returns a new frame, so the shared df_sys is not modified
# as a side effect of this cell (a plain `= df_sys` would only alias it).
df_sys_Institutional = df_sys.assign(
    WaterUse=df_sys['Institutional Use'],
    NumOfConnections=df_sys['Institutional Connections'],
    BenUse="Institutional",
    in_VariableSpecificCV="Institutional",
    in_CustomerTypeCV="Institutional")

# The selected values now live in WaterUse / NumOfConnections; drop every per-customer-type column.
df_sys_Institutional = df_sys_Institutional.drop(columns=[
    'Domestic Use', 'Commercial Use', 'Industrial Use', 'Institutional Use', 'Total Use',
    'Domestic Connections', 'Commercial Connections', 'Industrial Connections',
    'Institutional Connections', 'Total Connections'])

print(len(df_sys_Institutional))
df_sys_Institutional.head(1)

21129


Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse,in_VariableSpecificCV,in_CustomerTypeCV
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,2749.36,5.0,Institutional,Institutional,Institutional


In [9]:
# System Data - Total (DCII)
# DataFrame.assign returns a new frame, so the shared df_sys is not modified
# as a side effect of this cell (a plain `= df_sys` would only alias it).
df_sys_Total = df_sys.assign(
    WaterUse=df_sys['Total Use'],
    NumOfConnections=df_sys['Total Connections'],
    BenUse="Total (DCII)",
    in_VariableSpecificCV="Total (DCII)",
    in_CustomerTypeCV="Total")

# The selected values now live in WaterUse / NumOfConnections; drop every per-customer-type column.
df_sys_Total = df_sys_Total.drop(columns=[
    'Domestic Use', 'Commercial Use', 'Industrial Use', 'Institutional Use', 'Total Use',
    'Domestic Connections', 'Commercial Connections', 'Industrial Connections',
    'Institutional Connections', 'Total Connections'])

print(len(df_sys_Total))
df_sys_Total.head(1)

21129


Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse,in_VariableSpecificCV,in_CustomerTypeCV
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,79444.23,371.0,Total (DCII),Total (DCII),Total


In [10]:
# Stack the five per-customer-type frames into one long system DataFrame.
frames = [df_sys_Domestic, df_sys_Commercial, df_sys_Industrial, df_sys_Institutional, df_sys_Total]
df_sys_all = (
    pd.concat(frames)
    .replace(np.nan, "")        # missing values become empty strings
    .drop_duplicates()
    .reset_index(drop=True)
)
print(len(df_sys_all))
df_sys_all.head(1)

105645


Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse,in_VariableSpecificCV,in_CustomerTypeCV
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,349.0,Domestic,Domestic,Domestic


### UDWRi Source Data

In [11]:
# Load the UDWRi source data export into a DataFrame and preview the first record.
sour_Input = "UDWRi_SourceData_input.csv"
df_sour = pd.read_csv(filepath_or_buffer=sour_Input, encoding="ISO-8859-1")
df_sour.head(1)

Unnamed: 0,Source ID,Source Name,Year,Source Status,System Name,System ID,Lat NAD83,Lon NAD83,Source Type,Diversion Type,Use Type,Units,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Total
0,10000001,Oak Grove Spring (WS001),,Active,Leeds Domestic Water Users Association,1000,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,,,,,,,,,,,,,,


In [12]:
# Creating Water Source Type.
# Lookup from the UDWRi 'Source Type' value to the water source type label.
wsTypeDict = {
    "Spring" : "Surface Water",
    "Well" : "Groundwater",
    "Stream" : "Surface Water",
    "Tunnel" : "Groundwater",
    "Reservoir" : "Surface Water",
    "Drain" : "Surface Water",
    "Lake" : "Surface Water"}
def assignWaterSourceTypeCV(colrowValue):
    """Translate a raw 'Source Type' value into its water source type label.

    Parameters
    ----------
    colrowValue : raw source type (e.g. "Spring", " Well "); may be missing.

    Returns
    -------
    str
        The matching wsTypeDict entry, or "" when the value is missing, blank,
        or not a key of wsTypeDict.
    """
    if pd.isnull(colrowValue):
        return ""
    # str() keeps a stray non-string value from raising on .strip(); dict.get
    # replaces the former bare `except:` that hid every kind of error.
    return wsTypeDict.get(str(colrowValue).strip(), "")

# Only 'Source Type' is read, so map that column directly instead of
# building a full row Series per record with DataFrame.apply(axis=1).
df_sour['in_WaterSourceTypeCV'] = df_sour['Source Type'].apply(assignWaterSourceTypeCV)
df_sour.head(1)

Unnamed: 0,Source ID,Source Name,Year,Source Status,System Name,System ID,Lat NAD83,Lon NAD83,Source Type,Diversion Type,Use Type,Units,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Total,in_WaterSourceTypeCV
0,10000001,Oak Grove Spring (WS001),,Active,Leeds Domestic Water Users Association,1000,37.309077,-113.429412,Spring,Withdrawal,Water Supplier,,,,,,,,,,,,,,,Surface Water


In [13]:
# Output dataframe for the UDWRi Source Data: one site row per source record, tagged as a POD.
# in_VariableSpecificCV is not set here (the 'Diversion Type' based value is left disabled):
# df_sourOut['in_VariableSpecificCV'] = "Water Use " + df_sour['Diversion Type']
df_sourOut = pd.DataFrame(index=df_sour.index).assign(
    # Key used to join back to the system data
    linkKey=df_sour['System ID'],
    # Water Source Info
    in_WaterSourceTypeCV=df_sour['in_WaterSourceTypeCV'],
    # Site Info
    in_CoordinateMethodCV="Representation Node",
    in_Latitude=df_sour['Lat NAD83'].astype(float),
    in_Longitude=df_sour['Lon NAD83'].astype(float),
    in_PODorPOUSite="POD",
    in_SiteName=df_sour['Source Name'],
    in_SiteNativeID=df_sour['Source ID'],
    in_SiteTypeCV=df_sour['Source Type'],
)

# Blank out missing values, then keep one copy of each distinct site row.
df_sourOut = (
    df_sourOut
    .replace(np.nan, "")
    .drop_duplicates()
    .reset_index(drop=True)
)
print(len(df_sourOut))
df_sourOut.head(1)

7041


Unnamed: 0,linkKey,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV
0,1000,Surface Water,Representation Node,37.309077,-113.429412,POD,Oak Grove Spring (WS001),10000001,Spring


### UDWRe Culinary Service Area Data

In [14]:
# Load the UDWRe culinary water service area export into a DataFrame and preview the first record.
csa_Input = "UDWRe_CulinaryWaterServiceAreas_input.csv"
df_csa = pd.read_csv(filepath_or_buffer=csa_Input, encoding="ISO-8859-1")
df_csa.head(1)

Unnamed: 0,FID_1,WRENAME,WRNAME,DWNAME,SYSTEMTYPE,WATERRESID,WRID,DWSYSNUM,WRLINK,WHOLESALER,LABEL,STATE,COUNTY,BASIN,SUBAREA,SUBAREANAM,LANUM,LANAME,ENDYEAR,DATASOURCE,SOURCEDATE,EDITOR,EDITDATE,SHAPE_Leng,Shape__Are,Shape__Len,Shape_Length,Shape_Area,Longitude,Latitude
0,1,Irontown,Irontown,Irontown,C,564,11358,UTAH11070,http://waterrights.utah.gov/cgi-bin/wuseview.e...,,,Utah,Iron,Cedar/Beaver,6/3/2001,Escalante Desert,06-03-01a,Escalante Desert,2019,DWRe/Supplier,5/21/2019 0:00,ADAMCLARK,10/19/2020 0:00,5567.248025,656517.818,5567.248025,5567.248025,656517.8179,-113.44603,37.602583


In [15]:
# Output dataframe for the UDWRe Culinary Service Area Data: one site row per service area, tagged as a POU.
# in_VariableSpecificCV is not set here (the fixed value is left disabled):
# df_csaOut['in_VariableSpecificCV'] = "Water Use Unspecified"
df_csaOut = pd.DataFrame(index=df_csa.index).assign(
    # Key used to join back to the system data
    linkKey=df_csa['WRID'],
    # Water Source Info
    in_WaterSourceTypeCV="Unspecified",
    # Site Info
    in_CoordinateMethodCV="Centroid of Area",
    in_Latitude=df_csa['Latitude'].astype(float),
    in_Longitude=df_csa['Longitude'].astype(float),
    in_PODorPOUSite="POU",
    in_SiteName=df_csa['WRENAME'],
    in_SiteNativeID=df_csa['WRID'],
    # SYSTEMTYPE is not used for the site type; the value is fixed instead.
    in_SiteTypeCV="Unspecified",
)

# Blank out missing values, then keep one copy of each distinct site row.
df_csaOut = (
    df_csaOut
    .replace(np.nan, "")
    .drop_duplicates()
    .reset_index(drop=True)
)
print(len(df_csaOut))
df_csaOut.head(1)

1294


Unnamed: 0,linkKey,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV
0,11358,Unspecified,Centroid of Area,37.602583,-113.44603,POU,Irontown,11358,Unspecified


### Concatenate UDWRi Source Data with UDWRe Culinary Service Area Data to create one long dataframe

In [16]:
# Stack the UDWRi source (POD) sites and the UDWRe service-area (POU) sites into one site DataFrame.
frames = [df_sourOut, df_csaOut]
df_site = (
    pd.concat(frames)
    .replace(np.nan, "")        # missing values become empty strings
    .drop_duplicates()
    .reset_index(drop=True)
)
print(len(df_site))
df_site

8335


Unnamed: 0,linkKey,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV
0,1000,Surface Water,Representation Node,37.309077,-113.429412,POD,Oak Grove Spring (WS001),10000001,Spring
1,1000,Groundwater,Representation Node,37.267014,-113.36306,POD,Leeds Well (WS002),10000002,Well
2,1000,Groundwater,Representation Node,37.265809,-113.351593,POD,"El Dorado Hills Well (8 in, 335 ft deep)",108540479,Well
3,1001,Groundwater,Representation Node,41.169959,-112.005634,POD,5190S 1050W #1 Well (WS003),10010001,Well
4,1001,Groundwater,Representation Node,41.164615,-112.005275,POD,Golf Well (6`),10010002,Well
...,...,...,...,...,...,...,...,...,...
8330,0,Unspecified,Centroid of Area,37.559081,-113.225921,POU,Harmony Mtn Ranches,0,Unspecified
8331,0,Unspecified,Centroid of Area,40.641739,-111.971124,POU,Summit Vista Water Co.,0,Unspecified
8332,11472,Unspecified,Centroid of Area,40.693258,-112.2603,POU,"BCI-LP Holdings, L.C.",11472,Unspecified
8333,11608,Unspecified,Centroid of Area,37.709078,-113.080866,POU,Staker Parsons Co. - Western Rock Products (Ce...,11608,Unspecified


### Merge UDWRi System Data with site Data
- Need to do a left outer join, then remove NULL rows

In [17]:
# Create the output dataframe: attach every site whose linkKey equals the system's 'System ID'.
# A left join keeps each system row even when no site matches.
dfout = df_sys_all.merge(
    df_site,
    how='left',
    left_on='System ID',
    right_on='linkKey',
)

dfout = (
    dfout
    .replace(np.nan, "")        # missing values become empty strings
    .drop_duplicates()
    .reset_index(drop=True)
)
print(len(dfout))
dfout

926235


Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse,in_VariableSpecificCV,in_CustomerTypeCV,linkKey,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,349.0,Domestic,Domestic,Domestic,1000,Surface Water,Representation Node,37.309077,-113.429412,POD,Oak Grove Spring (WS001),10000001,Spring
1,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,349.0,Domestic,Domestic,Domestic,1000,Groundwater,Representation Node,37.267014,-113.36306,POD,Leeds Well (WS002),10000002,Well
2,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,349.0,Domestic,Domestic,Domestic,1000,Groundwater,Representation Node,37.265809,-113.351593,POD,"El Dorado Hills Well (8 in, 335 ft deep)",108540479,Well
3,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,349.0,Domestic,Domestic,Domestic,1000,Unspecified,Centroid of Area,37.237197,-113.347061,POU,Leeds Domestic Water Users Assoc.,1000,Unspecified
4,Leeds Domestic Water Users Association,1000,Public,Active,2018,4/30/2019,Washington,,,,27010,800.0,61670.86,335.0,Domestic,Domestic,Domestic,1000,Surface Water,Representation Node,37.309077,-113.429412,POD,Oak Grove Spring (WS001),10000001,Spring
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
926230,"Magnum Holdings, LLC.",11716,Industrial,Active,2019,10/2/2020,Millard,0.0,0.0,0.0,,,,,Total (DCII),Total (DCII),Total,11716,Groundwater,Representation Node,39.488348,-112.604034,POD,Well #10,108545955,Well
926231,"Magnum Holdings, LLC.",11716,Industrial,Active,2019,10/2/2020,Millard,0.0,0.0,0.0,,,,,Total (DCII),Total (DCII),Total,11716,Groundwater,Representation Node,39.486722,-112.611788,POD,Well #11 (MH-5),108545956,Well
926232,"Magnum Holdings, LLC.",11716,Industrial,Active,2019,10/2/2020,Millard,0.0,0.0,0.0,,,,,Total (DCII),Total (DCII),Total,11716,Groundwater,Representation Node,39.494021,-112.612465,POD,Well #6 (MH-1),108545957,Well
926233,"Magnum Holdings, LLC.",11716,Industrial,Active,2019,10/2/2020,Millard,0.0,0.0,0.0,,,,,Total (DCII),Total (DCII),Total,11716,Groundwater,Representation Node,39.486279,-112.624303,POD,Well #12,108545958,Well


In [18]:
# Convert History Year to string.
# Create WaDE Timeframe start and end date.  Assume start = 01/01 & end = 12/31 for now.
dfout['History Year'] = dfout['History Year'].astype(str)  # string form is needed for the concatenation below

# Parse once with an explicit month/day/year format. The earlier
# datetime -> strftime -> datetime round trip produced the same datetime values
# and only added two extra passes over the frame. Unparseable years become NaT.
dfout['in_TimeframeEnd'] = pd.to_datetime(
    '12/31/' + dfout['History Year'], format='%m/%d/%Y', errors='coerce')
dfout['in_TimeframeStart'] = pd.to_datetime(
    '01/01/' + dfout['History Year'], format='%m/%d/%Y', errors='coerce')

dfout.head(1)

Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse,in_VariableSpecificCV,in_CustomerTypeCV,linkKey,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_TimeframeEnd,in_TimeframeStart
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,349.0,Domestic,Domestic,Domestic,1000,Surface Water,Representation Node,37.309077,-113.429412,POD,Oak Grove Spring (WS001),10000001,Spring,2019-12-31,2019-01-01


In [19]:
# # Converting Population to float value.
# dfout['Population'] = dfout['Population'].astype(float).replace(np.nan, 0, regex=True)
# dfout['Population'] = dfout['Population'].astype(int)
# dfout['Population']

## WaDE Custom Elements (due to missing info)

In [20]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source native ID: "WaDEUT_WS" followed by str(colrowValue)."""
    return "WaDEUT_WS" + str(colrowValue)

# One row per distinct water source type found in dfout (first occurrence kept).
dfWaterSourceNativeID = pd.DataFrame(
    {'in_WaterSourceTypeCV': dfout['in_WaterSourceTypeCV']}
).drop_duplicates()

# Number the distinct types 1..N in order of first appearance, then turn each number into an ID.
dftemp = pd.DataFrame(
    {"Count": range(1, len(dfWaterSourceNativeID.index) + 1)},
    index=dfWaterSourceNativeID.index,
)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp["Count"].map(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the custom native ID for water source type A in dfWaterSourceNativeID.

    Returns '' when A is empty or null, or when no row of the lookup frame has
    that water source type; otherwise returns the ID of the first matching row.
    """
    if A == '' or pd.isnull(A):
        return ''
    isMatch = dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A
    matches = dfWaterSourceNativeID.loc[isMatch, 'in_WaterSourceNativeID']
    return '' if matches.empty else matches.iloc[0]

# Only 'in_WaterSourceTypeCV' is read, so map that column directly instead of
# building a full row Series per record with DataFrame.apply(axis=1).
dfout['in_WaterSourceNativeID'] = dfout['in_WaterSourceTypeCV'].apply(retrieveWaterSourceNativeID)
dfout

Unnamed: 0,System Name,System ID,System Type,System Status,History Year,Date Received,County,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,WaterUse,NumOfConnections,BenUse,in_VariableSpecificCV,in_CustomerTypeCV,linkKey,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceNativeID
0,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,349.0,Domestic,Domestic,Domestic,1000,Surface Water,Representation Node,37.309077,-113.429412,POD,Oak Grove Spring (WS001),10000001,Spring,2019-12-31,2019-01-01,WaDEUT_WS1
1,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,349.0,Domestic,Domestic,Domestic,1000,Groundwater,Representation Node,37.267014,-113.36306,POD,Leeds Well (WS002),10000002,Well,2019-12-31,2019-01-01,WaDEUT_WS2
2,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,349.0,Domestic,Domestic,Domestic,1000,Groundwater,Representation Node,37.265809,-113.351593,POD,"El Dorado Hills Well (8 in, 335 ft deep)",108540479,Well,2019-12-31,2019-01-01,WaDEUT_WS2
3,Leeds Domestic Water Users Association,1000,Public,Active,2019,1/22/2020,Washington,,,,27010,800.0,62166.25,349.0,Domestic,Domestic,Domestic,1000,Unspecified,Centroid of Area,37.237197,-113.347061,POU,Leeds Domestic Water Users Assoc.,1000,Unspecified,2019-12-31,2019-01-01,WaDEUT_WS3
4,Leeds Domestic Water Users Association,1000,Public,Active,2018,4/30/2019,Washington,,,,27010,800.0,61670.86,335.0,Domestic,Domestic,Domestic,1000,Surface Water,Representation Node,37.309077,-113.429412,POD,Oak Grove Spring (WS001),10000001,Spring,2018-12-31,2018-01-01,WaDEUT_WS1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
926230,"Magnum Holdings, LLC.",11716,Industrial,Active,2019,10/2/2020,Millard,0.0,0.0,0.0,,,,,Total (DCII),Total (DCII),Total,11716,Groundwater,Representation Node,39.488348,-112.604034,POD,Well #10,108545955,Well,2019-12-31,2019-01-01,WaDEUT_WS2
926231,"Magnum Holdings, LLC.",11716,Industrial,Active,2019,10/2/2020,Millard,0.0,0.0,0.0,,,,,Total (DCII),Total (DCII),Total,11716,Groundwater,Representation Node,39.486722,-112.611788,POD,Well #11 (MH-5),108545956,Well,2019-12-31,2019-01-01,WaDEUT_WS2
926232,"Magnum Holdings, LLC.",11716,Industrial,Active,2019,10/2/2020,Millard,0.0,0.0,0.0,,,,,Total (DCII),Total (DCII),Total,11716,Groundwater,Representation Node,39.494021,-112.612465,POD,Well #6 (MH-1),108545957,Well,2019-12-31,2019-01-01,WaDEUT_WS2
926233,"Magnum Holdings, LLC.",11716,Industrial,Active,2019,10/2/2020,Millard,0.0,0.0,0.0,,,,,Total (DCII),Total (DCII),Total,11716,Groundwater,Representation Node,39.486279,-112.624303,POD,Well #12,108545958,Well,2019-12-31,2019-01-01,WaDEUT_WS2


In [21]:
# Converting Population to an integer value; missing, blank, or non-numeric entries become 0.
# pd.to_numeric is used instead of replace('', 0, regex=True): an empty regex
# pattern matches any string, so a numeric string such as "800" would have been zeroed.
dfout['Population'] = pd.to_numeric(dfout['Population'], errors='coerce').fillna(0).astype(int)
dfout['Population']

0         800
1         800
2         800
3         800
4         800
         ... 
926230      0
926231      0
926232      0
926233      0
926234      0
Name: Population, Length: 926235, dtype: int32

In [22]:
# Sanity check before export: print the row count, then the dtype of every column.
print(len(dfout))
# Lift the display limits only inside this block so the dtype listing is not truncated.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfout.dtypes)

926235
System Name                         object
System ID                            int64
System Type                         object
System Status                       object
History Year                        object
Date Received                       object
County                              object
Acres Irrigated                     object
Irrigation (Agriculture)            object
Acres Irrigated.1                   object
DEQ ID                              object
Population                           int32
WaterUse                            object
NumOfConnections                    object
BenUse                              object
in_VariableSpecificCV               object
in_CustomerTypeCV                   object
linkKey                              int64
in_WaterSourceTypeCV                object
in_CoordinateMethodCV               object
in_Latitude                         object
in_Longitude                        object
in_PODorPOUSite                     object
in_S

In [23]:
# Export the merged master table to CSV in the current working directory, without the index column.
dfout.to_csv(path_or_buf='P_MasterUTSiteSpecific.csv', index=False)