# Preprocessing Utah Site Specific data for WaDEQA upload.
- Date Updated: 11/14/2021
- Purpose:  To preprocess the UDWRi and UDWRe data into one master file for simple DataFrame creation and extraction

Notes:
- Will treat UDWRi System + UDWRe data as POUs, and UDWRi Source data as PODs.
- For annual data, assume start = 01/01 & end =  12/31 for now.
- Separate out the water use System data by customer type / beneficial use (e.g. Domestic, Commercial, Industrial, Institutional).  Do not use the Total fields.
- Separate out the water use Source data by monthly and again by annual.
- Create missing elements (water source type).

In [None]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [None]:
# Working Directory
# The process cwd is switched to this folder, so every relative path used below
# (input CSVs, the 'shapefile/' folder, and the exported CSVs) resolves against it.
workingDir = "G:/Shared drives/WaDE Data/Utah/SS_PublicSupplyWaterUse/RawInputData"
os.chdir(workingDir)

## Place of Use Data

#### UDWRi_SystemData_PerUse_input

In [None]:
# Input File - UDWRi_SystemData_PerUse_input.csv
fileInput = "UDWRi_SystemData_PerUse_input.csv"
dfsyspu = pd.read_csv(fileInput)

# WaDE UUID tracker for data assessment.
# When the column is absent, label each row "utsyspu<row index>" and write the labelled
# table back over the input file, so the next load already carries the column.
if 'WaDEUUID' not in dfsyspu.columns:
    dfsyspu['WaDEUUID'] = [f"utsyspu{rowLabel}" for rowLabel in dfsyspu.index]
    dfsyspu.to_csv(fileInput, index=False)

print(len(dfsyspu))
dfsyspu.head(1)

In [None]:
# clean up input data
# ---------------------------

# Keep active records that have a report year.
# .copy() detaches the filtered rows from the raw frame so that adding the 'Year'
# column below is an assignment on an independent frame (no SettingWithCopyWarning).
dfsyspu = (
    dfsyspu[dfsyspu['System Status'] == "Active"]
    .dropna(subset=['History Year'])
    .copy()
)

# Adjust data type of fields: integer year, safe now that null years are gone.
dfsyspu['Year'] = dfsyspu['History Year'].astype(int)

print(len(dfsyspu))

In [None]:
# Customer / beneficial-use categories to extract; the per-use loop reads the
# "<category> Use" column of the system data for each entry.
useTypeList = [
    'Domestic',
    'Commercial',
    'Industrial',
    'Institutional',
]

In [None]:
%%time

# Build one long table: a full copy of the system rows for every use type in useTypeList.

# Use the integer 'Year' column for all date strings.  'History Year' can be read as
# float, which would render as "2015.0" and produce unparseable dates like "12/31/2015.0".
systemYearStr = dfsyspu['Year'].astype(str)

# Pieces are gathered in a list and concatenated once, instead of re-copying the
# growing output on every loop pass.
perUseFrames = []

for useTypeString in map(str, useTypeList):
    print(useTypeString)

    dftemp = pd.DataFrame(index=dfsyspu.index)
    # Variable Info
    dftemp['in_VariableCV'] = "Delivered Water Use"
    dftemp['in_VariableSpecificCV'] = "Delivered Water Use_Annual_" + useTypeString

    # SiteVariableAmounts_fact Info
    amountUseStr = useTypeString + " Use"  # e.g. "Domestic Use"
    dftemp['in_Amount'] = dfsyspu[amountUseStr]
    # NOTE(review): this is a connection count used as population served -- confirm intended.
    dftemp['in_PopulationServed'] = dfsyspu['Domestic Connections']
    dftemp['in_BenUse'] = useTypeString
#     dftemp['in_CustomerTypeCV'] = useTypeString
    dftemp['in_ReportYearCV'] = dfsyspu['Year']
    # Annual data: the timeframe is assumed to span the whole calendar year.
    dftemp['in_TimeframeEnd'] = '12/31/' + systemYearStr
    dftemp['in_TimeframeStart'] = '01/01/' + systemYearStr

    # Water Source Info
    dftemp['in_WaterSourceTypeCV'] = "Unspecified"

    # link to site data
    dftemp['linkKey'] = dfsyspu['System ID']

    perUseFrames.append(dftemp)

# pd.concat raises on an empty list, so fall back to an empty frame.
dfsyspuOut = pd.concat(perUseFrames) if perUseFrames else pd.DataFrame()

print(len(dfsyspuOut))
dfsyspuOut.head(1)

#### UDWRi_SystemData_Total_no0Null_input

In [None]:
# Input File - UDWRi_SystemData_Total_no0Null_input.csv
fileInput = "UDWRi_SystemData_Total_no0Null_input.csv"
dfsyst = pd.read_csv(fileInput)

# WaDE UUID tracker for data assessment.
# When the column is absent, label each row "utsyst<row index>" and write the labelled
# table back over the input file, so the next load already carries the column.
if 'WaDEUUID' not in dfsyst.columns:
    dfsyst['WaDEUUID'] = [f"utsyst{rowLabel}" for rowLabel in dfsyst.index]
    dfsyst.to_csv(fileInput, index=False)

print(len(dfsyst))
dfsyst.head(1)

In [None]:
# clean up input data
# ---------------------------

# Keep active records that have a report year.
# .copy() detaches the filtered rows from the raw frame so that adding the 'Year'
# column below is an assignment on an independent frame (no SettingWithCopyWarning).
dfsyst = (
    dfsyst[dfsyst['System Status'] == "Active"]
    .dropna(subset=['History Year'])
    .copy()
)

# Adjust data type of fields: integer year, safe now that null years are gone.
dfsyst['Year'] = dfsyst['History Year'].astype(int)

print(len(dfsyst))

In [None]:
# Only the combined total is extracted from this input; the entry is used directly
# as the amount column name by the loop that follows.
useTypeList = [
    'Total Use',
]

In [None]:
%%time

# Build the system "total use" table (one copy of the system rows per entry in useTypeList).

# Use the integer 'Year' column for all date strings.  'History Year' can be read as
# float, which would render as "2015.0" and produce unparseable dates like "12/31/2015.0".
systemYearStr = dfsyst['Year'].astype(str)

# Pieces are gathered in a list and concatenated once, instead of re-copying the
# growing output on every loop pass.
totalUseFrames = []

for useTypeString in map(str, useTypeList):
    print(useTypeString)

    dftemp = pd.DataFrame(index=dfsyst.index)
    # Variable Info
    dftemp['in_VariableCV'] = "Delivered Water Use"
    dftemp['in_VariableSpecificCV'] = "Delivered Water Use_Annual_DCII"

    # SiteVariableAmounts_fact Info
    amountUseStr = useTypeString  # the list entry is already the column name
    dftemp['in_Amount'] = dfsyst[amountUseStr]
    # NOTE(review): this is a connection count used as population served -- confirm intended.
    dftemp['in_PopulationServed'] = dfsyst['Domestic Connections']
    dftemp['in_BenUse'] = "DCII"
#     dftemp['in_CustomerTypeCV'] = "DCII"
    dftemp['in_ReportYearCV'] = dfsyst['Year']
    # Annual data: the timeframe is assumed to span the whole calendar year.
    dftemp['in_TimeframeEnd'] = '12/31/' + systemYearStr
    dftemp['in_TimeframeStart'] = '01/01/' + systemYearStr

    # Water Source Info
    dftemp['in_WaterSourceTypeCV'] = "Unspecified"

    # link to site data
    dftemp['linkKey'] = dfsyst['System ID']

    totalUseFrames.append(dftemp)

# pd.concat raises on an empty list, so fall back to an empty frame.
dfsystOut = pd.concat(totalUseFrames) if totalUseFrames else pd.DataFrame()

print(len(dfsystOut))
dfsystOut.head(1)

#### UDWRe Culinary Service Area Data
- Need to tie the UDWRi system data to the UDWRe site info

In [None]:
# PoU Shapefile Data
# Shapefile input
# Read the culinary water service area shapefile (path is relative to the cwd) and
# wrap it in a plain pandas DataFrame.  The site table built later reads its
# WRID, COUNTY, Latitude, Longitude and WRENAME columns.
ShapeFileInput = gpd.read_file('shapefile/CulinaryWaterServiceAreas.shp')
dfcsa = pd.DataFrame(ShapeFileInput)
dfcsa.head(3)

In [None]:
# # Input File - StreamGageGetStationList.csv
# fileInput = "UDWRe_CulinaryWaterServiceAreas_input.csv"
# dfcsa = pd.read_csv(fileInput)

# # WaDE UUID tracker for data assessment
# if 'WaDEUUID' not in dfcsa:
#     dfcsa['WaDEUUID'] = "utcsa" + dfcsa.index.astype(str)
#     dfcsa.to_csv('UDWRe_CulinaryWaterServiceAreas_input.csv', index=False)

# print(len(dfcsa))
# dfcsa.head(1)

In [None]:
# Output dataframe for Culinary Service Area Data
# One site record per shapefile row (duplicates are dropped at the end).  linkKey holds
# the WRID and is the column the system tables are merged on.
dfcsa_Out = pd.DataFrame(
    {
        'linkKey': dfcsa['WRID'],
        # Site Info
        'in_CoordinateMethodCV': "Centroid of Area",
        'in_County': dfcsa['COUNTY'],
        'in_Latitude': dfcsa['Latitude'].astype(float),
        'in_Longitude': dfcsa['Longitude'].astype(float),
        'in_PODorPOUSite': "POU",
        'in_SiteName': dfcsa['WRENAME'],
        'in_SiteNativeID': "POU" + dfcsa['WRID'].astype(str),
        'in_SiteTypeCV': "Unspecified",
    },
    index=dfcsa.index,
)

# Blank out nulls, then collapse identical site rows and renumber.
dfcsa_Out = dfcsa_Out.replace(np.nan, "")
dfcsa_Out = dfcsa_Out.drop_duplicates().reset_index(drop=True)
print(len(dfcsa_Out))
dfcsa_Out.head(1)

In [None]:
# Merge site data to System_Per Use data
# Left join: every water-use row is kept even when no site record shares its linkKey.
dfsyspuOut = dfsyspuOut.merge(dfcsa_Out, how='left', on='linkKey')
dfsyspuOut = dfsyspuOut.replace(np.nan, "")
dfsyspuOut = dfsyspuOut.drop_duplicates().reset_index(drop=True)
print(len(dfsyspuOut))
dfsyspuOut.head()

In [None]:
# Merge site data to System_Total data
# Left join: every water-use row is kept even when no site record shares its linkKey.
dfsystOut = dfsystOut.merge(dfcsa_Out, how='left', on='linkKey')
dfsystOut = dfsystOut.replace(np.nan, "")
dfsystOut = dfsystOut.drop_duplicates().reset_index(drop=True)
print(len(dfsystOut))
dfsystOut.head()

In [None]:
# Concatenate all System Data into one long dataframe.
# Per-use rows first, then the total-use rows.
frames = [dfsyspuOut, dfsystOut]
dfsysOut = pd.concat(frames)

# Blank out nulls, drop exact duplicate rows, renumber.
dfsysOut = dfsysOut.replace(np.nan, "")
dfsysOut = dfsysOut.drop_duplicates()
dfsysOut = dfsysOut.reset_index(drop=True)
print(len(dfsysOut))
dfsysOut.head(1)

## UDWRi Source (POD) Data

#### UDWRi_SourceData_Monthly_input

In [None]:
# Input File - UDWRi_SourceData_Monthly_input.csv
fileInput = "UDWRi_SourceData_Monthly_input.csv"
dfsourm = pd.read_csv(fileInput)

# WaDE UUID tracker for data assessment.
# When the column is absent, label each row "utsourm<row index>" and write the labelled
# table back over the input file, so the next load already carries the column.
if 'WaDEUUID' not in dfsourm.columns:
    dfsourm['WaDEUUID'] = [f"utsourm{rowLabel}" for rowLabel in dfsourm.index]
    dfsourm.to_csv(fileInput, index=False)

print(len(dfsourm))
dfsourm.head(1)

In [None]:
# clean up input data
# ---------------------------

# Keep active records that have a year.
# .copy() detaches the filtered rows from the raw frame so that overwriting the 'Year'
# column below is an assignment on an independent frame (no SettingWithCopyWarning).
dfsourm = (
    dfsourm[dfsourm['Source Status'] == "Active"]
    .dropna(subset=['Year'])
    .copy()
)

# Adjust data type of fields: integer year, safe now that null years are gone.
dfsourm['Year'] = dfsourm['Year'].astype(int)

print(len(dfsourm))

In [None]:
# Loop data list.  Use this to search for specific fields.
# Diversion types to pull out of the source data.
variableTypeList = [
    'Withdrawal',
    'Transfer In',
    'Transfer Out',
    'Delivery',
    'Return',
]

# Use types to pull out of the source data.
useTypeList = [
    'Industrial', 'Irrigation', 'Domestic', 'Commercial',
    'Geothermal', 'Agricultural', 'Mining',
    'Power (Fossil-Fuel)', 'Power (Geothermal)', 'Power (Hydro-Elec)',
    'Sewage Treatment', 'Water Supplier',
]

# Monthly amount column names, with a "MM/DD/" prefix for the first and last day of each
# month (the year is appended per row later).
# NOTE: February always ends on the 28th here; leap years are not accounted for.
monthUseList = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
startDateList = [f"{monthNum:02d}/01/" for monthNum in range(1, 13)]
endDateList = [
    f"{monthNum:02d}/{lastDay}/"
    for monthNum, lastDay in enumerate((31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31), start=1)
]

In [None]:
# create time series data
# For every (diversion type, use type) pair, emit one block of rows per month column.

# Pieces are collected in a list and concatenated once at the end; calling pd.concat
# inside the innermost loop re-copies every previously built row on each pass.
monthlyFrames = []

# for each value in variableTypeList
for variableTypeStr in map(str, variableTypeList):

    # trim down input data to match variable Type
    print(variableTypeStr)
    dfsourm2 = dfsourm[dfsourm['Diversion Type'] == variableTypeStr].reset_index(drop=True)
    print("length of dfsour2 ", variableTypeStr, " is ", len(dfsourm2))

    # for each value in useTypeList
    for useTypeString in map(str, useTypeList):

        # trim down input data to match Use Type
        print("..", useTypeString)
        dfsourm3 = dfsourm2[dfsourm2['Use Type'] == useTypeString].reset_index(drop=True)
        print("..length of df ", useTypeString, " is ", len(dfsourm3))

        # Create Output for that Use Type (columns shared by all twelve months)
        dftemp2 = pd.DataFrame(index=dfsourm3.index)

        # Variable Info
        dftemp2['in_VariableCV'] = dfsourm3['Diversion Type'].astype(str)
        dftemp2['in_VariableSpecificCV'] = dfsourm3['Diversion Type'].astype(str) + "_Monthly_" + useTypeString

        # SiteVariableAmounts_fact Info
        dftemp2['in_PopulationServed'] = ""
        dftemp2['in_BenUse'] = dfsourm3['Use Type']
#         dftemp2['in_CustomerTypeCV'] = dfsourm3['Use Type']
        dftemp2['in_ReportYearCV'] = dfsourm3['Year']

        # Water Source Info
        dftemp2['in_WaterSourceTypeCV'] = dfsourm3['Source Type']

        # Site Info
        dftemp2['in_CoordinateMethodCV'] = "Representation Node"
        dftemp2['in_Latitude'] = dfsourm3['Lat NAD83'].astype(float)
        dftemp2['in_Longitude'] = dfsourm3['Lon NAD83'].astype(float)
        dftemp2['in_PODorPOUSite'] = "POD"
        dftemp2['in_SiteName'] = dfsourm3['Source Name']
        dftemp2['in_SiteNativeID'] = "POD" + dfsourm3['Source ID'].astype(str)
        dftemp2['in_SiteTypeCV'] = dfsourm3['Source Type']

        # link to other sites
        dftemp2['linkKey'] = dfsourm3['System ID']

        # Year as text, computed once per use type instead of twice per month.
        yearStr = dfsourm3['Year'].astype(str)

        # for each value in monthUseList (month column name + its start/end "MM/DD/" prefix)
        for monthUseString, startDateString, endDateString in zip(monthUseList, startDateList, endDateList):
            dftemp3 = dftemp2.copy()
            dftemp3['monthCheck'] = monthUseString
            dftemp3['in_Amount'] = dfsourm3[monthUseString]  # amount column is named after the month
            dftemp3['in_TimeframeStart'] = startDateString + yearStr
            dftemp3['in_TimeframeEnd'] = endDateString + yearStr
            monthlyFrames.append(dftemp3)

# pd.concat raises on an empty list, so fall back to an empty frame.
dfsourm_Out = pd.concat(monthlyFrames) if monthlyFrames else pd.DataFrame()

print(len(dfsourm_Out))

#### UDWRi_SourceData_Annual_no0Null_input

In [None]:
# Input File - UDWRi_SourceData_Annual_no0Null_input.csv
fileInput = "UDWRi_SourceData_Annual_no0Null_input.csv"
dfsoura = pd.read_csv(fileInput)

# WaDE UUID tracker for data assessment.
# When the column is absent, label each row "utsoura<row index>" and write the labelled
# table back over the input file, so the next load already carries the column.
if 'WaDEUUID' not in dfsoura.columns:
    dfsoura['WaDEUUID'] = [f"utsoura{rowLabel}" for rowLabel in dfsoura.index]
    dfsoura.to_csv(fileInput, index=False)

print(len(dfsoura))
dfsoura.head(1)

In [None]:
# clean up input data
# ---------------------------

# Keep active records that have a year.
# .copy() detaches the filtered rows from the raw frame so that overwriting the 'Year'
# column below is an assignment on an independent frame (no SettingWithCopyWarning).
dfsoura = (
    dfsoura[dfsoura['Source Status'] == "Active"]
    .dropna(subset=['Year'])
    .copy()
)

# Adjust data type of fields: integer year, safe now that null years are gone.
dfsoura['Year'] = dfsoura['Year'].astype(int)

print(len(dfsoura))

In [None]:
# Loop data list.  Use this to search for specific fields.
# Diversion types to pull out of the source data.
variableTypeList = [
    'Withdrawal',
    'Transfer In',
    'Transfer Out',
    'Delivery',
    'Return',
]

# Use types to pull out of the source data.
useTypeList = [
    'Industrial', 'Irrigation', 'Domestic', 'Commercial',
    'Geothermal', 'Agricultural', 'Mining',
    'Power (Fossil-Fuel)', 'Power (Geothermal)', 'Power (Hydro-Elec)',
    'Sewage Treatment', 'Water Supplier',
]

# Annual input: the only amount column read is 'Total'.
monthUseList = [
    'Total',
]

In [None]:
# create time series data
# For every (diversion type, use type) pair, emit one block of rows per amount column
# in monthUseList, each spanning the full calendar year.

# Pieces are collected in a list and concatenated once at the end; calling pd.concat
# inside the innermost loop re-copies every previously built row on each pass.
annualFrames = []

# for each value in variableTypeList
for variableTypeStr in map(str, variableTypeList):

    # trim down input data to match variable Type
    print(variableTypeStr)
    dfsoura2 = dfsoura[dfsoura['Diversion Type'] == variableTypeStr].reset_index(drop=True)
    print("length of dfsour2 ", variableTypeStr, " is ", len(dfsoura2))

    # for each value in useTypeList
    for useTypeString in map(str, useTypeList):

        # trim down input data to match Use Type
        print("..", useTypeString)
        dfsoura3 = dfsoura2[dfsoura2['Use Type'] == useTypeString].reset_index(drop=True)
        print("..length of df ", useTypeString, " is ", len(dfsoura3))

        # Create Output for that Use Type
        dftemp2 = pd.DataFrame(index=dfsoura3.index)

        # Variable Info
        dftemp2['in_VariableCV'] = dfsoura3['Diversion Type'].astype(str)
        dftemp2['in_VariableSpecificCV'] = dfsoura3['Diversion Type'].astype(str) + "_Annual_" + useTypeString

        # SiteVariableAmounts_fact Info
        dftemp2['in_PopulationServed'] = ""
        dftemp2['in_BenUse'] = dfsoura3['Use Type']
#         dftemp2['in_CustomerTypeCV'] = dfsoura3['Use Type']
        dftemp2['in_ReportYearCV'] = dfsoura3['Year']

        # Water Source Info
        dftemp2['in_WaterSourceTypeCV'] = dfsoura3['Source Type']

        # Site Info
        dftemp2['in_CoordinateMethodCV'] = "Representation Node"
        dftemp2['in_Latitude'] = dfsoura3['Lat NAD83'].astype(float)
        dftemp2['in_Longitude'] = dfsoura3['Lon NAD83'].astype(float)
        dftemp2['in_PODorPOUSite'] = "POD"
        dftemp2['in_SiteName'] = dfsoura3['Source Name']
        dftemp2['in_SiteNativeID'] = "POD" + dfsoura3['Source ID'].astype(str)
        dftemp2['in_SiteTypeCV'] = dfsoura3['Source Type']

        # link to other sites
        dftemp2['linkKey'] = dfsoura3['System ID']

        # Year as text, computed once per use type.
        yearStr = dfsoura3['Year'].astype(str)

        # for each value in monthUseList
        # (The monthly startDateList / endDateList are deliberately not read here: the
        # annual timeframe is fixed, and this cell should not depend on the monthly lists
        # having been defined by an earlier cell.)
        for monthUseString in map(str, monthUseList):
            dftemp3 = dftemp2.copy()
            dftemp3['monthCheck'] = monthUseString
            dftemp3['in_Amount'] = dfsoura3[monthUseString]  # amount column name
            dftemp3['in_TimeframeEnd'] = '12/31/' + yearStr
            dftemp3['in_TimeframeStart'] = '01/01/' + yearStr
            annualFrames.append(dftemp3)

# pd.concat raises on an empty list, so fall back to an empty frame.
dfsoura_Out = pd.concat(annualFrames) if annualFrames else pd.DataFrame()

print(len(dfsoura_Out))

In [None]:
# Concatenate all Source Data into one long dataframe.
# Monthly rows first, then the annual rows.
frames = [dfsourm_Out, dfsoura_Out]
dfsour_Out = pd.concat(frames)

# Blank out nulls, drop exact duplicate rows, renumber.
dfsour_Out = dfsour_Out.replace(np.nan, "")
dfsour_Out = dfsour_Out.drop_duplicates()
dfsour_Out = dfsour_Out.reset_index(drop=True)
print(len(dfsour_Out))
dfsour_Out.head(1)

# Concatenate System Data (POUs) with Source Data (PODs).

In [None]:
# Concatenate the System Data (POU rows) and the Source Data (POD rows) into one long dataframe.
frames = [dfsysOut, dfsour_Out]
dfout = pd.concat(frames)

# Blank out nulls, drop exact duplicate rows, renumber.
dfout = dfout.replace(np.nan, "")
dfout = dfout.drop_duplicates()
dfout = dfout.reset_index(drop=True)
print(len(dfout))
dfout.head(1)

# Clean data

In [None]:
# fix blank / null WaterSourcetypeCV
# simplify to WaDE specific categories

# Native source type -> simplified water source category.
wsTypeDict = {
    "Well" : "Groundwater",
    "Well/Spring" : "Groundwater",
    "Well Field" : "Groundwater",
    "Well/Stream" : "Groundwater",
    "Tunnel" : "Groundwater",
    "Drain" : "Groundwater",
    "Stream" : "Surface Water",
    "Spring" : "Surface Water",
    "Reservoir" : "Surface Water",
    "Lake" : "Surface Water"}

def fixWaterSourceTypeCV(valA):
    """Map a native source type to its simplified category.

    The value is converted to text and stripped before the lookup.  Anything that is
    not a key of ``wsTypeDict`` -- blanks, nulls (which stringify to "nan"/"None"),
    and unknown types -- yields "Unspecified".
    """
    # dict.get replaces the former bare `except:`, which would also have hidden
    # unrelated errors; the old pd.isnull check ran on a string and could never be true.
    return wsTypeDict.get(str(valA).strip(), "Unspecified")

# Single-column transform: Series.map calls the function once per value without building
# a row object for every record, and it also works when dfout has no rows.
dfout['in_WaterSourceTypeCV'] = dfout['in_WaterSourceTypeCV'].map(fixWaterSourceTypeCV)
dfout['in_WaterSourceTypeCV'].unique()

In [None]:
# Convert the timeframe end / start columns to datetime values.
# Unparseable text is coerced to NaT, then the result is rendered as MM/DD/YYYY and parsed again.
for dateCol in ('in_TimeframeEnd', 'in_TimeframeStart'):
    parsedDates = pd.to_datetime(dfout[dateCol], errors='coerce')
    dfout[dateCol] = pd.to_datetime(parsedDates.dt.strftime('%m/%d/%Y'))

dfout.tail(1)

In [None]:
# Converting in_PopulationServed to int value.
# Blanks, nulls and non-numeric text become NaN via errors='coerce' and are then set to 0.
# (The previous replace('', 0, regex=True) used an empty regex, which matches every
# string, so any value stored as text -- even "1200" -- was overwritten with 0.)
dfout['in_PopulationServed'] = (
    pd.to_numeric(dfout['in_PopulationServed'], errors='coerce')
    .fillna(0)
    .astype(int)
)

dfout.head(1)

In [None]:
# Converting Latitude & Longitude to float values
# Blanks, nulls and non-numeric text become NaN via errors='coerce' and are then set to 0.
# (The previous replace('', 0, regex=True) used an empty regex, which matches every
# string, so any coordinate stored as text was overwritten with 0.)
for coordCol in ('in_Latitude', 'in_Longitude'):
    dfout[coordCol] = (
        pd.to_numeric(dfout[coordCol], errors='coerce')
        .fillna(0)
        .astype(float)
    )

dfout.head(1)

In [None]:
# Converting in_Amount to float values
# issue of some values are strings.

def _cleanAmountText(val):
    """Remove '-', ',' and 'FALSE' from text amounts and strip whitespace.

    Non-text values are returned unchanged.
    """
    if not isinstance(val, str):
        return val
    for junk in ('-', ',', 'FALSE'):
        val = val.replace(junk, '')
    return val.strip()

# Clean only the text cells.  The column mixes numbers and strings, and the .str accessor
# returns NaN for every non-string element, so the former .str.strip() step discarded
# all amounts that were already numeric.
# NOTE(review): removing every '-' also drops a leading minus sign -- confirm amounts are never negative.
dfout['in_Amount'] = pd.to_numeric(dfout['in_Amount'].map(_cleanAmountText))

dfout['in_Amount'] = dfout['in_Amount'].astype(float)

dfout.head(1)

## WaDE Custom Elements (due to missing info)

In [None]:
# NOTE(review): dfinPOD is not defined anywhere in the visible notebook; this cell looks
# like a leftover scratch snippet and only runs if dfinPOD is created elsewhere.
# Split "<nativeID>-<name>" once, then take the pieces per row with .str[i]
# (a bare [i] after .str.split() selects the row labelled i, not the i-th piece).
# TODO confirm every AquiferCod value actually contains a '-'.
aquiferParts = dfinPOD['AquiferCod'].str.split('-')
dfinPOD['nativeID'] = aquiferParts.str[0]
dfinPOD['name'] = aquiferParts.str[1]

In [None]:
def extractNativeIDFunc(A):
    """Return the part of ``A`` before the first '-'.

    ``A`` is converted to text and stripped first.  When there is no '-', the whole
    stripped text is returned.
    """
    # The previous body called A.str.split(...) on a plain str, which always raised and
    # was swallowed by a bare except, so the value was returned without being split.
    return str(A).strip().split('-', 1)[0]

# Derive nativeID for every row by passing its AquiferCod through extractNativeIDFunc.
# NOTE(review): dfinPOD is not defined anywhere in the visible notebook -- confirm where
# it comes from before running this cell.
dfinPOD['nativeID'] = dfinPOD.apply(lambda row: extractNativeIDFunc(row['AquiferCod']), axis=1)
dfinPOD['nativeID'].unique()

In [None]:
# Updating VariableSpecificCV to include a water source type.

def updateVariableSpecificCV(valA, valB):
    """Join the stripped text of ``valA`` and ``valB`` with an underscore."""
    pieces = (str(valA).strip(), str(valB).strip())
    return "_".join(pieces)

# Pair the two columns value-by-value rather than building a row object for every record
# with DataFrame.apply(axis=1); this also works when dfout has no rows.
# NOTE: the column is read and overwritten in place, so re-running this cell appends the
# water source type a second time.
dfout['in_VariableSpecificCV'] = [
    updateVariableSpecificCV(specificCV, sourceTypeCV)
    for specificCV, sourceTypeCV in zip(dfout['in_VariableSpecificCV'], dfout['in_WaterSourceTypeCV'])
]
dfout['in_VariableSpecificCV'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source ID: "WaDEUT_WS" followed by the value as text."""
    return f"WaDEUT_WS{str(colrowValue)}"

# Lookup table: one row per distinct water source type found in dfout.
dfWaterSourceNativeID = dfout[['in_WaterSourceTypeCV']].drop_duplicates()

# Number the distinct types 1..N in order of first appearance and turn each number into an ID.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp["Count"].map(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the custom water source ID for water source type ``A``.

    Returns '' when ``A`` is blank or null, or when the dfWaterSourceNativeID lookup
    table has no row for it; otherwise returns the ID of the first matching row.
    """
    if (A == '') or (pd.isnull(A)):
        return ''

    isMatch = dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A
    matchedIDs = dfWaterSourceNativeID.loc[isMatch, 'in_WaterSourceNativeID']
    return '' if matchedIDs.empty else matchedIDs.iloc[0]

# Single-column lookup: Series.map calls the function once per value without building a
# row object for every record, and it also works when dfout has no rows.
dfout['in_WaterSourceNativeID'] = dfout['in_WaterSourceTypeCV'].map(retrieveWaterSourceNativeID)
dfout['in_WaterSourceNativeID'].unique()

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [None]:
# PoU Shapefile Data
# Shapefile input
# Read the culinary water service area shapefile (path is relative to the cwd) and wrap
# it in a plain pandas DataFrame; the geometry table built next uses its WRID and
# geometry columns.
ShapeFileInput = gpd.read_file('shapefile/CulinaryWaterServiceAreas.shp')
dfPoUshapetemp = pd.DataFrame(ShapeFileInput)
dfPoUshapetemp.head(3)

In [None]:
# Geometry lookup keyed by the same "POU<WRID>" site ID used for the POU site records.
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame({
    columnsList[0]: "POU" + dfPoUshapetemp['WRID'].astype(str),
    columnsList[1]: dfPoUshapetemp['geometry'],
})
# Keep the first of any fully identical rows; the original index labels are retained.
dfPoUshape = dfPoUshape.drop_duplicates()
dfPoUshape.head(3)

## Export Data

In [None]:
# Check datatype: print the row count, then the dtype of every dfout column.
# The option context lifts the display limits so no column is elided from the printout.
print(len(dfout))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfout.dtypes)

In [None]:
# Exporting output files.
# Both files are written to the current working directory, without the index column.
dfout.to_csv('P_MasterUTSiteSpecific.csv', index=False)  # The master site-specific output table.
dfPoUshape.to_csv('P_utSSGeometry.csv', index=False) # The POU geometry output (in_SiteNativeID + geometry).