# Preprocessing Texas Site Specific data for WaDEQA upload.
- Date Updated: 01/07/2022
- Purpose: N/A

Notes:
- Working with Historical Municipal Water Intake Report for Public Water Systems by Water Planning Region reports.
- <font color='red'>Temp fix of removing duplicate entries in the Historical Municipal data.</font> 

In [None]:
# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd # the library that lets us read in shapefiles

# visualization
import matplotlib.pyplot as plot
import seaborn as sns

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [None]:
# Working Directory and Input File
# The process-wide current directory is switched to this folder, so every
# relative path read or written further down resolves against it.
workingDir = "G:/Shared drives/WaDE Data/Texas/SS_PublicSupplyWaterUse/RawInputData"
os.chdir(workingDir)

# Inputs and Dataframe Creation
- Inputs for Historical Municipal data A-P.
- Inputs for PWS Water use 2016-2019
- Input of an Export of Shapefile data.
- Bridge table for Historical Municipal to Export of Shapefile data.

In [None]:
# Historical Municipal inputs, one CSV per water planning region (A-P).
# All sixteen files are read the same way, so a single helper replaces the
# sixteen copy-pasted cells. The per-region variable names are kept.
def _loadHistoricalMunicipal(region):
    """Read HistoricalMunicipal_<region>.csv and tag its rows with the region.

    NaN cells are replaced with empty strings, a 'sourceFile' column holding
    the region letter is added, and the row count is printed.

    Args:
        region: Region letter used in both the file name and 'sourceFile'.

    Returns:
        The loaded DataFrame.
    """
    fileName = "HistoricalMunicipal_" + region + ".csv"
    frame = pd.read_csv(fileName).replace(np.nan, "")
    frame['sourceFile'] = region
    print(len(frame))
    return frame


df_A = _loadHistoricalMunicipal("A")  # 1)
df_B = _loadHistoricalMunicipal("B")  # 2)
df_C = _loadHistoricalMunicipal("C")  # 3)
df_D = _loadHistoricalMunicipal("D")  # 4)
df_E = _loadHistoricalMunicipal("E")  # 5)
df_F = _loadHistoricalMunicipal("F")  # 6)
df_G = _loadHistoricalMunicipal("G")  # 7)
df_H = _loadHistoricalMunicipal("H")  # 8)
df_I = _loadHistoricalMunicipal("I")  # 9)
df_J = _loadHistoricalMunicipal("J")  # 10)
df_K = _loadHistoricalMunicipal("K")  # 11)
df_L = _loadHistoricalMunicipal("L")  # 12)
df_M = _loadHistoricalMunicipal("M")  # 13)
df_N = _loadHistoricalMunicipal("N")  # 14)
df_O = _loadHistoricalMunicipal("O")  # 15)
df_P = _loadHistoricalMunicipal("P")  # 16)

# Concatenate Historical Municipal Data Together
frames = [df_A, df_B, df_C, df_D, df_E, df_F, df_G, df_H,
          df_I, df_J, df_K, df_L, df_M, df_N, df_O, df_P]
df_HM = pd.concat(frames).reset_index(drop=True)
print(len(df_HM))

In [None]:
# PWS Water Use inputs, one CSV per report year (2016-2019).
# The four files are read identically, so one helper replaces the four
# copy-pasted cells. The per-year variable names are kept.
def _loadPwsWaterUse(year):
    """Read PWS_Categorical_Connections_and_Volumes_<year>.csv and tag its rows.

    NaN cells are replaced with empty strings, a 'sourceFile' column holding
    'pws<year>' is added, and the row count is printed.

    Args:
        year: Report year used in both the file name and 'sourceFile'.

    Returns:
        The loaded DataFrame.
    """
    fileName = "PWS_Categorical_Connections_and_Volumes_" + str(year) + ".csv"
    frame = pd.read_csv(fileName).replace(np.nan, "")
    frame['sourceFile'] = 'pws' + str(year)
    print(len(frame))
    return frame


dfpws_16 = _loadPwsWaterUse(2016)  # 1)
dfpws_17 = _loadPwsWaterUse(2017)  # 2)
dfpws_18 = _loadPwsWaterUse(2018)  # 3)
dfpws_19 = _loadPwsWaterUse(2019)  # 4)

# Concatenate PWS Water Use Data Together
frames = [dfpws_16, dfpws_17, dfpws_18, dfpws_19]
df_PWU = pd.concat(frames).reset_index(drop=True)
print(len(df_PWU))

In [None]:
# Input of an Export of Shapefile data.
# Read with geopandas; the result is joined to both datasets further down
# on its 'PWSId' column.
fileInput = "PWS_Shapefile_Export/PWS_Export.shp"
df_PWS = gpd.read_file(fileInput)
print(len(df_PWS))
df_PWS.head()

In [None]:
# Bridge Table
# Links the survey numbers of the input reports ('surveyNo') to the
# shapefile export (via 'pwsCode2'); see the joins in the next cells.
fileInput = "20220106 PWS-SurveyNO bridge table.csv"
df_bridge = pd.read_csv(fileInput)
print(len(df_bridge))
df_bridge.head()

In [None]:
# Historical Municipal -> bridge table ('TWDB_Survey_No' = 'surveyNo'),
# then -> shapefile export ('pwsCode2' = 'PWSId').
# Both are left joins, so every Historical Municipal row is kept even when
# no bridge or shapefile record matches.
df = (
    df_HM
    .merge(df_bridge, how='left', left_on='TWDB_Survey_No', right_on='surveyNo')
    .merge(df_PWS, how='left', left_on='pwsCode2', right_on='PWSId')
)

print(len(df))
df.head()

In [None]:
# PWS Water Use -> bridge table ('TWDB_Survey_Number' = 'surveyNo'),
# then -> shapefile export ('pwsCode2' = 'PWSId').
# Both are left joins, so every PWS Water Use row is kept even when no
# bridge or shapefile record matches.
df2 = (
    df_PWU
    .merge(df_bridge, how='left', left_on='TWDB_Survey_Number', right_on='surveyNo')
    .merge(df_PWS, how='left', left_on='pwsCode2', right_on='PWSId')
)

print(len(df2))
df2.head()

# (1) Intake Data: Monthly and Annual Timeseries
- Exporting Monthly and Annual (total) timeseries data.

In [None]:
# Template frame shared by every intake timeseries.
# Fields left as "" are timeseries specific (or, for the water source name,
# derived later) and are filled in by the cells that follow.
df_temp = pd.DataFrame({
    # Method Info
    'in_MethodUUID': "TXss_M1",

    # Variable Info
    'in_VariableSpecificCV': "",

    # Water Source Info
    'Organization': df['Organization'],
    'Aquifer_Source': df['Aquifer_Source'],
    'Surface_Water_Source': df['Surface_Water_Source'],
    'in_WaterSourceName': "",
    'in_WaterSourceTypeCV': df['Water_Type'],

    # Site Info
    'in_CoordinateMethodCV': df['Source'],
    'in_County': df['County_Used'],
    'in_Latitude': df['Lat'].astype(float),
    'in_Longitude': df['Long'].astype(float),
    'in_SiteName': df['pwsName_y'],
    'in_SiteNativeID': "POU" + df['PWSId'].astype(str),

    # Site Variable Amount Info
    'in_Amount': "",
    'in_BeneficialUseCategory': "Municipal",
    'in_CommunityWaterSupplySystem': df['pwsName_y'],
    'in_PopulationServed': df['Population_Served'],
    'in_ReportYearCV': df['Year'],
    'in_TimeframeStart': "",
    'in_TimeframeEnd': "",
}, index=df.index)

print(len(df_temp))
df_temp.head(1)

In [None]:
# Monthly (Jan-Dec) and annual intake timeseries.
# Each frame is a copy of df_temp with only the timeseries-specific fields
# filled in, so one helper replaces the thirteen copy-pasted cells.
def _buildIntakeFrame(amountColumn, startSuffix, endSuffix, label,
                      variableSpecificCV="Intake_Monthly_MI"):
    """Return a copy of df_temp filled in for one intake timeseries.

    Args:
        amountColumn: Column of df holding the amounts for this timeseries.
        startSuffix: "-MM-DD" text appended to the report year for the start.
        endSuffix: "-MM-DD" text appended to the report year for the end;
            either one string or a per-row Series aligned with df.
        label: Value stored in the 'timeStamp' column.
        variableSpecificCV: Value stored in 'in_VariableSpecificCV'.

    Returns:
        The populated DataFrame (its row count is also printed).
    """
    frame = df_temp.copy()
    frame['in_VariableSpecificCV'] = variableSpecificCV
    frame['in_Amount'] = df[amountColumn]
    frame['in_TimeframeStart'] = df['Year'].astype(str) + startSuffix
    frame['in_TimeframeEnd'] = df['Year'].astype(str) + endSuffix
    frame['timeStamp'] = label
    print(len(frame))
    return frame


# February ends on the 29th in leap years, so its end date is chosen per row.
# Years that cannot be read as numbers fall back to the 28th.
reportYear = pd.to_numeric(df['Year'], errors='coerce')
isLeapYear = (reportYear % 4 == 0) & ((reportYear % 100 != 0) | (reportYear % 400 == 0))
febEndSuffix = pd.Series("-02-28", index=df.index).mask(isLeapYear, "-02-29")

# NOTE(review): "Feburary" is misspelled but kept byte-for-byte because
# 'timeStamp' is written to the output file; confirm nothing matches on it
# before correcting the spelling.
df_Jan = _buildIntakeFrame('Jan', "-01-01", "-01-31", "January")    # 1)
df_Feb = _buildIntakeFrame('Feb', "-02-01", febEndSuffix, "Feburary")  # 2)
df_Mar = _buildIntakeFrame('Mar', "-03-01", "-03-31", "March")      # 3)
df_Apr = _buildIntakeFrame('Apr', "-04-01", "-04-30", "April")      # 4)
df_May = _buildIntakeFrame('May', "-05-01", "-05-31", "May")        # 5)
df_Jun = _buildIntakeFrame('Jun', "-06-01", "-06-30", "June")       # 6)
df_Jul = _buildIntakeFrame('Jul', "-07-01", "-07-31", "July")       # 7)
df_Aug = _buildIntakeFrame('Aug', "-08-01", "-08-31", "August")     # 8)
df_Sep = _buildIntakeFrame('Sep', "-09-01", "-09-30", "September")  # 9)
df_Oct = _buildIntakeFrame('Oct', "-10-01", "-10-31", "October")    # 10)
df_Nov = _buildIntakeFrame('Nov', "-11-01", "-11-30", "November")   # 11)
df_Dec = _buildIntakeFrame('Dec', "-12-01", "-12-31", "December")   # 12)

# 13) Annual (total)
df_Annual = _buildIntakeFrame('Total_Intake__Gallons_', "-01-01", "-12-31", "Annual",
                              variableSpecificCV="Intake_Annual_MI")

# Concatenate Monthly and Annual Intake Together
frames = [df_Jan, df_Feb, df_Mar, df_Apr, df_May, df_Jun,
          df_Jul, df_Aug, df_Sep, df_Oct, df_Nov, df_Dec, df_Annual]
dfout_intake = pd.concat(frames).reset_index(drop=True)
print(len(dfout_intake))

In [None]:
# dropping duplicate
# Rows that agree on every field below count as the same record; only the
# first occurrence is kept.
# NOTE(review): sites without a shapefile match presumably all share the ID
# "POUnan" and would be collapsed together here - confirm this is intended.
dropList = [
    'in_VariableSpecificCV',
    'in_WaterSourceTypeCV',
    'in_SiteNativeID',
    'in_BeneficialUseCategory',
    'in_ReportYearCV',
    'in_TimeframeStart',
    'in_TimeframeEnd',
]
dfout_intake = dfout_intake.drop_duplicates(subset=dropList, keep='first')
dfout_intake = dfout_intake.reset_index(drop=True)
print(len(dfout_intake))

In [None]:
# Create WaterSource name
# Surface Water = Surface_Water_Source, Groundwater = Aquifer_Source, Reuse = Organization.

# Fixing empty site names

def setWSN(Type, SWName, GWName, RUName):
    """Pick the water source name that belongs to the given water source type.

    All arguments are converted to text and stripped of surrounding
    whitespace before use.

    Args:
        Type: Water source type ("Surface Water", "Ground Water" or "Reuse").
        SWName: Name used when Type is "Surface Water".
        GWName: Name used when Type is "Ground Water".
        RUName: Name used when Type is "Reuse".

    Returns:
        The matching name, or "Unspecified" when the type is not one of the
        three above or the matching name is empty.
    """
    nameByType = {
        "Surface Water": str(SWName).strip(),
        "Ground Water": str(GWName).strip(),
        "Reuse": str(RUName).strip(),
    }
    chosenName = nameByType.get(str(Type).strip(), "Unspecified")
    return chosenName if chosenName != "" else "Unspecified"

# Derive the water source name for every row from its type and the three
# candidate name columns. Walking the columns in lockstep avoids building a
# row object per record (as DataFrame.apply(axis=1) does) and still yields
# a plain empty column when the frame has no rows.
dfout_intake['in_WaterSourceName'] = [
    setWSN(sourceType, surfaceName, aquiferName, organization)
    for sourceType, surfaceName, aquiferName, organization in zip(
        dfout_intake['in_WaterSourceTypeCV'],
        dfout_intake['Surface_Water_Source'],
        dfout_intake['Aquifer_Source'],
        dfout_intake['Organization'],
    )
]
print(len(dfout_intake))

In [None]:
# Update in_VariableSpecificCV to include water source type.

def changeGroundWater(WSTcv):
    """Return the water source type with "Ground Water" spelled "Groundwater".

    The value is converted to text and stripped first; any other value is
    returned in that stripped form.
    """
    cleaned = str(WSTcv).strip()
    return 'Groundwater' if cleaned == "Ground Water" else cleaned

# Temporary column holding the normalised water source type. Only one input
# column is needed, so Series.map replaces the row-wise DataFrame.apply; it
# is faster and returns an empty Series for an empty frame.
dfout_intake['temp_WaterSourceTypeCV'] = dfout_intake['in_WaterSourceTypeCV'].map(changeGroundWater)

def updateVariableSpecificCV(VScv, WSTcv):
    """Append the water source type to the variable-specific name.

    Both values are converted to text and stripped, then joined with "_".
    """
    return str(VScv).strip() + "_" + str(WSTcv).strip()

# Suffix every variable-specific name with its normalised water source type,
# then discard the temporary column. The two columns are walked in lockstep
# rather than through a row-wise DataFrame.apply, which is faster and still
# yields a plain empty column when the frame has no rows.
dfout_intake['in_VariableSpecificCV'] = [
    updateVariableSpecificCV(variableCV, sourceTypeCV)
    for variableCV, sourceTypeCV in zip(
        dfout_intake['in_VariableSpecificCV'],
        dfout_intake['temp_WaterSourceTypeCV'],
    )
]
dfout_intake = dfout_intake.drop(['temp_WaterSourceTypeCV'], axis=1)
dfout_intake['in_VariableSpecificCV'].unique()

# (2) Public Water System Data: Annual Timeseries
- Nine time series to use:
    - SingleFamily
    - MultiFamily
    - Commercial
    - Industrial
    - Institutional
    - Agriculture
    - Reuse
    - Total Metered
    - Total Unmetered

In [None]:
# Annual PWS water-use timeseries, one frame per use category.
# The nine frames differ only in the category name and the two source
# columns, so one helper replaces the nine copy-pasted cells.
def _buildPwsUseFrame(category, volumeColumn, connectionsColumn):
    """Build the annual water-use frame for one use category from df2.

    Args:
        category: Category name; stored in 'in_BeneficialUseCategory' and
            embedded in 'in_VariableSpecificCV'.
        volumeColumn: Column of df2 copied into 'in_Amount'.
        connectionsColumn: Column of df2 copied into 'in_PopulationServed'
            (these are connection counts in the source data).

    Returns:
        The populated DataFrame (its row count is also printed).
    """
    frame = pd.DataFrame(index=df2.index)

    # Method Info
    frame['in_MethodUUID'] = "TXss_M2"

    # Variable Info
    frame['in_VariableSpecificCV'] = "Water Use_Annual_" + category + "_Unspecified"

    # Water Source Info
    frame['in_WaterSourceName'] = "Unspecified"
    frame['in_WaterSourceTypeCV'] = "Unspecified"

    # Site Info
    frame['in_CoordinateMethodCV'] = df2['Source']
    frame['in_County'] = ""  # should be okay to leave blank as we use SiteNativeID from shapefile to identify sites
    frame['in_Latitude'] = df2['Lat'].astype(float)
    frame['in_Longitude'] = df2['Long'].astype(float)
    frame['in_SiteName'] = df2['pwsName_y']
    # Same "POU" + PWSId form as the intake records and the exported
    # geometry, so a site carries one ID across all outputs.
    frame['in_SiteNativeID'] = "POU" + df2['PWSId'].astype(str)

    # Site Variable Amount Info
    frame['in_Amount'] = df2[volumeColumn]  # Timeseries specific.
    frame['in_BeneficialUseCategory'] = category  # Timeseries specific.
    frame['in_CommunityWaterSupplySystem'] = df2['pwsName_y']
    frame['in_PopulationServed'] = df2[connectionsColumn]  # Timeseries specific.
    frame['in_ReportYearCV'] = df2['Year']
    frame['in_TimeframeStart'] = df2['Year'].astype(str) + "-01-01"
    frame['in_TimeframeEnd'] = df2['Year'].astype(str) + "-12-31"

    print(len(frame))
    return frame


# 1) PWS SingleFamily
dfpwu_sf = _buildPwsUseFrame("SingleFamily", 'Single_Family_Volume', 'Single_Family_Connections')
# 2) PWS MultiFamily
dfpwu_mf = _buildPwsUseFrame("MultiFamily", 'Multi_Family_Volume', 'Multi_Family_Connections')
# 3) PWS Commercial
dfpwu_c = _buildPwsUseFrame("Commercial", 'Commercial_Volume', 'Commercial_Connections')
# 4) PWS Industrial
dfpwu_i = _buildPwsUseFrame("Industrial", 'Industrial_Volume', 'Industrial_Connections')
# 5) PWS Institutional
dfpwu_ins = _buildPwsUseFrame("Institutional", 'Institutional_Volume', 'Institutional_Connections')
# 6) PWS Agriculture
dfpwu_ag = _buildPwsUseFrame("Agriculture", 'Agriculture_Volume', 'Agriculture_Connections')
# 7) PWS Reuse
dfpwu_re = _buildPwsUseFrame("Reuse", 'Reuse_Volume', 'Reuse_Connections')
# 8) PWS TotalMetered
dfpwu_tm = _buildPwsUseFrame("TotalMetered", 'Total_Metered_Volume', 'Total_Metered_Connections')
# 9) PWS TotalUnmetered
dfpwu_tum = _buildPwsUseFrame("TotalUnmetered", 'Unmetered_Volume', 'Total_Unmetered_Connections')

# Concatenate PWS Water Use Together
frames = [dfpwu_sf, dfpwu_mf, dfpwu_c, dfpwu_i, dfpwu_ins,
          dfpwu_ag, dfpwu_re, dfpwu_tm, dfpwu_tum]
dfout_pwu = pd.concat(frames).reset_index(drop=True)
print(len(dfout_pwu))

## Concatenate Intake with PWS outputs data together

In [None]:
# Concatenate Intake and PWS Water Use outputs together
frames = [dfout_intake, dfout_pwu]
dfout = pd.concat(frames).reset_index(drop=True)
print(len(dfout))  # row count of the combined output, not of one input

## Cleaning Output & WaDE Custom Elements (due to missing info)

In [None]:
# Convert the timeframe start/end text (YYYY-MM-DD) to datetime values.
# Entries that cannot be parsed become NaT instead of raising. Formatting the
# parsed dates back to text and parsing them again gives the same values, so
# that second pass is omitted.
for timeframeColumn in ('in_TimeframeEnd', 'in_TimeframeStart'):
    dfout[timeframeColumn] = pd.to_datetime(dfout[timeframeColumn], errors='coerce')

dfout.head()

## WaDE Custom Elements (due to missing info)

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the WaDE custom water source ID: "WaDETX_WS" + the given value as text."""
    return "WaDETX_WS" + str(colrowValue)

# One row per distinct (water source name, water source type) pair found in
# the output, keeping the index of each pair's first occurrence.
dfWaterSourceNativeID = dfout[['in_WaterSourceName', 'in_WaterSourceTypeCV']].drop_duplicates().copy()

# Number the pairs 1..N in order of appearance and turn each number into an ID.
dftemp = pd.DataFrame({"Count": range(1, len(dfWaterSourceNativeID) + 1)},
                      index=dfWaterSourceNativeID.index)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp['Count'].map(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A, B):
    """Look up the custom water source ID for a (name, type) pair.

    Args:
        A: Water source name.
        B: Water source type.

    Returns:
        The 'in_WaterSourceNativeID' of the first row of the module-level
        dfWaterSourceNativeID table whose name equals A and whose type
        equals B. Returns '' when both values are empty strings, both are
        null, or no row matches.
    """
    bothBlank = A == '' and B == ''
    if bothBlank or (pd.isnull(A) and pd.isnull(B)):
        return ''

    lookup = dfWaterSourceNativeID
    sameName = lookup['in_WaterSourceName'] == A
    sameType = lookup['in_WaterSourceTypeCV'] == B
    matches = lookup.loc[sameName & sameType, 'in_WaterSourceNativeID']
    return '' if matches.empty else matches.iloc[0]

# Attach the custom water source ID to every output row.
# The lookup scans the whole ID table, so its answer is remembered per
# (name, type) pair and computed once per distinct pair instead of once per
# row. A pair that never compares equal to a stored key (e.g. one holding
# NaN) is simply looked up again, so results match the uncached lookup.
waterSourceIdCache = {}


def _cachedWaterSourceNativeID(name, typeCV):
    """Return retrieveWaterSourceNativeID(name, typeCV), remembering the answer."""
    key = (name, typeCV)
    if key not in waterSourceIdCache:
        waterSourceIdCache[key] = retrieveWaterSourceNativeID(name, typeCV)
    return waterSourceIdCache[key]


dfout['in_WaterSourceNativeID'] = [
    _cachedWaterSourceNativeID(name, typeCV)
    for name, typeCV in zip(dfout['in_WaterSourceName'], dfout['in_WaterSourceTypeCV'])
]
dfout['in_WaterSourceNativeID'].unique()

In [None]:
# Converting numbers that are in string to float.
# Commas are removed first; whatever still cannot be read as a number
# becomes NaN.
for numericColumn in ('in_Amount', 'in_PopulationServed'):
    withoutCommas = dfout[numericColumn].replace(",", "", regex=True)
    dfout[numericColumn] = pd.to_numeric(withoutCommas, errors='coerce')

dfout.head()

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [None]:
# PoU Shapefile Data
# Shapefile input
# Same shapefile export that was read earlier into df_PWS; it is read again
# here and wrapped in a plain pandas DataFrame for the geometry output.
ShapeFileInput = gpd.read_file('PWS_Shapefile_Export/PWS_Export.shp')
dfPoUshapetemp = pd.DataFrame(ShapeFileInput)
dfPoUshapetemp.head(3)

In [None]:
# Pair each site's native ID ("POU" + PWSId, the same form the intake
# records use) with its geometry, then drop fully identical rows.
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame({
    'in_SiteNativeID': "POU" + dfPoUshapetemp['PWSId'].astype(str),
    'geometry': dfPoUshapetemp['geometry'],
}, columns=columnsList)
dfPoUshape = dfPoUshape.drop_duplicates()
dfPoUshape.head(3)

## Export Outputfile

In [None]:
# Print the columns, dtypes and non-null counts of the final output as a
# last check before it is exported.
dfout.info()

In [None]:
# Exporting output files.
# Both files are written without the index column, to paths relative to the
# working directory set at the top of the notebook.
dfout.to_csv('P_MasterTXSiteSpecific.csv', index=False)  # The master output.
dfPoUshape.to_csv('P_txSSGeometry.csv', index=False) # The output geometry.