# Preprocessing Texas Site Specific data for WaDEQA upload.
- Date Updated: 01/07/2022
- Purpose: N/A

Notes:
- Working with Historical Municipal Water Intake Report for Public Water Systems by Water Planning Region reports.

In [None]:
# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd # the library that lets us read in shapefiles

# visualization
import matplotlib.pyplot as plot
import seaborn as sns

# Cleanup
from datetime import datetime
# Raise the column display cap to 999 so wide DataFrames are not truncated when rendered.
pd.options.display.max_columns = 999

In [None]:
# Working Directory and Input File
# The raw-input folder defaults to the original author's local path. Set the
# WADE_TX_RAW_INPUT_DIR environment variable to run the notebook on another machine
# without editing this cell.
workingDir = os.environ.get(
    "WADE_TX_RAW_INPUT_DIR",
    "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Texas/SiteSpecificAmounts/RawInputData")

# Relative file names used by later cells (CSV inputs and CSV exports) resolve against this folder.
os.chdir(workingDir)

# Inputs and Dataframe Creation
- Inputs for Historical Municipal data A-P.
- Export of Shapefile data.
- Bridge table for Historical Municipal to Export of Shapefile data.

In [None]:
# Historical Municipal inputs: one CSV per letter, HistoricalMunicipal_A.csv ... HistoricalMunicipal_P.csv.
# Each file is read the same way, so the sixteen reads are done in a single loop.
sourceLabels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
                'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']

frames = []
for sourceLabel in sourceLabels:
    fileInput = "HistoricalMunicipal_" + sourceLabel + ".csv"
    df_part = pd.read_csv(fileInput).replace(np.nan, "")  # missing values become empty strings
    df_part['sourceFile'] = sourceLabel  # records which input file each row came from
    print(sourceLabel, len(df_part))
    frames.append(df_part)

# Concatenate Historical Municipal Data Together
df_HM = pd.concat(frames).reset_index(drop=True)
print(len(df_HM))

In [None]:
# Export of Shapefile data
# Relative path: like the CSV inputs, it resolves against the working directory
# selected with os.chdir(workingDir) near the top of the notebook.
fileInput = "PWS_Shapefile_Export/PWS_Export.shp"
df_PWS = gpd.read_file(fileInput)
print(len(df_PWS))
df_PWS.head()

In [None]:
# Bridge Table (CSV read from the working directory)
fileInput = "20220106 PWS-SurveyNo bridge table.csv"
df_bridge = pd.read_csv(filepath_or_buffer=fileInput)
print(df_bridge.shape[0])
df_bridge.head(5)

In [None]:
# Two successive left joins, so every HistoricalMunicipal row is kept:
#   1) bridge table, on 'TWDB_Survey_No' = 'surveyNo'
#   2) shapefile export, on 'pwsCode2' = 'PWSId'
df = (
    df_HM
    .merge(df_bridge, how='left', left_on='TWDB_Survey_No', right_on='surveyNo')
    .merge(df_PWS, how='left', left_on='pwsCode2', right_on='PWSId')
)

print(df.shape[0])
df.head()

In [None]:
# Inspect the distinct 'Lat' values present in the joined frame.
df['Lat'].unique()

# Monthly and Annual Timeseries
- Exporting Monthly and Annual (total) timeseries data.

In [None]:
# Template frame shared by every timeseries slice.
# Columns marked "timeseries specific" start as empty strings and are filled in
# per reporting window by the cells that follow.
df_temp = pd.DataFrame(
    {
        # Variable Info
        'in_VariableSpecificCV': "",  # timeseries specific

        # Water Source Info
        'Organization': df['Organization'],
        'Aquifer_Source': df['Aquifer_Source'],
        'Surface_Water_Source': df['Surface_Water_Source'],
        'in_WaterSourceName': "",  # specific to name
        'in_WaterSourceTypeCV': df['Water_Type'],

        # Site Info
        'in_CoordinateMethodCV': df['Source'],
        'in_County': df['County_Used'],
        'in_Latitude': df['Lat'].astype(float),
        'in_Longitude': df['Long'].astype(float),
        'in_SiteName': df['pwsName_y'],
        'in_SiteNativeID': df['PWSId'],

        # Site Variable Amount Info
        'in_Amount': "",  # timeseries specific
        'in_CommunityWaterSupplySystem': df['pwsName_y'],
        'in_PopulationServed': df['Population_Served'],
        'in_ReportYearCV': df['Year'],
        'in_TimeframeStart': "",  # timeseries specific
        'in_TimeframeEnd': "",  # timeseries specific
    },
    index=df.index,
)

print(df_temp.shape[0])
df_temp.head(1)

In [None]:
# Monthly and annual (total) timeseries slices.
# Every slice is a copy of df_temp with the variable name, amount, reporting window
# and a text label filled in; the slices are then stacked into dfout.

def buildTimeseriesSlice(variableSpecificCV, amountColumn, timeframeStart, timeframeEnd, timeStamp):
    """Return a copy of df_temp populated for one reporting window.

    variableSpecificCV : text stored in 'in_VariableSpecificCV'
    amountColumn       : name of the df column copied into 'in_Amount'
    timeframeStart     : 'MM/DD/YYYY' text, one value per df row
    timeframeEnd       : 'MM/DD/YYYY' text, one value per df row
    timeStamp          : label stored in 'timeStamp'
    """
    df_slice = df_temp.copy()
    df_slice['in_VariableSpecificCV'] = variableSpecificCV
    df_slice['in_Amount'] = df[amountColumn]
    df_slice['in_TimeframeStart'] = timeframeStart
    df_slice['in_TimeframeEnd'] = timeframeEnd
    df_slice['timeStamp'] = timeStamp
    return df_slice


yearText = df['Year'].astype(str)

# Gregorian leap-year test per row, so February ends on the 29th where it should.
# A Year that cannot be read as a number is treated as a non-leap year.
yearNumber = pd.to_numeric(df['Year'], errors='coerce')
isLeapYear = (yearNumber % 4 == 0) & ((yearNumber % 100 != 0) | (yearNumber % 400 == 0))
februaryLastDay = pd.Series(np.where(isLeapYear, '29', '28'), index=df.index)

# (label, df amount column, month as MM, last day of the month as DD)
monthlySlices = [
    ("January", 'Jan', '01', '31'),
    ("February", 'Feb', '02', februaryLastDay),
    ("March", 'Mar', '03', '31'),
    ("April", 'Apr', '04', '30'),
    ("May", 'May', '05', '31'),
    ("June", 'Jun', '06', '30'),
    ("July", 'Jul', '07', '31'),
    ("August", 'Aug', '08', '31'),
    ("September", 'Sep', '09', '30'),
    ("October", 'Oct', '10', '31'),
    ("November", 'Nov', '11', '30'),
    ("December", 'Dec', '12', '31'),
]

frames = []
for timeStamp, amountColumn, monthText, lastDay in monthlySlices:
    df_month = buildTimeseriesSlice(
        "Intake_Monthly_MI",
        amountColumn,
        monthText + '/01/' + yearText,
        monthText + '/' + lastDay + '/' + yearText,
        timeStamp)
    print(timeStamp, len(df_month))
    frames.append(df_month)

# Annual (total)
df_Annual = buildTimeseriesSlice(
    "Intake_Annual_MI",
    'Total_Intake__Gallons_',
    '01/01/' + yearText,
    '12/31/' + yearText,
    "Annual")
print("Annual", len(df_Annual))
frames.append(df_Annual)

# Concatenate Monthly and Annual Together
dfout = pd.concat(frames).reset_index(drop=True)
print(len(dfout))

In [None]:
# Parse the MM/DD/YYYY timeframe text into datetime values (displayed as YYYY-MM-DD).
# The format is stated explicitly so parsing does not depend on format inference;
# text that does not match it (for example a missing year) becomes NaT.
for timeframeColumn in ('in_TimeframeEnd', 'in_TimeframeStart'):
    dfout[timeframeColumn] = pd.to_datetime(dfout[timeframeColumn], format='%m/%d/%Y', errors='coerce')

dfout.head(1)

In [None]:
# Create WaterSource name
# Surface Water = Surface_Water_Source, Groundwater = Aquifer_Source, Reuse = Organization.

# Fixing empty site names

def setWSN(Type, SWName, GWName, RUName):
    """Pick the water source name that matches the water source type.

    Type   : water source type; "Surface Water", "Ground Water" and "Reuse" are recognised
    SWName : name used when Type is "Surface Water"
    GWName : name used when Type is "Ground Water"
    RUName : name used when Type is "Reuse"

    Returns the selected name with surrounding whitespace removed, or "Unspecified"
    when the type is not recognised or the selected name is missing or blank.
    """
    def clean(value):
        # Missing values (None / NaN) count as blank; str() would otherwise turn
        # them into the literal names "None" / "nan".
        if pd.isnull(value):
            return ""
        return str(value).strip()

    nameByType = {
        "Surface Water": SWName,
        "Ground Water": GWName,
        "Reuse": RUName,
    }

    outString = clean(nameByType.get(clean(Type), ""))
    if outString == "":
        outString = "Unspecified"

    return outString

# Fill in_WaterSourceName row by row from the water source type and the three
# candidate name columns.
dfout['in_WaterSourceName'] = dfout.apply(
    lambda rec: setWSN(
        Type=rec['in_WaterSourceTypeCV'],
        SWName=rec['Surface_Water_Source'],
        GWName=rec['Aquifer_Source'],
        RUName=rec['Organization'],
    ),
    axis='columns',
)
print(dfout.shape[0])

In [None]:
# Update in_VariableSpecificCV to include water source type.

def changeGroundWater(WSTcv):
    """Return the stripped text of WSTcv, with "Ground Water" rewritten as 'Groundwater'."""
    cleaned = str(WSTcv).strip()
    return 'Groundwater' if cleaned == "Ground Water" else cleaned

# Helper column holding the water source type as it should appear in the variable name.
dfout['temp_WaterSourceTypeCV'] = dfout.apply(
    lambda rec: changeGroundWater(rec['in_WaterSourceTypeCV']),
    axis='columns',
)

def updateVariableSpecificCV(VScv, WSTcv):
    """Join the stripped text of VScv and WSTcv with a single underscore."""
    pieces = (str(VScv).strip(), str(WSTcv).strip())
    return "_".join(pieces)

# Append the water source type to the variable name, then discard the helper column.
# NOTE: running this cell a second time appends the suffix again.
dfout['in_VariableSpecificCV'] = dfout.apply(
    lambda rec: updateVariableSpecificCV(rec['in_VariableSpecificCV'], rec['temp_WaterSourceTypeCV']),
    axis='columns',
)
dfout = dfout.drop(columns=['temp_WaterSourceTypeCV'])
dfout['in_VariableSpecificCV'].unique()

### WaDE Custom Elements (due to missing info)

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Build a custom water source native ID: "WaDETX_WS" followed by the text of colrowValue."""
    return "WaDETX_WS" + str(colrowValue)

# Lookup table with one row per distinct in_WaterSourceTypeCV value, numbered from 1.
dfWaterSourceNativeID = dfout[['in_WaterSourceTypeCV']].drop_duplicates()

dftemp = pd.DataFrame(
    {"Count": range(1, len(dfWaterSourceNativeID.index) + 1)},
    index=dfWaterSourceNativeID.index,
)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(
    lambda rec: assignWaterSourceNativeID(rec['Count']),
    axis='columns',
)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the custom native ID for water source type A in dfWaterSourceNativeID.

    Returns '' when A is an empty string or null, or when no row matches A;
    otherwise returns the ID of the first matching row.
    """
    if (A == '') or (pd.isnull(A)):
        return ''

    isMatch = dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A
    matches = dfWaterSourceNativeID.loc[isMatch, 'in_WaterSourceNativeID']
    return '' if matches.empty else matches.iloc[0]

# Attach the custom native ID that corresponds to each row's water source type.
dfout['in_WaterSourceNativeID'] = dfout.apply(
    lambda rec: retrieveWaterSourceNativeID(rec['in_WaterSourceTypeCV']),
    axis='columns',
)
dfout['in_WaterSourceNativeID'].unique()

In [None]:
# Convert number-like text to numeric values: commas are removed first, and anything
# that still cannot be parsed becomes NaN.
for numericColumn in ('in_Amount', 'in_PopulationServed'):
    withoutCommas = dfout[numericColumn].replace(",", "", regex=True)
    dfout[numericColumn] = pd.to_numeric(withoutCommas, errors='coerce')

dfout.head()

In [None]:
# Summary (columns, non-null counts, dtypes) of the joined input frame df — not of dfout.
df.info()

## External Geometry input file

In [None]:
# Geometry export frame: the shapefile data without the attribute columns listed here.
columnsToDrop = ['pwsName', 'Status', 'Source', 'Area', 'Long', 'Lat', 'SubmitDate']
df_geo = df_PWS.drop(columns=columnsToDrop)
df_geo.head()

## Export Outputfile

In [None]:
# Exporting output files as CSV, without the index column.
outputFiles = (
    (dfout, 'P_MasterTXSiteSpecific.csv'),  # The master output.
    (df_geo, 'shapeGeometery.csv'),  # The geometry output.
)
for outputFrame, outputName in outputFiles:
    outputFrame.to_csv(outputName, index=False)