# Pre-processing CA Site-Specific Public Supply Time Series Water Use data for WaDE Upload
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [None]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
# Notebook display settings: show every column and print floats in fixed-point notation.
pd.set_option('display.max_columns', 999)  # high enough that no column is hidden when a DataFrame is rendered
pd.set_option('display.float_format', '{:.5f}'.format)  # five decimals, no scientific notation

In [None]:
# ---- working directory ----
# Every relative path used in the cells below (e.g. "RawInputData/...") resolves against this folder.
workingDirString = "G:/Shared drives/WaDE Data/California/SS_PublicSupplyWaterUse" # set working directory folder string here
os.chdir(workingDirString)
print(f'The working Directory is: {workingDirString}')

## Data Input

In [None]:
# Input File - delivered water (EAR 2013-2016)
fileInput = "RawInputData/delivered-water-public-system-water-reported-in-the-electronic-annual-report-ear-2013-2016.zip"
dfin1 = pd.read_csv(fileInput)
dfin1 = dfin1.replace(np.nan, "")  # blank strings instead of NaN

# WaDE UUID tracker for data assessment.
# When the column is absent, tag each row with "in1" + its row index and
# write the tagged table back over the input zip.
if 'WaDEUUID' not in dfin1.columns:
    dfin1['WaDEUUID'] = "in1" + dfin1.index.astype(str)
    dfin1.to_csv(
        fileInput,
        compression=dict(method='zip', archive_name="delivered-water-public-system-water-reported-in-the-electronic-annual-report-ear-2013-2016.csv"),
        index=False)

# PWSID is the merge key used by later cells; keep it as stripped text.
dfin1['PWSID'] = dfin1['PWSID'].astype(str).str.strip()
print(len(dfin1))
dfin1.head(1)

In [None]:
# Input File - produced water (EAR 2013-2016)
fileInput = "RawInputData/produced-water-public-water-system-reported-in-the-electronic-annual-report-ear-2013-2016.zip"
dfin2 = pd.read_csv(fileInput)
dfin2 = dfin2.replace(np.nan, "")  # blank strings instead of NaN

# WaDE UUID tracker for data assessment.
# When the column is absent, tag each row with "in2" + its row index and
# write the tagged table back over the input zip.
if 'WaDEUUID' not in dfin2.columns:
    dfin2['WaDEUUID'] = "in2" + dfin2.index.astype(str)
    dfin2.to_csv(
        fileInput,
        compression=dict(method='zip', archive_name="produced-water-public-water-system-reported-in-the-electronic-annual-report-ear-2013-2016.csv"),
        index=False)

# PWSID is the merge key used by later cells; keep it as stripped text.
dfin2['PWSID'] = dfin2['PWSID'].astype(str).str.strip()
print(len(dfin2))
dfin2.head(1)

In [None]:
# Input File - Drinking Water Watch - Public Water System facilities (DWWPWSF)
fileInput = "RawInputData/Drinking Water Watch - Public Water System facilities.zip"
dfin3 = pd.read_csv(fileInput)
dfin3 = dfin3.replace(np.nan, "")  # blank strings instead of NaN

# WaDE UUID tracker for data assessment.
# When the column is absent, tag each row with "in3" + its row index and
# write the tagged table back over the input zip.
if 'WaDEUUID' not in dfin3.columns:
    dfin3['WaDEUUID'] = "in3" + dfin3.index.astype(str)
    dfin3.to_csv(
        fileInput,
        compression=dict(method='zip', archive_name="Drinking Water Watch - Public Water System facilities.csv"),
        index=False)

# 'Water System No' is matched against PWSID by later cells; keep it as stripped text.
dfin3['Water System No'] = dfin3['Water System No'].astype(str).str.strip()
print(len(dfin3))
dfin3.head(1)

In [None]:
# Input File - California_Drinking_Water_System_Area_Boundaries (CDWSAB) shp file info
shapefileInput = "RawInputData/California_Drinking_Water_System_Area_Boundaries.zip" # zipped folder of the shp file
dfPoUshapetemp = gpd.read_file(shapefileInput).replace(np.nan, "")
dfPoUshapetemp['SABL_PWSID'] = dfPoUshapetemp['SABL_PWSID'].astype(str).str.strip()

dfPoUshapetemp['geometry'] = dfPoUshapetemp['geometry'].to_crs(epsg=4326) # Realign Geometry Projection

# Compute the centroids once and reuse them for both coordinate columns.
# NOTE(review): the centroids are taken on EPSG:4326 (geographic) coordinates; geopandas
# normally warns that such centroids may be inaccurate - confirm this is acceptable here.
# The column name "cent_lattitude" (sic) is kept because later cells select it by that spelling.
centroidPoints = dfPoUshapetemp.centroid
dfPoUshapetemp["cent_lattitude"] = centroidPoints.y
dfPoUshapetemp["cent_longitude"] = centroidPoints.x
dfPoUshapetemp.head(1)

## Work with Data

### delivered Data

In [None]:
# left merge DWWPWSF to delivered ts
# left merge CDWSAB to delivered ts
# Each lookup table is first reduced to its distinct rows, so exact-duplicate lookup rows
# do not multiply the time series rows during the merge (the printed row count can
# therefore be smaller; such duplicate rows are dropped by drop_duplicates() later anyway).
dwwLookup = dfin3[['Water System No', 'Primary Water Source Type', 'State Water System Type']].drop_duplicates()
sabLookup = dfPoUshapetemp[['SABL_PWSID', 'WATER_SY_1', 'BOUNDARY_T', 'COUNTY', 'cent_lattitude', 'cent_longitude']].drop_duplicates()

dftemp = dfin1.merge(dwwLookup, left_on='PWSID', right_on='Water System No', how='left').replace(np.nan, "")
dftemp = dftemp.merge(sabLookup, left_on='PWSID', right_on='SABL_PWSID', how='left').replace(np.nan, "")

print(len(dftemp))
dftemp.head(1)

In [None]:
# Loop data list.  Use this to search for specific fields.
# Each pair is (amount column in the delivered data, beneficial use label written to the output);
# the two lists derived below stay index-aligned.
deliveredFieldPairs = [
    ('WATER DELIVERIES TO Single.family.Residential', 'Single Family Residential'),
    ('WATER DELIVERIES TO Multi.family.Residential', 'Multi Family Residential'),
    ('WATER DELIVERIES TO Commercial.Institutional', 'Commercial / Institutional'),
    ('WATER DELIVERIES TO Industrial', 'Industrial'),
    ('WATER DELIVERIES TO Landscape.Irrigation', 'Landscape Irrigation'),
    ('WATER DELIVERIES TO Other', 'Other'),
    ('WATER DELIVERIES TO Agricultural', 'Agricultural'),
    ('WATER DELIVERIES TO Other.PWS', 'Other PWS'),
]

amountList = [amountField for amountField, _ in deliveredFieldPairs]
benUseList = [useLabel for _, useLabel in deliveredFieldPairs]

In [None]:
# create output POU dataframe for the delivered data:
# one block of rows per delivered-water category, stacked at the end
categoryFrames = []

# for each (amount column, beneficial use label) pair
for amountField, benUse in zip(amountList, benUseList):

    df = pd.DataFrame()

    # Data Assessment UUID (first assignment, so df takes dftemp's index)
    df['WaDEUUID'] = dftemp['WaDEUUID']

    # Method Info
    df['in_MethodUUID'] = "CAssps_M1"

    # Variable Info
    df['in_VariableCV'] = "Cumulative Delivered"
    df['in_AggregationIntervalUnitCV'] = "Monthly"
    df['in_VariableSpecificUUID'] = "" # will create sa portion below

    # Organization Info
    df['in_OrganizationUUID'] = "CAssps_O1"

    # WaterSource Info
    df['in_Geometry'] = ""  # single blank geometry column; set once here
    df['in_GNISFeatureNameCV'] = ""
    df['in_WaterQualityIndicatorCV'] = "Fresh"
    df['in_WaterSourceName'] = "WaDE Blank" # need this for auto fill below
    df['in_WaterSourceNativeID'] = "" # auto fill in below
    df['in_WaterSourceTypeCV'] = dftemp['Primary Water Source Type']

    # Site Info
    df['in_CoordinateAccuracy'] = ""
    df['in_CoordinateMethodCV'] = "Centroid of Area"
    df['in_County'] = dftemp['COUNTY']
    df['in_EPSGCodeCV'] = 4326
    df['in_GNISCodeCV'] = ""
    df['in_HUC12'] = ""
    df['in_HUC8'] = ""
    df['in_Latitude'] = dftemp['cent_lattitude']
    df['in_Longitude'] = dftemp['cent_longitude']
    df['in_NHDNetworkStatusCV'] = ""
    df['in_NHDProductCV'] = ""
    df['in_PODorPOUSite'] = "POU"
    df['in_SiteName'] = dftemp['WATER_SY_1']
    df['in_SiteNativeID'] = dftemp['SABL_PWSID'].astype(str)
    df['in_SitePoint'] = ""
    df['in_SiteTypeCV'] = dftemp['BOUNDARY_T']
    df['in_StateCV'] = "CA"
    df['in_USGSSiteID'] = ""

    # Site VariableAmounts Info
    df['temp1_WaterUnits'] = dftemp['Delivered.Water.Units AS ORIGINALLY REPORTED']
    df['temp2_WaterUnits'] = dftemp['Delivered.Water.Units.Revised BY OFFICE OF INFORMATION MANAGEMENT AND ANALYSIS'] # to check units on amounts
    df['temp_unitCheck'] = dftemp['UNITS ADJUSTED BY OIMA?']
    df['in_Amount'] = dftemp[amountField]

    df['in_AllocationCropDutyAmount'] = ""
    df['in_AssociatedNativeAllocationIDs'] = ""
    df['in_BeneficialUseCategory'] = benUse
    df['in_CommunityWaterSupplySystem'] = dftemp['Water.System.Name']
    df['in_CropTypeCV'] = ""
    df['in_CustomerTypeCV'] = dftemp['State Water System Type']
    df['in_DataPublicationDate'] = ""
    df['in_DataPublicationDOI'] = ""
    df['in_IrrigatedAcreage'] = ""
    df['in_IrrigationMethodCV'] = ""
    df['in_PopulationServed'] = dftemp['Population Of Service Area']
    df['in_PowerGeneratedGWh'] = ""
    df['in_PowerType'] = ""
    df['in_PrimaryUseCategory'] = "" # auto fill in below
    df['in_ReportYearCV'] =  dftemp['Year'].replace("", 0).fillna(0).astype(int).astype(str)
    df['in_SDWISIdentifier'] = ""
    df['temp_Month'] = dftemp['Month'] #temp to get string of month name
    df['temp_DaysInMonth'] = dftemp['Days.In.Month']  #temp to get last data of that month
    df['in_TimeframeEnd'] = "" # will fix below using Month and DaysInMonth
    df['in_TimeframeStart'] = dftemp['Date']

    categoryFrames.append(df)

# Stack all categories with a single concat instead of re-copying the growing frame on every pass.
outdf1 = pd.concat(categoryFrames) if categoryFrames else pd.DataFrame()
outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

### produced Data

In [None]:
# left merge DWWPWSF to produced ts
# left merge CDWSAB to produced ts
# Each lookup table is first reduced to its distinct rows, so exact-duplicate lookup rows
# do not multiply the time series rows during the merge (the printed row count can
# therefore be smaller; such duplicate rows are dropped by drop_duplicates() later anyway).
dwwLookup = dfin3[['Water System No', 'Primary Water Source Type', 'State Water System Type']].drop_duplicates()
sabLookup = dfPoUshapetemp[['SABL_PWSID', 'WATER_SY_1', 'BOUNDARY_T', 'COUNTY', 'cent_lattitude', 'cent_longitude']].drop_duplicates()

dftemp = dfin2.merge(dwwLookup, left_on='PWSID', right_on='Water System No', how='left').replace(np.nan, "")
dftemp = dftemp.merge(sabLookup, left_on='PWSID', right_on='SABL_PWSID', how='left').replace(np.nan, "")

print(len(dftemp))
dftemp.head(1)

In [None]:
# Loop data list.  Use this to search for specific fields.
# Each pair is (amount column in the produced data, beneficial use label written to the output);
# the two lists derived below stay index-aligned.
# Column names are reproduced exactly as listed, including the spelling 'FINSIHIED'.
producedFieldPairs = [
    ('WATER PRODUCED FROM GROUNDWATER', 'Produced from Groundwater'),
    ('WATER PRODUCED FROM SURFACE WATER', 'Produced from Surface Water'),
    ('FINSIHIED WATER PURCHASED OR RECEIVED FROM ANOTHER PUBLIC WATER SYSTEM', 'Purchased from another PWS'),
    ('WATER SOLD TO ANOTHER PUBLIC WATER SYSTEM', 'Sold to another PWS'),
    ('Non-Potable Produced Water (EXCLUDING RECYCLING)', 'Non-Potable'),
    ('RECYCLED WATER PRODUCED', 'Recycled'),
]

amountList = [amountField for amountField, _ in producedFieldPairs]
benUseList = [useLabel for _, useLabel in producedFieldPairs]

In [None]:
# create output POU dataframe for the produced data:
# one block of rows per produced-water category, stacked at the end
categoryFrames = []

# for each (amount column, beneficial use label) pair
for amountField, benUse in zip(amountList, benUseList):

    df = pd.DataFrame()

    # Data Assessment UUID (first assignment, so df takes dftemp's index)
    df['WaDEUUID'] = dftemp['WaDEUUID']

    # Method Info
    df['in_MethodUUID'] = "CAssps_M1"

    # Variable Info
    df['in_VariableCV'] = "Cumulative Produced"
    df['in_AggregationIntervalUnitCV'] = "Monthly"
    df['in_VariableSpecificUUID'] = "" # will create sa portion below

    # Organization Info
    df['in_OrganizationUUID'] = "CAssps_O1"

    # WaterSource Info
    df['in_Geometry'] = ""  # single blank geometry column; set once here
    df['in_GNISFeatureNameCV'] = ""
    df['in_WaterQualityIndicatorCV'] = "Fresh"
    df['in_WaterSourceName'] = "WaDE Blank" # need this for auto fill below
    df['in_WaterSourceNativeID'] = "" # auto fill in below
    df['in_WaterSourceTypeCV'] = dftemp['Primary Water Source Type']

    # Site Info
    df['in_CoordinateAccuracy'] = ""
    df['in_CoordinateMethodCV'] = "Centroid of Area"
    df['in_County'] = dftemp['COUNTY']
    df['in_EPSGCodeCV'] = 4326
    df['in_GNISCodeCV'] = ""
    df['in_HUC12'] = ""
    df['in_HUC8'] = ""
    df['in_Latitude'] = dftemp['cent_lattitude']
    df['in_Longitude'] = dftemp['cent_longitude']
    df['in_NHDNetworkStatusCV'] = ""
    df['in_NHDProductCV'] = ""
    df['in_PODorPOUSite'] = "POU"
    df['in_SiteName'] = dftemp['WATER_SY_1']
    df['in_SiteNativeID'] = dftemp['SABL_PWSID'].astype(str)
    df['in_SitePoint'] = ""
    df['in_SiteTypeCV'] = dftemp['BOUNDARY_T']
    df['in_StateCV'] = "CA"
    df['in_USGSSiteID'] = ""

    # Site VariableAmounts Info
    df['temp1_WaterUnits'] = dftemp['WATER PRODUCED Water.Units IN UNITS ORIGINALLY REPORTED']
    df['temp2_WaterUnits'] = dftemp['WATER PRODUCED Water.Units REVIEWED BY OFFICE OF INFORMATION MANAGEMENT AND ANALYSIS'] # to check units on amounts
    df['temp_unitCheck'] = dftemp['UNITS ADJUSTED BY OIMA?']
    df['in_Amount'] = dftemp[amountField]

    df['in_AllocationCropDutyAmount'] = ""
    df['in_AssociatedNativeAllocationIDs'] = ""
    df['in_BeneficialUseCategory'] = benUse
    df['in_CommunityWaterSupplySystem'] = dftemp['Water.System.Name']
    df['in_CropTypeCV'] = ""
    df['in_CustomerTypeCV'] = dftemp['State Water System Type']
    df['in_DataPublicationDate'] = ""
    df['in_DataPublicationDOI'] = ""
    df['in_IrrigatedAcreage'] = ""
    df['in_IrrigationMethodCV'] = ""
    df['in_PopulationServed'] = dftemp['Population Of Service Area']
    df['in_PowerGeneratedGWh'] = ""
    df['in_PowerType'] = ""
    df['in_PrimaryUseCategory'] = "" # auto fill in below
    df['in_ReportYearCV'] =  dftemp['Year'].replace("", 0).fillna(0).astype(int).astype(str)
    df['in_SDWISIdentifier'] = ""
    df['temp_Month'] = dftemp['Month'] #temp to get string of month name
    df['temp_DaysInMonth'] = dftemp['Days.In.Month']  #temp to get last data of that month
    df['in_TimeframeEnd'] = "" # will fix below using Month and DaysInMonth
    df['in_TimeframeStart'] = dftemp['Date']

    categoryFrames.append(df)

# Stack all categories with a single concat instead of re-copying the growing frame on every pass.
outdf2 = pd.concat(categoryFrames) if categoryFrames else pd.DataFrame()
outdf2 = outdf2.drop_duplicates().reset_index(drop=True)
print(len(outdf2))
outdf2.head()

## Concatenate Delivered and Produced (POU) Data.  Make needed changes

In [None]:
# Concatenate dataframes
frames = [outdf1, outdf2]  # list all out dataframes here

# Stack, drop exact duplicate rows, renumber the index, then blank out any NaN left by the stacking.
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates()
outdf = outdf.reset_index(drop=True)
outdf = outdf.replace(np.nan, "")
print(len(outdf))

## Clean Data / data types

In [None]:
# fix blank / null WaterSourcetypeCV
# simplify to WaDE specific categories

wsTypeDict = {
    "Groundwater" : "Groundwater",
    "Groundwater Purchased" : "Groundwater",
    "Groundwater UDI Surface Water" : "Groundwater",
    "Surface Water" : "Surface Water",
    "Surface Water Purchased" : "Surface Water"}

def fixWaterSourceTypeCV(valA):
    """Return the WaDE water source type for a raw source type value.

    valA is converted to text and stripped, then looked up in wsTypeDict.
    Blank values and values with no entry in wsTypeDict both return
    "WaDE Blank".
    """
    return wsTypeDict.get(str(valA).strip(), "WaDE Blank")

# Map every raw source type onto its WaDE category, value by value on the column itself.
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].map(fixWaterSourceTypeCV)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# Creating amount value for based on units. Convert to gallons from input unit.

# Issue of some entries are strings. Fix unit type to float
# Remove, in this order, every '-', ',' and 'FALSE' match, then parse the stripped text as a number.
# NOTE(review): removing '-' also removes a leading minus sign from text amounts - confirm that is intended.
amountText = outdf['in_Amount']
for junkPattern in ('-', ',', 'FALSE'):
    amountText = amountText.replace(junkPattern, '', regex=True)
amountText = amountText.astype(str).str.strip()
outdf['in_Amount'] = pd.to_numeric(amountText)

def createAmountGallon(check, unit1, unit2, val):
    """Return val multiplied by the conversion factor of the applicable unit.

    check selects the unit: when it reads 'NO CHANGES' (after stripping)
    unit1 is used, otherwise unit2.  A unit with no factor in the table
    below (including '-') returns val unchanged.
    """
    gallonsPerUnit = {
        'AF': 325851,
        'MG': 1000000,
        'TG': 1000,
        'HG': 100,
        'DG': 10,
        'CCF': 748.052,
        'CF': 7.48052,
    }

    chosenUnit = unit1 if str(check).strip() == 'NO CHANGES' else unit2
    chosenUnit = str(chosenUnit).strip()

    if chosenUnit not in gallonsPerUnit:
        return val
    return val * gallonsPerUnit[chosenUnit]

def _gallonsForRow(row):
    # pass the row's unit-check flag, both unit columns and the raw amount to the converter
    return createAmountGallon(row['temp_unitCheck'],
                              row['temp1_WaterUnits'],
                              row['temp2_WaterUnits'],
                              row['in_Amount'])

outdf['in_Amount'] = outdf.apply(_gallonsForRow, axis=1)
outdf['in_Amount'].unique()

In [None]:
# Create WaDE TimeframeEnd

MonthDictionary = {
"January" : "01",
"February" : "02",
"March" : "03",
"April" : "04",
"May" : "05",
"June" : "06",
"July" : "07",
"August" : "08",
"September" : "09",
"October" : "10",
"November" : "11",
"December" : "12"}

def createTimeframeEnd(Year, Month, Day):
    """Build the "YYYY/MM/DD" style end-of-period string from its parts.

    Year and Day are converted to text and stripped.  Month is a month
    name looked up in MonthDictionary to get its two-digit number.

    Returns '' when the month name has no entry in MonthDictionary
    (for example a blank month), so one bad row does not abort the
    whole column conversion.
    """
    monthString = MonthDictionary.get(str(Month).strip())
    if monthString is None:
        return ''
    return "/".join([str(Year).strip(), monthString, str(Day).strip()])

def _timeframeEndForRow(row):
    # report year + month name + days in month give the end date text of the row's period
    return createTimeframeEnd(row['in_ReportYearCV'], row['temp_Month'], row['temp_DaysInMonth'])

outdf['in_TimeframeEnd'] = outdf.apply(_timeframeEndForRow, axis=1)
outdf['in_TimeframeEnd'].unique()

In [None]:
# we can remove ',' from this project

# Clean name entries of special characters
def removeSpecialCharsFunc(Val):
    """Return Val as title-cased text with special characters removed.

    Val is converted to text; the characters $ @ & . ; , / \\ ) ( - are
    deleted; the result is title-cased, each run of two spaces is replaced
    by one space (a single pass), and surrounding whitespace is stripped.
    """
    Val = str(Val)
    # raw string: the backslash is part of the pattern, not an (invalid) string escape
    Val = re.sub(r"[$@&.;,/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [None]:
# Strip special characters from the water source names, value by value on the column itself.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(removeSpecialCharsFunc)
outdf['in_WaterSourceName'].unique()

In [None]:
# Strip special characters from the county names, value by value on the column itself.
outdf['in_County'] = outdf['in_County'].map(removeSpecialCharsFunc)
outdf['in_County'].unique()

In [None]:
# Strip special characters from the site names, value by value on the column itself.
outdf['in_SiteName'] = outdf['in_SiteName'].map(removeSpecialCharsFunc)
outdf['in_SiteName'].unique()

In [None]:
# Strip special characters from the community water supply system names, value by value on the column itself.
outdf['in_CommunityWaterSupplySystem'] = outdf['in_CommunityWaterSupplySystem'].map(removeSpecialCharsFunc)
outdf['in_CommunityWaterSupplySystem'].unique()

In [None]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return val as stripped text, with blank and "nan" text collapsed to "".

    val is converted with str() first, so a float NaN (text "nan") also
    comes back as the empty string.
    """
    cleaned = str(val).strip()
    return "" if cleaned in ("", "nan") else cleaned

In [None]:
# Collapse blank / "nan" water source names to the empty string, value by value on the column itself.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(ensureEmptyString)
outdf['in_WaterSourceName'].unique()

In [None]:
# Collapse blank / "nan" water source types to the empty string, value by value on the column itself.
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].map(ensureEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# Collapse blank / "nan" site types to the empty string, value by value on the column itself.
outdf['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].map(ensureEmptyString)
outdf['in_SiteTypeCV'].unique()

In [None]:
# Collapse blank / "nan" site names to the empty string, value by value on the column itself.
outdf['in_SiteName'] = outdf['in_SiteName'].map(ensureEmptyString)
outdf['in_SiteName'].unique()

In [None]:
# Collapse blank / "nan" county names to the empty string, value by value on the column itself.
outdf['in_County'] = outdf['in_County'].map(ensureEmptyString)
outdf['in_County'].unique()

In [None]:
# Collapse blank / "nan" community water supply system names to the empty string, value by value on the column itself.
outdf['in_CommunityWaterSupplySystem'] = outdf['in_CommunityWaterSupplySystem'].map(ensureEmptyString)
outdf['in_CommunityWaterSupplySystem'].unique()

In [None]:
# Collapse blank / "nan" beneficial uses to the empty string, value by value on the column itself.
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(ensureEmptyString)

# Sorted list of the distinct comma-separated entries found across the whole column.
uniqueList = sorted({i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')})
uniqueList

In [None]:
# Ensure Latitude entry is numeric; non-numeric and 0 values end up as blank strings for removal
latitudeValues = pd.to_numeric(outdf['in_Latitude'], errors='coerce')
latitudeValues = latitudeValues.replace(0, "")
outdf['in_Latitude'] = latitudeValues.fillna("")
outdf['in_Latitude'].unique()

In [None]:
# Ensure Longitude entry is numeric; non-numeric and 0 values end up as blank strings for removal
longitudeValues = pd.to_numeric(outdf['in_Longitude'], errors='coerce')
longitudeValues = longitudeValues.replace(0, "")
outdf['in_Longitude'] = longitudeValues.fillna("")
outdf['in_Longitude'].unique()

In [None]:
# Ensure Amount entry is either numeric (rounded to 2 decimals) or blank, no 0 entries
amountValues = pd.to_numeric(outdf['in_Amount'], errors='coerce').round(2)
amountValues = amountValues.replace(0, "")
outdf['in_Amount'] = amountValues.fillna("")
outdf['in_Amount'].unique()

In [None]:
# Ensure PopulationServed entry is numeric WITH 0 entries (no blank strings)
# Non-numeric and missing values become 0; the result is stored as whole-number text.
populationValues = pd.to_numeric(outdf['in_PopulationServed'], errors='coerce').round()
populationValues = populationValues.replace("", 0).fillna(0)
outdf['in_PopulationServed'] = populationValues.astype(int).astype(str)
outdf['in_PopulationServed'].unique()

In [None]:
# Convert TimeframeEnd to YYYY-MM-DD format.
# Entries that cannot be parsed become NaT and stay NaT: the parsed column is kept
# datetime-typed so the .dt accessor below always works (filling NaT with "" here
# would at best do nothing and at worst turn the column into plain objects).
timeframeEndParsed = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors = 'coerce')
# Round-trip through month/day/year text to drop the timezone and any time-of-day part.
outdf['in_TimeframeEnd'] = pd.to_datetime(timeframeEndParsed.dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeEnd'].unique()

In [None]:
# Convert TimeframeStart to YYYY-MM-DD format.
# Entries that cannot be parsed become NaT and stay NaT: the parsed column is kept
# datetime-typed so the .dt accessor below always works (filling NaT with "" here
# would at best do nothing and at worst turn the column into plain objects).
timeframeStartParsed = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors = 'coerce')
# Round-trip through month/day/year text to drop the timezone and any time-of-day part.
outdf['in_TimeframeStart'] = pd.to_datetime(timeframeStartParsed.dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeStart'].unique()

In [None]:
# Normalise the report year: blank / missing entries become "0", everything else whole-number text
reportYearValues = outdf['in_ReportYearCV'].replace("", 0)
reportYearValues = reportYearValues.fillna(0).astype(int)
outdf['in_ReportYearCV'] = reportYearValues.astype(str)
outdf['in_ReportYearCV'].unique()

In [None]:
# Assign Primary Use Category
# The lookup lives in a custom module outside this notebook; its folder is added to the
# import path first so the import below can find it.
# NOTE(review): the folder is a user-specific absolute path - confirm it exists on the machine running this.

import sys
sys.path.append("C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory")
import AssignPrimaryUseCategoryFile # Use Custom import file

# Each row's beneficial use category is passed to the custom lookup; the result fills in_PrimaryUseCategory.
outdf['in_PrimaryUseCategory'] = outdf.apply(lambda row: AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory(row['in_BeneficialUseCategory']), axis=1)
outdf['in_PrimaryUseCategory'].unique()

In [None]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    """Return the four parts joined with "_" into one variable-specific label.

    Every part is converted to text and stripped; the third part (inPU)
    is additionally title-cased.
    """
    parts = [
        str(inV).strip(),
        str(inAIU).strip(),
        str(inPU).strip().title(),
        str(inWST).strip(),
    ]
    return "_".join(parts)

def _variableSpecificForRow(row):
    # variable, aggregation interval, primary use and water source type make up the label
    return createVariableSpecificCV(row['in_VariableCV'],
                                    row['in_AggregationIntervalUnitCV'],
                                    row['in_PrimaryUseCategory'],
                                    row['in_WaterSourceTypeCV'])

outdf['in_VariableSpecificCV'] = outdf.apply(_variableSpecificForRow, axis=1)
outdf['in_VariableSpecificCV'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    """Return the custom ID text: "wadeId" followed by colRowValue converted to text."""
    return f"wadeId{colRowValue!s}"

# Collect the distinct (water source name, water source type) pairs found in outdf.
dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# Number the distinct pairs 1..n (on the same index as dfTempID) and turn each number into a "wadeId<n>" ID.
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
# Lookup key = name text immediately followed by type text (no separator between them).
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
# IdDict maps each lookup key to its generated ID.
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE custom water source native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    """Return checkVal (stripped) when it is not blank, otherwise the ID stored in IdDict.

    The IdDict key is the stripped text of valA immediately followed by
    the stripped text of valB.
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    return IdDict[str(valA).strip() + str(valB).strip()]

def _waterSourceIdForRow(row):
    # keep an existing native ID; otherwise look one up by water source name + type
    return retrieveIdValueFunc(row['in_WaterSourceNativeID'],
                               row['in_WaterSourceName'],
                               row['in_WaterSourceTypeCV'])

outdf['in_WaterSourceNativeID'] = outdf.apply(_waterSourceIdForRow, axis=1)
outdf['in_WaterSourceNativeID'].unique()

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    """Return the custom ID text: "wadeId" followed by colRowValue converted to text."""
    return f"wadeId{colRowValue!s}"

# Collect the distinct (latitude, longitude, site name, site type) combinations found in outdf, all as stripped text.
dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# Number the distinct combinations 1..n (on the same index as dfTempID) and turn each number into a "wadeId<n>" ID.
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
# Lookup key = the four text values concatenated in this order, with no separator between them.
dfTempID['linkKey'] = dfTempID['in_Latitude'].astype(str) + dfTempID['in_Longitude'].astype(str) + dfTempID['in_SiteName'].astype(str)+ dfTempID['in_SiteTypeCV'].astype(str)
# IdDict maps each lookup key to its generated ID (this rebinds the name IdDict).
IdDict = pd.Series(dfTempID.in_SiteNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    """Return checkVal (stripped) when it is not blank, otherwise the ID stored in IdDict.

    The IdDict key is the stripped text of valA, valB, valC and valD
    concatenated in that order.
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    keyParts = (valA, valB, valC, valD)
    return IdDict["".join(str(part).strip() for part in keyParts)]

def _siteIdForRow(row):
    # keep an existing native ID; otherwise look one up by latitude + longitude + site name + site type
    return retrieveIdValueFunc(row['in_SiteNativeID'],
                               row['in_Latitude'],
                               row['in_Longitude'],
                               row['in_SiteName'],
                               row['in_SiteTypeCV'])

outdf['in_SiteNativeID'] = outdf.apply(_siteIdForRow, axis=1)
outdf['in_SiteNativeID'].unique()

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [None]:
# PoU Shapefile Data
# The shapefile was already read into dfPoUshapetemp in the Data Input section; nothing is re-read here.

# Row count and first row, as a quick check before the geometry table is built.
print(len(dfPoUshapetemp))
dfPoUshapetemp.head(1)

In [None]:
# create temp dataframe to hold native ID and geometry from shapefile input
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)

# assign values to temp dataframe based on shapefile input
# for in_SiteNativeID ensure the ID value is the same as that listed above for POU info
# (SABL_PWSID is also the source of in_SiteNativeID in the main output).
dfPoUshape['in_SiteNativeID'] = dfPoUshapetemp['SABL_PWSID']
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
# keep the first occurrence of each fully identical (ID, geometry) row
dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
print(len(dfPoUshape))
dfPoUshape.head()

## Export Outputs

In [None]:
# Print the column summary of the final dataframe as a last check before export.
outdf.info()

In [None]:
# Display the final dataframe for a visual check before export.
outdf

In [None]:
# Export the output dataframe
# Each table is written as a zip archive holding a single CSV, without the index column.
mainZipOptions = dict(method='zip', archive_name='Pssps_Main.csv')
outdf.to_csv('RawInputData/Pssps_Main.zip', compression=mainZipOptions, index=False)  # The output, save as a zip

geometryZipOptions = dict(method='zip', archive_name='P_Geometry.csv')
dfPoUshape.to_csv('RawInputData/P_Geometry.zip', compression=geometryZipOptions, index=False)  # The output geometry.