# Pre-processing California Site Specific data for WaDEQA upload.
Date Updated: 02/26/2021

Purpose:  To pre-process the California site specific data into one master file for simple dataframe creation and extraction.  See "CA_SiteSpecificAmounts Schema Mapping to WaDE_QA.xlsx" for more details.

Notes:
- Going to use both Produced Water & Delivered Water data, and pair both to the Area of Use as POU.
- Create three seperate dataframes (one for Delivered, and two for Produced), then concatenate into single long output dataframe.

In [1]:
# Needed Libararies
import os
import math
import numpy as np
import pandas as pd
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

#Working Directory and Input File
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/California/SiteSpecificAmounts/RawInputData"
os.chdir(workingDir)

In [2]:
# Time Series Data - Delivered
fileInput1a = "deliveredPWS_2013_2016_input.csv"
dfdts = pd.read_csv(fileInput1a)
print(len(dfdts))
dfdts.head(1)

78537


Unnamed: 0,PWSID,Water.System.Name,Water.System.Classification,Year,Month,Date,Days.In.Month,Delivered.Water.Units AS ORIGINALLY REPORTED,Delivered.Water.Units.Revised BY OFFICE OF INFORMATION MANAGEMENT AND ANALYSIS,UNITS ADJUSTED BY OIMA?,WATER DELIVERIES TO Single.family.Residential,WATER DELIVERIES TO Multi.family.Residential,WATER DELIVERIES TO Commercial.Institutional,WATER DELIVERIES TO Industrial,WATER DELIVERIES TO Landscape.Irrigation,WATER DELIVERIES TO Other,WATER DELIVERIES TO Agricultural,WATER DELIVERIES TO Other.PWS,"WATER DELIVERIES Total.Delivered Residential IN REVISED UNITS (Total Does not include Landscape Irrigation, Agricultural or to other PWS)","Total. RESIDENTIAL Delivered.Gallons (Total Does not include Landscape Irrigation, Agricultural or to other PWS)",Population Of Service Area,CALCULATED GPCD (Total delivery to residential in gallons per capita day)
0,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN.,Community Water System,2015,January,1/1/2015,31,G,G,NO CHANGES,105995,,,,,,,,105995.0,105995.0,50.0,68.4


In [3]:
# Time Series Data - Produced
fileInput1b = "producedPWS_2013_2016_input.csv"
dfpts = pd.read_csv(fileInput1b)
print(len(dfpts))
dfpts.head(1)

212824


Unnamed: 0,PWSID,Water.System.Name,Water.System.Classification,Year,Month,Date,Days.In.Month,WATER PRODUCED Water.Units IN UNITS ORIGINALLY REPORTED,WATER PRODUCED Water.Units REVIEWED BY OFFICE OF INFORMATION MANAGEMENT AND ANALYSIS,UNITS ADJUSTED BY OIMA?,Finished.Water.Vol.Type.Revised,WATER PRODUCED FROM GROUNDWATER,WATER PRODUCED FROM SURFACE WATER,FINSIHIED WATER PURCHASED OR RECEIVED FROM ANOTHER PUBLIC WATER SYSTEM,WATER SOLD TO ANOTHER PUBLIC WATER SYSTEM,Non-Potable Produced Water (EXCLUDING RECYCLING),RECYCLED WATER PRODUCED,"TOTAL POTABLE WATER PRODUCED USING REVISED UNITS (Total Does not Include Sold, Non-potable and Recycled amounts)","TOTAL POTABLE WATER IN GALLONS (Total Does not Include Sold, Non-potable and Recycled amounts)",Population Of Service Area,CALCULATED GPCD (Total Potable Produced in gallons per capita day)
0,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN.,Community Water System,2014,January,1/1/2014,31,G,G,NO CHANGES,M,171120.0,,,,,,171120.0,171120.0,50.0,110.4


In [4]:
# Facility info
fileInput2 = "PWS Facility Information_input.csv"
dffi = pd.read_csv(fileInput2)
print(len(dffi))
dffi.head(1)

7763


Unnamed: 0,Water System No,Water System Name,Principal County Served,Federal Water System Type CODE,Federal Water System Type,State Water System Type CODE,State Water System Type,Water System Status CODE,System Status,Owner Type CODE,Owner Type,Primary Water Source Type CODE,Primary Water Source Type,Residential Population,Non-Transient Population,Transient Population,Total Population,Number of Agricultural Service Connections (AG),Number of Combined Service Connections (CB),Number of Commercial Service Connections (CM),Number of Institutional Service Connections (IN),Number of Residential Service Connections (RS),Total Number of Service Connections,Fee Code,Fee Code Description,Date of Sanitary Survey visit (SNSV Visit Date),CITY,Treatment Plant Class CODE,Treatment Plant Class,Distribution System Class CODE,Distribution System Class
0,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN,ALAMEDA,C,Community,C,Community,A,Active,P,Private,GW,Groundwater,50.0,,,50.0,,19.0,,,,19.0,SC,Small Community,12/12/2017,CASTRO VALLEY,,,D1,Distribution Operator Level 1


In [5]:
# Shapefile / Site and Boundary info
fileInput3 = "CADWS_AreaBoundaries_input.csv"
dfbi = pd.read_csv(fileInput3)
print(len(dfbi))
dfbi.head(1)

4563


Unnamed: 0,OID_,OBJECTID_1,SABL_PWSID,WATER_SYST,WATER_SY_1,BOUNDARY_T,REGULATING,COUNTY,ADDR_LINE_,ADDR_LIN_1,ADDRESS_CI,ADDRESS_ST,ADDRESS_ZI,STATE_CLAS,POPULATION,SERVICE_CO,VERIFIED_S,LAST_EDITE,VERIFIED_N,VERIFIED_T,DT_VERIFIE,CREATED_US,CREATED_DA,LAST_EDI_1,OBJECTID,BOUNDARY_F,ACTIVITY_S,ACTIVITY_D,OWNER_TYPE,FEDERAL_CL,Shape__Are,Shape__Len,SHAPE_1,Shape_Length,Shape_Area,Lat,Long
0,1,304417,CA1100445,CA1100445,ORLAND MOBILE H.P.,Water Service Area,DISTRICT 21 - VALLEY,GLENN,4265 ROAD 99 WEST,P.O. BOX 1721,ORLAND,CA,95963,COMMUNITY,178,54,Not Verified,10/20/2020 0:00,,,1/1/1970 0:00,SBUCKNAM,10/20/2020 0:00,SBUCKNAM,,WBT Tool,A,3/28/1986 0:00,P,COMMUNITY,36576.63873,790.599915,,0.006519,2e-06,39.734289,-122.198109


# Delivered Data
- Eight timeseries info.

In [6]:
# Merging dataframes into one, using left-join.
df = pd.DataFrame()

df = pd.merge(dfdts, dffi, left_on='PWSID', right_on='Water System No', how='left') 
df = pd.merge(df, dfbi, left_on='PWSID', right_on='SABL_PWSID', how='left')

df = df.replace("Null", "")  # The State's Master input dataframe. Remove any nulls.
df = df.replace("nan", "")  # The State's Master input dataframe. Remove any nulls.
df = df.replace(np.nan, "")  # The State's Master input dataframe. Remove any nulls.

print(len(df))
df.head(1)

78539


Unnamed: 0,PWSID,Water.System.Name,Water.System.Classification,Year,Month,Date,Days.In.Month,Delivered.Water.Units AS ORIGINALLY REPORTED,Delivered.Water.Units.Revised BY OFFICE OF INFORMATION MANAGEMENT AND ANALYSIS,UNITS ADJUSTED BY OIMA?,WATER DELIVERIES TO Single.family.Residential,WATER DELIVERIES TO Multi.family.Residential,WATER DELIVERIES TO Commercial.Institutional,WATER DELIVERIES TO Industrial,WATER DELIVERIES TO Landscape.Irrigation,WATER DELIVERIES TO Other,WATER DELIVERIES TO Agricultural,WATER DELIVERIES TO Other.PWS,"WATER DELIVERIES Total.Delivered Residential IN REVISED UNITS (Total Does not include Landscape Irrigation, Agricultural or to other PWS)","Total. RESIDENTIAL Delivered.Gallons (Total Does not include Landscape Irrigation, Agricultural or to other PWS)",Population Of Service Area,CALCULATED GPCD (Total delivery to residential in gallons per capita day),Water System No,Water System Name,Principal County Served,Federal Water System Type CODE,Federal Water System Type,State Water System Type CODE,State Water System Type,Water System Status CODE,System Status,Owner Type CODE,Owner Type,Primary Water Source Type CODE,Primary Water Source Type,Residential Population,Non-Transient Population,Transient Population,Total Population,Number of Agricultural Service Connections (AG),Number of Combined Service Connections (CB),Number of Commercial Service Connections (CM),Number of Institutional Service Connections (IN),Number of Residential Service Connections (RS),Total Number of Service Connections,Fee Code,Fee Code Description,Date of Sanitary Survey visit (SNSV Visit Date),CITY,Treatment Plant Class CODE,Treatment Plant Class,Distribution System Class CODE,Distribution System Class,OID_,OBJECTID_1,SABL_PWSID,WATER_SYST,WATER_SY_1,BOUNDARY_T,REGULATING,COUNTY,ADDR_LINE_,ADDR_LIN_1,ADDRESS_CI,ADDRESS_ST,ADDRESS_ZI,STATE_CLAS,POPULATION,SERVICE_CO,VERIFIED_S,LAST_EDITE,VERIFIED_N,VERIFIED_T,DT_VERIFIE,CREATED_US,CREATED_DA,LAST_EDI_1,OBJECTID,BOUNDARY_F,ACTIVITY_S,ACTIVITY_D,OWNER_TYPE,FEDERAL_CL,Shape__Are,Shape__Len,SHAPE_1,Shape_Length,Shape_Area,Lat,Long
0,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN.,Community Water System,2015,January,1/1/2015,31,G,G,NO CHANGES,105995,,,,,,,,105995.0,105995.0,50.0,68.4,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN,ALAMEDA,C,Community,C,Community,A,Active,P,Private,GW,Groundwater,50.0,,,50.0,,19.0,,,,19.0,SC,Small Community,12/12/2017,CASTRO VALLEY,,,D1,Distribution Operator Level 1,728.0,305171.0,CA0103040,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN,Water Service Area,DISTRICT 04 - SAN FRANCISCO,ALAMEDA,8653 NORRIS CANYON ROAD,,CASTRO VALLEY,CA,94552,COMMUNITY,50.0,19.0,Not Verified,10/20/2020 0:00,,,1/1/1970 0:00,SBUCKNAM,10/20/2020 0:00,SBUCKNAM,,WBT Tool,A,1/27/1983 0:00,P,COMMUNITY,593518.1428,3456.677697,,0.0278,3.8e-05,37.734364,-122.027303


In [7]:
# 1) Cumulative Delivered_Monthly_Single Family Residential_Unspecified
df_df1 = pd.DataFrame(index=df.index)

# Variable Info
df_df1['in_VariableCV'] = "Delivered"
df_df1['in_VariableSpecificCV'] = "Cumulative Delivered_Monthly_Single Family Residential_Unspecified"

# Water Source Info
df_df1['in_WaterSourceTypeCV'] = "Unspecified"

# Site Info
df_df1['in_CoordinateMethodCV'] = "Centroid of Area"
df_df1['in_County'] = df['COUNTY']
df_df1['in_Latitude'] = df['Lat']
df_df1['in_Longitude'] = df['Long']
df_df1['in_PODorPOUSite'] = "POU"
df_df1['in_SiteName'] = df['Water System Name']
df_df1['in_SiteNativeID'] = df['SABL_PWSID']
df_df1['in_SiteTypeCV'] = df['BOUNDARY_T']

# Site VariableAmounts Info
df_df1['WaterUnits'] = ""
df_df1['in_Amount'] = df['WATER DELIVERIES TO Single.family.Residential']
df_df1['in_BenUse'] = "Single Family Residential"
df_df1['in_CommunityWaterSupplySystem'] = df['Water.System.Name']
df_df1['in_CustomerTypeCV'] = df['State Water System Type']
df_df1['in_PopulationServed'] = df['Population Of Service Area']
df_df1['Year'] = df['Year']
df_df1['Month'] = df['Month']
df_df1['Days.In.Month'] = df['Days.In.Month']

print(len(df_df1))
df_df1.head(1)

78539


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_BenUse,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Delivered,Cumulative Delivered_Monthly_Single Family Res...,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,105995,Single Family Residential,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,January,31


In [8]:
# 2) Cumulative Delivered_Monthly_Multi Family Residential_Unspecified
df_df2 = pd.DataFrame(index=df.index)

# Variable Info
df_df2['in_VariableCV'] = "Delivered"
df_df2['in_VariableSpecificCV'] = "Cumulative Delivered_Monthly_Multi Family Residential_Unspecified"

# Water Source Info
df_df2['in_WaterSourceTypeCV'] = "Unspecified"

# Site Info
df_df2['in_CoordinateMethodCV'] = "Centroid of Area"
df_df2['in_County'] = df['COUNTY']
df_df2['in_Latitude'] = df['Lat']
df_df2['in_Longitude'] = df['Long']
df_df2['in_PODorPOUSite'] = "POU"
df_df2['in_SiteName'] = df['Water System Name']
df_df2['in_SiteNativeID'] = df['SABL_PWSID']
df_df2['in_SiteTypeCV'] = df['BOUNDARY_T']

# Site VariableAmounts Info
df_df2['WaterUnits'] = ""
df_df2['in_Amount'] = df['WATER DELIVERIES TO Multi.family.Residential']
df_df2['in_BenUse'] = "Multi Family Residential"
df_df2['in_CommunityWaterSupplySystem'] = df['Water.System.Name']
df_df2['in_CustomerTypeCV'] = df['State Water System Type']
df_df2['in_PopulationServed'] = df['Population Of Service Area']
df_df2['Year'] = df['Year']
df_df2['Month'] = df['Month']
df_df2['Days.In.Month'] = df['Days.In.Month']

print(len(df_df2))
df_df2.head(1)

78539


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_BenUse,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Delivered,Cumulative Delivered_Monthly_Multi Family Resi...,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,,Multi Family Residential,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,January,31


In [9]:
# 3) Cumulative Delivered_Monthly_Commercial Institutional_Unspecified
df_df3 = pd.DataFrame(index=df.index)

# Variable Info
df_df3['in_VariableCV'] = "Delivered"
df_df3['in_VariableSpecificCV'] = "Cumulative Delivered_Monthly_Commercial Institutional_Unspecified"

# Water Source Info
df_df3['in_WaterSourceTypeCV'] = "Unspecified"

# Site Info
df_df3['in_CoordinateMethodCV'] = "Centroid of Area"
df_df3['in_County'] = df['COUNTY']
df_df3['in_Latitude'] = df['Lat']
df_df3['in_Longitude'] = df['Long']
df_df3['in_PODorPOUSite'] = "POU"
df_df3['in_SiteName'] = df['Water System Name']
df_df3['in_SiteNativeID'] = df['SABL_PWSID']
df_df3['in_SiteTypeCV'] = df['BOUNDARY_T']

# Site VariableAmounts Info
df_df3['WaterUnits'] = ""
df_df3['in_Amount'] = df['WATER DELIVERIES TO Commercial.Institutional']
df_df3['in_BenUse'] = "Commercial Institutional"
df_df3['in_CommunityWaterSupplySystem'] = df['Water.System.Name']
df_df3['in_CustomerTypeCV'] = df['State Water System Type']
df_df3['in_PopulationServed'] = df['Population Of Service Area']
df_df3['Year'] = df['Year']
df_df3['Month'] = df['Month']
df_df3['Days.In.Month'] = df['Days.In.Month']

print(len(df_df3))
df_df3.head(1)

78539


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_BenUse,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Delivered,Cumulative Delivered_Monthly_Commercial Instit...,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,,Commercial Institutional,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,January,31


In [10]:
# 4) Cumulative Delivered_Monthly_Industrial_Unspecified
df_df4 = pd.DataFrame(index=df.index)

# Variable Info
df_df4['in_VariableCV'] = "Delivered"
df_df4['in_VariableSpecificCV'] = "Cumulative Delivered_Monthly_Industrial_Unspecified"

# Water Source Info
df_df4['in_WaterSourceTypeCV'] = "Unspecified"

# Site Info
df_df4['in_CoordinateMethodCV'] = "Centroid of Area"
df_df4['in_County'] = df['COUNTY']
df_df4['in_Latitude'] = df['Lat']
df_df4['in_Longitude'] = df['Long']
df_df4['in_PODorPOUSite'] = "POU"
df_df4['in_SiteName'] = df['Water System Name']
df_df4['in_SiteNativeID'] = df['SABL_PWSID']
df_df4['in_SiteTypeCV'] = df['BOUNDARY_T']

# Site VariableAmounts Info
df_df4['WaterUnits'] = ""
df_df4['in_Amount'] = df['WATER DELIVERIES TO Industrial']
df_df4['in_BenUse'] = "Industrial"
df_df4['in_CommunityWaterSupplySystem'] = df['Water.System.Name']
df_df4['in_CustomerTypeCV'] = df['State Water System Type']
df_df4['in_PopulationServed'] = df['Population Of Service Area']
df_df4['Year'] = df['Year']
df_df4['Month'] = df['Month']
df_df4['Days.In.Month'] = df['Days.In.Month']

print(len(df_df4))
df_df4.head(1)

78539


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_BenUse,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Delivered,Cumulative Delivered_Monthly_Industrial_Unspec...,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,,Industrial,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,January,31


In [11]:
# 5) Cumulative Delivered_Monthly_Landscape Irrigation_Unspecified
df_df5 = pd.DataFrame(index=df.index)

# Variable Info
df_df5['in_VariableCV'] = "Delivered"
df_df5['in_VariableSpecificCV'] = "Cumulative Delivered_Monthly_Landscape Irrigation_Unspecified"

# Water Source Info
df_df5['in_WaterSourceTypeCV'] = "Unspecified"

# Site Info
df_df5['in_CoordinateMethodCV'] = "Centroid of Area"
df_df5['in_County'] = df['COUNTY']
df_df5['in_Latitude'] = df['Lat']
df_df5['in_Longitude'] = df['Long']
df_df5['in_PODorPOUSite'] = "POU"
df_df5['in_SiteName'] = df['Water System Name']
df_df5['in_SiteNativeID'] = df['SABL_PWSID']
df_df5['in_SiteTypeCV'] = df['BOUNDARY_T']

# Site VariableAmounts Info
df_df5['WaterUnits'] = ""
df_df5['in_Amount'] = df['WATER DELIVERIES TO Landscape.Irrigation']
df_df5['in_BenUse'] = "Landscape Irrigation"
df_df5['in_CommunityWaterSupplySystem'] = df['Water.System.Name']
df_df5['in_CustomerTypeCV'] = df['State Water System Type']
df_df5['in_PopulationServed'] = df['Population Of Service Area']
df_df5['Year'] = df['Year']
df_df5['Month'] = df['Month']
df_df5['Days.In.Month'] = df['Days.In.Month']

print(len(df_df5))
df_df5.head(1)

78539


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_BenUse,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Delivered,Cumulative Delivered_Monthly_Landscape Irrigat...,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,,Landscape Irrigation,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,January,31


In [12]:
# 6) Cumulative Delivered_Monthly_Other_Unspecified
df_df6 = pd.DataFrame(index=df.index)

# Variable Info
df_df6['in_VariableCV'] = "Delivered"
df_df6['in_VariableSpecificCV'] = "Cumulative Delivered_Monthly_Other_Unspecified"

# Water Source Info
df_df6['in_WaterSourceTypeCV'] = "Unspecified"

# Site Info
df_df6['in_CoordinateMethodCV'] = "Centroid of Area"
df_df6['in_County'] = df['COUNTY']
df_df6['in_Latitude'] = df['Lat']
df_df6['in_Longitude'] = df['Long']
df_df6['in_PODorPOUSite'] = "POU"
df_df6['in_SiteName'] = df['Water System Name']
df_df6['in_SiteNativeID'] = df['SABL_PWSID']
df_df6['in_SiteTypeCV'] = df['BOUNDARY_T']

# Site VariableAmounts Info
df_df6['WaterUnits'] = ""
df_df6['in_Amount'] = df['WATER DELIVERIES TO Other']
df_df6['in_BenUse'] = "Other"
df_df6['in_CommunityWaterSupplySystem'] = df['Water.System.Name']
df_df6['in_CustomerTypeCV'] = df['State Water System Type']
df_df6['in_PopulationServed'] = df['Population Of Service Area']
df_df6['Year'] = df['Year']
df_df6['Month'] = df['Month']
df_df6['Days.In.Month'] = df['Days.In.Month']

print(len(df_df6))
df_df6.head(1)

78539


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_BenUse,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Delivered,Cumulative Delivered_Monthly_Other_Unspecified,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,,Other,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,January,31


In [13]:
# 7) Cumulative Delivered_Monthly_Agricultural_Unspecified
df_df7 = pd.DataFrame(index=df.index)

# Variable Info
df_df7['in_VariableCV'] = "Delivered"
df_df7['in_VariableSpecificCV'] = "Cumulative Delivered_Monthly_Agricultural_Unspecified"

# Water Source Info
df_df7['in_WaterSourceTypeCV'] = "Unspecified"

# Site Info
df_df7['in_CoordinateMethodCV'] = "Centroid of Area"
df_df7['in_County'] = df['COUNTY']
df_df7['in_Latitude'] = df['Lat']
df_df7['in_Longitude'] = df['Long']
df_df7['in_PODorPOUSite'] = "POU"
df_df7['in_SiteName'] = df['Water System Name']
df_df7['in_SiteNativeID'] = df['SABL_PWSID']
df_df7['in_SiteTypeCV'] = df['BOUNDARY_T']

# Site VariableAmounts Info
df_df7['WaterUnits'] = ""
df_df7['in_Amount'] = df['WATER DELIVERIES TO Agricultural']
df_df7['in_BenUse'] = "Agricultural"
df_df7['in_CommunityWaterSupplySystem'] = df['Water.System.Name']
df_df7['in_CustomerTypeCV'] = df['State Water System Type']
df_df7['in_PopulationServed'] = df['Population Of Service Area']
df_df7['Year'] = df['Year']
df_df7['Month'] = df['Month']
df_df7['Days.In.Month'] = df['Days.In.Month']

print(len(df_df7))
df_df7.head(1)

78539


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_BenUse,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Delivered,Cumulative Delivered_Monthly_Agricultural_Unsp...,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,,Agricultural,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,January,31


In [14]:
# 8) Cumulative Delivered_Monthly_Other PWS_Unspecified
df_df8 = pd.DataFrame(index=df.index)

# Variable Info
df_df8['in_VariableCV'] = "Delivered"
df_df8['in_VariableSpecificCV'] = "Cumulative Delivered_Monthly_Other PWS_Unspecified"

# Water Source Info
df_df8['in_WaterSourceTypeCV'] = "Unspecified"

# Site Info
df_df8['in_CoordinateMethodCV'] = "Centroid of Area"
df_df8['in_County'] = df['COUNTY']
df_df8['in_Latitude'] = df['Lat']
df_df8['in_Longitude'] = df['Long']
df_df8['in_PODorPOUSite'] = "POU"
df_df8['in_SiteName'] = df['Water System Name']
df_df8['in_SiteNativeID'] = df['SABL_PWSID']
df_df8['in_SiteTypeCV'] = df['BOUNDARY_T']

# Site VariableAmounts Info
df_df8['WaterUnits'] = ""
df_df8['in_Amount'] = df['WATER DELIVERIES TO Other.PWS']
df_df8['in_BenUse'] = "Other PWS"
df_df8['in_CommunityWaterSupplySystem'] = df['Water.System.Name']
df_df8['in_CustomerTypeCV'] = df['State Water System Type']
df_df8['in_PopulationServed'] = df['Population Of Service Area']
df_df8['Year'] = df['Year']
df_df8['Month'] = df['Month']
df_df8['Days.In.Month'] = df['Days.In.Month']

print(len(df_df8))
df_df8.head(1)

78539


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_BenUse,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Delivered,Cumulative Delivered_Monthly_Other PWS_Unspeci...,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,,Other PWS,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,January,31


In [15]:
# Concatenate Delivered Data Together
frames = [df_df1, df_df2, df_df3, df_df4, df_df5, df_df6, df_df7, df_df8]
df_D_out = pd.concat(frames).reset_index(drop=True)
print(len(df_D_out))

628312


In [16]:
# Fixing Water Amount datatype
# Issue of some entries are strings with a "," or as "FALSE".

df_D_out['in_Amount'] = df_D_out['in_Amount'].replace(',','', regex=True)
df_D_out['in_Amount'] = df_D_out['in_Amount'].replace('FALSE','', regex=True)
df_D_out['in_Amount'] = df_D_out['in_Amount'].str.strip()
df_D_out['in_Amount'] = pd.to_numeric(df_D_out['in_Amount'])

# Produced Data
 - Three timeseries datasets.

In [17]:
# Merging dataframes into one, using left-join.
# Note: not all sites had groundwater values, will drop those rows.

df = pd.DataFrame()

df = pd.merge(dfpts, dffi, left_on='PWSID', right_on='Water System No', how='left') 
df = pd.merge(df, dfbi, left_on='PWSID', right_on='SABL_PWSID', how='left')

df = df.dropna(subset = ["WATER PRODUCED FROM GROUNDWATER"]).reset_index(drop=True)

df = df.replace("Null", "")  # The State's Master input dataframe. Remove any nulls.
df = df.replace("nan", "")  # The State's Master input dataframe. Remove any nulls.
df = df.replace(np.nan, "")  # The State's Master input dataframe. Remove any nulls.

print(len(df))
df.head(1)

195780


Unnamed: 0,PWSID,Water.System.Name,Water.System.Classification,Year,Month,Date,Days.In.Month,WATER PRODUCED Water.Units IN UNITS ORIGINALLY REPORTED,WATER PRODUCED Water.Units REVIEWED BY OFFICE OF INFORMATION MANAGEMENT AND ANALYSIS,UNITS ADJUSTED BY OIMA?,Finished.Water.Vol.Type.Revised,WATER PRODUCED FROM GROUNDWATER,WATER PRODUCED FROM SURFACE WATER,FINSIHIED WATER PURCHASED OR RECEIVED FROM ANOTHER PUBLIC WATER SYSTEM,WATER SOLD TO ANOTHER PUBLIC WATER SYSTEM,Non-Potable Produced Water (EXCLUDING RECYCLING),RECYCLED WATER PRODUCED,"TOTAL POTABLE WATER PRODUCED USING REVISED UNITS (Total Does not Include Sold, Non-potable and Recycled amounts)","TOTAL POTABLE WATER IN GALLONS (Total Does not Include Sold, Non-potable and Recycled amounts)",Population Of Service Area,CALCULATED GPCD (Total Potable Produced in gallons per capita day),Water System No,Water System Name,Principal County Served,Federal Water System Type CODE,Federal Water System Type,State Water System Type CODE,State Water System Type,Water System Status CODE,System Status,Owner Type CODE,Owner Type,Primary Water Source Type CODE,Primary Water Source Type,Residential Population,Non-Transient Population,Transient Population,Total Population,Number of Agricultural Service Connections (AG),Number of Combined Service Connections (CB),Number of Commercial Service Connections (CM),Number of Institutional Service Connections (IN),Number of Residential Service Connections (RS),Total Number of Service Connections,Fee Code,Fee Code Description,Date of Sanitary Survey visit (SNSV Visit Date),CITY,Treatment Plant Class CODE,Treatment Plant Class,Distribution System Class CODE,Distribution System Class,OID_,OBJECTID_1,SABL_PWSID,WATER_SYST,WATER_SY_1,BOUNDARY_T,REGULATING,COUNTY,ADDR_LINE_,ADDR_LIN_1,ADDRESS_CI,ADDRESS_ST,ADDRESS_ZI,STATE_CLAS,POPULATION,SERVICE_CO,VERIFIED_S,LAST_EDITE,VERIFIED_N,VERIFIED_T,DT_VERIFIE,CREATED_US,CREATED_DA,LAST_EDI_1,OBJECTID,BOUNDARY_F,ACTIVITY_S,ACTIVITY_D,OWNER_TYPE,FEDERAL_CL,Shape__Are,Shape__Len,SHAPE_1,Shape_Length,Shape_Area,Lat,Long
0,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN.,Community Water System,2014,January,1/1/2014,31,G,G,NO CHANGES,M,171120.0,,,,,,171120.0,171120.0,50.0,110.4,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN,ALAMEDA,C,Community,C,Community,A,Active,P,Private,GW,Groundwater,50.0,,,50.0,,19.0,,,,19.0,SC,Small Community,12/12/2017,CASTRO VALLEY,,,D1,Distribution Operator Level 1,728.0,305171.0,CA0103040,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN,Water Service Area,DISTRICT 04 - SAN FRANCISCO,ALAMEDA,8653 NORRIS CANYON ROAD,,CASTRO VALLEY,CA,94552,COMMUNITY,50.0,19.0,Not Verified,10/20/2020 0:00,,,1/1/1970 0:00,SBUCKNAM,10/20/2020 0:00,SBUCKNAM,,WBT Tool,A,1/27/1983 0:00,P,COMMUNITY,593518.1428,3456.677697,,0.0278,3.8e-05,37.734364,-122.027303


In [18]:
# 1) Cumulative Produced_Monthly_Total_Groundwater
df_dfp1 = pd.DataFrame(index=df.index)

# Variable Info
df_dfp1['in_VariableCV'] = "Produced"
df_dfp1['in_VariableSpecificCV'] = "Cumulative Produced_Monthly_Total_Groundwater"

# Water Source Info
df_dfp1['in_WaterSourceTypeCV'] = "Groundwater"

# Site Info
df_dfp1['in_CoordinateMethodCV'] = "Centroid of Area"
df_dfp1['in_County'] = df['COUNTY']
df_dfp1['in_Latitude'] = df['Lat']
df_dfp1['in_Longitude'] = df['Long']
df_dfp1['in_PODorPOUSite'] = "POU"
df_dfp1['in_SiteName'] = df['Water System Name']
df_dfp1['in_SiteNativeID'] = df['SABL_PWSID']
df_dfp1['in_SiteTypeCV'] = df['BOUNDARY_T']

# Site Variable Amounts Info
df_dfp1['WaterUnits'] = df['WATER PRODUCED Water.Units REVIEWED BY OFFICE OF INFORMATION MANAGEMENT AND ANALYSIS']
df_dfp1['in_Amount'] = df['WATER PRODUCED FROM GROUNDWATER']
df_dfp1['in_BenUse'] = "Unspecified"
df_dfp1['in_CommunityWaterSupplySystem'] = df['Water.System.Name']
df_dfp1['in_CustomerTypeCV'] = df['State Water System Type']
df_dfp1['in_PopulationServed'] = df['Population Of Service Area']
df_dfp1['Year'] = df['Year']
df_dfp1['Month'] = df['Month']
df_dfp1['Days.In.Month'] = df['Days.In.Month']

print(len(df_dfp1))
df_dfp1.head(1)

195780


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_BenUse,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Produced,Cumulative Produced_Monthly_Total_Groundwater,Groundwater,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,G,171120.0,Unspecified,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2014,January,31


In [19]:
# 2) Cumulative Produced_Monthly_Total_Surface Water
df_dfp2 = pd.DataFrame(index=df.index)

# Variable Info
df_dfp2['in_VariableCV'] = "Produced"
df_dfp2['in_VariableSpecificCV'] = "Cumulative Produced_Monthly_Total_Surface Water"

# Water Source Info
df_dfp2['in_WaterSourceTypeCV'] = "Surface Water"

# Site Info
df_dfp2['in_CoordinateMethodCV'] = "Centroid of Area"
df_dfp2['in_County'] = df['COUNTY']
df_dfp2['in_Latitude'] = df['Lat']
df_dfp2['in_Longitude'] = df['Long']
df_dfp2['in_PODorPOUSite'] = "POU"
df_dfp2['in_SiteName'] = df['Water System Name']
df_dfp2['in_SiteNativeID'] = df['SABL_PWSID']
df_dfp2['in_SiteTypeCV'] = df['BOUNDARY_T']

# Site Variable Amounts Info
df_dfp2['WaterUnits'] = df['WATER PRODUCED Water.Units REVIEWED BY OFFICE OF INFORMATION MANAGEMENT AND ANALYSIS']
df_dfp2['in_Amount'] = df['WATER PRODUCED FROM SURFACE WATER']
df_dfp2['in_BenUse'] = "Unspecified"
df_dfp2['in_CommunityWaterSupplySystem'] = df['Water.System.Name']
df_dfp2['in_CustomerTypeCV'] = df['State Water System Type']
df_dfp2['in_PopulationServed'] = df['Population Of Service Area']
df_dfp2['Year'] = df['Year']
df_dfp2['Month'] = df['Month']
df_dfp2['Days.In.Month'] = df['Days.In.Month']

print(len(df_dfp2))
df_dfp2.head(1)

195780


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_BenUse,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Produced,Cumulative Produced_Monthly_Total_Surface Water,Surface Water,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,G,,Unspecified,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2014,January,31


In [20]:
# 3) Cumulative Produced _Monthly_Total_Purchased
df_dfp3 = pd.DataFrame(index=df.index)

# Variable Info
df_dfp3['in_VariableCV'] = "Produced"
df_dfp3['in_VariableSpecificCV'] = "Cumulative Produced_Monthly_Total_Purchased"

# Water Source Info
df_dfp3['in_WaterSourceTypeCV'] = "Purchased"

# Site Info
df_dfp3['in_CoordinateMethodCV'] = "Centroid of Area"
df_dfp3['in_County'] = df['COUNTY']
df_dfp3['in_Latitude'] = df['Lat']
df_dfp3['in_Longitude'] = df['Long']
df_dfp3['in_PODorPOUSite'] = "POU"
df_dfp3['in_SiteName'] = df['Water System Name']
df_dfp3['in_SiteNativeID'] = df['SABL_PWSID']
df_dfp3['in_SiteTypeCV'] = df['BOUNDARY_T']

# Site Variable Amounts Info
df_dfp3['WaterUnits'] = df['WATER PRODUCED Water.Units REVIEWED BY OFFICE OF INFORMATION MANAGEMENT AND ANALYSIS']
df_dfp3['in_Amount'] = df['FINSIHIED WATER PURCHASED OR RECEIVED FROM ANOTHER PUBLIC WATER SYSTEM']
df_dfp3['in_BenUse'] = "Unspecified"
df_dfp3['in_CommunityWaterSupplySystem'] = df['Water.System.Name']
df_dfp3['in_CustomerTypeCV'] = df['State Water System Type']
df_dfp3['in_PopulationServed'] = df['Population Of Service Area']
df_dfp3['Year'] = df['Year']
df_dfp3['Month'] = df['Month']
df_dfp3['Days.In.Month'] = df['Days.In.Month']

print(len(df_dfp3))
df_dfp3.head(1)

195780


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_BenUse,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Produced,Cumulative Produced_Monthly_Total_Purchased,Purchased,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,G,,Unspecified,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2014,January,31


In [21]:
# Concatenate Produced Data Together
frames = [df_dfp1, df_dfp2, df_dfp3]
df_P_out = pd.concat(frames).reset_index(drop=True)
print(len(df_P_out))

587340


In [22]:
# Creating amount value for based on units. Convert to gallons from input unit.

# Fix unit type to float
# Issue of some entries are strings
df_P_out['in_Amount'] = df_P_out['in_Amount'].replace('-','', regex=True)
df_P_out['in_Amount'] = df_P_out['in_Amount'].replace(',','', regex=True)
df_P_out['in_Amount'] = df_P_out['in_Amount'].replace('FALSE','', regex=True)
df_P_out['in_Amount'] = df_P_out['in_Amount'].str.strip()
df_P_out['in_Amount'] = pd.to_numeric(df_P_out['in_Amount'])

def createAmount(unit, val):
    outVal = val # default
    if unit == 'AF':
        outVal = val * 325851
    if unit == 'CCF':
        outVal = val * 748.052
    if unit == 'MG':
        outVal = val * 1000000
    if unit == 'DG':
        outVal = val * 10
    if unit == 'TG':
        outVal = val * 1000
    if unit == 'HG':
        outVal = val * 100
    if unit == 'CF':
        outVal = val * 7.48052

    return outVal

df_P_out['temp_Amount'] = df_P_out['in_Amount']
df_P_out['in_Amount'] = df_P_out.apply(lambda row: createAmount(row['WaterUnits'], row['temp_Amount']), axis=1)
df_P_out = df_P_out.drop(['temp_Amount'], axis=1)
df_P_out.head(1)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_BenUse,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Produced,Cumulative Produced_Monthly_Total_Groundwater,Groundwater,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,G,171120.0,Unspecified,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2014,January,31


# Concatenate Together. Output Dataframe.

In [23]:
# Concatenate Produced Data Together
frames = [df_D_out, df_P_out]
dfout = pd.concat(frames).reset_index(drop=True)
print(len(dfout))

1215652


In [24]:
# Create WaDE TimeframeStart

MonthDictionary = {
"January" : "01",
"February" : "02",
"March" : "03",
"April" : "04",
"May" : "05",
"June" : "06",
"July" : "07",
"August" : "08",
"September" : "09",
"October" : "10",
"November" : "11",
"December" : "12"}

def createTimeframeStart(Year, Month):
    yearString = str(Year).strip()
    monthString = str(MonthDictionary[str(Month).strip()]).strip()
    try:
        outString = yearString + "-" + monthString + "-01"
    except:
        outString = ''
    return outString

dfout['in_TimeframeStart'] = dfout.apply(lambda row: createTimeframeStart(row['Year'], row['Month']), axis=1)
dfout.head(1)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_BenUse,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month,in_TimeframeStart
0,Delivered,Cumulative Delivered_Monthly_Single Family Res...,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,105995.0,Single Family Residential,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,January,31,2015-01-01


In [25]:
# Create WaDE TimeframeEnd

MonthDictionary = {
"January" : "01",
"February" : "02",
"March" : "03",
"April" : "04",
"May" : "05",
"June" : "06",
"July" : "07",
"August" : "08",
"September" : "09",
"October" : "10",
"November" : "11",
"December" : "12"}

def createTimeframeEnd(Year, Month, Day):
    yearString = str(Year).strip()
    monthString = str(MonthDictionary[str(Month).strip()]).strip()
    dayString = str(Day).strip()
    try:
        outString = yearString + "-" + monthString + "-" + dayString
    except:
        outString = ''
    return outString

dfout['in_TimeframeEnd'] = dfout.apply(lambda row: createTimeframeEnd(row['Year'], row['Month'], row['Days.In.Month']), axis=1)
dfout.head(1)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_BenUse,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month,in_TimeframeStart,in_TimeframeEnd
0,Delivered,Cumulative Delivered_Monthly_Single Family Res...,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,105995.0,Single Family Residential,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,January,31,2015-01-01,2015-01-31


In [26]:
#Update datatype of Priority Date to fit WaDE 2.0 structure

dfout['in_TimeframeStart'] = pd.to_datetime(dfout['in_TimeframeStart'])
dfout['in_TimeframeStart'] = pd.to_datetime(dfout["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))

dfout['in_TimeframeEnd'] = pd.to_datetime(dfout['in_TimeframeEnd'])
dfout['in_TimeframeEnd'] = pd.to_datetime(dfout["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))

dfout.head(1)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_BenUse,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month,in_TimeframeStart,in_TimeframeEnd
0,Delivered,Cumulative Delivered_Monthly_Single Family Res...,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,105995.0,Single Family Residential,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,January,31,2015-01-01,2015-01-31


## Fixing a few errors

In [27]:
# Fixing Population Data Type

dfout['in_PopulationServed'] = dfout['in_PopulationServed'].replace('', 0, regex=True)
dfout['in_PopulationServed'] = dfout['in_PopulationServed'].fillna(0)
dfout['in_PopulationServed'] = dfout['in_PopulationServed'].astype(int).fillna(0)
dfout.head(1)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_BenUse,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month,in_TimeframeStart,in_TimeframeEnd
0,Delivered,Cumulative Delivered_Monthly_Single Family Res...,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,105995.0,Single Family Residential,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50,2015,January,31,2015-01-01,2015-01-31


In [28]:
# Fixing empty site names

def fixSiteName(colrowValue):
    colrowValue = str(colrowValue).strip()
    if colrowValue == "" or colrowValue == " " or pd.isnull(colrowValue):
        outString = "Unspecified"
    else:
        outString = colrowValue
    return outString

dfout['in_SiteName'] = dfout.apply(lambda row: fixSiteName(row['in_SiteName']), axis=1)

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "WaDECA_WS" + string1
    return outstring

dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = dfout['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    ml = dfWaterSourceNativeID.loc[(dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A), 'in_WaterSourceNativeID']
    if not (ml.empty):  # check if the series is empty
        outList = ml.iloc[0]
    else:
        outList = ''
    return outList

dfout['in_WaterSourceNativeID'] = dfout.apply(lambda row: retrieveWaterSourceNativeID( row['in_WaterSourceTypeCV']), axis=1)
dfout.head(1)

## Export Outputs

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfout.dtypes)

In [None]:
#Exporting to Finished File
dfout.to_csv('P_caSSMaster.csv', index=False)  # The output