# Pre-processing California Site Specific data for WaDEQA upload.
Date Updated: 02/26/2021

Purpose:  To pre-process the California site specific data into one master file for simple dataframe creation and extraction.  See "CA_SiteSpecificAmounts Schema Mapping to WaDE_QA.xlsx" for more details.

Notes:
- Going to use both Produced Water & Delivered Water data, and pair both to the Area of Use as POU.
- Create three seperate dataframes (one for Delivered, and two for Produced), then concatenate into single long output dataframe.

In [1]:
# Needed Libararies
import os
import math
import numpy as np
import pandas as pd
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

#Working Directory and Input File
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/California/SiteSpecificAmounts/RawInputData"
os.chdir(workingDir)

In [2]:
# Time Series Data - Delivered
fileInput1a = "deliveredPWS_2013_2016_input.csv"
dfdts = pd.read_csv(fileInput1a)
print(len(dfdts))
dfdts.head(1)

78537


Unnamed: 0,PWSID,Water.System.Name,Water.System.Classification,Year,Month,Date,Days.In.Month,Delivered.Water.Units AS ORIGINALLY REPORTED,Delivered.Water.Units.Revised BY OFFICE OF INFORMATION MANAGEMENT AND ANALYSIS,UNITS ADJUSTED BY OIMA?,WATER DELIVERIES TO Single.family.Residential,WATER DELIVERIES TO Multi.family.Residential,WATER DELIVERIES TO Commercial.Institutional,WATER DELIVERIES TO Industrial,WATER DELIVERIES TO Landscape.Irrigation,WATER DELIVERIES TO Other,WATER DELIVERIES TO Agricultural,WATER DELIVERIES TO Other.PWS,"WATER DELIVERIES Total.Delivered Residential IN REVISED UNITS (Total Does not include Landscape Irrigation, Agricultural or to other PWS)","Total. RESIDENTIAL Delivered.Gallons (Total Does not include Landscape Irrigation, Agricultural or to other PWS)",Population Of Service Area,CALCULATED GPCD (Total delivery to residential in gallons per capita day)
0,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN.,Community Water System,2015,January,1/1/2015,31,G,G,NO CHANGES,105995,,,,,,,,105995.0,105995.0,50.0,68.4


In [3]:
# Time Series Data - Produced
fileInput1b = "producedPWS_2013_2016_input.csv"
dfpts = pd.read_csv(fileInput1b)
print(len(dfpts))
dfpts.head(1)

212824


Unnamed: 0,PWSID,Water.System.Name,Water.System.Classification,Year,Month,Date,Days.In.Month,WATER PRODUCED Water.Units IN UNITS ORIGINALLY REPORTED,WATER PRODUCED Water.Units REVIEWED BY OFFICE OF INFORMATION MANAGEMENT AND ANALYSIS,UNITS ADJUSTED BY OIMA?,Finished.Water.Vol.Type.Revised,WATER PRODUCED FROM GROUNDWATER,WATER PRODUCED FROM SURFACE WATER,FINSIHIED WATER PURCHASED OR RECEIVED FROM ANOTHER PUBLIC WATER SYSTEM,WATER SOLD TO ANOTHER PUBLIC WATER SYSTEM,Non-Potable Produced Water (EXCLUDING RECYCLING),RECYCLED WATER PRODUCED,"TOTAL POTABLE WATER PRODUCED USING REVISED UNITS (Total Does not Include Sold, Non-potable and Recycled amounts)","TOTAL POTABLE WATER IN GALLONS (Total Does not Include Sold, Non-potable and Recycled amounts)",Population Of Service Area,CALCULATED GPCD (Total Potable Produced in gallons per capita day)
0,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN.,Community Water System,2014,January,1/1/2014,31,G,G,NO CHANGES,M,171120.0,,,,,,171120.0,171120.0,50.0,110.4


In [4]:
# Facility info
fileInput2 = "PWS Facility Information_input.csv"
dffi = pd.read_csv(fileInput2)
print(len(dffi))
dffi.head(1)

7763


Unnamed: 0,Water System No,Water System Name,Principal County Served,Federal Water System Type CODE,Federal Water System Type,State Water System Type CODE,State Water System Type,Water System Status CODE,System Status,Owner Type CODE,Owner Type,Primary Water Source Type CODE,Primary Water Source Type,Residential Population,Non-Transient Population,Transient Population,Total Population,Number of Agricultural Service Connections (AG),Number of Combined Service Connections (CB),Number of Commercial Service Connections (CM),Number of Institutional Service Connections (IN),Number of Residential Service Connections (RS),Total Number of Service Connections,Fee Code,Fee Code Description,Date of Sanitary Survey visit (SNSV Visit Date),CITY,Treatment Plant Class CODE,Treatment Plant Class,Distribution System Class CODE,Distribution System Class
0,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN,ALAMEDA,C,Community,C,Community,A,Active,P,Private,GW,Groundwater,50.0,,,50.0,,19.0,,,,19.0,SC,Small Community,12/12/2017,CASTRO VALLEY,,,D1,Distribution Operator Level 1


In [5]:
# Shapefile / Site and Boundary info
fileInput3 = "CADWS_AreaBoundaries_input.csv"
dfbi = pd.read_csv(fileInput3)
print(len(dfbi))
dfbi.head(1)

4563


Unnamed: 0,OID_,OBJECTID_1,SABL_PWSID,WATER_SYST,WATER_SY_1,BOUNDARY_T,REGULATING,COUNTY,ADDR_LINE_,ADDR_LIN_1,ADDRESS_CI,ADDRESS_ST,ADDRESS_ZI,STATE_CLAS,POPULATION,SERVICE_CO,VERIFIED_S,LAST_EDITE,VERIFIED_N,VERIFIED_T,DT_VERIFIE,CREATED_US,CREATED_DA,LAST_EDI_1,OBJECTID,BOUNDARY_F,ACTIVITY_S,ACTIVITY_D,OWNER_TYPE,FEDERAL_CL,Shape__Are,Shape__Len,SHAPE_1,Shape_Length,Shape_Area,Lat,Long
0,1,304417,CA1100445,CA1100445,ORLAND MOBILE H.P.,Water Service Area,DISTRICT 21 - VALLEY,GLENN,4265 ROAD 99 WEST,P.O. BOX 1721,ORLAND,CA,95963,COMMUNITY,178,54,Not Verified,10/20/2020 0:00,,,1/1/1970 0:00,SBUCKNAM,10/20/2020 0:00,SBUCKNAM,,WBT Tool,A,3/28/1986 0:00,P,COMMUNITY,36576.63873,790.599915,,0.006519,2e-06,39.734289,-122.198109


## Delivered Data

In [6]:
# Merging dataframes into one, using left-join.
df = pd.DataFrame()

df = pd.merge(dfdts, dffi, left_on='PWSID', right_on='Water System No', how='left') 
df = pd.merge(df, dfbi, left_on='PWSID', right_on='SABL_PWSID', how='left')

df = df.replace("Null", "")  # The State's Master input dataframe. Remove any nulls.
df = df.replace("nan", "")  # The State's Master input dataframe. Remove any nulls.
df = df.replace(np.nan, "")  # The State's Master input dataframe. Remove any nulls.

print(len(df))
df.head(1)

78539


Unnamed: 0,PWSID,Water.System.Name,Water.System.Classification,Year,Month,Date,Days.In.Month,Delivered.Water.Units AS ORIGINALLY REPORTED,Delivered.Water.Units.Revised BY OFFICE OF INFORMATION MANAGEMENT AND ANALYSIS,UNITS ADJUSTED BY OIMA?,WATER DELIVERIES TO Single.family.Residential,WATER DELIVERIES TO Multi.family.Residential,WATER DELIVERIES TO Commercial.Institutional,WATER DELIVERIES TO Industrial,WATER DELIVERIES TO Landscape.Irrigation,WATER DELIVERIES TO Other,WATER DELIVERIES TO Agricultural,WATER DELIVERIES TO Other.PWS,"WATER DELIVERIES Total.Delivered Residential IN REVISED UNITS (Total Does not include Landscape Irrigation, Agricultural or to other PWS)","Total. RESIDENTIAL Delivered.Gallons (Total Does not include Landscape Irrigation, Agricultural or to other PWS)",Population Of Service Area,CALCULATED GPCD (Total delivery to residential in gallons per capita day),Water System No,Water System Name,Principal County Served,Federal Water System Type CODE,Federal Water System Type,State Water System Type CODE,State Water System Type,Water System Status CODE,System Status,Owner Type CODE,Owner Type,Primary Water Source Type CODE,Primary Water Source Type,Residential Population,Non-Transient Population,Transient Population,Total Population,Number of Agricultural Service Connections (AG),Number of Combined Service Connections (CB),Number of Commercial Service Connections (CM),Number of Institutional Service Connections (IN),Number of Residential Service Connections (RS),Total Number of Service Connections,Fee Code,Fee Code Description,Date of Sanitary Survey visit (SNSV Visit Date),CITY,Treatment Plant Class CODE,Treatment Plant Class,Distribution System Class CODE,Distribution System Class,OID_,OBJECTID_1,SABL_PWSID,WATER_SYST,WATER_SY_1,BOUNDARY_T,REGULATING,COUNTY,ADDR_LINE_,ADDR_LIN_1,ADDRESS_CI,ADDRESS_ST,ADDRESS_ZI,STATE_CLAS,POPULATION,SERVICE_CO,VERIFIED_S,LAST_EDITE,VERIFIED_N,VERIFIED_T,DT_VERIFIE,CREATED_US,CREATED_DA,LAST_EDI_1,OBJECTID,BOUNDARY_F,ACTIVITY_S,ACTIVITY_D,OWNER_TYPE,FEDERAL_CL,Shape__Are,Shape__Len,SHAPE_1,Shape_Length,Shape_Area,Lat,Long
0,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN.,Community Water System,2015,January,1/1/2015,31,G,G,NO CHANGES,105995,,,,,,,,105995.0,105995.0,50.0,68.4,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN,ALAMEDA,C,Community,C,Community,A,Active,P,Private,GW,Groundwater,50.0,,,50.0,,19.0,,,,19.0,SC,Small Community,12/12/2017,CASTRO VALLEY,,,D1,Distribution Operator Level 1,728.0,305171.0,CA0103040,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN,Water Service Area,DISTRICT 04 - SAN FRANCISCO,ALAMEDA,8653 NORRIS CANYON ROAD,,CASTRO VALLEY,CA,94552,COMMUNITY,50.0,19.0,Not Verified,10/20/2020 0:00,,,1/1/1970 0:00,SBUCKNAM,10/20/2020 0:00,SBUCKNAM,,WBT Tool,A,1/27/1983 0:00,P,COMMUNITY,593518.1428,3456.677697,,0.0278,3.8e-05,37.734364,-122.027303


In [7]:
# Output dataframe for Culinary Service Area Data

df_D = pd.DataFrame(index=df.index)

# Variable Info
df_D['in_VariableCV'] = "Water Use"
df_D['in_VariableSpecificCV'] = "Water Use Delivered"

# Water Source Info
df_D['in_WaterSourceTypeCV'] = "Unspecified"

# Site Info
df_D['in_CoordinateMethodCV'] = "Centroid of Area"
df_D['in_County'] = df['COUNTY']
df_D['in_Latitude'] = df['Lat']
df_D['in_Longitude'] = df['Long']
df_D['in_PODorPOUSite'] = "POU"
df_D['in_SiteName'] = df['Water System Name']
df_D['in_SiteNativeID'] = df['SABL_PWSID']
df_D['in_SiteTypeCV'] = df['BOUNDARY_T']

# Site Variable Amounts Info
df_D['WaterUnits'] = ""
df_D['in_Amount'] = df['Total. RESIDENTIAL Delivered.Gallons (Total Does not include Landscape Irrigation, Agricultural or to other PWS)']
df_D['in_CommunityWaterSupplySystem'] = df['Water.System.Name']
df_D['in_CustomerTypeCV'] = df['State Water System Type']
df_D['in_PopulationServed'] = df['Population Of Service Area']
df_D['Year'] = df['Year']
df_D['Month'] = df['Month']
df_D['Days.In.Month'] = df['Days.In.Month']


df_D = df_D.replace(np.nan, "").drop_duplicates().reset_index(drop=True)
print(len(df_D))
df_D.head(1)

78537


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Water Use,Water Use Delivered,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,105995.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,January,31


In [8]:
# Fixing Water Amount datatype
# Issue of some entries are strings with a "," or as "FALSE".

df_D['in_Amount'] = df_D['in_Amount'].replace(',','', regex=True)
df_D['in_Amount'] = df_D['in_Amount'].replace('FALSE','', regex=True)
df_D['in_Amount'] = df_D['in_Amount'].str.strip()
df_D['in_Amount'] = pd.to_numeric(df_D['in_Amount'])
df_D.head(3)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Water Use,Water Use Delivered,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,105995.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,January,31
1,Water Use,Water Use Delivered,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,65156.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,February,28
2,Water Use,Water Use Delivered,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,99975.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,March,31


## Produced Data - Groundwater

In [9]:
# Merging dataframes into one, using left-join.
# Note: not all sites had groundwater values, will drop those rows.

df = pd.DataFrame()

df = pd.merge(dfpts, dffi, left_on='PWSID', right_on='Water System No', how='left') 
df = pd.merge(df, dfbi, left_on='PWSID', right_on='SABL_PWSID', how='left')

df = df.dropna(subset = ["WATER PRODUCED FROM GROUNDWATER"]).reset_index(drop=True)

df = df.replace("Null", "")  # The State's Master input dataframe. Remove any nulls.
df = df.replace("nan", "")  # The State's Master input dataframe. Remove any nulls.
df = df.replace(np.nan, "")  # The State's Master input dataframe. Remove any nulls.

print(len(df))
df.head(1)

195780


Unnamed: 0,PWSID,Water.System.Name,Water.System.Classification,Year,Month,Date,Days.In.Month,WATER PRODUCED Water.Units IN UNITS ORIGINALLY REPORTED,WATER PRODUCED Water.Units REVIEWED BY OFFICE OF INFORMATION MANAGEMENT AND ANALYSIS,UNITS ADJUSTED BY OIMA?,Finished.Water.Vol.Type.Revised,WATER PRODUCED FROM GROUNDWATER,WATER PRODUCED FROM SURFACE WATER,FINSIHIED WATER PURCHASED OR RECEIVED FROM ANOTHER PUBLIC WATER SYSTEM,WATER SOLD TO ANOTHER PUBLIC WATER SYSTEM,Non-Potable Produced Water (EXCLUDING RECYCLING),RECYCLED WATER PRODUCED,"TOTAL POTABLE WATER PRODUCED USING REVISED UNITS (Total Does not Include Sold, Non-potable and Recycled amounts)","TOTAL POTABLE WATER IN GALLONS (Total Does not Include Sold, Non-potable and Recycled amounts)",Population Of Service Area,CALCULATED GPCD (Total Potable Produced in gallons per capita day),Water System No,Water System Name,Principal County Served,Federal Water System Type CODE,Federal Water System Type,State Water System Type CODE,State Water System Type,Water System Status CODE,System Status,Owner Type CODE,Owner Type,Primary Water Source Type CODE,Primary Water Source Type,Residential Population,Non-Transient Population,Transient Population,Total Population,Number of Agricultural Service Connections (AG),Number of Combined Service Connections (CB),Number of Commercial Service Connections (CM),Number of Institutional Service Connections (IN),Number of Residential Service Connections (RS),Total Number of Service Connections,Fee Code,Fee Code Description,Date of Sanitary Survey visit (SNSV Visit Date),CITY,Treatment Plant Class CODE,Treatment Plant Class,Distribution System Class CODE,Distribution System Class,OID_,OBJECTID_1,SABL_PWSID,WATER_SYST,WATER_SY_1,BOUNDARY_T,REGULATING,COUNTY,ADDR_LINE_,ADDR_LIN_1,ADDRESS_CI,ADDRESS_ST,ADDRESS_ZI,STATE_CLAS,POPULATION,SERVICE_CO,VERIFIED_S,LAST_EDITE,VERIFIED_N,VERIFIED_T,DT_VERIFIE,CREATED_US,CREATED_DA,LAST_EDI_1,OBJECTID,BOUNDARY_F,ACTIVITY_S,ACTIVITY_D,OWNER_TYPE,FEDERAL_CL,Shape__Are,Shape__Len,SHAPE_1,Shape_Length,Shape_Area,Lat,Long
0,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN.,Community Water System,2014,January,1/1/2014,31,G,G,NO CHANGES,M,171120.0,,,,,,171120.0,171120.0,50.0,110.4,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN,ALAMEDA,C,Community,C,Community,A,Active,P,Private,GW,Groundwater,50.0,,,50.0,,19.0,,,,19.0,SC,Small Community,12/12/2017,CASTRO VALLEY,,,D1,Distribution Operator Level 1,728.0,305171.0,CA0103040,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN,Water Service Area,DISTRICT 04 - SAN FRANCISCO,ALAMEDA,8653 NORRIS CANYON ROAD,,CASTRO VALLEY,CA,94552,COMMUNITY,50.0,19.0,Not Verified,10/20/2020 0:00,,,1/1/1970 0:00,SBUCKNAM,10/20/2020 0:00,SBUCKNAM,,WBT Tool,A,1/27/1983 0:00,P,COMMUNITY,593518.1428,3456.677697,,0.0278,3.8e-05,37.734364,-122.027303


In [10]:
# Output dataframe for Culinary Service Area Data

df_Pgw = pd.DataFrame(index=df.index)

# Variable Info
df_Pgw['in_VariableCV'] = "Water Use"
df_Pgw['in_VariableSpecificCV'] = "Water Use Produced"

# Water Source Info
df_Pgw['in_WaterSourceTypeCV'] = "Groundwater"

# Site Info
df_Pgw['in_CoordinateMethodCV'] = "Centroid of Area"
df_Pgw['in_County'] = df['COUNTY']
df_Pgw['in_Latitude'] = df['Lat']
df_Pgw['in_Longitude'] = df['Long']
df_Pgw['in_PODorPOUSite'] = "POD"
df_Pgw['in_SiteName'] = df['Water System Name']
df_Pgw['in_SiteNativeID'] = df['SABL_PWSID']
df_Pgw['in_SiteTypeCV'] = df['BOUNDARY_T']

# Site Variable Amounts Info
df_Pgw['WaterUnits'] = df['WATER PRODUCED Water.Units REVIEWED BY OFFICE OF INFORMATION MANAGEMENT AND ANALYSIS']
df_Pgw['in_Amount'] = df['WATER PRODUCED FROM GROUNDWATER']
df_Pgw['in_CommunityWaterSupplySystem'] = df['Water.System.Name']
df_Pgw['in_CustomerTypeCV'] = df['State Water System Type']
df_Pgw['in_PopulationServed'] = df['Population Of Service Area']
df_Pgw['Year'] = df['Year']
df_Pgw['Month'] = df['Month']
df_Pgw['Days.In.Month'] = df['Days.In.Month']


df_Pgw = df_Pgw.replace(np.nan, "").drop_duplicates().reset_index(drop=True)
print(len(df_Pgw))
df_Pgw.head(1)

195709


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Water Use,Water Use Produced,Groundwater,Centroid of Area,ALAMEDA,37.734364,-122.027303,POD,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,G,171120.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2014,January,31


In [11]:
# Creating amount value for based on units.
# Convert to gallons from input unit.

# Fix unit type to float
# Issue of some entries are strings
df_Pgw['in_Amount'] = df_Pgw['in_Amount'].replace('-','', regex=True)
df_Pgw['in_Amount'] = df_Pgw['in_Amount'].replace(',','', regex=True)
df_Pgw['in_Amount'] = df_Pgw['in_Amount'].replace('FALSE','', regex=True)
df_Pgw['in_Amount'] = df_Pgw['in_Amount'].str.strip()
df_Pgw['in_Amount'] = pd.to_numeric(df_Pgw['in_Amount'])

def createAmount(unit, val):
    outVal = val # default
    if unit == 'AF':
        outVal = val * 325851
    if unit == 'CCF':
        outVal = val * 748.052
    if unit == 'MG':
        outVal = val * 1000000
    if unit == 'DG':
        outVal = val * 10
    if unit == 'TG':
        outVal = val * 1000
    if unit == 'CF':
        outVal = val * 7.48052

    return outVal

df_Pgw['in_Amount'] = df_Pgw.apply(lambda row: createAmount(row['WaterUnits'], row['in_Amount']), axis=1)
df_Pgw.head(3)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Water Use,Water Use Produced,Groundwater,Centroid of Area,ALAMEDA,37.734364,-122.027303,POD,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,G,171120.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2014,January,31
1,Water Use,Water Use Produced,Groundwater,Centroid of Area,ALAMEDA,37.734364,-122.027303,POD,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,G,154560.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2014,February,28
2,Water Use,Water Use Produced,Groundwater,Centroid of Area,ALAMEDA,37.734364,-122.027303,POD,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,G,171120.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2014,March,31


## Produced Data - Surface Water

In [12]:
# Merging dataframes into one, using left-join.
# Note: not all sites had surface water values, will drop those rows.

df = pd.DataFrame()

df = pd.merge(dfpts, dffi, left_on='PWSID', right_on='Water System No', how='left') 
df = pd.merge(df, dfbi, left_on='PWSID', right_on='SABL_PWSID', how='left')

df = df.dropna(subset = ["WATER PRODUCED FROM SURFACE WATER"]).reset_index(drop=True)

df = df.replace("Null", "")  # The State's Master input dataframe. Remove any nulls.
df = df.replace("nan", "")  # The State's Master input dataframe. Remove any nulls.
df = df.replace(np.nan, "")  # The State's Master input dataframe. Remove any nulls.

print(len(df))
df.head(1)

109915


Unnamed: 0,PWSID,Water.System.Name,Water.System.Classification,Year,Month,Date,Days.In.Month,WATER PRODUCED Water.Units IN UNITS ORIGINALLY REPORTED,WATER PRODUCED Water.Units REVIEWED BY OFFICE OF INFORMATION MANAGEMENT AND ANALYSIS,UNITS ADJUSTED BY OIMA?,Finished.Water.Vol.Type.Revised,WATER PRODUCED FROM GROUNDWATER,WATER PRODUCED FROM SURFACE WATER,FINSIHIED WATER PURCHASED OR RECEIVED FROM ANOTHER PUBLIC WATER SYSTEM,WATER SOLD TO ANOTHER PUBLIC WATER SYSTEM,Non-Potable Produced Water (EXCLUDING RECYCLING),RECYCLED WATER PRODUCED,"TOTAL POTABLE WATER PRODUCED USING REVISED UNITS (Total Does not Include Sold, Non-potable and Recycled amounts)","TOTAL POTABLE WATER IN GALLONS (Total Does not Include Sold, Non-potable and Recycled amounts)",Population Of Service Area,CALCULATED GPCD (Total Potable Produced in gallons per capita day),Water System No,Water System Name,Principal County Served,Federal Water System Type CODE,Federal Water System Type,State Water System Type CODE,State Water System Type,Water System Status CODE,System Status,Owner Type CODE,Owner Type,Primary Water Source Type CODE,Primary Water Source Type,Residential Population,Non-Transient Population,Transient Population,Total Population,Number of Agricultural Service Connections (AG),Number of Combined Service Connections (CB),Number of Commercial Service Connections (CM),Number of Institutional Service Connections (IN),Number of Residential Service Connections (RS),Total Number of Service Connections,Fee Code,Fee Code Description,Date of Sanitary Survey visit (SNSV Visit Date),CITY,Treatment Plant Class CODE,Treatment Plant Class,Distribution System Class CODE,Distribution System Class,OID_,OBJECTID_1,SABL_PWSID,WATER_SYST,WATER_SY_1,BOUNDARY_T,REGULATING,COUNTY,ADDR_LINE_,ADDR_LIN_1,ADDRESS_CI,ADDRESS_ST,ADDRESS_ZI,STATE_CLAS,POPULATION,SERVICE_CO,VERIFIED_S,LAST_EDITE,VERIFIED_N,VERIFIED_T,DT_VERIFIE,CREATED_US,CREATED_DA,LAST_EDI_1,OBJECTID,BOUNDARY_F,ACTIVITY_S,ACTIVITY_D,OWNER_TYPE,FEDERAL_CL,Shape__Are,Shape__Len,SHAPE_1,Shape_Length,Shape_Area,Lat,Long
0,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN,Community Water System,2016,January,1/1/2016,31,G,G,NO CHANGES,M,126403.0,-,-,-,-,-,126403.0,126403.0,50.0,81.6,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN,ALAMEDA,C,Community,C,Community,A,Active,P,Private,GW,Groundwater,50.0,,,50.0,,19.0,,,,19.0,SC,Small Community,12/12/2017,CASTRO VALLEY,,,D1,Distribution Operator Level 1,728.0,305171.0,CA0103040,CA0103040,NORRIS CANYON PROPERTY OWNERS ASSN,Water Service Area,DISTRICT 04 - SAN FRANCISCO,ALAMEDA,8653 NORRIS CANYON ROAD,,CASTRO VALLEY,CA,94552,COMMUNITY,50.0,19.0,Not Verified,10/20/2020 0:00,,,1/1/1970 0:00,SBUCKNAM,10/20/2020 0:00,SBUCKNAM,,WBT Tool,A,1/27/1983 0:00,P,COMMUNITY,593518.1428,3456.677697,,0.0278,3.8e-05,37.734364,-122.027303


In [13]:
# Output dataframe for Culinary Service Area Data

df_Psw = pd.DataFrame(index=df.index)

# Variable Info
df_Psw['in_VariableCV'] = "Water Use"
df_Psw['in_VariableSpecificCV'] = "Water Use Produced"

# Water Source Info
df_Psw['in_WaterSourceTypeCV'] = "Surface Water"

# Site Info
df_Psw['in_CoordinateMethodCV'] = "Centroid of Area"
df_Psw['in_County'] = df['COUNTY']
df_Psw['in_Latitude'] = df['Lat']
df_Psw['in_Longitude'] = df['Long']
df_Psw['in_PODorPOUSite'] = "POD"
df_Psw['in_SiteName'] = df['Water System Name']
df_Psw['in_SiteNativeID'] = df['SABL_PWSID']
df_Psw['in_SiteTypeCV'] = df['BOUNDARY_T']

# Site Variable Amounts Info
df_Psw['WaterUnits'] = df['WATER PRODUCED Water.Units REVIEWED BY OFFICE OF INFORMATION MANAGEMENT AND ANALYSIS']
df_Psw['in_Amount'] = df['WATER PRODUCED FROM SURFACE WATER']
df_Psw['in_CommunityWaterSupplySystem'] = df['Water.System.Name']
df_Psw['in_CustomerTypeCV'] = df['State Water System Type']
df_Psw['in_PopulationServed'] = df['Population Of Service Area']
df_Psw['Year'] = df['Year']
df_Psw['Month'] = df['Month']
df_Psw['Days.In.Month'] = df['Days.In.Month']


df_Psw = df_Psw.replace(np.nan, "").drop_duplicates().reset_index(drop=True)
print(len(df_Psw))
df_Psw.head(1)

109891


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Water Use,Water Use Produced,Surface Water,Centroid of Area,ALAMEDA,37.734364,-122.027303,POD,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,G,-,NORRIS CANYON PROPERTY OWNERS ASSN,Community,50.0,2016,January,31


In [14]:
# Creating amount value for based on units.
# Convert to gallons from input unit.

# Fix unit type to float
# Issue of some entries are strings
df_Psw['in_Amount'] = df_Psw['in_Amount'].replace('-', '', regex=True)
df_Psw['in_Amount'] = df_Psw['in_Amount'].replace(' -   ', '', regex=True)
df_Psw['in_Amount'] = df_Psw['in_Amount'].replace(',', '', regex=True)
df_Psw['in_Amount'] = df_Psw['in_Amount'].replace('FALSE', '', regex=True)
df_Psw['in_Amount'] = df_Psw['in_Amount'].str.strip()
df_Psw['in_Amount'] = pd.to_numeric(df_Psw['in_Amount'])

def createAmount(unit, val):
    outVal = val # default
    if unit == 'AF':
        outVal = val * 325851
    if unit == 'CCF':
        outVal = val * 748.052
    if unit == 'MG':
        outVal = val * 1000000
    if unit == 'DG':
        outVal = val * 10
    if unit == 'TG':
        outVal = val * 1000
    if unit == 'CF':
        outVal = val * 7.48052
    
    if math.isnan(outVal):
        outVal = ""
  
    return outVal

df_Psw['in_Amount'] = df_Psw.apply(lambda row: createAmount(row['WaterUnits'], row['in_Amount']), axis=1)
df_Psw.head(3)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Water Use,Water Use Produced,Surface Water,Centroid of Area,ALAMEDA,37.734364,-122.027303,POD,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,G,,NORRIS CANYON PROPERTY OWNERS ASSN,Community,50.0,2016,January,31
1,Water Use,Water Use Produced,Surface Water,Centroid of Area,ALAMEDA,37.734364,-122.027303,POD,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,G,,NORRIS CANYON PROPERTY OWNERS ASSN,Community,50.0,2016,February,29
2,Water Use,Water Use Produced,Surface Water,Centroid of Area,ALAMEDA,37.734364,-122.027303,POD,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,G,,NORRIS CANYON PROPERTY OWNERS ASSN,Community,50.0,2016,March,31


## Concatenate Together

In [15]:
# Concatenate System Data into one long dataframe.

frames = [df_D, df_Pgw, df_Psw]
dfout = pd.concat(frames)

#dfout = dfout.replace(np.nan, "").drop_duplicates().reset_index(drop=True)
print(len(dfout))
dfout

384137


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Water Use,Water Use Delivered,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,105995.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,January,31
1,Water Use,Water Use Delivered,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,65156.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,February,28
2,Water Use,Water Use Delivered,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,99975.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,March,31
3,Water Use,Water Use Delivered,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,137268.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,April,30
4,Water Use,Water Use Delivered,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,179893.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50.0,2015,May,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109886,Water Use,Water Use Produced,Surface Water,Centroid of Area,YUBA,39.123141,-121.392205,POD,BEALE AIR FORCE BASE,CA5810700,Water Service Area,MG,,BEALE AIR FORCE BASE,Community,7761.0,2016,August,31
109887,Water Use,Water Use Produced,Surface Water,Centroid of Area,YUBA,39.123141,-121.392205,POD,BEALE AIR FORCE BASE,CA5810700,Water Service Area,MG,,BEALE AIR FORCE BASE,Community,7761.0,2016,September,30
109888,Water Use,Water Use Produced,Surface Water,Centroid of Area,YUBA,39.123141,-121.392205,POD,BEALE AIR FORCE BASE,CA5810700,Water Service Area,MG,,BEALE AIR FORCE BASE,Community,7761.0,2016,October,31
109889,Water Use,Water Use Produced,Surface Water,Centroid of Area,YUBA,39.123141,-121.392205,POD,BEALE AIR FORCE BASE,CA5810700,Water Service Area,MG,,BEALE AIR FORCE BASE,Community,7761.0,2016,November,30


In [16]:
# Fixing Population Data Type

dfout['in_PopulationServed'] = dfout['in_PopulationServed'].replace('', 0, regex=True)
dfout['in_PopulationServed'] = dfout['in_PopulationServed'].fillna(0)
dfout['in_PopulationServed'] = dfout['in_PopulationServed'].astype(int).fillna(0)
dfout.head(1)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month
0,Water Use,Water Use Delivered,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,105995.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50,2015,January,31


In [17]:
# Create WaDE TimeframeStart

MonthDictionary = {
"January" : "01",
"February" : "02",
"March" : "03",
"April" : "04",
"May" : "05",
"June" : "06",
"July" : "07",
"August" : "08",
"September" : "09",
"October" : "10",
"November" : "11",
"December" : "12"}

def createTimeframeStart(Year, Month):
    yearString = str(Year).strip()
    monthString = str(MonthDictionary[str(Month).strip()]).strip()
    try:
        outString = yearString + "-" + monthString + "-01"
    except:
        outString = ''
    return outString

dfout['in_TimeframeStart'] = dfout.apply(lambda row: createTimeframeStart(row['Year'], row['Month']), axis=1)
dfout.head(1)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month,in_TimeframeStart
0,Water Use,Water Use Delivered,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,105995.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50,2015,January,31,2015-01-01


In [18]:
# Create WaDE TimeframeEnd

MonthDictionary = {
"January" : "01",
"February" : "02",
"March" : "03",
"April" : "04",
"May" : "05",
"June" : "06",
"July" : "07",
"August" : "08",
"September" : "09",
"October" : "10",
"November" : "11",
"December" : "12"}

def createTimeframeEnd(Year, Month, Day):
    yearString = str(Year).strip()
    monthString = str(MonthDictionary[str(Month).strip()]).strip()
    dayString = str(Day).strip()
    try:
        outString = yearString + "-" + monthString + "-" + dayString
    except:
        outString = ''
    return outString

dfout['in_TimeframeEnd'] = dfout.apply(lambda row: createTimeframeEnd(row['Year'], row['Month'], row['Days.In.Month']), axis=1)
dfout.head(1)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month,in_TimeframeStart,in_TimeframeEnd
0,Water Use,Water Use Delivered,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,105995.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50,2015,January,31,2015-01-01,2015-01-31


In [19]:
#Update datatype of Priority Date to fit WaDE 2.0 structure

dfout['in_TimeframeStart'] = pd.to_datetime(dfout['in_TimeframeStart'])
dfout['in_TimeframeStart'] = pd.to_datetime(dfout["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))

dfout['in_TimeframeEnd'] = pd.to_datetime(dfout['in_TimeframeEnd'])
dfout['in_TimeframeEnd'] = pd.to_datetime(dfout["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))

dfout.head(1)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month,in_TimeframeStart,in_TimeframeEnd
0,Water Use,Water Use Delivered,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,105995.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50,2015,January,31,2015-01-01,2015-01-31


## Fixing a few errors

In [20]:
# Fixing empty site names

def fixSiteName(colrowValue):
    colrowValue = str(colrowValue).strip()
    if colrowValue == "" or pd.isnull(colrowValue):
        outString = "Unspecified"
    else:
        outString = colrowValue
    return outString

dfout['in_SiteName'] = dfout.apply(lambda row: fixSiteName(row['in_SiteName']), axis=1)

In [21]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "WaDECA_WS" + string1
    return outstring

dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = dfout['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    ml = dfWaterSourceNativeID.loc[(dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A), 'in_WaterSourceNativeID']
    if not (ml.empty):  # check if the series is empty
        outList = ml.iloc[0]
    else:
        outList = ''
    return outList

dfout['in_WaterSourceNativeID'] = dfout.apply(lambda row: retrieveWaterSourceNativeID( row['in_WaterSourceTypeCV']), axis=1)
dfout.head(1)

Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_WaterSourceTypeCV,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,WaterUnits,in_Amount,in_CommunityWaterSupplySystem,in_CustomerTypeCV,in_PopulationServed,Year,Month,Days.In.Month,in_TimeframeStart,in_TimeframeEnd,in_WaterSourceNativeID
0,Water Use,Water Use Delivered,Unspecified,Centroid of Area,ALAMEDA,37.734364,-122.027303,POU,NORRIS CANYON PROPERTY OWNERS ASSN,CA0103040,Water Service Area,,105995.0,NORRIS CANYON PROPERTY OWNERS ASSN.,Community,50,2015,January,31,2015-01-01,2015-01-31,WaDECA_WS1


## Export Outputs

In [22]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfout.dtypes)

in_VariableCV                            object
in_VariableSpecificCV                    object
in_WaterSourceTypeCV                     object
in_CoordinateMethodCV                    object
in_County                                object
in_Latitude                              object
in_Longitude                             object
in_PODorPOUSite                          object
in_SiteName                              object
in_SiteNativeID                          object
in_SiteTypeCV                            object
WaterUnits                               object
in_Amount                                object
in_CommunityWaterSupplySystem            object
in_CustomerTypeCV                        object
in_PopulationServed                       int32
Year                                      int64
Month                                    object
Days.In.Month                             int64
in_TimeframeStart                datetime64[ns]
in_TimeframeEnd                  datetim

In [23]:
#Exporting to Finished File
dfout.to_csv('P_caSSMaster.csv', index=False)  # The output