# Pre-processing California Site Specific data for WaDEQA upload.
Date Updated: 02/26/2021

Purpose: To pre-process the California site-specific data into one master file for simple DataFrame creation and extraction. See "CA_SiteSpecificAmounts Schema Mapping to WaDE_QA.xlsx" for more details.

Notes:
- asdf

In [None]:
#Needed Libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

#Working Directory and Input File
# NOTE(review): absolute, user-specific path -- adjust before running on another machine.
# os.chdir makes this the current directory, so relative file names used after this
# point are resolved against it.
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/California/SiteSpecificAmounts/RawInputData"
os.chdir(workingDir)

In [None]:
# Time series data: load the CSV into dfts, report the row count, preview one row.
fileInput1 = "deliveredPWS_2013_2016_input.csv"
dfts = pd.read_csv(filepath_or_buffer=fileInput1)
print(dfts.shape[0])
dfts.head(n=1)

In [None]:
# Facility info: load the CSV into dffi, report the row count, preview one row.
fileInput2 = "PWS Facility Information_input.csv"
dffi = pd.read_csv(filepath_or_buffer=fileInput2)
print(dffi.shape[0])
dffi.head(n=1)

In [None]:
# Shapefile / site and boundary info: load the CSV into dfbi (read as ISO-8859-1),
# report the row count, preview one row.
fileInput3 = "CADWS_AreaBoundaries_input.csv"
dfbi = pd.read_csv(filepath_or_buffer=fileInput3, encoding="ISO-8859-1")
print(dfbi.shape[0])
dfbi.head(n=1)

In [None]:
# Combine the three inputs with two successive left joins keyed on the time-series
# PWSID, so every time-series row is kept even when no facility or boundary row matches.
df = dfts.merge(dffi, how='left', left_on='PWSID', right_on='Water System No')
df = df.merge(dfbi, how='left', left_on='PWSID', right_on='SABL_PWSID')
print(df.shape[0])
df.head(n=1)

In [None]:
# Preserve columns of interest.

columnsList = [
    "PWSID",
    "Water.System.Name",
    "Water.System.Classification",
    "Year",
    "Month",
    "Date",
    "Days.In.Month",
    "Total. RESIDENTIAL Delivered.Gallons (Total Does not include Landscape Irrigation, Agricultural or to other PWS)",
    "Population Of Service Area",
    "State Water System Type",
    "Primary Water Source Type",
    "SABL_PWSID",
    "Lat",
    "Long",
    "WATER_SY_1",
    "BOUNDARY_T",
    "COUNTY",
]

# Select the columns and take an explicit copy: later cells assign new columns on
# dfout, and assigning into a bare selection of df triggers pandas'
# SettingWithCopyWarning / chained-assignment behaviour.
dfout = df[columnsList].copy()
print(len(dfout))
dfout.head(1)

In [None]:
# Renaming columns for dataframe simplicity / reduce errors.

# Old column name -> new column name.
renameMap = {
    "Water.System.Name": "Water System Name",
    "Water.System.Classification": "Water System Classification",
    "Days.In.Month": "Days In Month",
    "Total. RESIDENTIAL Delivered.Gallons (Total Does not include Landscape Irrigation, Agricultural or to other PWS)": "in_Amount",
    "WATER_SY_1": "in_SiteTypeCV",
}
dfout = dfout.rename(columns=renameMap)
print(dfout.shape[0])
dfout.head(n=1)

In [None]:
# Inspect the amount column as loaded, before any datatype cleanup.
amountColumn = dfout['in_Amount']
print(amountColumn)

In [None]:
# Fixing Water Amount datatype
# Issue of some entries are strings with a "," or as "FALSE".

def _cleanAmount(value):
    """Remove "," separators, "FALSE" markers and padding from a text amount.

    Non-string values (entries that are already numeric, or NaN) are returned
    unchanged.
    """
    if isinstance(value, str):
        return value.replace(',', '').replace('FALSE', '').strip()
    return value

# Clean element-wise rather than through the .str accessor: .str.strip() yields NaN
# for every non-string entry of a mixed-type column (dropping valid numbers) and
# raises when the column is already fully numeric.
# Cleaned empty strings become NaN in pd.to_numeric.
dfout['in_Amount'] = pd.to_numeric(dfout['in_Amount'].map(_cleanAmount))
dfout.head(3)

In [None]:
# Fixing Population Data Type
# Missing values are replaced with 0 first because NaN cannot be cast to int.
# No second fillna is needed: an integer column cannot hold NaN.
dfout['Population Of Service Area'] = dfout['Population Of Service Area'].fillna(0).astype(int)
dfout.head(1)

In [None]:
# Create WaDE TimeframeStart

# Full English month name -> two-digit month number.
MonthDictionary = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12"}


def createTimeframeStart(Year, Month):
    """Build the first-of-month date string "YYYY-MM-01" for one record.

    Args:
        Year: year value; converted with str() and stripped of surrounding
            whitespace. A whole-number float such as 2013.0 is written "2013".
        Month: full English month name (a key of MonthDictionary); surrounding
            whitespace is ignored.

    Returns:
        The date string, or '' when Month is not a key of MonthDictionary.
    """
    # A whole-number float would otherwise render as e.g. "2013.0".
    if isinstance(Year, float) and Year.is_integer():
        Year = int(Year)
    # The lookup is the only step that can fail, so it is the only guarded step.
    try:
        monthString = MonthDictionary[str(Month).strip()]
    except KeyError:
        return ''
    return str(Year).strip() + "-" + monthString + "-01"

def _timeframeStartForRow(row):
    """Return the TimeframeStart string built from one row's Year and Month."""
    return createTimeframeStart(row['Year'], row['Month'])

# Row-wise apply: one start-date string per record.
dfout['in_TimeframeStart'] = dfout.apply(_timeframeStartForRow, axis=1)
dfout.head(n=1)

In [None]:
# Create WaDE TimeframeEnd

# Full English month name -> two-digit month number.
MonthDictionary = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12"}


def createTimeframeEnd(Year, Month, Day):
    """Build the end-of-period date string "YYYY-MM-D" for one record.

    Args:
        Year: year value; converted with str() and stripped of surrounding
            whitespace. A whole-number float such as 2013.0 is written "2013".
        Month: full English month name (a key of MonthDictionary); surrounding
            whitespace is ignored.
        Day: day-of-month value; converted with str() and stripped. A
            whole-number float such as 31.0 is written "31".

    Returns:
        The date string, or '' when Month is not a key of MonthDictionary.
    """
    # Whole-number floats would otherwise render as e.g. "2013.0" / "31.0".
    if isinstance(Year, float) and Year.is_integer():
        Year = int(Year)
    if isinstance(Day, float) and Day.is_integer():
        Day = int(Day)
    # The lookup is the only step that can fail, so it is the only guarded step.
    try:
        monthString = MonthDictionary[str(Month).strip()]
    except KeyError:
        return ''
    return str(Year).strip() + "-" + monthString + "-" + str(Day).strip()

def _timeframeEndForRow(row):
    """Return the TimeframeEnd string built from one row's Year, Month and Days In Month."""
    return createTimeframeEnd(row['Year'], row['Month'], row['Days In Month'])

# Row-wise apply: one end-date string per record.
dfout['in_TimeframeEnd'] = dfout.apply(_timeframeEndForRow, axis=1)
dfout.head(n=1)

In [None]:
#Update datatype of the TimeframeStart / TimeframeEnd columns to datetime to fit WaDE 2.0 structure
# One to_datetime call per column is enough: formatting the parsed dates back to
# '%m/%d/%Y' text and parsing them again yields the same datetime values.
dfout['in_TimeframeStart'] = pd.to_datetime(dfout['in_TimeframeStart'])
dfout['in_TimeframeEnd'] = pd.to_datetime(dfout['in_TimeframeEnd'])

dfout.head(3)

## Export Outputs

In [None]:
# Print the dtype of every column, lifting the row/column display limits only
# for the duration of this print.
showEverything = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*showEverything):
    print(dfout.dtypes)

In [None]:
# Exporting to finished file: write dfout to CSV in the current working directory,
# without the index column.
dfout.to_csv(path_or_buf='P_caSSMaster.csv', index=False)  # The output