# Pre-processing Aggregated Water Use data for WaDE upload.
Purpose:  To pre-process the aggregated water use data into one master file for simple DataFrame creation and extraction

In [5]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
# Raise the column display limit to 999 so wide DataFrames are shown without column truncation.
pd.set_option('display.max_columns', 999)
# Render floats with five fixed decimal places instead of scientific notation.
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [6]:
# ---- working directory ----
# Set the working directory folder string here; every relative path below
# ("RawInputData/...") is resolved against it.
workingDirString = "G:/Shared drives/WaDE Data/WaDE Data Folder/California/WaterUse_AggregatedArea"
os.chdir(workingDirString)
print('The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/WaDE Data Folder/California/WaterUse_AggregatedArea


## Input Source Data

In [7]:
# Input File - CA-DWR-WaterBalance-Level2-DP-1000-2002-2016-DAUCO
inputFile = "RawInputData/CA-DWR-WaterBalance-Level2-DP-1000-2002-2016-DAUCO.zip"
df_dau = pd.read_csv(inputFile, encoding="ISO-8859-1")
df_dau = df_dau.replace(np.nan, "")  # missing values become empty strings

# WaDE UUID tracker for data assessment.
# The first time the file is read it has no WaDEUUID column: tag each row with
# "dau" + its row index and write the tagged table back over the input zip.
if 'WaDEUUID' not in df_dau.columns:
    rowLabels = df_dau.index.astype(str)
    df_dau['WaDEUUID'] = "dau" + rowLabels
    zipOptions = dict(method='zip', archive_name='DAU.csv')
    df_dau.to_csv(inputFile, compression=zipOptions, index=False)

print(len(df_dau))
df_dau.head()

3014000


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB,WaDEUUID
0,Agriculture,Applied Water,2002,DAU04827,Pressure,3,Central Coast,301,-121.63771,36.63942,105.5,AG1,1,dau0
1,Agriculture,Applied Water - Groundwater Recharge,2002,DAU04827,Pressure,3,Central Coast,301,-121.63771,36.63942,0.0,AG2,2,dau1
2,Agriculture,Conveyance Deep Percolation,2002,DAU04827,Pressure,3,Central Coast,301,-121.63771,36.63942,0.0,AG22,22,dau2
3,Agriculture,Conveyance Deep Percolation to Mexico,2002,DAU04827,Pressure,3,Central Coast,301,-121.63771,36.63942,0.0,AG18F,18f,dau3
4,Agriculture,Conveyance Deep Percolation to Nevada,2002,DAU04827,Pressure,3,Central Coast,301,-121.63771,36.63942,0.0,AG18E,18e,dau4


In [8]:
# Input File - CA-DWR-WaterBalance-Level2-DP-1000-2002-2016-HR
inputFile = "RawInputData/CA-DWR-WaterBalance-Level2-DP-1000-2002-2016-HR.zip"
df_hr = pd.read_csv(inputFile, encoding="ISO-8859-1")
df_hr = df_hr.replace(np.nan, "")  # missing values become empty strings

# WaDE UUID tracker for data assessment.
# The first time the file is read it has no WaDEUUID column: tag each row with
# "hr" + its row index and write the tagged table back over the input zip.
if 'WaDEUUID' not in df_hr.columns:
    rowLabels = df_hr.index.astype(str)
    df_hr['WaDEUUID'] = "hr" + rowLabels
    zipOptions = dict(method='zip', archive_name='HR.csv')
    df_hr.to_csv(inputFile, compression=zipOptions, index=False)

print(len(df_hr))
df_hr.head()

42610


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt,WaDEUUID
0,2002,Applied Water,AG1,Central Coast,Agriculture,1175.6,hr0
1,2002,Return Flow to Salt Sink,AG10A,Central Coast,Agriculture,83.7,hr1
2,2002,Return Flow for Delta Outflow,AG10B,Central Coast,Agriculture,0.0,hr2
3,2002,Return Flow to Developed Supply (Other DAUCO w...,AG11A,Central Coast,Agriculture,0.0,hr3
4,2002,Return Flow to Developed Supply (Other PA),AG11B,Central Coast,Agriculture,0.0,hr4


In [9]:
# Input File - CA-DWR-WaterBalance-Level2-DP-1000-2002-2016-PA
inputFile = "RawInputData/CA-DWR-WaterBalance-Level2-DP-1000-2002-2016-PA.zip"
df_pa = pd.read_csv(inputFile, encoding="ISO-8859-1")
df_pa = df_pa.replace(np.nan, "")  # missing values become empty strings

# WaDE UUID tracker for data assessment.
# The first time the file is read it has no WaDEUUID column: tag each row with
# "pa" + its row index and write the tagged table back over the input zip.
if 'WaDEUUID' not in df_pa.columns:
    rowLabels = df_pa.index.astype(str)
    df_pa['WaDEUUID'] = "pa" + rowLabels
    zipOptions = dict(method='zip', archive_name='PA.csv')
    df_pa.to_csv(inputFile, compression=zipOptions, index=False)

print(len(df_pa))
df_pa.head()

238616


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt,WaDEUUID
0,2002,Applied Water,AG1,101,Agriculture,664.4,pa0
1,2002,Return Flow to Salt Sink,AG10A,101,Agriculture,9.3,pa1
2,2002,Return Flow for Delta Outflow,AG10B,101,Agriculture,0.0,pa2
3,2002,Return Flow to Developed Supply (Other DAUCO w...,AG11A,101,Agriculture,2.6,pa3
4,2002,Return Flow to Developed Supply (Other PA),AG11B,101,Agriculture,0.0,pa4


In [12]:
# Input File: WaDECADAU.shp
fileInput = "RawInputData/shapefiles/WaDECADAU.zip"
gdf_dau = gpd.read_file(fileInput)
gdf_dau = gdf_dau.replace(np.nan, "")  # missing attribute values become empty strings
print(len(gdf_dau))
gdf_dau.head(1)

520


Unnamed: 0,HR_Code,PA_Num,Shape_Leng,Shape_Le_1,RU_ID,Shape_Le_2,Shape_Area,geometry
0,1.0,101.0,2.96297,2.96297,DAU00125,2.96297,0.33027,"POLYGON ((-121.08710 41.99514, -120.70108 41.9..."


In [13]:
# Input File: Hydrologic_Regions.shp
fileInput = "RawInputData/shapefiles/Hydrologic_Regions.zip"
gdf_hr = gpd.read_file(fileInput)
gdf_hr = gdf_hr.replace(np.nan, "")  # missing attribute values become empty strings
print(len(gdf_hr))
gdf_hr.head(1)

10


Unnamed: 0,OBJECTID,HR_NAME,Shape_Leng,Shape_Area,geometry
0,13,Central Coast,15.90003,2.91428,"POLYGON ((-122.11808 37.25528, -122.11795 37.2..."


In [14]:
# Input File: Water_Plan_Planning_Areas.shp
fileInput = "RawInputData/shapefiles/Water_Plan_Planning_Areas.zip"
gdf_pa = gpd.read_file(fileInput)
gdf_pa = gdf_pa.replace(np.nan, "")  # missing attribute values become empty strings
print(len(gdf_pa))
gdf_pa.head(1)

59


Unnamed: 0,OBJECTID,PA_NO,Shape_Leng,Shape_Area,geometry
0,12,1001,9.38796,1.70894,"POLYGON ((-115.16056 35.35811, -115.16038 35.3..."


## Clean Data
- We only want the Applied Water Use and Depletion values.

In [18]:
# DAU
# Keep only the two CategoryC variables of interest, then order the rows.
wantedVariables = ['Applied Water Use', 'Depletion']
keepRows = df_dau['CategoryC'].isin(wantedVariables)
df_dau = df_dau.loc[keepRows].reset_index(drop=True)

sortColumns = ['Year', 'CategoryC', 'DAU', 'CategoryA', 'KAcreFt']
df_dau = df_dau.sort_values(by=sortColumns)
print(len(df_dau))
print(df_dau['CategoryC'].unique())
df_dau.head(1)

86832
['Applied Water Use' 'Depletion']


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB,WaDEUUID
132,Agriculture,Applied Water Use,2002,DAU00125,Lost River,1,North Coast,101,-121.0601,41.79382,132.3,AWUAG,Computed,dau2004


In [19]:
# HR
# Keep only the two CategoryC variables of interest, then order the rows.
wantedVariables = ['Applied Water Use', 'Depletion']
keepRows = df_hr['CategoryC'].isin(wantedVariables)
df_hr = df_hr.loc[keepRows].reset_index(drop=True)

sortColumns = ['Year', 'CategoryC', 'HR', 'CategoryA', 'KAcreFt']
df_hr = df_hr.sort_values(by=sortColumns)
print(len(df_hr))
print(df_hr['CategoryC'].unique())
df_hr.head(1)

1800
['Applied Water Use' 'Depletion']


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt,WaDEUUID
0,2002,Applied Water Use,AWUAG,Central Coast,Agriculture,1175.6,hr36


In [20]:
# PA
# Keep only the two CategoryC variables of interest, then order the rows.
wantedVariables = ['Applied Water Use', 'Depletion']
keepRows = df_pa['CategoryC'].isin(wantedVariables)
df_pa = df_pa.loc[keepRows].reset_index(drop=True)

sortColumns = ['Year', 'CategoryC', 'PA', 'CategoryA', 'KAcreFt']
df_pa = df_pa.sort_values(by=sortColumns)
print(len(df_pa))
print(df_pa['CategoryC'].unique())
df_pa.head(1)

10080
['Applied Water Use' 'Depletion']


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt,WaDEUUID
0,2002,Applied Water Use,AWUAG,101,Agriculture,664.4,pa36


## WaDE Data

In [22]:
# DAU
# Transfer input data to WaDE specific output

# Output column -> value, in the order the columns are created.
# Series values come from df_dau; scalar values are repeated on every row.
wadeColumns = {
    # Data Assessment UUID
    'WaDEUUID': df_dau['WaDEUUID'],

    # Method Info
    'in_MethodUUID': "CAwuag_M1",

    # Variable Info
    'in_VariableSpecificUUID': "",  # determine below
    'in_AggregationIntervalUnitCV': "Annual",
    'in_VariableCV': df_dau['CategoryC'],
    'in_VariableSpecificCV': "",  # determine below

    # Organization Info
    'in_OrganizationUUID': "CAwuag_O1",

    # Water Source
    'in_Geometry': "",
    'in_GNISFeatureNameCV': "",
    'in_WaterQualityIndicatorCV': "",
    'in_WaterSourceName': "",
    'in_WaterSourceNativeID': "",
    'in_WaterSourceTypeCV': "",

    # ReportingUnits Info
    'in_EPSGCodeCV': 4326,
    'in_ReportingUnitName': df_dau['DAU_NAME'],
    'in_ReportingUnitNativeID': df_dau['DAU'],
    'in_ReportingUnitProductVersion': "",
    'in_ReportingUnitTypeCV': "Detailed Analysis Units by County",
    'in_ReportingUnitUpdateDate': "",
    'in_StateCV': "CA",

    # AggregatedAmounts Info
    'in_AllocationCropDutyAmount': "",
    'in_Amount': df_dau['KAcreFt'],
    'in_BeneficialUseCategory': df_dau['CategoryA'],
    'in_CommunityWaterSupplySystem': "",
    'in_CropTypeCV': "",
    'in_CustomerTypeCV': "",
    'in_DataPublicationDate': "",
    'in_DataPublicationDOI': "",
    'in_InterbasinTransferFromID': "",
    'in_InterbasinTransferToID': "",
    'in_IrrigatedAcreage': "",
    'in_IrrigationMethodCV': "",
    'in_PopulationServed': "",
    'in_PowerGeneratedGWh': "",
    'in_PowerType': "",
    'in_PrimaryUseCategoryCV': "",
    'in_ReportYearCV': df_dau['Year'].astype(int),
    'in_SDWISIdentifierCV': "",
    'in_TimeframeEnd': df_dau['Year'].astype(str) + "/12/31",
    'in_TimeframeStart': df_dau['Year'].astype(str) + "/01/01",
}

# create dataframe: the first column (a Series) gives the frame its rows,
# so the scalar columns that follow are filled in for every row.
df = pd.DataFrame()
for columnName, columnValue in wadeColumns.items():
    df[columnName] = columnValue

outdf_dau = df.drop_duplicates().reset_index(drop=True)
print(len(outdf_dau))
outdf_dau.head(5)

86832


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_VariableSpecificCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_EPSGCodeCV,in_ReportingUnitName,in_ReportingUnitNativeID,in_ReportingUnitProductVersion,in_ReportingUnitTypeCV,in_ReportingUnitUpdateDate,in_StateCV,in_AllocationCropDutyAmount,in_Amount,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_InterbasinTransferFromID,in_InterbasinTransferToID,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategoryCV,in_ReportYearCV,in_SDWISIdentifierCV,in_TimeframeEnd,in_TimeframeStart
0,dau2004,CAwuag_M1,,Annual,Applied Water Use,,CAwuag_O1,,,,,,,4326,Lost River,DAU00125,,Detailed Analysis Units by County,,CA,,132.3,Agriculture,,,,,,,,,,,,,,2002,,2002/12/31,2002/01/01
1,dau2028,CAwuag_M1,,Annual,Applied Water Use,,CAwuag_O1,,,,,,,4326,Lost River,DAU00125,,Detailed Analysis Units by County,,CA,,0.0,Instream Flow Requirements,,,,,,,,,,,,,,2002,,2002/12/31,2002/01/01
2,dau2020,CAwuag_M1,,Annual,Applied Water Use,,CAwuag_O1,,,,,,,4326,Lost River,DAU00125,,Detailed Analysis Units by County,,CA,,111.4,Managed Wetlands,,,,,,,,,,,,,,2002,,2002/12/31,2002/01/01
3,dau2036,CAwuag_M1,,Annual,Applied Water Use,,CAwuag_O1,,,,,,,4326,Lost River,DAU00125,,Detailed Analysis Units by County,,CA,,0.0,Required Delta Outflow,,,,,,,,,,,,,,2002,,2002/12/31,2002/01/01
4,dau2012,CAwuag_M1,,Annual,Applied Water Use,,CAwuag_O1,,,,,,,4326,Lost River,DAU00125,,Detailed Analysis Units by County,,CA,,0.3,Urban,,,,,,,,,,,,,,2002,,2002/12/31,2002/01/01


In [24]:
# HR
# Transfer input data to WaDE specific output

# Output column -> value, in the order the columns are created.
# Series values come from df_hr; scalar values are repeated on every row.
wadeColumns = {
    # Data Assessment UUID
    'WaDEUUID': df_hr['WaDEUUID'],

    # Method Info
    'in_MethodUUID': "CAwuag_M1",

    # Variable Info
    'in_VariableSpecificUUID': "",  # determine below
    'in_AggregationIntervalUnitCV': "Annual",
    'in_VariableCV': df_hr['CategoryC'],
    'in_VariableSpecificCV': "",  # determine below

    # Organization Info
    'in_OrganizationUUID': "CAwuag_O1",

    # Water Source
    'in_Geometry': "",
    'in_GNISFeatureNameCV': "",
    'in_WaterQualityIndicatorCV': "",
    'in_WaterSourceName': "",
    'in_WaterSourceNativeID': "",
    'in_WaterSourceTypeCV': "",

    # ReportingUnits Info
    'in_EPSGCodeCV': 4326,
    'in_ReportingUnitName': df_hr['HR'],
    'in_ReportingUnitNativeID': "",
    'in_ReportingUnitProductVersion': "",
    'in_ReportingUnitTypeCV': "Hydrologic Region",
    'in_ReportingUnitUpdateDate': "",
    'in_StateCV': "CA",

    # AggregatedAmounts Info
    'in_AllocationCropDutyAmount': "",
    'in_Amount': df_hr['KAcreFt'],
    'in_BeneficialUseCategory': df_hr['CategoryA'],
    'in_CommunityWaterSupplySystem': "",
    'in_CropTypeCV': "",
    'in_CustomerTypeCV': "",
    'in_DataPublicationDate': "",
    'in_DataPublicationDOI': "",
    'in_InterbasinTransferFromID': "",
    'in_InterbasinTransferToID': "",
    'in_IrrigatedAcreage': "",
    'in_IrrigationMethodCV': "",
    'in_PopulationServed': "",
    'in_PowerGeneratedGWh': "",
    'in_PowerType': "",
    'in_PrimaryUseCategoryCV': "",
    'in_ReportYearCV': df_hr['Year'].astype(int),
    'in_SDWISIdentifierCV': "",
    'in_TimeframeEnd': df_hr['Year'].astype(str) + "/12/31",
    'in_TimeframeStart': df_hr['Year'].astype(str) + "/01/01",
}

# create dataframe: the first column (a Series) gives the frame its rows,
# so the scalar columns that follow are filled in for every row.
df = pd.DataFrame()
for columnName, columnValue in wadeColumns.items():
    df[columnName] = columnValue

outdf_hr = df.drop_duplicates().reset_index(drop=True)
print(len(outdf_hr))
outdf_hr.head(5)

1800


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_VariableSpecificCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_EPSGCodeCV,in_ReportingUnitName,in_ReportingUnitNativeID,in_ReportingUnitProductVersion,in_ReportingUnitTypeCV,in_ReportingUnitUpdateDate,in_StateCV,in_AllocationCropDutyAmount,in_Amount,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_InterbasinTransferFromID,in_InterbasinTransferToID,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategoryCV,in_ReportYearCV,in_SDWISIdentifierCV,in_TimeframeEnd,in_TimeframeStart
0,hr36,CAwuag_M1,,Annual,Applied Water Use,,CAwuag_O1,,,,,,,4326,Central Coast,,,Hydrologic Region,,CA,,1175.6,Agriculture,,,,,,,,,,,,,,2002,,2002/12/31,2002/01/01
1,hr44,CAwuag_M1,,Annual,Applied Water Use,,CAwuag_O1,,,,,,,4326,Central Coast,,,Hydrologic Region,,CA,,10.7,Instream Flow Requirements,,,,,,,,,,,,,,2002,,2002/12/31,2002/01/01
2,hr56,CAwuag_M1,,Annual,Applied Water Use,,CAwuag_O1,,,,,,,4326,Central Coast,,,Hydrologic Region,,CA,,0.5,Managed Wetlands,,,,,,,,,,,,,,2002,,2002/12/31,2002/01/01
3,hr98,CAwuag_M1,,Annual,Applied Water Use,,CAwuag_O1,,,,,,,4326,Central Coast,,,Hydrologic Region,,CA,,0.0,Required Delta Outflow,,,,,,,,,,,,,,2002,,2002/12/31,2002/01/01
4,hr104,CAwuag_M1,,Annual,Applied Water Use,,CAwuag_O1,,,,,,,4326,Central Coast,,,Hydrologic Region,,CA,,291.8,Urban,,,,,,,,,,,,,,2002,,2002/12/31,2002/01/01


In [25]:
# PA
# Transfer input data to WaDE specific output

# Output column -> value, in the order the columns are created.
# Series values come from df_pa; scalar values are repeated on every row.
wadeColumns = {
    # Data Assessment UUID
    'WaDEUUID': df_pa['WaDEUUID'],

    # Method Info
    'in_MethodUUID': "CAwuag_M1",

    # Variable Info
    'in_VariableSpecificUUID': "",  # determine below
    'in_AggregationIntervalUnitCV': "Annual",
    'in_VariableCV': df_pa['CategoryC'],
    'in_VariableSpecificCV': "",  # determine below

    # Organization Info
    'in_OrganizationUUID': "CAwuag_O1",

    # Water Source
    'in_Geometry': "",
    'in_GNISFeatureNameCV': "",
    'in_WaterQualityIndicatorCV': "",
    'in_WaterSourceName': "",
    'in_WaterSourceNativeID': "",
    'in_WaterSourceTypeCV': "",

    # ReportingUnits Info
    'in_EPSGCodeCV': 4326,
    'in_ReportingUnitName': df_pa['PA'],
    'in_ReportingUnitNativeID': df_pa['PA'],
    'in_ReportingUnitProductVersion': "",
    'in_ReportingUnitTypeCV': "Planning Area",
    'in_ReportingUnitUpdateDate': "",
    'in_StateCV': "CA",

    # AggregatedAmounts Info
    'in_AllocationCropDutyAmount': "",
    'in_Amount': df_pa['KAcreFt'],
    'in_BeneficialUseCategory': df_pa['CategoryA'],
    'in_CommunityWaterSupplySystem': "",
    'in_CropTypeCV': "",
    'in_CustomerTypeCV': "",
    'in_DataPublicationDate': "",
    'in_DataPublicationDOI': "",
    'in_InterbasinTransferFromID': "",
    'in_InterbasinTransferToID': "",
    'in_IrrigatedAcreage': "",
    'in_IrrigationMethodCV': "",
    'in_PopulationServed': "",
    'in_PowerGeneratedGWh': "",
    'in_PowerType': "",
    'in_PrimaryUseCategoryCV': "",
    'in_ReportYearCV': df_pa['Year'].astype(int),
    'in_SDWISIdentifierCV': "",
    'in_TimeframeEnd': df_pa['Year'].astype(str) + "/12/31",
    'in_TimeframeStart': df_pa['Year'].astype(str) + "/01/01",
}

# create dataframe: the first column (a Series) gives the frame its rows,
# so the scalar columns that follow are filled in for every row.
df = pd.DataFrame()
for columnName, columnValue in wadeColumns.items():
    df[columnName] = columnValue

outdf_pa = df.drop_duplicates().reset_index(drop=True)
print(len(outdf_pa))
outdf_pa.head(5)

10080


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_VariableSpecificCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_EPSGCodeCV,in_ReportingUnitName,in_ReportingUnitNativeID,in_ReportingUnitProductVersion,in_ReportingUnitTypeCV,in_ReportingUnitUpdateDate,in_StateCV,in_AllocationCropDutyAmount,in_Amount,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_InterbasinTransferFromID,in_InterbasinTransferToID,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategoryCV,in_ReportYearCV,in_SDWISIdentifierCV,in_TimeframeEnd,in_TimeframeStart
0,pa36,CAwuag_M1,,Annual,Applied Water Use,,CAwuag_O1,,,,,,,4326,101,101,,Planning Area,,CA,,664.4,Agriculture,,,,,,,,,,,,,,2002,,2002/12/31,2002/01/01
1,pa44,CAwuag_M1,,Annual,Applied Water Use,,CAwuag_O1,,,,,,,4326,101,101,,Planning Area,,CA,,0.0,Instream Flow Requirements,,,,,,,,,,,,,,2002,,2002/12/31,2002/01/01
2,pa56,CAwuag_M1,,Annual,Applied Water Use,,CAwuag_O1,,,,,,,4326,101,101,,Planning Area,,CA,,343.3,Managed Wetlands,,,,,,,,,,,,,,2002,,2002/12/31,2002/01/01
3,pa98,CAwuag_M1,,Annual,Applied Water Use,,CAwuag_O1,,,,,,,4326,101,101,,Planning Area,,CA,,0.0,Required Delta Outflow,,,,,,,,,,,,,,2002,,2002/12/31,2002/01/01
4,pa104,CAwuag_M1,,Annual,Applied Water Use,,CAwuag_O1,,,,,,,4326,101,101,,Planning Area,,CA,,11.4,Urban,,,,,,,,,,,,,,2002,,2002/12/31,2002/01/01


## Concatenate Data (if needed).

In [27]:
# Concatenate dataframes

frames = [outdf_dau, outdf_hr, outdf_pa]  # list all out dataframes here

# Stack the per-level frames, drop exact duplicate rows, renumber the rows,
# and turn any remaining NaN into an empty string.
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates()
outdf = outdf.reset_index(drop=True)
outdf = outdf.replace(np.nan, "")
print(len(outdf))

98712


## Clean Data / Data Types

In [10]:
# Clean name entries of special characters
def removeSpecialCharsFunc(Val):
    """Return Val as a cleaned, title-cased name string.

    Val is converted with str(), the characters $ @ & . ; / ) ( - are removed,
    the text is title-cased, runs of spaces are collapsed to a single space,
    surrounding whitespace is stripped and any trailing commas are dropped.
    """
    Val = str(Val)
    # Raw string: in a normal literal "\)" is an invalid escape sequence.
    Val = re.sub(r"[$@&.;/\)(-]", "", Val).title()
    # Removing characters can leave runs of 3+ spaces (e.g. "a - - b");
    # collapse every run, not just one pair of spaces.
    Val = re.sub(r" {2,}", " ", Val)
    return Val.strip().rstrip(',')

In [12]:
# Clean the water source names. Series.map calls the helper once per value;
# a row-wise DataFrame.apply(axis=1) builds a Series for every row just to
# read one field and returns a DataFrame when the frame is empty.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(removeSpecialCharsFunc)
outdf['in_WaterSourceName'].unique()

In [None]:
# Trim in_WaterSourceTypeCV and blank out "nan" placeholders.
# The logic is written inline because ensureEmptyString is only defined in a
# later cell; calling it here raises NameError on a fresh kernel (Run All).
def _blankIfMissing(val):
    """Return str(val) stripped of whitespace, or "" when that text is "nan"."""
    text = str(val).strip()
    return "" if text == "nan" else text

outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].map(_blankIfMissing)
outdf['in_WaterSourceTypeCV'].unique()

In [11]:
# Clean the reporting unit names. Series.map calls the helper once per value;
# a row-wise DataFrame.apply(axis=1) builds a Series for every row just to
# read one field and returns a DataFrame when the frame is empty.
outdf['in_ReportingUnitName'] = outdf['in_ReportingUnitName'].map(removeSpecialCharsFunc)
outdf['in_ReportingUnitName'].unique()

In [None]:
# Clean the reporting unit type labels. Series.map calls the helper once per
# value; a row-wise DataFrame.apply(axis=1) builds a Series for every row just
# to read one field and returns a DataFrame when the frame is empty.
outdf['in_ReportingUnitTypeCV'] = outdf['in_ReportingUnitTypeCV'].map(removeSpecialCharsFunc)
outdf['in_ReportingUnitTypeCV'].unique()

In [13]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return val as stripped text, or "" when val is missing.

    Missing means: None / NaN / NA (any scalar pd.isnull accepts), text that is
    empty or only whitespace, or the literal text "nan".
    """
    # The null test has to run on the raw value: after str() a missing value is
    # ordinary text ("None", "<NA>") and pd.isnull can no longer detect it.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return ""
    val = str(val).strip()
    # Whitespace-only input is already "" after strip(); only "nan" needs mapping.
    return "" if val == "nan" else val

In [14]:
# Blank out missing water source names. Series.map calls the helper once per
# value instead of building a row Series per record with apply(axis=1).
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(ensureEmptyString)
outdf['in_WaterSourceName'].unique()

In [15]:
# Blank out missing water source types. Series.map calls the helper once per
# value instead of building a row Series per record with apply(axis=1).
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].map(ensureEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

In [16]:
# Blank out missing reporting unit names. Series.map calls the helper once per
# value instead of building a row Series per record with apply(axis=1).
outdf['in_ReportingUnitName'] = outdf['in_ReportingUnitName'].map(ensureEmptyString)
outdf['in_ReportingUnitName'].unique()

In [17]:
# Blank out missing reporting unit types. Series.map calls the helper once per
# value instead of building a row Series per record with apply(axis=1).
outdf['in_ReportingUnitTypeCV'] = outdf['in_ReportingUnitTypeCV'].map(ensureEmptyString)
outdf['in_ReportingUnitTypeCV'].unique()

In [18]:
# Blank out missing beneficial use entries (one helper call per value).
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(ensureEmptyString)

# List the distinct categories for review. Each entry is split on commas so a
# cell holding several categories contributes each one separately.
uniqueList = sorted({
    part.strip()
    for entry in outdf['in_BeneficialUseCategory'].astype(str)
    for part in entry.split(',')
})
uniqueList

In [None]:
# Convert TimeframeEnd to YYYY-MM-DD format.
# Parse once (unparseable entries become NaT), drop the UTC marker and the time
# of day. No fillna("") here: a datetime column cannot hold "" and, if pandas
# falls back to object dtype, the .dt accessor stops working.
timeframeEnd = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors='coerce')
outdf['in_TimeframeEnd'] = timeframeEnd.dt.tz_localize(None).dt.normalize()
outdf['in_TimeframeEnd'].unique()

In [None]:
# Convert TimeframeStart to YYYY-MM-DD format.
# Parse once (unparseable entries become NaT), drop the UTC marker and the time
# of day. No fillna("") here: a datetime column cannot hold "" and, if pandas
# falls back to object dtype, the .dt accessor stops working.
timeframeStart = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors='coerce')
outdf['in_TimeframeStart'] = timeframeStart.dt.tz_localize(None).dt.normalize()
outdf['in_TimeframeStart'].unique()

In [None]:
# extract year out
# in_ReportYearCV already holds integer years (it is built with
# Year.astype(int)). pd.to_datetime reads plain integers as nanoseconds since
# 1970-01-01, so routing them through it would turn every year into 1970.
# Coerce to a number instead; anything unparseable becomes 0, as before.
reportYear = pd.to_numeric(outdf['in_ReportYearCV'], errors='coerce')
outdf['in_ReportYearCV'] = reportYear.fillna(0).astype(int)
outdf['in_ReportYearCV'].unique()

In [None]:
# Ensure Amount entry is either numeric or blank, no 0 entries
amountNumeric = pd.to_numeric(outdf['in_Amount'], errors='coerce')  # non-numeric -> NaN
amountRounded = amountNumeric.round(2)
amountNoZero = amountRounded.replace(0, "")  # zero amounts become blank
outdf['in_Amount'] = amountNoZero.fillna("")  # NaN becomes blank
outdf['in_Amount'].unique()

In [22]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    """Join the four inputs into one underscore-separated label.

    Each input is converted with str() and stripped of surrounding whitespace;
    inPU is additionally title-cased. The pieces are joined in the order
    inV, inAIU, inPU, inWST.
    """
    pieces = [
        str(inV).strip(),
        str(inAIU).strip(),
        str(inPU).strip().title(),
        str(inWST).strip(),
    ]
    return "_".join(pieces)

def _variableSpecificForRow(row):
    """Build the VariableSpecificCV label from one row of outdf."""
    return createVariableSpecificCV(
        row['in_VariableCV'],
        row['in_AggregationIntervalUnitCV'],
        row['in_BeneficialUseCategory'],
        row['in_WaterSourceTypeCV'],
    )

outdf['in_VariableSpecificCV'] = outdf.apply(_variableSpecificForRow, axis=1)
outdf['in_VariableSpecificCV'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    """Return the text form of colRowValue prefixed with "wadeId"."""
    return "wadeId" + str(colRowValue)

# Unique (water source name, water source type) pairs, compared as stripped text.
sourceNames = outdf['in_WaterSourceName'].astype(str).str.strip()
sourceTypes = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = pd.DataFrame({
    'in_WaterSourceName': sourceNames,
    'in_WaterSourceTypeCV': sourceTypes,
})
dfTempID = dfTempID.drop_duplicates()

# Number the unique pairs 1..N and turn each number into a "wadeId<N>" value.
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount["Count"].map(assignIdValueFunc)

# Lookup key is name and type concatenated with no separator.
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = dict(zip(dfTempID['linkKey'].astype(str), dfTempID['in_WaterSourceNativeID'].values))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    """Return the existing ID when there is one, otherwise look one up.

    checkVal is converted to stripped text; if it is not empty it is returned
    unchanged. Otherwise the stripped text of valA and valB is concatenated and
    used as the key into the module-level IdDict (KeyError if the key is absent).
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    return IdDict[str(valA).strip() + str(valB).strip()]

def _waterSourceIdForRow(row):
    """Resolve the water source native ID for one row of outdf."""
    return retrieveIdValueFunc(
        row['in_WaterSourceNativeID'],
        row['in_WaterSourceName'],
        row['in_WaterSourceTypeCV'],
    )

outdf['in_WaterSourceNativeID'] = outdf.apply(_waterSourceIdForRow, axis=1)
outdf['in_WaterSourceNativeID'].unique()

## Shapefile Data
- For attaching geometry to csv inputs.

In [23]:
# Shapefile Data (can just be copy of above input)

# Path of the zipped shapefile to attach geometry from.
# TODO: replace the {enter file name here} placeholder with a real file name.
# Keep this a plain path string: gpd.read_file must be called once, on the path,
# not on the GeoDataFrame returned by an earlier gpd.read_file call.
shapefileInput = 'RawInputData/shapefiles/{enter file name here}.zip'
dfshapetemp = gpd.read_file(shapefileInput)
print(len(dfshapetemp))
dfshapetemp.head()

In [24]:
columnsList = ['in_ReportingUnitNativeID', 'geometry']
outshape = pd.DataFrame(columns=columnsList)

# Assign geometry first so the frame has its rows. A scalar assigned while the
# frame is still empty fills nothing, and the column turns into NaN as soon as
# geometry brings in the index.
outshape['geometry'] = dfshapetemp['geometry']
outshape['in_ReportingUnitNativeID'] = ""  # should match above 'in_ReportingUnitNativeID' if file not copied

# Keep the first occurrence of each distinct row.
outshape = outshape.drop_duplicates()
outshape.head(20)

# Export Data

In [25]:
# Print the column names, dtypes and non-null counts of the final output frame before export.
outdf.info()

In [26]:
# Display the final output frame for a last visual check before export.
outdf

In [27]:
# Export the output dataframe
# Each frame is written as a single CSV inside its own zip archive, without the row index.
mainZipOptions = dict(method='zip', archive_name='Pag_Main.csv')
outdf.to_csv('RawInputData/Pag_Main.zip', compression=mainZipOptions, index=False)  # The output, save as a zip

geometryZipOptions = dict(method='zip', archive_name='P_Geometry.csv')
outshape.to_csv('RawInputData/P_Geometry.zip', compression=geometryZipOptions, index=False)  # The output geometry.