# Working with CA Aggregated Data

Pre-processing input data for a smoother upload experience of the state data to the WaDE 2.0 database.
Using geopandas to read in shp file, and coverting to WKT for ReportingUnit geometry.

In [1]:
#Needed Libararies
import os
import numpy as np
import pandas as pd
from datetime import datetime
import geopandas as gpd # the library that lets us read in shapefiles
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

#Setting work directory, reading inputs, creating dataframe
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/California/AggregatedAmounts/RawInputData"
os.chdir(workingDir)

# Water Balance Data

In [2]:
#CSV input file
fileInput_2011 = "CA-DWR-WaterBalance-Level2-DP-1000-2011-DAUCO_input.csv"
fileInput_2012 = "CA-DWR-WaterBalance-Level2-DP-1000-2012-DAUCO_input.csv"
fileInput_2013 = "CA-DWR-WaterBalance-Level2-DP-1000-2013-DAUCO_input.csv"
fileInput_2014 = "CA-DWR-WaterBalance-Level2-DP-1000-2014-DAUCO_input.csv"
fileInput_2015 = "CA-DWR-WaterBalance-Level2-DP-1000-2015-DAUCO_input.csv"

In [3]:
# check input 2011
df2011 = pd.read_csv(fileInput_2011)
df2011.head(1)

Unnamed: 0,CategoryA,CategoryC,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,Year,CategoryD,CategoryB
0,Agriculture,Applied Water,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,89.2,2011,AG1,1


In [4]:
# check input 2012
df2012 = pd.read_csv(fileInput_2012)
df2012.head(1)

Unnamed: 0,CategoryA,CategoryC,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,Year,CategoryD,CategoryB
0,Agriculture,Applied Water,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,92.7,2012,AG1,1


In [5]:
# check input 2013
df2013 = pd.read_csv(fileInput_2013)
df2013.head(1)

Unnamed: 0,CategoryA,CategoryC,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,Year,CategoryD,CategoryB
0,Agriculture,Applied Water,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,112.2,2013,AG1,1


In [6]:
# check input 2014
df2014 = pd.read_csv(fileInput_2014)
df2014.head(1)

Unnamed: 0,CategoryA,CategoryC,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,Year,CategoryD,CategoryB
0,Agriculture,Applied Water,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,100.5,2014,AG1,1


In [7]:
# check input 2015
df2015 = pd.read_csv(fileInput_2015)
df2015.head(1)

Unnamed: 0,CategoryA,CategoryC,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,Year,CategoryD,CategoryB
0,Agriculture,Applied Water,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,100.5,2015,AG1,1


In [8]:
# Concatenate DAUCO datafraes together.
frames = [df2011, df2012, df2013, df2014, df2015]
dfAll = pd.concat(frames)
dfAll

Unnamed: 0,CategoryA,CategoryC,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,Year,CategoryD,CategoryB
0,Agriculture,Applied Water,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,89.2,2011,AG1,1
1,Agriculture,Applied Water - Groundwater Recharge,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,0.0,2011,AG2,2
2,Agriculture,Conveyance Deep Percolation,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,0.0,2011,AG22,22
3,Agriculture,Conveyance Deep Percolation to Mexico,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,0.0,2011,AG18F,18f
4,Agriculture,Conveyance Deep Percolation to Nevada,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,0.0,2011,AG18E,18e
...,...,...,...,...,...,...,...,...,...,...,...,...,...
202324,Water Supplies,Total Return Flow and Reuse (TRFR),DAU15831,American River,5,Sacramento River,508,-120.610120,39.108586,207.3,2015,SPL24,Computed
202325,Water Supplies,Total Supply and Retuse (TSR),DAU15831,American River,5,Sacramento River,508,-120.610120,39.108586,214.4,2015,SPL25,Computed
202326,Water Supplies,Total Reuse of Deep Percolation (TRDP),DAU15831,American River,5,Sacramento River,508,-120.610120,39.108586,1.4,2015,SPL26,Computed
202327,Water Supplies,Total Net Supply (TNS),DAU15831,American River,5,Sacramento River,508,-120.610120,39.108586,5.7,2015,SPL27,Computed


### Seperate Water Balance out by Reporting Unit Type, Sum by Year.

In [9]:
# Create Individual Reporting Unit dataframes - PA
# Group by Reporting Unit type and Sum the KAcreFt field.

dfPAtemp = dfAll.groupby(['PA', 'Year', 'CategoryA'])['KAcreFt'].sum().reset_index()

columnsList = ['inReportingUnitTypeCV', 'inReportingUnitName', 'inReportingUnitNativeID', 'inYear', 'inBenUse', 'inAmount']
dfPA = pd.DataFrame(columns=columnsList)
dfPA['inReportingUnitName'] = dfPAtemp['PA']
dfPA['inReportingUnitNativeID'] = dfPAtemp['PA']
dfPA['inYear'] = dfPAtemp['Year']
dfPA['inBenUse'] = dfPAtemp['CategoryA']
dfPA['inAmount'] = dfPAtemp['KAcreFt'] * 1000
dfPA.inReportingUnitTypeCV = "Planning Area"
dfPA

Unnamed: 0,inReportingUnitTypeCV,inReportingUnitName,inReportingUnitNativeID,inYear,inBenUse,inAmount
0,Planning Area,101,101,2011,Agriculture,3389600.0
1,Planning Area,101,101,2011,Instream Flow Requirements,0.0
2,Planning Area,101,101,2011,Managed Wetlands,1399800.0
3,Planning Area,101,101,2011,Required Delta Outflow,0.0
4,Planning Area,101,101,2011,Urban,59100.0
...,...,...,...,...,...,...
1955,Planning Area,1006,1006,2015,Managed Wetlands,356000.0
1956,Planning Area,1006,1006,2015,Required Delta Outflow,0.0
1957,Planning Area,1006,1006,2015,Urban,442400.0
1958,Planning Area,1006,1006,2015,Water Supplies,11397600.0


In [10]:
# Create Individual Reporting Unit dataframes - HR
# Group by Reporting Unit type and Sum the KAcreFt field.

dfHRtemp = dfAll.groupby(['HR_NAME', 'HR_CODE', 'Year', 'CategoryA'])['KAcreFt'].sum().reset_index()

columnsList = ['inReportingUnitTypeCV', 'inReportingUnitName', 'inReportingUnitNativeID', 'inYear', 'inBenUse', 'inAmount']
dfHR = pd.DataFrame(columns=columnsList)
dfHR['inReportingUnitName'] = dfHRtemp['HR_NAME']
dfHR['inReportingUnitNativeID'] = dfHRtemp['HR_CODE']
dfHR['inYear'] = dfHRtemp['Year']
dfHR['inBenUse'] = dfHRtemp['CategoryA']
dfHR['inAmount'] = dfHRtemp['KAcreFt'] * 1000
dfHR.inReportingUnitTypeCV = "Hydrologic Region"
dfHR

Unnamed: 0,inReportingUnitTypeCV,inReportingUnitName,inReportingUnitNativeID,inYear,inBenUse,inAmount
0,Hydrologic Region,Central Coast,3,2011,Agriculture,7045800.0
1,Hydrologic Region,Central Coast,3,2011,Instream Flow Requirements,127000.0
2,Hydrologic Region,Central Coast,3,2011,Managed Wetlands,3200.0
3,Hydrologic Region,Central Coast,3,2011,Required Delta Outflow,0.0
4,Hydrologic Region,Central Coast,3,2011,Urban,1686260.0
...,...,...,...,...,...,...
345,Hydrologic Region,Tulare Lake,7,2015,Managed Wetlands,631800.0
346,Hydrologic Region,Tulare Lake,7,2015,Required Delta Outflow,0.0
347,Hydrologic Region,Tulare Lake,7,2015,Urban,2819000.0
348,Hydrologic Region,Tulare Lake,7,2015,Water Supplies,49526840.0


In [11]:
# Create Individual Reporting Unit dataframes - DAUCO
# Group by Reporting Unit type and Sum the KAcreFt field.

dfDAUCOtemp = dfAll.groupby(['DAU', 'DAU_NAME', 'Year', 'CategoryA'])['KAcreFt'].sum().reset_index()

columnsList = ['inReportingUnitTypeCV', 'inReportingUnitName', 'inReportingUnitNativeID', 'inYear', 'inBenUse', 'inAmount']
dfDAUCO = pd.DataFrame(columns=columnsList)
dfDAUCO['inReportingUnitName'] = dfDAUCOtemp['DAU_NAME']
dfDAUCO['inReportingUnitNativeID'] = dfDAUCOtemp['DAU']
dfDAUCO['inYear'] = dfDAUCOtemp['Year']
dfDAUCO['inBenUse'] = dfDAUCOtemp['CategoryA']
dfDAUCO['inAmount'] = dfDAUCOtemp['KAcreFt'] * 1000
dfDAUCO.inReportingUnitTypeCV = "Detailed Analysis Units by County"
dfDAUCO

Unnamed: 0,inReportingUnitTypeCV,inReportingUnitName,inReportingUnitNativeID,inYear,inBenUse,inAmount
0,Detailed Analysis Units by County,Lost River,DAU00125,2011,Agriculture,773500.0
1,Detailed Analysis Units by County,Lost River,DAU00125,2011,Instream Flow Requirements,0.0
2,Detailed Analysis Units by County,Lost River,DAU00125,2011,Managed Wetlands,637000.0
3,Detailed Analysis Units by County,Lost River,DAU00125,2011,Required Delta Outflow,0.0
4,Detailed Analysis Units by County,Lost River,DAU00125,2011,Urban,1300.0
...,...,...,...,...,...,...
16963,Detailed Analysis Units by County,Dry Creek,DAU40549,2015,Managed Wetlands,0.0
16964,Detailed Analysis Units by County,Dry Creek,DAU40549,2015,Required Delta Outflow,0.0
16965,Detailed Analysis Units by County,Dry Creek,DAU40549,2015,Urban,11000.0
16966,Detailed Analysis Units by County,Dry Creek,DAU40549,2015,Water Supplies,45500.0


In [12]:
# Concatenate grouped-by reporting unit type datafraes together.
frames = [dfPA, dfHR, dfDAUCO]
dfAllbyType = pd.concat(frames)
dfAllbyType

Unnamed: 0,inReportingUnitTypeCV,inReportingUnitName,inReportingUnitNativeID,inYear,inBenUse,inAmount
0,Planning Area,101,101,2011,Agriculture,3389600.0
1,Planning Area,101,101,2011,Instream Flow Requirements,0.0
2,Planning Area,101,101,2011,Managed Wetlands,1399800.0
3,Planning Area,101,101,2011,Required Delta Outflow,0.0
4,Planning Area,101,101,2011,Urban,59100.0
...,...,...,...,...,...,...
16963,Detailed Analysis Units by County,Dry Creek,DAU40549,2015,Managed Wetlands,0.0
16964,Detailed Analysis Units by County,Dry Creek,DAU40549,2015,Required Delta Outflow,0.0
16965,Detailed Analysis Units by County,Dry Creek,DAU40549,2015,Urban,11000.0
16966,Detailed Analysis Units by County,Dry Creek,DAU40549,2015,Water Supplies,45500.0


# Shapefile Data

In [13]:
# Shapefile input
WaterPlanAreaShape = gpd.read_file('C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/California/AggregatedAmounts/RawInputData/Water_Plan_Planning_Areas-shp/Water_Plan_Planning_Areas.shp')
HydrologicRegionsShape = gpd.read_file('C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/California/AggregatedAmounts/RawInputData/Hydrologic_Regions-shp/Hydrologic_Regions.shp')
DAUCOShape = gpd.read_file('C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/California/AggregatedAmounts/RawInputData/DAUCO-shp/WaDECADAU.shp')

In [14]:
#check shp input Water Plan Area
dfWPAshapetemp = pd.DataFrame(WaterPlanAreaShape)

columnsList = ['RU_ID', 'geometry']
dfWPAshape = pd.DataFrame(columns=columnsList)
dfWPAshape['RU_ID'] = dfWPAshapetemp['PA_NO']
dfWPAshape['geometry'] = dfWPAshapetemp['geometry']
dfWPAshape = dfWPAshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
dfWPAshape.head(3)

Unnamed: 0,RU_ID,geometry
0,1001,"POLYGON ((-115.16056 35.35811, -115.16038 35.3..."
1,1002,"POLYGON ((-116.62283 34.16694, -116.62253 34.1..."
2,1003,"POLYGON ((-114.80515 34.22629, -114.80435 34.2..."


In [15]:
#check shp input Hydrologic Region
dfHRshapetemp = pd.DataFrame(HydrologicRegionsShape)

HydrologicRegionIDdict = {
"North Coast" : "1",
"San Francisco Bay" : "2",
"Central Coast" : "3",
"South Coast" : "4",
"Sacramento River" : "5",
"San Joaquin River" : "6",
"Tulare Lake" : "7",
"North Lahontan" : "8",
"South Lahontan" : "9",
"Colorado River" : "10"}
    
def retrieveHRID(colrowValue):
    if colrowValue == '' or pd.isnull(colrowValue):
        outList = ''
    else:
        String1 = colrowValue.strip()
        try:
            outList = HydrologicRegionIDdict[String1]
        except:
            outList = ''
    return outList

columnsList = ['RU_ID', 'geometry']
dfHRshape = pd.DataFrame(columns=columnsList)
dfHRshape['RU_ID'] = dfHRshapetemp.apply(lambda row: retrieveHRID(row['HR_NAME']), axis=1)
dfHRshape['geometry'] = dfHRshapetemp['geometry']
dfHRshape = dfHRshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
dfHRshape

Unnamed: 0,RU_ID,geometry
0,3,"POLYGON ((-122.11808 37.25528, -122.11795 37.2..."
1,10,"POLYGON ((-115.12583 35.39706, -115.09804 35.3..."
2,1,"POLYGON ((-122.30410 42.00836, -122.28218 42.0..."
3,8,"POLYGON ((-119.99946 41.99466, -119.99940 41.9..."
4,5,"POLYGON ((-120.20882 41.99296, -120.20892 41.9..."
5,2,"POLYGON ((-122.60736 38.65174, -122.60694 38.6..."
6,6,"POLYGON ((-120.53749 38.75047, -120.53636 38.7..."
7,4,"POLYGON ((-119.10918 34.82375, -119.10905 34.8..."
8,9,"POLYGON ((-118.88460 38.22193, -118.87599 38.2..."
9,7,"POLYGON ((-118.89596 37.20829, -118.89539 37.2..."


In [16]:
#check shp input DAUCO
dfDAUCOshapetemp = pd.DataFrame(DAUCOShape)

columnsList = ['RU_ID', 'geometry']
dfDAUCOshape = pd.DataFrame(columns=columnsList)
dfDAUCOshape['RU_ID'] = dfDAUCOshapetemp['RU_ID']
dfDAUCOshape['geometry'] = dfDAUCOshapetemp['geometry']
dfDAUCOshape = dfDAUCOshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
dfDAUCOshape.head(3)

Unnamed: 0,RU_ID,geometry
0,DAU00125,"POLYGON ((-121.08710 41.99514, -120.70108 41.9..."
1,DAU00147,"POLYGON ((-121.88226 42.00329, -121.44784 41.9..."
2,DAU00247,"POLYGON ((-122.02221 42.00440, -121.94694 42.0..."


In [17]:
# Concatenate shp datafraes together.
frames = [dfWPAshape, dfHRshape, dfDAUCOshape]
dfAllShape = pd.concat(frames).reset_index()
dfAllShape

Unnamed: 0,index,RU_ID,geometry
0,0,1001,"POLYGON ((-115.16056 35.35811, -115.16038 35.3..."
1,1,1002,"POLYGON ((-116.62283 34.16694, -116.62253 34.1..."
2,2,1003,"POLYGON ((-114.80515 34.22629, -114.80435 34.2..."
3,3,1004,"POLYGON ((-115.12583 35.39706, -115.09804 35.3..."
4,4,1005,"POLYGON ((-116.56630 33.54937, -116.56550 33.5..."
...,...,...,...
584,515,DAU40417,"POLYGON ((-122.82745 38.85906, -122.83830 38.8..."
585,516,DAU40423,"POLYGON ((-122.85634 38.86348, -122.84992 38.8..."
586,517,DAU40449,"POLYGON ((-122.82249 38.85118, -122.81670 38.8..."
587,518,DAU40523,"POLYGON ((-123.10947 38.87033, -123.11125 38.8..."


### Inspect Output Data & Export

In [18]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfAllbyType.dtypes)

inReportingUnitTypeCV       object
inReportingUnitName         object
inReportingUnitNativeID     object
inYear                       int64
inBenUse                    object
inAmount                   float64
dtype: object


In [19]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfAllShape.dtypes)

index          int64
RU_ID         object
geometry    geometry
dtype: object


In [20]:
# Export out to CSV.
dfAllbyType.to_csv('P_caAggMaster.csv', index=False) # The output.
dfAllShape.to_csv('P_caGeometry.csv', index=False) # The output geometry.