# Working with CA Aggregated Data

Pre-processing input data for a smoother upload experience of the state data to the WaDE 2.0 database.
Using geopandas to read in the shapefiles, and converting the geometry to WKT for the ReportingUnit geometry.

#### Notes:
- Three ReportingUnitTypeCV values: Planning Area (PA), Hydrologic Region (HR), and Detailed Analysis Units by County (DAU)

In [1]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [2]:
#Setting work directory, reading inputs, creating dataframe
workingDir = "G:/Shared drives/WaDE Data/California/AggregatedAmounts/RawInputData"
os.chdir(workingDir)

## Input Data

### hydrologic region (HR_CODE)

In [3]:
HR_2002 = "HR_input/CA-DWR-WaterBalance-Level2-DP-1000-2002-HR.csv"
dfhr2002 = pd.read_csv(HR_2002)
print(len(dfhr2002))
dfhr2002.head(1)

2850


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt
0,2002,Applied Water,AG1,Central Coast,Agriculture,1175.6


In [4]:
HR_2003 = "HR_input/CA-DWR-WaterBalance-Level2-DP-1000-2003-HR.csv"
dfhr2003 = pd.read_csv(HR_2003)
print(len(dfhr2003))
dfhr2003.head(1)

2850


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt
0,2003,Applied Water,AG1,Central Coast,Agriculture,956.9


In [5]:
HR_2004 = "HR_input/CA-DWR-WaterBalance-Level2-DP-1000-2004-HR.csv"
dfhr2004 = pd.read_csv(HR_2004)
print(len(dfhr2004))
dfhr2004.head(1)

2830


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt
0,2004,Applied Water,AG1,Central Coast,Agriculture,1208.5


In [6]:
HR_2005 = "HR_input/CA-DWR-WaterBalance-Level2-DP-1000-2005-HR.csv"
dfhr2005 = pd.read_csv(HR_2005)
print(len(dfhr2005))
dfhr2005.head(1)

2830


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt
0,2005,Applied Water,AG1,Central Coast,Agriculture,897.4


In [7]:
HR_2006 = "HR_input/CA-DWR-WaterBalance-Level2-DP-1000-2006-HR.csv"
dfhr2006 = pd.read_csv(HR_2006)
print(len(dfhr2006))
dfhr2006.head(1)

2830


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt
0,2006,Applied Water,AG1,Central Coast,Agriculture,789.0


In [8]:
HR_2007 = "HR_input/CA-DWR-WaterBalance-Level2-DP-1000-2007-HR.csv"
dfhr2007 = pd.read_csv(HR_2007)
print(len(dfhr2007))
dfhr2007.head(1)

2830


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt
0,2007,Applied Water,AG1,Central Coast,Agriculture,1200.2


In [9]:
HR_2008 = "HR_input/CA-DWR-WaterBalance-Level2-DP-1000-2008-HR.csv"
dfhr2008 = pd.read_csv(HR_2008)
print(len(dfhr2008))
dfhr2008.head(1)

2830


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt
0,2008,Applied Water,AG1,Central Coast,Agriculture,1101.6


In [10]:
HR_2009 = "HR_input/CA-DWR-WaterBalance-Level2-DP-1000-2009-HR.csv"
dfhr2009 = pd.read_csv(HR_2009)
print(len(dfhr2009))
dfhr2009.head(1)

2830


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt
0,2009,Applied Water,AG1,Central Coast,Agriculture,1034.4


In [11]:
HR_2010 = "HR_input/CA-DWR-WaterBalance-Level2-DP-1000-2010-HR.csv"
dfhr2010 = pd.read_csv(HR_2010)
print(len(dfhr2010))
dfhr2010.head(1)

2830


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt
0,2010,Applied Water,AG1,Central Coast,Agriculture,859.5


In [12]:
HR_2011 = "HR_input/CA-DWR-WaterBalance-Level2-DP-1000-2011-HR.csv"
dfhr2011 = pd.read_csv(HR_2011)
print(len(dfhr2011))
dfhr2011.head(1)

2850


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt
0,2011,Applied Water,AG1,Central Coast,Agriculture,893.8


In [13]:
HR_2012 = "HR_input/CA-DWR-WaterBalance-Level2-DP-1000-2012-HR.csv"
dfhr2012 = pd.read_csv(HR_2012)
print(len(dfhr2012))
dfhr2012.head(1)

2850


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt
0,2012,Applied Water,AG1,Central Coast,Agriculture,991.6


In [14]:
HR_2013 = "HR_input/CA-DWR-WaterBalance-Level2-DP-1000-2013-HR.csv"
dfhr2013 = pd.read_csv(HR_2013)
print(len(dfhr2013))
dfhr2013.head(1)

2850


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt
0,2013,Applied Water,AG1,Central Coast,Agriculture,1246.7


In [15]:
HR_2014 = "HR_input/CA-DWR-WaterBalance-Level2-DP-1000-2014-HR.csv"
dfhr2014 = pd.read_csv(HR_2014)
print(len(dfhr2014))
dfhr2014.head(1)

2850


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt
0,2014,Applied Water,AG1,Central Coast,Agriculture,1256.4


In [16]:
HR_2015 = "HR_input/CA-DWR-WaterBalance-Level2-DP-1000-2015-HR.csv"
dfhr2015 = pd.read_csv(HR_2015)
print(len(dfhr2015))

2850


In [17]:
HR_2016 = "HR_input/CA-DWR-WaterBalance-Level2-DP-1000-2016-HR.csv"
dfhr2016 = pd.read_csv(HR_2016)
print(len(dfhr2016))
dfhr2016.head(1)

2850


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt
0,2016,Applied Water,AG1,Central Coast,Agriculture,1028.9


In [18]:
# Stack the yearly HR inputs (2002-2016) into one dataframe with a fresh 0..n-1 index.
frames = [
    dfhr2002, dfhr2003, dfhr2004, dfhr2005, dfhr2006,
    dfhr2007, dfhr2008, dfhr2009, dfhr2010, dfhr2011,
    dfhr2012, dfhr2013, dfhr2014, dfhr2015, dfhr2016,
]
dfHR = pd.concat(frames, ignore_index=True)
print(len(dfHR))
dfHR['Year'].unique()

42610


array([2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016], dtype=int64)

### Planning Area (PA)

In [19]:
PA_2002 = "PA_input/CA-DWR-WaterBalance-Level2-DP-1000-2002-PA.csv"
dfpa2002 = pd.read_csv(PA_2002)
print(len(dfpa2002))
dfpa2002.head(1)

15960


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt
0,2002,Applied Water,AG1,101,Agriculture,664.4


In [20]:
PA_2003 = "PA_input/CA-DWR-WaterBalance-Level2-DP-1000-2003-PA.csv"
dfpa2003 = pd.read_csv(PA_2003)
print(len(dfpa2003))
dfpa2003.head(1)

15960


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt
0,2003,Applied Water,AG1,101,Agriculture,542.0


In [21]:
PA_2004 = "PA_input/CA-DWR-WaterBalance-Level2-DP-1000-2004-PA.csv"
dfpa2004 = pd.read_csv(PA_2004)
print(len(dfpa2004))
dfpa2004.head(1)

15848


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt
0,2004,Applied Water,AG1,101,Agriculture,562.9


In [22]:
PA_2005 = "PA_input/CA-DWR-WaterBalance-Level2-DP-1000-2005-PA.csv"
dfpa2005 = pd.read_csv(PA_2005)
print(len(dfpa2005))
dfpa2005.head(1)

15848


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt
0,2005,Applied Water,AG1,101,Agriculture,464.2


In [23]:
PA_2006 = "PA_input/CA-DWR-WaterBalance-Level2-DP-1000-2006-PA.csv"
dfpa2006 = pd.read_csv(PA_2006)
print(len(dfpa2006))
dfpa2006.head(1)

15848


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt
0,2006,Applied Water,AG1,101,Agriculture,595.1


In [24]:
PA_2007 = "PA_input/CA-DWR-WaterBalance-Level2-DP-1000-2007-PA.csv"
dfpa2007 = pd.read_csv(PA_2007)
print(len(dfpa2007))
dfpa2007.head(1)

15848


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt
0,2007,Applied Water,AG1,101,Agriculture,617.5


In [25]:
PA_2008 = "PA_input/CA-DWR-WaterBalance-Level2-DP-1000-2008-PA.csv"
dfpa2008 = pd.read_csv(PA_2008)
print(len(dfpa2008))
dfpa2008.head(1)

15848


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt
0,2008,Applied Water,AG1,101,Agriculture,581.4


In [26]:
PA_2009 = "PA_input/CA-DWR-WaterBalance-Level2-DP-1000-2009-PA.csv"
dfpa2009 = pd.read_csv(PA_2009)
print(len(dfpa2009))
dfpa2009.head(1)

15848


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt
0,2009,Applied Water,AG1,101,Agriculture,550.8


In [27]:
PA_2010 = "PA_input/CA-DWR-WaterBalance-Level2-DP-1000-2010-PA.csv"
dfpa2010 = pd.read_csv(PA_2010)
print(len(dfpa2010))
dfpa2010.head(1)

15848


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt
0,2010,Applied Water,AG1,101,Agriculture,616.7


In [28]:
PA_2011 = "PA_input/CA-DWR-WaterBalance-Level2-DP-1000-2011-PA.csv"
dfpa2011 = pd.read_csv(PA_2011)
print(len(dfpa2011))
dfpa2011.head(1)

15960


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt
0,2011,Applied Water,AG1,101,Agriculture,347.2


In [29]:
PA_2012 = "PA_input/CA-DWR-WaterBalance-Level2-DP-1000-2012-PA.csv"
dfpa2012 = pd.read_csv(PA_2012)
print(len(dfpa2012))
dfpa2012.head(1)

15960


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt
0,2012,Applied Water,AG1,101,Agriculture,452.3


In [30]:
PA_2013 = "PA_input/CA-DWR-WaterBalance-Level2-DP-1000-2013-PA.csv"
dfpa2013 = pd.read_csv(PA_2013)
print(len(dfpa2013))
dfpa2013.head(1)

15960


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt
0,2013,Applied Water,AG1,101,Agriculture,432.5


In [31]:
PA_2014 = "PA_input/CA-DWR-WaterBalance-Level2-DP-1000-2014-PA.csv"
dfpa2014 = pd.read_csv(PA_2014)
print(len(dfpa2014))
dfpa2014.head(1)

15960


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt
0,2014,Applied Water,AG1,101,Agriculture,421.0


In [32]:
PA_2015 = "PA_input/CA-DWR-WaterBalance-Level2-DP-1000-2015-PA.csv"
dfpa2015 = pd.read_csv(PA_2015)
print(len(dfpa2015))
dfpa2015.head(1)

15960


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt
0,2015,Applied Water,AG1,101,Agriculture,392.2


In [33]:
PA_2016 = "PA_input/CA-DWR-WaterBalance-Level2-DP-1000-2016-PA.csv"
dfpa2016 = pd.read_csv(PA_2016)
print(len(dfpa2016))
dfpa2016.head(1)

15960


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt
0,2016,Applied Water,AG1,101,Agriculture,455.5


In [34]:
# Stack the yearly PA inputs (2002-2016) into one dataframe with a fresh 0..n-1 index.
frames = [
    dfpa2002, dfpa2003, dfpa2004, dfpa2005, dfpa2006,
    dfpa2007, dfpa2008, dfpa2009, dfpa2010, dfpa2011,
    dfpa2012, dfpa2013, dfpa2014, dfpa2015, dfpa2016,
]
dfPA = pd.concat(frames, ignore_index=True)
print(len(dfPA))
dfPA['Year'].unique()

238616


array([2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016], dtype=int64)

### Detailed Analysis Units by County (DAU)

In [35]:
DAU_2002 = "DAU_input/CA-DWR-WaterBalance-Level2-DP-1000-2002-DAUCO.csv"
dfdau2002 = pd.read_csv(DAU_2002)
print(len(dfdau2002))
dfdau2002.head(1)

135945


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB
0,Agriculture,Applied Water,2002,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,105.5,AG1,1


In [36]:
DAU_2003 = "DAU_input/CA-DWR-WaterBalance-Level2-DP-1000-2003-DAUCO.csv"
dfdau2003 = pd.read_csv(DAU_2003)
print(len(dfdau2003))
dfdau2003.head(1)

135945


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB
0,Agriculture,Applied Water,2003,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,104.4,AG1,1


In [37]:
DAU_2004 = "DAU_input/CA-DWR-WaterBalance-Level2-DP-1000-2004-DAUCO.csv"
dfdau2004 = pd.read_csv(DAU_2004)
print(len(dfdau2004))
dfdau2004.head(1)

135557


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB
0,Agriculture,Applied Water,2004,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,129.4,AG1,1


In [38]:
DAU_2005 = "DAU_input/CA-DWR-WaterBalance-Level2-DP-1000-2005-DAUCO.csv"
dfdau2005 = pd.read_csv(DAU_2005)
print(len(dfdau2005))
dfdau2005.head(1)

135557


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB
0,Agriculture,Applied Water,2005,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,104.3,AG1,1


In [39]:
DAU_2006 = "DAU_input/CA-DWR-WaterBalance-Level2-DP-1000-2006-DAUCO.csv"
dfdau2006 = pd.read_csv(DAU_2006)
print(len(dfdau2006))
dfdau2006.head(1)

136406


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB
0,Agriculture,Applied Water,2006,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,77.1,AG1,1


In [40]:
DAU_2007 = "DAU_input/CA-DWR-WaterBalance-Level2-DP-1000-2007-DAUCO.csv"
dfdau2007 = pd.read_csv(DAU_2007)
print(len(dfdau2007))
dfdau2007.head(1)

136406


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB
0,Agriculture,Applied Water,2007,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,124.5,AG1,1


In [41]:
DAU_2008 = "DAU_input/CA-DWR-WaterBalance-Level2-DP-1000-2008-DAUCO.csv"
dfdau2008 = pd.read_csv(DAU_2008)
print(len(dfdau2008))
dfdau2008.head(1)

136689


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB
0,Agriculture,Applied Water,2008,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,128.4,AG1,1


In [42]:
DAU_2009 = "DAU_input/CA-DWR-WaterBalance-Level2-DP-1000-2009-DAUCO.csv"
dfdau2009 = pd.read_csv(DAU_2009)
print(len(dfdau2009))
dfdau2009.head(1)

136972


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB
0,Agriculture,Applied Water,2009,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,119.3,AG1,1


In [43]:
DAU_2010 = "DAU_input/CA-DWR-WaterBalance-Level2-DP-1000-2010-DAUCO.csv"
dfdau2010 = pd.read_csv(DAU_2010)
print(len(dfdau2010))
dfdau2010.head(1)

136972


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB
0,Agriculture,Applied Water,2010,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,89.3,AG1,1


In [44]:
DAU_2011 = "DAU_input/CA-DWR-WaterBalance-Level2-DP-1000-2011-DAUCO.csv"
dfdau2011 = pd.read_csv(DAU_2011)
print(len(dfdau2011))
dfdau2011.head(1)

321070


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB
0,Agriculture,Applied Water,2011,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,89.2,AG1,1


In [45]:
DAU_2012 = "DAU_input/CA-DWR-WaterBalance-Level2-DP-1000-2012-DAUCO.csv"
dfdau2012 = pd.read_csv(DAU_2012)
print(len(dfdau2012))
dfdau2012.head(1)

311696


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB
0,Agriculture,Applied Water,2012,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,92.7,AG1,1


In [46]:
DAU_2013 = "DAU_input/CA-DWR-WaterBalance-Level2-DP-1000-2013-DAUCO.csv"
dfdau2013 = pd.read_csv(DAU_2013)
print(len(dfdau2013))
dfdau2013.head(1)

304095


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB
0,Agriculture,Applied Water,2013,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,112.2,AG1,1


In [47]:
DAU_2014 = "DAU_input/CA-DWR-WaterBalance-Level2-DP-1000-2014-DAUCO.csv"
dfdau2014 = pd.read_csv(DAU_2014)
print(len(dfdau2014))
dfdau2014.head(1)

470935


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB
0,Agriculture,Applied Water,2014,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,100.5,AG1,1


In [48]:
DAU_2015 = "DAU_input/CA-DWR-WaterBalance-Level2-DP-1000-2015-DAUCO.csv"
dfdau2015 = pd.read_csv(DAU_2015)
print(len(dfdau2015))
dfdau2015.head(1)

235710


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB
0,Agriculture,Applied Water,2015,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,100.5,AG1,1


In [49]:
DAU_2016 = "DAU_input/CA-DWR-WaterBalance-Level2-DP-1000-2016-DAUCO.csv"
dfdau2016 = pd.read_csv(DAU_2016)
print(len(dfdau2016))
dfdau2016.head(1)

144045


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB
0,Agriculture,Applied Water,2016,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.63942,0.0,AG1,1


In [50]:
# Stack the yearly DAU inputs (2002-2016) into one dataframe with a fresh 0..n-1 index.
frames = [
    dfdau2002, dfdau2003, dfdau2004, dfdau2005, dfdau2006,
    dfdau2007, dfdau2008, dfdau2009, dfdau2010, dfdau2011,
    dfdau2012, dfdau2013, dfdau2014, dfdau2015, dfdau2016,
]
dfDAU = pd.concat(frames, ignore_index=True)
print(len(dfDAU))
dfDAU['Year'].unique()

3014000


array([2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016], dtype=int64)

## Clean Data
- We only want the Applied Water Use and Depletion values.

In [51]:
# HR: keep only the 'Applied Water Use' and 'Depletion' rows, renumber them,
# then order them by year / variable / region / category / amount.
hrKeepMask = dfHR['CategoryC'].isin(['Applied Water Use', 'Depletion'])
hrSortKeys = ['Year', 'CategoryC', 'HR', 'CategoryA', 'KAcreFt']
dfHR_2 = dfHR[hrKeepMask].reset_index(drop=True).sort_values(by=hrSortKeys)
print(len(dfHR_2))
print(dfHR_2['CategoryC'].unique())
dfHR_2.head(1)

1800
['Applied Water Use' 'Depletion']


Unnamed: 0,Year,CategoryC,CategoryD,HR,CategoryA,KAcreFt
0,2002,Applied Water Use,AWUAG,Central Coast,Agriculture,1175.6


In [52]:
# PA: keep only the 'Applied Water Use' and 'Depletion' rows, renumber them,
# then order them by year / variable / planning area / category / amount.
paKeepMask = dfPA['CategoryC'].isin(['Applied Water Use', 'Depletion'])
paSortKeys = ['Year', 'CategoryC', 'PA', 'CategoryA', 'KAcreFt']
dfPA_2 = dfPA[paKeepMask].reset_index(drop=True).sort_values(by=paSortKeys)
print(len(dfPA_2))
print(dfPA_2['CategoryC'].unique())
dfPA_2.head(1)

10080
['Applied Water Use' 'Depletion']


Unnamed: 0,Year,CategoryC,CategoryD,PA,CategoryA,KAcreFt
0,2002,Applied Water Use,AWUAG,101,Agriculture,664.4


In [53]:
# DAU: keep only the 'Applied Water Use' and 'Depletion' rows, renumber them,
# then order them by year / variable / DAU id / category / amount.
dauKeepMask = dfDAU['CategoryC'].isin(['Applied Water Use', 'Depletion'])
dauSortKeys = ['Year', 'CategoryC', 'DAU', 'CategoryA', 'KAcreFt']
dfDAU_2 = dfDAU[dauKeepMask].reset_index(drop=True).sort_values(by=dauSortKeys)
print(len(dfDAU_2))
print(dfDAU_2['CategoryC'].unique())
dfDAU_2.head(1)

86832
['Applied Water Use' 'Depletion']


Unnamed: 0,CategoryA,CategoryC,Year,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,CategoryD,CategoryB
132,Agriculture,Applied Water Use,2002,DAU00125,Lost River,1,North Coast,101,-121.060095,41.793815,132.3,AWUAG,Computed


## Output Dataframes

In [54]:
# HR: map the cleaned hydrologic-region rows onto the "in_" output columns.
hrYearText = dfHR_2['Year'].astype(str)
dfHR_3 = pd.DataFrame(
    {
        # Variable Info
        'VariableCV': dfHR_2['CategoryC'],
        'in_VariableSpecificCV': dfHR_2['CategoryC'] + "_Annual_" + dfHR_2['CategoryA'] + "_Surface Ground Water",
        # ReportingUnits Info
        'in_ReportingUnitName': dfHR_2['HR'],
        'in_ReportingUnitNativeID': "",  # placeholder; filled from the HR code lookup in a later cell
        'in_ReportingUnitTypeCV': "Hydrologic Region",
        # AggregatedAmounts Info
        'in_Amount': dfHR_2['KAcreFt'],
        'in_BenUse': dfHR_2['CategoryA'],
        'in_ReportYearCV': dfHR_2['Year'].astype(int),
        'in_TimeframeStart': hrYearText + "/01/01",
        'in_TimeframeEnd': hrYearText + "/12/31",
    },
    index=dfHR_2.index,
)

print(len(dfHR_3))
dfHR_3.head(1)

1800


Unnamed: 0,VariableCV,in_VariableSpecificCV,in_ReportingUnitName,in_ReportingUnitNativeID,in_ReportingUnitTypeCV,in_Amount,in_BenUse,in_ReportYearCV,in_TimeframeStart,in_TimeframeEnd
0,Applied Water Use,Applied Water Use_Annual_Agriculture_Surface G...,Central Coast,,Hydrologic Region,1175.6,Agriculture,2002,2002/01/01,2002/12/31


In [55]:
# Build the HR_NAME -> HR_CODE lookup used to create ReportingUnitNativeID for HR data.
# The DAU input is the table that carries both the region name and its code.
dftempHR_CODE = (
    dfDAU_2[['HR_CODE', 'HR_NAME']]
    .drop_duplicates()
    .reset_index(drop=True)
)

HR_Code_dict = dftempHR_CODE.set_index('HR_NAME')['HR_CODE'].to_dict()

def retrieveReportingUnitNativeID(colrowValue):
    """Return the hydrologic region code for a reporting unit name.

    Parameters
    ----------
    colrowValue
        Reporting unit name to look up in the module-level ``HR_Code_dict``.

    Returns
    -------
    ``''`` when the name is an empty string or null, the code stored in
    ``HR_Code_dict`` when the name is a known key, and otherwise the input
    value unchanged.
    """
    if colrowValue == '' or pd.isnull(colrowValue):
        return ''
    try:
        return HR_Code_dict[colrowValue]
    except (KeyError, TypeError):
        # KeyError: name not in the lookup; TypeError: unhashable value.
        # Only these lookup failures fall back to the input; anything else
        # (e.g. a missing HR_Code_dict) now surfaces instead of being hidden.
        return colrowValue

# Replace the placeholder native IDs by looking up each HR reporting unit name.
hrNativeIDs = dfHR_3['in_ReportingUnitName'].apply(retrieveReportingUnitNativeID)
dfHR_3['in_ReportingUnitNativeID'] = hrNativeIDs
dfHR_3['in_ReportingUnitNativeID'].unique()

array([ 3, 10,  1,  8,  5,  2,  6,  4,  9,  7], dtype=int64)

In [56]:
# PA: map the cleaned planning-area rows onto the "in_" output columns.
paYearText = dfPA_2['Year'].astype(str)
dfPA_3 = pd.DataFrame(
    {
        # Variable Info
        'VariableCV': dfPA_2['CategoryC'],
        'in_VariableSpecificCV': dfPA_2['CategoryC'] + "_Annual_" + dfPA_2['CategoryA'] + "_Surface Ground Water",
        # ReportingUnits Info
        'in_ReportingUnitName': dfPA_2['PA'],
        'in_ReportingUnitNativeID': dfPA_2['PA'],  # native id = name in this unique situation
        'in_ReportingUnitTypeCV': "Planning Area",
        # AggregatedAmounts Info
        'in_Amount': dfPA_2['KAcreFt'],
        'in_BenUse': dfPA_2['CategoryA'],
        'in_ReportYearCV': dfPA_2['Year'].astype(int),
        'in_TimeframeStart': paYearText + "/01/01",
        'in_TimeframeEnd': paYearText + "/12/31",
    },
    index=dfPA_2.index,
)

print(len(dfPA_3))
dfPA_3.head(1)

10080


Unnamed: 0,VariableCV,in_VariableSpecificCV,in_ReportingUnitName,in_ReportingUnitNativeID,in_ReportingUnitTypeCV,in_Amount,in_BenUse,in_ReportYearCV,in_TimeframeStart,in_TimeframeEnd
0,Applied Water Use,Applied Water Use_Annual_Agriculture_Surface G...,101,101,Planning Area,664.4,Agriculture,2002,2002/01/01,2002/12/31


In [57]:
# DAU: map the cleaned DAU-by-county rows onto the "in_" output columns.
dauYearText = dfDAU_2['Year'].astype(str)
dfDAU_3 = pd.DataFrame(
    {
        # Variable Info
        'VariableCV': dfDAU_2['CategoryC'],
        'in_VariableSpecificCV': dfDAU_2['CategoryC'] + "_Annual_" + dfDAU_2['CategoryA'] + "_Surface Ground Water",
        # ReportingUnits Info
        'in_ReportingUnitName': dfDAU_2['DAU_NAME'],
        'in_ReportingUnitNativeID': dfDAU_2['DAU'],
        'in_ReportingUnitTypeCV': "Detailed Analysis Units by County",
        # AggregatedAmounts Info
        'in_Amount': dfDAU_2['KAcreFt'],
        'in_BenUse': dfDAU_2['CategoryA'],
        'in_ReportYearCV': dfDAU_2['Year'].astype(int),
        'in_TimeframeStart': dauYearText + "/01/01",
        'in_TimeframeEnd': dauYearText + "/12/31",
    },
    index=dfDAU_2.index,
)

print(len(dfDAU_3))
dfDAU_3.head(1)

86832


Unnamed: 0,VariableCV,in_VariableSpecificCV,in_ReportingUnitName,in_ReportingUnitNativeID,in_ReportingUnitTypeCV,in_Amount,in_BenUse,in_ReportYearCV,in_TimeframeStart,in_TimeframeEnd
132,Applied Water Use,Applied Water Use_Annual_Agriculture_Surface G...,Lost River,DAU00125,Detailed Analysis Units by County,132.3,Agriculture,2002,2002/01/01,2002/12/31


In [58]:
# Stack the HR, PA, & DAU dataframes into the single output dataframe (fresh 0..n-1 index).
frames = [dfHR_3, dfPA_3, dfDAU_3]
dfout = pd.concat(frames, ignore_index=True)
print(len(dfout))
dfout['in_ReportingUnitTypeCV'].unique()

98712


array(['Hydrologic Region', 'Planning Area',
       'Detailed Analysis Units by County'], dtype=object)

In [59]:
# Convert the timeframe strings to datetime values (shown as YYYY-MM-DD).
#
# The strings were built as "<Year>/01/01" and "<Year>/12/31", so the format is
# stated explicitly rather than inferred; values that do not match become NaT.
# A single parse is enough: formatting the parsed dates back to text and parsing
# them again produces the same datetime values.
for timeframeCol in ('in_TimeframeStart', 'in_TimeframeEnd'):
    dfout[timeframeCol] = pd.to_datetime(dfout[timeframeCol], format='%Y/%m/%d', errors='coerce')

dfout.head()

Unnamed: 0,VariableCV,in_VariableSpecificCV,in_ReportingUnitName,in_ReportingUnitNativeID,in_ReportingUnitTypeCV,in_Amount,in_BenUse,in_ReportYearCV,in_TimeframeStart,in_TimeframeEnd
0,Applied Water Use,Applied Water Use_Annual_Agriculture_Surface G...,Central Coast,3,Hydrologic Region,1175.6,Agriculture,2002,2002-01-01,2002-12-31
1,Applied Water Use,Applied Water Use_Annual_Instream Flow Require...,Central Coast,3,Hydrologic Region,10.7,Instream Flow Requirements,2002,2002-01-01,2002-12-31
2,Applied Water Use,Applied Water Use_Annual_Managed Wetlands_Surf...,Central Coast,3,Hydrologic Region,0.5,Managed Wetlands,2002,2002-01-01,2002-12-31
3,Applied Water Use,Applied Water Use_Annual_Required Delta Outflo...,Central Coast,3,Hydrologic Region,0.0,Required Delta Outflow,2002,2002-01-01,2002-12-31
4,Applied Water Use,Applied Water Use_Annual_Urban_Surface Ground ...,Central Coast,3,Hydrologic Region,291.8,Urban,2002,2002-01-01,2002-12-31


## WaDE Custom Elements (due to missing info)

# Shapefile Data

In [60]:
# Shapefile input
def _readShapefileAsWGS84(shpPath):
    """Read a shapefile and return it as a GeoDataFrame in EPSG:4326.

    `crs` is not a documented parameter of gpd.read_file (extra keyword
    arguments are forwarded to the I/O engine), so passing crs="EPSG:4326"
    there neither assigns nor reprojects a coordinate system. The CRS is
    therefore handled explicitly after reading.
    """
    shapeData = gpd.read_file(shpPath)
    if shapeData.crs is None:
        # No CRS stored with the file: label the coordinates as EPSG:4326
        # (nothing to transform from).
        return shapeData.set_crs("EPSG:4326")
    # CRS known: reproject (a no-op when the file is already EPSG:4326).
    return shapeData.to_crs("EPSG:4326")


HydrologicRegionsShape = _readShapefileAsWGS84('Hydrologic_Regions-shp/Hydrologic_Regions.shp')
WaterPlanAreaShape = _readShapefileAsWGS84('Water_Plan_Planning_Areas-shp/Water_Plan_Planning_Areas.shp')
DAUCOShape = _readShapefileAsWGS84('DAUCO-shp/WaDECADAU.shp')

In [61]:
#check shp input Hydrologic Region
dfHRshapetemp = pd.DataFrame(HydrologicRegionsShape)

# Hydrologic region name -> region number (stored as text).
HydrologicRegionIDdict = {
    "North Coast": "1",
    "San Francisco Bay": "2",
    "Central Coast": "3",
    "South Coast": "4",
    "Sacramento River": "5",
    "San Joaquin River": "6",
    "Tulare Lake": "7",
    "North Lahontan": "8",
    "South Lahontan": "9",
    "Colorado River": "10",
}


def retrieveHRID(colrowValue):
    """Return the region number (as text) for a hydrologic region name.

    Parameters
    ----------
    colrowValue
        Hydrologic region name; surrounding whitespace is ignored.

    Returns
    -------
    The matching value from ``HydrologicRegionIDdict``, or ``''`` when the
    name is empty, null, or not a key of the lookup.
    """
    if colrowValue == '' or pd.isnull(colrowValue):
        return ''
    # dict.get replaces the former bare ``except:`` around the lookup: an
    # unknown name still maps to '', but unrelated errors are no longer hidden.
    return HydrologicRegionIDdict.get(colrowValue.strip(), '')

# Build the RU_ID / geometry table for the hydrologic regions:
# RU_ID is the region number looked up from each HR_NAME.
columnsList = ['RU_ID', 'geometry']
dfHRshape = pd.DataFrame(
    {
        'RU_ID': dfHRshapetemp['HR_NAME'].apply(retrieveHRID),
        'geometry': dfHRshapetemp['geometry'],
    },
    columns=columnsList,
)
dfHRshape = dfHRshape.drop_duplicates()
dfHRshape

Unnamed: 0,RU_ID,geometry
0,3,"POLYGON ((-122.11808 37.25528, -122.11795 37.2..."
1,10,"POLYGON ((-115.12583 35.39706, -115.09804 35.3..."
2,1,"POLYGON ((-122.30410 42.00836, -122.28218 42.0..."
3,8,"POLYGON ((-119.99946 41.99466, -119.99940 41.9..."
4,5,"POLYGON ((-120.20882 41.99296, -120.20892 41.9..."
5,2,"POLYGON ((-122.60736 38.65174, -122.60694 38.6..."
6,6,"POLYGON ((-120.53749 38.75047, -120.53636 38.7..."
7,4,"POLYGON ((-119.10918 34.82375, -119.10905 34.8..."
8,9,"POLYGON ((-118.88460 38.22193, -118.87599 38.2..."
9,7,"POLYGON ((-118.89596 37.20829, -118.89539 37.2..."


In [62]:
# Check shp input: Water Plan Area
dfWPAshapetemp = pd.DataFrame(WaterPlanAreaShape)

# Keep the planning-area number (as RU_ID) with its geometry, dropping exact duplicate rows.
columnsList = ['RU_ID', 'geometry']
dfWPAshape = pd.DataFrame(
    {
        'RU_ID': dfWPAshapetemp['PA_NO'],
        'geometry': dfWPAshapetemp['geometry'],
    },
    columns=columnsList,
)
dfWPAshape = dfWPAshape.drop_duplicates()
dfWPAshape.head(3)

Unnamed: 0,RU_ID,geometry
0,1001,"POLYGON ((-115.16056 35.35811, -115.16038 35.3..."
1,1002,"POLYGON ((-116.62283 34.16694, -116.62253 34.1..."
2,1003,"POLYGON ((-114.80515 34.22629, -114.80435 34.2..."


In [63]:
# Check shp input: DAUCO
dfDAUCOshapetemp = pd.DataFrame(DAUCOShape)

# Keep the RU_ID with its geometry, dropping exact duplicate rows.
columnsList = ['RU_ID', 'geometry']
dfDAUCOshape = pd.DataFrame(
    {
        'RU_ID': dfDAUCOshapetemp['RU_ID'],
        'geometry': dfDAUCOshapetemp['geometry'],
    },
    columns=columnsList,
)
dfDAUCOshape = dfDAUCOshape.drop_duplicates()
dfDAUCOshape.head(3)

Unnamed: 0,RU_ID,geometry
0,DAU00125,"POLYGON ((-121.08710 41.99514, -120.70108 41.9..."
1,DAU00147,"POLYGON ((-121.88226 42.00329, -121.44784 41.9..."
2,DAU00247,"POLYGON ((-122.02221 42.00440, -121.94694 42.0..."


In [64]:
# Stack the HR, Water Plan Area, and DAUCO shape dataframes together (fresh 0..n-1 index).
frames = [dfHRshape, dfWPAshape, dfDAUCOshape]
dfShape = pd.concat(frames, ignore_index=True)
dfShape

Unnamed: 0,RU_ID,geometry
0,3,"POLYGON ((-122.11808 37.25528, -122.11795 37.2..."
1,10,"POLYGON ((-115.12583 35.39706, -115.09804 35.3..."
2,1,"POLYGON ((-122.30410 42.00836, -122.28218 42.0..."
3,8,"POLYGON ((-119.99946 41.99466, -119.99940 41.9..."
4,5,"POLYGON ((-120.20882 41.99296, -120.20892 41.9..."
...,...,...
584,DAU40417,"POLYGON ((-122.82745 38.85906, -122.83830 38.8..."
585,DAU40423,"POLYGON ((-122.85634 38.86348, -122.84992 38.8..."
586,DAU40449,"POLYGON ((-122.82249 38.85118, -122.81670 38.8..."
587,DAU40523,"POLYGON ((-123.10947 38.87033, -123.11125 38.8..."


### Inspect Output Data & Export

In [65]:
dfout.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98712 entries, 0 to 98711
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   VariableCV                98712 non-null  object        
 1   in_VariableSpecificCV     98712 non-null  object        
 2   in_ReportingUnitName      98712 non-null  object        
 3   in_ReportingUnitNativeID  98712 non-null  object        
 4   in_ReportingUnitTypeCV    98712 non-null  object        
 5   in_Amount                 98712 non-null  float64       
 6   in_BenUse                 98712 non-null  object        
 7   in_ReportYearCV           98712 non-null  int32         
 8   in_TimeframeStart         98712 non-null  datetime64[ns]
 9   in_TimeframeEnd           98712 non-null  datetime64[ns]
dtypes: datetime64[ns](2), float64(1), int32(1), object(6)
memory usage: 7.2+ MB


In [66]:
dfShape.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589 entries, 0 to 588
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   RU_ID     589 non-null    object  
 1   geometry  589 non-null    geometry
dtypes: geometry(1), object(1)
memory usage: 9.3+ KB


In [67]:
# Export out to CSV.
# Both paths are relative, so the files are written to the current working
# directory (set near the top of the notebook with os.chdir(workingDir)).
# index=False leaves the dataframe index out of the files.
dfout.to_csv('P_caAggMaster.csv', index=False) # The output.
dfShape.to_csv('P_caGeometry.csv', index=False) # The output geometry.

In [68]:
dfout['in_VariableSpecificCV'].unique()

array(['Applied Water Use_Annual_Agriculture_Surface Ground Water',
       'Applied Water Use_Annual_Instream Flow Requirements_Surface Ground Water',
       'Applied Water Use_Annual_Managed Wetlands_Surface Ground Water',
       'Applied Water Use_Annual_Required Delta Outflow_Surface Ground Water',
       'Applied Water Use_Annual_Urban_Surface Ground Water',
       'Applied Water Use_Annual_Wild and Scenic River_Surface Ground Water',
       'Depletion_Annual_Agriculture_Surface Ground Water',
       'Depletion_Annual_Instream Flow Requirements_Surface Ground Water',
       'Depletion_Annual_Managed Wetlands_Surface Ground Water',
       'Depletion_Annual_Required Delta Outflow_Surface Ground Water',
       'Depletion_Annual_Urban_Surface Ground Water',
       'Depletion_Annual_Wild and Scenic River_Surface Ground Water'],
      dtype=object)

In [69]:
# # Creating WaDE Custom reporting unit native ID for easy water site identification
# # create by unique ReportingUnitName & ReportingUnitTypeCV
# # only need for PA areas.
# # ----------------------------------------------------------------------------------------------------

# # Create temp ReportingUnitNativeID dataframe of unique reporting unit native ID areas.
# def assignReportingUnitNativeID(colrowValue):
#     string1 = str(colrowValue)
#     outstring = "WaDECA_RU" + string1
#     return outstring

# dfReportingUnitNativeID = pd.DataFrame()
# dfReportingUnitNativeID['in_ReportingUnitName'] = dfout['in_ReportingUnitName']
# dfReportingUnitNativeID['in_ReportingUnitTypeCV'] = dfout['in_ReportingUnitTypeCV']
# dfReportingUnitNativeID = dfReportingUnitNativeID.drop_duplicates()

# dftemp = pd.DataFrame(index=dfReportingUnitNativeID.index)
# dftemp["Count"] = range(1, len(dftemp.index) + 1)
# dfReportingUnitNativeID['in_ReportingUnitNativeID'] = dftemp.apply(lambda row: assignReportingUnitNativeID(row['Count']), axis=1)

# # ----------------------------------------------------------------------------------------------------

# # Retreive WaDE Custom reporting unit native ID areas.
# def retrieveReportingUnitNativeID(A, B, C):
#     # check if A is empty or null
#     if A == "" or pd.isnull(A):
#         ml = dfReportingUnitNativeID.loc[(dfReportingUnitNativeID['in_ReportingUnitName'] == B) & 
#                                          (dfReportingUnitNativeID['in_ReportingUnitTypeCV'] == C), 'in_ReportingUnitNativeID']
#         if not (ml.empty):  # check if the series is empty
#             outString = ml.iloc[0]
#         else:
#             outString = A
#     else:
#         outString = A

#     return outString

# dfout['in_ReportingUnitNativeID'] = dfout.apply(lambda row: retrieveReportingUnitNativeID(row['in_ReportingUnitNativeID'],
#                                                                                           row['in_ReportingUnitName'],
#                                                                                           row['in_ReportingUnitTypeCV']), axis=1)
# print(dfout['in_ReportingUnitNativeID'].unique())