# Working with AZ Aggregated Data

Pre-processing input data for a smoother upload experience of the state data to the WaDE 2.0 database.
Using geopandas to read in the shapefile (.shp), and converting its geometry to WKT for the ReportingUnit geometry.

In [1]:
#Needed Libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
import geopandas as gpd # the library that lets us read in shapefiles
# Raise the column display cap to 999 so wide DataFrames show all of their columns in the notebook.
pd.options.display.max_columns = 999

In [2]:
#Setting work directory, reading inputs, creating dataframe
# NOTE: machine-specific absolute path. It is the only place the root folder is spelled out;
# every input below is given relative to it.
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Arizona/AggregatedAmounts"
os.chdir(workingDir)

#Excel input file
fileInput = "RawInputData/AMA Demand Supply from DW_use as input.xlsx"
df = pd.read_excel(fileInput)

#Shapefile input. The path is relative to workingDir, which is the current directory
#after the os.chdir call above, so it resolves to the same file as the former absolute path.
shapeInput = "RawInputData/AMA_and_INA-shp/AMA_and_INA.shp"
AZ_AMA = gpd.read_file(shapeInput)
dfshape = pd.DataFrame(AZ_AMA)  # plain pandas DataFrame built from the geopandas result

In [3]:
#Preview the first 3 rows of the Excel input
df.head(n=3)

Unnamed: 0,AMA,YEAR,SECTOR,CATEGORY,PARENT WATER TYPE OR SECTOR,Custom WSWC Water Type Translation,BUDGET ELEMENT,QUANTITY
0,TUCSON AMA,1985,Agricultural,Allotment,Agricultural,Groundwater,Groundwater Allotment,212718
1,TUCSON AMA,1985,Agricultural,Demand,Agricultural,Unspecified,Non-Exempt IGFRs,114879
2,TUCSON AMA,1985,Agricultural,Incidental Recharge,,Recharge,Incidental Recharge Lagged,44371


In [4]:
#Preview the first 3 rows of the shapefile input
dfshape.head(n=3)

Unnamed: 0,OBJECTID,BASIN_NAME,NAME_ABBR,BASIN_NA_1,Shape_Leng,Shape_Area,geometry
0,1,SANTA CRUZ AMA,SCA,SANTA CRUZ AMA,227784.354771,1853991000.0,"POLYGON ((481155.981 3524735.269, 481185.919 3..."
1,2,PRESCOTT AMA,PRE,PRESCOTT AMA,194403.011308,1244308000.0,"POLYGON ((357041.632 3843374.192, 357053.632 3..."
2,3,HARQUAHALA INA,HAR,HARQUAHALA BASIN,239098.91634,1983267000.0,"POLYGON ((287010.098 3746236.654, 287022.598 3..."


In [5]:
#List the dtype of every column; row/column display limits are lifted only inside this block.
noDisplayLimits = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*noDisplayLimits):
    print(df.dtypes)

AMA                                   object
YEAR                                   int64
SECTOR                                object
CATEGORY                              object
PARENT WATER TYPE OR SECTOR           object
Custom WSWC Water Type Translation    object
BUDGET ELEMENT                        object
QUANTITY                               int64
dtype: object


In [6]:
#Removing the columns that are not needed in the later steps.
unusedColumns = ['PARENT WATER TYPE OR SECTOR', 'BUDGET ELEMENT']
df = df.drop(columns=unusedColumns)
df

Unnamed: 0,AMA,YEAR,SECTOR,CATEGORY,Custom WSWC Water Type Translation,QUANTITY
0,TUCSON AMA,1985,Agricultural,Allotment,Groundwater,212718
1,TUCSON AMA,1985,Agricultural,Demand,Unspecified,114879
2,TUCSON AMA,1985,Agricultural,Incidental Recharge,Recharge,44371
3,TUCSON AMA,1985,Agricultural,Incidental Recharge,Recharge,28720
4,TUCSON AMA,1985,Indian,Demand,Unspecified,72
...,...,...,...,...,...,...
7494,PHOENIX AMA,2016,Industrial,Supply,Unknown,689
7495,PHOENIX AMA,2017,Agricultural,Supply,Unknown,5142
7496,PHOENIX AMA,2017,Industrial,Supply,Unknown,799
7497,PHOENIX AMA,2018,Agricultural,Supply,Unknown,4752


In [7]:
#Collapse rows sharing the same AMA / YEAR / SECTOR / CATEGORY / water type, summing 'QUANTITY'.
groupKeys = ['AMA', 'YEAR', 'SECTOR', 'CATEGORY', 'Custom WSWC Water Type Translation']
# as_index=False keeps the group keys as ordinary columns, so no reset_index() is needed.
df = df.groupby(groupKeys, as_index=False)['QUANTITY'].sum()
df

Unnamed: 0,AMA,YEAR,SECTOR,CATEGORY,Custom WSWC Water Type Translation,QUANTITY
0,PHOENIX AMA,1985,Agricultural,Allotment,Groundwater,1877572
1,PHOENIX AMA,1985,Agricultural,Demand,Lost,42832
2,PHOENIX AMA,1985,Agricultural,Demand,Unspecified,1222803
3,PHOENIX AMA,1985,Agricultural,Incidental Recharge,Recharge,1013442
4,PHOENIX AMA,1985,Agricultural,Supply,Effluent,30138
...,...,...,...,...,...,...
4177,TUCSON AMA,2018,Municipal,Population,Groundwater,32434
4178,TUCSON AMA,2018,Municipal,Population,Unspecified,998987
4179,TUCSON AMA,2018,Municipal,Supply,Effluent,13342
4180,TUCSON AMA,2018,Municipal,Supply,Groundwater,31666


In [8]:
#transfer WKT geometry from dfshape
def retrieveGeometry(colrowValue, dfshape):
    """Return the geometry of the first dfshape row whose 'BASIN_NAME' equals colrowValue.

    Parameters
    ----------
    colrowValue : scalar
        Name to look up in the 'BASIN_NAME' column.
    dfshape : pandas.DataFrame
        Lookup table with 'BASIN_NAME' and 'geometry' columns.

    Returns
    -------
    The 'geometry' cell of the first matching row, or '' when colrowValue is
    missing/blank or no row matches.
    """
    # Test for missing values first: `pd.NA == ''` evaluates to pd.NA, whose truth
    # value is ambiguous and raises TypeError, so the null check must short-circuit.
    if pd.isnull(colrowValue) or colrowValue == '':
        return ''

    matches = dfshape.loc[dfshape['BASIN_NAME'] == colrowValue, 'geometry']
    # The name comparison is exact; an unmatched name yields '' rather than an error.
    return '' if matches.empty else matches.iloc[0]

# Look up each row's geometry by its 'AMA' name; dfshape is passed through as the second argument.
df['Geometry'] = df['AMA'].apply(retrieveGeometry, args=(dfshape,))
df

Unnamed: 0,AMA,YEAR,SECTOR,CATEGORY,Custom WSWC Water Type Translation,QUANTITY,Geometry
0,PHOENIX AMA,1985,Agricultural,Allotment,Groundwater,1877572,"POLYGON ((401515.8422997798 3762674.077700167,..."
1,PHOENIX AMA,1985,Agricultural,Demand,Lost,42832,"POLYGON ((401515.8422997798 3762674.077700167,..."
2,PHOENIX AMA,1985,Agricultural,Demand,Unspecified,1222803,"POLYGON ((401515.8422997798 3762674.077700167,..."
3,PHOENIX AMA,1985,Agricultural,Incidental Recharge,Recharge,1013442,"POLYGON ((401515.8422997798 3762674.077700167,..."
4,PHOENIX AMA,1985,Agricultural,Supply,Effluent,30138,"POLYGON ((401515.8422997798 3762674.077700167,..."
...,...,...,...,...,...,...,...
4177,TUCSON AMA,2018,Municipal,Population,Groundwater,32434,"POLYGON ((506728.9803997681 3630477.361000173,..."
4178,TUCSON AMA,2018,Municipal,Population,Unspecified,998987,"POLYGON ((506728.9803997681 3630477.361000173,..."
4179,TUCSON AMA,2018,Municipal,Supply,Effluent,13342,"POLYGON ((506728.9803997681 3630477.361000173,..."
4180,TUCSON AMA,2018,Municipal,Supply,Groundwater,31666,"POLYGON ((506728.9803997681 3630477.361000173,..."


In [9]:
#The same table is written twice. Lengthy geometry text can exceed what an Excel cell will hold,
#so the xlsx copy is used only to check completeness, and the csv copy is the one used as input.
excelCheckFile = 'RawInputData/P_AZagg.xlsx'
csvInputFile = 'RawInputData/P_AZagg.csv'

#Printing file to xlsx
df.to_excel(excelCheckFile, index=False)

#Printing file to csv
df.to_csv(csvInputFile, index=False)