# Working with AZ Aggregated Data

Pre-processing input data for a smoother upload experience of the state data to the WaDE 2.0 database.
Using geopandas to read in the shapefile, and converting it to WKT for the ReportingUnit geometry.

In [None]:
# Needed libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
import geopandas as gpd # the library that lets us read in shapefiles
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [None]:
# Set the working directory, read the inputs, and create the dataframes.
# Every input path below is relative to workingDir.
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Arizona/AggregatedAmounts"
os.chdir(workingDir)

# Excel input file
fileInput = "RawInputData/AMA Demand Supply from DW_use as input.xlsx"
df = pd.read_excel(fileInput)

# Shapefile input. Resolved against workingDir (like fileInput) instead of
# repeating the absolute path, so both inputs always come from the same folder.
shapeInput = "RawInputData/AMA_and_INA-shp/AMA_and_INA.shp"
AZ_AMA = gpd.read_file(shapeInput)
dfshape = pd.DataFrame(AZ_AMA)  # wrap the shapefile table in a plain pandas DataFrame

In [None]:
# Check the Excel input: preview the first three rows of df.
df.head(3)

In [None]:
# Check the shapefile input: preview the first three rows of dfshape.
dfshape.head(3)

In [None]:
# Print the dtype of every column of df; the display limits are lifted only
# inside this block so the listing is not truncated.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.dtypes)

In [None]:
# Build WaterSourceTypeCV using the provided ADWR water source dictionary.
# Keys are ADWR labels; values are the water source type assigned to each.
ADWRWaSoDict = {
    "CAP": "Surface Water",
    "CAP GSF": "Surface Water",
    "Effluent": "Effluent",
    "Groundwater": "Groundwater",
    "GSF (CAP)": "Surface Water",
    "GSF (Reclaimed Water)": "Effluent",
    "Other": "Surface Water",
    "Poor Quality GW": "Groundwater",
    "Reclaimed Water": "Effluent",
    "Recovered CAP": "Surface Water",
    "Recovered Effluent": "Effluent",
    "Recovered Reclaimed Water": "Effluent",
    "Recovered Surface Water": "Surface Water",
    "Remediation water": "Groundwater",
    "Spill": "Surface Water",
    "Surface water": "Surface Water",
    "Water withdrawn from wells": "Groundwater",
    "Weighted Exchange Reclaimed Water": "Effluent",
}


def createWSTypeCV(colrowValue):
    """Translate an ADWR water source label into a water source type.

    Args:
        colrowValue: Raw cell value, expected to be a label string.

    Returns:
        The matching value from ADWRWaSoDict, or 'Unknown' when the value is
        missing, empty, not text, or not a key of the dictionary.
    """
    # Missing cells (None / NaN) and any other non-text value can never match
    # a dictionary key, so they are reported as 'Unknown'.
    if not isinstance(colrowValue, str):
        return 'Unknown'
    # Remove surrounding whitespace so padded labels still match their key.
    return ADWRWaSoDict.get(colrowValue.strip(), 'Unknown')

# Derive the water source type of every row from its 'BUDGET ELEMENT' label.
# Only one column is needed, so map over that column directly instead of
# building a Series for every row with a row-wise DataFrame.apply.
df['inputWaterSourceTypeCV'] = df['BUDGET ELEMENT'].map(createWSTypeCV)
df

In [None]:
# Drop the columns that are no longer needed.
unneededColumns = ['PARENT WATER TYPE OR SECTOR', 'BUDGET ELEMENT']
df = df.drop(columns=unneededColumns)
df

In [None]:
# Keep only the rows whose CATEGORY is 'Demand' or 'Supply'; drop the rest.
keepCategory = df['CATEGORY'].isin(['Demand', 'Supply'])
df = df[keepCategory]
df

In [None]:
# Sum the 'QUANTITY' field within each combination of the grouping columns,
# then turn the group keys back into regular columns.
groupColumns = ['AMA', 'YEAR', 'SECTOR', 'CATEGORY', 'inputWaterSourceTypeCV']
quantityTotals = df.groupby(groupColumns)['QUANTITY'].sum()
df = quantityTotals.reset_index()
df

In [None]:
# Transfer the geometry from dfshape.
def retrieveGeometry(colrowValue, dfshape):
    """Look up the geometry whose BASIN_NAME equals the given value.

    Args:
        colrowValue: Name to search for in dfshape['BASIN_NAME'].
        dfshape: DataFrame with 'BASIN_NAME' and 'geometry' columns.

    Returns:
        The 'geometry' value of the first matching row, or '' when the name
        is empty, null, or has no match.
    """
    isBlank = (colrowValue == '') or (pd.isnull(colrowValue))
    if isBlank:
        return ''

    nameMatches = dfshape['BASIN_NAME'] == colrowValue
    geometries = dfshape.loc[nameMatches, 'geometry']
    # Several rows may share a name; only the first geometry is used.
    return '' if geometries.empty else geometries.iloc[0]

# Attach the matching geometry to every row, looked up by its 'AMA' name.
# Only the 'AMA' column is needed, so map over that column directly instead of
# building a Series for every row with a row-wise DataFrame.apply.
df['Geometry'] = df['AMA'].map(lambda amaName: retrieveGeometry(amaName, dfshape))
df

In [None]:
# Issue: the lengthy geometry result can exceed what an Excel cell holds (the original
# note cites an nvarchar(250) limit -- TODO confirm). The xlsx file is therefore only
# used to check completeness, and the csv file is used as the input.

# # Writing the file to xlsx. Easier to manually check an xlsx than a csv, but csv loads faster into the WaDE system.
# df.to_excel('RawInputData/P_AZagg.xlsx', index=False)

# Write the processed dataframe to csv, without the index column.
df.to_csv('RawInputData/P_AZagg.csv', index=False)