# Working with AZ Aggregated Data

Pre-processing the state's input data so that it uploads smoothly to the WaDE 2.0 database.
Uses geopandas to read in the shapefile, and converts its geometry to WKT for the ReportingUnit geometry.

In [1]:
#Needed Libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
import geopandas as gpd # the library that lets us read in shapefiles
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [2]:
#Setting work directory, reading inputs, creating dataframe
#Every input/output path below is relative to workingDir, so this is the only
#line to edit when the project folder moves.
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Arizona/AggregatedAmounts"
os.chdir(workingDir)

#CSV input file (relative to workingDir)
fileInput = "RawInputData/rawSafeYield.csv"
df = pd.read_csv(fileInput)

#Shapefile input (relative to workingDir, instead of repeating the absolute path)
shapeInput = "RawInputData/AMA_and_INA-shp/AMA_and_INA.shp"
AZ_AMA = gpd.read_file(shapeInput)
#Wrap the GeoDataFrame in a plain DataFrame for the lookups done later on
dfshape = pd.DataFrame(AZ_AMA)
#NOTE(review): the geometry coordinates displayed for this shapefile look projected
#(six/seven-digit values) rather than lon/lat degrees -- confirm whether a
#reprojection (GeoDataFrame.to_crs) is needed before the geometry is exported.

In [3]:
#Check the CSV input: a bare `df` as the cell's last expression renders the frame as output
df

Unnamed: 0,Basin Code,Inflow/Outflow,Sector,Year,Quantity,Water Type,Component
0,PHX,Inflow,Agricultural ...,1985,585460.7374,Groundwater ...,Incidental Recharge
1,PHX,Inflow,Agricultural ...,1986,577371.6134,Groundwater ...,Incidental Recharge
2,PHX,Inflow,Agricultural ...,1987,569202.1827,Groundwater ...,Incidental Recharge
3,PHX,Inflow,Agricultural ...,1988,522300.0849,Groundwater ...,Incidental Recharge
4,PHX,Inflow,Agricultural ...,1989,514523.2052,Groundwater ...,Incidental Recharge
...,...,...,...,...,...,...,...
2571,TUC,Outflow,Natural,2015,5330.0000,Groundwater ...,Riparian Demand
2572,TUC,Outflow,Natural,2016,23650.0000,Groundwater ...,Groundwater Outflow
2573,TUC,Outflow,Natural,2016,4400.0000,Groundwater ...,Riparian Demand
2574,TUC,Outflow,Natural,2017,,Groundwater ...,Groundwater Outflow


In [4]:
#Check the shapefile input: displays the shapefile attributes together with the geometry column
dfshape

Unnamed: 0,OBJECTID,BASIN_NAME,NAME_ABBR,BASIN_NA_1,Shape_Leng,Shape_Area,geometry
0,1,SANTA CRUZ AMA,SCA,SANTA CRUZ AMA,227784.354771,1853991000.0,"POLYGON ((481155.981 3524735.269, 481185.919 3..."
1,2,PRESCOTT AMA,PRE,PRESCOTT AMA,194403.011308,1244308000.0,"POLYGON ((357041.632 3843374.192, 357053.632 3..."
2,3,HARQUAHALA INA,HAR,HARQUAHALA BASIN,239098.91634,1983267000.0,"POLYGON ((287010.098 3746236.654, 287022.598 3..."
3,4,JOSEPH CITY INA,JCI,,96874.388475,467928800.0,"POLYGON ((566517.367 3872842.551, 566546.615 3..."
4,5,PHOENIX AMA,PHX,PHOENIX AMA,844515.038021,13950010000.0,"POLYGON ((401515.842 3762674.078, 401552.531 3..."
5,6,TUCSON AMA,TUC,TUCSON AMA,620924.45552,10019520000.0,"POLYGON ((506728.980 3630477.361, 506694.731 3..."
6,7,DOUGLAS INA,DIN,,198604.224004,1436741000.0,"POLYGON ((643213.248 3467601.452, 642645.622 3..."
7,8,PINAL AMA,PIN,PINAL AMA,668757.778618,10607380000.0,"POLYGON ((480036.328 3671581.352, 479984.327 3..."


In [5]:
#List the dtype of every column. option_context lifts the row/column display
#limits only inside the `with` block, so the printed listing is not truncated.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.dtypes)

Basin Code         object
Inflow/Outflow     object
Sector             object
Year                int64
Quantity          float64
Water Type         object
Component          object
dtype: object


In [6]:
#Cleaning Quantity: NaN entries and the 999 missing-value marker are both set to 0.
#NOTE(review): after this step a missing amount cannot be told apart from a true zero.
df['Quantity'] = df['Quantity'].fillna(0).replace(999, 0)
df

Unnamed: 0,Basin Code,Inflow/Outflow,Sector,Year,Quantity,Water Type,Component
0,PHX,Inflow,Agricultural ...,1985,585460.7374,Groundwater ...,Incidental Recharge
1,PHX,Inflow,Agricultural ...,1986,577371.6134,Groundwater ...,Incidental Recharge
2,PHX,Inflow,Agricultural ...,1987,569202.1827,Groundwater ...,Incidental Recharge
3,PHX,Inflow,Agricultural ...,1988,522300.0849,Groundwater ...,Incidental Recharge
4,PHX,Inflow,Agricultural ...,1989,514523.2052,Groundwater ...,Incidental Recharge
...,...,...,...,...,...,...,...
2571,TUC,Outflow,Natural,2015,5330.0000,Groundwater ...,Riparian Demand
2572,TUC,Outflow,Natural,2016,23650.0000,Groundwater ...,Groundwater Outflow
2573,TUC,Outflow,Natural,2016,4400.0000,Groundwater ...,Riparian Demand
2574,TUC,Outflow,Natural,2017,0.0000,Groundwater ...,Groundwater Outflow


In [7]:
#Creating AMA name for Watersources from Basin Code
#Maps the three-letter basin code to the full AMA / INA name.
AMANameDic = {
"SCA" : "SANTA CRUZ AMA",
"PRE" : "PRESCOTT AMA",
"HAR" : "HARQUAHALA INA",
"JCI" : "JOSEPH CITY INA",
"PHX" : "PHOENIX AMA",
"TUC" : "TUCSON AMA",
"DIN" : "DOUGLAS INA",
"PIN" : "PINAL AMA"}

def assignAMAName(colrowValue):
    """Return the AMA/INA name for a basin code, or "" when there is none.

    colrowValue: a 'Basin Code' cell value (e.g. "PHX"); may be NaN, None or "".
    Returns the matching AMANameDic entry, or "" for a missing, empty, or
    unrecognised code.
    """
    if pd.isnull(colrowValue):
        return ""
    # Remove whitespace chars from the code itself, so a padded cell such as
    # "PHX " still matches (previously only the looked-up name was stripped).
    basinCode = str(colrowValue).strip()
    # dict.get replaces the former bare `except:`; unknown codes (and the empty
    # string) map to "" without hiding unrelated errors.
    return AMANameDic.get(basinCode, "").strip()

#Look up the AMA name for each row's Basin Code. Applying over the single
#column avoids building a full row object per record, which axis=1 apply does.
df['AMA Name'] = df['Basin Code'].apply(assignAMAName)
df

Unnamed: 0,Basin Code,Inflow/Outflow,Sector,Year,Quantity,Water Type,Component,AMA Name
0,PHX,Inflow,Agricultural ...,1985,585460.7374,Groundwater ...,Incidental Recharge,PHOENIX AMA
1,PHX,Inflow,Agricultural ...,1986,577371.6134,Groundwater ...,Incidental Recharge,PHOENIX AMA
2,PHX,Inflow,Agricultural ...,1987,569202.1827,Groundwater ...,Incidental Recharge,PHOENIX AMA
3,PHX,Inflow,Agricultural ...,1988,522300.0849,Groundwater ...,Incidental Recharge,PHOENIX AMA
4,PHX,Inflow,Agricultural ...,1989,514523.2052,Groundwater ...,Incidental Recharge,PHOENIX AMA
...,...,...,...,...,...,...,...,...
2571,TUC,Outflow,Natural,2015,5330.0000,Groundwater ...,Riparian Demand,TUCSON AMA
2572,TUC,Outflow,Natural,2016,23650.0000,Groundwater ...,Groundwater Outflow,TUCSON AMA
2573,TUC,Outflow,Natural,2016,4400.0000,Groundwater ...,Riparian Demand,TUCSON AMA
2574,TUC,Outflow,Natural,2017,0.0000,Groundwater ...,Groundwater Outflow,TUCSON AMA


In [8]:
#Transfer geometry from dfshape
def retrieveGeometry(colrowValue, dfshape):
    """Return the geometry of the dfshape row whose NAME_ABBR equals colrowValue.

    colrowValue: basin code to look up (compared exactly, no normalisation).
    dfshape: frame with 'NAME_ABBR' and 'geometry' columns.
    Returns the first matching 'geometry' value, or '' when the code is
    empty, null, or has no match.
    """
    # Nothing to look up for an empty or missing code.
    if (colrowValue == '') or (pd.isnull(colrowValue)):
        return ''
    matches = dfshape.loc[dfshape['NAME_ABBR'] == colrowValue, 'geometry']
    # Take the first hit if several rows share the abbreviation.
    return '' if matches.empty else matches.iloc[0]

#Attach the matching shapefile geometry to each row via its Basin Code. Applying
#over the single column avoids building a full row object per record (axis=1 apply).
df['Geometry'] = df['Basin Code'].apply(lambda basinCode: retrieveGeometry(basinCode, dfshape))
df

Unnamed: 0,Basin Code,Inflow/Outflow,Sector,Year,Quantity,Water Type,Component,AMA Name,Geometry
0,PHX,Inflow,Agricultural ...,1985,585460.7374,Groundwater ...,Incidental Recharge,PHOENIX AMA,"POLYGON ((401515.8422997798 3762674.077700167,..."
1,PHX,Inflow,Agricultural ...,1986,577371.6134,Groundwater ...,Incidental Recharge,PHOENIX AMA,"POLYGON ((401515.8422997798 3762674.077700167,..."
2,PHX,Inflow,Agricultural ...,1987,569202.1827,Groundwater ...,Incidental Recharge,PHOENIX AMA,"POLYGON ((401515.8422997798 3762674.077700167,..."
3,PHX,Inflow,Agricultural ...,1988,522300.0849,Groundwater ...,Incidental Recharge,PHOENIX AMA,"POLYGON ((401515.8422997798 3762674.077700167,..."
4,PHX,Inflow,Agricultural ...,1989,514523.2052,Groundwater ...,Incidental Recharge,PHOENIX AMA,"POLYGON ((401515.8422997798 3762674.077700167,..."
...,...,...,...,...,...,...,...,...,...
2571,TUC,Outflow,Natural,2015,5330.0000,Groundwater ...,Riparian Demand,TUCSON AMA,"POLYGON ((506728.9803997681 3630477.361000173,..."
2572,TUC,Outflow,Natural,2016,23650.0000,Groundwater ...,Groundwater Outflow,TUCSON AMA,"POLYGON ((506728.9803997681 3630477.361000173,..."
2573,TUC,Outflow,Natural,2016,4400.0000,Groundwater ...,Riparian Demand,TUCSON AMA,"POLYGON ((506728.9803997681 3630477.361000173,..."
2574,TUC,Outflow,Natural,2017,0.0000,Groundwater ...,Groundwater Outflow,TUCSON AMA,"POLYGON ((506728.9803997681 3630477.361000173,..."


In [10]:
#Issue: the lengthy geometry text can exceed the size limit of an Excel cell (noted here originally as nvchar(250)).
#The xlsx file is therefore only used to check completeness; the csv file is the one used as input.
#NOTE(review): Excel's documented per-cell limit is 32,767 characters -- confirm which limit actually applies.

#Printing file to xlsx (for visual checking only)
df.to_excel('RawInputData/P_AZagg.xlsx', index=False)

#Printing file to csv (the file used as input)
df.to_csv('RawInputData/P_AZagg.csv', index=False)