# Working with AZ Aggregated Data

Pre-processing input data for a smoother upload experience of the state data to the WaDE 2.0 database.
Using geopandas to read in the shapefile, and converting its geometry to WKT for the ReportingUnit geometry.

In [1]:
# Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# date handling and notebook display options
from datetime import datetime
pd.set_option('display.max_columns', 999)  # raise the column display limit to 999 so wide DataFrames are shown in full

In [2]:
# Setting work directory, reading inputs, creating dataframes.
# NOTE(review): workingDir is a machine-specific absolute path; change it here (and only here)
# before running on another machine.
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Arizona/AggregatedAmounts"
os.chdir(workingDir)

# Excel input file, resolved relative to workingDir (the current directory after the chdir above).
fileInput = "RawInputData/AMA Demand Supply from DW_use as input.xlsx"
df = pd.read_excel(fileInput)

# Shapefile input, resolved relative to workingDir exactly like fileInput so the
# absolute path is not repeated (and cannot drift out of sync with workingDir).
shapeInput = "RawInputData/AMA_and_INA-shp/AMA_and_INA.shp"
AZ_AMA = gpd.read_file(shapeInput)
dfshape = pd.DataFrame(AZ_AMA)  # plain pandas copy of the GeoDataFrame, used for geometry lookups later

In [3]:
# Check the Excel input: print the row count, then display the first row.
print(len(df))
df.head(1)

7499


Unnamed: 0,AMA,YEAR,SECTOR,CATEGORY,PARENT WATER TYPE OR SECTOR,BUDGET ELEMENT,QUANTITY
0,TUCSON AMA,1985,Agricultural,Allotment,Agricultural,Groundwater Allotment,212718


In [4]:
# Check the shapefile input: print the number of shapes, then display the first three rows.
print(len(dfshape))
dfshape.head(3)

8


Unnamed: 0,OBJECTID,BASIN_NAME,NAME_ABBR,Shape_Leng,Shape_Area,geometry
0,1,SANTA CRUZ AMA,SCA,2.245176,0.176233,"POLYGON ((481155.981 3524735.269, 481185.919 3..."
1,2,PRESCOTT AMA,PRE,1.930985,0.122395,"POLYGON ((357041.632 3843374.192, 357053.632 3..."
2,3,HARQUAHALA INA,HAR,2.365999,0.192482,"POLYGON ((287010.098 3746236.654, 287022.598 3..."


In [5]:
# Print the dtype of every column; option_context lifts the row/column display limits for this block only.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.dtypes)

AMA                            object
YEAR                            int64
SECTOR                         object
CATEGORY                       object
PARENT WATER TYPE OR SECTOR    object
BUDGET ELEMENT                 object
QUANTITY                        int64
dtype: object


In [6]:
# WaterSourceType.  Use the BUDGET ELEMENT field to help determine this.

# Case-sensitive substrings that identify each water source type.
# NOTE(review): 'Ground  Water' contains two spaces -- confirm that this matches the raw data.
listOfSurfaceWater = ['Surface Water', 'Surface water', "Streambed"]
listOfGroundwater = ['Groundwater', 'Ground  Water', 'Well', 'well', 'Wells']
listOfEffluent = ['Effluent', 'EFFLUENT']

def defineWaterSourceType(colrowValue):
    """Classify a BUDGET ELEMENT value into a water source type.

    Parameters
    ----------
    colrowValue : object
        Usually the BUDGET ELEMENT text of one row.

    Returns
    -------
    str
        "Surface Water", "Groundwater" or "Effluent" when the value contains one
        of the corresponding keywords (checked in that order, case-sensitive);
        'Unspecified' for empty/null values, values with no keyword, and values
        that do not support the ``in`` operator (e.g. numbers).
    """
    if colrowValue == '' or pd.isnull(colrowValue):
        return 'Unspecified'

    # Order matters: the first type with a matching keyword wins.
    keywordsBySourceType = (
        ("Surface Water", listOfSurfaceWater),
        ("Groundwater", listOfGroundwater),
        ("Effluent", listOfEffluent),
    )
    try:
        for sourceType, keywords in keywordsBySourceType:
            if any(word in colrowValue for word in keywords):
                return sourceType
    except TypeError:
        # Non-text values (e.g. ints) cannot be searched with `in`; treat them as
        # unspecified instead of hiding every possible error behind a bare except.
        pass
    return 'Unspecified'

# Classify every row from its BUDGET ELEMENT text.  Only one column is needed, so map the
# function over that column directly instead of building a row object per record with
# DataFrame.apply(axis=1).
df['in_WaterSourceType'] = df['BUDGET ELEMENT'].map(defineWaterSourceType)
df['in_WaterSourceType'].unique()

array(['Groundwater', 'Unspecified', 'Surface Water', 'Effluent'],
      dtype=object)

In [7]:
# Dropping columns we no longer need: BUDGET ELEMENT has already been used to derive
# in_WaterSourceType above.
df = df.drop(['PARENT WATER TYPE OR SECTOR', 'BUDGET ELEMENT'], axis=1)
df.head(1)

Unnamed: 0,AMA,YEAR,SECTOR,CATEGORY,QUANTITY,in_WaterSourceType
0,TUCSON AMA,1985,Agricultural,Allotment,212718,Groundwater


In [8]:
# Keep only the rows whose CATEGORY is 'Demand' or 'Supply'; all other categories are discarded.
df = df[df['CATEGORY'].isin(['Demand', 'Supply'])]
print(len(df))
df.head(1)

5112


Unnamed: 0,AMA,YEAR,SECTOR,CATEGORY,QUANTITY,in_WaterSourceType
1,TUCSON AMA,1985,Agricultural,Demand,114879,Unspecified


In [9]:
# Group by AMA / YEAR / SECTOR / CATEGORY / in_WaterSourceType and sum the 'QUANTITY' field,
# so each combination ends up as a single row; reset_index turns the group keys back into columns.
df = df.groupby(['AMA','YEAR', 'SECTOR', 'CATEGORY', 'in_WaterSourceType'])['QUANTITY'].sum().reset_index()
print(len(df))
df.head(1)

2308


Unnamed: 0,AMA,YEAR,SECTOR,CATEGORY,in_WaterSourceType,QUANTITY
0,PHOENIX AMA,1985,Agricultural,Demand,Unspecified,1265635


In [10]:
# TimeframeStart & TimeframeEnd: January 1st and December 31st of each row's YEAR, as MM/DD/YYYY text.

df['in_TimeframeStart'] = '01/01/' + df['YEAR'].astype(str)
df['in_TimeframeEnd'] = '12/31/' + df['YEAR'].astype(str)

## WaDE Custom Elements (due to missing reporting unit and water source info)

In [11]:
# Creating VariableSpecific

def createVariableSpecificCV(Cat, Sec, WST):
    """Build the variable-specific CV string for one row.

    Each argument is converted to text and stripped of surrounding whitespace,
    then the pieces are combined as ``<Cat>_Annual_<Sec>_<WST>``.
    """
    category, sector, sourceType = (str(part).strip() for part in (Cat, Sec, WST))
    return "_".join([category, "Annual", sector, sourceType])

# Build the variable-specific CV for every row from its category, sector and water source type,
# then list the distinct values as a sanity check (the check was previously written twice).
df['in_VariableSpecificCV'] = df.apply(lambda row: createVariableSpecificCV(row['CATEGORY'], row['SECTOR'], row['in_WaterSourceType']), axis=1)
df['in_VariableSpecificCV'].unique()

array(['Demand_Annual_Agricultural_Unspecified',
       'Supply_Annual_Agricultural_Effluent',
       'Supply_Annual_Agricultural_Groundwater',
       'Supply_Annual_Agricultural_Surface Water',
       'Supply_Annual_Agricultural_Unspecified',
       'Demand_Annual_Indian_Unspecified',
       'Supply_Annual_Indian_Groundwater',
       'Supply_Annual_Indian_Surface Water',
       'Demand_Annual_Industrial_Unspecified',
       'Supply_Annual_Industrial_Effluent',
       'Supply_Annual_Industrial_Groundwater',
       'Supply_Annual_Industrial_Surface Water',
       'Demand_Annual_Municipal_Unspecified',
       'Supply_Annual_Municipal_Groundwater',
       'Supply_Annual_Municipal_Surface Water',
       'Supply_Annual_Municipal_Unspecified',
       'Supply_Annual_Industrial_Unspecified',
       'Supply_Annual_Municipal_Effluent',
       'Supply_Annual_Indian_Unspecified',
       'Supply_Annual_Indian_Effluent'], dtype=object)

In [12]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source native ID: 'WaDEAZ_WS' followed by the text form of colrowValue."""
    return f"WaDEAZ_WS{colrowValue!s}"

# Lookup table with one row per distinct water source type found in df.
dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceType'] = df['in_WaterSourceType']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

# dftemp reuses the lookup table's index so the 1-based Count lines up row-for-row
# when the generated IDs are assigned back into the lookup table.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the custom native ID for water source type A.

    Returns '' when A is blank/null or when the dfWaterSourceNativeID lookup
    table has no row for A; otherwise returns the ID of the first matching row.
    """
    if (A == '') or (pd.isnull(A)):
        return ''

    isMatch = dfWaterSourceNativeID['in_WaterSourceType'] == A
    matches = dfWaterSourceNativeID.loc[isMatch, 'in_WaterSourceNativeID']
    return '' if matches.empty else matches.iloc[0]

# Attach the custom native ID to every row.  The lookup only needs in_WaterSourceType, so map
# over that column directly rather than applying row-wise over the whole frame.
df['in_WaterSourceNativeID'] = df['in_WaterSourceType'].map(retrieveWaterSourceNativeID)
df['in_WaterSourceNativeID'].unique()

array(['WaDEAZ_WS1', 'WaDEAZ_WS2', 'WaDEAZ_WS3', 'WaDEAZ_WS4'],
      dtype=object)

In [13]:
# Creating WaDE Custom reporting unit native ID for easy site identification
# ----------------------------------------------------------------------------------------------------

# Create temp ReportingUnitNativeID dataframe of unique reporting units.
def assignReportingUnitNativeID(colrowValue):
    """Return the custom reporting unit native ID: 'WaDEAZ_RU' followed by the text form of colrowValue."""
    return f"WaDEAZ_RU{colrowValue!s}"

# Lookup table with one row per distinct AMA name found in df.
dfReportingUnitNativeID = pd.DataFrame()
dfReportingUnitNativeID['in_ReportingUnitName'] = df['AMA']
dfReportingUnitNativeID = dfReportingUnitNativeID.drop_duplicates()

# dftemp reuses the lookup table's index so the 1-based Count lines up row-for-row
# when the generated IDs are assigned back into the lookup table.
dftemp = pd.DataFrame(index=dfReportingUnitNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfReportingUnitNativeID['in_ReportingUnitNativeID'] = dftemp.apply(lambda row: assignReportingUnitNativeID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom reporting unit native ID
def retrieveReportingUnitNativeID(A):
    """Look up the custom native ID for reporting unit name A.

    Returns '' when A is blank/null or when the dfReportingUnitNativeID lookup
    table has no row for A; otherwise returns the ID of the first matching row.
    """
    if (A == '') or (pd.isnull(A)):
        return ''

    isMatch = dfReportingUnitNativeID['in_ReportingUnitName'] == A
    matches = dfReportingUnitNativeID.loc[isMatch, 'in_ReportingUnitNativeID']
    return '' if matches.empty else matches.iloc[0]

# Attach the custom native ID to every row.  The lookup only needs the AMA name, so map over
# that column directly rather than applying row-wise over the whole frame.
df['in_ReportingUnitNativeID'] = df['AMA'].map(retrieveReportingUnitNativeID)
df['in_ReportingUnitNativeID'].unique()

array(['WaDEAZ_RU1', 'WaDEAZ_RU2', 'WaDEAZ_RU3', 'WaDEAZ_RU4',
       'WaDEAZ_RU5'], dtype=object)

## Shape Info

In [14]:
# Transfer geometry from dfshape
def retrieveGeometry(colrowValue, dfshape):
    """Return the 'geometry' entry of the first dfshape row whose BASIN_NAME equals colrowValue.

    Returns '' when colrowValue is blank/null or when no BASIN_NAME matches.
    """
    if (colrowValue == '') or (pd.isnull(colrowValue)):
        return ''

    isMatch = dfshape['BASIN_NAME'] == colrowValue
    matches = dfshape.loc[isMatch, 'geometry']
    return '' if matches.empty else matches.iloc[0]

# Attach each row's geometry by matching its AMA name against dfshape.  Only the AMA column is
# needed, so map over it directly rather than applying row-wise over the whole frame.
df['Geometry'] = df['AMA'].map(lambda amaName: retrieveGeometry(amaName, dfshape))
df

Unnamed: 0,AMA,YEAR,SECTOR,CATEGORY,in_WaterSourceType,QUANTITY,in_TimeframeStart,in_TimeframeEnd,in_VariableSpecificCV,in_WaterSourceNativeID,in_ReportingUnitNativeID,Geometry
0,PHOENIX AMA,1985,Agricultural,Demand,Unspecified,1265635,01/01/1985,12/31/1985,Demand_Annual_Agricultural_Unspecified,WaDEAZ_WS1,WaDEAZ_RU1,"POLYGON ((401515.8422997798 3762674.077700167,..."
1,PHOENIX AMA,1985,Agricultural,Supply,Effluent,30138,01/01/1985,12/31/1985,Supply_Annual_Agricultural_Effluent,WaDEAZ_WS2,WaDEAZ_RU1,"POLYGON ((401515.8422997798 3762674.077700167,..."
2,PHOENIX AMA,1985,Agricultural,Supply,Groundwater,647719,01/01/1985,12/31/1985,Supply_Annual_Agricultural_Groundwater,WaDEAZ_WS3,WaDEAZ_RU1,"POLYGON ((401515.8422997798 3762674.077700167,..."
3,PHOENIX AMA,1985,Agricultural,Supply,Surface Water,367146,01/01/1985,12/31/1985,Supply_Annual_Agricultural_Surface Water,WaDEAZ_WS4,WaDEAZ_RU1,"POLYGON ((401515.8422997798 3762674.077700167,..."
4,PHOENIX AMA,1985,Agricultural,Supply,Unspecified,220629,01/01/1985,12/31/1985,Supply_Annual_Agricultural_Unspecified,WaDEAZ_WS1,WaDEAZ_RU1,"POLYGON ((401515.8422997798 3762674.077700167,..."
...,...,...,...,...,...,...,...,...,...,...,...,...
2303,TUCSON AMA,2018,Industrial,Supply,Unspecified,123,01/01/2018,12/31/2018,Supply_Annual_Industrial_Unspecified,WaDEAZ_WS1,WaDEAZ_RU5,"POLYGON ((506728.9803997681 3630477.361000173,..."
2304,TUCSON AMA,2018,Municipal,Demand,Unspecified,151568,01/01/2018,12/31/2018,Demand_Annual_Municipal_Unspecified,WaDEAZ_WS1,WaDEAZ_RU5,"POLYGON ((506728.9803997681 3630477.361000173,..."
2305,TUCSON AMA,2018,Municipal,Supply,Effluent,13342,01/01/2018,12/31/2018,Supply_Annual_Municipal_Effluent,WaDEAZ_WS2,WaDEAZ_RU5,"POLYGON ((506728.9803997681 3630477.361000173,..."
2306,TUCSON AMA,2018,Municipal,Supply,Groundwater,23246,01/01/2018,12/31/2018,Supply_Annual_Municipal_Groundwater,WaDEAZ_WS3,WaDEAZ_RU5,"POLYGON ((506728.9803997681 3630477.361000173,..."


In [15]:
# Issue of lengthy geometry result exceeding the nvchar(250) limit of an Excel cell.  Using xlsx file to check completeness, and csv as input.

# # Printing file to xlsx. Easier to manually check an xlsx than a csv, but csv loads faster into the WaDE system.
# df.to_excel('RawInputData/P_AZagg.xlsx', index=False)

# Printing file to csv (the path is relative to the current working directory).
df.to_csv('RawInputData/P_AZagg.csv', index=False)