# Working with AZ Aggregated Data

Pre-processing input data for a smoother upload experience of the state data to the WaDE 2.0 database.
Using geopandas to read in the shp file, and converting it to WKT for the ReportingUnit geometry.

In [None]:
#Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [None]:
# Point the process at the raw-input folder so the relative file names used
# by the later cells resolve against it.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
workingDir = "G:/Shared drives/WaDE Data/Arizona/AggregatedAmounts/RawInputData"
os.chdir(workingDir)

## Input Files

In [None]:
# Excel input file (the workbook is read with pd.read_excel, so it is not a CSV).
fileInput = "AMA Demand Supply from DW_use as input.xlsx"
df = pd.read_excel(fileInput)
print(len(df))  # number of rows read
df.head(1)  # preview the first row

In [None]:
# Print the dtype of every column, with pandas' row/column display limits
# lifted for the duration of this block only.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.dtypes)

In [None]:
# WaterSourceType.  Use the BUDGET ELEMENT Use to help determine this.

# Substrings that identify each water source type.  Matching is a
# case-sensitive substring test, which is why several spellings are listed.
listOfSurfaceWater = ['Surface Water', 'Surface water', "Streambed"]
listOfGroundwater = ['Groundwater', 'Ground  Water', 'Well', 'well', 'Wells']
listOfEffluent = ['Effluent', 'EFFLUENT']


def defineWaterSourceType(colrowValue):
    """Classify a budget-element label as a water source type.

    Parameters
    ----------
    colrowValue : object
        The label to classify; normally a string, but empty, null and
        non-text values are accepted.

    Returns
    -------
    str
        "Surface Water", "Groundwater" or "Effluent" when the label contains
        one of the corresponding keywords (checked in that order, first match
        wins); otherwise "Unspecified".
    """
    if colrowValue == '' or pd.isnull(colrowValue):
        return 'Unspecified'

    # Order matters: a label that mentions several source types is assigned
    # to the first group that matches.
    keywordGroups = (
        ("Surface Water", listOfSurfaceWater),
        ("Groundwater", listOfGroundwater),
        ("Effluent", listOfEffluent),
    )
    try:
        for sourceType, keywords in keywordGroups:
            if any(word in colrowValue for word in keywords):
                return sourceType
    except TypeError:
        # Non-text, non-null values (e.g. numbers) cannot be searched for a
        # substring; treat them as unclassified instead of failing the run.
        return 'Unspecified'
    return 'Unspecified'

# Classify every record from its 'BUDGET ELEMENT' text.  The function is
# applied to that single column rather than row by row (DataFrame.apply with
# axis=1), which would build a Series per record just to read one field.
df['in_WaterSourceType'] = df['BUDGET ELEMENT'].apply(defineWaterSourceType)
df['in_WaterSourceType'].unique()

In [None]:
# Drop the columns we don't need.
df = df.drop(columns=['PARENT WATER TYPE OR SECTOR', 'BUDGET ELEMENT'])
df.head(1)

In [None]:
# Keep only the rows whose CATEGORY is 'Demand' or 'Supply'; every other
# category is discarded.
df = df[df['CATEGORY'].isin(['Demand', 'Supply'])]
print(len(df))
df.head(1)

In [None]:
# Sum QUANTITY for each (AMA, YEAR, SECTOR, CATEGORY, water source type)
# combination, giving a flat frame with one row per combination.
df = df.groupby(['AMA', 'YEAR', 'SECTOR', 'CATEGORY', 'in_WaterSourceType'], as_index=False)['QUANTITY'].sum()
print(len(df))
df.head(1)

In [None]:
# TimeframeStart & TimeframeEnd: "01/01/" and "12/31/" followed by the
# string form of each record's YEAR.
_yearText = df['YEAR'].astype(str)
df['in_TimeframeStart'] = '01/01/' + _yearText
df['in_TimeframeEnd'] = '12/31/' + _yearText

## WaDE Custom Elements (due to missing reporting unit and water source info)

In [None]:
# Creating VariableSpecific

def createVariableSpecificCV(Cat, Sec, WST):
    """Build the VariableSpecificCV label "<Cat>_Annual_<Sec>_<WST>".

    Each argument is converted with str() and stripped of surrounding
    whitespace before the parts are joined with underscores.
    """
    category, sector, sourceType = (str(part).strip() for part in (Cat, Sec, WST))
    return "_".join([category, "Annual", sector, sourceType])

# Build the VariableSpecificCV label of every record from its category,
# sector and water source type, then show the distinct labels once.
df['in_VariableSpecificCV'] = [
    createVariableSpecificCV(category, sector, sourceType)
    for category, sector, sourceType in zip(df['CATEGORY'], df['SECTOR'], df['in_WaterSourceType'])
]
df['in_VariableSpecificCV'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return "WaDEAZ_WS" followed by the string form of colrowValue."""
    return "WaDEAZ_WS" + str(colrowValue)

# One row per distinct water source type (first occurrence kept, original
# row labels preserved).
dfWaterSourceNativeID = pd.DataFrame({'in_WaterSourceType': df['in_WaterSourceType']}).drop_duplicates()

# Number the distinct types 1..N and turn each number into a native ID.
# The counter shares the lookup table's index so the assignment lines up row
# for row; the ID function is applied to the one column directly instead of
# row by row.
dftemp = pd.DataFrame({"Count": range(1, len(dfWaterSourceNativeID.index) + 1)}, index=dfWaterSourceNativeID.index)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp["Count"].apply(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the native ID assigned to water source type A.

    Returns '' when A is empty or null, or when dfWaterSourceNativeID has no
    row whose 'in_WaterSourceType' equals A; otherwise returns the
    'in_WaterSourceNativeID' of the first matching row.
    """
    if (A == '') or pd.isnull(A):
        return ''
    typeMatches = dfWaterSourceNativeID['in_WaterSourceType'] == A
    matchedIDs = dfWaterSourceNativeID.loc[typeMatches, 'in_WaterSourceNativeID']
    return '' if matchedIDs.empty else matchedIDs.iloc[0]

# Attach the native ID of each record's water source type.  The lookup is
# applied to the single column rather than row by row (DataFrame.apply with
# axis=1).
df['in_WaterSourceNativeID'] = df['in_WaterSourceType'].apply(retrieveWaterSourceNativeID)
df['in_WaterSourceNativeID'].unique()

In [None]:
# Creating WaDE Custom reportingunit native ID for easy site identification
# ----------------------------------------------------------------------------------------------------

# Create temp ReportingUnitNativeID dataframe of unique reporting units.
def assignReportingUnitNativeID(colrowValue):
    """Return "WaDEAZ_RU" followed by the string form of colrowValue."""
    return "WaDEAZ_RU" + str(colrowValue)

# One row per distinct AMA name (first occurrence kept, original row labels
# preserved); the AMA name serves as the reporting unit name.
dfReportingUnitNativeID = pd.DataFrame({'in_ReportingUnitName': df['AMA']}).drop_duplicates()

# Number the distinct reporting units 1..N and turn each number into a native
# ID.  The counter shares the lookup table's index so the assignment lines up
# row for row; the ID function is applied to the one column directly instead
# of row by row.
dftemp = pd.DataFrame({"Count": range(1, len(dfReportingUnitNativeID.index) + 1)}, index=dfReportingUnitNativeID.index)
dfReportingUnitNativeID['in_ReportingUnitNativeID'] = dftemp["Count"].apply(assignReportingUnitNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom reportingunit native ID
def retrieveReportingUnitNativeID(A):
    """Look up the native ID assigned to reporting unit name A.

    Returns '' when A is empty or null, or when dfReportingUnitNativeID has
    no row whose 'in_ReportingUnitName' equals A; otherwise returns the
    'in_ReportingUnitNativeID' of the first matching row.
    """
    if (A == '') or pd.isnull(A):
        return ''
    nameMatches = dfReportingUnitNativeID['in_ReportingUnitName'] == A
    matchedIDs = dfReportingUnitNativeID.loc[nameMatches, 'in_ReportingUnitNativeID']
    return '' if matchedIDs.empty else matchedIDs.iloc[0]

# Attach the native ID of each record's AMA.  The lookup is applied to the
# single column rather than row by row (DataFrame.apply with axis=1).
df['in_ReportingUnitNativeID'] = df['AMA'].apply(retrieveReportingUnitNativeID)
df['in_ReportingUnitNativeID'].unique()

In [None]:
# Distinct AMA names present in the aggregated data.
df['AMA'].unique()

# Shapefile Data

In [None]:
# Shapefile input: read the AMA / INA shapefile with geopandas.
# NOTE(review): `crs` is not a named parameter of geopandas.read_file; it is
# passed along as an extra keyword to the underlying reader, so this call
# presumably does not reproject the data to EPSG:4326.  If WGS84 geometry is
# required, confirm the shapefile's CRS and reproject explicitly
# (e.g. GeoDataFrame.to_crs) — TODO confirm.
AZ_AMA = gpd.read_file('AMA_and_INA-shp/AMA_and_INA2.shp', crs="EPSG:4326")
dfshape = pd.DataFrame(AZ_AMA)  # plain pandas DataFrame built from the GeoDataFrame
print(len(dfshape))  # number of shapes read
dfshape

In [None]:
# Match shapefile records to the reporting unit native IDs created above.
def retrieveRUID(colrowValue, df):
    """Return the reporting unit native ID of the AMA named colrowValue.

    Looks in df for rows whose 'AMA' equals colrowValue and returns the
    'in_ReportingUnitNativeID' of the first one, or '' when no row matches.
    """
    matchedIDs = df.loc[df['AMA'] == colrowValue, 'in_ReportingUnitNativeID']
    if matchedIDs.empty:
        return ''
    return matchedIDs.iloc[0]

# Tag each shapefile record with the reporting unit native ID whose AMA name
# equals its BASIN_NAME ('' when nothing matches).  The lookup is applied to
# the single column rather than row by row (DataFrame.apply with axis=1).
dfshape['in_ReportingUnitNativeID'] = dfshape['BASIN_NAME'].apply(lambda basinName: retrieveRUID(basinName, df))
dfshape

In [None]:
# AZ, AMA
# Keep the reporting unit native ID and geometry of the shapes that were
# matched to a reporting unit (non-empty ID).
###########################################################################
columnsList = ['in_ReportingUnitNativeID', 'geometry']
dfgAZ = pd.DataFrame({column: dfshape[column] for column in columnsList})
dfgAZ = dfgAZ.loc[dfgAZ['in_ReportingUnitNativeID'].ne("")].reset_index(drop=True)
dfgAZ

In [None]:
# Export out to CSV.  The relative file names resolve against the current
# working directory; index=False leaves the row index out of the files.
df.to_csv('P_AZagg.csv', index=False) # The output.
dfgAZ.to_csv('P_agGeometry.csv', index=False) # The output geometry.