# Working with UT Aggregated Data

Preprocessing input data for a smoother upload experience of the state data to the WaDE 2.0 database.
Using geopandas to read in the shapefiles and converting to WKT for the ReportingUnit geometry.

In [1]:
#Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [2]:
# Set the working directory so the relative input/output paths used in later cells resolve against the raw-data folder.
# NOTE(review): absolute shared-drive path; this cell only runs where the G: drive is mapped this way.
workingDir = "G:/Shared drives/WaDE Data/Utah/AggregatedAmounts/RawInputData"
os.chdir(workingDir)

# Water Budget Data

In [3]:
# Excel (.xlsx) input workbook with the UT water budget data, resolved relative to the working directory.
fileInput = "UT Water Budget Data 11102020.xlsx"

In [4]:
# Load the input workbook into a DataFrame, print its row count, and preview the first record.
df = pd.read_excel(fileInput)
print(len(df))
df.head(1)

13071


Unnamed: 0,AllocationCropDutyAmount,Amount,BeneficialUseCategory,CommunityWaterSupplySystem,CropTypeCV,CustomerTypeCV,DataPublicationDate,Geometry,InterbasinTransferFromID,InterbasinTransferToID,IrrigatedAcreage,Unnamed: 11,MethodUUID,NAICSCodeCV,OrganizationUUID,PopulationServed,PowerGeneratedGWh,PowerType,Unnamed: 18,Unnamed: 19,ReportingUnitName,ReportingUnitNativeID,ReportingUnitTypeCV,ReportYearCV,Unnamed: 24,TimeframeEnd,TimeframeStart,USGSCategoryCV,UT_VariableCV,UT_VariableSpecificCV,WaterSourceID
0,,80061.753287,Agriculture,,,,,,,,,,Consumptive Use Estimate,111000,UTDWRE,,,,,,Beaver,49001,County,2005,,2006-09-30,2005-10-01,Irrigation,Consumptive Use,"Consumptive Use, Irrigation",Fresh_SW_GW


In [5]:
# Build WaterSourceTypeCV using the provided water source dictionary.
# NOTE(review): the original comment credited this dictionary to "ADWR"; confirm the source for this UT dataset.
WSDict = {
    "Fresh_SW_GW": "Surface and Groundwater",
    "Fresh_Groundwater": "Groundwater",
    "Fresh_Surface Water": "Surface Water",
}

def createWSTypeCV(colrowValue):
    """Translate a raw WaterSourceID value into its WaterSourceTypeCV label.

    Parameters
    ----------
    colrowValue : object
        A single cell value from the WaterSourceID column.

    Returns
    -------
    str
        The label mapped in ``WSDict``, or '' when the value is missing,
        blank, or not a key of ``WSDict``.
    """
    if colrowValue == '' or pd.isnull(colrowValue):
        return ''
    # Trim surrounding whitespace so padded cells still match a dictionary key
    # (interior spaces, e.g. in "Fresh_Surface Water", are left alone).
    key = str(colrowValue).strip()
    # dict.get gives the '' fallback for unknown keys without a bare except
    # that would also hide unrelated errors.
    return WSDict.get(key, '')

# Map each raw WaterSourceID to its WaterSourceTypeCV label. Only one column is read,
# so apply the function to that Series directly rather than building a row object per record.
df['tempWST'] = df['WaterSourceID'].apply(createWSTypeCV)
df['tempWST'].unique()

array(['Surface and Groundwater', 'Groundwater', 'Surface Water'],
      dtype=object)

In [6]:
# Creating VariableSpecificCV

def createVariableSpecificCV(VarT, BeUT, WST):
    """Compose the VariableSpecificCV string.

    The result has the form
    ``<variable>_Annual_<beneficial use>_<water source type>``.

    Parameters
    ----------
    VarT : object
        Variable type value (e.g. "Withdrawal").
    BeUT : object
        Beneficial use category value (e.g. "Agriculture").
    WST : object
        Water source type value (e.g. "Groundwater").

    Returns
    -------
    str
        The three inputs, each converted with ``str`` and stripped of
        surrounding whitespace, joined as described above.
    """
    VarT = str(VarT).strip()
    # Rebind BeUT itself: the cleaned value was previously stored in an unused
    # name, so the raw (unstripped, possibly non-string) input was concatenated.
    BeUT = str(BeUT).strip()
    WST = str(WST).strip()

    return VarT + "_Annual_" + BeUT + "_" + WST

# Build in_VariableSpecificCV per record from (variable, beneficial use, water source type);
# the selected column order matches the positional arguments of createVariableSpecificCV.
df['in_VariableSpecificCV'] = df[['UT_VariableCV', 'BeneficialUseCategory', 'tempWST']].apply(
    lambda rec: createVariableSpecificCV(*rec), axis=1)
df['in_VariableSpecificCV'].unique()

array(['Consumptive Use_Annual_Agriculture_Surface and Groundwater',
       'Withdrawal_Annual_Agriculture_Groundwater',
       'Withdrawal_Annual_Agriculture_Surface Water',
       'Consumptive Use_Annual_Agriculture_Surface Water',
       'Withdrawal_Annual_Municipal/Industrial_Groundwater',
       'Withdrawal_Annual_Municipal/Industrial_Surface Water'],
      dtype=object)

In [7]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return "WaDEUT_WS" followed by the string form of ``colrowValue``."""
    return f"WaDEUT_WS{colrowValue!s}"

# One row per distinct raw WaterSourceID value (original row labels retained).
# Note the column is named in_WaterSourceTypeCV although it holds WaterSourceID values.
dfWaterSourceNativeID = (
    df[['WaterSourceID']]
    .rename(columns={'WaterSourceID': 'in_WaterSourceTypeCV'})
    .drop_duplicates()
)

# Running number 1..N for the distinct sources, aligned on the same row labels,
# then turned into the custom native ID string.
dftemp = pd.DataFrame(
    {"Count": range(1, len(dfWaterSourceNativeID.index) + 1)},
    index=dfWaterSourceNativeID.index,
)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(
    lambda rec: assignWaterSourceNativeID(rec['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the custom water source native ID for a raw water source value.

    Returns the first 'in_WaterSourceNativeID' in ``dfWaterSourceNativeID``
    whose 'in_WaterSourceTypeCV' equals ``A``; returns '' when ``A`` is blank,
    null, or has no matching row.
    """
    if (A == '') or (pd.isnull(A)):
        return ''

    matches = dfWaterSourceNativeID.loc[
        dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A,
        'in_WaterSourceNativeID',
    ]
    return '' if matches.empty else matches.iloc[0]

# Attach the custom native ID to every record. The lookup reads only WaterSourceID,
# so apply it to that Series directly rather than building a row object per record.
df['in_WaterSourceNativeID'] = df['WaterSourceID'].apply(retrieveWaterSourceNativeID)
df['in_WaterSourceNativeID'].unique()

array(['WaDEUT_WS1', 'WaDEUT_WS2', 'WaDEUT_WS3'], dtype=object)

# Shapefile Data

In [14]:
# Shapefile input: one layer each for counties, HUC8 basins, and subareas, read relative to the working directory.
# NOTE(review): `crs` is not a documented parameter of gpd.read_file; extra keywords are forwarded to the
# underlying file reader, so this presumably does not reproject the layers. Confirm, and use
# .to_crs("EPSG:4326") after reading if reprojection is the intent.
UT_County_SF = gpd.read_file('UT_Counties_SF/UT_Counties.shp', crs="EPSG:4326")
UT_HUC8_SF = gpd.read_file('UT_HUC8_SF/UT_HUC8.shp', crs="EPSG:4326")
UT_Subarea_SF = gpd.read_file('UT_Subarea_SF/UT_Subarea.shp', crs="EPSG:4326")

In [16]:
# UT, County
# Copy name, native ID, and geometry from the county shapefile into the common RU_* column layout.
###########################################################################
columnsList = ['RU_Name', 'RU_ID', 'RU_Type','geometry']
dfCountyShape = pd.DataFrame(columns=columnsList)
dfCountyShape['RU_Name'] = UT_County_SF['NAME']
dfCountyShape['RU_ID'] = UT_County_SF['State_RU']
# NOTE(review): str.lstrip("46-") removes any leading run of the characters '4', '6' and '-'
# (a character set, not the literal prefix "46-"). The "4" prepended on the next line
# presumably restores a leading digit that this strips; confirm against the raw State_RU values.
dfCountyShape['RU_ID'] = dfCountyShape['RU_ID'].map(lambda x: x.lstrip("46-"))
dfCountyShape['RU_ID'] = "4" + dfCountyShape['RU_ID']
dfCountyShape['RU_Type'] = 'County'
dfCountyShape['geometry'] = UT_County_SF['geometry']
dfCountyShape.head(3)

Unnamed: 0,RU_Name,RU_ID,RU_Type,geometry
0,Beaver,49001,County,"POLYGON ((-114.05049 38.49996, -114.05015 38.5..."
1,Box Elder,49003,County,"POLYGON ((-114.04255 41.21092, -114.04172 41.9..."
2,Duchesne,49013,County,"POLYGON ((-110.90435 40.70150, -110.89777 40.7..."


In [17]:
# UT, HUC8
# Pull name, HUC8 code, and geometry from the HUC8 shapefile into the common RU_* column layout.
###########################################################################
columnsList = ['RU_Name', 'RU_ID', 'RU_Type','geometry']
dfHUC8Shape = pd.DataFrame(
    {
        'RU_Name': UT_HUC8_SF['NAME'],
        'RU_ID': UT_HUC8_SF['HUC8'],
        'RU_Type': 'HUC8',  # constant label, broadcast to every row
        'geometry': UT_HUC8_SF['geometry'],
    },
    columns=columnsList,
)
dfHUC8Shape.head(3)

Unnamed: 0,RU_Name,RU_ID,RU_Type,geometry
0,Meadow Valley Wash,15010013,HUC8,"POLYGON ((-114.03649 37.95701, -114.03647 37.9..."
1,Lower Virgin,15010010,HUC8,"POLYGON ((-114.06750 37.60696, -114.06708 37.6..."
2,Fort Pearce Wash,15010009,HUC8,"POLYGON ((-112.75584 37.06340, -112.75670 37.0..."


In [18]:
# UT, Subarea
# Pull name, native ID, and geometry from the subarea shapefile into the common RU_* column layout.
###########################################################################
columnsList = ['RU_Name', 'RU_ID', 'RU_Type','geometry']
dfSubareaShape = pd.DataFrame(
    {
        'RU_Name': UT_Subarea_SF['RU_Name'],
        'RU_ID': UT_Subarea_SF['RU_ID'],
        'RU_Type': 'Subarea',  # constant label, broadcast to every row
        'geometry': UT_Subarea_SF['geometry'],
    },
    columns=columnsList,
)
dfSubareaShape.head(3)

Unnamed: 0,RU_Name,RU_ID,RU_Type,geometry
0,Curlew Valley,00-01-03,Subarea,"POLYGON ((-112.58699 42.00092, -112.57918 41.9..."
1,Clear Creek,000-01-03,Subarea,"POLYGON ((-113.17778 42.00082, -113.20384 41.9..."
2,Promontory Point,00-07-02,Subarea,"POLYGON ((-112.36394 42.00018, -112.38128 41.9..."


In [19]:
# Concatenate the county, HUC8, and subarea shape dataframes into one table.
# reset_index() is called without drop=True, so each frame's original row labels are kept as an 'index' column.
frames = [dfCountyShape, dfHUC8Shape, dfSubareaShape]
dfAllShape = pd.concat(frames).reset_index()
dfAllShape

Unnamed: 0,index,RU_Name,RU_ID,RU_Type,geometry
0,0,Beaver,49001,County,"POLYGON ((-114.05049 38.49996, -114.05015 38.5..."
1,1,Box Elder,49003,County,"POLYGON ((-114.04255 41.21092, -114.04172 41.9..."
2,2,Duchesne,49013,County,"POLYGON ((-110.90435 40.70150, -110.89777 40.7..."
3,3,Emery,49015,County,"POLYGON ((-111.30701 38.67233, -111.29914 38.6..."
4,4,Grand,49019,County,"POLYGON ((-110.17897 38.90920, -110.17447 38.9..."
...,...,...,...,...,...
240,144,Brigham City,01-01-07,Subarea,"POLYGON ((-112.01361 41.64418, -112.00460 41.6..."
241,145,Cache Valley,01-01-04,Subarea,"POLYGON ((-111.50799 42.00025, -111.50842 41.9..."
242,146,Randolph,01-03-02,Subarea,"POLYGON ((-111.04977 41.80850, -111.05113 41.5..."
243,147,Evanston,01-03-01,Subarea,"MULTIPOLYGON (((-110.72426 40.99222, -110.7332..."


### Inspect Output Data & Export

In [20]:
# Print the dtype of every column in df; the option context lifts pandas' row/column display limits for this block only.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.dtypes)

AllocationCropDutyAmount             float64
Amount                               float64
BeneficialUseCategory                 object
CommunityWaterSupplySystem           float64
CropTypeCV                           float64
CustomerTypeCV                       float64
DataPublicationDate                  float64
Geometry                             float64
InterbasinTransferFromID             float64
InterbasinTransferToID               float64
IrrigatedAcreage                     float64
Unnamed: 11                          float64
MethodUUID                            object
NAICSCodeCV                            int64
OrganizationUUID                      object
PopulationServed                     float64
PowerGeneratedGWh                    float64
PowerType                            float64
Unnamed: 18                          float64
Unnamed: 19                          float64
ReportingUnitName                     object
ReportingUnitNativeID                 object
ReportingU

In [21]:
# Print the dtype of every column in dfAllShape; the option context lifts pandas' row/column display limits for this block only.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfAllShape.dtypes)

index          int64
RU_Name       object
RU_ID         object
RU_Type       object
geometry    geometry
dtype: object


In [22]:
# Export out to CSV. Paths are relative, so the files are written to the current working directory;
# index=False omits the DataFrame row labels from the files.
df.to_csv('P_utAggMaster.csv', index=False) # The output.
dfAllShape.to_csv('P_utGeometry.csv', index=False) # The output geometry.