# Pre-processing Utah Allocation data for WaDEQA upload.
- Purpose:  To pre-process the Utah data into one master file for simple DataFrame creation and extraction

In [None]:
#Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x) # suppress scientific notation in Pandas

In [None]:
# Working Directory
# Switch the process-wide current directory so that the relative file names
# used for reading and writing resolve inside this folder.
workingDir = "G:/Shared drives/WaDE Data/Utah/WaterAllocation/RawInputData"
os.chdir(workingDir)

## Point of Diversion Data

In [None]:
# Input File
FI_PoD = "PointsOfDiversion_input.zip"
dfinPOD = pd.read_csv(FI_PoD, encoding="ISO-8859-1")

# WaDE UUID tracker for data assessment.
# When the column is absent, tag every row with "utD" + its row index and write the
# tagged table back over the input archive so the IDs are present on the next read.
if 'WaDEUUID' not in dfinPOD.columns:
    dfinPOD['WaDEUUID'] = "utD" + dfinPOD.index.astype(str)
    dfinPOD.to_csv(
        FI_PoD,
        index=False,
        compression=dict(method='zip', archive_name='PointsOfDiversion_input.csv'),
    )

print(len(dfinPOD))
dfinPOD.head()

In [None]:
# For creating BeneficialUseCategory
# One-letter use codes -> beneficial use category names.
benUseDict = {
    "I": "Irrigation",
    "S": "Stockwatering",
    "D": "Domestic",
    "M": "Municipal",
    "X": "Mining",
    "P": "Power",
    "O": "Other"}


def assignBenUseCategory(colrowValue):
    """Translate a string of one-letter use codes into category names.

    Each character of the value is looked up in ``benUseDict`` and the names
    are joined with commas in the order the codes appear ("IS" ->
    "Irrigation,Stockwatering").

    Parameters:
        colrowValue: the raw use-code value; may be a string, None or NaN.

    Returns:
        The comma-joined category names, or "WaDE Unspecified" when the value
        is missing or blank.

    Raises:
        KeyError: if the value contains a character that is not a key of
            ``benUseDict`` (unknown codes are not silently dropped).
    """
    # Test for missing values BEFORE converting to text: str(NaN) is "nan" and
    # str(None) is "None", which would be looked up letter by letter and fail.
    if pd.isnull(colrowValue):
        return "WaDE Unspecified"

    codes = str(colrowValue).strip()
    if not codes:
        return "WaDE Unspecified"

    return ",".join(benUseDict[code] for code in codes)


# Map the USES column directly; a row-wise DataFrame.apply would build a Series for
# every row only to read this one field.
dfinPOD['in_BeneficialUseCategory'] = dfinPOD['USES'].apply(assignBenUseCategory)
dfinPOD['in_BeneficialUseCategory'].unique()

In [None]:
# create output POD dataframe
# Every output column is declared once, in output order. Plain values are repeated on
# every row; Series values come straight from the POD input table.
df = pd.DataFrame({
    # Data Assessment UUID
    'WaDEUUID': dfinPOD['WaDEUUID'],

    # Method Info
    'in_MethodUUID': "UTwr_M1",

    # Variable Info
    'in_VariableSpecificUUID': "UTwr_V1",

    # Organization Info
    'in_OrganizationUUID': "UTwr_O1",

    # WaterSource Info
    'in_Geometry': "",
    'in_GNISFeatureNameCV': "",
    'in_WaterQualityIndicatorCV': "",
    'in_WaterSourceName': "WaDE Unspecified",
    'in_WaterSourceNativeID': "",  # auto filled in later
    'in_WaterSourceTypeCV': dfinPOD['TYPE'],

    # Site Info
    'in_CoordinateAccuracy': "WaDE Unspecified",
    'in_CoordinateMethodCV': "WaDE Unspecified",
    'in_County': "WaDE Unspecified",
    'in_EPSGCodeCV': 4326,
    'in_GNISCodeCV': "",
    'in_HUC12': "",
    'in_HUC8': "",
    'in_Latitude': dfinPOD['Latitude'],
    'in_Longitude': dfinPOD['Longitude'],
    'in_NHDNetworkStatusCV': "",
    'in_NHDProductCV': "",
    'in_PODorPOUSite': "POD",
    'in_SiteName': dfinPOD['SOURCE'],
    'in_SiteNativeID': "POD" + dfinPOD['OBJECTID'].replace("", 0).fillna(0).astype(int).astype(str),
    'in_SitePoint': "",
    'in_SiteTypeCV': dfinPOD['SOURCE'],
    'in_StateCV': "UT",
    'in_USGSSiteID': "",

    # AllocationAmount Info
    'in_AllocationApplicationDate': "",
    'in_AllocationAssociatedConsumptiveUseSiteIDs': "",
    'in_AllocationAssociatedWithdrawalSiteIDs': "",
    'in_AllocationBasisCV': "",
    'in_AllocationChangeApplicationIndicator': "",
    'in_AllocationCommunityWaterSupplySystem': "",
    'in_AllocationCropDutyAmount': "",
    'in_AllocationExpirationDate': "",
    'in_AllocationFlow_CFS': dfinPOD['CFS'].astype(float),
    'in_AllocationLegalStatusCV': dfinPOD['STATUS'],
    'in_AllocationNativeID': dfinPOD['WRNUM'].replace("", 0).fillna(0).astype(str),
    'in_AllocationOwner': dfinPOD['OWNER'],
    'in_AllocationPriorityDate': dfinPOD['PRIORITY'],
    'in_AllocationSDWISIdentifierCV': "",
    'in_AllocationTimeframeEnd': "",
    'in_AllocationTimeframeStart': "",
    'in_AllocationTypeCV': "",
    'in_AllocationVolume_AF': dfinPOD['ACFT'],
    'in_BeneficialUseCategory': dfinPOD['in_BeneficialUseCategory'],
    'in_CommunityWaterSupplySystem': "",
    'in_CropTypeCV': "",
    'in_CustomerTypeCV': "",
    'in_DataPublicationDate': "",
    'in_DataPublicationDOI': "",
    'in_ExemptOfVolumeFlowPriority': 0,
    'in_GeneratedPowerCapacityMW': "",
    'in_IrrigatedAcreage': "",
    'in_IrrigationMethodCV': "",
    'in_LegacyAllocationIDs': "",
    'in_OwnerClassificationCV': "",
    'in_PopulationServed': "",
    'in_PowerType': "",
    'in_PrimaryBeneficialUseCategory': "",
    'in_SDWISIdentifierCV': "",
    'in_WaterAllocationNativeURL': dfinPOD['WebLink'],
})

# Keep one copy of each fully identical row and renumber the rows from zero.
outPOD = df.drop_duplicates().reset_index(drop=True)
print(len(outPOD))
outPOD.head()

## Place of Use Data

In [None]:
# Input File
FI_POU = "Utah_Place_of_Use_Irrigation.zip"
dfinPOU = pd.read_csv(FI_POU, encoding="ISO-8859-1")  # Place of Use Input

# WaDE UUID tracker for data assessment.
# When the column is absent, tag every row with "utU" + its row index and write the
# tagged table back over the input archive so the IDs are present on the next read.
if 'WaDEUUID' not in dfinPOU.columns:
    dfinPOU['WaDEUUID'] = "utU" + dfinPOU.index.astype(str)
    dfinPOU.to_csv(
        FI_POU,
        index=False,
        compression=dict(method='zip', archive_name='Utah_Place_of_Use_Irrigation.csv'),
    )

# Remove fully identical rows, then renumber the rows from zero.
dfinPOU = dfinPOU.drop_duplicates()
dfinPOU = dfinPOU.reset_index(drop=True)
print(len(dfinPOU))
dfinPOU.head()

In [None]:
# # I manually solved this to prevent future errors.

# # # Need to split out WRNUMS into their own row
# # # The explode() method explodes lists into separate rows.
# # dfPOUin = dfPOUin.assign(WRNUMS=dfPOUin['WRNUMS'].str.split(',')).explode('WRNUMS').reset_index()
# # dfPOUin = dfPOUin.rename({'WRNUMS': 'WRNUM'}, axis=1)
# # dfPOUin = dfPOUin.replace(np.nan, "").reset_index()
# # print(len(dfPOUin))
# # dfPOUin.head(1)

# dfPOUin = dfPOUin.drop_duplicates().reset_index(drop=True)
# print(len(dfPOUin))

In [None]:
# CFS is not provided in the POU data, so each place of use is assumed to share the
# values of the point of diversion with the same water right number.
# Left join: every POU row is kept; a POU row is repeated once per matching POD row
# and gets empty POD columns when nothing matches.
dfinPOU = dfinPOU.merge(
    outPOD,
    how='left',
    left_on='WRNUM',
    right_on='in_AllocationNativeID',
)
print(len(dfinPOU))
dfinPOU.head(1)

In [None]:
# create output POU dataframe
# Every output column is declared once, in output order. Plain values are repeated on
# every row; Series values come from the merged POU table (the "in_" source columns
# were brought in from the POD data by the merge).
df = pd.DataFrame({
    # Data Assessment UUID
    'WaDEUUID': dfinPOU['WaDEUUID_x'],

    # Method Info
    'in_MethodUUID': "UTwr_M1",

    # Variable Info
    'in_VariableSpecificUUID': "UTwr_V1",

    # Organization Info
    'in_OrganizationUUID': "UTwr_O1",

    # WaterSource Info
    'in_Geometry': "",
    'in_GNISFeatureNameCV': "",
    'in_WaterQualityIndicatorCV': "",
    'in_WaterSourceName': "WaDE Unspecified",
    'in_WaterSourceNativeID': "",  # auto filled in later
    'in_WaterSourceTypeCV': "WaDE Unspecified",

    # Site Info
    'in_CoordinateAccuracy': "WaDE Unspecified",
    'in_CoordinateMethodCV': "WaDE Unspecified",
    'in_County': "WaDE Unspecified",
    'in_EPSGCodeCV': 4326,
    'in_GNISCodeCV': "",
    'in_HUC12': "",
    'in_HUC8': "",
    'in_Latitude': dfinPOU['Latitude'],
    'in_Longitude': dfinPOU['Longitude'],
    'in_NHDNetworkStatusCV': "",
    'in_NHDProductCV': "",
    'in_PODorPOUSite': "POU",
    'in_SiteName': "WaDE Unspecified",
    'in_SiteNativeID': "POU" + dfinPOU['RECORD_ID'].replace("", 0).fillna(0).astype(int).astype(str),
    'in_SitePoint': "",
    'in_SiteTypeCV': "WaDE Unspecified",
    'in_StateCV': "UT",
    'in_USGSSiteID': "",

    # AllocationAmount Info
    'in_AllocationApplicationDate': "",
    'in_AllocationAssociatedConsumptiveUseSiteIDs': "",
    'in_AllocationAssociatedWithdrawalSiteIDs': "",
    'in_AllocationBasisCV': "",
    'in_AllocationChangeApplicationIndicator': "",
    'in_AllocationCommunityWaterSupplySystem': "",
    'in_AllocationCropDutyAmount': "",
    'in_AllocationExpirationDate': "",
    'in_AllocationFlow_CFS': dfinPOU['in_AllocationFlow_CFS'].astype(float),  # from POD data
    'in_AllocationLegalStatusCV': dfinPOU['in_AllocationLegalStatusCV'],  # from POD data
    'in_AllocationNativeID': dfinPOU['WRNUM'].replace("", 0).fillna(0).astype(str),
    'in_AllocationOwner': dfinPOU['in_AllocationOwner'],  # from POD data
    'in_AllocationPriorityDate': dfinPOU['in_AllocationPriorityDate'],  # from POD data
    'in_AllocationSDWISIdentifierCV': "",
    'in_AllocationTimeframeEnd': "",
    'in_AllocationTimeframeStart': "",
    'in_AllocationTypeCV': "",
    'in_AllocationVolume_AF': dfinPOU['in_AllocationVolume_AF'].astype(float),
    'in_BeneficialUseCategory': dfinPOU['in_BeneficialUseCategory'],  # from POD data
    'in_CommunityWaterSupplySystem': "",
    'in_CropTypeCV': "",
    'in_CustomerTypeCV': "",
    'in_DataPublicationDate': "",
    'in_DataPublicationDOI': "",
    'in_ExemptOfVolumeFlowPriority': 0,
    'in_GeneratedPowerCapacityMW': "",
    'in_IrrigatedAcreage': dfinPOU['ACRES'].astype(float),
    'in_IrrigationMethodCV': "",
    'in_LegacyAllocationIDs': "",
    'in_OwnerClassificationCV': "",
    'in_PopulationServed': "",
    'in_PowerType': "",
    'in_PrimaryBeneficialUseCategory': "",
    'in_SDWISIdentifierCV': "",
    'in_WaterAllocationNativeURL': dfinPOU['in_WaterAllocationNativeURL'],  # from POD data
})

# Keep one copy of each fully identical row and renumber the rows from zero.
outPOU = df.drop_duplicates().reset_index(drop=True)
print(len(outPOU))
outPOU.head()

## Concatenate POD and POU Data.  Clean Data.

In [None]:
# Concatenate dataframes
# Stack the POD rows on top of the POU rows, drop fully identical rows, renumber the
# rows from zero and store missing values as empty strings.
frames = [outPOD, outPOU]
outdf = pd.concat(frames).drop_duplicates()
outdf = outdf.reset_index(drop=True)
outdf = outdf.replace(np.nan, "")
print(len(outdf))

In [None]:
# Creating WaterSourceTypeCV

# Native source type text -> WaDE water source type.
# NOTE(review): the key "Abandonded Well" is kept exactly as written; confirm the
# spelling matches the raw data before correcting it.
WaterSourceTypeCVDictionary = {
    "Underground": "Groundwater",
    "Abandonded Well": "Groundwater",
    "Point to Point": "Surface Water",
    "Surface": "Surface Water",
    "Return": "Surface Water",
    "Drain": "Surface Water",
    "Spring": "Surface Water",
    "Rediversion": "Surface Water"}


def CreateWaterSourceTypeCV(val):
    """Map a native source type to its WaDE water source type.

    Parameters:
        val: the native source type; may be a string, None, NaN or any other
            scalar.

    Returns:
        The mapped type, or "WaDE Unspecified" when the value is missing,
        blank, or not a key of ``WaterSourceTypeCVDictionary`` (surrounding
        whitespace is ignored).
    """
    if pd.isnull(val):
        return "WaDE Unspecified"
    # str() keeps non-text scalars from crashing on .strip(); .get() replaces the
    # former catch-all "except" with an explicit fallback for unknown keys.
    return WaterSourceTypeCVDictionary.get(str(val).strip(), "WaDE Unspecified")

# Map the column directly; a row-wise DataFrame.apply would build a Series for every
# row only to read this one field.
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].apply(CreateWaterSourceTypeCV)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# Assign SiteTypeCV value.
# Whole-word keyword search over the free-text site description.
# The order of the entries is important: every keyword is tested and the LAST matching
# site type wins, so later entries override earlier ones.
siteTypeKeywordDictionary = {
    "Canal": ["canal", "canals"],
    "Creek": ["creek"],
    "Ditch": ["ditch"],
    "Drain": ["drain", "drains"],
    "Lake": ["lake"],
    "Pond": ["pond"],
    "Reservoir": ["reservoir"],
    "River": ["river", "fork", "surface"],
    "Slough": ["slough"],
    "Spring": ["spring", "springs", "gulch", "seep"],
    "Tunnel": ["tunnel", "tunnels"],
    "Wash": ["wash"],
    "Well": ["well", "wells", "well:", "draw", "hollow"],
}

# Former shared name, kept so anything that still reads it keeps working. The function
# below deliberately does NOT use it, because the name is rebound later on.
listDictionary = siteTypeKeywordDictionary

# Punctuation that is treated as a word separator before matching.
_siteTypeSeparators = str.maketrans({char: " " for char in ",.;-/()"})


def CreateSiteTypeCV(val):
    """Classify a free-text site description into a site type.

    The text is lower-cased, the punctuation ``, . ; - / ( )`` is replaced by
    spaces, and each keyword is matched as a whole space-delimited word.

    Parameters:
        val: the site description; may be a string, None, NaN or any other
            scalar.

    Returns:
        The site type of the last matching entry in
        ``siteTypeKeywordDictionary``, or "WaDE Unspecified" when the value is
        missing, blank, or contains no keyword.
    """
    if pd.isnull(val):
        return "WaDE Unspecified"

    # Pad with spaces so a keyword at either end still has a space on both sides.
    text = " " + str(val).translate(_siteTypeSeparators).lower().strip() + " "

    outString = "WaDE Unspecified"  # Default
    for siteType, keywords in siteTypeKeywordDictionary.items():
        # No early exit: later entries must be able to override earlier matches.
        if any(" " + keyword + " " in text for keyword in keywords):
            outString = siteType
    return outString

# Map the column directly; a row-wise DataFrame.apply would build a Series for every
# row only to read this one field.
outdf['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].apply(CreateSiteTypeCV)
outdf['in_SiteTypeCV'].unique()

In [None]:
# Assign LegalStatusCV value.
# Substring search of the native status code.
# The order of the entries is important: every code is tested and the LAST matching
# entry wins, so a code that contains another code (ADEC contains DEC) must be listed
# AFTER the shorter one.
# NOTE(review): "STATUS" -> "Deff" is kept exactly as written; confirm the intended label.
legalStatusDictionary = {
    "Lapsed": ["LAP"],
    "Adverse Use Claim": ["ADV"],
    "Approved": ["APP"],
    "Certificated": ["CERT"],
    "Decree": ["DEC"],
    "Adjudication Decree": ["ADEC"],  # after "Decree", otherwise ADEC is reported as Decree
    "Diligence Claim": ["DIL"],
    "Disallowed": ["DIS"],
    "Expired": ["EXP"],
    "Forfeited": ["FORF"],
    "No Proof Required": ["NPR"],
    "Nonuse": ["NUSE"],
    "Perfected": ["PERF"],
    "Rejected": ["REJ"],
    "Renumbered": ["RNUM"],
    "Deff": ["STATUS"],
    "Temp Applications": ["TEMP"],
    "Terminated": ["TERM"],
    "Underground Water Claim": ["UGWC"],
    "Unapproved": ["UNAP"],
    "Withdrawn": ["WD"],
    "Water User`s Claim": ["WUC"],
}

# Former shared name, kept so anything that still reads it keeps working. The function
# below uses its own dedicated dictionary instead.
listDictionary = legalStatusDictionary


def CreateLegalStatus(val):
    """Translate a native legal status code into its descriptive label.

    Parameters:
        val: the native status code; may be a string, None, NaN or any other
            scalar.

    Returns:
        The label of the last entry in ``legalStatusDictionary`` whose code
        occurs in the value, or "WaDE Unspecified" when the value is missing,
        blank, or contains no known code.
    """
    # Test for missing values before converting to text (str(NaN) is "nan").
    if pd.isnull(val):
        return "WaDE Unspecified"

    text = str(val).strip()
    outString = "WaDE Unspecified"
    for legalStatus, codes in legalStatusDictionary.items():
        # No early exit: later entries must be able to override earlier matches.
        if any(code in text for code in codes):
            outString = legalStatus
    return outString

# Map the column directly; a row-wise DataFrame.apply would build a Series for every
# row only to read this one field.
outdf['in_AllocationLegalStatusCV'] = outdf['in_AllocationLegalStatusCV'].apply(CreateLegalStatus)
outdf['in_AllocationLegalStatusCV'].unique()

In [None]:
# Clean Owner info. Remove special characters
import re

def cleanOwnerDataFunc(Val):
    """Strip special characters from an owner name and title-case it.

    The characters ``$ @ & . ; , / ) ( -`` are removed (not replaced by a
    space), the result is title-cased, and surrounding whitespace is trimmed.

    Parameters:
        Val: the owner name; may be a string, None, NaN or any other scalar.

    Returns:
        The cleaned name; an empty string when the value is missing.
    """
    if pd.isnull(Val):
        return ""
    # Raw string: the former pattern relied on the invalid string escape "\)".
    return re.sub(r"[$@&.;,/)(-]", "", str(Val)).title().strip()

# Map the column directly; a row-wise DataFrame.apply would build a Series for every
# row only to read this one field.
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].apply(cleanOwnerDataFunc)
outdf['in_AllocationOwner'].unique()

In [None]:
# Fixing empty string names

def fixEmptyString(val):
    """Replace a blank-like value with "WaDE Unspecified".

    Blank-like means the empty string, a single space, the text "nan", or a
    pandas null. Any other value is returned unchanged.
    """
    isBlank = val in ("", " ", "nan") or pd.isnull(val)
    return "WaDE Unspecified" if isBlank else val

In [None]:
# Fill blank water source names; the column is mapped directly instead of row by row.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].apply(fixEmptyString)
outdf['in_WaterSourceName'].unique()

In [None]:
# Fill blank water source types; the column is mapped directly instead of row by row.
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].apply(fixEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# Fill blank site names; the column is mapped directly instead of row by row.
outdf['in_SiteName'] = outdf['in_SiteName'].apply(fixEmptyString)
outdf['in_SiteName'].unique()

In [None]:
# Fill blank owner names; the column is mapped directly instead of row by row.
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].apply(fixEmptyString)
outdf['in_AllocationOwner'].unique()

In [None]:
# Fill blank beneficial use categories; the column is mapped directly instead of row by row.
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].apply(fixEmptyString)
outdf['in_BeneficialUseCategory'].unique()

In [None]:
# in_Latitude & in_Longitude
# Values that cannot be read as numbers become NaN under errors='coerce' and are then
# stored as 0.
for coordinateColumn in ('in_Latitude', 'in_Longitude'):
    outdf[coordinateColumn] = pd.to_numeric(outdf[coordinateColumn], errors='coerce').fillna(0)
outdf.head(1)

In [None]:
# Changing datatype of used date fields.
# Step 1: parse the raw values; anything unparseable becomes NaT.
# Step 2: format as month/day/year text and parse again, which leaves only the date part.
priorityDates = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors='coerce')
priorityDateText = priorityDates.dt.strftime('%m/%d/%Y')
outdf['in_AllocationPriorityDate'] = pd.to_datetime(priorityDateText)
outdf['in_AllocationPriorityDate'].unique()

In [None]:
# Fixing in_AllocationFlow_CFS datatype
# Non-numeric entries become NaN under errors='coerce' and are then stored as 0.
flowCFS = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce')
outdf['in_AllocationFlow_CFS'] = flowCFS.fillna(0)
outdf['in_AllocationFlow_CFS'].unique()

In [None]:
# Fixing in_AllocationVolume_AF datatype
# Non-numeric entries become NaN under errors='coerce' and are then stored as 0.
volumeAF = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce')
outdf['in_AllocationVolume_AF'] = volumeAF.fillna(0)
outdf['in_AllocationVolume_AF'].unique()

In [None]:
# Fixing in_IrrigatedAcreage datatype
# Non-numeric entries become NaN under errors='coerce' and are then stored as 0.
irrigatedAcres = pd.to_numeric(outdf['in_IrrigatedAcreage'], errors='coerce')
outdf['in_IrrigatedAcreage'] = irrigatedAcres.fillna(0)
outdf['in_IrrigatedAcreage'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source ID: "wadeID" followed by the value as text."""
    return "wadeID" + str(colrowValue)

# One row per distinct water source type (first occurrence kept, original row labels kept).
dfWaterSourceNativeID = outdf['in_WaterSourceTypeCV'].to_frame().drop_duplicates()

# Running counter 1..N aligned to those rows; it becomes the numeric part of the ID.
dftemp = pd.DataFrame(
    {"Count": range(1, len(dfWaterSourceNativeID) + 1)},
    index=dfWaterSourceNativeID.index,
)
# Map the counter column directly instead of applying a lambda to every row.
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp['Count'].apply(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the custom native ID assigned to water source type ``A``.

    Searches ``dfWaterSourceNativeID`` for rows whose 'in_WaterSourceTypeCV'
    equals ``A`` and returns the first 'in_WaterSourceNativeID' found.
    Returns '' when ``A`` is blank or null, or when no row matches.
    """
    if A == '' or pd.isnull(A):
        return ''

    sameType = dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A
    matches = dfWaterSourceNativeID.loc[sameType, 'in_WaterSourceNativeID']
    return '' if matches.empty else matches.iloc[0]

# Map the column directly; a row-wise DataFrame.apply would build a Series for every
# row only to read this one field.
outdf['in_WaterSourceNativeID'] = outdf['in_WaterSourceTypeCV'].apply(retrieveWaterSourceNativeID)
outdf['in_WaterSourceNativeID'].unique()

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [None]:
# PoU Shapefile Data
# Shapefile input
# The path is relative, so it resolves against the current working directory.
dfPoUshapetemp = gpd.read_file('Utah_Place_of_Use_Irrigation/Utah_Place_of_Use_Irrigation.shp')
print(len(dfPoUshapetemp))
dfPoUshapetemp.head(1)

In [None]:
# Build a two-column table pairing each site native ID with its geometry.
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)
# Same formula as the POU table's in_SiteNativeID ("POU" + integer RECORD_ID), so the
# IDs in both tables are built identically.
dfPoUshape['in_SiteNativeID'] = "POU" + dfPoUshapetemp['RECORD_ID'].replace("", 0).fillna(0).astype(int).astype(str)
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
# Drops rows that repeat an earlier row in every column; the row labels are left as-is.
dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
dfPoUshape.head(3)

## Export Data

In [None]:
# info is a method: without the parentheses the cell only echoes the bound method
# object instead of printing the column/dtype summary.
outdf.info()

In [None]:
outdf

In [None]:
# Export the output dataframe
# Both archives name their CSV member explicitly. Without archive_name pandas derives
# the member name from the archive file name, so it would not carry a .csv extension.
outdf.to_csv('Pwr_utMain.zip', compression=dict(method='zip', archive_name='Pwr_utMain.csv'), index=False)  # The output, save as a zip
dfPoUshape.to_csv('P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.