# Pre-processing Utah Allocation data for WaDEQA upload.
- Purpose:  To pre-process the Utah data into one master file for simple DataFrame creation and extraction

In [None]:
# Needed Libraries
import os
import re
import numpy as np
import pandas as pd
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

# Working Directory
# The process switches into this folder, so the relative input/output paths
# used below are resolved against it.
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Utah/WaterAllocation/RawInputData"
os.chdir(workingDir)

from pyproj import Transformer, transform
# Build the coordinate transformer once so the same object can be reused.
# NOTE(review): nothing in the visible part of this file calls `transformer`
# or `transform` — confirm they are still needed.
transformer = Transformer.from_proj(26912, 4326)  # A trick to drastically optimize the Transformer of pyproj.
# Utah projection = EPSG:26912.  WGS84 projection used by WaDE 2.0 = epsg:4326.

## Point of Diversion Data

In [None]:
# Input Files
# Relative paths of the CSV inputs for the Point of Diversion section.
FI_PoD = "Point of Diversion/PointsOfDiversion_input.csv"  # point of diversion records
FI_WMs = "Point of Diversion/WRCHEX_WATER_MASTER.csv"  # water master table
FI_Irr = "Point of Diversion/IRRIGATION_MASTER.csv"  # irrigation table
FI_Mun = "Point of Diversion/WTRUSE_MUNICIPAL.csv"  # municipal use table
FI_Pow = "Point of Diversion/WTRUSE_POWER.csv"  # power use table

In [None]:
# Dataframe creation
# Every input CSV is read with the ISO-8859-1 encoding.
dfPODin = pd.read_csv(FI_PoD, encoding = "ISO-8859-1") # Point of Diversion input
dfWMain = pd.read_csv(FI_WMs, encoding = "ISO-8859-1") # Water master input (WRCHEX_WATER_MASTER.csv)
dfIrrin = pd.read_csv(FI_Irr, encoding = "ISO-8859-1") # Irrigation input
dfMunin = pd.read_csv(FI_Mun, encoding = "ISO-8859-1") # Municipal input
dfPowin = pd.read_csv(FI_Pow, encoding = "ISO-8859-1") # Power input
dfPOD = pd.DataFrame() # Output placeholder; reassigned by the merge in the next cell

In [None]:
# Attach the water master, irrigation, municipal and power columns to the
# point of diversion records.  Each step is a left-join on WRNUM, so every
# row of dfPODin is kept even when a lookup table has no match for it.
dfPOD = (
    dfPODin
    .merge(dfWMain[['WRNUM', 'DATE_FILED', 'DATE_TERMINATED', 'IRRIGATION_DEPLETION']], on='WRNUM', how='left')
    .merge(dfIrrin[['WRNUM', 'USE_END_DATE', 'USE_BEG_DATE']], on='WRNUM', how='left')
    .merge(dfMunin[['WRNUM', 'MUNICIPALITY']], on='WRNUM', how='left')
    .merge(dfPowin[['WRNUM', 'POWER_CAPACITY']], on='WRNUM', how='left')
)
print(len(dfPOD))  # row count after the joins
dfPOD.head(3)

In [None]:
# Assign PODorPOUSite value
# Every row of dfPOD is tagged "POD" in the in_PODorPOUSite column.
dfPOD['in_PODorPOUSite'] = "POD"

In [None]:
# Show the POD frame as the cell output for a visual check.
dfPOD

## Place of Use Data

In [None]:
# Input Files
# Relative path of the Place of Use input CSV.
FI_POU = "Place of Use/Utah_Place_of_Use_input.csv"

In [None]:
# Dataframe creation
dfPOU = pd.DataFrame() # Output placeholder; reassigned once the POU merges run
dfPOUin = pd.read_csv(FI_POU, encoding = "ISO-8859-1") # Place of Use input, read as ISO-8859-1
print(len(dfPOUin))  # number of rows read
dfPOUin

In [None]:
# Remove empty WRNUMS rows, can't match those to anything.
def emptyWRNUMS(val):
    """Return the WRNUMS cell as cleaned text, or "" when the cell is empty.

    Missing values (None / NaN) are returned as "" so that the caller's
    `!= ''` filter really removes them; passing them through str() would
    yield the text "nan" (or "None"), which looks like a real value.
    Surrounding whitespace and trailing commas are removed from everything
    else.
    """
    if pd.isnull(val):
        return ""
    val = str(val).strip()
    val = val.rstrip(",")  # strip trailing commas
    # A space sitting in front of the removed commas would otherwise survive.
    return val.strip()

# Clean every WRNUMS entry, then keep only the rows whose entry is not blank
# and renumber the surviving rows from 0.
dfPOUin['WRNUMS'] = dfPOUin['WRNUMS'].map(emptyWRNUMS)
dfPOUin = dfPOUin[dfPOUin['WRNUMS'] != ''].reset_index(drop=True)

In [None]:
# Give every water right number its own row: split the comma-separated WRNUMS
# text into a list, explode the list into one row per element, then rename
# the column to WRNUM.
# NOTE(review): the split pieces are not stripped, so a value written as
# "a, b" yields " b" — confirm the input never has spaces after the commas.
dfPOUin = (
    dfPOUin
    .assign(WRNUMS=dfPOUin['WRNUMS'].str.split(','))
    .explode('WRNUMS')
    .rename(columns={'WRNUMS': 'WRNUM'})
)
dfPOUin

In [None]:
# Attach the water master, irrigation, municipal and power columns to the
# place of use records.  Each step is a left-join on WRNUM, so every row of
# dfPOUin is kept even when a lookup table has no match for it.
dfPOU = (
    dfPOUin
    .merge(dfWMain[['WRNUM', 'DATE_FILED', 'DATE_TERMINATED', 'IRRIGATION_DEPLETION']], on='WRNUM', how='left')
    .merge(dfIrrin[['WRNUM', 'USE_END_DATE', 'USE_BEG_DATE']], on='WRNUM', how='left')
    .merge(dfMunin[['WRNUM', 'MUNICIPALITY']], on='WRNUM', how='left')
    .merge(dfPowin[['WRNUM', 'POWER_CAPACITY']], on='WRNUM', how='left')
)
print(len(dfPOU))  # row count after the joins
dfPOU.head(3)

In [None]:
# Assign PODorPOUSite value
# Every row of dfPOU is tagged "POU" in the in_PODorPOUSite column.
dfPOU['in_PODorPOUSite'] = "POU"

In [None]:
# Show the POU frame as the cell output for a visual check.
dfPOU

## Concatenate POD and POU Data

In [None]:
# Concatenate
# Stack the POD and POU records into one frame.  ignore_index=True gives the
# result a fresh 0..n-1 index; without it the row labels of dfPOD and dfPOU
# are both kept and therefore repeat, which makes label-based selection on
# dfout ambiguous.
frames = [dfPOD, dfPOU]
dfout = pd.concat(frames, ignore_index=True)

# Replace every NaN with an empty string.
dfout = dfout.replace(np.nan, "", regex=True)

print(len(dfout))  # total number of POD + POU rows
dfout

In [None]:
# Convert the CFS, ACFT and IRRIGATION_DEPLETION columns to numeric dtype.
# errors='coerce' turns anything that cannot be parsed into NaN.
for numericColumn in ['CFS', 'ACFT', 'IRRIGATION_DEPLETION']:
    dfout[numericColumn] = pd.to_numeric(dfout[numericColumn], errors='coerce')

In [None]:
# Changing datatype of used date fields.
# Each column is parsed to datetime64; errors='coerce' turns entries that
# cannot be parsed into NaT.  dt.normalize() then sets the time of day to
# midnight so only the calendar date remains.  This is done directly on the
# datetime values instead of formatting them to text and parsing that text
# again.
for dateColumn in ['PRIORITY', 'DATE_FILED', 'DATE_TERMINATED']:
    dfout[dateColumn] = pd.to_datetime(dfout[dateColumn], errors='coerce').dt.normalize()

In [None]:
# Creating WaterSourceTypeCV

# Lookup from the raw TYPE text to the water source type written to the
# output.  Keys are matched exactly (after trimming whitespace), so their
# spelling has to stay as it is.
WaterSourceTypeCVDictionary = {
    "Underground": "Groundwater",
    "Abandonded Well": "Groundwater",
    "Point to Point": "Surface Water",
    "Surface": "Surface Water",
    "Return": "Surface Water",
    "Drain": "Surface Water",
    "Spring": "Surface Water",
    "Rediversion": "Surface Water",
}


def CreateWaterSourceTypeCV(val):
    """Return the water source type for a raw TYPE value.

    Blank, missing (None / NaN) and unrecognised values all give
    "Unspecified".  The value is converted with str() before trimming so a
    non-text cell is treated as unrecognised instead of raising.
    """
    if pd.isnull(val):
        return "Unspecified"
    # dict.get handles the "no such key" case without a catch-all except.
    return WaterSourceTypeCVDictionary.get(str(val).strip(), "Unspecified")

# Translate every TYPE value into its water source type.
dfout['in_WaterSourceTypeCV'] = dfout['TYPE'].map(CreateWaterSourceTypeCV)
dfout

In [None]:
#Compiling 'AllocationTimeframeStart' & 'AllocationTimeframeEnd'
#Both can have a string format for WaDE 2.0.
    
def assignTime(colrowValue):
    """Convert a month/day number into an "MM/DD" string.

    The digits of the value are read as MDD or MMDD (401 -> "04/01",
    1015 -> "10/15"); a two-digit value is read as MD (41 -> "04/01").
    The value may arrive as an int, a float, or text in either form
    ("401" or "401.0"); all of them give the same result.  Working from the
    number rather than from the length of its text form means the result no
    longer depends on the value having been stringified with a trailing ".0".

    Returns "" for blank or missing input, for text that is not a number, for
    numbers with a fractional part, and for numbers outside 10..9999.
    The month and day are not checked against the calendar.
    """
    if pd.isnull(colrowValue):
        return ""
    text = str(colrowValue).strip()
    if text == "":
        return ""
    try:
        number = float(text)
    except ValueError:
        return ""
    # is_integer() is False for nan and inf as well as for fractions.
    if not number.is_integer():
        return ""

    mmdd = int(number)
    if 10 <= mmdd <= 99:
        # Two digits: one for the month, one for the day.
        month, day = divmod(mmdd, 10)
    elif 100 <= mmdd <= 9999:
        # Last two digits are the day, whatever precedes them is the month.
        month, day = divmod(mmdd, 100)
    else:
        return ""
    return "{:02d}/{:02d}".format(month, day)


# Start comes from USE_BEG_DATE, end from USE_END_DATE; both go through assignTime.
dfout['in_AllocationTimeframeStart'] = dfout['USE_BEG_DATE'].map(assignTime)
dfout['in_AllocationTimeframeEnd'] = dfout['USE_END_DATE'].map(assignTime)

In [None]:
# Assign SiteTypeCV value.
# Each site type has a list of keywords that are searched for, as whole
# space-separated words, in the SOURCE text.
# The order in which the types are added to the dictionary is important: when
# several types match, the one added last wins, so a more specific type has to
# be added after a more generic one.

# Create the Lists
canalList = ["canal", "canals"]
creekList = ["creek"]
ditchList = ["ditch"]
drainList = ["drain", "drains"]
lakeList = ["lake"]
pondList = ["pond"]
reservoirList = ["reservoir"]
riverList = ["river", "fork", "surface"]
sloughList = ["slough"]
springList = ["spring", "springs", "gulch", "seep"]
tunnelList = ["tunnel", "tunnels"]
washList = ["wash"]
wellList = ["well", "wells", "well:", "draw", "hollow"]

# Making the dictionary.
# It has its own name because the legal-status cell further down rebinds
# listDictionary; CreateSiteTypeCV must keep searching the site-type keywords
# no matter which cell ran last.
siteTypeDictionary = {}
siteTypeDictionary["Canal"] = canalList
siteTypeDictionary["Creek"] = creekList
siteTypeDictionary["Ditch"] = ditchList
siteTypeDictionary["Drain"] = drainList
siteTypeDictionary["Lake"] = lakeList
siteTypeDictionary["Pond"] = pondList
siteTypeDictionary["Reservoir"] = reservoirList
siteTypeDictionary["River"] = riverList
siteTypeDictionary["Slough"] = sloughList
siteTypeDictionary["Spring"] = springList
siteTypeDictionary["Tunnel"] = tunnelList
siteTypeDictionary["Wash"] = washList
siteTypeDictionary["Well"] = wellList
listDictionary = siteTypeDictionary  # previous name, kept so existing references still resolve


def CreateSiteTypeCV(val):
    """Return the site type whose keyword appears in the SOURCE text *val*.

    The characters , . ; - / ( ) are treated as word separators and the
    comparison ignores case.  A keyword only counts when it stands as a whole
    word between spaces.  If keywords of several types are present, the type
    added last to siteTypeDictionary is returned.

    Blank or missing (None / NaN) input, and text without any keyword, give
    "Unspecified".  The value is converted with str() first, so a non-text
    cell is searched like any other text instead of raising.
    """
    if pd.isnull(val):
        return "Unspecified"
    text = str(val)
    if text == '':
        return "Unspecified"

    # Cleaning text / simple search format
    for separator in ",.;-/()":
        text = text.replace(separator, " ")
    # Pad with spaces so the first and last word can match " word " as well.
    text = " " + text.lower().strip() + " "

    outString = "Unspecified"  # Default
    for siteType, keywords in siteTypeDictionary.items():
        if any(" " + keyword + " " in text for keyword in keywords):
            outString = siteType  # later types deliberately override earlier ones
    return outString

# Derive the site type of every row from its SOURCE text.
dfout['in_SiteTypeCV'] = dfout['SOURCE'].map(CreateSiteTypeCV)

In [None]:
# Assign LegalStatusCV value.
# Each legal status has a list of codes that are searched for, as substrings,
# in the STATUS text.
# When several codes are found, a code that is only a fragment of a longer
# code that was also found is ignored ("DEC" inside "ADEC"); among the
# remaining ones the status added last to the dictionary wins.

# Create the Lists
ADECList = ["ADEC"]
ADVList = ["ADV"]
APPList = ["APP"]
CERTList = ["CERT"]
DECList = ["DEC"]
DILList = ["DIL"]
DISList = ["DIS"]
EXPList = ["EXP"]
FORFList = ["FORF"]
LAPList = ["LAP"]
NPRList = ["NPR"]
NUSEList = ["NUSE"]
PERFList = ["PERF"]
REJList = ["REJ"]
RNUMList = ["RNUM"]
STATUSList = ["STATUS"]
TEMPList = ["TEMP"]
TERMList = ["TERM"]
UGWCList = ["UGWC"]
UNAPList = ["UNAP"]
WDList = ["WD"]
WUCList = ["WUC"]


# Making the dictionary.
# It has its own name so that CreateLegalStatus does not depend on the shared
# listDictionary name, which another cell also assigns.
legalStatusDictionary = {}

legalStatusDictionary["Lapsed"] = LAPList

legalStatusDictionary["Adjudication Decree"] = ADECList
legalStatusDictionary["Adverse Use Claim"] = ADVList
legalStatusDictionary["Approved"] = APPList
legalStatusDictionary["Certificated"] = CERTList
legalStatusDictionary["Decree"] = DECList
legalStatusDictionary["Diligence Claim"] = DILList
legalStatusDictionary["Disallowed"] = DISList
legalStatusDictionary["Expired"] = EXPList
legalStatusDictionary["Forfeited"] = FORFList
legalStatusDictionary["No Proof Required"] = NPRList
legalStatusDictionary["Nonuse"] = NUSEList
legalStatusDictionary["Perfected"] = PERFList
legalStatusDictionary["Rejected"] = REJList
legalStatusDictionary["Renumbered"] = RNUMList
legalStatusDictionary["Deff"] = STATUSList
legalStatusDictionary["Temp Applications"] = TEMPList
legalStatusDictionary["Terminated"] = TERMList
legalStatusDictionary["Underground Water Claim"] = UGWCList
legalStatusDictionary["Unapproved"] = UNAPList
legalStatusDictionary["Withdrawn"] = WDList
legalStatusDictionary["Water User`s Claim"] = WUCList
listDictionary = legalStatusDictionary  # previous name, kept so existing references still resolve


def CreateLegalStatus(val):
    """Return the legal status name for the STATUS text *val*.

    The text form of *val* is searched for every code in
    legalStatusDictionary (case-sensitive substring match).  A matched code
    that is a proper substring of another matched code is discarded, so
    "ADEC" gives "Adjudication Decree" rather than "Decree".  If more than
    one status remains, the one added last to the dictionary is returned.

    Returns "" when the text is blank or contains no known code; a missing
    value (NaN) contains no code and therefore also gives "".
    """
    val = str(val).strip()
    if val == "":
        return ""

    # (status name, code) for every code present in the text, in dictionary order.
    matches = [
        (statusName, code)
        for statusName, codes in legalStatusDictionary.items()
        for code in codes
        if code in val
    ]
    # Keep only the codes that are not a fragment of a longer matched code.
    specific = [
        statusName
        for statusName, code in matches
        if not any(code != other and code in other for _, other in matches)
    ]
    return specific[-1] if specific else ""

# Derive the legal status of every row from its STATUS text.
dfout['in_LegalStatus'] = dfout['STATUS'].map(CreateLegalStatus)

## WaDE Custom Elements (due to missing state site info)

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return "WaDEUT_WS" followed by the text form of *colrowValue*."""
    return "WaDEUT_WS" + str(colrowValue)

# One row per distinct water source type, in order of first appearance.
dfWaterSourceNativeID = dfout[['in_WaterSourceTypeCV']].drop_duplicates()

# Number those rows 1, 2, 3, ... and turn each number into a native ID.
dftemp = pd.DataFrame({"Count": range(1, len(dfWaterSourceNativeID.index) + 1)},
                      index=dfWaterSourceNativeID.index)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp['Count'].map(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the native ID stored for water source type *A*.

    Searches dfWaterSourceNativeID for rows whose in_WaterSourceTypeCV equals
    *A* and returns the in_WaterSourceNativeID of the first one.  Returns ''
    when *A* is blank or missing, or when no row matches.
    """
    if (A == '') or (pd.isnull(A)):
        return ''
    sameType = dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A
    matchingIDs = dfWaterSourceNativeID.loc[sameType, 'in_WaterSourceNativeID']
    return '' if matchingIDs.empty else matchingIDs.iloc[0]

# Give every row the native ID that belongs to its water source type.
dfout['in_WaterSourceNativeID'] = dfout['in_WaterSourceTypeCV'].map(retrieveWaterSourceNativeID)

In [None]:
# Replace every remaining NaN with an empty string before the export.
# NOTE(review): to_replace is np.nan rather than a text pattern, so
# regex=True looks unnecessary here — confirm before removing it.
dfout = dfout.replace(np.nan, "", regex=True)

dfout

In [None]:
# Exporting to Finished File
# The relative file name is resolved against the current working directory;
# index=False leaves the DataFrame index out of the CSV.
dfout.to_csv('P_UtahMaster.csv', index=False)  # The output