# Pre-processing Idaho Allocation data for WaDEQA upload.
- Date Updated: 09/09/2022
- Purpose:  To pre-process the Idaho data into one master file for simple DataFrame creation and extraction.  Working Idaho data for WaDEQA 2.0 is mostly composed of point of diversion data.
- Notes: working with POD and POU data.  Working with assumption that both POD and POU data share the same water right record information.

In [None]:
# Needed Libraries
import os
import numpy as np
import pandas as pd
import datetime
import geopandas as gpd # the library that lets us read in shapefiles
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [None]:
# Working Directory
# Raw-input folder on the shared drive.  After the chdir below, the relative
# file names used by later read/write calls resolve against this folder.
workingDir = "G:/Shared drives/WaDE Data/Idaho/WaterAllocation/RawInputData"
os.chdir(workingDir)

## Import Input Data

In [None]:
# POD Data
FI_POD = "Water_Right_PODs_input.csv"
dfinPOD = pd.read_csv(FI_POD)

# WaDE UUID tracker for data assessment.
# If the input has no WaDEUUID column yet, tag each row with "idD<row label>"
# and write the tagged table back over the input file.
if 'WaDEUUID' not in dfinPOD.columns:
    dfinPOD['WaDEUUID'] = ["idD" + str(rowLabel) for rowLabel in dfinPOD.index]
    dfinPOD.to_csv(FI_POD, index=False)

print(len(dfinPOD))
dfinPOD.head(1)

In [None]:
# POU
FI_POU = "WaterRightPOUs_input.csv"
dfinPOU = pd.read_csv(FI_POU)

# WaDE UUID tracker for data assessment.
# If the input has no WaDEUUID column yet, tag each row with "idU<row label>"
# and write the tagged table back over the input file.
if 'WaDEUUID' not in dfinPOU.columns:
    dfinPOU['WaDEUUID'] = ["idU" + str(rowLabel) for rowLabel in dfinPOU.index]
    dfinPOU.to_csv(FI_POU, index=False)

print(len(dfinPOU))
dfinPOU.head(1)

## POD Sites Data

In [None]:
# Bring a few POU attributes onto each POD record, matched on RightID.
# Left join: every POD row is kept even when no POU row shares its RightID.
pouColumns = ['RightID', 'WaterUse', 'AcreLimit', 'TotalAcres']
dfinPODm = dfinPOD.merge(dfinPOU[pouColumns], how='left', on='RightID')
print(len(dfinPODm))
dfinPODm.head(1)

In [None]:
# Idaho projection = EPSG:8826.  WGS84 projection used by WaDE 2.0 = epsg:4326.
# NOTE(review): `transform` is imported but never called in this section.
from pyproj import Transformer, transform
# Build the Transformer once at module level so the per-row helpers reuse the
# same object instead of constructing a new one for every row.
transformer = Transformer.from_proj(8826, 4326)  # A trick to drastically optimize the Transformer of pyproj.

# For converting projection latitude.
def assignLat(colrowValLat, colrowValLong):
    """Transform one coordinate pair and return the first output value.

    The pair is passed to the module-level ``transformer`` in the order
    (colrowValLat, colrowValLong); the first element of the result is
    treated as the latitude.
    """
    latitude, _ = transformer.transform(colrowValLat, colrowValLong)
    return latitude

# For converting projection longitude.
def assignLong(colrowValLat, colrowValLong):
    """Transform one coordinate pair and return the second output value.

    The pair is passed to the module-level ``transformer`` in the order
    (colrowValLat, colrowValLong); the second element of the result is
    treated as the longitude.
    """
    _, longitude = transformer.transform(colrowValLat, colrowValLong)
    return longitude


# Reproject every merged POD row, one output column at a time.
# NOTE(review): Y is passed as the first coordinate and X as the second;
# confirm this matches the axis order the transformer expects.
dfinPODm['Latitude'] = dfinPODm.apply(
    lambda rec: assignLat(rec['Y'], rec['X']),
    axis=1,
)
dfinPODm['Longitude'] = dfinPODm.apply(
    lambda rec: assignLong(rec['Y'], rec['X']),
    axis=1,
)
dfinPODm.head(1)

In [None]:
# IrrigatedAcreage

def assignIrrigatedAcreage(AcreLimit, TotalAcres):
    """Pick the irrigated acreage for one record.

    Both inputs are converted with ``float()`` first.  AcreLimit is returned
    when it is greater than zero; in every other case (zero, negative, NaN)
    TotalAcres is returned.
    """
    limitAcres, totalAcres = float(AcreLimit), float(TotalAcres)
    return limitAcres if limitAcres > 0 else totalAcres

# Derive the irrigated acreage row by row on the merged POD table, then list
# the distinct results for a quick visual check.
dfinPODm['in_IrrigatedAcreage'] = dfinPODm.apply(
    lambda rec: assignIrrigatedAcreage(rec['AcreLimit'], rec['TotalAcres']),
    axis=1,
)
dfinPODm['in_IrrigatedAcreage'].unique()

In [None]:
# create output POD dataframe
# Maps columns of the merged POD input (dfinPODm) onto the "in_*" output columns.

dfPOD = pd.DataFrame()

# Data Assessment UUID
# First column assigned to the empty frame, so dfPOD takes its index from
# dfinPODm here; the constant string columns below are broadcast to every row.
dfPOD['WaDEUUID'] = dfinPODm['WaDEUUID']

# Water Source
# Both columns start from the same stripped 'Source' text.
dfPOD['in_WaterSourceName'] = dfinPODm['Source'].str.strip()
dfPOD['in_WaterSourceTypeCV'] = dfinPODm['Source'].str.strip()

# Site
dfPOD['in_CoordinateAccuracy'] = "Unspecified"
dfPOD['in_CoordinateMethodCV'] = dfinPODm['DataSource']
dfPOD['in_County'] = "Unspecified"
# NOTE(review): the raw Y / X columns are used here, not the 'Latitude' /
# 'Longitude' columns computed from them on dfinPODm -- confirm which is intended.
dfPOD['in_Latitude'] = dfinPODm['Y']
dfPOD['in_Longitude'] = dfinPODm['X']
dfPOD['in_PODorPOUSite'] = "POD"
dfPOD['in_SiteName'] = dfinPODm['DiversionName']
# Prefix the ID with "POD" (the POU records use a "POU" prefix).
dfPOD['in_SiteNativeID'] = "POD" + dfinPODm['PointOfDiversionID'].astype(str).str.strip()
dfPOD['in_SiteTypeCV'] = "Unspecified"

# Allocation Fact
dfPOD['in_AllocationNativeID'] = dfinPODm['WaterRightNumber']
dfPOD['in_AllocationOwner'] = dfinPODm['Owner']
dfPOD['in_AllocationPriorityDate'] = dfinPODm['PriorityDate']
dfPOD['in_AllocationFlow_CFS'] = dfinPODm['OverallMaxDiversionRate']
dfPOD['in_AllocationBasisCV'] = dfinPODm['Basis'].str.strip()
dfPOD['in_AllocationLegalStatusCV'] = dfinPODm['Status'].str.strip()
# WaterUse and in_IrrigatedAcreage originate from the POU columns merged onto the POD rows.
dfPOD['in_BeneficialUseCategory'] = dfinPODm['WaterUse'].str.strip()
dfPOD['in_IrrigatedAcreage'] = dfinPODm['in_IrrigatedAcreage']
dfPOD['in_WaterAllocationNativeURL'] = dfinPODm['WRDocs']

print(len(dfPOD))
dfPOD.head(1)

## POU Site Data

In [None]:
# Attach the POD columns to each POU record, matched on RightID.
# Left join keeps every POU row.  Column names present in both inputs come back
# suffixed "_x" (POU side) and "_y" (POD side).
dfinPOUm = dfinPOU.merge(dfinPOD, how='left', on='RightID')
print(len(dfinPOUm))
dfinPOUm.head(1)

In [None]:
# IrrigatedAcreage

def assignIrrigatedAcreage(AcreLimit, TotalAcres):
    """Pick the irrigated acreage for one record.

    Both inputs are converted with ``float()`` first.  AcreLimit is returned
    when it is greater than zero; in every other case (zero, negative, NaN)
    TotalAcres is returned.
    """
    limitAcres, totalAcres = float(AcreLimit), float(TotalAcres)
    return limitAcres if limitAcres > 0 else totalAcres

# Derive the irrigated acreage row by row on the merged POU table, then list
# the distinct results for a quick visual check.
dfinPOUm['in_IrrigatedAcreage'] = dfinPOUm.apply(
    lambda rec: assignIrrigatedAcreage(rec['AcreLimit'], rec['TotalAcres']),
    axis=1,
)
dfinPOUm['in_IrrigatedAcreage'].unique()

In [None]:
# create output POU dataframe
# Maps columns of the merged POU input (dfinPOUm) onto the "in_*" output columns.
# Columns ending in "_x" are the POU-side copies of names that exist in both
# merged inputs (dfinPOU was the left frame of the merge).

dfPOU = pd.DataFrame()

# Data Assessment UUID
# First column assigned to the empty frame, so dfPOU takes its index from
# dfinPOUm here; the constant string columns below are broadcast to every row.
dfPOU['WaDEUUID'] = dfinPOUm['WaDEUUID_x']

# Water Source
# Both columns start from the same stripped 'Source_x' text.
dfPOU['in_WaterSourceName'] = dfinPOUm['Source_x'].str.strip()
dfPOU['in_WaterSourceTypeCV'] = dfinPOUm['Source_x'].str.strip()

# Site
dfPOU['in_CoordinateAccuracy'] = "Unspecified"
dfPOU['in_CoordinateMethodCV'] = "Centroid"
dfPOU['in_County'] = "Unspecified"
dfPOU['in_Latitude'] = dfinPOUm['Latitude']
dfPOU['in_Longitude'] = dfinPOUm['Longitude']
dfPOU['in_PODorPOUSite'] = "POU"
dfPOU['in_SiteName'] = "Unspecified"
dfPOU['in_SiteTypeCV'] = "Unspecified"
# Prefix the ID with "POU" (the POD records use a "POD" prefix).
dfPOU['in_SiteNativeID'] = "POU" + dfinPOUm['PlaceOfUse'].astype(str).str.strip()

# Allocation Fact
dfPOU['in_AllocationNativeID'] = dfinPOUm['WaterRight']
dfPOU['in_AllocationOwner'] = dfinPOUm['Owner_x']
dfPOU['in_AllocationPriorityDate'] = dfinPOUm['PriorityDa']
dfPOU['in_AllocationFlow_CFS'] = dfinPOUm['OverallMaxDiversionRate']
dfPOU['in_AllocationBasisCV'] = dfinPOUm['Basis'].str.strip()
# NOTE(review): title-cased here, while the POD frame only strips whitespace
# from its status column -- confirm the difference is intentional.
dfPOU['in_AllocationLegalStatusCV'] = dfinPOUm['Status_x'].str.title()
dfPOU['in_BeneficialUseCategory'] = dfinPOUm['WaterUse'].str.strip()
dfPOU['in_IrrigatedAcreage'] = dfinPOUm['in_IrrigatedAcreage']
dfPOU['in_WaterAllocationNativeURL'] = dfinPOUm['WRDocs_x']

print(len(dfPOU))
dfPOU.head(1)

## Concatenate and Clean Data

In [None]:
# Concatenate dataframes
# Stack the POD and POU records, drop exact duplicate rows, and renumber the
# index.  Each input frame carries its own row labels, so a plain concat would
# repeat labels; unique labels keep later index-aligned column assignments safe.
frames = [dfPOD, dfPOU]
outdf = pd.concat(frames, ignore_index=True)
outdf = outdf.drop_duplicates().reset_index(drop=True)
print(len(outdf))

In [None]:
# clean up AllocationNativeID

def cleanAllocationNativeID(Val):
    """Return Val converted to text with leading/trailing whitespace removed."""
    return str(Val).strip()

# Normalise every allocation ID to stripped text, then list the distinct values.
outdf['in_AllocationNativeID'] = outdf.apply(
    lambda rec: cleanAllocationNativeID(rec['in_AllocationNativeID']),
    axis=1,
)
outdf['in_AllocationNativeID'].unique()

In [None]:
# remove special characters AllocationOwner

import re

# Punctuation removed from owner names.  Written as a raw string so the "\)"
# escape reaches the regex engine unchanged, and compiled once for reuse.
_OWNER_STRIP_PATTERN = re.compile(r"[$@&.;,'/\)(-]")


def cleanOwnerDataFunc(Val):
    """Remove special characters from an owner name.

    Parameters
    ----------
    Val : object
        Owner value from the input data.  Missing values (None / NaN) are
        accepted; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        Val without the characters ``$ @ & . ; , ' / ) ( -`` and without
        leading/trailing whitespace.  Missing values give an empty string.
    """
    # re.sub raises TypeError on a float NaN, which is what pandas uses for
    # missing text, so missing owners are handled before the substitution.
    if pd.isnull(Val):
        return ""
    return _OWNER_STRIP_PATTERN.sub("", str(Val)).strip()

# Clean every owner name, then list the distinct results.
outdf['in_AllocationOwner'] = outdf.apply(
    lambda rec: cleanOwnerDataFunc(rec['in_AllocationOwner']),
    axis=1,
)
outdf['in_AllocationOwner'].unique()

In [None]:
# WaterSourceType

# Keyword (in title case) -> water source type.  Keys are one or two words long.
WaterSourceTypeDict = {
    "River": "Surface Water",
    "Ground Water": "Groundwater",
    "Spring": "Groundwater",
    "Lake": "Surface Water",
    "Pond": "Surface Water",
    "Canal": "Surface Water",
    "Creek": "Surface Water",
    "Fork": "Surface Water",
    "Waste Water": "Reuse",
    "Drain": "Surface Water",
    "Gluch": "Surface Water",  # misspelled key kept for backward compatibility
    "Gulch": "Surface Water",
    "Reservoir": "Surface Water",
    "Slough": "Surface Water",
    "Ditch": "Surface Water",
    "Channel": "Surface Water",
    "Dry": "Surface Water",
}


def assignWaterSourceType(colrowValue):
    """Classify a water source name by the keywords it contains.

    The name is stripped, title-cased and split into words.  The words are
    scanned from the end of the name towards the start, and at each position
    the two-word phrase is tried before the single word, so keys such as
    "Ground Water" and "Waste Water" can match.  The keyword nearest the end
    wins, so "Spring Creek" is classified by "Creek".

    Parameters
    ----------
    colrowValue : object
        Water source name; may be missing (None / NaN), empty or blank.

    Returns
    -------
    str
        The mapped type from WaterSourceTypeDict, or 'Unspecified' when the
        value is missing, blank, or contains no known keyword.
    """
    if pd.isnull(colrowValue):
        return 'Unspecified'

    words = str(colrowValue).strip().title().split()
    # An empty or whitespace-only name gives no words, so the loop is skipped.
    for end in range(len(words), 0, -1):
        for width in (2, 1):
            start = end - width
            if start < 0:
                continue
            phrase = " ".join(words[start:end])
            if phrase in WaterSourceTypeDict:
                return WaterSourceTypeDict[phrase]
    return 'Unspecified'

# Replace the raw source text with its water source type, then list the
# distinct results.
outdf['in_WaterSourceTypeCV'] = outdf.apply(
    lambda rec: assignWaterSourceType(rec['in_WaterSourceTypeCV']),
    axis=1,
)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# Update datatype of Priority Date to fit WaDE 2.0 structure

def formatDateString(inString1):
    """Format a date-like value as ``MM/DD/YYYY``.

    Parameters
    ----------
    inString1 : object
        Date value from the input data; may be missing (None / NaN / NaT),
        blank, or text that is not a date.

    Returns
    -------
    str
        The date as ``'%m/%d/%Y'`` text, or an empty string when the value is
        missing, blank, or cannot be parsed.
    """
    # Check for missing values before str(): str(NaN) is the text "nan", which
    # would hide the missing value from the null check.
    if pd.isnull(inString1):
        return ""
    inString = str(inString1).strip()
    if inString == "":
        return ""
    try:
        # errors='coerce' turns unparseable or out-of-range text into NaT.
        valD = pd.to_datetime(inString, errors='coerce')
    except (ValueError, TypeError, OverflowError):
        return ""
    if pd.isnull(valD):
        return ""
    return valD.date().strftime('%m/%d/%Y')

# Reformat every priority date, then list the distinct results.
outdf['in_AllocationPriorityDate'] = outdf.apply(
    lambda rec: formatDateString(rec['in_AllocationPriorityDate']),
    axis=1,
)
outdf['in_AllocationPriorityDate'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source ID: "WaDEID_WS" followed by the value as text."""
    return "WaDEID_WS" + str(colrowValue)

# Lookup table: one row per distinct (water source name, water source type) pair.
dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = outdf['in_WaterSourceName']
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

# Number the remaining rows 1..N on the same index as the lookup table, then
# turn each number into its "WaDEID_WS<N>" identifier.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE custom water source native ID
def retrieveWaterSourceNativeID(A, B):
    """Look up the custom water source ID for a (name, type) pair.

    A is the water source name and B the water source type.  Returns an empty
    string when both are empty strings, when both are null, or when the pair
    is not present in dfWaterSourceNativeID; otherwise returns the first
    matching 'in_WaterSourceNativeID' value.
    """
    if A == '' and B == '':
        return ''
    if pd.isnull(A) and pd.isnull(B):
        return ''

    sameName = dfWaterSourceNativeID['in_WaterSourceName'] == A
    sameType = dfWaterSourceNativeID['in_WaterSourceTypeCV'] == B
    matches = dfWaterSourceNativeID.loc[sameName & sameType, 'in_WaterSourceNativeID']
    return '' if matches.empty else matches.iloc[0]

# Attach the custom water source ID to every row, then list the distinct IDs.
outdf['in_WaterSourceNativeID'] = outdf.apply(
    lambda rec: retrieveWaterSourceNativeID(rec['in_WaterSourceName'], rec['in_WaterSourceTypeCV']),
    axis=1,
)
outdf['in_WaterSourceNativeID'].unique()

## Shapefile Data
- For attaching geometry to CSV inputs.

In [None]:
# Shapefile input
# Read the POU shapefile with geopandas; the path is relative to the current
# working directory.
dfPoUshapetemp = gpd.read_file('shapefile/WaterRightPOUs.shp')
print(len(dfPoUshapetemp))
dfPoUshapetemp.head(1)

In [None]:
# Pair each POU site ID with its geometry.  The ID is "POU" + the stripped
# PlaceOfUse text, the same format used for the POU site records.
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)
placeOfUseText = dfPoUshapetemp['PlaceOfUse'].astype(str).str.strip()
dfPoUshape['in_SiteNativeID'] = "POU" + placeOfUseText
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
# Default arguments: compare all columns, keep the first occurrence, return a
# new frame and keep the original index labels.
dfPoUshape = dfPoUshape.drop_duplicates()
dfPoUshape.head(3)

## Review and Export

In [None]:
# Display the dtype of every output column for review before export.
outdf.dtypes

In [None]:
#Exporting to Finished File
# Both files are written to the current working directory without the
# DataFrame index column.
outdf.to_csv('P_IdahoMaster.csv', index=False)  # The output
dfPoUshape.to_csv('P_idGeometry.csv', index=False) # The output geometry.

In [None]:
# fileInput = "G:/Shared drives/WaDE Data/Idaho/WaterAllocation/ProcessedInputData/sites.csv"
# df = pd.read_csv(FI_POD)
# df.head(1)