# Pre-processing Idaho Allocation data for WaDEQA upload.
- Purpose:  To pre-process the Idaho data into one master file for simple DataFrame creation and extraction.  Working Idaho data for WaDEQA 2.0 is mostly composed of point of diversion data.
- Notes: Working with both POD and POU data, under the assumption that POD and POU records share the same water right record information.

In [1]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import datetime
# Notebook display preferences, set through the attribute form of the pandas options API.
pd.options.display.max_columns = 999  # show every column when a DataFrame is rendered
pd.options.display.float_format = lambda x: '%.5f' % x  # fixed 5-decimal floats instead of scientific notation

In [2]:
# Working Directory
# The relative paths used by the later cells (e.g. "RawInputData/...") resolve against this folder.
workingDir = "G:/Shared drives/WaDE Data/Idaho/WaterAllocation"
os.chdir(path=workingDir)

## POD Sites Data

In [3]:
# POD Data
# Load the point-of-diversion shapefile, blank out missing values, and cache a zipped csv copy.
FI_POD = "RawInputData/shapefiles/Water_Right_PODs.zip"
dfinPOD = gpd.read_file(FI_POD)
dfinPOD = dfinPOD.replace(np.nan, "")
dfinPOD = dfinPOD.drop(columns=['geometry'])  # don't want geometry for POD sites.

# WaDE UUID tracker for data assessment
# Only tag (and re-export) the data when it has not been tagged already.
if 'WaDEUUID' not in dfinPOD:
    dfinPOD['WaDEUUID'] = "idD" + dfinPOD.index.astype(str)
    dfinPOD.to_csv(
        'RawInputData/Water_Right_PODs.zip',
        compression=dict(method='zip', archive_name='Water_Right_PODs.csv'),
        index=False,
    )

print(len(dfinPOD))
dfinPOD.head(1)

KeyboardInterrupt: 

In [None]:
# create output POD dataframe
# Maps the raw POD columns onto the WaDE "in_*" input columns. Fields with no
# POD source column are filled with a blank string placeholder.
df = pd.DataFrame()

# Data Assessment UUID
# This first assignment is a Series, so it also establishes df's row index;
# the scalar assignments below are broadcast to every row.
df['WaDEUUID'] = dfinPOD['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "IDwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "IDwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "IDwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = dfinPOD['Source'].replace(np.nan, "").astype(str).str.title()
df['in_WaterSourceNativeID'] = ""
df['in_WaterSourceTypeCV'] = ""  # auto fill in below

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = dfinPOD['DataSource']
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
# in_Geometry is already set (blank) in the WaterSource Info section above.
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfinPOD['Latitude']
df['in_Longitude'] = dfinPOD['Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = dfinPOD['DiversionN']
# Blank / missing IDs become 0 so the integer cast cannot fail.
df['in_SiteNativeID'] = "POD" + dfinPOD['PointOfDiv'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = ""
df['in_StateCV'] = "ID"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = dfinPOD['Basis']
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
# dfinPOD had its NaN values replaced with "" on load, and a plain astype(float)
# raises on "". Coerce instead: blanks become NaN here and are turned back into
# '' by the replace(np.nan, '') on outPOD below.
df['in_AllocationFlow_CFS'] = pd.to_numeric(dfinPOD['OverallMax'], errors='coerce').astype(float)
df['in_AllocationLegalStatusCV'] = dfinPOD['Status']
df['in_AllocationNativeID'] = dfinPOD['WaterRight'].replace("", 0).fillna(0).astype(str)
df['in_AllocationOwner'] = dfinPOD['Owner']
df['in_AllocationPriorityDate'] = dfinPOD['PriorityDa']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = ""
# Same blank-safe numeric conversion as in_AllocationFlow_CFS.
df['in_AllocationVolume_AF'] = pd.to_numeric(dfinPOD['OverallM_1'], errors='coerce').astype(float)
df['in_BeneficialUseCategory'] = dfinPOD['Uses']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = dfinPOD['WRReport']

outPOD = df.copy()
outPOD = outPOD.drop_duplicates().reset_index(drop=True).replace(np.nan, '')
print(len(outPOD))
outPOD.head()

## POU Site Data

In [None]:
# POU - use shapefile
# Load the place-of-use shapefile, blank out missing values, and cache a zipped csv copy.
# The geometry column is kept on this frame.
FI_POU = "RawInputData/shapefiles/WaterRightPOUs.zip"
dfinPOU = gpd.read_file(FI_POU)
dfinPOU = dfinPOU.replace(np.nan, "")

# WaDE UUID tracker for data assessment
# Only tag (and re-export) the data when it has not been tagged already.
if 'WaDEUUID' not in dfinPOU:
    dfinPOU['WaDEUUID'] = "idU" + dfinPOU.index.astype(str)
    dfinPOU.to_csv(
        'RawInputData/WaterRightPOUs.zip',
        compression=dict(method='zip', archive_name='WaterRightPOUs.csv'),
        index=False,
    )

print(len(dfinPOU))
dfinPOU.head()

In [None]:
# create output POU dataframe
# Built in one shot from a column -> value mapping. Series values supply the
# row index and scalar values are broadcast to every row. Key order below is
# the output column order.
df = pd.DataFrame({
    # Data Assessment UUID
    'WaDEUUID': dfinPOU['WaDEUUID'],

    # Method Info
    'in_MethodUUID': "IDwr_M1",

    # Variable Info
    'in_VariableSpecificUUID': "IDwr_V1",

    # Organization Info
    'in_OrganizationUUID': "IDwr_O1",

    # WaterSource Info
    'in_Geometry': "",  # shared by the WaterSource and Site sections
    'in_GNISFeatureNameCV': "",
    'in_WaterQualityIndicatorCV': "",
    'in_WaterSourceName': dfinPOU['Source'].replace(np.nan, "").astype(str).str.title(),
    'in_WaterSourceNativeID': "",
    'in_WaterSourceTypeCV': "",  # auto fill in below

    # Site Info
    'in_CoordinateAccuracy': "",
    'in_CoordinateMethodCV': "Centroid",
    'in_County': "WaDE Unspecified",
    'in_EPSGCodeCV': 4326,
    'in_GNISCodeCV': "",
    'in_HUC12': "",
    'in_HUC8': "",
    'in_Latitude': dfinPOU['cent_Latit'],
    'in_Longitude': dfinPOU['cent_Longi'],
    'in_NHDNetworkStatusCV': "",
    'in_NHDProductCV': "",
    'in_PODorPOUSite': "POU",
    'in_SiteName': "",
    # Blank / missing IDs become 0 so the integer cast cannot fail.
    'in_SiteNativeID': "POU" + dfinPOU['PlaceOfUse'].replace("", 0).fillna(0).astype(int).astype(str),
    'in_SitePoint': "",
    'in_SiteTypeCV': "",
    'in_StateCV': "ID",
    'in_USGSSiteID': "",

    # AllocationAmount Info
    'in_AllocationApplicationDate': "",
    'in_AllocationAssociatedConsumptiveUseSiteIDs': "",
    'in_AllocationAssociatedWithdrawalSiteIDs': "",
    'in_AllocationBasisCV': "",
    'in_AllocationChangeApplicationIndicator': "",
    'in_AllocationCommunityWaterSupplySystem': "",
    'in_AllocationCropDutyAmount': "",
    'in_AllocationExpirationDate': "",
    'in_AllocationFlow_CFS': "",
    'in_AllocationLegalStatusCV': dfinPOU['Status'],
    'in_AllocationNativeID': dfinPOU['WaterRight'].replace("", 0).fillna(0).astype(str),
    'in_AllocationOwner': dfinPOU['Owner'],
    'in_AllocationPriorityDate': dfinPOU['PriorityDa'],
    'in_AllocationSDWISIdentifierCV': "",
    'in_AllocationTimeframeEnd': "",
    'in_AllocationTimeframeStart': "",
    'in_AllocationTypeCV': "",
    'in_AllocationVolume_AF': "",
    'in_BeneficialUseCategory': dfinPOU['WaterUse'],
    'in_CommunityWaterSupplySystem': "",
    'in_CropTypeCV': "",
    'in_CustomerTypeCV': "",
    'in_DataPublicationDate': "",
    'in_DataPublicationDOI': "",
    'in_ExemptOfVolumeFlowPriority': 0,
    'in_GeneratedPowerCapacityMW': "",
    'in_IrrigatedAcreage': dfinPOU['AcreLimit'],
    'in_IrrigationMethodCV': "",
    'in_LegacyAllocationIDs': "",
    'in_OwnerClassificationCV': "",
    'in_PopulationServed': "",
    'in_PowerType': "",
    'in_PrimaryBeneficialUseCategory': "",
    'in_SDWISIdentifierCV': "",
    'in_WaterAllocationNativeURL': dfinPOU['WRReport'],
})

outPOU = df.copy()
outPOU = outPOU.drop_duplicates()
outPOU = outPOU.reset_index(drop=True)
print(len(outPOU))
outPOU.head()

## Concatenate and Clean Data

In [None]:
# Concatenate dataframes
# Stack the POD and POU outputs, then de-duplicate, renumber rows, and blank out NaN.
frames = [outPOD, outPOU]
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates()
outdf = outdf.reset_index(drop=True)
outdf = outdf.replace(np.nan, "")
print(len(outdf))

In [None]:
# Fix ID Owner Name
# The owner given in the ID source data is not the full concatenated record.
# Use the xlsx provided by Danielle Favreau to fix it.

# Path is relative to the current working directory.
fileInput = "ownername_USactingThrough_WaDE_05112023.xlsx"
df_idown = pd.read_excel(fileInput)
# Normalise the water right number used as the lookup key: cells equal to " " or ""
# (and missing cells) become 0, then everything is stringified and stripped.
# NOTE(review): Series.replace with a plain string matches whole cell values only,
# so spaces inside a WRNO are not removed here - confirm that is the intent.
df_idown['WRNO'] = df_idown['WRNO'].replace(" ", "").replace("", 0).fillna(0).astype(str).str.strip()
# NOTE(review): as written this only blanks cells that are exactly "DIRECTOR PN CODE-";
# if the aim was to remove that text from inside longer names, .str.replace is needed - confirm.
# Missing cells become the string "nan" after astype(str).
df_idown['OrgName2'] = df_idown['OrgName2'].replace("DIRECTOR PN CODE-", "").astype(str).str.strip()
# Lookup: water right number (str) -> corrected owner name.
IdahoOwnerNameFixdict = pd.Series(df_idown.OrgName2.values, index=df_idown.WRNO.astype(str)).to_dict()


# Retrieve the corrected owner name for a water right, when one is available
def retrieveOwnerName(valID, valOwn):
    """Return the corrected owner name for a water right, else the original.

    Args:
        valID: water right native ID used as the key into IdahoOwnerNameFixdict.
        valOwn: owner name to fall back on.

    Returns:
        The IdahoOwnerNameFixdict entry for ``valID`` if present, otherwise ``valOwn``.
    """
    try:
        return IdahoOwnerNameFixdict[valID]
    except (KeyError, TypeError):
        # KeyError: no correction on file for this ID.
        # TypeError: the ID is not hashable. Either way keep the owner name as given.
        return valOwn

# Swap in the corrected owner name wherever the lookup has one for the allocation's native ID.
# Walking the two columns in step avoids building a Series for every row, which
# DataFrame.apply(axis=1) does, and also works on an empty frame.
outdf['in_AllocationOwner'] = [
    retrieveOwnerName(nativeID, owner)
    for nativeID, owner in zip(outdf['in_AllocationNativeID'], outdf['in_AllocationOwner'])
]
outdf['in_AllocationOwner'].unique()

In [None]:
# Clean owner name up
def removeSpecialCharsFunc(Val):
    """Strip punctuation from a value and return it in title case.

    The value is stringified, the characters ``$ @ & . ; , / ) ( -`` are
    removed, the text is title-cased, double spaces are collapsed in a single
    pass, and surrounding whitespace is trimmed.

    Args:
        Val: any value; it is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    Val = str(Val)
    # Raw string so that "\)" reaches the regex engine as written instead of
    # being an invalid string escape sequence.
    Val = re.sub(r"[$@&.;,/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [None]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

In [None]:
# ID POD source data has a few names that contain a ',' in them, but should still be okay
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

In [None]:
# Ensure Empty String

def ensureEmptyString(val):
    """Return ``val`` as a stripped string, with missing values as "".

    Args:
        val: any cell value.

    Returns:
        "" when ``val`` is null (None, NaN, NaT, pd.NA), blank, or the text
        "nan"; otherwise ``str(val)`` with surrounding whitespace removed.
    """
    # The null test has to run before str(): once stringified, a null value is
    # ordinary text ("None", "NaT", ...) and pd.isnull can no longer detect it.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return ""
    outString = str(val).strip()
    if outString == "nan":
        # A NaN that was stringified earlier in the pipeline.
        return ""
    return outString

In [None]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

In [None]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
outdf['in_CoordinateMethodCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_CoordinateMethodCV']), axis=1)
outdf['in_CoordinateMethodCV'].unique()

In [None]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

In [None]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

In [None]:
outdf['in_AllocationBasisCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationBasisCV']), axis=1)
outdf['in_AllocationBasisCV'].unique()

In [None]:
outdf['in_AllocationLegalStatusCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationLegalStatusCV']), axis=1)
outdf['in_AllocationLegalStatusCV'].unique()

In [None]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

In [None]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
outdf['in_BeneficialUseCategory'].unique()

In [None]:
# WaterSourceType
# search water source name for keywords

# Keyword -> water source type. Entries are checked in the order listed and the
# first keyword found inside the source name wins, so the order is significant.
WaterSourceTypeDict = {
    "sub": "Groundwater",
    "ground water": "Groundwater",
    "canal": "Surface Water",
    "channel": "Surface Water",
    "creek": "Surface Water",
    "ditch": "Surface Water",
    "drain": "Surface Water",
    "drains": "Surface Water",
    "draw": "Surface Water",
    "dry": "Surface Water",
    "fork": "Surface Water",
    "gluch": "Surface Water",
    "gulch": "Surface Water",
    "hole": "Surface Water",
    "holes": "Surface Water",
    "hollow": "Surface Water",
    "lake": "Surface Water",
    "lakes": "Surface Water",
    "pond": "Surface Water",
    "reservoir": "Surface Water",
    "river": "Surface Water",
    "runoff": "Surface Water",
    "seep": "Surface Water",
    "slough": "Surface Water",
    "spring": "Surface Water",
    "springs": "Surface Water",
    "spr": "Surface Water",
    "stream": "Surface Water",
    "streams": "Surface Water",
    "surface": "Surface Water",
    "swamp": "Surface Water",
    "swamps": "Surface Water",
    "wash": "Surface Water",
    "fargo wasteway": "Surface Water",
    "frozen dog wasteway": "Surface Water",
    "tunnel no 7 wasteway": "Surface Water",
    "waste water": "Reuse",
    "wastewater": "Reuse",
    "treated municipal wastewater": "Reuse",
}


def assignWaterSourceType(val):
    """Classify a water source name by keyword.

    Args:
        val: water source name. Null values (None, NaN) are treated as blank;
            any other non-string value is stringified.

    Returns:
        The WaterSourceTypeDict value of the first keyword (in dict order)
        contained in the lower-cased name, or "" when the name is blank or no
        keyword matches.
    """
    # Null check first: the original called .lower() directly and so raised on
    # anything that was not a string.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return ""
    sourceName = str(val).lower().strip()
    if sourceName == "":
        return ""
    for keyword, sourceType in WaterSourceTypeDict.items():
        if keyword in sourceName:
            return sourceType
    return ""

# Derive the source type from the source name, one value at a time
# (Series.map avoids the per-row Series that DataFrame.apply(axis=1) builds).
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceName'].map(assignWaterSourceType)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# in_Latitude & in_Longitude
# Coerce both coordinate columns to numbers; anything unparseable ends up as "".
for coordCol in ('in_Latitude', 'in_Longitude'):
    coordNumeric = pd.to_numeric(outdf[coordCol], errors='coerce')
    outdf[coordCol] = coordNumeric.fillna("")
outdf.head(1)

In [None]:
# Update datatype of Priority Date to fit WaDE 2.0 structure
# Unparseable dates become NaT. The values are then written out as MM/DD/YYYY
# text (which carries no time-of-day) and parsed back into datetimes.
priorityDate = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors='coerce')
priorityDateText = priorityDate.dt.strftime('%m/%d/%Y')
outdf['in_AllocationPriorityDate'] = pd.to_datetime(priorityDateText)
outdf['in_AllocationPriorityDate'].unique()

In [None]:
# Fixing in_AllocationFlow_CFS datatype
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').fillna("")
outdf['in_AllocationFlow_CFS'].unique()

In [None]:
# Fixing in_AllocationVolume_AF datatype
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').fillna("")
outdf['in_AllocationVolume_AF'].unique()

In [None]:
# Fixing in_IrrigatedAcreage datatype
outdf['in_IrrigatedAcreage'] = pd.to_numeric(outdf['in_IrrigatedAcreage'], errors='coerce').fillna("")
outdf['in_IrrigatedAcreage'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source native ID: "wadeID" followed by ``str(colrowValue)``."""
    return "wadeID" + str(colrowValue)

# One row per unique (water source name, water source type) pair.
dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = outdf['in_WaterSourceName']
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

# Number the unique pairs 1..n on the same index, and turn each number into an ID.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
# Series.map calls the function once per value; no per-row Series is built.
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp["Count"].map(assignWaterSourceNativeID)
# Lookup key is name and type concatenated with no separator.
dfWaterSourceNativeID['linkKey'] = dfWaterSourceNativeID['in_WaterSourceName'].astype(str) + dfWaterSourceNativeID['in_WaterSourceTypeCV'].astype(str)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
# Lookup: linkKey -> in_WaterSourceNativeID
WaterSourceNativeIDdict = pd.Series(dfWaterSourceNativeID.in_WaterSourceNativeID.values, index=dfWaterSourceNativeID.linkKey.astype(str)).to_dict()
def retrieveWaterSourceNativeID(A, B):
    """Look up the custom water source native ID for a name/type pair.

    Args:
        A: water source name.
        B: water source type.

    Returns:
        '' when both values are blank or both are null. Otherwise the
        WaterSourceNativeIDdict entry for the stripped, concatenated
        ``str(A) + str(B)`` key, or '' when that key is not present.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''
    linkKey = str(A).strip() + str(B).strip()
    # The key is always a str, so a missing entry is the only expected miss;
    # dict.get handles it without a catch-all except.
    return WaterSourceNativeIDdict.get(linkKey, '')

# Attach the custom native ID to every row from its (name, type) pair.
# Walking the two columns in step avoids building a Series for every row, which
# DataFrame.apply(axis=1) does, and also works on an empty frame.
outdf['in_WaterSourceNativeID'] = [
    retrieveWaterSourceNativeID(sourceName, sourceType)
    for sourceName, sourceType in zip(outdf['in_WaterSourceName'], outdf['in_WaterSourceTypeCV'])
]
outdf['in_WaterSourceNativeID'].unique()

## Shapefile Data
- For attaching geometry to csv inputs.

In [None]:
# Shapefile input
# Use the POU input frame loaded above. It must be the POU frame: the next cell
# reads its 'PlaceOfUse' and 'geometry' columns, and the POD frame had its
# 'geometry' column dropped when it was loaded.

dfPoUshapetemp = dfinPOU.copy()
print(len(dfPoUshapetemp))
dfPoUshapetemp.head(1)

In [None]:
# Pair each POU site native ID with its geometry, one row per unique pair.
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)
# Same ID construction as in_SiteNativeID in the POU output frame: blank / missing -> 0.
placeOfUseID = dfPoUshapetemp['PlaceOfUse'].replace("", 0).fillna(0).astype(int).astype(str)
dfPoUshape['in_SiteNativeID'] = "POU" + placeOfUseID
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
dfPoUshape = dfPoUshape.drop_duplicates()  # defaults: all columns, keep first, original index kept
dfPoUshape.head(3)

## Review and Export

In [None]:
outdf.dtypes

In [None]:
# Export the output dataframe
outdf.to_csv('RawInputData/Pwr_idMain.zip', compression=dict(method='zip', archive_name='Pwr_idMain.csv'), index=False) # The output, save as a zip
dfPoUshape.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.