# Pre-processing South Dakota Allocation data for WaDE upload.
Purpose: To pre-process the South Dakota data into one master file for simple DataFrame creation and extraction

Notes: N/A

In [None]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import datetime
pd.set_option('display.max_columns', 999)  # Show up to 999 columns when a DataFrame is displayed in the notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x) # Display floats with 5 fixed decimals instead of scientific notation

In [None]:
# Working Directory
# Relative file names used later (the input CSV and the output zips) resolve against this folder.
workingDir = "G:/Shared drives/WaDE Data/SouthDakota/WaterAllocation/RawInputData"
os.chdir(workingDir)

## Data: waterights

In [None]:
# Input File
# Missing values are replaced with empty strings right after loading.
fileInput = "waterights_input.csv"
dfinPOD = pd.read_csv(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment
# When the input has no 'WaDEUUID' column, build one as "sdwr" + row index and
# write a zipped copy of the input that includes the new column.
if 'WaDEUUID' not in dfinPOD:
    dfinPOD['WaDEUUID'] = "sdwr" + dfinPOD.index.astype(str)
    dfinPOD.to_csv('waterights_input.zip', compression=dict(method='zip', archive_name='waterights_input.csv'), index=False)

# Sanity check: row count and first record.
print(len(dfinPOD))
dfinPOD.head(1)

In [None]:
# first & last name function
def assignownerName(fName, lName):
    """Combine a first and last name into one owner-name string.

    Each part has "*" characters removed and surrounding whitespace stripped.
    Missing parts (None, NaN, empty or whitespace-only text) are skipped, so
    the result is "", a single name, or "First Last" separated by one space.

    Args:
        fName: First-name value (any scalar; converted with ``str``).
        lName: Last-name value (any scalar; converted with ``str``).

    Returns:
        The combined owner name, or "" when both parts are missing.
    """

    def _cleanPart(part):
        # Check for null BEFORE str(): str(None) / str(nan) would otherwise
        # leak the literal text "None" / "nan" into the owner name.
        if pd.isnull(part):
            return ""
        return str(part).replace("*", "").strip()

    # Join the *cleaned* parts (the stripped values, not the raw inputs) so no
    # stray padding ends up inside or around the combined name.
    cleanedParts = [_cleanPart(fName), _cleanPart(lName)]
    return " ".join(part for part in cleanedParts if part)

# Build the owner name for every row from the FIRST_NAME and LAST_NAME columns.
dfinPOD['in_AllocationOwner'] = dfinPOD.apply(lambda row: assignownerName(row['FIRST_NAME'], row['LAST_NAME']), axis=1)


import re
def cleanOwnerDataFunc(Val):
    """Normalize an owner name: drop punctuation, title-case, tidy spacing.

    Removes the characters ``$ @ & . ; , / ) ( -``, title-cases the result,
    collapses every run of whitespace to a single space and trims the ends.

    Args:
        Val: Value to clean; converted with ``str`` so non-string input
            (e.g. numbers) does not raise.

    Returns:
        The cleaned string.
    """
    # Raw string: "\)" in a normal string literal is an invalid escape sequence.
    Val = re.sub(r"[$@&.;,/\)(-]", "", str(Val)).title()
    # split()/join collapses runs of any length; a single replace("  ", " ")
    # pass leaves double spaces behind when three or more were adjacent.
    return " ".join(Val.split())

# Clean every owner name in place, then list the distinct results for review.
dfinPOD['in_AllocationOwner'] = dfinPOD.apply(lambda row: cleanOwnerDataFunc(row['in_AllocationOwner']), axis=1)
dfinPOD['in_AllocationOwner'].unique()

In [None]:
#Creating Beneficial Use.
#Need to translate SD abbreviations to a workable format.

BenUseDict = {
    "COM": "Commercial",
    "DOM": "Domestic",
    "FCP": "Flood Control Permit",
    "FWP": "Fish And Wildlife Propagation",
    "GEO": "Geothermal",
    "GWR": "Ground Water Remediation",
    "IND": "Industrial",
    "INS": "Institutional",
    "IRR": "Irrigation",
    "MUN": "Municipal",
    "REC": "Recreation",
    "RWS": "Rural Water System",
    "SHD": "Suburban Housing Development",
}


def retrieveBenUse(A, B, C, D):
    """Translate up to four use-type codes into one comma-separated string.

    Each argument is converted with ``str``, stripped, and looked up in
    ``BenUseDict``. Empty or unrecognized codes are skipped entirely, so the
    result never starts with a separator or contains an empty entry.

    Args:
        A, B, C, D: Use-type codes (any scalar).

    Returns:
        The matching names joined with ", " in argument order, or "" when
        no code is recognized.
    """
    useNames = []
    for code in (A, B, C, D):
        # .get() returns None for blank/unknown codes; those are dropped
        # instead of contributing a dangling ", ".
        useName = BenUseDict.get(str(code).strip())
        if useName:
            useNames.append(useName)
    return ", ".join(useNames)

# Translate the use-type code columns of each row into one beneficial-use string.
# NOTE(review): USE_TYPE3 is not passed here (1, 2, 4, 5 only) - confirm whether
# the input has a USE_TYPE3 column and whether skipping it is intended.
dfinPOD['in_BeneficialUseCategory'] = dfinPOD.apply(lambda row: retrieveBenUse(row['USE_TYPE1'], 
                                                          row['USE_TYPE2'], 
                                                          row['USE_TYPE4'], 
                                                          row['USE_TYPE5']), axis=1)
dfinPOD['in_BeneficialUseCategory'].unique()

In [None]:
# Water source type: map the single-letter source code to its WaDE name.

WSTypeDict = {
    "S": "Surface Water",
    "G": "Groundwater",
    "B": "Surface Water and Groundwater",
}


def retrieveWSType(colrowValue):
    """Return the water source type name for a source code.

    The value is converted with ``str`` and stripped before the lookup in
    ``WSTypeDict``; blank or unrecognized codes give "".
    """
    sourceCode = str(colrowValue).strip()
    return WSTypeDict.get(sourceCode, "")

# Derive the water source type for every row from the SOURCE column, then list distinct results.
dfinPOD['in_WaterSourceTypeCV'] = dfinPOD.apply(lambda row: retrieveWSType(row['SOURCE']), axis=1)
dfinPOD['in_WaterSourceTypeCV'].unique()

In [None]:
# Allocation legal status: map the two-letter status code to its name.

AlloStatusDict = {
    "CA": "Cancelled",
    "DF": "Deferred",
    "DN": "Denied",
    "FU": "Future Use",
    "HD": "Hold",
    "IP": "Incorporated",
    "LC": "License",
    "OC": "Owner Change",
    "PE": "Permit",
    "WI": "Withdrawn",
}


def retrieveStatus(colrowValue):
    """Return the legal status name for a status code.

    Empty or null input gives "". Otherwise the value is converted with
    ``str``, stripped, and looked up in ``AlloStatusDict``; unrecognized
    codes give "".
    """
    if colrowValue == "" or pd.isnull(colrowValue):
        return ""
    statusCode = str(colrowValue).strip()
    return AlloStatusDict.get(statusCode, "")

# Derive the legal status for every row from the STATUS column, then list distinct results.
dfinPOD['in_AllocationLegalStatusCV'] = dfinPOD.apply(lambda row: retrieveStatus(row['STATUS']), axis=1)
dfinPOD['in_AllocationLegalStatusCV'].unique()

In [None]:
# Preview the first row, including the derived in_* columns added above.
dfinPOD.head(1)

In [None]:
# creating a usable native url
# Turn backslashes into forward slashes, then replace the leading
# 'R:/work/wr/imaging/wrinfo/' folder with "/".
# regex=False: both patterns are literal text. A lone backslash is not a valid
# regular expression, and the default value of `regex` differs between pandas
# versions, so the literal behaviour is requested explicitly.
dfinPOD['new_LINK'] = (
    dfinPOD['LINK']
    .str.replace('\\', "/", regex=False)
    .str.replace('R:/work/wr/imaging/wrinfo/', "/", regex=False)
)
dfinPOD['new_LINK']

In [None]:
# create output POD dataframe
# Columns are filled either from dfinPOD or with a constant / empty placeholder.
# The first Series assignment (WaDEUUID) sets the frame's index, so the scalar
# assignments that follow are broadcast to every row.
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfinPOD['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "SDwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "SDwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "SDwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = dfinPOD['DIVERSION1'].str.title()
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = dfinPOD['in_WaterSourceTypeCV']

# Site Info
# in_Geometry was previously assigned a second time here with the same ""
# value; the single assignment under WaterSource Info keeps the column and
# its position unchanged.
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = dfinPOD['HYDROUNIT1']
df['in_Latitude'] = dfinPOD['LATITUDE']
df['in_Longitude'] = dfinPOD['LONGITUDE']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = ""
# Blank / missing OID_ values become 0 so the integer cast cannot fail.
df['in_SiteNativeID'] = "POD" + dfinPOD['OID_'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = ""
df['in_StateCV'] = "SD"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
# The input was loaded with NaN replaced by "", and "" cannot be cast with
# astype(float). Coerce blanks / bad values to NaN first, then force float so
# the dtype matches what astype(float) produced for fully numeric columns.
df['in_AllocationFlow_CFS'] = pd.to_numeric(dfinPOD['PER_CFS'], errors='coerce').astype(float)
df['in_AllocationLegalStatusCV'] = dfinPOD['in_AllocationLegalStatusCV']
df['in_AllocationNativeID'] =  dfinPOD['PERMIT_NO'].replace("", 0).fillna(0).astype(str)
df['in_AllocationOwner'] = dfinPOD['in_AllocationOwner']
df['in_AllocationPriorityDate'] = dfinPOD['PRIORDATE']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = ""
# NOTE(review): PER_ACRES feeds the volume (AF) field - confirm the source
# column really holds a volume and not an acreage.
df['in_AllocationVolume_AF'] = pd.to_numeric(dfinPOD['PER_ACRES'], errors='coerce').astype(float)
df['in_BeneficialUseCategory'] = dfinPOD['in_BeneficialUseCategory']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = "https://danr.sd.gov/wrimage/wrinfo" + dfinPOD['new_LINK'].astype(str)

# Work on a de-duplicated copy with a fresh 0..n-1 index.
outPOD = df.copy()
outPOD = outPOD.drop_duplicates().reset_index(drop=True)
print(len(outPOD))
outPOD.head()

## Clean Output

In [None]:
# Concatenate dataframes
# Only the POD frame exists for now; the list form leaves room for more frames.
frames = [outPOD] # just POD for now
outdf = pd.concat(frames)
# Drop exact duplicate rows, renumber the index, and turn any NaN into "".
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

In [None]:
# Clean owner name up
def removeSpecialCharsFunc(Val):
    """Strip punctuation from a text value, title-case it and tidy spacing.

    Removes the characters ``$ @ & . ; , / ) ( -``, title-cases the result,
    collapses every run of whitespace to a single space and trims the ends.

    Args:
        Val: Value to clean (any scalar; converted with ``str``).

    Returns:
        The cleaned string, or "" for None / NaN input.
    """
    # Null check comes first: str(nan).title() would yield the literal "Nan",
    # which a later comparison against "nan" does not recognise as missing.
    if pd.isnull(Val):
        return ""
    # Raw string: "\)" in a normal string literal is an invalid escape sequence.
    Val = re.sub(r"[$@&.;,/\)(-]", "", str(Val)).title()
    # split()/join collapses whitespace runs of any length in one pass.
    return " ".join(Val.split())

In [None]:
# Remove special characters from the owner names; list distinct results for review.
outdf['in_AllocationOwner'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

In [None]:
# Remove special characters from the site names; list distinct results for review.
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

In [None]:
# Remove special characters from the water source names; list distinct results for review.
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

In [None]:
# Normalize blank-like values to a true empty string

def ensureEmptyString(val):
    """Return the stripped text of ``val``, or "" when it is blank.

    The value is converted with ``str`` and stripped; an empty result or the
    exact text "nan" is returned as "". Anything else is returned stripped.
    """
    text = str(val).strip()
    return "" if text in ("", "nan") else text

In [None]:
# Normalize blank-like water source names to ""; list distinct results for review.
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

In [None]:
# Normalize blank-like water source types to ""; list distinct results for review.
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# Normalize blank-like legal status values to ""; list distinct results for review.
outdf['in_AllocationLegalStatusCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationLegalStatusCV']), axis=1)
outdf['in_AllocationLegalStatusCV'].unique()

In [None]:
# Normalize blank-like owner names to ""; list distinct results for review.
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

In [None]:
# Normalize blank-like beneficial use values to ""; list distinct results for review.
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
outdf['in_BeneficialUseCategory'].unique()

In [None]:
# in_Latitude
# Convert to numbers; values that cannot be parsed become NaN and are then replaced with "".
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').fillna("")
outdf['in_Latitude'].unique()

In [None]:
# in_Longitude
# Convert to numbers; values that cannot be parsed become NaN and are then replaced with "".
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').fillna("")
outdf['in_Longitude'].unique()

In [None]:
#Update datatype of Priority Date to fit WaDE 2.0 structure
# Step 1 parses the column to datetimes. Step 2 formats those as mm/dd/YYYY
# text and parses the text again, so the column ends up as datetimes.
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'])
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'].dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

In [None]:
# Fixing in_AllocationFlow_CFS datatype
# Convert to numbers; zeros and values that cannot be parsed end up as "".
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

In [None]:
# Fixing in_AllocationVolume_AF datatype
# Convert to numbers; zeros and values that cannot be parsed end up as "".
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationVolume_AF'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Helper used when building the temp dataframe of unique water sources.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source ID: "wadeID" followed by ``str(colrowValue)``."""
    return f"wadeID{colrowValue!s}"

# One row per distinct (water source name, water source type) pair.
dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = outdf['in_WaterSourceName']
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

# Number the distinct pairs 1..n on the same index, then turn each number into
# a "wadeID<n>" identifier.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)
# linkKey = name + type concatenated as text; used as the lookup key below.
dfWaterSourceNativeID['linkKey'] = dfWaterSourceNativeID['in_WaterSourceName'].astype(str) + dfWaterSourceNativeID['in_WaterSourceTypeCV'].astype(str)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
# Lookup table: linkKey (name + type text) -> generated water source native ID.
WaterSourceNativeIDdict = pd.Series(dfWaterSourceNativeID.in_WaterSourceNativeID.values, index=dfWaterSourceNativeID.linkKey.astype(str)).to_dict()
def retrieveWaterSourceNativeID(A, B):
    """Look up the custom water source native ID for a name/type pair.

    Returns '' when both values are empty strings or both are null. Otherwise
    the stripped text of ``A`` and ``B`` is concatenated and looked up in
    ``WaterSourceNativeIDdict``; a key that is not present gives ''.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''
    linkKey = str(A).strip() + str(B).strip()
    return WaterSourceNativeIDdict.get(linkKey, '')

# Fill in the water source native ID for every row from its name/type pair; list distinct IDs for review.
outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveWaterSourceNativeID( row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

## Drop non-Active AllocationLegalStatusCV Water Rights
- For SD, we don't want water rights that are considered: Cancelled, Denied, Withdrawn

In [None]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# legal status values whose rows are removed
dropLegalStatusList = ["Cancelled", "Denied", "Withdrawn"]

# keep only rows whose legal status is NOT in the list above, then renumber the index
outdf = outdf[outdf.in_AllocationLegalStatusCV.isin(dropLegalStatusList) == False].reset_index(drop=True)

# remaining row count and the status values still present
print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

## Export Outputs

In [None]:
# Print the dtype of every output column; the display limits are lifted only inside this block.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(outdf.dtypes)

In [None]:
# Display the final output dataframe for a visual check before export.
outdf

In [None]:
# Export the output dataframe
# Written as Pwr_sdMain.csv inside Pwr_sdMain.zip, without the index column.
outdf.to_csv('Pwr_sdMain.zip', compression=dict(method='zip', archive_name='Pwr_sdMain.csv'), index=False)  # The output, save as a zip
#dfPoUshape.to_csv('P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.