# Pre-processing Allocation data for WaDE upload.
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [1]:
import os
import sys
# Report the interpreter this notebook ran with.
# .get() keeps the cell runnable when the kernel was not started from a conda environment.
print(os.environ.get('CONDA_DEFAULT_ENV', 'CONDA_DEFAULT_ENV is not set'))
print(sys.version)

base
3.12.3 | packaged by conda-forge | (main, Apr 15 2024, 18:20:11) [MSC v.1938 64 bit (AMD64)]


In [2]:
# Needed Libraries / Modules

# ---- working with data ----
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.options.display.max_columns = 999  # show every column of a DataFrame in notebook output
pd.options.display.float_format = lambda value: '%.5f' % value  # fixed 5-decimal floats, no scientific notation

In [3]:
# ---- working directory ----
# Folder that the relative paths used later in this notebook (e.g. "RawInputData/...") resolve against.
workingDirString = "G:/Shared drives/WaDE Data/WaDE Data Folder/SouthDakota/WaterAllocation"  # set working directory folder string here
os.chdir(workingDirString)
print('The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/WaDE Data Folder/SouthDakota/WaterAllocation


## Data: waterights

In [4]:
# Input File
fileInput = "RawInputData/waterights_input.zip"
dfinPOD = pd.read_csv(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment
# The tracker is written back to the same archive that is read above, so the
# generated IDs are found (and therefore stay stable) the next time this cell runs.
if 'WaDEUUID' not in dfinPOD:
    dfinPOD['WaDEUUID'] = "sdwr" + dfinPOD.index.astype(str)
    dfinPOD.to_csv(fileInput, compression=dict(method='zip', archive_name='waterights_input.csv'), index=False)

print(len(dfinPOD))
dfinPOD.head(1)

19238


  dfinPOD = pd.read_csv(fileInput).replace(np.nan, "")


Unnamed: 0,WaDEUUID,ReasonRemoved,IncompleteField,OID_,PERMIT_NO,LATITUDE,LONGITUDE,LAST_NAME,FIRST_NAME,ADDRESS2,CITY,STATE,ZIP,PLUS4,COUNTY_1,BASIN,HYDROUNIT1,PRIORDATE,STATUS,SOURCE,AQUIFER,MNG_UNIT,DIVERSION1,USE_TYPE1,USE_TYPE2,USE_TYPE4,USE_TYPE5,USE_TYPE6,PER_CFS,LIC_CFS,PER_ACRES,LIC_ACRES,CMPLTN_DTE,LIC_DTE,INSPT_DATE,INSPECTOR,METHODCODE,REFERENCE,ACCURACY,LINK
0,sdwr0,,,1,FC10-3,43.71384,-97.6078,MCCOOK COUNTY,,BOX 550,SALEM,SD,57058,550,MC,JR,10160010,12/8/1988 0:00,PE,S,,,WOLF CREEK,FCP,,,,,0.0,0.0,0.0,0.0,3/1/1994 0:00,,,,TRSOTH100,DIVERSION POINT,140,R:\work\wr\imaging\wrinfo\wr_div3\FC10-3.pdf


In [5]:
# first & last name function
def assignownerName(fName, lName):
    """Combine a first and last name into a single owner string.

    Each part has "*" characters removed and surrounding whitespace stripped.
    Missing parts (None, NaN or blank) are skipped, so the result is
    "first last", just one of the two, or "" when neither is present.

    Parameters
    ----------
    fName, lName : scalar
        First and last name values; non-string values are converted with str().

    Returns
    -------
    str
    """
    def _cleanPart(part):
        # Test for missing values *before* str(): str(None) / str(nan) would
        # otherwise leak the literal text "None" / "nan" into the owner name.
        if part is None or pd.isnull(part):
            return ""
        return str(part).replace("*", "").strip()

    # Join the cleaned parts (not the raw inputs) so no stray whitespace survives.
    return " ".join(part for part in (_cleanPart(fName), _cleanPart(lName)) if part)

# Build the owner name for each record from its FIRST_NAME / LAST_NAME pair.
dfinPOD['in_AllocationOwner'] = dfinPOD.apply(
    lambda record: assignownerName(fName=record['FIRST_NAME'], lName=record['LAST_NAME']),
    axis=1,
)


import re
def cleanOwnerDataFunc(Val):
    """Normalise an owner name string.

    Removes the characters $ @ & . ; , / ) ( - , title-cases the text,
    collapses any run of spaces to a single space and trims both ends.

    Parameters
    ----------
    Val : str

    Returns
    -------
    str
    """
    # Raw string: the original "\)" was an invalid escape sequence in a normal string literal.
    Val = re.sub(r"[$@&.;,/)(-]", "", Val).title()
    # Removing punctuation can leave runs of 3+ spaces; collapse runs of any length.
    return re.sub(r" {2,}", " ", Val).strip()

# Clean every owner name, then list the distinct results for a visual check.
dfinPOD['in_AllocationOwner'] = dfinPOD['in_AllocationOwner'].map(cleanOwnerDataFunc)
dfinPOD['in_AllocationOwner'].unique()

array(['Mccook County', 'Harold Dykstra', 'Eltor Brenner', ...,
       'Robert F Richards', 'Tinisu Springs Llc', 'Mary G Madsen'],
      dtype=object)

In [6]:
#Creating Beneficial Use.
#Need to translate SD abbreviations to a workable format.

# South Dakota use-type abbreviation -> readable beneficial use name.
BenUseDict = {
"COM" : "Commercial",
"DOM" : "Domestic",
"FCP" : "Flood Control Permit",
"FWP" : "Fish And Wildlife Propagation",
"GEO" : "Geothermal",
"GWR" : "Ground Water Remediation",
"IND" : "Industrial",
"INS" : "Institutional",
"IRR" : "Irrigation",
"MUN" : "Municipal",
"REC" : "Recreation",
"RWS" : "Rural Water System",
"SHD" : "Suburban Housing Development"}

def retrieveBenUse(A, B, C, D):
    """Translate up to four use-type codes into one comma-separated string.

    Each code is converted with str(), stripped and looked up in BenUseDict.
    Blank codes and codes missing from the dictionary are skipped, so the
    result never starts with, ends with, or contains an empty ", " segment.

    Parameters
    ----------
    A, B, C, D : scalar
        Use-type codes, in the order they should appear in the output.

    Returns
    -------
    str
        Names joined with ", ", or "" when no code could be translated.
    """
    useNames = []
    for code in (A, B, C, D):
        # .get() replaces the bare try/except: only a missing key is expected here.
        useName = BenUseDict.get(str(code).strip(), "")
        if useName:
            useNames.append(useName)
    return ", ".join(useNames)

# Translate the use-type codes of each record into a readable list.
# NOTE(review): the input also has a USE_TYPE6 column that is not passed here — confirm that is intended.
dfinPOD['in_BeneficialUseCategory'] = dfinPOD.apply(
    lambda record: retrieveBenUse(
        A=record['USE_TYPE1'],
        B=record['USE_TYPE2'],
        C=record['USE_TYPE4'],
        D=record['USE_TYPE5'],
    ),
    axis=1,
)
dfinPOD['in_BeneficialUseCategory'].unique()

array(['Flood Control Permit', 'Irrigation', 'Municipal',
       'Municipal, Industrial, Rural Water System, Suburban Housing Development',
       'Industrial', 'Fish And Wildlife Propagation', 'Commercial',
       'Geothermal', 'Rural Water System', 'Suburban Housing Development',
       'Domestic, Municipal, Fish And Wildlife Propagation',
       'Commercial, Domestic', 'Irrigation, Commercial', 'Recreation',
       'Institutional', 'Irrigation, Industrial',
       'Suburban Housing Development, Commercial', 'Irrigation, Domestic',
       'Domestic', 'Recreation, Commercial',
       'Fish And Wildlife Propagation, Domestic',
       'Domestic, Recreation, Municipal, Industrial',
       'Recreation, Fish And Wildlife Propagation',
       'Municipal, Industrial',
       'Fish And Wildlife Propagation, Recreation',
       'Domestic, Fish And Wildlife Propagation',
       'Commercial, Recreation', 'Recreation, Domestic',
       'Municipal, Recreation', 'Ground Water Remediation',
       '

In [7]:
#Creating WaterSourceTypeCV field

# SOURCE code -> WaDE water source type name.
WSTypeDict = {
    "S": "Surface Water",
    "G": "Groundwater",
    "B": "Surface Water and Groundwater",
}

def retrieveWSType(colrowValue):
    """Return the water source type for a SOURCE code.

    The value is converted with str() and stripped before the lookup;
    blank or unrecognised codes give "".
    """
    sourceCode = str(colrowValue).strip()
    return WSTypeDict.get(sourceCode, "")

# Map each SOURCE code to its water source type and list the distinct results.
dfinPOD['in_WaterSourceTypeCV'] = dfinPOD['SOURCE'].map(retrieveWSType)
dfinPOD['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Groundwater', 'Surface Water and Groundwater',
       ''], dtype=object)

In [8]:
#Creating allocation status

# STATUS code -> allocation legal status name.
AlloStatusDict = {
    "CA": "Cancelled",
    "DF": "Deferred",
    "DN": "Denied",
    "FU": "Future Use",
    "HD": "Hold",
    "IP": "Incorporated",
    "LC": "License",
    "OC": "Owner Change",
    "PE": "Permit",
    "WI": "Withdrawn",
}

def retrieveStatus(colrowValue):
    """Return the legal status name for a STATUS code.

    Empty or null values give ""; otherwise the value is converted with
    str(), stripped and looked up, with "" for unrecognised codes.
    """
    if colrowValue == "" or pd.isnull(colrowValue):
        return ""
    statusCode = str(colrowValue).strip()
    return AlloStatusDict.get(statusCode, "")

# Map each STATUS code to its legal status name and list the distinct results.
dfinPOD['in_AllocationLegalStatusCV'] = dfinPOD['STATUS'].map(retrieveStatus)
dfinPOD['in_AllocationLegalStatusCV'].unique()

array(['Permit', 'License', 'Cancelled', '', 'Owner Change', 'Future Use',
       'Withdrawn', 'Denied', 'Incorporated', 'Deferred'], dtype=object)

In [9]:
# Show the first record to check the in_* columns added above.
dfinPOD.head(1)

Unnamed: 0,WaDEUUID,ReasonRemoved,IncompleteField,OID_,PERMIT_NO,LATITUDE,LONGITUDE,LAST_NAME,FIRST_NAME,ADDRESS2,CITY,STATE,ZIP,PLUS4,COUNTY_1,BASIN,HYDROUNIT1,PRIORDATE,STATUS,SOURCE,AQUIFER,MNG_UNIT,DIVERSION1,USE_TYPE1,USE_TYPE2,USE_TYPE4,USE_TYPE5,USE_TYPE6,PER_CFS,LIC_CFS,PER_ACRES,LIC_ACRES,CMPLTN_DTE,LIC_DTE,INSPT_DATE,INSPECTOR,METHODCODE,REFERENCE,ACCURACY,LINK,in_AllocationOwner,in_BeneficialUseCategory,in_WaterSourceTypeCV,in_AllocationLegalStatusCV
0,sdwr0,,,1,FC10-3,43.71384,-97.6078,MCCOOK COUNTY,,BOX 550,SALEM,SD,57058,550,MC,JR,10160010,12/8/1988 0:00,PE,S,,,WOLF CREEK,FCP,,,,,0.0,0.0,0.0,0.0,3/1/1994 0:00,,,,TRSOTH100,DIVERSION POINT,140,R:\work\wr\imaging\wrinfo\wr_div3\FC10-3.pdf,Mccook County,Flood Control Permit,Surface Water,Permit


In [10]:
# creating a usable native url
# Turn the Windows-style file path in LINK into a forward-slash path with the
# local drive prefix reduced to "/".
# regex=False is explicit because both patterns are literal text; a lone
# backslash is not a valid regular expression, so the call must not rely on
# the installed pandas version's default for `regex`.
dfinPOD['new_LINK'] = (
    dfinPOD['LINK']
    .str.replace('\\', "/", regex=False)
    .str.replace('R:/work/wr/imaging/wrinfo/', "/", regex=False)
)
dfinPOD['new_LINK']

0         /wr_div3/FC10-3.pdf
1         /wr_div3/FC11-3.pdf
2         /wr_div3/4059-3.pdf
3        /wr_div3/4054A-3.pdf
4        /wr_div3/US516-3.pdf
                 ...         
19233     /wr_div3/7226-3.pdf
19234     /wr_div2/2655-2.pdf
19235     /wr_div2/2655-2.pdf
19236     /wr_div2/2655-2.pdf
19237     /wr_div3/1931-3.pdf
Name: new_LINK, Length: 19238, dtype: object

In [11]:
# create output POD dataframe
# Columns are created in the order written here; scalar values are broadcast to every row.
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfinPOD['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "SDwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "SDwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "SDwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = dfinPOD['DIVERSION1'].str.title()
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = dfinPOD['in_WaterSourceTypeCV']

# Site Info
# (in_Geometry is already created above; the repeated assignment was removed)
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = dfinPOD['HYDROUNIT1']
df['in_Latitude'] = dfinPOD['LATITUDE']
df['in_Longitude'] = dfinPOD['LONGITUDE']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = ""
df['in_SiteNativeID'] = "POD" + dfinPOD['OID_'].replace("", 0).fillna(0).astype(int).astype(str)  # blank OID_ becomes "POD0"
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = ""
df['in_StateCV'] = "SD"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfinPOD['PER_CFS'].astype(float)
df['in_AllocationLegalStatusCV'] = dfinPOD['in_AllocationLegalStatusCV']
df['in_AllocationNativeID'] =  dfinPOD['PERMIT_NO'].replace("", 0).fillna(0).astype(str)  # blank PERMIT_NO becomes "0"
df['in_AllocationOwner'] = dfinPOD['in_AllocationOwner']
df['in_AllocationPriorityDate'] = dfinPOD['PRIORDATE']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = ""
# NOTE(review): PER_ACRES feeds the volume (AF) field — confirm the source column holds acre-feet rather than acres.
df['in_AllocationVolume_AF'] = dfinPOD['PER_ACRES'].astype(float)
df['in_BeneficialUseCategory'] = dfinPOD['in_BeneficialUseCategory']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = "https://danr.sd.gov/wrimage/wrinfo" + dfinPOD['new_LINK'].astype(str)

# Keep a de-duplicated copy with a fresh 0..n-1 index as the POD output.
outPOD = df.copy()
outPOD = outPOD.drop_duplicates().reset_index(drop=True)
print(len(outPOD))
outPOD.head()

19238


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,sdwr0,SDwr_M1,SDwr_V1,SDwr_O1,,,,Wolf Creek,,Surface Water,,,,4326,,,10160010,43.71384,-97.6078,,,POD,,POD1,,,SD,,,,,,,,,,0.0,Permit,FC10-3,Mccook County,12/8/1988 0:00,,,,,0.0,Flood Control Permit,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/FC1...
1,sdwr1,SDwr_M1,SDwr_V1,SDwr_O1,,,,Beaver Creek,,Surface Water,,,,4326,,,10170203,43.31191,-96.60393,,,POD,,POD2,,,SD,,,,,,,,,,0.0,License,FC11-3,Harold Dykstra,7/21/1989 0:00,,,,,0.0,Flood Control Permit,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/FC1...
2,sdwr10,SDwr_M1,SDwr_V1,SDwr_O1,,,,Groundwater,,Groundwater,,,,4326,,,10170101,43.3171,-98.19177,,,POD,,POD10006,,,SD,,,,,,,,,,2.4,Cancelled,4059-3,Eltor Brenner,5/18/1977 0:00,,,,,235.0,Irrigation,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/405...
3,sdwr100,SDwr_M1,SDwr_V1,SDwr_O1,,,,Groundwater,,Groundwater,,,,4326,,,10170201,45.08462,-97.48097,,,POD,,POD10088,,,SD,,,,,,,,,,1.0,License,4054A-3,Town Of Wallace,10/20/1976 0:00,,,,,0.0,Municipal,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/405...
4,sdwr1000,SDwr_M1,SDwr_V1,SDwr_O1,,,,Big Sioux River,,Surface Water,,,,4326,,,10170203,44.10029,-96.60642,,,POD,,POD10899,,,SD,,,,,,,,,,0.0,License,US516-3,Bia,4/30/1940 0:00,,,,,0.0,Irrigation,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/US5...


## Clean Output

In [12]:
# Concatenate dataframes
frames = [outPOD]  # just POD for now
# Stack the frames, drop exact duplicate rows, renumber the index and blank out NaN values.
outdf = (
    pd.concat(frames)
    .drop_duplicates()
    .reset_index(drop=True)
    .replace(np.nan, "")
)
print(len(outdf))

19238


In [13]:
# Clean name entries of special characters
def removeSpecialCharsFunc(Val):
    """Normalise a name string for output.

    Converts the value with str(), removes the characters $ @ & . ; / ) ( - ,
    title-cases the text, collapses any run of spaces to one space, and trims
    surrounding whitespace together with trailing commas.

    Parameters
    ----------
    Val : scalar

    Returns
    -------
    str
    """
    # Raw string: the original "\)" was an invalid escape sequence in a normal string literal.
    Val = re.sub(r"[$@&.;/)(-]", "", str(Val)).title()
    # Removing punctuation can leave runs of 3+ spaces; collapse runs of any length.
    Val = re.sub(r" {2,}", " ", Val)
    # Strip again after dropping trailing commas so "Name ," does not keep a trailing space.
    return Val.strip().rstrip(',').strip()

In [14]:
# Strip special characters from every owner name and list the distinct results.
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(removeSpecialCharsFunc)
outdf['in_AllocationOwner'].unique()

array(['Mccook County', 'Harold Dykstra', 'Eltor Brenner', ...,
       'Robert F Richards', 'Tinisu Springs Llc', 'Mary G Madsen'],
      dtype=object)

In [15]:
# Strip special characters from every site name and list the distinct results.
outdf['in_SiteName'] = outdf['in_SiteName'].map(removeSpecialCharsFunc)
outdf['in_SiteName'].unique()

array([''], dtype=object)

In [16]:
# Strip special characters from every water source name and list the distinct results.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(removeSpecialCharsFunc)
outdf['in_WaterSourceName'].unique()

array(['Wolf Creek', 'Beaver Creek', 'Groundwater', ...,
       'Mt Rushmore Spring', 'Cliff Gulch', 'Quinn Dam'], dtype=object)

In [17]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return a stripped string, mapping missing values to "".

    None, NaN/NaT/NA scalars, blank strings and the literal text "nan"
    all become "". Any other value is converted with str() and stripped.

    Parameters
    ----------
    val : scalar

    Returns
    -------
    str
    """
    # The null test has to run before str(): once stringified, None would read "None"
    # and the check could never match.
    if val is None or (pd.api.types.is_scalar(val) and pd.isnull(val)):
        return ""
    val = str(val).strip()
    return "" if val == "nan" else val

In [18]:
# Replace missing / "nan" water source names with "" and list the distinct results.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(ensureEmptyString)
outdf['in_WaterSourceName'].unique()

array(['Wolf Creek', 'Beaver Creek', 'Groundwater', ...,
       'Mt Rushmore Spring', 'Cliff Gulch', 'Quinn Dam'], dtype=object)

In [19]:
# Replace missing / "nan" water source types with "" and list the distinct results.
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].map(ensureEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Groundwater', 'Surface Water and Groundwater',
       ''], dtype=object)

In [20]:
# Replace missing / "nan" legal status values with "" and list the distinct results.
outdf['in_AllocationLegalStatusCV'] = outdf['in_AllocationLegalStatusCV'].map(ensureEmptyString)
outdf['in_AllocationLegalStatusCV'].unique()

array(['Permit', 'License', 'Cancelled', '', 'Owner Change', 'Future Use',
       'Withdrawn', 'Denied', 'Incorporated', 'Deferred'], dtype=object)

In [21]:
# Replace missing / "nan" owner names with "" and list the distinct results.
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(ensureEmptyString)
outdf['in_AllocationOwner'].unique()

array(['Mccook County', 'Harold Dykstra', 'Eltor Brenner', ...,
       'Robert F Richards', 'Tinisu Springs Llc', 'Mary G Madsen'],
      dtype=object)

In [22]:
# Replace missing / "nan" beneficial use values with "".
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(ensureEmptyString)
# Split every comma-separated entry into its individual uses and show the sorted distinct set.
uniqueList = sorted({
    use.strip()
    for use in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')
})
uniqueList

['',
 'Commercial',
 'Domestic',
 'Fish And Wildlife Propagation',
 'Flood Control Permit',
 'Geothermal',
 'Ground Water Remediation',
 'Industrial',
 'Institutional',
 'Irrigation',
 'Municipal',
 'Recreation',
 'Rural Water System',
 'Suburban Housing Development']

In [23]:
# Ensure Latitude entry is either numireic or a 0
# Latitude must be numeric: non-numeric entries and zeros are replaced with an empty string.
outdf['in_Latitude'] = (
    pd.to_numeric(outdf['in_Latitude'], errors='coerce')
    .replace(0, "")
    .fillna("")
)
outdf['in_Latitude'].unique()

array([43.71384, 43.31191, 43.3171 , ..., 42.72809, 44.05716, 43.34562])

In [24]:
# Ensure Longitude entry is either numireic or a 0
# Longitude must be numeric: non-numeric entries and zeros are replaced with an empty string.
outdf['in_Longitude'] = (
    pd.to_numeric(outdf['in_Longitude'], errors='coerce')
    .replace(0, "")
    .fillna("")
)
outdf['in_Longitude'].unique()

array([ -97.6078 ,  -96.60393,  -98.19177, ...,  -96.94936, -103.30181,
        -97.04509])

In [25]:
# Changing datatype of Priority Date to date fields entry
# Parse the priority date (unparseable values become NaT), then round-trip it through
# a month/day/year string so only the calendar date is kept.
outdf['in_AllocationPriorityDate'] = pd.to_datetime(
    pd.to_datetime(outdf['in_AllocationPriorityDate'], errors='coerce').dt.strftime('%m/%d/%Y')
)
outdf['in_AllocationPriorityDate'].unique()

<DatetimeArray>
['1988-12-08 00:00:00', '1989-07-21 00:00:00', '1977-05-18 00:00:00',
 '1976-10-20 00:00:00', '1940-04-30 00:00:00',                 'NaT',
 '2010-05-04 00:00:00', '2011-02-18 00:00:00', '2011-03-15 00:00:00',
 '2011-04-29 00:00:00',
 ...
 '1972-02-16 00:00:00', '1972-02-22 00:00:00', '1985-05-06 00:00:00',
 '1972-02-25 00:00:00', '1938-02-07 00:00:00', '1985-01-05 00:00:00',
 '1985-04-08 00:00:00', '1939-09-01 00:00:00', '1972-03-23 00:00:00',
 '1972-03-24 00:00:00']
Length: 6844, dtype: datetime64[ns]

In [26]:
# Ensure Flow entry is either numireic or a 0
# Flow must be numeric (rounded to 2 decimals): non-numeric entries and zeros become an empty string.
outdf['in_AllocationFlow_CFS'] = (
    pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce')
    .round(2)
    .replace(0, "")
    .fillna("")
)
outdf['in_AllocationFlow_CFS'].unique()

array(['', 2.4, 1.0, 0.83, 2.0, 0.47, 1.89, 1.1, 0.13, 4.36, 1.7, 3.0,
       4.29, 7.47, 1.66, 1.67, 1.14, 0.1, 6.67, 4.0, 0.06, 1.57, 0.02,
       3.56, 1.78, 1.97, 7.4, 3.12, 1.56, 14.4, 3.37, 5.33, 3.86, 2.22,
       0.07, 4.57, 6.86, 1.64, 3.14, 2.67, 1.77, 3.11, 3.33, 0.89, 1.23,
       0.18, 6.46, 4.33, 3.89, 7.87, 0.27, 0.45, 4.44, 3.18, 1.5, 3.34,
       1.33, 3.06, 0.05, 6.6, 3.81, 2.28, 1.42, 1.44, 3.42, 0.17, 2.44,
       0.08, 1.28, 0.67, 3.78, 0.92, 1.24, 3.68, 6.85, 9.19, 0.44, 2.2,
       7.85, 1.11, 3.04, 6.7, 0.04, 1.91, 0.57, 0.09, 1.39, 4.37, 8.85,
       3.55, 2.25, 3.36, 3.82, 7.77, 0.22, 1.48, 14.0, 0.11, 0.8, 3.79,
       4.28, 1.34, 1.94, 1.68, 2.14, 2.79, 1.22, 1.73, 0.12, 0.33, 0.29,
       0.03, 3.8, 1.53, 1.71, 4.4, 0.28, 4.05, 40.0, 3.5, 4.59, 416.0,
       0.5, 5.1, 0.71, 0.88, 0.86, 0.25, 0.37, 0.38, 0.95, 1.2, 2.68, 2.3,
       1.55, 0.55, 1.9, 0.91, 0.34, 3.45, 0.3, 0.4, 0.7, 15.0, 4.3, 5.58,
       25.0, 6.84, 431.71, 1.93, 2.9, 7.0, 1.01, 0.94, 2.76,

In [27]:
# Ensure Volume entry is either numireic or a 0
# Volume must be numeric (rounded to 2 decimals): non-numeric entries and zeros become an empty string.
outdf['in_AllocationVolume_AF'] = (
    pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce')
    .round(2)
    .replace(0, "")
    .fillna("")
)
outdf['in_AllocationVolume_AF'].unique()

array(['', 235.0, 165.0, ..., 2.25, 0.82, 90.2], dtype=object)

In [28]:
# Ensure Irrigated Acreage entry is either numireic or a 0
# Irrigated acreage must be numeric (rounded to 2 decimals): non-numeric entries and zeros become an empty string.
outdf['in_IrrigatedAcreage'] = (
    pd.to_numeric(outdf['in_IrrigatedAcreage'], errors='coerce')
    .round(2)
    .replace(0, "")
    .fillna("")
)
outdf['in_IrrigatedAcreage'].unique()

array([''], dtype=object)

In [29]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    """Return the WaDE custom ID text ("wadeId" + value) for a sequence number."""
    return "wadeId" + str(colRowValue)

# Distinct (name, type) combinations, in order of first appearance.
dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# Number the combinations 1..n.
dfTempID['in_WaterSourceNativeID'] = [assignIdValueFunc(n) for n in range(1, len(dfTempID) + 1)]

# Key the lookup on the (name, type) tuple. Joining the two strings with no separator
# lets different pairs collide, e.g. ("Groundwater", "") and ("", "Groundwater").
IdDict = dict(zip(
    zip(dfTempID['in_WaterSourceName'], dfTempID['in_WaterSourceTypeCV']),
    dfTempID['in_WaterSourceNativeID'],
))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    """Return checkVal when it is non-blank, otherwise the generated ID for (valA, valB).

    Parameters
    ----------
    checkVal : scalar
        Existing native ID; returned stripped when not blank.
    valA, valB : scalar
        Water source name and type; converted with str() and stripped to form the IdDict key.

    Raises
    ------
    KeyError
        If the (valA, valB) pair is not present in IdDict.
    """
    checkVal = str(checkVal).strip()
    if checkVal != "":
        return checkVal
    return IdDict[(str(valA).strip(), str(valB).strip())]

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_WaterSourceNativeID'],
                                                                              row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wadeId1', 'wadeId2', 'wadeId3', ..., 'wadeId1338', 'wadeId1339',
       'wadeId1340'], dtype=object)

In [30]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    """Return the WaDE custom ID text ("wadeId" + value) for a sequence number."""
    return "wadeId" + str(colRowValue)

# Distinct (latitude, longitude, site name, site type) combinations, in order of first appearance.
dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# Number the combinations 1..n.
dfTempID['in_SiteNativeID'] = [assignIdValueFunc(n) for n in range(1, len(dfTempID) + 1)]

# Key the lookup on the tuple of the four fields. Joining the strings with no
# separator lets different combinations produce the same key.
IdDict = dict(zip(
    zip(dfTempID['in_Latitude'], dfTempID['in_Longitude'],
        dfTempID['in_SiteName'], dfTempID['in_SiteTypeCV']),
    dfTempID['in_SiteNativeID'],
))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    """Return checkVal when it is non-blank, otherwise the generated ID for the four key values.

    Parameters
    ----------
    checkVal : scalar
        Existing native ID; returned stripped when not blank.
    valA, valB, valC, valD : scalar
        Latitude, longitude, site name and site type; converted with str() and
        stripped to form the IdDict key.

    Raises
    ------
    KeyError
        If the key is not present in IdDict.
    """
    checkVal = str(checkVal).strip()
    if checkVal != "":
        return checkVal
    return IdDict[(str(valA).strip(), str(valB).strip(), str(valC).strip(), str(valD).strip())]

outdf['in_SiteNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_SiteNativeID'],
                                                                       row['in_Latitude'], row['in_Longitude'],
                                                                       row['in_SiteName'], row['in_SiteTypeCV']), axis=1)
outdf['in_SiteNativeID'].unique()

array(['POD1', 'POD2', 'POD10006', ..., 'POD19529', 'POD19530', 'POD1954'],
      dtype=object)

## Drop non-Active AllocationLegalStatusCV Water Rights
- For SD, we don't want water rights that are considered: Cancelled, Denied, Withdrawn

In [31]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# drop the list
dropLegalStatusList = ["Cancelled", "Denied", "Withdrawn"]

# keep only the rows whose legal status is not in the list above, then renumber the index
outdf = outdf[~outdf['in_AllocationLegalStatusCV'].isin(dropLegalStatusList)].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

13313


array(['Permit', 'License', '', 'Owner Change', 'Future Use',
       'Incorporated', 'Deferred'], dtype=object)

## Export Outputs

In [32]:
# Column names, non-null counts and dtypes of the final frame, as a pre-export check.
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13313 entries, 0 to 13312
Data columns (total 63 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   WaDEUUID                                      13313 non-null  object        
 1   in_MethodUUID                                 13313 non-null  object        
 2   in_VariableSpecificUUID                       13313 non-null  object        
 3   in_OrganizationUUID                           13313 non-null  object        
 4   in_Geometry                                   13313 non-null  object        
 5   in_GNISFeatureNameCV                          13313 non-null  object        
 6   in_WaterQualityIndicatorCV                    13313 non-null  object        
 7   in_WaterSourceName                            13313 non-null  object        
 8   in_WaterSourceNativeID                        13313 non-null  obje

In [33]:
# Display the final frame (pandas truncates the middle rows) for a last visual check.
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,sdwr0,SDwr_M1,SDwr_V1,SDwr_O1,,,,Wolf Creek,wadeId1,Surface Water,,,,4326,,,10160010,43.71384,-97.60780,,,POD,,POD1,,,SD,,,,,,,,,,,Permit,FC10-3,Mccook County,1988-12-08,,,,,,Flood Control Permit,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/FC1...
1,sdwr1,SDwr_M1,SDwr_V1,SDwr_O1,,,,Beaver Creek,wadeId2,Surface Water,,,,4326,,,10170203,43.31191,-96.60393,,,POD,,POD2,,,SD,,,,,,,,,,,License,FC11-3,Harold Dykstra,1989-07-21,,,,,,Flood Control Permit,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/FC1...
2,sdwr100,SDwr_M1,SDwr_V1,SDwr_O1,,,,Groundwater,wadeId3,Groundwater,,,,4326,,,10170201,45.08462,-97.48097,,,POD,,POD10088,,,SD,,,,,,,,,,1.00000,License,4054A-3,Town Of Wallace,1976-10-20,,,,,,Municipal,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/405...
3,sdwr1000,SDwr_M1,SDwr_V1,SDwr_O1,,,,Big Sioux River,wadeId4,Surface Water,,,,4326,,,10170203,44.10029,-96.60642,,,POD,,POD10899,,,SD,,,,,,,,,,,License,US516-3,Bia,1940-04-30,,,,,,Irrigation,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/US5...
4,sdwr10000,SDwr_M1,SDwr_V1,SDwr_O1,,,,Rapid Creek,wadeId5,Surface Water,,,,4326,,,10120110,44.07250,-103.49129,,,POD,,POD19531,,,SD,,,,,,,,,,0.83000,License,2655-2,City Of Rapid City,NaT,,,,,,"Municipal, Industrial, Rural Water System, Sub...",,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div2/265...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13308,sdwr9995,SDwr_M1,SDwr_V1,SDwr_O1,,,,Groundwater,wadeId3,Groundwater,,,,4326,,,702001,45.30609,-97.03371,,,POD,,POD19527,,,SD,,,,,,,,,,0.44000,Permit,7226-3,Town Of Summit,2010-11-05,,,,,,Municipal,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/722...
13309,sdwr9996,SDwr_M1,SDwr_V1,SDwr_O1,,,,Rapid Creek,wadeId5,Surface Water,,,,4326,,,10120110,44.06983,-103.26905,,,POD,,POD19528,,,SD,,,,,,,,,,0.83000,License,2655-2,City Of Rapid City,NaT,,,,,,"Municipal, Industrial, Rural Water System, Sub...",,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div2/265...
13310,sdwr9997,SDwr_M1,SDwr_V1,SDwr_O1,,,,Rapid Creek,wadeId5,Surface Water,,,,4326,,,10120110,44.05716,-103.30181,,,POD,,POD19529,,,SD,,,,,,,,,,0.83000,License,2655-2,City Of Rapid City,NaT,,,,,,"Municipal, Industrial, Rural Water System, Sub...",,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div2/265...
13311,sdwr9998,SDwr_M1,SDwr_V1,SDwr_O1,,,,Rapid Creek,wadeId5,Surface Water,,,,4326,,,10120110,44.05707,-103.30246,,,POD,,POD19530,,,SD,,,,,,,,,,0.83000,License,2655-2,City Of Rapid City,NaT,,,,,,"Municipal, Industrial, Rural Water System, Sub...",,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div2/265...


In [34]:
# Export the output dataframe
# change output name / abbreviation to match native state provider and wade data type
outdf.to_csv(
    'RawInputData/Pwr_Main.zip',
    compression={'method': 'zip', 'archive_name': 'Pwr_Main.csv'},
    index=False,
)  # The output, save as a zip
#goutdf1.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.