# Pre-processing South Dakota Allocation data for WaDE upload.
Purpose: To pre-process the South Dakota data into one master file for simple DataFrame creation and extraction

Notes: N/A

In [1]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import datetime
# Notebook display settings: show up to 999 columns, and print floats with
# five fixed decimals instead of scientific notation.
pd.options.display.max_columns = 999
pd.options.display.float_format = lambda value: '%.5f' % value

In [2]:
# Working Directory
# The shared-drive path stays the default, but it can be overridden with the
# WADE_SD_RAW_INPUT_DIR environment variable so the notebook can run on a
# machine where the drive is mounted elsewhere.
workingDir = os.environ.get(
    "WADE_SD_RAW_INPUT_DIR",
    "G:/Shared drives/WaDE Data/SouthDakota/WaterAllocation/RawInputData",
)
os.chdir(workingDir)

## Data: waterights

In [3]:
# Input File: read the raw water-right records and turn missing values into empty strings.
fileInput = "waterights_input.csv"
dfinPOD = pd.read_csv(fileInput)
dfinPOD = dfinPOD.replace(np.nan, "")

# WaDE UUID tracker for data assessment: tag every row with "sdwr" + its row index
# (only when the column is not already present) and save a zipped copy.
# NOTE(review): the tagged copy is written to waterights_input.zip while this cell
# reads waterights_input.csv, so the saved WaDEUUID column is only picked up again
# if fileInput points at that archive - confirm this is intended.
if 'WaDEUUID' not in dfinPOD.columns:
    dfinPOD['WaDEUUID'] = "sdwr" + dfinPOD.index.astype(str)
    dfinPOD.to_csv(
        'waterights_input.zip',
        index=False,
        compression={'method': 'zip', 'archive_name': 'waterights_input.csv'},
    )

print(dfinPOD.shape[0])
dfinPOD.head(1)

  dfinPOD = pd.read_csv(fileInput).replace(np.nan, "")


19238


Unnamed: 0,ReasonRemoved,IncompleteField,OID_,PERMIT_NO,LATITUDE,LONGITUDE,LAST_NAME,FIRST_NAME,ADDRESS2,CITY,STATE,ZIP,PLUS4,COUNTY_1,BASIN,HYDROUNIT1,PRIORDATE,STATUS,SOURCE,AQUIFER,MNG_UNIT,DIVERSION1,USE_TYPE1,USE_TYPE2,USE_TYPE4,USE_TYPE5,USE_TYPE6,PER_CFS,LIC_CFS,PER_ACRES,LIC_ACRES,CMPLTN_DTE,LIC_DTE,INSPT_DATE,INSPECTOR,METHODCODE,REFERENCE,ACCURACY,LINK,WaDEUUID
0,,,1,FC10-3,43.71384,-97.6078,MCCOOK COUNTY,,BOX 550,SALEM,SD,57058,550,MC,JR,10160010,12/8/1988 0:00,PE,S,,,WOLF CREEK,FCP,,,,,0.0,0.0,0.0,0.0,3/1/1994 0:00,,,,TRSOTH100,DIVERSION POINT,140,R:\work\wr\imaging\wrinfo\wr_div3\FC10-3.pdf,sdwr0


In [4]:
# first & last name function
def assignownerName(fName, lName):
    """Combine a first and a last name into a single owner string.

    Each part has "*" characters removed and surrounding whitespace stripped.
    Missing parts (None, NaN or blank text) are skipped, so the result is
    "First Last", just one of the two, or "" when both are missing.

    Parameters
    ----------
    fName : first-name value (any scalar; converted with str()).
    lName : last-name value (any scalar; converted with str()).

    Returns
    -------
    str : the cleaned owner name.
    """
    def _cleanPart(part):
        # Test for real missing values *before* str(); afterwards NaN/None would
        # already be the text "nan"/"None" and could no longer be detected.
        if part is None or (pd.api.types.is_scalar(part) and pd.isnull(part)):
            return ""
        return str(part).replace("*", "").strip()

    # Join the cleaned (stripped) parts, not the raw inputs, so no stray
    # padding ends up inside the combined name.
    nameParts = [_cleanPart(fName), _cleanPart(lName)]
    return " ".join(part for part in nameParts if part != "")

# Build the owner name for every row from its FIRST_NAME / LAST_NAME pair.
dfinPOD['in_AllocationOwner'] = [
    assignownerName(firstName, lastName)
    for firstName, lastName in zip(dfinPOD['FIRST_NAME'], dfinPOD['LAST_NAME'])
]


import re
def cleanOwnerDataFunc(Val):
    """Normalise an owner-name string.

    Removes the punctuation characters $ @ & . ; , / ) ( -, collapses every
    run of whitespace to a single space, title-cases the text and strips
    leading/trailing whitespace.

    Parameters
    ----------
    Val : str, the raw owner name.

    Returns
    -------
    str : the cleaned owner name.
    """
    # Raw string: "\)" inside a normal string literal is an invalid escape sequence.
    Val = re.sub(r"[$@&.;,/\)(-]", "", Val)
    # Removing punctuation can leave runs of several spaces; a single
    # replace("  ", " ") pass would only halve them, so collapse them all.
    Val = re.sub(r"\s+", " ", Val)
    return Val.title().strip()

# Clean every combined owner name, then list the distinct results.
dfinPOD['in_AllocationOwner'] = dfinPOD['in_AllocationOwner'].map(cleanOwnerDataFunc)
dfinPOD['in_AllocationOwner'].unique()

array(['Mccook County', 'Harold Dykstra', 'Meridian Minerals Co', ...,
       'Ambrose Heimer', 'Ray Mason', 'Charles Capp'], dtype=object)

In [5]:
# Creating Beneficial Use.
# Need to translate SD abbreviations to a workable format.

BenUseDict = {
"COM" : "Commercial",
"DOM" : "Domestic",
"FCP" : "Flood Control Permit",
"FWP" : "Fish And Wildlife Propagation",
"GEO" : "Geothermal",
"GWR" : "Ground Water Remediation",
"IND" : "Industrial",
"INS" : "Institutional",
"IRR" : "Irrigation",
"MUN" : "Municipal",
"REC" : "Recreation",
"RWS" : "Rural Water System",
"SHD" : "Suburban Housing Development"}

def retrieveBenUse(A, B, C, D):
    """Translate up to four use-type codes into one comma-separated label.

    Each argument is converted with str(), stripped, and looked up in
    BenUseDict. Blank or unrecognised codes are skipped entirely, so they
    never leave a leading, trailing or doubled ", " in the result.

    Parameters
    ----------
    A, B, C, D : use-type codes (e.g. "IRR"); may be blank.

    Returns
    -------
    str : the matched names joined with ", " in argument order, or "" when
    none of the codes is recognised.
    """
    benUseNames = []
    for useCode in (A, B, C, D):
        useName = BenUseDict.get(str(useCode).strip(), "")
        if useName != "":
            benUseNames.append(useName)
    return ", ".join(benUseNames)

# Translate the use-type codes of every row into a readable beneficial-use label.
# NOTE(review): the input also has a USE_TYPE6 column that is not passed here -
# confirm whether it should contribute to the category.
dfinPOD['in_BeneficialUseCategory'] = [
    retrieveBenUse(use1, use2, use4, use5)
    for use1, use2, use4, use5 in zip(
        dfinPOD['USE_TYPE1'],
        dfinPOD['USE_TYPE2'],
        dfinPOD['USE_TYPE4'],
        dfinPOD['USE_TYPE5'],
    )
]
dfinPOD['in_BeneficialUseCategory'].unique()

array(['Flood Control Permit', 'Irrigation', 'Commercial, Industrial',
       'Industrial', 'Municipal', 'Rural Water System',
       'Irrigation, Rural Water System, Fish And Wildlife Propagation, Domestic',
       'Commercial', 'Commercial, Domestic',
       'Fish And Wildlife Propagation, Domestic',
       'Commercial, Recreation', 'Ground Water Remediation',
       'Fish And Wildlife Propagation', 'Recreation',
       'Suburban Housing Development, Commercial', 'Irrigation, Domestic',
       'Domestic', 'Suburban Housing Development',
       'Municipal, Industrial, Rural Water System, Suburban Housing Development',
       'Recreation, Domestic', 'Domestic, Commercial',
       'Fish And Wildlife Propagation, Recreation',
       'Fish And Wildlife Propagation, Irrigation',
       'Recreation, Municipal', 'Industrial, Domestic',
       'Municipal, Industrial, Suburban Housing Development, Commercial',
       'Commercial, Irrigation', 'Geothermal', 'Institutional',
       'Domestic, Re

In [6]:
# Creating WaterSourceTypeCV field

WSTypeDict = {
    "S": "Surface Water",
    "G": "Groundwater",
    "B": "Surface Water and Groundwater",
}

def retrieveWSType(colrowValue):
    """Map a SOURCE code ("S", "G" or "B") to its water source type name.

    The value is converted with str() and stripped first; blank or
    unrecognised codes give an empty string.
    """
    sourceCode = str(colrowValue).strip()
    return WSTypeDict.get(sourceCode, "")

# Translate each SOURCE code, then list the distinct water source types.
dfinPOD['in_WaterSourceTypeCV'] = dfinPOD['SOURCE'].map(retrieveWSType)
dfinPOD['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Groundwater', 'Surface Water and Groundwater',
       ''], dtype=object)

In [7]:
# Creating allocation status

AlloStatusDict = {
    "CA": "Cancelled",
    "DF": "Deferred",
    "DN": "Denied",
    "FU": "Future Use",
    "HD": "Hold",
    "IP": "Incorporated",
    "LC": "License",
    "OC": "Owner Change",
    "PE": "Permit",
    "WI": "Withdrawn",
}

def retrieveStatus(colrowValue):
    """Map a STATUS code (e.g. "PE") to its legal status name.

    Empty or null input, and codes missing from AlloStatusDict, all give an
    empty string. The code is stripped before the lookup.
    """
    if colrowValue == "" or pd.isnull(colrowValue):
        return ""
    statusCode = str(colrowValue).strip()
    return AlloStatusDict.get(statusCode, "")

# Translate each STATUS code, then list the distinct legal statuses.
dfinPOD['in_AllocationLegalStatusCV'] = dfinPOD['STATUS'].map(retrieveStatus)
dfinPOD['in_AllocationLegalStatusCV'].unique()

array(['Permit', 'License', 'Cancelled', 'Deferred', 'Denied',
       'Withdrawn', '', 'Future Use', 'Incorporated', 'Owner Change'],
      dtype=object)

In [8]:
# Preview the first input row, now including the derived in_* columns.
dfinPOD.head(1)

Unnamed: 0,ReasonRemoved,IncompleteField,OID_,PERMIT_NO,LATITUDE,LONGITUDE,LAST_NAME,FIRST_NAME,ADDRESS2,CITY,STATE,ZIP,PLUS4,COUNTY_1,BASIN,HYDROUNIT1,PRIORDATE,STATUS,SOURCE,AQUIFER,MNG_UNIT,DIVERSION1,USE_TYPE1,USE_TYPE2,USE_TYPE4,USE_TYPE5,USE_TYPE6,PER_CFS,LIC_CFS,PER_ACRES,LIC_ACRES,CMPLTN_DTE,LIC_DTE,INSPT_DATE,INSPECTOR,METHODCODE,REFERENCE,ACCURACY,LINK,WaDEUUID,in_AllocationOwner,in_BeneficialUseCategory,in_WaterSourceTypeCV,in_AllocationLegalStatusCV
0,,,1,FC10-3,43.71384,-97.6078,MCCOOK COUNTY,,BOX 550,SALEM,SD,57058,550,MC,JR,10160010,12/8/1988 0:00,PE,S,,,WOLF CREEK,FCP,,,,,0.0,0.0,0.0,0.0,3/1/1994 0:00,,,,TRSOTH100,DIVERSION POINT,140,R:\work\wr\imaging\wrinfo\wr_div3\FC10-3.pdf,sdwr0,Mccook County,Flood Control Permit,Surface Water,Permit


In [9]:
# creating a usable native url
# Both patterns are literal text, so regex=False is passed explicitly: the default
# of `regex` differs between pandas versions, and a lone backslash is not a valid
# regular expression. Backslashes become forward slashes, then the local imaging
# root is reduced to a leading "/".
dfinPOD['new_LINK'] = (
    dfinPOD['LINK']
    .str.replace('\\', "/", regex=False)
    .str.replace('R:/work/wr/imaging/wrinfo/', "/", regex=False)
)
dfinPOD['new_LINK']

  dfinPOD['new_LINK'] = dfinPOD['LINK'].str.replace('\\', "/").str.replace('R:/work/wr/imaging/wrinfo/', "/")


0        /wr_div3/FC10-3.pdf
1        /wr_div3/FC11-3.pdf
2         /wr_div3/FC7-3.pdf
3        /wr_div3/1017-3.pdf
4        /wr_div2/1422-2.pdf
                ...         
19233     /wr_div1/697-1.pdf
19234     /wr_div1/707-1.pdf
19235     /wr_div1/717-1.pdf
19236     /wr_div1/753-1.pdf
19237     /wr_div1/837-1.pdf
Name: new_LINK, Length: 19238, dtype: object

In [10]:
# create output POD dataframe
# Columns are added one at a time; the order of these assignments is the column
# order of the output. Plain strings / numbers are constants applied to every row.
df = pd.DataFrame()

# Data Assessment UUID
# Assigned first so the frame takes its row index from dfinPOD before any
# constant-valued column is added.
df['WaDEUUID'] = dfinPOD['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "SDwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "SDwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "SDwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = dfinPOD['DIVERSION1'].str.title()
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = dfinPOD['in_WaterSourceTypeCV']

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
# in_Geometry was already created in the WaterSource section; this second
# assignment only overwrites it with the same value and keeps its position.
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = dfinPOD['HYDROUNIT1']
df['in_Latitude'] = dfinPOD['LATITUDE']
df['in_Longitude'] = dfinPOD['LONGITUDE']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = ""
df['in_SiteNativeID'] = "" # auto fill in below
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = ""
df['in_StateCV'] = "SD"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
# astype(float) raises on non-numeric text, including the empty strings that
# replaced missing values when the input was loaded.
df['in_AllocationFlow_CFS'] = dfinPOD['PER_CFS'].astype(float)
df['in_AllocationLegalStatusCV'] = dfinPOD['in_AllocationLegalStatusCV']
# Blank / missing permit numbers become the text "0".
df['in_AllocationNativeID'] =  dfinPOD['PERMIT_NO'].replace("", 0).fillna(0).astype(str)
df['in_AllocationOwner'] = dfinPOD['in_AllocationOwner']
df['in_AllocationPriorityDate'] = dfinPOD['PRIORDATE']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = ""
# NOTE(review): the source column is named PER_ACRES but is loaded into the
# acre-feet volume field while in_IrrigatedAcreage stays blank - confirm the units.
df['in_AllocationVolume_AF'] = dfinPOD['PER_ACRES'].astype(float)
df['in_BeneficialUseCategory'] = dfinPOD['in_BeneficialUseCategory']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
# Base URL + the relative path built in new_LINK; a row whose new_LINK is empty
# ends up with just the base URL.
df['in_WaterAllocationNativeURL'] = "https://danr.sd.gov/wrimage/wrinfo" + dfinPOD['new_LINK'].astype(str)

# Remove exact duplicate rows, renumber the index, and blank out remaining NaN values.
df = df.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(df))
df.head()

19238


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,sdwr0,SDwr_M1,SDwr_V1,SDwr_O1,,,,Wolf Creek,,Surface Water,,,,4326,,,10160010,43.71384,-97.6078,,,POD,,,,,SD,,,,,,,,,,0.0,Permit,FC10-3,Mccook County,12/8/1988 0:00,,,,,0.0,Flood Control Permit,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/FC1...
1,sdwr1,SDwr_M1,SDwr_V1,SDwr_O1,,,,Beaver Creek,,Surface Water,,,,4326,,,10170203,43.31191,-96.60393,,,POD,,,,,SD,,,,,,,,,,0.0,License,FC11-3,Harold Dykstra,7/21/1989 0:00,,,,,0.0,Flood Control Permit,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/FC1...
2,sdwr2,SDwr_M1,SDwr_V1,SDwr_O1,,,,Dry Draw,,Surface Water,,,,4326,,,10170203,43.63269,-96.56461,,,POD,,,,,SD,,,,,,,,,,0.0,Cancelled,FC7-3,Meridian Minerals Co,12/29/1986 0:00,,,,,0.0,Flood Control Permit,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/FC7...
3,sdwr3,SDwr_M1,SDwr_V1,SDwr_O1,,,,Groundwater,,Groundwater,,,,4326,,,10170202,44.42416,-96.90175,,,POD,,,,,SD,,,,,,,,,,1.797,License,1017-3,Natasha Swier,2/11/1963 0:00,,,,,125.0,Irrigation,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/101...
4,sdwr4,SDwr_M1,SDwr_V1,SDwr_O1,,,,Bull Creek Tributary,,Surface Water,,,,4326,,,10140101,43.26114,-99.58091,,,POD,,,,,SD,,,,,,,,,,4.0,License,1422-2,Gerald E Gergen,8/20/1976 0:00,,,,,279.9,Irrigation,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div2/142...


## Clean Output

In [11]:
# Concatenate dataframes, drop exact duplicate rows, renumber the index and
# replace any NaN with an empty string.
frames = [df] # just POD for now
outdf = (
    pd.concat(frames)
    .drop_duplicates()
    .reset_index(drop=True)
    .replace(np.nan, "")
)
print(outdf.shape[0])

19238


In [12]:
# Ensure Empty String

def ensureEmptyString(val):
    """Return val as stripped text, with every kind of missing value as "".

    None, NaN, NaT and other pandas nulls, blank or whitespace-only text, and
    the literal text "nan" all give "". Anything else is returned as
    str(val).strip().
    """
    # Test for real nulls before str(): afterwards None / NaT would already be
    # the text "None" / "NaT" and would slip through as data.
    if val is None or (pd.api.types.is_scalar(val) and pd.isnull(val)):
        return ""
    outString = str(val).strip()
    if outString == "nan":
        return ""
    return outString

In [13]:
# Normalise blank / missing water source names to "" and list the distinct values.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(ensureEmptyString)
outdf['in_WaterSourceName'].unique()

array(['Wolf Creek', 'Beaver Creek', 'Dry Draw', ..., 'Lake',
       'W Branch Bull Creek', 'Unnamed Dry Creek'], dtype=object)

In [14]:
# Normalise blank / missing water source types to "" and list the distinct values.
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].map(ensureEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Groundwater', 'Surface Water and Groundwater',
       ''], dtype=object)

In [15]:
# Normalise blank / missing legal statuses to "" and list the distinct values.
outdf['in_AllocationLegalStatusCV'] = outdf['in_AllocationLegalStatusCV'].map(ensureEmptyString)
outdf['in_AllocationLegalStatusCV'].unique()

array(['Permit', 'License', 'Cancelled', 'Deferred', 'Denied',
       'Withdrawn', '', 'Future Use', 'Incorporated', 'Owner Change'],
      dtype=object)

In [16]:
# Normalise blank / missing owner names to "" and list the distinct values.
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(ensureEmptyString)
outdf['in_AllocationOwner'].unique()

array(['Mccook County', 'Harold Dykstra', 'Meridian Minerals Co', ...,
       'Ambrose Heimer', 'Ray Mason', 'Charles Capp'], dtype=object)

In [17]:
# Normalise blank / missing beneficial-use labels to "" and list the distinct values.
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(ensureEmptyString)
outdf['in_BeneficialUseCategory'].unique()

array(['Flood Control Permit', 'Irrigation', 'Commercial, Industrial',
       'Industrial', 'Municipal', 'Rural Water System',
       'Irrigation, Rural Water System, Fish And Wildlife Propagation, Domestic',
       'Commercial', 'Commercial, Domestic',
       'Fish And Wildlife Propagation, Domestic',
       'Commercial, Recreation', 'Ground Water Remediation',
       'Fish And Wildlife Propagation', 'Recreation',
       'Suburban Housing Development, Commercial', 'Irrigation, Domestic',
       'Domestic', 'Suburban Housing Development',
       'Municipal, Industrial, Rural Water System, Suburban Housing Development',
       'Recreation, Domestic', 'Domestic, Commercial',
       'Fish And Wildlife Propagation, Recreation',
       'Fish And Wildlife Propagation, Irrigation',
       'Recreation, Municipal', 'Industrial, Domestic',
       'Municipal, Industrial, Suburban Housing Development, Commercial',
       'Commercial, Irrigation', 'Geothermal', 'Institutional',
       'Domestic, Re

In [18]:
# in_Latitude: values that cannot be parsed as numbers become NaN and are then
# stored as empty strings.
outdf['in_Latitude'] = (
    outdf['in_Latitude']
    .pipe(pd.to_numeric, errors='coerce')
    .fillna("")
)
outdf['in_Latitude'].unique()

array([43.71384, 43.31191, 43.63269, ..., 44.98889, 44.83907, 45.15978])

In [19]:
# in_Longitude: values that cannot be parsed as numbers become NaN and are then
# stored as empty strings.
outdf['in_Longitude'] = (
    outdf['in_Longitude']
    .pipe(pd.to_numeric, errors='coerce')
    .fillna("")
)
outdf['in_Longitude'].unique()

array([ -97.6078 ,  -96.60393,  -96.56461, ..., -101.9765 , -101.76175,
       -101.93783])

In [20]:
# Update datatype of Priority Date to fit WaDE 2.0 structure:
# parse the raw text, format it as month/day/year (dropping the time of day),
# then parse that text again so the column holds datetime values.
outdf['in_AllocationPriorityDate'] = pd.to_datetime(
    pd.to_datetime(outdf['in_AllocationPriorityDate']).dt.strftime('%m/%d/%Y')
)
outdf['in_AllocationPriorityDate'].unique()

array(['1988-12-08T00:00:00.000000000', '1989-07-21T00:00:00.000000000',
       '1986-12-29T00:00:00.000000000', ...,
       '1997-02-06T00:00:00.000000000', '1961-05-23T00:00:00.000000000',
       '1963-05-13T00:00:00.000000000'], dtype='datetime64[ns]')

In [21]:
# Fixing in_AllocationFlow_CFS datatype: coerce to numeric, then store both
# zero and unparseable / missing values as empty strings.
outdf['in_AllocationFlow_CFS'] = (
    outdf['in_AllocationFlow_CFS']
    .pipe(pd.to_numeric, errors='coerce')
    .replace(0, "")
    .fillna("")
)
outdf['in_AllocationFlow_CFS'].unique()

array(['', 1.797, 4.0, 0.55, 2.05, 0.6, 1.57, 2.0, 2.4, 2.1, 2.24, 2.28,
       5.05, 4.57, 4.4, 0.3, 0.86, 2.5, 0.33, 7.06, 0.62, 0.9, 2.04, 3.94,
       2.56, 1.72, 0.06, 0.85, 17.7, 3.43, 6.75, 2.66, 2.2, 1.18, 1.21,
       3.0, 0.56, 3.71, 0.03, 0.09, 4.3, 1.0, 0.73, 8.0, 3.5, 0.055, 4.6,
       0.006, 1.94, 0.44, 0.044, 1.85, 0.035, 0.1, 0.25, 0.222, 0.166,
       0.022, 0.05, 0.93, 0.355, 0.045, 0.033, 0.47, 0.71, 0.155, 1.1,
       0.333, 0.18, 1.77, 2.8, 0.57, 1.66, 1.81, 0.4, 0.07, 1.9, 2.11,
       2.12, 0.11, 2.23, 0.59, 0.27, 0.22, 1.67, 1.89, 1.78, 0.15, 0.085,
       2.89, 1.4, 2.9, 1.14, 3.44, 0.37, 7.33, 0.311, 5.52, 2.44, 0.666,
       0.75, 0.456, 1.79, 1.43, 1.33, 2.55, 1.84, 2.22, 10.0, 0.071,
       0.891, 0.58, 5.7, 1.86, 1.11, 3.34, 1.56, 2.67, 3.9, 0.67, 2.87,
       1.45, 0.46, 0.167, 1.28, 0.156, 0.04, 0.51, 2.14, 8.08, 0.66,
       0.266, 0.444, 0.28, 0.098, 0.031, 1.64, 0.111, 0.144, 0.08, 0.067,
       0.011, 1.06, 0.45, 1.47, 0.133, 0.077, 16.5, 0.089, 0.1

In [22]:
# Fixing in_AllocationVolume_AF datatype: coerce to numeric, then store both
# zero and unparseable / missing values as empty strings.
outdf['in_AllocationVolume_AF'] = (
    outdf['in_AllocationVolume_AF']
    .pipe(pd.to_numeric, errors='coerce')
    .replace(0, "")
    .fillna("")
)
outdf['in_AllocationVolume_AF'].unique()

array(['', 125.0, 279.9, ..., 111.7, 159.4, 89.95], dtype=object)

In [23]:
%%time
# Creating WaDE Custom site native ID for easy site identification
# ----------------------------------------------------------------------------------------------------

# Create temp SiteNativeID dataframe of unique site.
def assignSiteUUID(colrowValue):
    """Return the custom site ID: "wadeID" followed by str(colrowValue)."""
    return f"wadeID{colrowValue!s}"

# One row per distinct (latitude, longitude) pair; the first occurrence is kept.
dfSiteNativeID = pd.DataFrame({
    'in_Latitude': outdf['in_Latitude'],
    'in_Longitude': outdf['in_Longitude'],
}).drop_duplicates()

# Number the distinct sites 1..N in their existing order and turn each number into an ID.
dftemp = pd.DataFrame(index=dfSiteNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfSiteNativeID['in_SiteNativeID'] = dftemp["Count"].map(assignSiteUUID)

# Lookup key: the latitude text immediately followed by the longitude text.
latitudeText = dfSiteNativeID['in_Latitude'].astype(str)
dfSiteNativeID['linkKey'] = latitudeText + dfSiteNativeID['in_Longitude'].astype(str)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
# Lookup table: linkKey (latitude text + longitude text) -> in_SiteNativeID.
SiteNativeIDdict = dict(zip(dfSiteNativeID['linkKey'].astype(str), dfSiteNativeID['in_SiteNativeID'].values))
def retrieveSiteNativeID(A, B):
    """Look up the custom site ID for latitude A and longitude B.

    Returns '' when both values are empty strings, when both are null, or
    when the combined key is not in SiteNativeIDdict.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''
    linkKey = str(A).strip() + str(B).strip()
    return SiteNativeIDdict.get(linkKey, '')

# Attach the custom site ID to every row via its latitude / longitude pair.
outdf['in_SiteNativeID'] = [
    retrieveSiteNativeID(latitude, longitude)
    for latitude, longitude in zip(outdf['in_Latitude'], outdf['in_Longitude'])
]
outdf['in_SiteNativeID'].unique()

Wall time: 335 ms


array(['wadeID1', 'wadeID2', 'wadeID3', ..., 'wadeID16521', 'wadeID16522',
       'wadeID16523'], dtype=object)

In [24]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source ID: "wadeID" followed by str(colrowValue)."""
    return f"wadeID{colrowValue!s}"

# One row per distinct (water source name, water source type) pair; the first occurrence is kept.
dfWaterSourceNativeID = pd.DataFrame({
    'in_WaterSourceName': outdf['in_WaterSourceName'],
    'in_WaterSourceTypeCV': outdf['in_WaterSourceTypeCV'],
}).drop_duplicates()

# Number the distinct water sources 1..N in their existing order and turn each number into an ID.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp["Count"].map(assignWaterSourceNativeID)

# Lookup key: the source name immediately followed by the source type.
sourceNameText = dfWaterSourceNativeID['in_WaterSourceName'].astype(str)
dfWaterSourceNativeID['linkKey'] = sourceNameText + dfWaterSourceNativeID['in_WaterSourceTypeCV'].astype(str)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
# Lookup table: linkKey (source name + source type) -> in_WaterSourceNativeID.
WaterSourceNativeIDdict = dict(zip(dfWaterSourceNativeID['linkKey'].astype(str), dfWaterSourceNativeID['in_WaterSourceNativeID'].values))
def retrieveWaterSourceNativeID(A, B):
    """Look up the custom water source ID for source name A and source type B.

    Returns '' when both values are empty strings, when both are null, or
    when the combined key is not in WaterSourceNativeIDdict.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''
    linkKey = str(A).strip() + str(B).strip()
    return WaterSourceNativeIDdict.get(linkKey, '')

# Attach the custom water source ID to every row via its name / type pair.
outdf['in_WaterSourceNativeID'] = [
    retrieveWaterSourceNativeID(sourceName, sourceType)
    for sourceName, sourceType in zip(outdf['in_WaterSourceName'], outdf['in_WaterSourceTypeCV'])
]
outdf['in_WaterSourceNativeID'].unique()

array(['wadeID1', 'wadeID2', 'wadeID3', ..., 'wadeID1338', 'wadeID1339',
       'wadeID1340'], dtype=object)

## Drop non-Active AllocationLegalStatusCV Water Rights
- For SD, we don't want water rights that are considered: Cancelled, Denied, Withdrawn

In [25]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# legal statuses to remove
dropLegalStatusList = ["Cancelled", "Denied", "Withdrawn"]

# keep only the rows whose status is NOT in the list, then renumber the index
outdf = outdf[~outdf['in_AllocationLegalStatusCV'].isin(dropLegalStatusList)].reset_index(drop=True)

print(outdf.shape[0])
outdf['in_AllocationLegalStatusCV'].unique()

13313


array(['Permit', 'License', 'Deferred', '', 'Future Use', 'Incorporated',
       'Owner Change'], dtype=object)

## Export Outputs

In [26]:
# Print the dtype of every output column with the row / column display limits lifted.
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    print(outdf.dtypes)

WaDEUUID                                                object
in_MethodUUID                                           object
in_VariableSpecificUUID                                 object
in_OrganizationUUID                                     object
in_Geometry                                             object
in_GNISFeatureNameCV                                    object
in_WaterQualityIndicatorCV                              object
in_WaterSourceName                                      object
in_WaterSourceNativeID                                  object
in_WaterSourceTypeCV                                    object
in_CoordinateAccuracy                                   object
in_CoordinateMethodCV                                   object
in_County                                               object
in_EPSGCodeCV                                            int64
in_GNISCodeCV                                           object
in_HUC12                                               

In [27]:
# Display the final output frame for a visual check before export.
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,sdwr0,SDwr_M1,SDwr_V1,SDwr_O1,,,,Wolf Creek,wadeID1,Surface Water,,,,4326,,,10160010,43.71384,-97.60780,,,POD,,wadeID1,,,SD,,,,,,,,,,,Permit,FC10-3,Mccook County,1988-12-08,,,,,,Flood Control Permit,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/FC1...
1,sdwr1,SDwr_M1,SDwr_V1,SDwr_O1,,,,Beaver Creek,wadeID2,Surface Water,,,,4326,,,10170203,43.31191,-96.60393,,,POD,,wadeID2,,,SD,,,,,,,,,,,License,FC11-3,Harold Dykstra,1989-07-21,,,,,,Flood Control Permit,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/FC1...
2,sdwr3,SDwr_M1,SDwr_V1,SDwr_O1,,,,Groundwater,wadeID4,Groundwater,,,,4326,,,10170202,44.42416,-96.90175,,,POD,,wadeID4,,,SD,,,,,,,,,,1.79700,License,1017-3,Natasha Swier,1963-02-11,,,,,125.00000,Irrigation,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/101...
3,sdwr4,SDwr_M1,SDwr_V1,SDwr_O1,,,,Bull Creek Tributary,wadeID5,Surface Water,,,,4326,,,10140101,43.26114,-99.58091,,,POD,,wadeID5,,,SD,,,,,,,,,,4.00000,License,1422-2,Gerald E Gergen,1976-08-20,,,,,279.90000,Irrigation,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div2/142...
4,sdwr20,SDwr_M1,SDwr_V1,SDwr_O1,,,,James River,wadeID11,Surface Water,,,,4326,,,10160003,45.53910,-98.10536,,,POD,,wadeID21,,,SD,,,,,,,,,,,Deferred,FC15-3,Lower Crow Creek Improvement,NaT,,,,,,Flood Control Permit,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/FC1...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13308,sdwr19224,SDwr_M1,SDwr_V1,SDwr_O1,,,,Runoff,wadeID25,Surface Water,,,,4326,,,10170203,43.51808,-96.57607,,,POD,,wadeID16511,,,SD,,,,,,,,,,0.56000,License,5996-3,L G Everist Inc,1997-04-21,,,,,,Commercial,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div3/599...
13309,sdwr19225,SDwr_M1,SDwr_V1,SDwr_O1,,,,Groundwater,wadeID4,Groundwater,,,,4326,,,10120202,44.60921,-103.63923,,,POD,,wadeID16512,,,SD,,,,,,,,,,0.89000,License,1650-1,Foothill Land Cattle Llc,1997-04-03,,,,,,"Commercial, Irrigation",,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div1/165...
13310,sdwr19226,SDwr_M1,SDwr_V1,SDwr_O1,,,,Groundwater,wadeID4,Groundwater,,,,4326,,,10120111,44.19641,-103.35057,,,POD,,wadeID16513,,,SD,,,,,,,,,,0.02200,License,1651-1,Stagebarn Housing Center,1997-05-16,,,,,,Commercial,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div1/165...
13311,sdwr19227,SDwr_M1,SDwr_V1,SDwr_O1,,,,Dry Draw,wadeID3,Surface Water,,,,4326,,,10120203,44.40136,-103.84815,,,POD,,wadeID16514,,,SD,,,,,,,,,,,License,1422-1,Lac Minerals Usa Llc,1987-10-01,,,,,,Industrial,,,,,,0,,,,,,,,,,https://danr.sd.gov/wrimage/wrinfo/wr_div1/142...


In [28]:
# Export the output dataframe as Pwr_sdMain.csv inside Pwr_sdMain.zip (no index column).
outdf.to_csv(
    'Pwr_sdMain.zip',
    index=False,
    compression={'method': 'zip', 'archive_name': 'Pwr_sdMain.csv'},
)  # The output, save as a zip
#dfPoUshape.to_csv('P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.