# Pre-processing North Dakota Allocation data for WaDEQA upload.
Date Updated: 03/28/2023
Purpose:  To pre-process the ND data into one master file for simple DataFrame creation and extraction

In [1]:
#Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x) # suppress scientific notation in Pandas

In [2]:
# Working Directory
# All relative file reads/writes in the cells below resolve against this folder.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
workingDir = "G:/Shared drives/WaDE Data/NorthDakota/WaterAllocation/RawInputData"
os.chdir(workingDir)

## POD Data: Permit

In [3]:
# Input File
# Read the zipped permit/POD CSV; missing values are replaced with empty strings.
dfinPOD = pd.read_csv("Permits_input.zip", encoding = "ISO-8859-1", compression='zip').replace(np.nan, "")

# WaDE UUID tracker for data assessment
# Only runs when the input has no WaDEUUID column yet: ids are "ndwr" + row index.
if 'WaDEUUID' not in dfinPOD:
    # Use dfinPOD's own index here; the output frame `df` is not created until a later cell.
    dfinPOD['WaDEUUID'] = "ndwr" + dfinPOD.index.astype(str)
    # NOTE(review): this writes Permits_input.csv while the read above uses Permits_input.zip,
    # so the saved ids are only picked up on re-run if the zip is rebuilt from this csv -- confirm.
    dfinPOD.to_csv('Permits_input.csv', index=False)

print(len(dfinPOD))
dfinPOD.head(1)

12181


Unnamed: 0,WaDEUUID,ReasonRemoved,IncompleteField,permit_ind,permit_num,pod_index,pod,permit_hol,priority_d,use_type,status,date_issue,date_cance,beneficial,county,hu_sub_bas,aquifer,subaquifer,req_acft,req_acre,req_rate,req_storag,app_acft,app_acre,app_rate,app_storag,pod_status,source,irrigation,source_nam,mainstem,impound_lo,impound_na,return_des,discharge_,period_sta,period_end,return_qua,held_acft,held_acre,held_rate,held_stora,longitude,latitude
0,ndwr0,,,1,4407,1,13007302B,"KETTERLING, ROLAND & LORRAINE",3/4/1991,Irrigation,Denied,,,1111-11-11,McIntosh,Beaver,,,204.0,135.2,1000.0,0.0,0.0,0.0,0.0,0.0,Denied,Ground Water,Sprinkler,,0,,,,,,,0,0.0,0.0,0.0,0.0,-99.78988,46.1113


In [4]:
# build owner name: join the first two comma-separated parts of permit_hol with a space (order is kept)
import re

def createOwnerName(val):
    """Flatten a comma-separated holder name into a single space-separated string.

    Parameters
    ----------
    val : any
        Raw holder name. Empty string or null yields "".

    Returns
    -------
    str
        The stripped text when it holds no comma; otherwise the first and
        second comma-separated pieces (each stripped) joined by one space.
        Pieces after the second are not included.
    """
    if val == "" or pd.isnull(val):
        return ""

    text = str(val).strip()
    if "," not in text:
        return text

    pieces = text.split(",")
    return pieces[0].strip() + " " + pieces[1].strip()
# Element-wise over the holder-name column: one owner string per input row.
dfinPOD['in_AllocationOwner'] = dfinPOD['permit_hol'].apply(createOwnerName)

def cleanOwnerDataFunc(Val):
    """Strip special characters from an owner name and title-case it.

    Removes every occurrence of ``$ @ & . ; , / ) ( -``, title-cases the
    result, collapses any run of spaces to a single space and trims the ends.

    Parameters
    ----------
    Val : any
        Owner name; cast to ``str`` first so non-string cells do not raise.

    Returns
    -------
    str
        The cleaned name.
    """
    Val = str(Val)
    # Raw string: "\)" inside a normal string literal is an invalid escape sequence.
    Val = re.sub(r"[$@&.;,/\)(-]", "", Val).title()
    # Removing characters can leave runs of 3+ spaces; a single
    # .replace("  ", " ") pass would still leave a double space behind.
    Val = re.sub(r" {2,}", " ", Val).strip()
    return Val

# Clean every owner string in place, then list the distinct results for a visual check.
dfinPOD['in_AllocationOwner'] = dfinPOD['in_AllocationOwner'].apply(cleanOwnerDataFunc)
dfinPOD['in_AllocationOwner'].unique()

array(['Ketterling Roland Lorraine', 'Hyde George H', 'Senerius Fred',
       ..., 'Iglehart Doug', 'Broten Eric', 'New Rockford City Of'],
      dtype=object)

In [5]:
# Full permit URL per row: fixed prefix followed by the permit_ind value rendered as text.
dfinPOD['in_WaterAllocationNativeURL'] = dfinPOD['permit_ind'].astype(str).map(
    "https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id={}".format
)
dfinPOD['in_WaterAllocationNativeURL'].unique()

array(['https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=1',
       'https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=2',
       'https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=11',
       ...,
       'https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=5909',
       'https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=5910',
       'https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=5911'],
      dtype=object)

In [6]:
# create output POD dataframe
# Columns are added one at a time; the order of assignment is the output column order.
df = pd.DataFrame()

# Data Assessment UUID
# Must be the first column: assigning a Series to the empty frame gives it dfinPOD's index,
# which the constant-valued columns below are then broadcast across.
df['WaDEUUID'] = dfinPOD['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "NDwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "NDwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "NDwr_O1"

# Water Source
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df["in_WaterSourceName"] = dfinPOD['source_nam']
df['in_WaterSourceNativeID'] = ""  # filled in later from name + type
df["in_WaterSourceTypeCV"] = dfinPOD['source']

# Site
# (in_Geometry is already set in the Water Source section above)
df["in_CoordinateAccuracy"] = ""
df["in_CoordinateMethodCV"] = "Centroid of Area"
df['in_County'] = dfinPOD['county']
df['in_EPSGCodeCV'] = 4326
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df["in_Latitude"] = dfinPOD['latitude']
df["in_Longitude"] = dfinPOD['longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df["in_PODorPOUSite"] = "POD"
df["in_SiteName"] = ""
# Blank / missing pod values become 0, so the id reads "POD0" for those rows.
df["in_SiteNativeID"] = "POD" + dfinPOD['pod'].replace("", 0).fillna(0).astype(str)
df['in_SitePoint'] = ""
df["in_SiteTypeCV"] = ""
df["in_StateCV"] = "ND"
df['in_USGSSiteID'] = ""

# Allocation
df["in_AllocationApplicationDate"] = dfinPOD['date_issue']
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df["in_AllocationExpirationDate"] = dfinPOD['date_cance']
# NOTE(review): app_rate is copied unchanged into a CFS field -- confirm the source unit.
df["in_AllocationFlow_CFS"] = dfinPOD['app_rate'].astype(float)
df['in_AllocationLegalStatusCV'] = dfinPOD['pod_status'].astype(str)
df["in_AllocationNativeID"] = dfinPOD['permit_num'].astype(str)
df['in_AllocationOwner'] = dfinPOD['in_AllocationOwner']
df['in_AllocationPriorityDate'] = dfinPOD['priority_d']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = dfinPOD['period_end']
df['in_AllocationTimeframeStart'] = dfinPOD['period_sta']
df['in_AllocationTypeCV'] = ""
df["in_AllocationVolume_AF"] = dfinPOD['app_acft'].astype(float)
df["in_BeneficialUseCategory"] = dfinPOD['use_type']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = dfinPOD['app_acre'].astype(float)
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
# dfinPOD['in_WaterAllocationNativeURL'] already holds the complete URL (prefix + permit_ind);
# prepending the prefix again here would produce "…?id=https://…?id=<n>".
df['in_WaterAllocationNativeURL'] = dfinPOD['in_WaterAllocationNativeURL']

# Work on a de-duplicated copy with a fresh 0..n-1 index.
outPOD = df.copy()
outPOD = outPOD.drop_duplicates().reset_index(drop=True)
print(len(outPOD))
outPOD.head()

12181


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,ndwr0,NDwr_M1,NDwr_V1,NDwr_O1,,,,,,Ground Water,,Centroid of Area,McIntosh,4326,,,,46.1113,-99.78988,,,POD,,POD13007302B,,,ND,,,,,,,,,,0.0,Denied,4407,Ketterling Roland Lorraine,3/4/1991,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...
1,ndwr1,NDwr_M1,NDwr_V1,NDwr_O1,,,,,,Surface Water,,Centroid of Area,McKenzie,4326,,,,48.02622,-103.75216,,,POD,,POD15310236CC,,,ND,,,,,,,,,,0.0,Cancelled,1E,Hyde George H,8/15/1901,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...
2,ndwr10,NDwr_M1,NDwr_V1,NDwr_O1,,,,,,Surface Water,,Centroid of Area,Mercer,4326,,,,47.28288,-101.86756,,,POD,,POD14408820BB,,,ND,,,,,,,,,,0.0,Cancelled,8E,Senerius Fred,12/22/1937,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...
3,ndwr100,NDwr_M1,NDwr_V1,NDwr_O1,,,,,,Surface Water,,Centroid of Area,Ward,4326,,,,48.33611,-101.46082,,,POD,,POD15608415BD,,,ND,,6/9/1915,,,,,,,,46.1,InActive,91C,Kuda Alan P And Hugheskuda,6/9/1915,,,,,17.0,Irrigation,,,,,,0,,8.5,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...
4,ndwr1000,NDwr_M1,NDwr_V1,NDwr_O1,,,,,,Surface Water,,Centroid of Area,Williams,4326,,,,48.52694,-103.71934,,,POD,,POD15810110AC,,,ND,,,,,,,,,2/9/1994,0.0,Cancelled,902,Grev Henry,4/10/1961,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...


## Concatenate and Clean Data

In [7]:
# Concatenate dataframes
frames = [outPOD] # just POD for now

# Stack the frames, drop exact duplicate rows, renumber the index, blank out NaN.
outdf = (
    pd.concat(frames)
    .drop_duplicates()
    .reset_index(drop=True)
    .replace(np.nan, "")
)
print(len(outdf))

12181


## WaDE Custom Elements (due to missing state info)

In [8]:
# Clean owner name up
def removeSpecialCharsFunc(Val):
    """Strip special characters from a text value and title-case it.

    Removes every occurrence of ``$ @ & . ; , / ) ( -``, title-cases the
    result, collapses any run of spaces to a single space and trims the ends.

    Parameters
    ----------
    Val : any
        Value to clean; cast to ``str`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    Val = str(Val)
    # Raw string: "\)" inside a normal string literal is an invalid escape sequence.
    Val = re.sub(r"[$@&.;,/\)(-]", "", Val).title()
    # Removing characters can leave runs of 3+ spaces; a single
    # .replace("  ", " ") pass would still leave a double space behind.
    Val = re.sub(r" {2,}", " ", Val).strip()
    return Val

In [9]:
# Owner names: remove special characters / title-case, then show the distinct values.
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].apply(removeSpecialCharsFunc)
outdf['in_AllocationOwner'].unique()

array(['Ketterling Roland Lorraine', 'Hyde George H', 'Senerius Fred',
       ..., 'Iglehart Doug', 'Broten Eric', 'New Rockford City Of'],
      dtype=object)

In [10]:
# Site names: remove special characters / title-case, then show the distinct values.
outdf['in_SiteName'] = outdf['in_SiteName'].apply(removeSpecialCharsFunc)
outdf['in_SiteName'].unique()

array([''], dtype=object)

In [11]:
# Water source names: remove special characters / title-case, then show the distinct values.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].apply(removeSpecialCharsFunc)
outdf['in_WaterSourceName'].unique()

array(['', 'Unnamed Stream Trib Of Little Muddy', 'Missouri River',
       'Goschke Dam On Tongue River', 'Goshke Dam On Busse Coulee',
       'Goschke Dam On Busse Coulee', 'Unnamed Tributary',
       'Unamed Coulee Trib Of Lower Des Lacs', 'Souris River',
       'Flat Creek Trib Of N Fork Grand River', 'Cannonball River',
       'Chimney Butte Creek', 'Unnamed Coulee Trib Of The Heart River',
       'Unnamed Coulee A Trib Of T Heart River',
       'Unnamed Creek Trib Of Wolf Creek', 'Lake Sakakawea',
       'Forest River', "Gibb'S Spring Trib Of Little Knife",
       'White Earth Creek', 'Spring Creek Trib Of Knife River',
       'Willow Creek', 'Little Bull Creek',
       'Unnamed Tributary Of North Branch Park',
       'Unnamed Creek Tribof Stony Slough', 'Sheyene River',
       'Unnamed Waterway Trib Of Des Lacs River', 'Goschke Dam',
       'Unnamed Creek Trib Of The Tongue River',
       'Bussee Coulee Trib Of The Tongue River',
       'Warwick Dam On Sheynne River',
       'Unn

In [12]:
# County names: remove special characters / title-case, then show the distinct values.
outdf['in_County'] = outdf['in_County'].apply(removeSpecialCharsFunc)
outdf['in_County'].unique()

array(['Mcintosh', 'Mckenzie', 'Mercer', 'Ward', 'Williams', 'Oliver',
       'Renville', 'Dickey', 'Sargent', 'Dunn', 'Grand Forks', 'Mchenry',
       'Logan', 'Grant', 'Pembina', 'Mclean', 'Kidder', 'Bowman',
       'Stutsman', 'Adams', 'Lamoure', 'Sioux', 'Morton', 'Mountrail',
       'Griggs', 'Burleigh', 'Ransom', 'Bottineau', 'Steele', 'Billings',
       'Divide', 'Burke', 'Benson', 'Cavalier', 'Wells', 'Emmons',
       'Stark', 'Nelson', 'Cass', 'Eddy', 'Barnes', 'Towner', 'Walsh',
       'Foster', 'Richland', 'Slope', 'Golden Valley', 'Traill',
       'Hettinger', 'Rolette', 'Ramsey', 'Sheridan', 'Pierce'],
      dtype=object)

In [13]:
def ensureEmptyString(val):
    """Return ``val`` as stripped text, mapping every "missing" form to "".

    Parameters
    ----------
    val : any
        Cell value to normalise.

    Returns
    -------
    str
        "" when ``val`` is None / NaN / NaT / pd.NA, is blank or
        whitespace-only, or is the literal text "nan"; otherwise
        ``str(val)`` with surrounding whitespace removed.
    """
    # The null test has to run on the raw value: once converted with str(),
    # None / NaT / pd.NA become the non-null texts "None" / "NaT" / "<NA>".
    # is_scalar guards pd.isnull, which returns an array for list-like input.
    if val is None or (pd.api.types.is_scalar(val) and pd.isnull(val)):
        return ""

    outString = str(val).strip()
    # "nan" is what str() yields for a float NaN that was stringified earlier.
    if outString == "" or outString == "nan":
        return ""
    return outString

In [14]:
# Water source names: normalise blanks to "", then show the distinct values.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].apply(ensureEmptyString)
outdf['in_WaterSourceName'].unique()

array(['', 'Unnamed Stream Trib Of Little Muddy', 'Missouri River',
       'Goschke Dam On Tongue River', 'Goshke Dam On Busse Coulee',
       'Goschke Dam On Busse Coulee', 'Unnamed Tributary',
       'Unamed Coulee Trib Of Lower Des Lacs', 'Souris River',
       'Flat Creek Trib Of N Fork Grand River', 'Cannonball River',
       'Chimney Butte Creek', 'Unnamed Coulee Trib Of The Heart River',
       'Unnamed Coulee A Trib Of T Heart River',
       'Unnamed Creek Trib Of Wolf Creek', 'Lake Sakakawea',
       'Forest River', "Gibb'S Spring Trib Of Little Knife",
       'White Earth Creek', 'Spring Creek Trib Of Knife River',
       'Willow Creek', 'Little Bull Creek',
       'Unnamed Tributary Of North Branch Park',
       'Unnamed Creek Tribof Stony Slough', 'Sheyene River',
       'Unnamed Waterway Trib Of Des Lacs River', 'Goschke Dam',
       'Unnamed Creek Trib Of The Tongue River',
       'Bussee Coulee Trib Of The Tongue River',
       'Warwick Dam On Sheynne River',
       'Unn

In [15]:
# Water source types: normalise blanks to "", then show the distinct values.
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].apply(ensureEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

array(['Ground Water', 'Surface Water', ''], dtype=object)

In [16]:
# Counties: normalise blanks to "", then show the distinct values.
outdf['in_County'] = outdf['in_County'].apply(ensureEmptyString)
outdf['in_County'].unique()

array(['Mcintosh', 'Mckenzie', 'Mercer', 'Ward', 'Williams', 'Oliver',
       'Renville', 'Dickey', 'Sargent', 'Dunn', 'Grand Forks', 'Mchenry',
       'Logan', 'Grant', 'Pembina', 'Mclean', 'Kidder', 'Bowman',
       'Stutsman', 'Adams', 'Lamoure', 'Sioux', 'Morton', 'Mountrail',
       'Griggs', 'Burleigh', 'Ransom', 'Bottineau', 'Steele', 'Billings',
       'Divide', 'Burke', 'Benson', 'Cavalier', 'Wells', 'Emmons',
       'Stark', 'Nelson', 'Cass', 'Eddy', 'Barnes', 'Towner', 'Walsh',
       'Foster', 'Richland', 'Slope', 'Golden Valley', 'Traill',
       'Hettinger', 'Rolette', 'Ramsey', 'Sheridan', 'Pierce'],
      dtype=object)

In [17]:
# Site types: normalise blanks to "", then show the distinct values.
outdf['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].apply(ensureEmptyString)
outdf['in_SiteTypeCV'].unique()

array([''], dtype=object)

In [18]:
# Legal statuses: normalise blanks to "", then show the distinct values.
outdf['in_AllocationLegalStatusCV'] = outdf['in_AllocationLegalStatusCV'].apply(ensureEmptyString)
outdf['in_AllocationLegalStatusCV'].unique()

array(['Denied', 'Cancelled', 'InActive', 'Active', 'Deferred', 'Void',
       'Held In Abeyance', ''], dtype=object)

In [19]:
# Beneficial uses: normalise blanks to "", then show the distinct values.
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].apply(ensureEmptyString)
outdf['in_BeneficialUseCategory'].unique()

array(['Irrigation', 'Industrial', 'Rural Water', 'Fish and Wildlife',
       'Recreation', 'Stock', 'Municipal', 'Multiple Use',
       'Flood Control', 'Undefined', 'Power Generation', 'Domestic'],
      dtype=object)

In [20]:
# in_Latitude
# Coerce to numbers (unparseable -> NaN), then blank out the NaN entries.
outdf['in_Latitude'] = (
    pd.to_numeric(outdf['in_Latitude'], errors='coerce')
    .fillna("")
)
outdf['in_Latitude'].unique()

array([46.1113  , 48.02622 , 47.28288 , ..., 48.520211, 48.520201,
       48.512898])

In [21]:
# in_Longitude
# Coerce to numbers (unparseable -> NaN), then blank out the NaN entries.
outdf['in_Longitude'] = (
    pd.to_numeric(outdf['in_Longitude'], errors='coerce')
    .fillna("")
)
outdf['in_Longitude'].unique()

array([ -99.78988 , -103.75216 , -101.86756 , ..., -101.184898,
       -101.174198, -101.163418])

In [22]:
# Changing datatype of date fields to fit WaDE.
# Same two-step treatment for each column: parse (bad values -> NaT), then
# round-trip through a '%m/%d/%Y' string and parse again.
for _dateCol in (
    'in_AllocationApplicationDate',
    'in_AllocationExpirationDate',
    'in_AllocationPriorityDate',
):
    outdf[_dateCol] = pd.to_datetime(outdf[_dateCol], errors = 'coerce')
    outdf[_dateCol] = pd.to_datetime(outdf[_dateCol].dt.strftime('%m/%d/%Y'))
outdf.head()

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,ndwr0,NDwr_M1,NDwr_V1,NDwr_O1,,,,,,Ground Water,,Centroid of Area,Mcintosh,4326,,,,46.1113,-99.78988,,,POD,,POD13007302B,,,ND,,NaT,,,,,,,NaT,0.0,Denied,4407,Ketterling Roland Lorraine,1991-03-04,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...
1,ndwr1,NDwr_M1,NDwr_V1,NDwr_O1,,,,,,Surface Water,,Centroid of Area,Mckenzie,4326,,,,48.02622,-103.75216,,,POD,,POD15310236CC,,,ND,,NaT,,,,,,,NaT,0.0,Cancelled,1E,Hyde George H,1901-08-15,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...
2,ndwr10,NDwr_M1,NDwr_V1,NDwr_O1,,,,,,Surface Water,,Centroid of Area,Mercer,4326,,,,47.28288,-101.86756,,,POD,,POD14408820BB,,,ND,,NaT,,,,,,,NaT,0.0,Cancelled,8E,Senerius Fred,1937-12-22,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...
3,ndwr100,NDwr_M1,NDwr_V1,NDwr_O1,,,,,,Surface Water,,Centroid of Area,Ward,4326,,,,48.33611,-101.46082,,,POD,,POD15608415BD,,,ND,,1915-06-09,,,,,,,NaT,46.1,InActive,91C,Kuda Alan P And Hugheskuda,1915-06-09,,,,,17.0,Irrigation,,,,,,0,,8.5,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...
4,ndwr1000,NDwr_M1,NDwr_V1,NDwr_O1,,,,,,Surface Water,,Centroid of Area,Williams,4326,,,,48.52694,-103.71934,,,POD,,POD15810110AC,,,ND,,NaT,,,,,,,1994-02-09,0.0,Cancelled,902,Grev Henry,1961-04-10,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...


In [23]:
# Parse the period end (bad values -> NaT) and keep only month/day as 'MM/DD' text.
outdf['in_AllocationTimeframeEnd'] = (
    pd.to_datetime(outdf['in_AllocationTimeframeEnd'], errors = 'coerce')
    .dt.strftime('%m/%d')
)
outdf['in_AllocationTimeframeEnd'].unique()

array([nan, '10/15', '06/15', '04/01', '05/15', '05/01'], dtype=object)

In [24]:
# Parse the period start (bad values -> NaT) and keep only month/day as 'MM/DD' text.
outdf['in_AllocationTimeframeStart'] = (
    pd.to_datetime(outdf['in_AllocationTimeframeStart'], errors = 'coerce')
    .dt.strftime('%m/%d')
)
outdf['in_AllocationTimeframeStart'].unique()

array([nan, '04/01'], dtype=object)

In [25]:
# Fixing in_AllocationFlow_CFS datatype
# Numeric coercion first; zeros and unparseable values both end up as "".
outdf['in_AllocationFlow_CFS'] = (
    pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce')
    .replace(0,"")
    .fillna("")
)
outdf['in_AllocationFlow_CFS'].unique()

array(['', 46.1, 6500.0, 175.0, 300.0, 87.5, 800.0, 1174.0, 1671.0, 20.0,
       2445.0, 850.0, 400.0, 45.0, 700.0, 1600.0, 1050.0, 350.0, 2450.0,
       900.0, 1000.0, 8010.0, 200.0, 476.0, 90.0, 375.0, 15.0, 6.0,
       5000.0, 13464.0, 425.0, 3500.0, 950.0, 438.0, 250.0, 360.0, 1850.0,
       1500.0, 500.0, 1211.7, 1240.0, 189.0, 2500.0, 519.0, 50.0, 875.0,
       73.0, 100.0, 750.0, 58.3, 1200.0, 825.0, 12.0, 4450.0, 1950.0,
       1585.0, 650.0, 450.0, 2250.0, 1347.0, 1346.0, 13460.0, 952.0,
       215.0, 35.0, 60.0, 2000.0, 10501.9, 3.0, 655.0, 4937.13, 1962.0,
       685.0, 600.0, 6732.0, 930.0, 1900.0, 11220.0, 150.0, 285.0, 8976.0,
       40.0, 12600.0, 3600.0, 623.0, 1300.0, 831.7, 772.9, 4500.0, 3000.0,
       320.0, 4970.0, 133.0, 448.8, 595.0, 180.0, 2400.0, 491.0, 1800.0,
       2900.0, 78.0, 1680.0, 2700.0, 675.0, 1350.0, 1750.0, 1100.0,
       4306.0, 810.0, 803.0, 4000.0, 583.0, 5834.4, 125.0, 580.0, 2815.0,
       3062.0, 2692.8, 2600.0, 2200.0, 12399.0, 670.0, 120.0,

In [26]:
# Fixing in_AllocationVolume_AF datatype
# Numeric coercion first; zeros and unparseable values both end up as "".
outdf['in_AllocationVolume_AF'] = (
    pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce')
    .replace(0,"")
    .fillna("")
)
outdf['in_AllocationVolume_AF'].unique()

array(['', 17.0, 1858.0, ..., 4480.0, 193.05, 871.0], dtype=object)

In [27]:
# Fixing in_IrrigatedAcreage datatype
# Numeric coercion first; zeros and unparseable values both end up as "".
outdf['in_IrrigatedAcreage'] = (
    pd.to_numeric(outdf['in_IrrigatedAcreage'], errors='coerce')
    .replace(0,"")
    .fillna("")
)
outdf['in_IrrigatedAcreage'].unique()

array(['', 8.5, 929.0, ..., 142.2, 258.3, 350.1], dtype=object)

In [28]:
%%time

# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Build a custom water source id: the text "wadeID" followed by ``str(colrowValue)``."""
    return "wadeID" + str(colrowValue)

# One row per distinct (water source name, water source type) pair found in outdf.
dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = outdf['in_WaterSourceName']
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

# Running counter 1..n on the same index as the de-duplicated frame, so the
# generated ids line up row-for-row when assigned back below.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)
# Lookup key: name text directly followed by type text (no separator).
dfWaterSourceNativeID['linkKey'] = dfWaterSourceNativeID['in_WaterSourceName'].astype(str) + dfWaterSourceNativeID['in_WaterSourceTypeCV'].astype(str)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
# Plain dict mapping linkKey -> generated id, used by the lookup function below.
WaterSourceNativeIDdict = pd.Series(dfWaterSourceNativeID.in_WaterSourceNativeID.values, index=dfWaterSourceNativeID.linkKey.astype(str)).to_dict()
def retrieveWaterSourceNativeID(A, B):
    """Look up the custom water source id for a (name, type) pair.

    Parameters
    ----------
    A : any
        Water source name.
    B : any
        Water source type.

    Returns
    -------
    str
        '' when both inputs are empty strings or both are null; otherwise
        the id stored in ``WaterSourceNativeIDdict`` under the key
        ``str(A).strip() + str(B).strip()``, or '' when that key is absent.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''

    colrowValue = str(A).strip() + str(B).strip()
    # dict.get covers the "key not present" case only; the previous bare
    # `except:` also hid unrelated failures (e.g. the dict never being built).
    return WaterSourceNativeIDdict.get(colrowValue, '')

# Pair each row's source name with its source type and look up the generated id.
outdf['in_WaterSourceNativeID'] = [
    retrieveWaterSourceNativeID(srcName, srcType)
    for srcName, srcType in zip(outdf['in_WaterSourceName'], outdf['in_WaterSourceTypeCV'])
]
outdf['in_WaterSourceNativeID'].unique()

Wall time: 144 ms


array(['wadeID1', 'wadeID2', 'wadeID3', 'wadeID4', 'wadeID5', 'wadeID6',
       'wadeID7', 'wadeID8', 'wadeID9', 'wadeID10', 'wadeID11',
       'wadeID12', 'wadeID13', 'wadeID14', 'wadeID15', 'wadeID16',
       'wadeID17', 'wadeID18', 'wadeID19', 'wadeID20', 'wadeID21',
       'wadeID22', 'wadeID23', 'wadeID24', 'wadeID25', 'wadeID26',
       'wadeID27', 'wadeID28', 'wadeID29', 'wadeID30', 'wadeID31',
       'wadeID32', 'wadeID33', 'wadeID34', 'wadeID35', 'wadeID36',
       'wadeID37', 'wadeID38', 'wadeID39', 'wadeID40', 'wadeID41',
       'wadeID42', 'wadeID43', 'wadeID44', 'wadeID45', 'wadeID46',
       'wadeID47', 'wadeID48', 'wadeID49', 'wadeID50', 'wadeID51',
       'wadeID52', 'wadeID53', 'wadeID54', 'wadeID55', 'wadeID56',
       'wadeID57', 'wadeID58', 'wadeID59', 'wadeID60', 'wadeID61',
       'wadeID62', 'wadeID63', 'wadeID64', 'wadeID65', 'wadeID66',
       'wadeID67', 'wadeID68', 'wadeID69', 'wadeID70', 'wadeID71',
       'wadeID72', 'wadeID73', 'wadeID74', 'wadeID75', 'wad

## Drop non-Active AllocationLegalStatusCV Water Rights
- For ND, we don't want water rights that are considered: Application In Processing, Cancelled, Deferred, Denied, Held In Abeyance, InActive, Pending Review, Under Review, Void

In [29]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# statuses to exclude
dropLegalStatusList = ["Application In Processing", "Cancelled", "Deferred", "Denied", "Held In Abeyance", "InActive", "Pending Review", "Under Review", "Void"]

# keep only rows whose status is NOT in the list, then renumber the index
outdf = outdf[~outdf['in_AllocationLegalStatusCV'].isin(dropLegalStatusList)].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

5562


array(['Active', ''], dtype=object)

# Export the Output 

In [30]:
#technique to check datatype of long dataframes.
# Temporarily lift the row/column display limits so every column's dtype is printed;
# the limits revert when the with-block exits.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(outdf.dtypes)

WaDEUUID                                                object
in_MethodUUID                                           object
in_VariableSpecificUUID                                 object
in_OrganizationUUID                                     object
in_Geometry                                             object
in_GNISFeatureNameCV                                    object
in_WaterQualityIndicatorCV                              object
in_WaterSourceName                                      object
in_WaterSourceNativeID                                  object
in_WaterSourceTypeCV                                    object
in_CoordinateAccuracy                                   object
in_CoordinateMethodCV                                   object
in_County                                               object
in_EPSGCodeCV                                            int64
in_GNISCodeCV                                           object
in_HUC12                                               

In [31]:
# Export the output dataframe
# Writes Pwr_ndMain.zip (containing Pwr_ndMain.csv) into the current working directory, without the index column.
outdf.to_csv('Pwr_ndMain.zip', compression=dict(method='zip', archive_name='Pwr_ndMain.csv'), index=False)   # The output, save as a zip
# Geometry export is disabled; dfPoUshape is not defined anywhere in the visible cells.
#dfPoUshape.to_csv('P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.