# Pre-processing North Dakota Allocation data for WaDEQA upload.
Date Updated: 03/28/2023
Purpose:  To pre-process the ND data into one master file for simple DataFrame creation and extraction

In [1]:
#Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x) # suppress scientific notation in Pandas

In [2]:
# Working Directory
# NOTE(review): absolute, machine-specific path. After the chdir, relative file
# names passed to pandas readers/writers in this notebook resolve against it.
workingDir = "G:/Shared drives/WaDE Data/NorthDakota/WaterAllocation/RawInputData"
os.chdir(workingDir)

## POD Data: Permit

In [3]:
# Input File
# Missing values are replaced with empty strings right after the read.
dfinPOD = pd.read_csv("Permits_input.zip", encoding = "ISO-8859-1", compression='zip').replace(np.nan, "")

# WaDE UUID tracker for data assessment
# Only generated when the input does not already carry a WaDEUUID column;
# each ID is "ndwr" followed by the row's index label.
if 'WaDEUUID' not in dfinPOD:
    # The index must come from dfinPOD itself: `df` is not defined until a
    # later cell, so referencing it here fails on a fresh kernel.
    dfinPOD['WaDEUUID'] = "ndwr" + dfinPOD.index.astype(str)
    # NOTE(review): this writes Permits_input.csv while the read above uses
    # Permits_input.zip - confirm the zip is refreshed from this csv elsewhere.
    dfinPOD.to_csv('Permits_input.csv', index=False)

print(len(dfinPOD))
dfinPOD.head(1)

12181


Unnamed: 0,WaDEUUID,ReasonRemoved,IncompleteField,permit_ind,permit_num,pod_index,pod,permit_hol,priority_d,use_type,status,date_issue,date_cance,beneficial,county,hu_sub_bas,aquifer,subaquifer,req_acft,req_acre,req_rate,req_storag,app_acft,app_acre,app_rate,app_storag,pod_status,source,irrigation,source_nam,mainstem,impound_lo,impound_na,return_des,discharge_,period_sta,period_end,return_qua,held_acft,held_acre,held_rate,held_stora,longitude,latitude
0,ndwr0,,,1,4407,1,13007302B,"KETTERLING, ROLAND & LORRAINE",3/4/1991,Irrigation,Denied,,,1111-11-11,McIntosh,Beaver,,,204.0,135.2,1000.0,0.0,0.0,0.0,0.0,0.0,Denied,Ground Water,Sprinkler,,0,,,,,,,0,0.0,0.0,0.0,0.0,-99.78988,46.1113


In [4]:
# swapping order owner name
import re

def createOwnerName(val):
    """Build a comma-free owner name from a raw permit holder value.

    Empty strings and null values yield "". Otherwise the value is converted
    to a string and trimmed. If it contains a comma, the first two
    comma-separated parts are trimmed and joined with a single space in their
    original order; any further parts are discarded. Without a comma the
    trimmed string is returned unchanged.
    """
    if val == "" or pd.isnull(val):
        return ""

    cleaned = str(val).strip()
    if "," not in cleaned:
        return cleaned

    pieces = cleaned.split(",")
    return " ".join((pieces[0].strip(), pieces[1].strip()))
# Derive the owner name for every permit row from its 'permit_hol' value
dfinPOD['in_AllocationOwner'] = dfinPOD['permit_hol'].apply(createOwnerName)

def cleanOwnerDataFunc(Val):
    """Strip punctuation from an owner name.

    Removes every occurrence of the characters $ @ & . ; , / ) ( - and then
    trims leading and trailing whitespace. Interior whitespace is untouched,
    so removing a character that sat between two spaces leaves a double space.

    Parameters
    ----------
    Val : str
        Owner name to clean.

    Returns
    -------
    str
        The cleaned owner name.
    """
    # Raw string: the previous non-raw literal contained "\)", which is an
    # invalid string escape. The compiled pattern is unchanged.
    Val = re.sub(r"[$@&.;,/\)(-]", "", Val).strip()
    return Val
# Run the punctuation clean-up over every owner name, then list the distinct results
dfinPOD['in_AllocationOwner'] = dfinPOD['in_AllocationOwner'].apply(cleanOwnerDataFunc)
dfinPOD['in_AllocationOwner'].unique()

array(['KETTERLING ROLAND  LORRAINE', 'HYDE GEORGE H', 'SENERIUS FRED',
       ..., 'IGLEHART DOUG', 'BROTEN ERIC', 'NEW ROCKFORD CITY OF'],
      dtype=object)

In [5]:
# Full native URL per permit: fixed prefix followed by the permit_ind value as text
permitIndText = dfinPOD['permit_ind'].astype(str)
dfinPOD['in_WaterAllocationNativeURL'] = "https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=" + permitIndText
dfinPOD['in_WaterAllocationNativeURL'].unique()

array(['https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=1',
       'https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=2',
       'https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=11',
       ...,
       'https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=5909',
       'https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=5910',
       'https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=5911'],
      dtype=object)

In [6]:
# create output POD dataframe
# Columns are filled one at a time from dfinPOD (or with constants). The first
# assignment gives df its index, so the constant columns that follow are
# broadcast to every row.
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfinPOD['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "NDwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "NDwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "NDwr_O1"

# Water Source
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df["in_WaterSourceName"] = dfinPOD['source_nam']
df['in_WaterSourceNativeID'] = ""
df["in_WaterSourceTypeCV"] = dfinPOD['source']

# Site
df["in_CoordinateAccuracy"] = "WaDE Unspecified"
df["in_CoordinateMethodCV"] = "Centroid of Area"
df['in_County'] = dfinPOD['county']
df['in_EPSGCodeCV'] = 4326
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df["in_Latitude"] = dfinPOD['latitude']
df["in_Longitude"] = dfinPOD['longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df["in_PODorPOUSite"] = "POD"
df["in_SiteName"] = "WaDE Unspecified"
# Blank or missing pod values become 0 before the "POD" prefix is attached.
df["in_SiteNativeID"] = "POD" + dfinPOD['pod'].replace("", 0).fillna(0).astype(str)
df['in_SitePoint'] = ""
df["in_SiteTypeCV"] = "WaDE Unspecified"
df["in_StateCV"] = "ND"
df['in_USGSSiteID'] = ""

# Allocation
df["in_AllocationApplicationDate"] = dfinPOD['date_issue']
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df["in_AllocationExpirationDate"] = dfinPOD['date_cance']
# NOTE(review): app_rate is copied without any unit conversion - confirm the
# source rate is already expressed in CFS.
df["in_AllocationFlow_CFS"] = dfinPOD['app_rate'].astype(float)
df['in_AllocationLegalStatusCV'] = dfinPOD['pod_status'].astype(str)
df["in_AllocationNativeID"] = dfinPOD['permit_num'].astype(str)
df['in_AllocationOwner'] = dfinPOD['in_AllocationOwner']
df['in_AllocationPriorityDate'] = dfinPOD['priority_d']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = dfinPOD['period_end']
df['in_AllocationTimeframeStart'] = dfinPOD['period_sta']
df['in_AllocationTypeCV'] = ""
df["in_AllocationVolume_AF"] = dfinPOD['app_acft'].astype(float)
df["in_BeneficialUseCategory"] = dfinPOD['use_type']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = dfinPOD['app_acre'].astype(float)
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
# dfinPOD['in_WaterAllocationNativeURL'] already holds the complete URL
# (prefix + permit_ind), so it is copied as-is; prepending the prefix again
# would produce a doubled, invalid link.
df['in_WaterAllocationNativeURL'] = dfinPOD['in_WaterAllocationNativeURL']

# Drop fully identical rows and renumber the index from 0.
outPOD = df.copy()
outPOD = outPOD.drop_duplicates().reset_index(drop=True)
print(len(outPOD))
outPOD.head()

12181


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,ndwr0,NDwr_M1,NDwr_V1,NDwr_O1,,,,,,Ground Water,WaDE Unspecified,Centroid of Area,McIntosh,4326,,,,46.1113,-99.78988,,,POD,WaDE Unspecified,POD13007302B,,WaDE Unspecified,ND,,,,,,,,,,0.0,Denied,4407,KETTERLING ROLAND LORRAINE,3/4/1991,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...
1,ndwr1,NDwr_M1,NDwr_V1,NDwr_O1,,,,,,Surface Water,WaDE Unspecified,Centroid of Area,McKenzie,4326,,,,48.02622,-103.75216,,,POD,WaDE Unspecified,POD15310236CC,,WaDE Unspecified,ND,,,,,,,,,,0.0,Cancelled,1E,HYDE GEORGE H,8/15/1901,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...
2,ndwr10,NDwr_M1,NDwr_V1,NDwr_O1,,,,,,Surface Water,WaDE Unspecified,Centroid of Area,Mercer,4326,,,,47.28288,-101.86756,,,POD,WaDE Unspecified,POD14408820BB,,WaDE Unspecified,ND,,,,,,,,,,0.0,Cancelled,8E,SENERIUS FRED,12/22/1937,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...
3,ndwr100,NDwr_M1,NDwr_V1,NDwr_O1,,,,,,Surface Water,WaDE Unspecified,Centroid of Area,Ward,4326,,,,48.33611,-101.46082,,,POD,WaDE Unspecified,POD15608415BD,,WaDE Unspecified,ND,,6/9/1915,,,,,,,,46.1,InActive,91C,KUDA ALAN P and HUGHESKUDA,6/9/1915,,,,,17.0,Irrigation,,,,,,0,,8.5,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...
4,ndwr1000,NDwr_M1,NDwr_V1,NDwr_O1,,,,,,Surface Water,WaDE Unspecified,Centroid of Area,Williams,4326,,,,48.52694,-103.71934,,,POD,WaDE Unspecified,POD15810110AC,,WaDE Unspecified,ND,,,,,,,,,2/9/1994,0.0,Cancelled,902,GREV HENRY,4/10/1961,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...


## Concatenate and Clean Data

In [7]:
# Concatenate dataframes
# Stack the frames, drop identical rows, renumber the index and blank out NaN values.
frames = [outPOD]
outdf = (
    pd.concat(frames)
    .drop_duplicates()
    .reset_index(drop=True)
    .replace(np.nan, "")
)
print(len(outdf))

12181


## WaDE Custom Elements (due to missing state info)

In [8]:
# Fixing empty string names

def fixEmptyString(val):
    """Return "WaDE Unspecified" for blank-like values, else the value unchanged.

    A value counts as blank-like when it equals "", a single space " ", the
    literal text "nan", or is null according to pandas.
    """
    if val in ("", " ", "nan") or pd.isnull(val):
        return "WaDE Unspecified"
    return val

In [9]:
# Fill blank water source names via fixEmptyString, then list the distinct values
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].apply(fixEmptyString)
outdf['in_WaterSourceName'].unique()

array(['WaDE Unspecified', 'Unnamed stream trib. of Little Muddy',
       'Missouri River', 'Goschke Dam on Tongue River',
       'Goshke Dam on Busse Coulee', 'Goschke Dam on Busse Coulee',
       'Unnamed Tributary', 'Unamed Coulee trib. of Lower Des Lacs',
       'Souris River', 'Flat Creek trib. of N. Fork Grand River',
       'Cannonball River', 'Chimney Butte Creek',
       'Unnamed coulee trib. of the Heart River',
       'Unnamed coulee a trib. of t Heart River',
       'Unnamed creek, trib of Wolf Creek', 'Lake Sakakawea',
       'Forest River', "Gibb's Spring, Trib. of Little Knife",
       'White Earth Creek', 'Spring Creek, Trib. of Knife River',
       'Willow Creek', 'Little Bull Creek',
       'Unnamed tributary of North Branch Park',
       'Unnamed creek, trib.of Stony Slough', 'Sheyene River',
       'unnamed waterway, trib of Des Lacs River', 'Goschke Dam',
       'Unnamed creek, trib. of the Tongue River',
       'Bussee Coulee, trib. of the Tongue River',
       'W

In [10]:
# Fill blank water source types via fixEmptyString, then list the distinct values
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].apply(fixEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

array(['Ground Water', 'Surface Water', 'WaDE Unspecified'], dtype=object)

In [11]:
# Fill blank county names via fixEmptyString, then list the distinct values
outdf['in_County'] = outdf['in_County'].apply(fixEmptyString)
outdf['in_County'].unique()

array(['McIntosh', 'McKenzie', 'Mercer', 'Ward', 'Williams', 'Oliver',
       'Renville', 'Dickey', 'Sargent', 'Dunn', 'Grand Forks', 'McHenry',
       'Logan', 'Grant', 'Pembina', 'McLean', 'Kidder', 'Bowman',
       'Stutsman', 'Adams', 'LaMoure', 'Sioux', 'Morton', 'Mountrail',
       'Griggs', 'Burleigh', 'Ransom', 'Bottineau', 'Steele', 'Billings',
       'Divide', 'Burke', 'Benson', 'Cavalier', 'Wells', 'Emmons',
       'Stark', 'Nelson', 'Cass', 'Eddy', 'Barnes', 'Towner', 'Walsh',
       'Foster', 'Richland', 'Slope', 'Golden Valley', 'Traill',
       'Hettinger', 'Rolette', 'Ramsey', 'Sheridan', 'Pierce'],
      dtype=object)

In [12]:
# Fill blank site types via fixEmptyString, then list the distinct values
outdf['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].apply(fixEmptyString)
outdf['in_SiteTypeCV'].unique()

array(['WaDE Unspecified'], dtype=object)

In [13]:
# Fill blank legal statuses via fixEmptyString, then list the distinct values
outdf['in_AllocationLegalStatusCV'] = outdf['in_AllocationLegalStatusCV'].apply(fixEmptyString)
outdf['in_AllocationLegalStatusCV'].unique()

array(['Denied', 'Cancelled', 'InActive', 'Active', 'Deferred', 'Void',
       'Held In Abeyance', 'WaDE Unspecified'], dtype=object)

In [14]:
# Fill blank beneficial use categories via fixEmptyString, then list the distinct values
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].apply(fixEmptyString)
outdf['in_BeneficialUseCategory'].unique()

array(['Irrigation', 'Industrial', 'Rural Water', 'Fish and Wildlife',
       'Recreation', 'Stock', 'Municipal', 'Multiple Use',
       'Flood Control', 'Undefined', 'Power Generation', 'Domestic'],
      dtype=object)

In [15]:
# in_Latitude & in_Longitude
# Coerce both coordinate columns to numbers; values that cannot be parsed become 0.
for coordCol in ['in_Latitude', 'in_Longitude']:
    outdf[coordCol] = pd.to_numeric(outdf[coordCol], errors='coerce').fillna(0)
outdf.head(1)

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,ndwr0,NDwr_M1,NDwr_V1,NDwr_O1,,,,WaDE Unspecified,,Ground Water,WaDE Unspecified,Centroid of Area,McIntosh,4326,,,,46.1113,-99.78988,,,POD,WaDE Unspecified,POD13007302B,,WaDE Unspecified,ND,,,,,,,,,,0.0,Denied,4407,KETTERLING ROLAND LORRAINE,3/4/1991,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...


In [16]:
# Changing datatype of date fields to fit WaDE.
# Each column is parsed to datetimes (unparseable values become NaT), rendered
# as month/day/year text, and parsed once more from that text.
for dateCol in ['in_AllocationApplicationDate',
                'in_AllocationExpirationDate',
                'in_AllocationPriorityDate']:
    parsedDates = pd.to_datetime(outdf[dateCol], errors = 'coerce')
    outdf[dateCol] = pd.to_datetime(parsedDates.dt.strftime('%m/%d/%Y'))
outdf.head()

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,ndwr0,NDwr_M1,NDwr_V1,NDwr_O1,,,,WaDE Unspecified,,Ground Water,WaDE Unspecified,Centroid of Area,McIntosh,4326,,,,46.1113,-99.78988,,,POD,WaDE Unspecified,POD13007302B,,WaDE Unspecified,ND,,NaT,,,,,,,NaT,0.0,Denied,4407,KETTERLING ROLAND LORRAINE,1991-03-04,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...
1,ndwr1,NDwr_M1,NDwr_V1,NDwr_O1,,,,WaDE Unspecified,,Surface Water,WaDE Unspecified,Centroid of Area,McKenzie,4326,,,,48.02622,-103.75216,,,POD,WaDE Unspecified,POD15310236CC,,WaDE Unspecified,ND,,NaT,,,,,,,NaT,0.0,Cancelled,1E,HYDE GEORGE H,1901-08-15,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...
2,ndwr10,NDwr_M1,NDwr_V1,NDwr_O1,,,,WaDE Unspecified,,Surface Water,WaDE Unspecified,Centroid of Area,Mercer,4326,,,,47.28288,-101.86756,,,POD,WaDE Unspecified,POD14408820BB,,WaDE Unspecified,ND,,NaT,,,,,,,NaT,0.0,Cancelled,8E,SENERIUS FRED,1937-12-22,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...
3,ndwr100,NDwr_M1,NDwr_V1,NDwr_O1,,,,WaDE Unspecified,,Surface Water,WaDE Unspecified,Centroid of Area,Ward,4326,,,,48.33611,-101.46082,,,POD,WaDE Unspecified,POD15608415BD,,WaDE Unspecified,ND,,1915-06-09,,,,,,,NaT,46.1,InActive,91C,KUDA ALAN P and HUGHESKUDA,1915-06-09,,,,,17.0,Irrigation,,,,,,0,,8.5,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...
4,ndwr1000,NDwr_M1,NDwr_V1,NDwr_O1,,,,WaDE Unspecified,,Surface Water,WaDE Unspecified,Centroid of Area,Williams,4326,,,,48.52694,-103.71934,,,POD,WaDE Unspecified,POD15810110AC,,WaDE Unspecified,ND,,NaT,,,,,,,1994-02-09,0.0,Cancelled,902,GREV HENRY,1961-04-10,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...


In [17]:
# Keep only the month/day portion of the timeframe end; unparseable values end up missing
timeframeEnd = pd.to_datetime(outdf['in_AllocationTimeframeEnd'], errors = 'coerce')
outdf['in_AllocationTimeframeEnd'] = timeframeEnd.dt.strftime('%m/%d')
outdf['in_AllocationTimeframeEnd'].unique()

array([nan, '10/15', '06/15', '04/01', '05/15', '05/01'], dtype=object)

In [18]:
# Keep only the month/day portion of the timeframe start; unparseable values end up missing
timeframeStart = pd.to_datetime(outdf['in_AllocationTimeframeStart'], errors = 'coerce')
outdf['in_AllocationTimeframeStart'] = timeframeStart.dt.strftime('%m/%d')
outdf['in_AllocationTimeframeStart'].unique()

array([nan, '04/01'], dtype=object)

In [19]:
# Fixing in_AllocationFlow_CFS datatype
# Non-numeric entries are coerced to NaN and then replaced with 0.
flowNumeric = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce')
outdf['in_AllocationFlow_CFS'] = flowNumeric.fillna(0)
outdf['in_AllocationFlow_CFS'].unique()

array([0.000000e+00, 4.610000e+01, 6.500000e+03, 1.750000e+02,
       3.000000e+02, 8.750000e+01, 8.000000e+02, 1.174000e+03,
       1.671000e+03, 2.000000e+01, 2.445000e+03, 8.500000e+02,
       4.000000e+02, 4.500000e+01, 7.000000e+02, 1.600000e+03,
       1.050000e+03, 3.500000e+02, 2.450000e+03, 9.000000e+02,
       1.000000e+03, 8.010000e+03, 2.000000e+02, 4.760000e+02,
       9.000000e+01, 3.750000e+02, 1.500000e+01, 6.000000e+00,
       5.000000e+03, 1.346400e+04, 4.250000e+02, 3.500000e+03,
       9.500000e+02, 4.380000e+02, 2.500000e+02, 3.600000e+02,
       1.850000e+03, 1.500000e+03, 5.000000e+02, 1.211700e+03,
       1.240000e+03, 1.890000e+02, 2.500000e+03, 5.190000e+02,
       5.000000e+01, 8.750000e+02, 7.300000e+01, 1.000000e+02,
       7.500000e+02, 5.830000e+01, 1.200000e+03, 8.250000e+02,
       1.200000e+01, 4.450000e+03, 1.950000e+03, 1.585000e+03,
       6.500000e+02, 4.500000e+02, 2.250000e+03, 1.347000e+03,
       1.346000e+03, 1.346000e+04, 9.520000e+02, 2.1500

In [20]:
# Fixing in_AllocationVolume_AF datatype
# Non-numeric entries are coerced to NaN and then replaced with 0.
volumeNumeric = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce')
outdf['in_AllocationVolume_AF'] = volumeNumeric.fillna(0)
outdf['in_AllocationVolume_AF'].unique()

array([   0.  ,   17.  , 1858.  , ..., 4480.  ,  193.05,  871.  ])

In [21]:
# Fixing in_IrrigatedAcreage datatype
# Non-numeric entries are coerced to NaN and then replaced with 0.
acreageNumeric = pd.to_numeric(outdf['in_IrrigatedAcreage'], errors='coerce')
outdf['in_IrrigatedAcreage'] = acreageNumeric.fillna(0)
outdf['in_IrrigatedAcreage'].unique()

array([  0. ,   8.5, 929. , ..., 142.2, 258.3, 350.1])

In [22]:
%%time

# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source ID: "wadeID" followed by the value as text."""
    return "wadeID" + str(colrowValue)

# One row per distinct (water source name, water source type) pair found in outdf
dfWaterSourceNativeID = pd.DataFrame({
    'in_WaterSourceName': outdf['in_WaterSourceName'],
    'in_WaterSourceTypeCV': outdf['in_WaterSourceTypeCV'],
}).drop_duplicates()

# Number the distinct pairs 1..N in row order and turn each number into an ID
dftemp = pd.DataFrame(
    {"Count": range(1, len(dfWaterSourceNativeID.index) + 1)},
    index=dfWaterSourceNativeID.index,
)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp['Count'].apply(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A, B):
    """Look up the custom water source ID for a (name, type) pair.

    A is compared against 'in_WaterSourceName' and B against
    'in_WaterSourceTypeCV' in the module-level dfWaterSourceNativeID table.
    Returns the 'in_WaterSourceNativeID' of the first matching row, or '' when
    both inputs are empty strings, both are null, or no row matches.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''

    nameMatches = dfWaterSourceNativeID['in_WaterSourceName'] == A
    typeMatches = dfWaterSourceNativeID['in_WaterSourceTypeCV'] == B
    hits = dfWaterSourceNativeID.loc[nameMatches & typeMatches, 'in_WaterSourceNativeID']
    return '' if hits.empty else hits.iloc[0]

# Look up the custom ID for each row's (water source name, water source type) pair
outdf['in_WaterSourceNativeID'] = [
    retrieveWaterSourceNativeID(sourceName, sourceType)
    for sourceName, sourceType in zip(outdf['in_WaterSourceName'], outdf['in_WaterSourceTypeCV'])
]
outdf['in_WaterSourceNativeID'].unique()

Wall time: 4.79 s


array(['wadeID1', 'wadeID2', 'wadeID3', 'wadeID4', 'wadeID5', 'wadeID6',
       'wadeID7', 'wadeID8', 'wadeID9', 'wadeID10', 'wadeID11',
       'wadeID12', 'wadeID13', 'wadeID14', 'wadeID15', 'wadeID16',
       'wadeID17', 'wadeID18', 'wadeID19', 'wadeID20', 'wadeID21',
       'wadeID22', 'wadeID23', 'wadeID24', 'wadeID25', 'wadeID26',
       'wadeID27', 'wadeID28', 'wadeID29', 'wadeID30', 'wadeID31',
       'wadeID32', 'wadeID33', 'wadeID34', 'wadeID35', 'wadeID36',
       'wadeID37', 'wadeID38', 'wadeID39', 'wadeID40', 'wadeID41',
       'wadeID42', 'wadeID43', 'wadeID44', 'wadeID45', 'wadeID46',
       'wadeID47', 'wadeID48', 'wadeID49', 'wadeID50', 'wadeID51',
       'wadeID52', 'wadeID53', 'wadeID54', 'wadeID55', 'wadeID56',
       'wadeID57', 'wadeID58', 'wadeID59', 'wadeID60', 'wadeID61',
       'wadeID62', 'wadeID63', 'wadeID64', 'wadeID65', 'wadeID66',
       'wadeID67', 'wadeID68', 'wadeID69', 'wadeID70', 'wadeID71',
       'wadeID72', 'wadeID73', 'wadeID74', 'wadeID75', 'wad

# Export the Output 

In [23]:
# Technique to check the datatype of long dataframes:
# lift the row/column display limits only for the duration of this print.
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    print(outdf.dtypes)

WaDEUUID                                                object
in_MethodUUID                                           object
in_VariableSpecificUUID                                 object
in_OrganizationUUID                                     object
in_Geometry                                             object
in_GNISFeatureNameCV                                    object
in_WaterQualityIndicatorCV                              object
in_WaterSourceName                                      object
in_WaterSourceNativeID                                  object
in_WaterSourceTypeCV                                    object
in_CoordinateAccuracy                                   object
in_CoordinateMethodCV                                   object
in_County                                               object
in_EPSGCodeCV                                            int64
in_GNISCodeCV                                           object
in_HUC12                                               

In [24]:
# Export the output dataframe
# archive_name names the CSV inside the zip explicitly (same form as the
# commented-out geometry export below) instead of relying on pandas' default
# member naming.
outdf.to_csv('Pwr_ndMain.zip', index=False, compression=dict(method='zip', archive_name='Pwr_ndMain.csv'))  # The output, save as a zip
#dfPoUshape.to_csv('P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.