# Pre-processing (state / organization Name) Allocation data for WaDE upload.
- Purpose:  To pre-process the data into one master file for simple DataFrame creation and extraction

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# ---- working directory ----
workingDirString = "G:/Shared drives/WaDE Data/Minnesota/WaterAllocation" # set working directory folder string here
os.chdir(workingDirString)
print(f'The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/Minnesota/WaterAllocation


## Point of Diversion Data

In [3]:
# Input File
FI_PoD = "RawInputData/mpars_index_permits_installations_uses_WGS1984.zip"
dfinPOD = pd.read_csv(FI_PoD, encoding = "ISO-8859-1").replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfinPOD:
    dfinPOD['WaDEUUID'] = "d" + dfinPOD.index.astype(str)
    dfinPOD.to_csv('RawInputData/mpars_index_permits_installations_uses_WGS1984.zip', compression=dict(method='zip', archive_name='mpars_index_permits_installatio.csv'), index=False)

print(len(dfinPOD))
dfinPOD.head()

30818


  dfinPOD = pd.read_csv(FI_PoD, encoding = "ISO-8859-1").replace(np.nan, "")


Unnamed: 0,Ã¯Â»Â¿OID_,permit_number,general_permit_number,permit_status,permit_class,use_type,use_category,permit_total_volume_mgy,permit_total_acres,permit_effective_date,permit_expiration_date,project_name,landowner,agent,installation_name,installation_status,installation_pumping_rate_gpm,location_legal_description,utm_x,utm_y,latitude,longitude,county_name,watershed_major,watershed_name,resource_type,resource_category,resource_name,resource_number,well_number,well_depth_ft,aquifer,use_2022_mg,use_2021_mg,use_2020_mg,use_2019_mg,use_2018_mg,use_2017_mg,use_2016_mg,use_2015_mg,use_2014_mg,use_2013_mg,use_2012_mg,use_2011_mg,use_2010_mg,use_2009_mg,use_2008_mg,use_2007_mg,use_2006_mg,use_2005_mg,use_2004_mg,use_2003_mg,use_2002_mg,use_2001_mg,use_2000_mg,use_1999_mg,use_1998_mg,use_1997_mg,use_1996_mg,use_1995_mg,use_1994_mg,use_1993_mg,use_1992_mg,use_1991_mg,use_1990_mg,use_1989_mg,use_1988_mg,WaDEUUID
0,1,1945-0008,,Inactive,Individual Permit,Agricultural/Food Processing,Industrial Processing,1769.7,,10/8/1984 0:00:00,,Moorhead,American Crystal Sugar Company,,1,Inactive,3363.0,T140N-R48W-S32,212733.0,5199908.0,46.89055,-96.77104,Clay,57.0,Upper Red River of the North,Stream/River,Surface Water,RED RIVER OF THE NORTH,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.3,0.0,0.0,7.992,0.0,29.5,24.0,0.0,d0
1,2,1947-0012,,Active,Individual Permit,Mine Processing (excludes sand/gravel),Industrial Processing,50000.0,,5/9/2003 0:00:00,,LAKE SUPERIOR,Cleveland Cliffs - Northshore Mining Co,,1,Active,95000.0,T55N-R7W-S6,630929.0,5237380.0,47.27682,-91.26885,Lake,2.0,Lake Superior - South,Lake,Surface Water,Superior,16000100.0,,,,15080.4,15664.8,15811.2,27024.0,44671.0,36464.0,36849.0,44169.0,45146.0,46551.0,45813.0,45362.0,44067.0,47035.0,46505.0,46792.0,45191.2,45861.6,46998.9,47304.0,45878.4,46008.0,47433.6,46677.82,47299.8,47304.0,44452.8,46641.0,1543.7,1548.0,39775.0,44145.6,31536.0,1123.0,0.0,d1
2,3,1947-0014,,Active,Individual Permit,Municipal/Public Water Supply,Water Supply,730.0,,2/24/1989 0:00:00,,,Moorhead Public Service,,Well #10 - 222051,Active,0.0,T139N-R47W-S5,223000.0,5198000.0,46.87777,-96.63539,Clay,58.0,Buffalo River,Groundwater,Groundwater,QBAA,,222051.0,124.0,Quaternary Buried Artesian,0.00197,15.1196,9.94974,0.0,36.10323,2.5915,7.64653,0.0,18.22546,0.0,18.03,2.81,0.0,0.54,0.0,0.0,0.0,8.501,10.593,32.31,0.346,1.86,2.901,10.312,16.428,22.13,21.223,3.88407,96.703,92.571,33.778,0.0,,,,d2
3,4,1947-0014,,Active,Individual Permit,Municipal/Public Water Supply,Water Supply,730.0,,2/24/1989 0:00:00,,,Moorhead Public Service,,Well #8 - 222049,Active,0.0,T139N-R47W-S5,223000.0,5198000.0,46.87777,-96.63539,Clay,58.0,Buffalo River,Groundwater,Groundwater,QBAA,,222049.0,122.0,Quaternary Buried Artesian,43.75936,177.65471,101.81552,94.83861,90.70029,66.43305,45.53407,51.25836,67.91247,101.29814,133.09,100.39,96.21,27.32,34.242,36.84,30.528,10.784,35.3915,33.47,89.021,32.58,31.722,54.469,75.965,119.03,100.034,93.11032,74.87,124.759,146.615,272.7,293.33,355.207,611.63,d3
4,5,1947-0014,,Active,Individual Permit,Municipal/Public Water Supply,Water Supply,730.0,,2/24/1989 0:00:00,,,Moorhead Public Service,,Well #9 - 222050,Active,0.0,T139N-R47W-S5,223000.0,5198000.0,46.87777,-96.63539,Clay,58.0,Buffalo River,Groundwater,Groundwater,QBUA,,222050.0,114.0,Quaternary Buried Unconfined,89.3954,41.09104,98.08311,116.75849,145.53145,247.56611,124.52859,86.51413,24.67334,51.96678,134.57,49.13,32.47,18.51,57.858,86.11,45.682,43.795,26.6755,7.45,0.213,52.51,82.786,43.199,57.237,33.41,57.813,67.06561,128.327,65.2,115.069,0.0,,,,d4


In [4]:
# convert MGY to CFS for WaDE
# / 365 to get MGD * 1.8581441079018 to get CFS

def ConvertMGYToCFSFunc(valA):
    if str(valA).strip() == "":
        outVal = "";
    else:
        outVal = valA / 365 * 1.8581441079018
    return outVal

dfinPOD['in_AllocationFlow_CFS'] = dfinPOD.apply(lambda row: ConvertMGYToCFSFunc(row['permit_total_volume_mgy']), axis=1)
dfinPOD.head()

Unnamed: 0,Ã¯Â»Â¿OID_,permit_number,general_permit_number,permit_status,permit_class,use_type,use_category,permit_total_volume_mgy,permit_total_acres,permit_effective_date,permit_expiration_date,project_name,landowner,agent,installation_name,installation_status,installation_pumping_rate_gpm,location_legal_description,utm_x,utm_y,latitude,longitude,county_name,watershed_major,watershed_name,resource_type,resource_category,resource_name,resource_number,well_number,well_depth_ft,aquifer,use_2022_mg,use_2021_mg,use_2020_mg,use_2019_mg,use_2018_mg,use_2017_mg,use_2016_mg,use_2015_mg,use_2014_mg,use_2013_mg,use_2012_mg,use_2011_mg,use_2010_mg,use_2009_mg,use_2008_mg,use_2007_mg,use_2006_mg,use_2005_mg,use_2004_mg,use_2003_mg,use_2002_mg,use_2001_mg,use_2000_mg,use_1999_mg,use_1998_mg,use_1997_mg,use_1996_mg,use_1995_mg,use_1994_mg,use_1993_mg,use_1992_mg,use_1991_mg,use_1990_mg,use_1989_mg,use_1988_mg,WaDEUUID,in_AllocationFlow_CFS
0,1,1945-0008,,Inactive,Individual Permit,Agricultural/Food Processing,Industrial Processing,1769.7,,10/8/1984 0:00:00,,Moorhead,American Crystal Sugar Company,,1,Inactive,3363.0,T140N-R48W-S32,212733.0,5199908.0,46.89055,-96.77104,Clay,57.0,Upper Red River of the North,Stream/River,Surface Water,RED RIVER OF THE NORTH,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.3,0.0,0.0,7.992,0.0,29.5,24.0,0.0,d0,9.0092
1,2,1947-0012,,Active,Individual Permit,Mine Processing (excludes sand/gravel),Industrial Processing,50000.0,,5/9/2003 0:00:00,,LAKE SUPERIOR,Cleveland Cliffs - Northshore Mining Co,,1,Active,95000.0,T55N-R7W-S6,630929.0,5237380.0,47.27682,-91.26885,Lake,2.0,Lake Superior - South,Lake,Surface Water,Superior,16000100.0,,,,15080.4,15664.8,15811.2,27024.0,44671.0,36464.0,36849.0,44169.0,45146.0,46551.0,45813.0,45362.0,44067.0,47035.0,46505.0,46792.0,45191.2,45861.6,46998.9,47304.0,45878.4,46008.0,47433.6,46677.82,47299.8,47304.0,44452.8,46641.0,1543.7,1548.0,39775.0,44145.6,31536.0,1123.0,0.0,d1,254.54029
2,3,1947-0014,,Active,Individual Permit,Municipal/Public Water Supply,Water Supply,730.0,,2/24/1989 0:00:00,,,Moorhead Public Service,,Well #10 - 222051,Active,0.0,T139N-R47W-S5,223000.0,5198000.0,46.87777,-96.63539,Clay,58.0,Buffalo River,Groundwater,Groundwater,QBAA,,222051.0,124.0,Quaternary Buried Artesian,0.00197,15.1196,9.94974,0.0,36.10323,2.5915,7.64653,0.0,18.22546,0.0,18.03,2.81,0.0,0.54,0.0,0.0,0.0,8.501,10.593,32.31,0.346,1.86,2.901,10.312,16.428,22.13,21.223,3.88407,96.703,92.571,33.778,0.0,,,,d2,3.71629
3,4,1947-0014,,Active,Individual Permit,Municipal/Public Water Supply,Water Supply,730.0,,2/24/1989 0:00:00,,,Moorhead Public Service,,Well #8 - 222049,Active,0.0,T139N-R47W-S5,223000.0,5198000.0,46.87777,-96.63539,Clay,58.0,Buffalo River,Groundwater,Groundwater,QBAA,,222049.0,122.0,Quaternary Buried Artesian,43.75936,177.65471,101.81552,94.83861,90.70029,66.43305,45.53407,51.25836,67.91247,101.29814,133.09,100.39,96.21,27.32,34.242,36.84,30.528,10.784,35.3915,33.47,89.021,32.58,31.722,54.469,75.965,119.03,100.034,93.11032,74.87,124.759,146.615,272.7,293.33,355.207,611.63,d3,3.71629
4,5,1947-0014,,Active,Individual Permit,Municipal/Public Water Supply,Water Supply,730.0,,2/24/1989 0:00:00,,,Moorhead Public Service,,Well #9 - 222050,Active,0.0,T139N-R47W-S5,223000.0,5198000.0,46.87777,-96.63539,Clay,58.0,Buffalo River,Groundwater,Groundwater,QBUA,,222050.0,114.0,Quaternary Buried Unconfined,89.3954,41.09104,98.08311,116.75849,145.53145,247.56611,124.52859,86.51413,24.67334,51.96678,134.57,49.13,32.47,18.51,57.858,86.11,45.682,43.795,26.6755,7.45,0.213,52.51,82.786,43.199,57.237,33.41,57.813,67.06561,128.327,65.2,115.069,0.0,,,,d4,3.71629


In [5]:
# create single in_WaterSourceNativeID to work with

def CreateSingleWsIDFunc(valA, valB):
    valA = str(valA).split('.')[0]
    valB = str(valB).split('.')[0]
    
    if valA != "" and valB == "":
        outString = valA
    elif valA == "" and valB != "":
        outString = valB
    else:
        outString = ""
    
    return outString

dfinPOD['in_WaterSourceNativeID'] = dfinPOD.apply(lambda row: CreateSingleWsIDFunc(row['resource_number'], row['well_number']), axis=1)
dfinPOD.head()

Unnamed: 0,Ã¯Â»Â¿OID_,permit_number,general_permit_number,permit_status,permit_class,use_type,use_category,permit_total_volume_mgy,permit_total_acres,permit_effective_date,permit_expiration_date,project_name,landowner,agent,installation_name,installation_status,installation_pumping_rate_gpm,location_legal_description,utm_x,utm_y,latitude,longitude,county_name,watershed_major,watershed_name,resource_type,resource_category,resource_name,resource_number,well_number,well_depth_ft,aquifer,use_2022_mg,use_2021_mg,use_2020_mg,use_2019_mg,use_2018_mg,use_2017_mg,use_2016_mg,use_2015_mg,use_2014_mg,use_2013_mg,use_2012_mg,use_2011_mg,use_2010_mg,use_2009_mg,use_2008_mg,use_2007_mg,use_2006_mg,use_2005_mg,use_2004_mg,use_2003_mg,use_2002_mg,use_2001_mg,use_2000_mg,use_1999_mg,use_1998_mg,use_1997_mg,use_1996_mg,use_1995_mg,use_1994_mg,use_1993_mg,use_1992_mg,use_1991_mg,use_1990_mg,use_1989_mg,use_1988_mg,WaDEUUID,in_AllocationFlow_CFS,in_WaterSourceNativeID
0,1,1945-0008,,Inactive,Individual Permit,Agricultural/Food Processing,Industrial Processing,1769.7,,10/8/1984 0:00:00,,Moorhead,American Crystal Sugar Company,,1,Inactive,3363.0,T140N-R48W-S32,212733.0,5199908.0,46.89055,-96.77104,Clay,57.0,Upper Red River of the North,Stream/River,Surface Water,RED RIVER OF THE NORTH,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.3,0.0,0.0,7.992,0.0,29.5,24.0,0.0,d0,9.0092,
1,2,1947-0012,,Active,Individual Permit,Mine Processing (excludes sand/gravel),Industrial Processing,50000.0,,5/9/2003 0:00:00,,LAKE SUPERIOR,Cleveland Cliffs - Northshore Mining Co,,1,Active,95000.0,T55N-R7W-S6,630929.0,5237380.0,47.27682,-91.26885,Lake,2.0,Lake Superior - South,Lake,Surface Water,Superior,16000100.0,,,,15080.4,15664.8,15811.2,27024.0,44671.0,36464.0,36849.0,44169.0,45146.0,46551.0,45813.0,45362.0,44067.0,47035.0,46505.0,46792.0,45191.2,45861.6,46998.9,47304.0,45878.4,46008.0,47433.6,46677.82,47299.8,47304.0,44452.8,46641.0,1543.7,1548.0,39775.0,44145.6,31536.0,1123.0,0.0,d1,254.54029,16000100.0
2,3,1947-0014,,Active,Individual Permit,Municipal/Public Water Supply,Water Supply,730.0,,2/24/1989 0:00:00,,,Moorhead Public Service,,Well #10 - 222051,Active,0.0,T139N-R47W-S5,223000.0,5198000.0,46.87777,-96.63539,Clay,58.0,Buffalo River,Groundwater,Groundwater,QBAA,,222051.0,124.0,Quaternary Buried Artesian,0.00197,15.1196,9.94974,0.0,36.10323,2.5915,7.64653,0.0,18.22546,0.0,18.03,2.81,0.0,0.54,0.0,0.0,0.0,8.501,10.593,32.31,0.346,1.86,2.901,10.312,16.428,22.13,21.223,3.88407,96.703,92.571,33.778,0.0,,,,d2,3.71629,222051.0
3,4,1947-0014,,Active,Individual Permit,Municipal/Public Water Supply,Water Supply,730.0,,2/24/1989 0:00:00,,,Moorhead Public Service,,Well #8 - 222049,Active,0.0,T139N-R47W-S5,223000.0,5198000.0,46.87777,-96.63539,Clay,58.0,Buffalo River,Groundwater,Groundwater,QBAA,,222049.0,122.0,Quaternary Buried Artesian,43.75936,177.65471,101.81552,94.83861,90.70029,66.43305,45.53407,51.25836,67.91247,101.29814,133.09,100.39,96.21,27.32,34.242,36.84,30.528,10.784,35.3915,33.47,89.021,32.58,31.722,54.469,75.965,119.03,100.034,93.11032,74.87,124.759,146.615,272.7,293.33,355.207,611.63,d3,3.71629,222049.0
4,5,1947-0014,,Active,Individual Permit,Municipal/Public Water Supply,Water Supply,730.0,,2/24/1989 0:00:00,,,Moorhead Public Service,,Well #9 - 222050,Active,0.0,T139N-R47W-S5,223000.0,5198000.0,46.87777,-96.63539,Clay,58.0,Buffalo River,Groundwater,Groundwater,QBUA,,222050.0,114.0,Quaternary Buried Unconfined,89.3954,41.09104,98.08311,116.75849,145.53145,247.56611,124.52859,86.51413,24.67334,51.96678,134.57,49.13,32.47,18.51,57.858,86.11,45.682,43.795,26.6755,7.45,0.213,52.51,82.786,43.199,57.237,33.41,57.813,67.06561,128.327,65.2,115.069,0.0,,,,d4,3.71629,222050.0


In [6]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfinPOD['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "MNwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "MNwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "MNwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = dfinPOD['resource_name']
df['in_WaterSourceNativeID'] = dfinPOD['in_WaterSourceNativeID'] # will supllment empty Ids below
df['in_WaterSourceTypeCV'] = dfinPOD['resource_category']

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = dfinPOD['county_name']
df['in_EPSGCodeCV'] = "4326"
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfinPOD['latitude']
df['in_Longitude'] = dfinPOD['longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"  # "Point of Diversion"
df['in_SiteName'] = dfinPOD['installation_name']
df['in_SiteNativeID'] = "" # not available, will create custom Id below
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = ""
df['in_StateCV'] = "MN"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = dfinPOD['permit_expiration_date']
df['in_AllocationFlow_CFS'] = dfinPOD['in_AllocationFlow_CFS']
df['in_AllocationLegalStatusCV'] = dfinPOD['permit_status']
df['in_AllocationNativeID'] =  dfinPOD['permit_number']
df['in_AllocationOwner'] = dfinPOD['landowner']
df['in_AllocationPriorityDate'] = dfinPOD['permit_effective_date']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = dfinPOD['permit_class']
df['in_AllocationVolume_AF'] = ""
df['in_BeneficialUseCategory'] = dfinPOD['use_category']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = "0" # either a 1 for excempt or 0 for not-excempt
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = dfinPOD['permit_total_acres']
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = ""

outPOD = df.copy()
outPOD = outPOD.drop_duplicates().reset_index(drop=True)
print(len(outPOD))
outPOD.head()

30818


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,d0,MNwr_M1,MNwr_V1,MNwr_O1,,,,RED RIVER OF THE NORTH,,Surface Water,,,Clay,4326,,,,46.89055,-96.77104,,,POD,1,,,,MN,,,,,,,,,,9.0092,Inactive,1945-0008,American Crystal Sugar Company,10/8/1984 0:00:00,,,,Individual Permit,,Industrial Processing,,,,,,0,,,,,,,,,,
1,d1,MNwr_M1,MNwr_V1,MNwr_O1,,,,Superior,16000100.0,Surface Water,,,Lake,4326,,,,47.27682,-91.26885,,,POD,1,,,,MN,,,,,,,,,,254.54029,Active,1947-0012,Cleveland Cliffs - Northshore Mining Co,5/9/2003 0:00:00,,,,Individual Permit,,Industrial Processing,,,,,,0,,,,,,,,,,
2,d2,MNwr_M1,MNwr_V1,MNwr_O1,,,,QBAA,222051.0,Groundwater,,,Clay,4326,,,,46.87777,-96.63539,,,POD,Well #10 - 222051,,,,MN,,,,,,,,,,3.71629,Active,1947-0014,Moorhead Public Service,2/24/1989 0:00:00,,,,Individual Permit,,Water Supply,,,,,,0,,,,,,,,,,
3,d3,MNwr_M1,MNwr_V1,MNwr_O1,,,,QBAA,222049.0,Groundwater,,,Clay,4326,,,,46.87777,-96.63539,,,POD,Well #8 - 222049,,,,MN,,,,,,,,,,3.71629,Active,1947-0014,Moorhead Public Service,2/24/1989 0:00:00,,,,Individual Permit,,Water Supply,,,,,,0,,,,,,,,,,
4,d4,MNwr_M1,MNwr_V1,MNwr_O1,,,,QBUA,222050.0,Groundwater,,,Clay,4326,,,,46.87777,-96.63539,,,POD,Well #9 - 222050,,,,MN,,,,,,,,,,3.71629,Active,1947-0014,Moorhead Public Service,2/24/1989 0:00:00,,,,Individual Permit,,Water Supply,,,,,,0,,,,,,,,,,


## Place of Use Data

In [7]:
# N/A

## Concatenate POD and POU Data.  Make needed changes

In [8]:
# Concatenate dataframes
frames = [outPOD]  # list all out dataframes here
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

30818


## Clean Data / data types

In [9]:
# Clean name entries of spcial characters
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;/\),(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [10]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['Red River Of The North', 'Superior', 'Qbaa', ..., 'South Lida',
       'St Louis River Estuary St Louis Bay', 'East Sunburg'],
      dtype=object)

In [11]:
outdf['in_County'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['Clay', 'Lake', 'Otter Tail', 'St Louis', 'Blue Earth',
       'Sherburne', 'Martin', 'Scott', 'Cottonwood', 'Isanti', 'Itasca',
       'Polk', 'Anoka', 'Hennepin', 'Mcleod', 'Nobles', 'Cook', 'Grant',
       'Dakota', 'Marshall', 'Crow Wing', 'Jackson', 'Redwood',
       'Mille Lacs', 'Freeborn', 'Rock', 'Lincoln', 'Big Stone', 'Ramsey',
       'Mower', 'Carver', 'Nicollet', 'Becker', 'Kandiyohi', 'Wadena',
       'Norman', 'Le Sueur', 'Pope', 'Hubbard', 'Meeker', 'Beltrami',
       'Olmsted', 'Traverse', 'Washington', 'Stearns', 'Lac Qui Parle',
       'Benton', 'Clearwater', 'Faribault', 'Todd', 'Swift', 'Rice',
       'Aitkin', 'Renville', 'Goodhue', 'Wright', 'Fillmore', 'Red Lake',
       'Douglas', 'Sibley', 'Cass', 'Stevens', 'Pipestone', 'Winona',
       'Morrison', 'Houston', 'Chisago', 'Kittson', 'Lyon', 'Carlton',
       'Roseau', 'Watonwan', 'Dodge', 'Lake Of The Woods', 'Brown',
       'Kanabec', 'Yellow Medicine', 'Pine', 'Chippewa', 'Koochiching',
       'Wabasha

In [12]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['1', 'Well #10 222051', 'Well #8 222049', ...,
       '23034 Unnamed Pond', '23025 Elk Lake', '23027 Sunburg Lake'],
      dtype=object)

In [13]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['American Crystal Sugar Company',
       'Cleveland Cliffs Northshore Mining Co', 'Moorhead Public Service',
       ..., 'Ep Gts Housing Phase I Llc', 'Leatherman Eric',
       'Erichson Mark'], dtype=object)

In [14]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [15]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['Red River Of The North', 'Superior', 'Qbaa', ..., 'South Lida',
       'St Louis River Estuary St Louis Bay', 'East Sunburg'],
      dtype=object)

In [16]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Groundwater', ''], dtype=object)

In [17]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array([''], dtype=object)

In [18]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['1', 'Well #10 222051', 'Well #8 222049', ...,
       '23034 Unnamed Pond', '23025 Elk Lake', '23027 Sunburg Lake'],
      dtype=object)

In [19]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['American Crystal Sugar Company',
       'Cleveland Cliffs Northshore Mining Co', 'Moorhead Public Service',
       ..., 'Ep Gts Housing Phase I Llc', 'Leatherman Eric',
       'Erichson Mark'], dtype=object)

In [20]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
uniqueList = list(set([i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')]))
uniqueList.sort()
uniqueList

['Agricultural Irrigation',
 'Heating/Cooling',
 'Industrial Processing',
 'Non-Crop Irrigation',
 'Power Generation',
 'Special Categories',
 'Water Level Maintenance',
 'Water Supply']

In [21]:
# Ensure Latitude entry is either numireic or a 0
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

array([46.8905514 , 47.27681851, 46.87777341, ..., 45.92172337,
       45.06732735, 45.32635259])

In [22]:
# Ensure Longitude entry is either numireic or a 0
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

array([-96.7710413 , -91.26884763, -96.63538667, ..., -95.53038881,
       -95.71501657, -95.23501413])

In [23]:
# Changing datatype of Expiration Date to date fields entry
outdf['in_AllocationExpirationDate'] = pd.to_datetime(outdf['in_AllocationExpirationDate'], errors = 'coerce')
outdf['in_AllocationExpirationDate'] = pd.to_datetime(outdf["in_AllocationExpirationDate"].dt.strftime('%m/%d/%Y'))
outdf['in_AllocationExpirationDate'] = outdf['in_AllocationExpirationDate'].astype(str).str.replace("NaT", "")
outdf['in_AllocationExpirationDate'].unique()

array(['', '2017-05-26', '2024-12-31', ..., '2024-08-31', '2024-10-02',
       '2024-09-12'], dtype=object)

In [24]:
# Changing datatype of Priority Date to date fields entry
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors = 'coerce')
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf["in_AllocationPriorityDate"].dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

array(['1984-10-08T00:00:00.000000000', '2003-05-09T00:00:00.000000000',
       '1989-02-24T00:00:00.000000000', ...,
       '2023-09-21T00:00:00.000000000', '2023-09-12T00:00:00.000000000',
       '2023-09-27T00:00:00.000000000'], dtype='datetime64[ns]')

In [25]:
# Ensure Flow entry is either numireic
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

array([9.00919898014744, 254.54028875367123, 3.7162882158036, ...,
       18.734165252270202, 1.5944403687529964, 2.708308672339062],
      dtype=object)

In [26]:
# Ensure Volume entry is either numireic
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationVolume_AF'].unique()

array([''], dtype=object)

In [27]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_WaterSourceNativeID'], 
                                                                              row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wadeId1', '16000100', '222051', ..., '21014200', '12006700',
       '34033600'], dtype=object)

In [28]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_Latitude'].astype(str) + dfTempID['in_Longitude'].astype(str) + dfTempID['in_SiteName'].astype(str)+ dfTempID['in_SiteTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_SiteNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip() + str(valC).strip() + str(valD).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_SiteNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_SiteNativeID'], 
                                                                       row['in_Latitude'], row['in_Longitude'],
                                                                       row['in_SiteName'], row['in_SiteTypeCV']), axis=1)
outdf['in_SiteNativeID'].unique()

array(['wadeId1', 'wadeId2', 'wadeId3', ..., 'wadeId29225', 'wadeId29226',
       'wadeId29227'], dtype=object)

## Drop non-Active AllocationLegalStatusCV Water Rights
- For this {state name / organization}, we don't want water rights that are considered: {enter string entries here}

In [29]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# drop the list
dropLegalStatusList = ["Inactive"] # enter string entries here

# drop rows from above list
outdf = outdf[outdf.in_AllocationLegalStatusCV.isin(dropLegalStatusList) == False].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

17733


array(['Active'], dtype=object)

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [30]:
# N/A

## Export Data

In [31]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17733 entries, 0 to 17732
Data columns (total 63 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   WaDEUUID                                      17733 non-null  object        
 1   in_MethodUUID                                 17733 non-null  object        
 2   in_VariableSpecificUUID                       17733 non-null  object        
 3   in_OrganizationUUID                           17733 non-null  object        
 4   in_Geometry                                   17733 non-null  object        
 5   in_GNISFeatureNameCV                          17733 non-null  object        
 6   in_WaterQualityIndicatorCV                    17733 non-null  object        
 7   in_WaterSourceName                            17733 non-null  object        
 8   in_WaterSourceNativeID                        17733 non-null  obje

In [32]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,d1,MNwr_M1,MNwr_V1,MNwr_O1,,,,Superior,16000100,Surface Water,,,Lake,4326,,,,47.27682,-91.26885,,,POD,1,wadeId2,,,MN,,,,,,,,,,254.54029,Active,1947-0012,Cleveland Cliffs Northshore Mining Co,2003-05-09,,,,Individual Permit,,Industrial Processing,,,,,,0,,,,,,,,,,
1,d2,MNwr_M1,MNwr_V1,MNwr_O1,,,,Qbaa,222051,Groundwater,,,Clay,4326,,,,46.87777,-96.63539,,,POD,Well #10 222051,wadeId3,,,MN,,,,,,,,,,3.71629,Active,1947-0014,Moorhead Public Service,1989-02-24,,,,Individual Permit,,Water Supply,,,,,,0,,,,,,,,,,
2,d3,MNwr_M1,MNwr_V1,MNwr_O1,,,,Qbaa,222049,Groundwater,,,Clay,4326,,,,46.87777,-96.63539,,,POD,Well #8 222049,wadeId4,,,MN,,,,,,,,,,3.71629,Active,1947-0014,Moorhead Public Service,1989-02-24,,,,Individual Permit,,Water Supply,,,,,,0,,,,,,,,,,
3,d4,MNwr_M1,MNwr_V1,MNwr_O1,,,,Qbua,222050,Groundwater,,,Clay,4326,,,,46.87777,-96.63539,,,POD,Well #9 222050,wadeId5,,,MN,,,,,,,,,,3.71629,Active,1947-0014,Moorhead Public Service,1989-02-24,,,,Individual Permit,,Water Supply,,,,,,0,,,,,,,,,,
4,d6,MNwr_M1,MNwr_V1,MNwr_O1,,,,Colby,69024900,Surface Water,,,St Louis,4326,,,,47.53334,-92.15899,,,POD,1,wadeId7,,,MN,,,,,,,,,,22.94528,Active,1949-0135,Cliffs Erie Llc Minnesota Power A Division Of ...,2018-11-01,,,,Individual Permit,,Industrial Processing,,,,,,0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17728,d30813,MNwr_M1,MNwr_V1,MNwr_O1,,,,,wadeId14,Groundwater,,,Chisago,4326,,,,45.32641,-92.99987,,,POD,Installation #1,wadeId29223,,,MN,,,,,,,,,2023-10-31,0.01527,Active,2023-2878,Erichson Mark,2023-09-25,,,,General Permit Authorization,,Water Level Maintenance,,,,,,0,,,,,,,,,,
17729,d30814,MNwr_M1,MNwr_V1,MNwr_O1,,,,,wadeId51,Surface Water,,,Otter Tail,4326,,,,46.55716,-95.96962,,,POD,23034 Unnamed Pond,wadeId29224,,,MN,,,,,,,,,2023-12-31,0.01018,Active,2023-2892,Central Specialties Inc,2023-09-20,,,,General Permit Authorization,,Special Categories,,,,,,0,,,,,,,,,,
17730,d30815,MNwr_M1,MNwr_V1,MNwr_O1,,,,Elk,21014200,Surface Water,,,Douglas,4326,,,,45.92172,-95.53039,,,POD,23025 Elk Lake,wadeId29225,,,MN,,,,,,,,,2023-12-31,0.25454,Active,2023-2894,Central Specialties Inc,2023-09-21,,,,General Permit Authorization,,Special Categories,,,,,,0,,,,,,,,,,
17731,d30816,MNwr_M1,MNwr_V1,MNwr_O1,,,,Unnamed,12006700,Surface Water,,,Chippewa,4326,,,,45.06733,-95.71502,,,POD,Installation #1,wadeId29226,,,MN,,,,,,,,,2023-12-31,0.00967,Active,2023-2913,Chippewa County Highway Dept,2023-10-05,,,,General Permit Authorization,,Special Categories,,,,,,0,,,,,,,,,,


In [33]:
# Export the output dataframe
# change output name / abbreviation to match native state provdier and wade data type 
outdf.to_csv('RawInputData/Pwr_Main.zip', compression=dict(method='zip', archive_name='Pwr_Main.csv'), index=False)  # The output, save as a zip
#dfPoUshape.to_csv('P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.