# Pre-processing Minnesota Water Right and Time Series Water Use data for WaDE Upload
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# ---- working directory ----
workingDirString = "G:/Shared drives/WaDE Data/Minnesota/WaterAllocation_WaterUse" # set working directory folder string here
os.chdir(workingDirString)
print(f'The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/Minnesota/WaterAllocation_WaterUse


## Data Input 1
- asdf

In [3]:
# Input File - asdf
fileInput = "RawInputData/mpars_index_permits_installations_uses_WGS1984.zip"
dfin1 = pd.read_csv(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin1:
    dfin1['WaDEUUID'] = "in1" + dfin1.index.astype(str)
    dfin1.to_csv("RawInputData/mpars_index_permits_installations_uses_WGS1984.zip", compression=dict(method='zip', archive_name="mpars_index_permits_installations_uses_WGS1984.csv"), index=False)

print(len(dfin1))
dfin1.head(1)

30818


  dfin1 = pd.read_csv(fileInput).replace(np.nan, "")


Unnamed: 0,ï»¿OID_,permit_number,general_permit_number,permit_status,permit_class,use_type,use_category,permit_total_volume_mgy,permit_total_acres,permit_effective_date,permit_expiration_date,project_name,landowner,agent,installation_name,installation_status,installation_pumping_rate_gpm,location_legal_description,utm_x,utm_y,latitude,longitude,county_name,watershed_major,watershed_name,resource_type,resource_category,resource_name,resource_number,well_number,well_depth_ft,aquifer,use_2022_mg,use_2021_mg,use_2020_mg,use_2019_mg,use_2018_mg,use_2017_mg,use_2016_mg,use_2015_mg,use_2014_mg,use_2013_mg,use_2012_mg,use_2011_mg,use_2010_mg,use_2009_mg,use_2008_mg,use_2007_mg,use_2006_mg,use_2005_mg,use_2004_mg,use_2003_mg,use_2002_mg,use_2001_mg,use_2000_mg,use_1999_mg,use_1998_mg,use_1997_mg,use_1996_mg,use_1995_mg,use_1994_mg,use_1993_mg,use_1992_mg,use_1991_mg,use_1990_mg,use_1989_mg,use_1988_mg,WaDEUUID
0,1,1945-0008,,Inactive,Individual Permit,Agricultural/Food Processing,Industrial Processing,1769.7,,10/8/1984 0:00:00,,Moorhead,American Crystal Sugar Company,,1,Inactive,3363.0,T140N-R48W-S32,212733.0,5199908.0,46.89055,-96.77104,Clay,57.0,Upper Red River of the North,Stream/River,Surface Water,RED RIVER OF THE NORTH,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.3,0.0,0.0,7.992,0.0,29.5,24.0,0.0,d0


In [4]:
# convert MG to AF for WaDE
# 1 MG =  3.06888785 AF

def ConvertMGToAFFunc(valA):
    if str(valA).strip() == "":
        outVal = "";
    else:
        outVal = valA * 3.06888785
    return outVal

dfin1['in_AllocationVolume_AF'] = dfin1.apply(lambda row: ConvertMGToAFFunc(row['permit_total_volume_mgy']), axis=1)
dfin1.head(1)

Unnamed: 0,ï»¿OID_,permit_number,general_permit_number,permit_status,permit_class,use_type,use_category,permit_total_volume_mgy,permit_total_acres,permit_effective_date,permit_expiration_date,project_name,landowner,agent,installation_name,installation_status,installation_pumping_rate_gpm,location_legal_description,utm_x,utm_y,latitude,longitude,county_name,watershed_major,watershed_name,resource_type,resource_category,resource_name,resource_number,well_number,well_depth_ft,aquifer,use_2022_mg,use_2021_mg,use_2020_mg,use_2019_mg,use_2018_mg,use_2017_mg,use_2016_mg,use_2015_mg,use_2014_mg,use_2013_mg,use_2012_mg,use_2011_mg,use_2010_mg,use_2009_mg,use_2008_mg,use_2007_mg,use_2006_mg,use_2005_mg,use_2004_mg,use_2003_mg,use_2002_mg,use_2001_mg,use_2000_mg,use_1999_mg,use_1998_mg,use_1997_mg,use_1996_mg,use_1995_mg,use_1994_mg,use_1993_mg,use_1992_mg,use_1991_mg,use_1990_mg,use_1989_mg,use_1988_mg,WaDEUUID,in_AllocationVolume_AF
0,1,1945-0008,,Inactive,Individual Permit,Agricultural/Food Processing,Industrial Processing,1769.7,,10/8/1984 0:00:00,,Moorhead,American Crystal Sugar Company,,1,Inactive,3363.0,T140N-R48W-S32,212733.0,5199908.0,46.89055,-96.77104,Clay,57.0,Upper Red River of the North,Stream/River,Surface Water,RED RIVER OF THE NORTH,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.3,0.0,0.0,7.992,0.0,29.5,24.0,0.0,d0,5431.01083


In [5]:
# create single in_WaterSourceNativeID to work with

def CreateSingleWsIDFunc(valA, valB):
    valA = str(valA).split('.')[0]
    valB = str(valB).split('.')[0]
    
    if valA != "" and valB == "":
        outString = valA
    elif valA == "" and valB != "":
        outString = valB
    else:
        outString = ""
    
    return outString

dfin1['in_WaterSourceNativeID'] = dfin1.apply(lambda row: CreateSingleWsIDFunc(row['resource_number'], row['well_number']), axis=1)
dfin1.head()

Unnamed: 0,ï»¿OID_,permit_number,general_permit_number,permit_status,permit_class,use_type,use_category,permit_total_volume_mgy,permit_total_acres,permit_effective_date,permit_expiration_date,project_name,landowner,agent,installation_name,installation_status,installation_pumping_rate_gpm,location_legal_description,utm_x,utm_y,latitude,longitude,county_name,watershed_major,watershed_name,resource_type,resource_category,resource_name,resource_number,well_number,well_depth_ft,aquifer,use_2022_mg,use_2021_mg,use_2020_mg,use_2019_mg,use_2018_mg,use_2017_mg,use_2016_mg,use_2015_mg,use_2014_mg,use_2013_mg,use_2012_mg,use_2011_mg,use_2010_mg,use_2009_mg,use_2008_mg,use_2007_mg,use_2006_mg,use_2005_mg,use_2004_mg,use_2003_mg,use_2002_mg,use_2001_mg,use_2000_mg,use_1999_mg,use_1998_mg,use_1997_mg,use_1996_mg,use_1995_mg,use_1994_mg,use_1993_mg,use_1992_mg,use_1991_mg,use_1990_mg,use_1989_mg,use_1988_mg,WaDEUUID,in_AllocationVolume_AF,in_WaterSourceNativeID
0,1,1945-0008,,Inactive,Individual Permit,Agricultural/Food Processing,Industrial Processing,1769.7,,10/8/1984 0:00:00,,Moorhead,American Crystal Sugar Company,,1,Inactive,3363.0,T140N-R48W-S32,212733.0,5199908.0,46.89055,-96.77104,Clay,57.0,Upper Red River of the North,Stream/River,Surface Water,RED RIVER OF THE NORTH,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.3,0.0,0.0,7.992,0.0,29.5,24.0,0.0,d0,5431.01083,
1,2,1947-0012,,Active,Individual Permit,Mine Processing (excludes sand/gravel),Industrial Processing,50000.0,,5/9/2003 0:00:00,,LAKE SUPERIOR,Cleveland Cliffs - Northshore Mining Co,,1,Active,95000.0,T55N-R7W-S6,630929.0,5237380.0,47.27682,-91.26885,Lake,2.0,Lake Superior - South,Lake,Surface Water,Superior,16000100.0,,,,15080.4,15664.8,15811.2,27024.0,44671.0,36464.0,36849.0,44169.0,45146.0,46551.0,45813.0,45362.0,44067.0,47035.0,46505.0,46792.0,45191.2,45861.6,46998.9,47304.0,45878.4,46008.0,47433.6,46677.82,47299.8,47304.0,44452.8,46641.0,1543.7,1548.0,39775.0,44145.6,31536.0,1123.0,0.0,d1,153444.3925,16000100.0
2,3,1947-0014,,Active,Individual Permit,Municipal/Public Water Supply,Water Supply,730.0,,2/24/1989 0:00:00,,,Moorhead Public Service,,Well #10 - 222051,Active,0.0,T139N-R47W-S5,223000.0,5198000.0,46.87777,-96.63539,Clay,58.0,Buffalo River,Groundwater,Groundwater,QBAA,,222051.0,124.0,Quaternary Buried Artesian,0.00197,15.1196,9.94974,0.0,36.10323,2.5915,7.64653,0.0,18.22546,0.0,18.03,2.81,0.0,0.54,0.0,0.0,0.0,8.501,10.593,32.31,0.346,1.86,2.901,10.312,16.428,22.13,21.223,3.88407,96.703,92.571,33.778,0.0,,,,d2,2240.28813,222051.0
3,4,1947-0014,,Active,Individual Permit,Municipal/Public Water Supply,Water Supply,730.0,,2/24/1989 0:00:00,,,Moorhead Public Service,,Well #8 - 222049,Active,0.0,T139N-R47W-S5,223000.0,5198000.0,46.87777,-96.63539,Clay,58.0,Buffalo River,Groundwater,Groundwater,QBAA,,222049.0,122.0,Quaternary Buried Artesian,43.75936,177.65471,101.81552,94.83861,90.70029,66.43305,45.53407,51.25836,67.91247,101.29814,133.09,100.39,96.21,27.32,34.242,36.84,30.528,10.784,35.3915,33.47,89.021,32.58,31.722,54.469,75.965,119.03,100.034,93.11032,74.87,124.759,146.615,272.7,293.33,355.207,611.63,d3,2240.28813,222049.0
4,5,1947-0014,,Active,Individual Permit,Municipal/Public Water Supply,Water Supply,730.0,,2/24/1989 0:00:00,,,Moorhead Public Service,,Well #9 - 222050,Active,0.0,T139N-R47W-S5,223000.0,5198000.0,46.87777,-96.63539,Clay,58.0,Buffalo River,Groundwater,Groundwater,QBUA,,222050.0,114.0,Quaternary Buried Unconfined,89.3954,41.09104,98.08311,116.75849,145.53145,247.56611,124.52859,86.51413,24.67334,51.96678,134.57,49.13,32.47,18.51,57.858,86.11,45.682,43.795,26.6755,7.45,0.213,52.51,82.786,43.199,57.237,33.41,57.813,67.06561,128.327,65.2,115.069,0.0,,,,d4,2240.28813,222050.0


In [6]:
# create wade in_VariableCV based on use and water source type
# dictionary deteremined using list description here: https://www.dnr.state.mn.us/waters/watermgmt_section/appropriations/wateruse.html

useWsTypeDict = {
"Agricultural IrrigationGroundwater" : "Consumptive Use",
"Agricultural IrrigationSurface Water" : "Consumptive Use",
"Agricultural Irrigation" : "Consumptive Use",
"Heating/CoolingGroundwater" : "Consumptive Use",
"Heating/CoolingSurface Water" : "Withdrawal",
"Industrial ProcessingGroundwater" : "Consumptive Use",
"Industrial ProcessingSurface Water" : "Withdrawal",
"Non-Crop IrrigationGroundwater" : "Consumptive Use",
"Non-Crop IrrigationSurface Water" : "Consumptive Use",
"Power GenerationGroundwater" : "Withdrawal",
"Power GenerationSurface Water" : "Withdrawal",
"Special CategoriesGroundwater" : "Consumptive Use",
"Special CategoriesSurface Water" : "Consumptive Use",
"Water Level MaintenanceGroundwater" : "Consumptive Use",
"Water Level MaintenanceSurface Water" : "Consumptive Use",
"Water SupplyGroundwater" : "Consumptive Use",
"Water SupplySurface Water" : "Consumptive Use"}

def createVariableCVFunc(valA, valB):
    linkKeyVal = str(valA).strip() + str(valB).strip()
    linkKeyVal = linkKeyVal
    outString = useWsTypeDict[linkKeyVal]
    return outString

dfin1['in_VariableCV'] = dfin1.apply(lambda row: createVariableCVFunc(row['use_category'], row['resource_category']), axis=1)
dfin1['in_VariableCV'].unique()

array(['Withdrawal', 'Consumptive Use'], dtype=object)

In [7]:
amountColNameList = [
    "use_1988_mg",
    "use_1989_mg",
    "use_1990_mg",
    "use_1991_mg",
    "use_1992_mg",
    "use_1993_mg",
    "use_1994_mg",
    "use_1995_mg",
    "use_1996_mg",
    "use_1997_mg",
    "use_1998_mg",
    "use_1999_mg",
    "use_2000_mg",
    "use_2001_mg",
    "use_2002_mg",
    "use_2003_mg",
    "use_2004_mg",
    "use_2005_mg",
    "use_2006_mg",
    "use_2007_mg",
    "use_2008_mg",
    "use_2009_mg",
    "use_2010_mg",
    "use_2011_mg",
    "use_2012_mg",
    "use_2013_mg",
    "use_2014_mg",
    "use_2015_mg",
    "use_2016_mg",
    "use_2017_mg",
    "use_2018_mg",
    "use_2019_mg",
    "use_2020_mg",
    "use_2021_mg",
    "use_2022_mg"]
print(amountColNameList)

['use_1988_mg', 'use_1989_mg', 'use_1990_mg', 'use_1991_mg', 'use_1992_mg', 'use_1993_mg', 'use_1994_mg', 'use_1995_mg', 'use_1996_mg', 'use_1997_mg', 'use_1998_mg', 'use_1999_mg', 'use_2000_mg', 'use_2001_mg', 'use_2002_mg', 'use_2003_mg', 'use_2004_mg', 'use_2005_mg', 'use_2006_mg', 'use_2007_mg', 'use_2008_mg', 'use_2009_mg', 'use_2010_mg', 'use_2011_mg', 'use_2012_mg', 'use_2013_mg', 'use_2014_mg', 'use_2015_mg', 'use_2016_mg', 'use_2017_mg', 'use_2018_mg', 'use_2019_mg', 'use_2020_mg', 'use_2021_mg', 'use_2022_mg']


In [8]:
# create output POD dataframe
outdf1 = pd.DataFrame()

# use for loop to pair Year value and Amount value to corresponding column
for item in amountColNameList:
    
    # create temp to store for current loop
    df = pd.DataFrame()

    # Data Assessment UUID
    df['WaDEUUID'] = dfin1['WaDEUUID']

    # Method Info
    df['in_MethodUUID'] = "MNwr_M1"

    # Variable Info
    df['in_AggregationIntervalUnitCV'] = "Annual"
    df['in_VariableCV'] = dfin1['in_VariableCV']
    df['in_VariableSpecificUUID'] = "MNwr_V1"

    # Organization Info
    df['in_OrganizationUUID'] = "MNwr_O1"

    # WaterSource Info
    df['in_Geometry'] = ""
    df['in_GNISFeatureNameCV'] = ""
    df['in_WaterQualityIndicatorCV'] = ""
    df['in_WaterSourceName'] = dfin1['resource_name']
    df['in_WaterSourceNativeID'] = dfin1['in_WaterSourceNativeID'] # will supllment empty Ids below
    df['in_WaterSourceTypeCV'] = dfin1['resource_category']

    # Site Info
    df['in_CoordinateAccuracy'] = ""
    df['in_CoordinateMethodCV'] = ""
    df['in_County'] = dfin1['county_name']
    df['in_EPSGCodeCV'] = 4326
    df['in_Geometry'] = ""
    df['in_GNISCodeCV'] = ""
    df['in_HUC12'] = ""
    df['in_HUC8'] = ""
    df['in_Latitude'] = dfin1['latitude']
    df['in_Longitude'] = dfin1['longitude']
    df['in_NHDNetworkStatusCV'] = ""
    df['in_NHDProductCV'] = ""
    df['in_PODorPOUSite'] = "POD"  # "Point of Diversion"
    df['in_SiteName'] = dfin1['installation_name']
    df['in_SiteNativeID'] = "" # not available, will create custom Id below
    df['in_SitePoint'] = ""
    df['in_SiteTypeCV'] = ""
    df['in_StateCV'] = "MN"
    df['in_USGSSiteID'] = ""

    # AllocationAmount Info
    df['in_AllocationApplicationDate'] = ""
    df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
    df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
    df['in_AllocationBasisCV'] = ""
    df['in_AllocationChangeApplicationIndicator'] = ""
    df['in_AllocationCommunityWaterSupplySystem'] = ""
    df['in_AllocationCropDutyAmount'] = ""
    df['in_AllocationExpirationDate'] = dfin1['permit_expiration_date']
    df['in_AllocationFlow_CFS'] = ""
    df['in_AllocationLegalStatusCV'] = dfin1['permit_status']
    df['in_AllocationNativeID'] =  dfin1['permit_number']
    df['in_AllocationOwner'] = dfin1['landowner']
    df['in_AllocationPriorityDate'] = ""
    df['in_AllocationSDWISIdentifierCV'] = ""
    df['in_AllocationTimeframeEnd'] = ""
    df['in_AllocationTimeframeStart'] = ""
    df['in_AllocationTypeCV'] = dfin1['permit_class']
    df['in_AllocationVolume_AF'] = dfin1['in_AllocationVolume_AF']
    df['in_BeneficialUseCategory'] = dfin1['use_category']
    df['in_CommunityWaterSupplySystem'] = ""
    df['in_CropTypeCV'] = ""
    df['in_CustomerTypeCV'] = ""
    df['in_DataPublicationDate'] = ""
    df['in_DataPublicationDOI'] = ""
    df['in_ExemptOfVolumeFlowPriority'] = 1 # either a 1 for excempt or 0 for not-excempt
    df['in_GeneratedPowerCapacityMW'] = ""
    df['in_IrrigatedAcreage'] = dfin1['permit_total_acres']
    df['in_IrrigationMethodCV'] = ""
    df['in_LegacyAllocationIDs'] = ""
    df['in_OwnerClassificationCV'] = ""
    df['in_PopulationServed'] = ""
    df['in_PowerType'] = ""
    df['in_PrimaryBeneficialUseCategory'] = ""
    df['in_SDWISIdentifierCV'] = ""
    df['in_WaterAllocationNativeURL'] = ""
        
    # Site VariableAmounts Info
    # df['in_Amount'] = dfin1[item]
    df['in_Amount'] = dfin1.apply(lambda row: ConvertMGToAFFunc(row[item]), axis=1) # see above
    df['in_AssociatedNativeAllocationIDs'] = dfin1['permit_expiration_date']
    df['in_PowerGeneratedGWh'] = ""
    df['in_PrimaryUseCategory'] = ""
    yearValue = item.replace("use_", "").replace("_mg", "")
    df['in_ReportYearCV'] = yearValue
    df['in_SDWISIdentifier'] = ""
    df['in_TimeframeEnd'] = "12/31/" + yearValue
    df['in_TimeframeStart'] = "01/01/" + yearValue
    # df['in_AllocationCropDutyAmount'] = "" see above AllocationAmount Info
    # df['in_BeneficialUseCategory'] = "" see above AllocationAmount Info
    # df['in_CommunityWaterSupplySystem'] = "" see above AllocationAmount Info
    # df['in_CropTypeCV'] = "" see above AllocationAmount Info
    # df['in_CustomerTypeCV'] = "" see above AllocationAmount Info
    # df['in_DataPublicationDate'] = "" see above AllocationAmount Info
    # df['in_DataPublicationDOI'] = "" see above AllocationAmount Info
    # df['in_Geometry'] = "" see above Site Info
    # df['in_IrrigatedAcreage'] = "" see above AllocationAmount Info
    # df['in_IrrigationMethodCV'] = "" see above AllocationAmount Info
    # df['in_PopulationServed'] = "" see above AllocationAmount Info
    # df['in_PowerType'] = "" see above AllocationAmount Info
    # df['in_SDWISIdentifier'] = "" see above AllocationAmount Info
   
    outdf1 = pd.concat([outdf1, df])


outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

1078630


Unnamed: 0,WaDEUUID,in_MethodUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL,in_Amount,in_AssociatedNativeAllocationIDs,in_PowerGeneratedGWh,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart
0,d0,MNwr_M1,Annual,Withdrawal,MNwr_V1,MNwr_O1,,,,RED RIVER OF THE NORTH,,Surface Water,,,Clay,4326,,,,46.89055,-96.77104,,,POD,1,,,,MN,,,,,,,,,,,Inactive,1945-0008,American Crystal Sugar Company,,,,,Individual Permit,5431.01083,Industrial Processing,,,,,,1,,,,,,,,,,,0.0,,,,1988,,12/31/1988,01/01/1988
1,d1,MNwr_M1,Annual,Withdrawal,MNwr_V1,MNwr_O1,,,,Superior,16000100.0,Surface Water,,,Lake,4326,,,,47.27682,-91.26885,,,POD,1,,,,MN,,,,,,,,,,,Active,1947-0012,Cleveland Cliffs - Northshore Mining Co,,,,,Individual Permit,153444.3925,Industrial Processing,,,,,,1,,,,,,,,,,,0.0,,,,1988,,12/31/1988,01/01/1988
2,d2,MNwr_M1,Annual,Consumptive Use,MNwr_V1,MNwr_O1,,,,QBAA,222051.0,Groundwater,,,Clay,4326,,,,46.87777,-96.63539,,,POD,Well #10 - 222051,,,,MN,,,,,,,,,,,Active,1947-0014,Moorhead Public Service,,,,,Individual Permit,2240.28813,Water Supply,,,,,,1,,,,,,,,,,,,,,,1988,,12/31/1988,01/01/1988
3,d3,MNwr_M1,Annual,Consumptive Use,MNwr_V1,MNwr_O1,,,,QBAA,222049.0,Groundwater,,,Clay,4326,,,,46.87777,-96.63539,,,POD,Well #8 - 222049,,,,MN,,,,,,,,,,,Active,1947-0014,Moorhead Public Service,,,,,Individual Permit,2240.28813,Water Supply,,,,,,1,,,,,,,,,,,1877.02388,,,,1988,,12/31/1988,01/01/1988
4,d4,MNwr_M1,Annual,Consumptive Use,MNwr_V1,MNwr_O1,,,,QBUA,222050.0,Groundwater,,,Clay,4326,,,,46.87777,-96.63539,,,POD,Well #9 - 222050,,,,MN,,,,,,,,,,,Active,1947-0014,Moorhead Public Service,,,,,Individual Permit,2240.28813,Water Supply,,,,,,1,,,,,,,,,,,,,,,1988,,12/31/1988,01/01/1988


## Concatenate POD and POU Data.  Make needed changes

In [9]:
# etc etc,
# outdf2

## Concatenate DataFrames together

In [10]:
# Concatenate dataframes
frames = [outdf1]  # list all out dataframes here
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

1078630


## Clean Data / data types

In [11]:
# Clean name entries of spcial characters
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [12]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['Red River Of The North', 'Superior', 'Qbaa', ..., 'South Lida',
       'St Louis River Estuary St Louis Bay', 'East Sunburg'],
      dtype=object)

In [13]:
outdf['in_County'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['Clay', 'Lake', 'Otter Tail', 'St Louis', 'Blue Earth',
       'Sherburne', 'Martin', 'Scott', 'Cottonwood', 'Isanti', 'Itasca',
       'Polk', 'Anoka', 'Hennepin', 'Mcleod', 'Nobles', 'Cook', 'Grant',
       'Dakota', 'Marshall', 'Crow Wing', 'Jackson', 'Redwood',
       'Mille Lacs', 'Freeborn', 'Rock', 'Lincoln', 'Big Stone', 'Ramsey',
       'Mower', 'Carver', 'Nicollet', 'Becker', 'Kandiyohi', 'Wadena',
       'Norman', 'Le Sueur', 'Pope', 'Hubbard', 'Meeker', 'Beltrami',
       'Olmsted', 'Traverse', 'Washington', 'Stearns', 'Lac Qui Parle',
       'Benton', 'Clearwater', 'Faribault', 'Todd', 'Swift', 'Rice',
       'Aitkin', 'Renville', 'Goodhue', 'Wright', 'Fillmore', 'Red Lake',
       'Douglas', 'Sibley', 'Cass', 'Stevens', 'Pipestone', 'Winona',
       'Morrison', 'Houston', 'Chisago', 'Kittson', 'Lyon', 'Carlton',
       'Roseau', 'Watonwan', 'Dodge', 'Lake Of The Woods', 'Brown',
       'Kanabec', 'Yellow Medicine', 'Pine', 'Chippewa', 'Koochiching',
       'Wabasha

In [14]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['1', 'Well #10 222051', 'Well #8 222049', ...,
       '23034 Unnamed Pond', '23025 Elk Lake', '23027 Sunburg Lake'],
      dtype=object)

In [15]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['American Crystal Sugar Company',
       'Cleveland Cliffs Northshore Mining Co', 'Moorhead Public Service',
       ..., 'Ep Gts Housing Phase I Llc', 'Leatherman, Eric',
       'Erichson, Mark'], dtype=object)

In [16]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [17]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['Red River Of The North', 'Superior', 'Qbaa', ..., 'South Lida',
       'St Louis River Estuary St Louis Bay', 'East Sunburg'],
      dtype=object)

In [18]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Groundwater', ''], dtype=object)

In [19]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array([''], dtype=object)

In [20]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['1', 'Well #10 222051', 'Well #8 222049', ...,
       '23034 Unnamed Pond', '23025 Elk Lake', '23027 Sunburg Lake'],
      dtype=object)

In [21]:
outdf['in_County'] = outdf.apply(lambda row: ensureEmptyString(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['Clay', 'Lake', 'Otter Tail', 'St Louis', 'Blue Earth',
       'Sherburne', 'Martin', 'Scott', 'Cottonwood', 'Isanti', 'Itasca',
       'Polk', 'Anoka', 'Hennepin', 'Mcleod', 'Nobles', 'Cook', 'Grant',
       'Dakota', 'Marshall', 'Crow Wing', 'Jackson', 'Redwood',
       'Mille Lacs', 'Freeborn', 'Rock', 'Lincoln', 'Big Stone', 'Ramsey',
       'Mower', 'Carver', 'Nicollet', 'Becker', 'Kandiyohi', 'Wadena',
       'Norman', 'Le Sueur', 'Pope', 'Hubbard', 'Meeker', 'Beltrami',
       'Olmsted', 'Traverse', 'Washington', 'Stearns', 'Lac Qui Parle',
       'Benton', 'Clearwater', 'Faribault', 'Todd', 'Swift', 'Rice',
       'Aitkin', 'Renville', 'Goodhue', 'Wright', 'Fillmore', 'Red Lake',
       'Douglas', 'Sibley', 'Cass', 'Stevens', 'Pipestone', 'Winona',
       'Morrison', 'Houston', 'Chisago', 'Kittson', 'Lyon', 'Carlton',
       'Roseau', 'Watonwan', 'Dodge', 'Lake Of The Woods', 'Brown',
       'Kanabec', 'Yellow Medicine', 'Pine', 'Chippewa', 'Koochiching',
       'Wabasha

In [22]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['American Crystal Sugar Company',
       'Cleveland Cliffs Northshore Mining Co', 'Moorhead Public Service',
       ..., 'Ep Gts Housing Phase I Llc', 'Leatherman, Eric',
       'Erichson, Mark'], dtype=object)

In [23]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
uniqueList = list(set([i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')]))
uniqueList.sort()
uniqueList

['Agricultural Irrigation',
 'Heating/Cooling',
 'Industrial Processing',
 'Non-Crop Irrigation',
 'Power Generation',
 'Special Categories',
 'Water Level Maintenance',
 'Water Supply']

In [24]:
# Ensure Latitude entry is numireic, replace '0' values for removal
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

array([46.8905514 , 47.27681851, 46.87777341, ..., 45.92172337,
       45.06732735, 45.32635259])

In [25]:
# Ensure Longitude entry is numireic, replace '0' values for removal
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

array([-96.7710413 , -91.26884763, -96.63538667, ..., -95.53038881,
       -95.71501657, -95.23501413])

In [26]:
# Fixing in_AllocationFlow_CFS datatype
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

array([''], dtype=object)

In [27]:
# Fixing in_AllocationVolume_AF datatype
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationVolume_AF'].unique()

array([5431.010828145, 153444.3925, 2240.2881305, ..., 11293.507287999999,
       961.1756746199999, 1632.6483362], dtype=object)

In [28]:
# Ensure Amount entry is either numireic or blank, no 0 entries
outdf['in_Amount'] = pd.to_numeric(outdf['in_Amount'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_Amount'].unique()

array(['', 1877.02, 5.66, ..., 339.02, 408.96, 611.25], dtype=object)

In [29]:
# Ensure PopulationServed entry is numireic WITH 0 entries (no blank strings)
outdf['in_PopulationServed'] = pd.to_numeric(outdf['in_PopulationServed'], errors='coerce').round().replace("",0).fillna(0).astype(int).replace(0,"").fillna("")
outdf['in_PopulationServed'].unique()

array([''], dtype=object)

In [30]:
#Update datatype of Priority Date to fit WaDE 2.0 structure
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'])
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'].dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

<DatetimeArray>
['NaT']
Length: 1, dtype: datetime64[ns]

In [31]:
# Convert TimeframeEnd to YYYY-MM-DD format.
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeEnd'].unique()

<DatetimeArray>
['1988-12-31 00:00:00', '1989-12-31 00:00:00', '1990-12-31 00:00:00',
 '1991-12-31 00:00:00', '1992-12-31 00:00:00', '1993-12-31 00:00:00',
 '1994-12-31 00:00:00', '1995-12-31 00:00:00', '1996-12-31 00:00:00',
 '1997-12-31 00:00:00', '1998-12-31 00:00:00', '1999-12-31 00:00:00',
 '2000-12-31 00:00:00', '2001-12-31 00:00:00', '2002-12-31 00:00:00',
 '2003-12-31 00:00:00', '2004-12-31 00:00:00', '2005-12-31 00:00:00',
 '2006-12-31 00:00:00', '2007-12-31 00:00:00', '2008-12-31 00:00:00',
 '2009-12-31 00:00:00', '2010-12-31 00:00:00', '2011-12-31 00:00:00',
 '2012-12-31 00:00:00', '2013-12-31 00:00:00', '2014-12-31 00:00:00',
 '2015-12-31 00:00:00', '2016-12-31 00:00:00', '2017-12-31 00:00:00',
 '2018-12-31 00:00:00', '2019-12-31 00:00:00', '2020-12-31 00:00:00',
 '2021-12-31 00:00:00', '2022-12-31 00:00:00']
Length: 35, dtype: datetime64[ns]

In [32]:
# Convert TimeframeStart to YYYY-MM-DD format.
outdf['in_TimeframeStart'] = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeStart'] = pd.to_datetime(outdf["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeStart'].unique()

<DatetimeArray>
['1988-01-01 00:00:00', '1989-01-01 00:00:00', '1990-01-01 00:00:00',
 '1991-01-01 00:00:00', '1992-01-01 00:00:00', '1993-01-01 00:00:00',
 '1994-01-01 00:00:00', '1995-01-01 00:00:00', '1996-01-01 00:00:00',
 '1997-01-01 00:00:00', '1998-01-01 00:00:00', '1999-01-01 00:00:00',
 '2000-01-01 00:00:00', '2001-01-01 00:00:00', '2002-01-01 00:00:00',
 '2003-01-01 00:00:00', '2004-01-01 00:00:00', '2005-01-01 00:00:00',
 '2006-01-01 00:00:00', '2007-01-01 00:00:00', '2008-01-01 00:00:00',
 '2009-01-01 00:00:00', '2010-01-01 00:00:00', '2011-01-01 00:00:00',
 '2012-01-01 00:00:00', '2013-01-01 00:00:00', '2014-01-01 00:00:00',
 '2015-01-01 00:00:00', '2016-01-01 00:00:00', '2017-01-01 00:00:00',
 '2018-01-01 00:00:00', '2019-01-01 00:00:00', '2020-01-01 00:00:00',
 '2021-01-01 00:00:00', '2022-01-01 00:00:00']
Length: 35, dtype: datetime64[ns]

In [33]:
# extract year out
outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].replace("", 0).fillna(0).astype(int).astype(str)
outdf['in_ReportYearCV'].unique()

array(['1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003',
       '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011',
       '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019',
       '2020', '2021', '2022'], dtype=object)

In [34]:
# Assign Primary Use Category

import sys
sys.path.append("C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory")
import AssignPrimaryUseCategoryFile # Use Custom import file

outdf['in_PrimaryUseCategory'] = outdf.apply(lambda row: AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory(row['in_BeneficialUseCategory']), axis=1)
outdf['in_PrimaryUseCategory'].unique()

array(['Commercial/Industrial', 'Public Supply', 'Agriculture Irrigation',
       'In-stream Flow', 'Hydroelectric', 'Municipal Irrigation',
       'Thermoelectric Cooling', 'Other'], dtype=object)

In [35]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    inV = str(inV).strip()
    inAIU = str(inAIU).strip()
    inPU = str(inPU).strip().title()
    inWST = str(inWST).strip()
    outString = inV + "_" + inAIU + "_" + inPU + "_" + inWST
    return outString

outdf['in_VariableSpecificCV'] = outdf.apply(lambda row: createVariableSpecificCV(row['in_VariableCV'], 
                                                                                  row['in_AggregationIntervalUnitCV'],
                                                                                  row['in_PrimaryUseCategory'],
                                                                                  row['in_WaterSourceTypeCV']), axis=1)
outdf['in_VariableSpecificCV'].unique()

array(['Withdrawal_Annual_Commercial/Industrial_Surface Water',
       'Consumptive Use_Annual_Public Supply_Groundwater',
       'Consumptive Use_Annual_Agriculture Irrigation_Surface Water',
       'Consumptive Use_Annual_In-Stream Flow_Groundwater',
       'Consumptive Use_Annual_In-Stream Flow_Surface Water',
       'Withdrawal_Annual_Hydroelectric_Surface Water',
       'Consumptive Use_Annual_Agriculture Irrigation_Groundwater',
       'Consumptive Use_Annual_Municipal Irrigation_Surface Water',
       'Consumptive Use_Annual_Commercial/Industrial_Groundwater',
       'Consumptive Use_Annual_Public Supply_Surface Water',
       'Withdrawal_Annual_Hydroelectric_Groundwater',
       'Consumptive Use_Annual_Municipal Irrigation_Groundwater',
       'Consumptive Use_Annual_Thermoelectric Cooling_Groundwater',
       'Consumptive Use_Annual_Other_Groundwater',
       'Consumptive Use_Annual_Other_Surface Water',
       'Consumptive Use_Annual_Agriculture Irrigation_',
       'Withdraw

In [36]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_WaterSourceNativeID'], 
                                                                              row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wadeId1', '16000100', '222051', ..., '21014200', '12006700',
       '34033600'], dtype=object)

In [37]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_Latitude'].astype(str) + dfTempID['in_Longitude'].astype(str) + dfTempID['in_SiteName'].astype(str)+ dfTempID['in_SiteTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_SiteNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip() + str(valC).strip() + str(valD).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_SiteNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_SiteNativeID'], 
                                                                       row['in_Latitude'], row['in_Longitude'],
                                                                       row['in_SiteName'], row['in_SiteTypeCV']), axis=1)
outdf['in_SiteNativeID'].unique()

array(['wadeId1', 'wadeId2', 'wadeId3', ..., 'wadeId29225', 'wadeId29226',
       'wadeId29227'], dtype=object)

## Drop non-Active AllocationLegalStatusCV Water Rights
- For this {state name / organization}, we don't want water rights that are considered: "Inactive"

In [38]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# drop the list
dropLegalStatusList = ["Inactive"] # enter string entries here

# drop rows from above list
outdf = outdf[outdf.in_AllocationLegalStatusCV.isin(dropLegalStatusList) == False].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

620655


array(['Active'], dtype=object)

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [39]:
# # PoU Shapefile Data
# shapefileInput = "RawInputData/shapefiles/{enter file name here}.zip" # ziped folder of the shp file

# dfPoUshapetemp = gpd.read_file(shapefileInput)
# dfPoUshapetemp['geometry'] = dfPoUshapetemp['geometry'].to_crs(epsg=4326) # Realign Geometry Projection
# print(len(dfPoUshapetemp))
# dfPoUshapetemp.head()

In [40]:
# # create temp dataframe to hold native ID and geometry from shapefile input
# columnsList = ['in_SiteNativeID', 'geometry']
# dfPoUshape = pd.DataFrame(columns=columnsList)

# # assing values to temp dataframe based on shapefile input
# # for in_SiteNativeID assure ID value is the same as that listed above for POU info.
# dfPoUshape['in_SiteNativeID'] = "POU" + ""
# dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
# dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
# print(len(dfPoUshape))
# dfPoUshape.head()

## Export Outputs

In [41]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 620655 entries, 0 to 620654
Data columns (total 74 columns):
 #   Column                                        Non-Null Count   Dtype         
---  ------                                        --------------   -----         
 0   WaDEUUID                                      620655 non-null  object        
 1   in_MethodUUID                                 620655 non-null  object        
 2   in_AggregationIntervalUnitCV                  620655 non-null  object        
 3   in_VariableCV                                 620655 non-null  object        
 4   in_VariableSpecificUUID                       620655 non-null  object        
 5   in_OrganizationUUID                           620655 non-null  object        
 6   in_Geometry                                   620655 non-null  object        
 7   in_GNISFeatureNameCV                          620655 non-null  object        
 8   in_WaterQualityIndicatorCV                    620655 n

In [42]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL,in_Amount,in_AssociatedNativeAllocationIDs,in_PowerGeneratedGWh,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart,in_VariableSpecificCV
0,d1,MNwr_M1,Annual,Withdrawal,MNwr_V1,MNwr_O1,,,,Superior,16000100,Surface Water,,,Lake,4326,,,,47.27682,-91.26885,,,POD,1,wadeId2,,,MN,,,,,,,,,,,Active,1947-0012,Cleveland Cliffs Northshore Mining Co,NaT,,,,Individual Permit,153444.39250,Industrial Processing,,,,,,1,,,,,,,,,,,,,,Commercial/Industrial,1988,,1988-12-31,1988-01-01,Withdrawal_Annual_Commercial/Industrial_Surfac...
1,d2,MNwr_M1,Annual,Consumptive Use,MNwr_V1,MNwr_O1,,,,Qbaa,222051,Groundwater,,,Clay,4326,,,,46.87777,-96.63539,,,POD,Well #10 222051,wadeId3,,,MN,,,,,,,,,,,Active,1947-0014,Moorhead Public Service,NaT,,,,Individual Permit,2240.28813,Water Supply,,,,,,1,,,,,,,,,,,,,,Public Supply,1988,,1988-12-31,1988-01-01,Consumptive Use_Annual_Public Supply_Groundwater
2,d3,MNwr_M1,Annual,Consumptive Use,MNwr_V1,MNwr_O1,,,,Qbaa,222049,Groundwater,,,Clay,4326,,,,46.87777,-96.63539,,,POD,Well #8 222049,wadeId4,,,MN,,,,,,,,,,,Active,1947-0014,Moorhead Public Service,NaT,,,,Individual Permit,2240.28813,Water Supply,,,,,,1,,,,,,,,,,,1877.02000,,,Public Supply,1988,,1988-12-31,1988-01-01,Consumptive Use_Annual_Public Supply_Groundwater
3,d4,MNwr_M1,Annual,Consumptive Use,MNwr_V1,MNwr_O1,,,,Qbua,222050,Groundwater,,,Clay,4326,,,,46.87777,-96.63539,,,POD,Well #9 222050,wadeId5,,,MN,,,,,,,,,,,Active,1947-0014,Moorhead Public Service,NaT,,,,Individual Permit,2240.28813,Water Supply,,,,,,1,,,,,,,,,,,,,,Public Supply,1988,,1988-12-31,1988-01-01,Consumptive Use_Annual_Public Supply_Groundwater
4,d6,MNwr_M1,Annual,Withdrawal,MNwr_V1,MNwr_O1,,,,Colby,69024900,Surface Water,,,St Louis,4326,,,,47.53334,-92.15899,,,POD,1,wadeId7,,,MN,,,,,,,,,,,Active,1949-0135,"Cliffs Erie Llc Minnesota Power, A Division Of...",NaT,,,,Individual Permit,13832.09132,Industrial Processing,,,,,,1,,,,,,,,,,,17284.57000,,,Commercial/Industrial,1988,,1988-12-31,1988-01-01,Withdrawal_Annual_Commercial/Industrial_Surfac...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620650,d30813,MNwr_M1,Annual,Consumptive Use,MNwr_V1,MNwr_O1,,,,,wadeId14,Groundwater,,,Chisago,4326,,,,45.32641,-92.99987,,,POD,Installation #1,wadeId29223,,,MN,,,,,,,,,10/31/2023,,Active,2023-2878,"Erichson, Mark",NaT,,,,General Permit Authorization,9.20666,Water Level Maintenance,,,,,,1,,,,,,,,,,,,10/31/2023,,In-stream Flow,2022,,2022-12-31,2022-01-01,Consumptive Use_Annual_In-Stream Flow_Groundwater
620651,d30814,MNwr_M1,Annual,Consumptive Use,MNwr_V1,MNwr_O1,,,,,wadeId51,Surface Water,,,Otter Tail,4326,,,,46.55716,-95.96962,,,POD,23034 Unnamed Pond,wadeId29224,,,MN,,,,,,,,,12/31/2023,,Active,2023-2892,"Central Specialties, Inc",NaT,,,,General Permit Authorization,6.13778,Special Categories,,,,,,1,,,,,,,,,,,,12/31/2023,,Other,2022,,2022-12-31,2022-01-01,Consumptive Use_Annual_Other_Surface Water
620652,d30815,MNwr_M1,Annual,Consumptive Use,MNwr_V1,MNwr_O1,,,,Elk,21014200,Surface Water,,,Douglas,4326,,,,45.92172,-95.53039,,,POD,23025 Elk Lake,wadeId29225,,,MN,,,,,,,,,12/31/2023,,Active,2023-2894,"Central Specialties, Inc",NaT,,,,General Permit Authorization,153.44439,Special Categories,,,,,,1,,,,,,,,,,,,12/31/2023,,Other,2022,,2022-12-31,2022-01-01,Consumptive Use_Annual_Other_Surface Water
620653,d30816,MNwr_M1,Annual,Consumptive Use,MNwr_V1,MNwr_O1,,,,Unnamed,12006700,Surface Water,,,Chippewa,4326,,,,45.06733,-95.71502,,,POD,Installation #1,wadeId29226,,,MN,,,,,,,,,12/31/2023,,Active,2023-2913,Chippewa County Highway Dept,NaT,,,,General Permit Authorization,5.83089,Special Categories,,,,,,1,,,,,,,,,,,,12/31/2023,,Other,2022,,2022-12-31,2022-01-01,Consumptive Use_Annual_Other_Surface Water


In [43]:
# Export the output dataframe
outdf.to_csv('RawInputData/Pwr_wu_Main.zip', compression=dict(method='zip', archive_name='Pwr_wu_Main.csv'), index=False)  # The output, save as a zip
#dfPoUshape.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.