# Pre-processing Water Right and Time Series Water Use data for WaDE Upload
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [1]:
import os
import sys
print(os.environ['CONDA_DEFAULT_ENV'])
print(sys.version)

base
3.12.3 | packaged by conda-forge | (main, Apr 15 2024, 18:20:11) [MSC v.1938 64 bit (AMD64)]


In [2]:
# Needed Libraries / Modules

# ---- working with data ----
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [3]:
# ---- working directory ----
workingDirString = "G:/Shared drives/WaDE Data/WaDE Data Folder/NorthDakota/WaterAllocation_WaterUse" # set working directory folder string here
os.chdir(workingDirString)
print(f'The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/WaDE Data Folder/NorthDakota/WaterAllocation_WaterUse


## Data Input 1 - Permits.shp file
- Permits water right data, in a shp file

In [4]:
# Input File - asdf
shapefileInput = "RawInputData/water_right/Permits.zip"
dfin1 = gpd.read_file(shapefileInput).replace(np.nan, "")

dfin1['permit_ind'] = dfin1['permit_ind'].replace("", 0).astype('Int64').astype('str')
dfin1['pod_index'] = dfin1['pod_index'].replace("", 0).astype('Int64').astype('str')
dfin1['linkKey'] = dfin1['permit_ind'] + "_" + dfin1['pod_index']

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin1:
    dfin1['WaDEUUID'] = "in1" + dfin1.index.astype(str)
    dfin1.to_csv("RawInputData/Permits.zip", compression=dict(method='zip', archive_name="Permits.csv"), index=False)

print(len(dfin1))
dfin1.head(1)

12889


Unnamed: 0,permit_ind,permit_num,pod_index,pod,permit_hol,priority_d,use_type,status,date_issue,date_cance,beneficial,county,hu_sub_bas,aquifer,subaquifer,req_acft,req_acre,req_rate,req_storag,app_acft,app_acre,app_rate,app_storag,pod_status,source,irrigation,source_nam,mainstem,impound_lo,impound_na,return_des,discharge_,period_sta,period_end,return_qua,held_acft,held_acre,held_rate,held_stora,longitude,latitude,geometry,linkKey,WaDEUUID
0,1,4407,1,13007302B,"KETTERLING, ROLAND & LORRAINE",1991-03-04,Irrigation,Denied,,,1111-11-11,McIntosh,Beaver,,,204.0,135.2,1000.0,0.0,0.0,0.0,0.0,0.0,Denied,Ground Water,Sprinkler,,0,,,,,,,0.0,0.0,0.0,0.0,0.0,-99.78988,46.1113,"POLYGON ((-99.79502 46.11495, -99.79374 46.114...",1_1,in10


In [5]:
# issue of similar 'pod' values not sharing same lat and long inputs.
# dissolve geometry into single geometry value by 'pod' value
# re-caculate lat and long of centroid
dfin1_s = dfin1[['pod', 'geometry']].sort_values(by=['pod'])
dfin1_s = dfin1_s.dissolve(by= 'pod', as_index=False)
dfin1_s['wade_lon'] = dfin1_s.centroid.x  
dfin1_s['wade_lat'] = dfin1_s.centroid.y
print(len(dfin1_s))
dfin1_s.head(1)

10746



  dfin1_s['wade_lon'] = dfin1_s.centroid.x

  dfin1_s['wade_lat'] = dfin1_s.centroid.y


Unnamed: 0,pod,geometry,wade_lon,wade_lat
0,02305925DB,"POLYGON ((-104.03320 47.75433, -104.03826 47.7...",-104.03573,47.75794


In [6]:
#left merge new geometry back into sites data

dfin1 = dfin1.merge(dfin1_s, left_on='pod', right_on='pod', how='left').replace(np.nan, "")
print(len(dfin1))
dfin1.head(1)

12889


Unnamed: 0,permit_ind,permit_num,pod_index,pod,permit_hol,priority_d,use_type,status,date_issue,date_cance,beneficial,county,hu_sub_bas,aquifer,subaquifer,req_acft,req_acre,req_rate,req_storag,app_acft,app_acre,app_rate,app_storag,pod_status,source,irrigation,source_nam,mainstem,impound_lo,impound_na,return_des,discharge_,period_sta,period_end,return_qua,held_acft,held_acre,held_rate,held_stora,longitude,latitude,geometry_x,linkKey,WaDEUUID,geometry_y,wade_lon,wade_lat
0,1,4407,1,13007302B,"KETTERLING, ROLAND & LORRAINE",1991-03-04,Irrigation,Denied,,,1111-11-11,McIntosh,Beaver,,,204.0,135.2,1000.0,0.0,0.0,0.0,0.0,0.0,Denied,Ground Water,Sprinkler,,0,,,,,,,0.0,0.0,0.0,0.0,0.0,-99.78988,46.1113,"POLYGON ((-99.79502 46.11495, -99.79374 46.114...",1_1,in10,"POLYGON ((-99.79502 46.11495, -99.79374 46.114...",-99.78987,46.1113


In [7]:
# swapping order owner name
import re

def createOwnerName(val):
    if val == "" or pd.isnull(val):
        outString = ""
    else:
        val = str(val)
        val = val.strip()
        if "," in val:
            x = val.split(",")
            outString = str(x[0]).strip() + " " + str(x[1]).strip()
        else:
            outString = val     
    return outString
dfin1['in_AllocationOwner'] = dfin1.apply(lambda row: createOwnerName(row['permit_hol']), axis=1)

def cleanOwnerDataFunc(Val):
    Val = re.sub("[$@&.;,/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

dfin1['in_AllocationOwner'] = dfin1.apply(lambda row: cleanOwnerDataFunc(row['in_AllocationOwner']), axis=1)
dfin1['in_AllocationOwner'].unique()

array(['Ketterling Roland Lorraine', 'Hyde George H', 'Slater A L', ...,
       'Sneva Craig Mary And Smith', 'Schmit Layne', 'Treeby Hollis'],
      dtype=object)

In [8]:
dfin1['in_WaterAllocationNativeURL'] = "https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=" + dfin1['permit_ind'].astype(str)
dfin1['in_WaterAllocationNativeURL'].unique()

array(['https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=1',
       'https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=2',
       'https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=3',
       ...,
       'https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=45239',
       'https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=45240',
       'https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=45241'],
      dtype=object)

## Data Input 2 - Time Series Use Data
- Water_Use timeseries data
- POD site data
- Permit header data (to bridge timeseries and pod site)

In [9]:
# Input File - Timeseries water use data
fileInput = "RawInputData/water_use/Water_Use.zip"
dfin2 = pd.read_csv(fileInput).replace(np.nan, "")

dfin2['Permit_Index'] = dfin2['Permit_Index'].replace("", 0).astype('Int64').astype('str')
dfin2['POD_Index'] = dfin2['POD_Index'].replace("", 0).astype('Int64').astype('str')
dfin2['Use_Year'] = dfin2['Use_Year'].replace("", 0).astype('Int64').astype('str')
dfin2['linkKey'] = dfin2['Permit_Index'] + "_" + dfin2['POD_Index']

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin2:
    dfin2['WaDEUUID'] = "in2" + dfin2.index.astype(str)
    dfin2.to_csv('RawInputData/Water_Use.zip', compression=dict(method='zip', archive_name='Water_Use.csv'), index=False)

print(len(dfin2))
dfin2.head(1)

  dfin2 = pd.read_csv(fileInput).replace(np.nan, "")
  dfin2['Permit_Index'] = dfin2['Permit_Index'].replace("", 0).astype('Int64').astype('str')
  dfin2['POD_Index'] = dfin2['POD_Index'].replace("", 0).astype('Int64').astype('str')


206166


  dfin2['Use_Year'] = dfin2['Use_Year'].replace("", 0).astype('Int64').astype('str')


Unnamed: 0,Permit_Index,POD_Index,Use_Year,Nature_Of_Data,Reported_AcFt,Reported_Acres,Reported_Rate,KWHrs,KWH_Demand,Pump_HP,Begin_Meter,End_Meter,Meter_Units,Comments,NonConsumptive_Use,Crop_type1,Crop_Type2,Reported_Inches,Water_Use_Index,Use_Type,WaDEUUID,linkKey
0,344,484,1991,,51.9,0.0,900.0,0.0,0.0,70.0,0.0,0.0,Hours,"16,902,000 gallons of water reported used.",0.0,Corn,,,0.0,Irrigation,ndWU0,344_484


### merge site info with time series info

In [10]:
#left merge sites to water use
dfin1 = dfin1.merge(dfin2, left_on='linkKey', right_on='linkKey', how='left').replace(np.nan, "")
print(len(dfin1))
dfin1.head(1)

202860


Unnamed: 0,permit_ind,permit_num,pod_index,pod,permit_hol,priority_d,use_type,status,date_issue,date_cance,beneficial,county,hu_sub_bas,aquifer,subaquifer,req_acft,req_acre,req_rate,req_storag,app_acft,app_acre,app_rate,app_storag,pod_status,source,irrigation,source_nam,mainstem,impound_lo,impound_na,return_des,discharge_,period_sta,period_end,return_qua,held_acft,held_acre,held_rate,held_stora,longitude,latitude,geometry_x,linkKey,WaDEUUID_x,geometry_y,wade_lon,wade_lat,in_AllocationOwner,in_WaterAllocationNativeURL,Permit_Index,POD_Index,Use_Year,Nature_Of_Data,Reported_AcFt,Reported_Acres,Reported_Rate,KWHrs,KWH_Demand,Pump_HP,Begin_Meter,End_Meter,Meter_Units,Comments,NonConsumptive_Use,Crop_type1,Crop_Type2,Reported_Inches,Water_Use_Index,Use_Type,WaDEUUID_y
0,1,4407,1,13007302B,"KETTERLING, ROLAND & LORRAINE",1991-03-04,Irrigation,Denied,,,1111-11-11,McIntosh,Beaver,,,204.0,135.2,1000.0,0.0,0.0,0.0,0.0,0.0,Denied,Ground Water,Sprinkler,,0,,,,,,,0.0,0.0,0.0,0.0,0.0,-99.78988,46.1113,"POLYGON ((-99.79502 46.11495, -99.79374 46.114...",1_1,in10,"POLYGON ((-99.79502 46.11495, -99.79374 46.114...",-99.78987,46.1113,Ketterling Roland Lorraine,https://www.swc.nd.gov/info_edu/map_data_resou...,,,,,,,,,,,,,,,,,,,,,


In [11]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin1['WaDEUUID_x']

# Method Info
df['in_MethodUUID'] = "NDwrwu_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "NDwrwu_V1" # for wr records portion only, will create sa portion below
df['in_AggregationIntervalUnitCV'] = "Annual"
df['in_VariableCV'] = "Water Right Use"

# Organization Info
df['in_OrganizationUUID'] = "NDwrwu_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = dfin1['source_nam'] # need this for auto fill below
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = dfin1['source'] # need this for auto fill below

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = "Centroid of Area"
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfin1['wade_lat']
df['in_Longitude'] = dfin1['wade_lon']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = ""
df['in_SiteNativeID'] = "nd" + dfin1['pod'].replace("", 0).fillna(0).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = ""
df['in_StateCV'] = "ND"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = dfin1['date_issue']
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = dfin1['date_cance']
df['in_AllocationFlow_CFS'] = dfin1['app_rate'].astype(float)
df['in_AllocationLegalStatusCV'] = dfin1['pod_status'].astype(str)
df['in_AllocationNativeID'] = dfin1['permit_num'].astype(str).str.strip() + "_" + dfin1['pod_index'].astype(str).str.strip() 
df['in_AllocationOwner'] = dfin1['in_AllocationOwner']
df['in_AllocationPriorityDate'] = dfin1['priority_d']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = dfin1['period_end']
df['in_AllocationTimeframeStart'] = dfin1['period_sta']
df['in_AllocationTypeCV'] = ""
df['in_AllocationVolume_AF'] = dfin1['app_acft'].astype(float)
df['in_BeneficialUseCategory'] = dfin1['use_type']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0 # 1 or 0, if we want this data excempt
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = dfin1['app_acre'].astype(float)
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = dfin1['in_WaterAllocationNativeURL']

# Site VariableAmounts Info
df['in_Amount'] = dfin1['Reported_AcFt']
df['in_AssociatedNativeAllocationIDs'] = dfin1['permit_num'].astype(str).str.strip() + "_" + dfin1['pod_index'].astype(str).str.strip() 
df['in_PowerGeneratedGWh'] = ""
df['in_PrimaryUseCategory'] = ""
df['in_ReportYearCV'] = dfin1['Use_Year']
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = dfin1['Use_Year']  + "/12/31"
df['in_TimeframeStart'] = dfin1['Use_Year'] + "/01/01"  
# df['in_AllocationCropDutyAmount'] = "" see above AllocationAmount Info
# df['in_BeneficialUseCategory'] = "" see above AllocationAmount Info
# df['in_CommunityWaterSupplySystem'] = "" see above AllocationAmount Info
# df['in_CropTypeCV'] = "" see above AllocationAmount Info
# df['in_CustomerTypeCV'] = "" see above AllocationAmount Info
# df['in_DataPublicationDate'] = "" see above AllocationAmount Info
# df['in_DataPublicationDOI'] = "" see above AllocationAmount Info
# df['in_Geometry'] = "" see above Site Info
# df['in_IrrigatedAcreage'] = "" see above AllocationAmount Info
# df['in_IrrigationMethodCV'] = "" see above AllocationAmount Info
# df['in_PopulationServed'] = "" see above AllocationAmount Info
# df['in_PowerType'] = "" see above AllocationAmount Info
# df['in_SDWISIdentifier'] = "" see above AllocationAmount Info

outdf1 = df.copy()
outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

202781


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL,in_Amount,in_AssociatedNativeAllocationIDs,in_PowerGeneratedGWh,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart
0,in10,NDwrwu_M1,NDwrwu_V1,Annual,Water Right,NDwrwu_O1,,,,,,Ground Water,,Centroid of Area,,4326,,,,46.1113,-99.78987,,,POD,,nd13007302B,,,ND,,,,,,,,,,0.0,Denied,4407_1,Ketterling Roland Lorraine,1991-03-04,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...,,4407_1,,,,,/12/31,/01/01
1,in11,NDwrwu_M1,NDwrwu_V1,Annual,Water Right,NDwrwu_O1,,,,,,Surface Water,,Centroid of Area,,4326,,,,48.02622,-103.75216,,,POD,,nd15310236CC,,,ND,,,,,,,,,,0.0,Cancelled,1E_2,Hyde George H,1901-08-15,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...,0.0,1E_2,,,1976.0,,1976/12/31,1976/01/01
2,in12,NDwrwu_M1,NDwrwu_V1,Annual,Water Right,NDwrwu_O1,,,,,,Surface Water,,Centroid of Area,,4326,,,,48.03706,-103.7469,,,POD,,nd15310236BA,,,ND,,,,,,,,,,0.0,Cancelled,2B_3,Slater A L,1901-09-02,,,,,0.0,Irrigation,,,,,,0,,0.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...,0.0,2B_3,,,1976.0,,1976/12/31,1976/01/01
3,in13,NDwrwu_M1,NDwrwu_V1,Annual,Water Right,NDwrwu_O1,,,,,,Surface Water,,Centroid of Area,,4326,,,,47.70175,-103.44125,,,POD,,nd14910026AB,,,ND,,1937-04-30,,,,,,,,1615.6,Active,2D_4,Gudmunsen Robert And Lowraine,1906-01-26,,,,,291.0,Irrigation,,,,,,0,,291.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...,291.0,2D_4,,,1977.0,,1977/12/31,1977/01/01
4,in13,NDwrwu_M1,NDwrwu_V1,Annual,Water Right,NDwrwu_O1,,,,,,Surface Water,,Centroid of Area,,4326,,,,47.70175,-103.44125,,,POD,,nd14910026AB,,,ND,,1937-04-30,,,,,,,,1615.6,Active,2D_4,Gudmunsen Robert And Lowraine,1906-01-26,,,,,291.0,Irrigation,,,,,,0,,291.0,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...,291.0,2D_4,,,1978.0,,1978/12/31,1978/01/01


## Concatenate POD and POU Data.  Make needed changes

In [12]:
# etc etc,
# outdf2

## Concatenate DataFrames together

In [13]:
# Concatenate dataframes
frames = [outdf1]  # list all out dataframes here
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

202781


## Clean Data / data types

In [14]:
# updating in_WaterSourceTypeCV to be more machine readable / WaDE specific
# ----------------------------------------------------------------------------------------------------

def createWaterSourceTypeCV(inWST):
    inWST = str(inWST).strip()
    
    if inWST == "":
        outString = "WaDE Blank"
    elif inWST == "Ground Water":
        outString = "Groundwater"
    else:
        outString =  inWST
      
    return outString

outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: createWaterSourceTypeCV(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Groundwater', 'Surface Water', 'WaDE Blank'], dtype=object)

In [15]:
# Clean name entries of spcial characters
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [16]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['', 'Mouse River', 'Charbonneau Creek', 'Beaver Creek',
       'Nelson Creek', 'Little Creek, Paulson Creek', 'Green River',
       'North Fork Cannonball River', 'Little Muddy River',
       'Des Lacs River', 'Unnamed Coulee Dry Fork Coulee',
       'Intermittent Draws, Trib To Little Miss', 'Heart River',
       'Apple Creek', 'Little Missouri River', 'Buford Coulee',
       'Unnamed Tributary', 'Burnt Creek', 'Souris River', 'Cedar Creek',
       'Missouri River', 'Tobacco Garden Creek', 'Sheyenne River',
       'Cherry Creek', 'Cannonball River', 'Knife River', 'Duck Creek',
       'Lake Sakakawea', 'Yellowstone River', 'Knife River Tributary',
       'Pembina River', 'Painted Woods Creek',
       'North Branch Cannonball River', 'Red River', 'Gibb Springs',
       'Sweetwater Lake', 'Mulberry Creek',
       "Big Mary'S Coulee, Trib To Mouse River", 'Rush River',
       'Sheep Creek', 'West Hay Draw Creek',
       'Tributary To West Hay Draw Creek', 'North Fork Grand River',

In [17]:
outdf['in_County'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_County']), axis=1)
outdf['in_County'].unique()

array([''], dtype=object)

In [18]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array([''], dtype=object)

In [19]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['Ketterling Roland Lorraine', 'Hyde George H', 'Slater A L', ...,
       'Sneva Craig Mary And Smith', 'Schmit Layne', 'Treeby Hollis'],
      dtype=object)

In [20]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [21]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['', 'Mouse River', 'Charbonneau Creek', 'Beaver Creek',
       'Nelson Creek', 'Little Creek, Paulson Creek', 'Green River',
       'North Fork Cannonball River', 'Little Muddy River',
       'Des Lacs River', 'Unnamed Coulee Dry Fork Coulee',
       'Intermittent Draws, Trib To Little Miss', 'Heart River',
       'Apple Creek', 'Little Missouri River', 'Buford Coulee',
       'Unnamed Tributary', 'Burnt Creek', 'Souris River', 'Cedar Creek',
       'Missouri River', 'Tobacco Garden Creek', 'Sheyenne River',
       'Cherry Creek', 'Cannonball River', 'Knife River', 'Duck Creek',
       'Lake Sakakawea', 'Yellowstone River', 'Knife River Tributary',
       'Pembina River', 'Painted Woods Creek',
       'North Branch Cannonball River', 'Red River', 'Gibb Springs',
       'Sweetwater Lake', 'Mulberry Creek',
       "Big Mary'S Coulee, Trib To Mouse River", 'Rush River',
       'Sheep Creek', 'West Hay Draw Creek',
       'Tributary To West Hay Draw Creek', 'North Fork Grand River',

In [22]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Groundwater', 'Surface Water', 'WaDE Blank'], dtype=object)

In [23]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array([''], dtype=object)

In [24]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array([''], dtype=object)

In [25]:
outdf['in_County'] = outdf.apply(lambda row: ensureEmptyString(row['in_County']), axis=1)
outdf['in_County'].unique()

array([''], dtype=object)

In [26]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['Ketterling Roland Lorraine', 'Hyde George H', 'Slater A L', ...,
       'Sneva Craig Mary And Smith', 'Schmit Layne', 'Treeby Hollis'],
      dtype=object)

In [27]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
uniqueList = list(set([i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')]))
uniqueList.sort()
uniqueList

['Domestic',
 'Fish and Wildlife',
 'Flood Control',
 'Industrial',
 'Irrigation',
 'Multiple Use',
 'Municipal',
 'Power Generation',
 'Recreation',
 'Rural Water',
 'Stock',
 'Undefined']

In [28]:
# Ensure Latitude entry is numireic, replace '0' values for removal
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

array([46.11129501, 48.02621568, 48.03706116, ..., 48.62286839,
       46.062472  , 46.05526451])

In [29]:
# Ensure Longitude entry is numireic, replace '0' values for removal
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

array([ -99.78987083, -103.75215903, -103.74690401, ..., -103.99897674,
        -97.97146753,  -97.97149018])

In [30]:
# Fixing in_AllocationFlow_CFS datatype
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

array(['', 1615.6, 1350.0, 1840.0, 600.0, 530.0, 2000.0, 864.0, 1650.0,
       500.0, 924.0, 450.0, 366.7, 366.6, 448.0, 1062.5, 1450.0, 827.0,
       448.8, 900.0, 675.0, 134.6, 4488.0, 4398.2, 435.7, 46.0, 46.1,
       2692.8, 709.104, 519.4, 785.4, 171.0, 314.0, 225.0, 977.7, 977.8,
       337.5, 897.6, 269.2, 3666.7, 3666.6, 175.0, 949.5, 673.2, 108000.0,
       198.0, 99.0, 45.0, 700.0, 350.0, 140.0, 2244.0, 99999.9, 3455.7,
       13.9, 347.2, 10.4, 52.1, 224.4, 300.0, 710.0, 150.0, 1900.0,
       4151.4, 2500.0, 330.0, 200.0, 550.0, 280.5, 28.0, 240.1, 86.3,
       86.2, 105.0, 254.0, 718.0, 807.8, 10417.0, 223.3, 223.4, 60.0,
       1750.0, 410.0, 1122.0, 284.0, 17952.0, 506.3, 1346.4, 55000.0,
       403.9, 990.0, 2108.0, 10416.0, 1709.1, 7.3, 10.0, 538.5, 3013.0,
       3012.0, 1305.0, 435.0, 267.0, 267.1, 46.8, 46.9, 602.0, 738.0,
       187.5, 948.0, 800.0, 517.5, 270.0, 149.6, 299.2, 34692.2, 33600.0,
       4128.9, 72.0, 987.3, 250.0, 3000.0, 1000.0, 11.3, 56.1, 2100.0,
 

In [31]:
# Fixing in_AllocationVolume_AF datatype
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationVolume_AF'].unique()

array(['', 291.0, 132.0, ..., 352.5, 162.4, 1692.0], dtype=object)

In [32]:
# Ensure Amount entry is either numireic or blank, no 0 entries
outdf['in_Amount'] = pd.to_numeric(outdf['in_Amount'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_Amount'].unique()

array(['', 291.0, 145.5, ..., 77.92, 124.19, 145.92], dtype=object)

In [33]:
# Ensure PopulationServed entry is numireic WITH 0 entries (no blank strings)
outdf['in_PopulationServed'] = pd.to_numeric(outdf['in_PopulationServed'], errors='coerce').round().replace("",0).fillna(0).astype(int).replace(0,"").fillna("")
outdf['in_PopulationServed'].unique()

array([''], dtype=object)

In [34]:
#Update datatype of Priority Date to fit WaDE 2.0 structure
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'])
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'].dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

<DatetimeArray>
['1991-03-04 00:00:00', '1901-08-15 00:00:00', '1901-09-02 00:00:00',
 '1906-01-26 00:00:00', '1906-02-03 00:00:00', '1902-04-18 00:00:00',
 '1932-03-16 00:00:00', '1937-09-21 00:00:00', '1901-06-18 00:00:00',
 '1932-04-16 00:00:00',
 ...
 '2023-10-23 00:00:00', '2023-11-02 00:00:00', '2023-10-30 00:00:00',
 '2023-11-03 00:00:00', '2023-11-09 00:00:00', '2023-11-21 00:00:00',
 '2023-12-01 00:00:00', '2024-01-04 00:00:00', '2024-01-11 00:00:00',
 '2024-01-16 00:00:00']
Length: 5021, dtype: datetime64[ns]

In [35]:
# Convert TimeframeEnd to YYYY-MM-DD format.
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors = 'coerce').fillna("").replace('NaT', "")
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeEnd'].unique()

  outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors = 'coerce').fillna("").replace('NaT', "")


<DatetimeArray>
[                'NaT', '1976-12-31 00:00:00', '1977-12-31 00:00:00',
 '1978-12-31 00:00:00', '1979-12-31 00:00:00', '1980-12-31 00:00:00',
 '1981-12-31 00:00:00', '1982-12-31 00:00:00', '1983-12-31 00:00:00',
 '1984-12-31 00:00:00', '1985-12-31 00:00:00', '1986-12-31 00:00:00',
 '1987-12-31 00:00:00', '1988-12-31 00:00:00', '1989-12-31 00:00:00',
 '1990-12-31 00:00:00', '1991-12-31 00:00:00', '1992-12-31 00:00:00',
 '1993-12-31 00:00:00', '1994-12-31 00:00:00', '1995-12-31 00:00:00',
 '1996-12-31 00:00:00', '1997-12-31 00:00:00', '1999-12-31 00:00:00',
 '1998-12-31 00:00:00', '2000-12-31 00:00:00', '2001-12-31 00:00:00',
 '2002-12-31 00:00:00', '2003-12-31 00:00:00', '2004-12-31 00:00:00',
 '2005-12-31 00:00:00', '2006-12-31 00:00:00', '2007-12-31 00:00:00',
 '2008-12-31 00:00:00', '2009-12-31 00:00:00', '2011-12-31 00:00:00',
 '2012-12-31 00:00:00', '2010-12-31 00:00:00', '2013-12-31 00:00:00',
 '2014-12-31 00:00:00', '2015-12-31 00:00:00', '2016-12-31 00:00:00',
 '20

In [36]:
# Convert TimeframeStart to YYYY-MM-DD format.
outdf['in_TimeframeStart'] = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors = 'coerce').fillna("").replace('NaT', "")
outdf['in_TimeframeStart'] = pd.to_datetime(outdf["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeStart'].unique()

  outdf['in_TimeframeStart'] = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors = 'coerce').fillna("").replace('NaT', "")


<DatetimeArray>
[                'NaT', '1976-01-01 00:00:00', '1977-01-01 00:00:00',
 '1978-01-01 00:00:00', '1979-01-01 00:00:00', '1980-01-01 00:00:00',
 '1981-01-01 00:00:00', '1982-01-01 00:00:00', '1983-01-01 00:00:00',
 '1984-01-01 00:00:00', '1985-01-01 00:00:00', '1986-01-01 00:00:00',
 '1987-01-01 00:00:00', '1988-01-01 00:00:00', '1989-01-01 00:00:00',
 '1990-01-01 00:00:00', '1991-01-01 00:00:00', '1992-01-01 00:00:00',
 '1993-01-01 00:00:00', '1994-01-01 00:00:00', '1995-01-01 00:00:00',
 '1996-01-01 00:00:00', '1997-01-01 00:00:00', '1999-01-01 00:00:00',
 '1998-01-01 00:00:00', '2000-01-01 00:00:00', '2001-01-01 00:00:00',
 '2002-01-01 00:00:00', '2003-01-01 00:00:00', '2004-01-01 00:00:00',
 '2005-01-01 00:00:00', '2006-01-01 00:00:00', '2007-01-01 00:00:00',
 '2008-01-01 00:00:00', '2009-01-01 00:00:00', '2011-01-01 00:00:00',
 '2012-01-01 00:00:00', '2010-01-01 00:00:00', '2013-01-01 00:00:00',
 '2014-01-01 00:00:00', '2015-01-01 00:00:00', '2016-01-01 00:00:00',
 '20

In [37]:
# extract year out
outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].replace("", 0).fillna(0).astype(int).astype(str)
outdf['in_ReportYearCV'].unique()

array(['0', '1976', '1977', '1978', '1979', '1980', '1981', '1982',
       '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990',
       '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1999',
       '1998', '2000', '2001', '2002', '2003', '2004', '2005', '2006',
       '2007', '2008', '2009', '2011', '2012', '2010', '2013', '2014',
       '2015', '2016', '2017', '2018', '2019', '2020', '2021', '1967',
       '1968', '1969', '1970', '1971', '1974', '1975', '1966', '1972',
       '1973', '1965'], dtype=object)

In [38]:
# Assign Primary Use Category

import sys
sys.path.append("C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory")
import AssignPrimaryUseCategoryFile # Use Custom import file

outdf['in_PrimaryUseCategory'] = outdf.apply(lambda row: AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory(row['in_BeneficialUseCategory']), axis=1)
outdf['in_PrimaryUseCategory'].unique()

array(['Agriculture Irrigation', 'Commercial/Industrial', 'Hydroelectric',
       'In-stream Flow', 'Livestock', 'Public Supply', 'Recreation',
       'Domestic', 'Other', 'Unspecified'], dtype=object)

In [39]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    inV = str(inV).strip()
    inAIU = str(inAIU).strip()
    inPU = str(inPU).strip().title()
    inWST = str(inWST).strip()
    outString = inV + "_" + inAIU + "_" + inPU + "_" + inWST
    return outString

outdf['in_VariableSpecificCV'] = outdf.apply(lambda row: createVariableSpecificCV(row['in_VariableCV'], 
                                                                                  row['in_AggregationIntervalUnitCV'],
                                                                                  row['in_PrimaryUseCategory'],
                                                                                  row['in_WaterSourceTypeCV']), axis=1)
outdf['in_VariableSpecificCV'].unique()

array(['Water Right_Annual_Agriculture Irrigation_Groundwater',
       'Water Right_Annual_Agriculture Irrigation_Surface Water',
       'Water Right_Annual_Commercial/Industrial_Surface Water',
       'Water Right_Annual_Hydroelectric_Surface Water',
       'Water Right_Annual_In-Stream Flow_Surface Water',
       'Water Right_Annual_Livestock_Surface Water',
       'Water Right_Annual_Public Supply_Surface Water',
       'Water Right_Annual_Recreation_Surface Water',
       'Water Right_Annual_Domestic_Surface Water',
       'Water Right_Annual_Hydroelectric_Groundwater',
       'Water Right_Annual_Commercial/Industrial_Groundwater',
       'Water Right_Annual_Public Supply_Groundwater',
       'Water Right_Annual_Other_Surface Water',
       'Water Right_Annual_In-Stream Flow_Groundwater',
       'Water Right_Annual_Domestic_Groundwater',
       'Water Right_Annual_Recreation_Groundwater',
       'Water Right_Annual_Agriculture Irrigation_WaDE Blank',
       'Water Right_Annual_Unsp

In [40]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_WaterSourceNativeID'], 
                                                                              row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wadeId1', 'wadeId2', 'wadeId3', 'wadeId4', 'wadeId5', 'wadeId6',
       'wadeId7', 'wadeId8', 'wadeId9', 'wadeId10', 'wadeId11',
       'wadeId12', 'wadeId13', 'wadeId14', 'wadeId15', 'wadeId16',
       'wadeId17', 'wadeId18', 'wadeId19', 'wadeId20', 'wadeId21',
       'wadeId22', 'wadeId23', 'wadeId24', 'wadeId25', 'wadeId26',
       'wadeId27', 'wadeId28', 'wadeId29', 'wadeId30', 'wadeId31',
       'wadeId32', 'wadeId33', 'wadeId34', 'wadeId35', 'wadeId36',
       'wadeId37', 'wadeId38', 'wadeId39', 'wadeId40', 'wadeId41',
       'wadeId42', 'wadeId43', 'wadeId44', 'wadeId45', 'wadeId46',
       'wadeId47', 'wadeId48', 'wadeId49', 'wadeId50', 'wadeId51',
       'wadeId52', 'wadeId53', 'wadeId54', 'wadeId55', 'wadeId56',
       'wadeId57', 'wadeId58', 'wadeId59', 'wadeId60', 'wadeId61',
       'wadeId62', 'wadeId63', 'wadeId64', 'wadeId65', 'wadeId66',
       'wadeId67', 'wadeId68', 'wadeId69', 'wadeId70', 'wadeId71',
       'wadeId72', 'wadeId73', 'wadeId74', 'wadeId75', 'wad

In [41]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_Latitude'].astype(str) + dfTempID['in_Longitude'].astype(str) + dfTempID['in_SiteName'].astype(str)+ dfTempID['in_SiteTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_SiteNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip() + str(valC).strip() + str(valD).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_SiteNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_SiteNativeID'], 
                                                                       row['in_Latitude'], row['in_Longitude'],
                                                                       row['in_SiteName'], row['in_SiteTypeCV']), axis=1)
outdf['in_SiteNativeID'].unique()

array(['nd13007302B', 'nd15310236CC', 'nd15310236BA', ..., 'nd15910304D',
       'nd13005820A', 'nd13005820D'], dtype=object)

## Drop non-Active AllocationLegalStatusCV Water Rights
- For ND, we don't want water rights that are considered: Application In Processing, Cancelled, Deferred, Denied, Held In Abeyance, InActive, Pending Review, Under Review, Void

In [42]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# drop the list
dropLegalStatusList = ["Application In Processing", "Cancelled", "Deferred", "Denied", "Held In Abeyance", "InActive", "Pending Review", "Under Review", "Void"]

# drop rows from above list
outdf = outdf[outdf.in_AllocationLegalStatusCV.isin(dropLegalStatusList) == False].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

135241


array(['Active', 'POD In Processing', ''], dtype=object)

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [43]:
# # PoU Shapefile Data
# shapefileInput = "RawInputData/shapefiles/{enter file name here}.zip" # ziped folder of the shp file

# dfPoUshapetemp = gpd.read_file(shapefileInput)
# dfPoUshapetemp['geometry'] = dfPoUshapetemp['geometry'].to_crs(epsg=4326) # Realign Geometry Projection
# print(len(dfPoUshapetemp))
# dfPoUshapetemp.head()

In [44]:
# # create temp dataframe to hold native ID and geometry from shapefile input
# columnsList = ['in_SiteNativeID', 'geometry']
# dfPoUshape = pd.DataFrame(columns=columnsList)

# # assing values to temp dataframe based on shapefile input
# # for in_SiteNativeID assure ID value is the same as that listed above for POU info.
# dfPoUshape['in_SiteNativeID'] = "POU" + ""
# dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
# dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
# print(len(dfPoUshape))
# dfPoUshape.head()

## Export Outputs

In [45]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135241 entries, 0 to 135240
Data columns (total 74 columns):
 #   Column                                        Non-Null Count   Dtype         
---  ------                                        --------------   -----         
 0   WaDEUUID                                      135241 non-null  object        
 1   in_MethodUUID                                 135241 non-null  object        
 2   in_VariableSpecificUUID                       135241 non-null  object        
 3   in_AggregationIntervalUnitCV                  135241 non-null  object        
 4   in_VariableCV                                 135241 non-null  object        
 5   in_OrganizationUUID                           135241 non-null  object        
 6   in_Geometry                                   135241 non-null  object        
 7   in_GNISFeatureNameCV                          135241 non-null  object        
 8   in_WaterQualityIndicatorCV                    135241 n

In [46]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL,in_Amount,in_AssociatedNativeAllocationIDs,in_PowerGeneratedGWh,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart,in_VariableSpecificCV
0,in13,NDwrwu_M1,NDwrwu_V1,Annual,Water Right,NDwrwu_O1,,,,,wadeId2,Surface Water,,Centroid of Area,,4326,,,,47.70175,-103.44125,,,POD,,nd14910026AB,,,ND,,1937-04-30,,,,,,,,1615.60000,Active,2D_4,Gudmunsen Robert And Lowraine,1906-01-26,,,,,291.00000,Irrigation,,,,,,0,,291.00000,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...,291.00000,2D_4,,Agriculture Irrigation,1977,,1977-12-31,1977-01-01,Water Right_Annual_Agriculture Irrigation_Surf...
1,in13,NDwrwu_M1,NDwrwu_V1,Annual,Water Right,NDwrwu_O1,,,,,wadeId2,Surface Water,,Centroid of Area,,4326,,,,47.70175,-103.44125,,,POD,,nd14910026AB,,,ND,,1937-04-30,,,,,,,,1615.60000,Active,2D_4,Gudmunsen Robert And Lowraine,1906-01-26,,,,,291.00000,Irrigation,,,,,,0,,291.00000,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...,291.00000,2D_4,,Agriculture Irrigation,1978,,1978-12-31,1978-01-01,Water Right_Annual_Agriculture Irrigation_Surf...
2,in13,NDwrwu_M1,NDwrwu_V1,Annual,Water Right,NDwrwu_O1,,,,,wadeId2,Surface Water,,Centroid of Area,,4326,,,,47.70175,-103.44125,,,POD,,nd14910026AB,,,ND,,1937-04-30,,,,,,,,1615.60000,Active,2D_4,Gudmunsen Robert And Lowraine,1906-01-26,,,,,291.00000,Irrigation,,,,,,0,,291.00000,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...,291.00000,2D_4,,Agriculture Irrigation,1979,,1979-12-31,1979-01-01,Water Right_Annual_Agriculture Irrigation_Surf...
3,in13,NDwrwu_M1,NDwrwu_V1,Annual,Water Right,NDwrwu_O1,,,,,wadeId2,Surface Water,,Centroid of Area,,4326,,,,47.70175,-103.44125,,,POD,,nd14910026AB,,,ND,,1937-04-30,,,,,,,,1615.60000,Active,2D_4,Gudmunsen Robert And Lowraine,1906-01-26,,,,,291.00000,Irrigation,,,,,,0,,291.00000,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...,291.00000,2D_4,,Agriculture Irrigation,1980,,1980-12-31,1980-01-01,Water Right_Annual_Agriculture Irrigation_Surf...
4,in13,NDwrwu_M1,NDwrwu_V1,Annual,Water Right,NDwrwu_O1,,,,,wadeId2,Surface Water,,Centroid of Area,,4326,,,,47.70175,-103.44125,,,POD,,nd14910026AB,,,ND,,1937-04-30,,,,,,,,1615.60000,Active,2D_4,Gudmunsen Robert And Lowraine,1906-01-26,,,,,291.00000,Irrigation,,,,,,0,,291.00000,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...,291.00000,2D_4,,Agriculture Irrigation,1981,,1981-12-31,1981-01-01,Water Right_Annual_Agriculture Irrigation_Surf...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135236,in112884,NDwrwu_M1,NDwrwu_V1,Annual,Water Right,NDwrwu_O1,,,,,wadeId1,Groundwater,,Centroid of Area,,4326,,,,48.62298,-104.02091,,,POD,,nd15910305D,,,ND,,,,,,,,,,,POD In Processing,7354_93375,Ibsen Duane And Rosemary,2024-01-04,,December 31,January 1,,,Industrial,,,,,,0,,0.00000,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...,,7354_93375,,Commercial/Industrial,0,,NaT,NaT,Water Right_Annual_Commercial/Industrial_Groun...
135237,in112885,NDwrwu_M1,NDwrwu_V1,Annual,Water Right,NDwrwu_O1,,,,,wadeId1,Groundwater,,Centroid of Area,,4326,,,,48.62287,-103.99898,,,POD,,nd15910304D,,,ND,,,,,,,,,,,POD In Processing,7354_93376,Ibsen Duane And Rosemary,2024-01-04,,December 31,January 1,,,Industrial,,,,,,0,,0.00000,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...,,7354_93376,,Commercial/Industrial,0,,NaT,NaT,Water Right_Annual_Commercial/Industrial_Groun...
135238,in112886,NDwrwu_M1,NDwrwu_V1,Annual,Water Right,NDwrwu_O1,,,,,wadeId1,Groundwater,,Centroid of Area,,4326,,,,46.06247,-97.97147,,,POD,,nd13005820A,,,ND,,,,,,,,,,,POD In Processing,7355_93377,Schmit Layne,2024-01-11,,October 31,April 1,,,Irrigation,,,,,,0,,0.00000,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...,,7355_93377,,Agriculture Irrigation,0,,NaT,NaT,Water Right_Annual_Agriculture Irrigation_Grou...
135239,in112887,NDwrwu_M1,NDwrwu_V1,Annual,Water Right,NDwrwu_O1,,,,,wadeId1,Groundwater,,Centroid of Area,,4326,,,,46.05526,-97.97149,,,POD,,nd13005820D,,,ND,,,,,,,,,,,POD In Processing,7355_93378,Schmit Layne,2024-01-11,,October 31,April 1,,,Irrigation,,,,,,0,,0.00000,,,,,,,,https://www.swc.nd.gov/info_edu/map_data_resou...,,7355_93378,,Agriculture Irrigation,0,,NaT,NaT,Water Right_Annual_Agriculture Irrigation_Grou...


In [47]:
# Export the output dataframe
outdf.to_csv('RawInputData/Pwrwu_Main.zip', compression=dict(method='zip', archive_name='Pwrwu_Main.csv'), index=False)  # The output, save as a zip
#dfPoUshape.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.