# Pre-processing Overlay data for WaDE upload.
- Purpose: To preprocess state overlay data into one main file for simple DataFrame creation and extraction.

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# ---- working directory ----
workingDirString = "G:/Shared drives/WaDE Data/WaDE Data Folder/Nebraska/Overlays" # set working directory folder string here
os.chdir(workingDirString)
print(f'The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/WaDE Data Folder/Nebraska/Overlays


## Overlay Area Data #1

In [3]:
# Input File
inputFile = "RawInputData/shapefiles/Natural_Resource_District_NRD_Boundaries_20250129.zip"
dfin1 = gpd.read_file(inputFile).replace(np.nan, "")
dfin1['geometry'] = dfin1['geometry'].to_crs(epsg=4326) # Realign Geometry Projection

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin1:
    dfin1['WaDEUUID'] = "ov" + dfin1.index.astype(str)
    dfin1.to_csv('RawInputData/Natural_Resource_District_NRD_Boundaries_20250129.zip', compression=dict(method='zip', archive_name='Natural_Resource_District_NRD_Boundaries_20250129.csv'), index=False)

print(len(dfin1))
dfin1.head()

23


Unnamed: 0,OBJECTID,NRD_Name,NRD_Name_A,NRD_Num,Shape_Leng,Shape_Le_1,Shape_Area,geometry,WaDEUUID
0,1,Lower Republican,LOWER REPUBLICAN,23,605099.76849,5.22031,0.67953,"POLYGON ((-97.93460 40.00210, -98.17728 40.002...",ov0
1,2,Tri-Basin,TRI-BASIN,24,413768.86546,3.4508,0.41809,"POLYGON ((-98.72437 40.68977, -98.72389 40.451...",ov1
2,3,Little Blue,LITTLE BLUE,5,606780.75182,5.01537,0.65961,"POLYGON ((-97.36873 40.43739, -97.36879 40.350...",ov2
3,4,Lower Big Blue,LOWER BIG BLUE,2,438586.0786,3.49527,0.45064,"POLYGON ((-96.74194 40.52312, -96.68493 40.523...",ov3
4,5,Nemaha,NEMAHA,11,606872.26451,4.88719,0.67026,"POLYGON ((-95.88707 40.72605, -95.88319 40.717...",ov4


In [4]:
# Create URL
# URL link info was found here: https://www.nrdnet.org/

URLdict = {
"1" : "https://www.lrnrd.org/",
"2" : "https://www.tribasinnrd.org/",
"3" : "http://www.littlebluenrd.org/",
"4" : "http://www.lbbnrd.net",
"5" : "https://www.nemahanrd.org/",
"6" : "http://www.mrnrd.org",
"7" : "http://www.urnrd.org",
"8" : "https://www.lpsnrd.org/",
"9" : "http://www.upperbigblue.org",
"10" : "www.cpnrd.org",
"11" : "http://www.spnrd.org",
"12" : "http://www.tpnrd.org",
"13" : "http://www.lpnnrd.org",
"14" : "http://www.llnrd.org",
"15" : "http://www.npnrd.org",
"16" : "http://www.upperloupnrd.org",
"17" : "http://www.papionrd.org",
"18" : "http://www.lenrd.org",
"19" : "http://www.uenrd.org",
"20" : "https://lcnrd.nebraska.gov/",
"21" : "http://www.lnnrd.org",
"22" : "http://www.mnnrd.org",
"23" : "http://www.unwnrd.org"}

def retrieveURL(valA):
    valA = str(valA).strip()
    if valA == '' or pd.isnull(valA):
        outString = ''
    else:
        String1 = valA
        try:
            outString = URLdict[String1]
        except:
            outString = ''
    return outString


dfin1['in_RegulatoryStatuteLink'] = dfin1.apply(lambda row: retrieveURL(row['OBJECTID']), axis=1)
dfin1.head()

Unnamed: 0,OBJECTID,NRD_Name,NRD_Name_A,NRD_Num,Shape_Leng,Shape_Le_1,Shape_Area,geometry,WaDEUUID,in_RegulatoryStatuteLink
0,1,Lower Republican,LOWER REPUBLICAN,23,605099.76849,5.22031,0.67953,"POLYGON ((-97.93460 40.00210, -98.17728 40.002...",ov0,https://www.lrnrd.org/
1,2,Tri-Basin,TRI-BASIN,24,413768.86546,3.4508,0.41809,"POLYGON ((-98.72437 40.68977, -98.72389 40.451...",ov1,https://www.tribasinnrd.org/
2,3,Little Blue,LITTLE BLUE,5,606780.75182,5.01537,0.65961,"POLYGON ((-97.36873 40.43739, -97.36879 40.350...",ov2,http://www.littlebluenrd.org/
3,4,Lower Big Blue,LOWER BIG BLUE,2,438586.0786,3.49527,0.45064,"POLYGON ((-96.74194 40.52312, -96.68493 40.523...",ov3,http://www.lbbnrd.net
4,5,Nemaha,NEMAHA,11,606872.26451,4.88719,0.67026,"POLYGON ((-95.88707 40.72605, -95.88319 40.717...",ov4,https://www.nemahanrd.org/


In [5]:
# create output for Overlay Area #1 dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin1['WaDEUUID']

# Date Info
df['in_Date'] = "01/25/2025"
df['in_Year'] = "2025"

# Organization Info
df['in_OrganizationUUID'] = "NEov_O1"

# ReportingUnit Info
df['in_EPSGCodeCV'] = "4326"
df['in_ReportingUnitName'] = dfin1['NRD_Name_A']
df['in_ReportingUnitNativeID'] = "ne" + dfin1['NRD_Num'].astype(str)
df['in_ReportingUnitProductVersion'] = ""
df['in_ReportingUnitTypeCV'] = "Natural Resources Districts"
df['in_ReportingUnitUpdateDate'] = ""
df['in_StateCV'] = "NE"
df['in_Geometry'] = dfin1['geometry']

# RegulatoryOverlay Info
df['in_OversightAgency'] = dfin1['NRD_Name_A'] + "NRD"
df['in_RegulatoryDescription'] = "Natural Resources Districts were created to solve flood control, soil erosion, irrigation run-off, and groundwater quantity and quality issues. Nebraska's NRDs are involved in a wide variety of projects and programs to conserve and protect the state's natural resources. NRDs are charged under state law with 12 areas of responsibility including flood control, soil erosion, groundwater management and many others."
df['in_RegulatoryName'] = dfin1['NRD_Name_A']
df['in_RegulatoryOverlayNativeID'] = dfin1['NRD_Num']
df['in_RegulatoryStatusCV'] = "Active"
df['in_RegulatoryStatute'] = ""
df['in_RegulatoryStatuteLink'] = dfin1['in_RegulatoryStatuteLink']
df['in_StatutoryEffectiveDate'] = "01/01/1972"
df['in_StatutoryEndDate'] = ""
df['in_RegulatoryOverlayTypeCV'] = "Natural Resources Districts"
df['in_WaterSourceTypeCV'] = "Groundwater"

df = df.drop_duplicates().reset_index(drop=True)
outdf1 = df.copy()
print(len(outdf1))
outdf1.head()

23


Unnamed: 0,WaDEUUID,in_Date,in_Year,in_OrganizationUUID,in_EPSGCodeCV,in_ReportingUnitName,in_ReportingUnitNativeID,in_ReportingUnitProductVersion,in_ReportingUnitTypeCV,in_ReportingUnitUpdateDate,in_StateCV,in_Geometry,in_OversightAgency,in_RegulatoryDescription,in_RegulatoryName,in_RegulatoryOverlayNativeID,in_RegulatoryStatusCV,in_RegulatoryStatute,in_RegulatoryStatuteLink,in_StatutoryEffectiveDate,in_StatutoryEndDate,in_RegulatoryOverlayTypeCV,in_WaterSourceTypeCV
0,ov0,01/25/2025,2025,NEov_O1,4326,LOWER REPUBLICAN,ne23,,Natural Resources Districts,,NE,,LOWER REPUBLICANNRD,Natural Resources Districts were created to so...,LOWER REPUBLICAN,23,Active,,https://www.lrnrd.org/,01/01/1972,,Natural Resources Districts,Groundwater
1,ov1,01/25/2025,2025,NEov_O1,4326,TRI-BASIN,ne24,,Natural Resources Districts,,NE,,TRI-BASINNRD,Natural Resources Districts were created to so...,TRI-BASIN,24,Active,,https://www.tribasinnrd.org/,01/01/1972,,Natural Resources Districts,Groundwater
2,ov2,01/25/2025,2025,NEov_O1,4326,LITTLE BLUE,ne5,,Natural Resources Districts,,NE,,LITTLE BLUENRD,Natural Resources Districts were created to so...,LITTLE BLUE,5,Active,,http://www.littlebluenrd.org/,01/01/1972,,Natural Resources Districts,Groundwater
3,ov3,01/25/2025,2025,NEov_O1,4326,LOWER BIG BLUE,ne2,,Natural Resources Districts,,NE,,LOWER BIG BLUENRD,Natural Resources Districts were created to so...,LOWER BIG BLUE,2,Active,,http://www.lbbnrd.net,01/01/1972,,Natural Resources Districts,Groundwater
4,ov4,01/25/2025,2025,NEov_O1,4326,NEMAHA,ne11,,Natural Resources Districts,,NE,,NEMAHANRD,Natural Resources Districts were created to so...,NEMAHA,11,Active,,https://www.nemahanrd.org/,01/01/1972,,Natural Resources Districts,Groundwater


## Overlay Area #2

In [6]:
# etc etc,

## Concatenate DataFrames together

In [7]:
# Concatenate dataframes
frames = [outdf1] # list all out dataframes here
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

23


## Clean Data / data types

In [8]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [9]:
outdf['in_ReportingUnitName'] = outdf.apply(lambda row: ensureEmptyString(row['in_ReportingUnitName']), axis=1)
outdf['in_ReportingUnitName'].unique()

array(['LOWER REPUBLICAN', 'TRI-BASIN', 'LITTLE BLUE', 'LOWER BIG BLUE',
       'NEMAHA', 'MIDDLE REPUBLICAN', 'UPPER REPUBLICAN',
       'LOWER PLATTE SOUTH', 'UPPER BIG BLUE', 'CENTRAL PLATTE',
       'SOUTH PLATTE', 'TWIN PLATTE', 'LOWER PLATTE NORTH', 'LOWER LOUP',
       'NORTH PLATTE', 'UPPER LOUP', 'PAPIO-MISSOURI RIVER',
       'LOWER ELKHORN', 'UPPER ELKHORN', 'LEWIS & CLARK',
       'LOWER NIOBRARA', 'MIDDLE NIOBRARA', 'UPPER NIOBRARA-WHITE'],
      dtype=object)

In [10]:
outdf['in_RegulatoryDescription'] = outdf.apply(lambda row: ensureEmptyString(row['in_RegulatoryDescription']), axis=1)
outdf['in_RegulatoryDescription'].unique()

array(["Natural Resources Districts were created to solve flood control, soil erosion, irrigation run-off, and groundwater quantity and quality issues. Nebraska's NRDs are involved in a wide variety of projects and programs to conserve and protect the state's natural resources. NRDs are charged under state law with 12 areas of responsibility including flood control, soil erosion, groundwater management and many others."],
      dtype=object)

In [11]:
outdf['in_RegulatoryName'] = outdf.apply(lambda row: ensureEmptyString(row['in_RegulatoryName']), axis=1)
outdf['in_RegulatoryName'].unique()

array(['LOWER REPUBLICAN', 'TRI-BASIN', 'LITTLE BLUE', 'LOWER BIG BLUE',
       'NEMAHA', 'MIDDLE REPUBLICAN', 'UPPER REPUBLICAN',
       'LOWER PLATTE SOUTH', 'UPPER BIG BLUE', 'CENTRAL PLATTE',
       'SOUTH PLATTE', 'TWIN PLATTE', 'LOWER PLATTE NORTH', 'LOWER LOUP',
       'NORTH PLATTE', 'UPPER LOUP', 'PAPIO-MISSOURI RIVER',
       'LOWER ELKHORN', 'UPPER ELKHORN', 'LEWIS & CLARK',
       'LOWER NIOBRARA', 'MIDDLE NIOBRARA', 'UPPER NIOBRARA-WHITE'],
      dtype=object)

In [12]:
#Update datatype of StatutoryEffectiveDate to fit WaDE 2.0 structure
outdf['in_StatutoryEffectiveDate'] = pd.to_datetime(outdf['in_StatutoryEffectiveDate'], errors = 'coerce')
outdf['in_StatutoryEffectiveDate'] = pd.to_datetime(outdf['in_StatutoryEffectiveDate'].dt.strftime('%m/%d/%Y'))
outdf['in_StatutoryEffectiveDate'].unique()

<DatetimeArray>
['1972-01-01 00:00:00']
Length: 1, dtype: datetime64[ns]

# Geometry Data
- For attaching geometry to overlay reporting unit area info.

#### Overlay Area #1 shapefile info

In [13]:
# # Input File / or use same input as above

gdfin1 = outdf.copy()
gdfin1 = gpd.GeoDataFrame(gdfin1, geometry=gdfin1['in_Geometry'], crs="EPSG:4326") # covert to geodataframe
print(len(gdfin1))
gdfin1.head()

TypeError: Input must be valid geometry objects: 

In [None]:
# plot shape info to map
gdfin1.plot()

In [None]:
# create output for Regulatory Area #1 dataframe
df = pd.DataFrame()

columnsList = ['in_ReportingUnitNativeID', 'geometry']
goutdf1 = pd.DataFrame(columns=columnsList, index=gdfin1.index)

goutdf1['in_ReportingUnitNativeID'] =  gdfin1["in_ReportingUnitNativeID"].astype(str)  #in_ReportingUnitNativeID needs to match source from above equivalent dataframe
goutdf1['geometry'] = gdfin1['geometry']
goutdf1 = goutdf1.drop_duplicates().reset_index(drop=True)
print(len(goutdf1))
goutdf1.head()

#### Overlay Area #2 shapefile info

In [None]:
# etc etc...

#### Concatenate goutdf shapefile info into single output

In [None]:
# Concatenate Geometry DataFrames
frames = [goutdf1] # add geoutdf dataframes here
goutdf = pd.concat(frames).reset_index(drop=True)

print(len(goutdf))
goutdf.head()

## Export Data

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(outdf.dtypes)

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(goutdf.dtypes)

In [None]:
# Export out to CSV.
outdf.to_csv('RawInputData/Pov_Main.zip', compression=dict(method='zip', archive_name='Pov_Main.csv'), index=False)  # The output, save as a zip
goutdf.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.