# Pre-processing Nebraska Regulatory data for WaDE upload.
- Purpose:  To preprocess the data into one master file for simple DataFrame creation and extraction.

In [2]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
# ---- display options ----
pd.set_option('display.max_columns', 999)  # render up to 999 columns instead of eliding the middle ones
pd.set_option('display.float_format', '{:.5f}'.format)  # fixed 5-decimal floats, no scientific notation

In [3]:
# ---- working directory ----
# Every relative path used below (RawInputData/...) is resolved against this folder.
workingDirString = "G:/Shared drives/WaDE Data/Nebraska/Regulatory"  # set working directory folder string here
os.chdir(workingDirString)
print('The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/Nebraska/Regulatory


## Regulatory Area Data #1

In [4]:
# CSV input file
inputFile = "RawInputData/BND_NaturalResourceDistricts_DNR_input.csv"
# NOTE(review): the first header in the displayed output carries a garbled prefix before "OID_",
# which looks like a UTF-8 byte-order mark decoded as ISO-8859-1 - confirm whether
# encoding="utf-8-sig" is the right choice for this file.
dfin1 = pd.read_csv(inputFile, encoding = "ISO-8859-1").replace(np.nan, "")  # blanks instead of NaN
df = dfin1  # keep the older name bound to the same frame for any cell that still refers to `df`
print(len(dfin1))
dfin1.head()

23


Unnamed: 0,ï»¿OID_,OBJECTID,AreaName,NRD_Name_A,NRD_Num,Shape_Leng,Shape_Length,Shape_Area
0,1,1,Lower Republican,LOWER REPUBLICAN,23,605099.76849,605099.76849,11018161576.28113
1,2,2,Tri-Basin,TRI-BASIN,24,413768.86546,413768.86546,6815290176.96236
2,3,3,Little Blue,LITTLE BLUE,5,606780.75182,606780.75182,10718680365.61676
3,4,4,Lower Big Blue,LOWER BIG BLUE,2,438586.0786,438586.0786,7324130010.01756
4,5,5,Nemaha,NEMAHA,11,606872.26451,606872.26451,10901490684.33732


In [5]:
# Clean text
# Trim leading/trailing whitespace from the district name.
dfin1['NRD_Name_A'] = dfin1['NRD_Name_A'].str.strip()
# The district number is converted to text first, then trimmed the same way.
dfin1['NRD_Num'] = dfin1['NRD_Num'].astype(str).str.strip()
dfin1.head()

Unnamed: 0,ï»¿OID_,OBJECTID,AreaName,NRD_Name_A,NRD_Num,Shape_Leng,Shape_Length,Shape_Area
0,1,1,Lower Republican,LOWER REPUBLICAN,23,605099.76849,605099.76849,11018161576.28113
1,2,2,Tri-Basin,TRI-BASIN,24,413768.86546,413768.86546,6815290176.96236
2,3,3,Little Blue,LITTLE BLUE,5,606780.75182,606780.75182,10718680365.61676
3,4,4,Lower Big Blue,LOWER BIG BLUE,2,438586.0786,438586.0786,7324130010.01756
4,5,5,Nemaha,NEMAHA,11,606872.26451,606872.26451,10901490684.33732


In [None]:
# Create URL
# URL link info was found here: https://www.nrdnet.org/

# Website of each district, keyed by the text form of the ID that is passed to retrieveURL.
# Every value is an absolute URL (scheme included) so it can be used directly as a link.
URLdict = {
    "1": "https://www.lrnrd.org/",
    "2": "https://www.tribasinnrd.org/",
    "3": "http://www.littlebluenrd.org/",
    "4": "http://www.lbbnrd.net",
    "5": "https://www.nemahanrd.org/",
    "6": "http://www.mrnrd.org",
    "7": "http://www.urnrd.org",
    "8": "https://www.lpsnrd.org/",
    "9": "http://www.upperbigblue.org",
    "10": "https://www.cpnrd.org",
    "11": "http://www.spnrd.org",
    "12": "http://www.tpnrd.org",
    "13": "http://www.lpnnrd.org",
    "14": "http://www.llnrd.org",
    "15": "http://www.npnrd.org",
    "16": "http://www.upperloupnrd.org",
    "17": "http://www.papionrd.org",
    "18": "http://www.lenrd.org",
    "19": "http://www.uenrd.org",
    "20": "https://lcnrd.nebraska.gov/",
    "21": "http://www.lnnrd.org",
    "22": "http://www.mnnrd.org",
    "23": "http://www.unwnrd.org",
}


def retrieveURL(valA):
    """Return the URL stored in URLdict for an ID value.

    Parameters
    ----------
    valA : any
        ID to look up. It is converted to text and stripped of surrounding
        whitespace before the lookup; whole-number floats (e.g. 2.0) are
        treated as the integer they represent.

    Returns
    -------
    str
        The matching URL, or '' when the value is blank, missing, or not a
        key of URLdict.
    """
    # An ID column that contains missing values is read as float, so 2 arrives as 2.0;
    # fold it back to an integer so its text form matches the "2" key.
    if isinstance(valA, float) and valA.is_integer():
        valA = int(valA)
    # dict.get gives the '' fallback for unknown keys without hiding unrelated errors.
    return URLdict.get(str(valA).strip(), '')


# Look up each district's website from its OBJECTID and store it on the input frame (dfin1),
# the same frame the clean-up cell above works on.
# Series.map calls retrieveURL once per value, so no row-wise apply over the whole frame is needed.
dfin1['in_RegulatoryStatuteLink'] = dfin1['OBJECTID'].map(retrieveURL)
dfin1.head()

In [None]:
# Input File
# NOTE(review): this reads the same CSV as the first "Regulatory Area Data #1" cell again, so any
# change made to dfin1 before this cell (text clean-up, added columns) is replaced by the raw data.
inputFile = "RawInputData/BND_NaturalResourceDistricts_DNR_input.csv"
dfin1 = pd.read_csv(inputFile, encoding = "ISO-8859-1").replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin1:
    # "re" prefix + row position gives every input row its own identifier.
    dfin1['WaDEUUID'] = "re" + dfin1.index.astype(str)
    # Name the tracked copy after the input file rather than the unfilled
    # "{enter file name here}" template placeholder.
    trackedName = os.path.splitext(os.path.basename(inputFile))[0]
    dfin1.to_csv(f'RawInputData/{trackedName}.zip', compression=dict(method='zip', archive_name=f'{trackedName}.csv'), index=False)

print(len(dfin1))
dfin1.head()

In [None]:
# create output for Regulatory Area #1 dataframe
# Each in_* column starts as an empty string; replace the "" with the matching dfin1 column
# or a constant for this dataset. Assigning WaDEUUID first gives df its row index, so the
# scalar assignments that follow are broadcast to every row.
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin1['WaDEUUID']

# Date Info
df['in_Date'] = ""
df['in_Year'] = ""

# Organization Info
df['in_OrganizationUUID'] = ""

# ReportingUnit Info
df['in_EPSGCodeCV'] = ""
df['in_ReportingUnitName'] = ""
df['in_ReportingUnitNativeID'] = ""
df['in_ReportingUnitProductVersion'] = ""
df['in_ReportingUnitTypeCV'] = ""
df['in_ReportingUnitUpdateDate'] = ""
df['in_StateCV'] = ""
df['in_Geometry'] = ""

# RegulatoryOverlay Info
df['in_OversightAgency'] = ""
df['in_RegulatoryDescription'] = ""
df['in_RegulatoryName'] = ""
df['in_RegulatoryOverlayNativeID'] = ""
df['in_RegulatoryStatusCV'] = ""
df['in_RegulatoryStatute'] = ""
df['in_RegulatoryStatuteLink'] = ""
df['in_StatutoryEffectiveDate'] = ""
df['in_StatutoryEndDate'] = ""
df['in_RegulatoryOverlayTypeCV'] = ""
df['in_WaterSourceTypeCV'] = ""

# Drop exact duplicate rows and renumber before keeping a copy as this area's output.
df = df.drop_duplicates().reset_index(drop=True)
outdf1 = df.copy()
print(len(outdf1))
outdf1.head()

## Regulatory Area #2

In [None]:
# etc etc,

## Concatenate DataFrames together

In [None]:
# Concatenate dataframes
# outdf1 is the only output frame built in this notebook (the "Regulatory Area #2" cell is
# still a placeholder), so it is the only entry; append outdf2, ... once those cells exist.
frames = [outdf1]  # list all out dataframes here
outdf = pd.concat(frames)
# Remove duplicate rows, renumber, and turn any NaN introduced by the concat into "".
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

## Clean Data / data types

In [None]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return val as stripped text, with missing values mapped to "".

    Parameters
    ----------
    val : any
        A single cell value.

    Returns
    -------
    str
        "" when val is None, NaN, NaT, pd.NA, blank/whitespace-only text, or
        the literal text "nan"; otherwise str(val) without surrounding
        whitespace.
    """
    # Test for missing values BEFORE converting to text: once str() has run,
    # None / NaT / pd.NA are ordinary strings and pd.isnull can no longer see them.
    # is_scalar guards pd.isnull, which returns an array for list-like input.
    if val is None or (pd.api.types.is_scalar(val) and pd.isnull(val)):
        return ""
    outString = str(val).strip()
    # "nan" is what a NaN looks like after it has already been converted to text upstream.
    return "" if outString == "nan" else outString

In [None]:
# Normalize blanks using outdf's own column: outdf (not df) is the concatenated frame that is
# exported, and its rows are renumbered, so values taken from df would not line up with it.
outdf['in_ReportingUnitName'] = outdf['in_ReportingUnitName'].map(ensureEmptyString)
outdf['in_ReportingUnitName'].unique()

In [None]:
# Normalize blanks using outdf's own column: outdf (not df) is the concatenated frame that is
# exported, and its rows are renumbered, so values taken from df would not line up with it.
outdf['in_RegulatoryDescription'] = outdf['in_RegulatoryDescription'].map(ensureEmptyString)
outdf['in_RegulatoryDescription'].unique()

In [None]:
# Normalize blanks using outdf's own column: outdf (not df) is the concatenated frame that is
# exported, and its rows are renumbered, so values taken from df would not line up with it.
outdf['in_RegulatoryName'] = outdf['in_RegulatoryName'].map(ensureEmptyString)
outdf['in_RegulatoryName'].unique()

In [None]:
# Update datatype of StatutoryEffectiveDate to fit WaDE 2.0 structure
# Step 1: parse whatever is in the column; values that cannot be parsed become NaT.
effectiveDates = pd.to_datetime(outdf['in_StatutoryEffectiveDate'], errors = 'coerce')
# Step 2: round-trip through month/day/year text and parse again, leaving date-only timestamps.
effectiveDatesText = effectiveDates.dt.strftime('%m/%d/%Y')
outdf['in_StatutoryEffectiveDate'] = pd.to_datetime(effectiveDatesText)
outdf['in_StatutoryEffectiveDate'].unique()

# Shapefile Data
- For attaching geometry to reporting unit info.

#### Regulatory Area #1 shapefile info

In [None]:
# Input File
# The "{enter file name here}" part of the path is an unfilled template placeholder: replace it
# with the name of the zipped shapefile under RawInputData/shapefiles/ before running this cell.
shapeInputFile = "RawInputData/shapefiles/{enter file name here}.zip"
gdfin1 = gpd.read_file(shapeInputFile)
print(len(gdfin1))
gdfin1.head()

In [None]:
# Plot the geometries read from the shapefile.
gdfin1.plot()

In [None]:
# create output for Regulatory Area #1 geometry dataframe
# Start from an empty two-column frame that shares the shapefile's row index.
columnsList = ['in_ReportingUnitNativeID', 'geometry']
goutdf1 = pd.DataFrame(columns=columnsList, index=gdfin1.index)

goutdf1['in_ReportingUnitNativeID'] = ""  # in_ReportingUnitNativeID needs to match the value used in the equivalent dataframe above
goutdf1['geometry'] = gdfin1['geometry']
goutdf1 = goutdf1.drop_duplicates().reset_index(drop=True)
print(len(goutdf1))
goutdf1.head()

#### Regulatory Area #2 shapefile info

In [None]:
# etc etc...

#### Concatenate goutdf shapefile info into single output

In [None]:
# Concatenate geometry DataFrames
# goutdf1 is the only geometry frame built in this notebook (the "Regulatory Area #2 shapefile
# info" cell is still a placeholder), so it is the only entry; append goutdf2, ... once they exist.
frames = [goutdf1]  # add goutdf dataframes here
goutdf = pd.concat(frames).reset_index(drop=True)

print(len(goutdf))
goutdf.head()

## Export Data

In [None]:
# List the dtype of every outdf column; row/column display limits are lifted only inside this block.
showEverything = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*showEverything):
    print(outdf.dtypes)

In [None]:
# List the dtype of every goutdf column; row/column display limits are lifted only inside this block.
showEverything = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*showEverything):
    print(goutdf.dtypes)

In [None]:
# Export out to CSV.
# Both frames are written as zip archives holding a single CSV, without the index column.
# NOTE(review): the first archive is named Pre_sdMain.zip while the CSV inside it is named
# Pre_Main.csv - confirm which name the downstream WaDE step expects and make the two agree.
outdf.to_csv('RawInputData/Pre_sdMain.zip', compression=dict(method='zip', archive_name='Pre_Main.csv'), index=False)  # The output, save as a zip
goutdf.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.