# Pre-processing Utah Department of Natural Resources Regulatory data for WaDE upload.
- Purpose: To preprocess the raw regulatory data into one main file for simple DataFrame creation and extraction.

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
# Notebook display preferences for pandas output.
pd.set_option('display.max_columns', 999)  # raise the column limit so wide DataFrames are shown in full
pd.set_option('display.float_format', '{:.5f}'.format)  # fixed-point with five decimals instead of scientific notation

In [2]:
# ---- working directory ----
# The relative "RawInputData/..." paths used by the later cells are resolved against this folder.
workingDirString = "G:/Shared drives/WaDE Data/Utah/Regulatory/Regulatory_DNR"  # set working directory folder string here
os.chdir(workingDirString)
print('The working Directory is:', workingDirString)

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'G:/Shared drives/WaDE Data/Utah/Regulatory/Regulatory_DNR'

## Regulatory Area #1 - WaterRightAreasServiceView

In [None]:
# Input File
# Read the zipped shapefile, then replace missing values with empty strings.
inputFile = "RawInputData/shapefiles/WaterRightAreasServiceView.zip"
dfin1 = gpd.read_file(inputFile, encoding="ISO-8859-1")
dfin1 = dfin1.replace(np.nan, "")

# WaDE UUID tracker for data assessment
# When the column is absent, tag every row with "re1" + its index label and
# save a zipped CSV copy of the tagged input.
if 'WaDEUUID' not in dfin1.columns:
    dfin1['WaDEUUID'] = "re1" + dfin1.index.astype(str)
    zipSettings = dict(method='zip', archive_name='WaterRightAreasServiceView.csv')
    dfin1.to_csv('RawInputData/WaterRightAreasServiceView.zip', compression=zipSettings, index=False)

print(len(dfin1))
dfin1.head()

In [None]:
# create output for Regulatory Area #1 dataframe
# The mapping below lists each output column with its value, in output order.
# 'WaDEUUID' is assigned first, so its index is the one the constant values are
# broadcast over.
areaCode = dfin1['AREA_CODE'].astype(str).str.strip()
columnValues = {
    # Data Assessment UUID
    'WaDEUUID': dfin1['WaDEUUID'],

    # Date Info
    'in_Date': "12/10/2023",
    'in_Year': "2023",

    # Organization Info
    'in_OrganizationUUID': "UTre_O1",

    # ReportingUnit Info
    'in_EPSGCodeCV': "4326",
    'in_ReportingUnitName': "WaDE Blank",
    'in_ReportingUnitNativeID': "ut" + areaCode,
    'in_ReportingUnitProductVersion': "",
    'in_ReportingUnitTypeCV': "WaDE Blank",
    'in_ReportingUnitUpdateDate': "5/31/2022",
    'in_StateCV': "UT",
    'in_Geometry': "",

    # RegulatoryOverlay Info
    'in_OversightAgency': dfin1['Office'],
    'in_RegulatoryDescription': "Water Right Areas are administrative boundaries based primarily on surface drainage areas. Different water right areas can have different appropriation policies and can be administered by different regional offices.",
    'in_RegulatoryName': "WaDE Blank",
    'in_RegulatoryOverlayNativeID': "utr1" + areaCode,
    'in_RegulatoryStatusCV': "Active",
    'in_RegulatoryStatute': "",
    'in_RegulatoryStatuteLink': dfin1['Link'],
    'in_StatutoryEffectiveDate': "12/10/2023",
    'in_StatutoryEndDate': "",
    'in_RegulatoryOverlayTypeCV': "Water Right Areas",
    'in_WaterSourceTypeCV': "Surface Water and Groundwater",
}

df = pd.DataFrame()
for columnName, columnValue in columnValues.items():
    df[columnName] = columnValue

# Drop exact duplicate rows and renumber before keeping this area's result.
df = df.drop_duplicates()
df = df.reset_index(drop=True)
outdf1 = df.copy()
print(len(outdf1))
outdf1.head()

## Regulatory Area #2 - Groundwater Policy Management

In [None]:
# Input File
# Read the zipped shapefile, then replace missing values with empty strings.
inputFile = "RawInputData/shapefiles/ground_water_policy.zip"
dfin2 = gpd.read_file(inputFile, encoding="ISO-8859-1")
dfin2 = dfin2.replace(np.nan, "")

# WaDE UUID tracker for data assessment
# When the column is absent, tag every row with "re2" + its index label and
# save a zipped CSV copy of the tagged input.
if 'WaDEUUID' not in dfin2.columns:
    dfin2['WaDEUUID'] = "re2" + dfin2.index.astype(str)
    zipSettings = dict(method='zip', archive_name='ground_water_policy.csv')
    dfin2.to_csv('RawInputData/ground_water_policy.zip', compression=zipSettings, index=False)

print(len(dfin2))
dfin2.head()

In [None]:
# create output for Regulatory Area #2 dataframe
# Every per-row value is read from dfin2 (the ground_water_policy layer), the
# overlay native ID carries the "utr2" prefix, and the result is stored in
# outdf2, which the concatenation cell expects alongside outdf1/outdf3/outdf4.
# NOTE(review): assumes dfin2 exposes 'AREA_CODE', 'Office' and 'Link' columns
# like the other layers - TODO confirm against the shapefile.
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin2['WaDEUUID']

# Date Info
df['in_Date'] = "12/10/2023"
df['in_Year'] = "2023"

# Organization Info
df['in_OrganizationUUID'] = "UTre_O1"

# ReportingUnit Info
df['in_EPSGCodeCV'] = "4326"
df['in_ReportingUnitName'] = "WaDE Blank"
df['in_ReportingUnitNativeID'] = "ut" + dfin2['AREA_CODE'].astype(str).str.strip()
df['in_ReportingUnitProductVersion'] = ""
df['in_ReportingUnitTypeCV'] = "WaDE Blank"
df['in_ReportingUnitUpdateDate'] = "5/31/2022"
df['in_StateCV'] = "UT"
df['in_Geometry'] = ""

# RegulatoryOverlay Info
df['in_OversightAgency'] = dfin2['Office']
# NOTE(review): the description, overlay type and water source type below are
# the same text used for Regulatory Area #1 (Water Right Areas). They look
# copied rather than specific to groundwater policy - TODO replace with the
# correct values for this layer.
df['in_RegulatoryDescription'] = "Water Right Areas are administrative boundaries based primarily on surface drainage areas. Different water right areas can have different appropriation policies and can be administered by different regional offices."
df['in_RegulatoryName'] = "WaDE Blank"
df['in_RegulatoryOverlayNativeID'] = "utr2" + dfin2['AREA_CODE'].astype(str).str.strip()
df['in_RegulatoryStatusCV'] = "Active"
df['in_RegulatoryStatute'] = ""
df['in_RegulatoryStatuteLink'] = dfin2['Link']
df['in_StatutoryEffectiveDate'] = "12/10/2023"
df['in_StatutoryEndDate'] = ""
df['in_RegulatoryOverlayTypeCV'] = "Water Right Areas"
df['in_WaterSourceTypeCV'] = "Surface Water and Groundwater"

# Drop exact duplicate rows and renumber before keeping this area's result.
df = df.drop_duplicates().reset_index(drop=True)
outdf2 = df.copy()  # must not overwrite outdf1
print(len(outdf2))
outdf2.head()

## Regulatory Area #3 - BasinsClosedToNewAppropriations

In [None]:
# Input File
# Read the zipped shapefile, then replace missing values with empty strings.
inputFile = "RawInputData/shapefiles/BasinsClosedToNewAppropriations.zip"
dfin3 = gpd.read_file(inputFile, encoding="ISO-8859-1")
dfin3 = dfin3.replace(np.nan, "")

# WaDE UUID tracker for data assessment
# When the column is absent, tag every row with "re3" + its index label and
# save a zipped CSV copy of the tagged input.
if 'WaDEUUID' not in dfin3.columns:
    dfin3['WaDEUUID'] = "re3" + dfin3.index.astype(str)
    zipSettings = dict(method='zip', archive_name='BasinsClosedToNewAppropriations.csv')
    dfin3.to_csv('RawInputData/BasinsClosedToNewAppropriations.zip', compression=zipSettings, index=False)

print(len(dfin3))
dfin3.head()

In [None]:
# create output for Regulatory Area #3 dataframe
# The mapping below lists each output column with its value, in output order.
# 'WaDEUUID' is assigned first, so its index is the one the constant values are
# broadcast over.
areaCode = dfin3['AREA_CODE'].astype(str).str.strip()
columnValues = {
    # Data Assessment UUID
    'WaDEUUID': dfin3['WaDEUUID'],

    # Date Info
    'in_Date': "12/10/2023",
    'in_Year': "2023",

    # Organization Info
    'in_OrganizationUUID': "UTre_O1",

    # ReportingUnit Info
    'in_EPSGCodeCV': "4326",
    'in_ReportingUnitName': "WaDE Blank",
    'in_ReportingUnitNativeID': "ut" + areaCode,
    'in_ReportingUnitProductVersion': "",
    'in_ReportingUnitTypeCV': "WaDE Blank",
    'in_ReportingUnitUpdateDate': "5/31/2022",
    'in_StateCV': "UT",
    'in_Geometry': "",

    # RegulatoryOverlay Info
    'in_OversightAgency': dfin3['Office'],
    'in_RegulatoryDescription': "Basin closed to new appropriations.",
    'in_RegulatoryName': "WaDE Blank",
    'in_RegulatoryOverlayNativeID': "utr3" + areaCode,
    'in_RegulatoryStatusCV': "Active",
    'in_RegulatoryStatute': "",
    'in_RegulatoryStatuteLink': dfin3['Link'],
    'in_StatutoryEffectiveDate': "12/10/2023",
    'in_StatutoryEndDate': "",
    'in_RegulatoryOverlayTypeCV': "Basins Closed to New Appropriations",
    'in_WaterSourceTypeCV': "Surface Water and Groundwater",
}

df = pd.DataFrame()
for columnName, columnValue in columnValues.items():
    df[columnName] = columnValue

# Drop exact duplicate rows and renumber before keeping this area's result.
df = df.drop_duplicates()
df = df.reset_index(drop=True)
outdf3 = df.copy()
print(len(outdf3))
outdf3.head()

## Regulatory Area #4 - AreasOpenToLimitedAppropriation

In [None]:
# Input File
# Read the zipped shapefile, then replace missing values with empty strings.
inputFile = "RawInputData/shapefiles/AreasOpenToLimitedAppropriation.zip"
dfin4 = gpd.read_file(inputFile, encoding="ISO-8859-1")
dfin4 = dfin4.replace(np.nan, "")

# WaDE UUID tracker for data assessment
# When the column is absent, tag every row with "re4" + its index label and
# save a zipped CSV copy of the tagged input.
if 'WaDEUUID' not in dfin4.columns:
    dfin4['WaDEUUID'] = "re4" + dfin4.index.astype(str)
    zipSettings = dict(method='zip', archive_name='AreasOpenToLimitedAppropriation.csv')
    dfin4.to_csv('RawInputData/AreasOpenToLimitedAppropriation.zip', compression=zipSettings, index=False)

print(len(dfin4))
dfin4.head()

In [None]:
# create output for Regulatory Area #4 dataframe
# The mapping below lists each output column with its value, in output order.
# 'WaDEUUID' is assigned first, so its index is the one the constant values are
# broadcast over.
areaCode = dfin4['AREA_CODE'].astype(str).str.strip()
columnValues = {
    # Data Assessment UUID
    'WaDEUUID': dfin4['WaDEUUID'],

    # Date Info
    'in_Date': "12/10/2023",
    'in_Year': "2023",

    # Organization Info
    'in_OrganizationUUID': "UTre_O1",

    # ReportingUnit Info
    'in_EPSGCodeCV': "4326",
    'in_ReportingUnitName': "WaDE Blank",
    'in_ReportingUnitNativeID': "ut" + areaCode,
    'in_ReportingUnitProductVersion': "",
    'in_ReportingUnitTypeCV': "WaDE Blank",
    'in_ReportingUnitUpdateDate': "5/31/2022",
    'in_StateCV': "UT",
    'in_Geometry': "",

    # RegulatoryOverlay Info
    'in_OversightAgency': dfin4['Office'],
    'in_RegulatoryDescription': "The following are excepted from the effect of this proclamation: 1) applications for non-consumptive uses; 2) applications that include a mitigation plan to offset depletion; 3) applications for small amounts of water, as defined in Utah Code 73-3-5.6, that comply with State Engineer basin policies. All such applications remain subject to all applicable requirements of state law.",
    'in_RegulatoryName': "WaDE Blank",
    'in_RegulatoryOverlayNativeID': "utr4" + areaCode,
    'in_RegulatoryStatusCV': "Active",
    'in_RegulatoryStatute': "",
    'in_RegulatoryStatuteLink': dfin4['Link'],
    'in_StatutoryEffectiveDate': "12/10/2023",
    'in_StatutoryEndDate': "",
    'in_RegulatoryOverlayTypeCV': "Areas Open to Limited Appropriation",
    'in_WaterSourceTypeCV': "Surface Water and Groundwater",
}

df = pd.DataFrame()
for columnName, columnValue in columnValues.items():
    df[columnName] = columnValue

# Drop exact duplicate rows and renumber before keeping this area's result.
df = df.drop_duplicates()
df = df.reset_index(drop=True)
outdf4 = df.copy()
print(len(outdf4))
outdf4.head()

## Concatenate DataFrames together

In [None]:
# Concatenate dataframes
# Stack the per-area outputs, drop exact duplicate rows, renumber the index,
# and replace any NaN left by the concatenation with empty strings.
frames = [outdf1, outdf2, outdf3, outdf4] # list all out dataframes here
stacked = pd.concat(frames)
deduplicated = stacked.drop_duplicates()
outdf = deduplicated.reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

## Clean Data / data types

In [None]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return ``val`` as a stripped string, mapping missing values to "".

    Args:
        val: A cell value (string, number, None, NaN, NaT, ...).

    Returns:
        "" when ``val`` is a null scalar (None, NaN, NaT, pd.NA), is blank, or
        is the literal text "nan" once stripped; otherwise ``str(val)`` with
        surrounding whitespace removed.
    """
    # The null test must see the raw value: after str() conversion,
    # pd.isnull() can never be True. Non-scalars skip the test because
    # pd.isnull() would return an array for them.
    if val is None or (pd.api.types.is_scalar(val) and pd.isnull(val)):
        return ""
    text = str(val).strip()
    # strip() already turns whitespace-only input into "".
    return "" if text in ("", "nan") else text

In [None]:
# Normalise blanks / "nan" text in this column. Only one column is read, so the
# cleaner is mapped over that column instead of applied row by row.
outdf['in_ReportingUnitName'] = outdf['in_ReportingUnitName'].map(ensureEmptyString)
outdf['in_ReportingUnitName'].unique()

In [None]:
# Normalise blanks / "nan" text in this column. Only one column is read, so the
# cleaner is mapped over that column instead of applied row by row.
outdf['in_RegulatoryDescription'] = outdf['in_RegulatoryDescription'].map(ensureEmptyString)
outdf['in_RegulatoryDescription'].unique()

In [None]:
# Normalise blanks / "nan" text in this column. Only one column is read, so the
# cleaner is mapped over that column instead of applied row by row.
outdf['in_RegulatoryName'] = outdf['in_RegulatoryName'].map(ensureEmptyString)
outdf['in_RegulatoryName'].unique()

In [None]:
# Update datatype of StatutoryEffectiveDate to fit WaDE 2.0 structure
# Parse to datetime (values that cannot be parsed become NaT), then truncate
# each timestamp to midnight. normalize() does this directly, so no
# format-to-text-and-parse-again step is needed.
effectiveDate = pd.to_datetime(outdf['in_StatutoryEffectiveDate'], errors='coerce')
outdf['in_StatutoryEffectiveDate'] = effectiveDate.dt.normalize()
outdf['in_StatutoryEffectiveDate'].unique()

# Shapefile Data
- For attaching geometry to reporting unit info.

#### Regulatory Area #1 shapefile info

In [None]:
# use same input file as above...
gdfin1 = dfin1.copy()

# Realign Geometry Projection: reproject the geometry column to EPSG:4326
reprojected = gdfin1['geometry'].to_crs(epsg=4326)
gdfin1['geometry'] = reprojected
print(len(gdfin1))
gdfin1.head()

In [None]:
# plot shape info to map (visual check of the Regulatory Area #1 geometry)
gdfin1.plot()

In [None]:
# create geometry output for Regulatory Area #1
# in_ReportingUnitNativeID needs to match the ID built for the same area in the
# tabular output cell above.
# NOTE(review): the tabular cell builds in_ReportingUnitNativeID with the
# prefix "ut", while this cell uses "utr1", so the two IDs differ - confirm
# which prefix is intended.
columnsList = ['in_ReportingUnitNativeID', 'geometry']
nativeIds = "utr1" + gdfin1['AREA_CODE'].astype(str).str.strip()

goutdf1 = pd.DataFrame(columns=columnsList, index=gdfin1.index)
goutdf1['in_ReportingUnitNativeID'] = nativeIds
goutdf1['geometry'] = gdfin1['geometry']

# Drop exact duplicate rows and renumber.
goutdf1 = goutdf1.drop_duplicates()
goutdf1 = goutdf1.reset_index(drop=True)
print(len(goutdf1))
goutdf1.head()

#### Regulatory Area #2 shapefile info

In [None]:
# use same input file as above...
gdfin2 = dfin2.copy()

# Realign Geometry Projection: reproject the geometry column to EPSG:4326
reprojected = gdfin2['geometry'].to_crs(epsg=4326)
gdfin2['geometry'] = reprojected
print(len(gdfin2))
gdfin2.head()

In [None]:
# plot shape info to map (visual check of the Regulatory Area #2 geometry)
gdfin2.plot()

In [None]:
# create geometry output for Regulatory Area #2
# in_ReportingUnitNativeID needs to match the ID built for the same area in the
# tabular output cell above.
# NOTE(review): the tabular cell builds in_ReportingUnitNativeID with the
# prefix "ut", while this cell uses "utr2", so the two IDs differ - confirm
# which prefix is intended.
columnsList = ['in_ReportingUnitNativeID', 'geometry']
nativeIds = "utr2" + gdfin2['AREA_CODE'].astype(str).str.strip()

goutdf2 = pd.DataFrame(columns=columnsList, index=gdfin2.index)
goutdf2['in_ReportingUnitNativeID'] = nativeIds
goutdf2['geometry'] = gdfin2['geometry']

# Drop exact duplicate rows and renumber.
goutdf2 = goutdf2.drop_duplicates()
goutdf2 = goutdf2.reset_index(drop=True)
print(len(goutdf2))
goutdf2.head()

#### Regulatory Area #3 shapefile info

In [None]:
# use same input file as above...
gdfin3 = dfin3.copy()

# Realign Geometry Projection: reproject the geometry column to EPSG:4326
reprojected = gdfin3['geometry'].to_crs(epsg=4326)
gdfin3['geometry'] = reprojected
print(len(gdfin3))
gdfin3.head()

In [None]:
# plot shape info to map (visual check of the Regulatory Area #3 geometry)
gdfin3.plot()

In [None]:
# create geometry output for Regulatory Area #3
# in_ReportingUnitNativeID needs to match the ID built for the same area in the
# tabular output cell above.
# NOTE(review): the tabular cell builds in_ReportingUnitNativeID with the
# prefix "ut", while this cell uses "utr3", so the two IDs differ - confirm
# which prefix is intended.
columnsList = ['in_ReportingUnitNativeID', 'geometry']
nativeIds = "utr3" + gdfin3['AREA_CODE'].astype(str).str.strip()

goutdf3 = pd.DataFrame(columns=columnsList, index=gdfin3.index)
goutdf3['in_ReportingUnitNativeID'] = nativeIds
goutdf3['geometry'] = gdfin3['geometry']

# Drop exact duplicate rows and renumber.
goutdf3 = goutdf3.drop_duplicates()
goutdf3 = goutdf3.reset_index(drop=True)
print(len(goutdf3))
goutdf3.head()

#### Concatenate goutdf shapefile info into single output

In [None]:
# Concatenate the per-area geometry DataFrames into a single output
# NOTE(review): only areas #1-#3 are listed; no goutdf4 is built in the cells
# shown, so Regulatory Area #4 contributes no geometry - confirm this is intended.
frames = [goutdf1, goutdf2, goutdf3] # add geoutdf dataframes here
stackedGeometry = pd.concat(frames)
goutdf = stackedGeometry.reset_index(drop=True)

print(len(goutdf))
goutdf.head()

## Export Data

In [None]:
# List the dtype of every outdf column, with the row and column display limits lifted.
showEverything = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*showEverything):
    print(outdf.dtypes)

In [None]:
# List the dtype of every goutdf column, with the row and column display limits lifted.
showEverything = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*showEverything):
    print(goutdf.dtypes)

In [None]:
# Export out to CSV.
# Each frame is written as a CSV inside its own zip archive under RawInputData.
mainArchive = dict(method='zip', archive_name='Pre_Main.csv')
geometryArchive = dict(method='zip', archive_name='P_Geometry.csv')
outdf.to_csv('RawInputData/Pre_utMain.zip', compression=mainArchive, index=False)  # The output, save as a zip
goutdf.to_csv('RawInputData/P_Geometry.zip', compression=geometryArchive, index=False)  # The output geometry.