# Preprocessing Idaho Regulatory data for WaDE upload.
- Purpose:  To preprocess the Idaho data into one master file for simple DataFrame creation and extraction.

In [None]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd # the library that lets us read in shapefiles

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import date
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x) # suppress scientific notation in Pandas

In [None]:
# Working Directory
# Every relative path used by the cells below is resolved against this folder.
# NOTE(review): this changes into ".../Regulatory/RawInputData", yet several later
# cells open paths that themselves begin with "RawInputData/..." while the CAMP
# cells open bare file names -- confirm which base directory is intended.
workingDir = "G:/Shared drives/WaDE Data/Idaho/Regulatory/RawInputData"
os.chdir(workingDir)

# Inputs
- CAMP (three separate areas & inputs)
- Groundwater Districts
- Groundwater Management Areas
- Aquifer Recharge District 

### CAMP data

In [None]:
# Eastern Snake CAMP: read the boundary file and stamp it with the
# hand-assigned WaDE attributes for this plan area.
inputFile = "Eastern_Snake_Comprehensive_Aquifer_Management_Plan.zip"
gdf_es = gpd.read_file(inputFile).assign(
    # Data Assessment UUID
    WaDEUUID="wade1",
    # ReportingUnits
    in_ReportingUnitName="Eastern Snake",
    in_ReportingUnitNativeID="wade1",
    # RegulatoryOverlay
    in_RegulatoryName="Eastern Snake CAMP",
    in_RegulatoryOverlayNativeID="wade1",
    in_RegulatoryStatuteLink="https://idwr.idaho.gov/iwrb/water-planning/camps/espa/",
    in_StatutoryEffectiveDate="01/01/2009",
)

# Keep only the WaDE columns plus the geometry.
gdf_es = gdf_es[[
    'WaDEUUID',
    'in_ReportingUnitName',
    'in_ReportingUnitNativeID',
    'in_RegulatoryName',
    'in_RegulatoryOverlayNativeID',
    'in_RegulatoryStatuteLink',
    'in_StatutoryEffectiveDate',
    'geometry',
]]

print(len(gdf_es))
gdf_es.head(1)

In [None]:
# Rathdrum CAMP: read the boundary file and stamp it with the
# hand-assigned WaDE attributes for this plan area.
inputFile = "Rathdrum_Comprehensive_Aquifer_Management_Plan.zip"
gdf_r = gpd.read_file(inputFile).assign(
    # Data Assessment UUID
    WaDEUUID="wade2",
    # ReportingUnits
    in_ReportingUnitName="Rathdrum Prairie",
    in_ReportingUnitNativeID="wade2",
    # RegulatoryOverlay
    in_RegulatoryName="Rathdrum Prairie CAMP",
    in_RegulatoryOverlayNativeID="wade2",
    in_RegulatoryStatuteLink="https://idwr.idaho.gov/IWRB/water-planning/CAMPs/rathdrum-prairie/",
    in_StatutoryEffectiveDate="01/01/2009",
)

# Keep only the WaDE columns plus the geometry.
gdf_r = gdf_r[[
    'WaDEUUID',
    'in_ReportingUnitName',
    'in_ReportingUnitNativeID',
    'in_RegulatoryName',
    'in_RegulatoryOverlayNativeID',
    'in_RegulatoryStatuteLink',
    'in_StatutoryEffectiveDate',
    'geometry',
]]

print(len(gdf_r))
gdf_r.head()

In [None]:
# Treasure Valley CAMP: read the boundary file and stamp it with the
# hand-assigned WaDE attributes for this plan area.
inputFile = "Treasure_Valley_Comprehensive_Aquifer_Management_Plan.zip"
gdf_tv = gpd.read_file(inputFile).assign(
    # Data Assessment UUID
    WaDEUUID="wade3",
    # ReportingUnits
    in_ReportingUnitName="Treasure Valley",
    in_ReportingUnitNativeID="wade3",
    # RegulatoryOverlay
    in_RegulatoryName="Treasure Valley CAMP",
    in_RegulatoryOverlayNativeID="wade3",
    in_RegulatoryStatuteLink="https://idwr.idaho.gov/iwrb/water-planning/camps/treasure-valley/",
    in_StatutoryEffectiveDate="01/01/2009",
)

# Keep only the WaDE columns plus the geometry.
gdf_tv = gdf_tv[[
    'WaDEUUID',
    'in_ReportingUnitName',
    'in_ReportingUnitNativeID',
    'in_RegulatoryName',
    'in_RegulatoryOverlayNativeID',
    'in_RegulatoryStatuteLink',
    'in_StatutoryEffectiveDate',
    'geometry',
]]

print(len(gdf_tv))
gdf_tv.head(1)

In [None]:
# Concatenate the three CAMP input GeoDataFrames into one frame.
# The CAMP geometry is later labelled EPSG:4326 (in_EPSGCodeCV = "4326" and the
# from_wkt(..., crs="EPSG:4326") call in the CAMP plot cell), so reproject each
# layer to EPSG:4326 first, the same way the other shapefile layers are realigned.
# A layer with no CRS defined cannot be reprojected and is passed through as-is.
frames = [
    layer.to_crs(epsg=4326) if layer.crs is not None else layer
    for layer in (gdf_es, gdf_r, gdf_tv)
]
gdfIn = pd.concat(frames)
print(len(gdfIn))
gdfIn.head()

In [None]:
# Create the CAMP (Comprehensive Aquifer Management Plan) output dataframe.
# One row per input feature in gdfIn; constant WaDE attributes are broadcast to
# every row and the per-area attributes are copied from gdfIn.
df = pd.DataFrame(index=gdfIn.index)

# Data Assessment UUID
df['WaDEUUID'] = gdfIn['WaDEUUID']

# Date Info
# Include the year so in_Date has the same month/day/year shape as the
# hard-coded in_Date values used by the other output cells in this notebook.
df['in_Date'] = date.today().strftime('%m/%d/%Y')
df['in_Year'] = date.today().strftime('%Y')

# Organization
# NOTE(review): the other output cells use "IDre_RO1" -- confirm which
# organization UUID is the correct one.
df['in_OrganizationUUID'] = "IDre_O1"

# ReportingUnit Info
df['in_EPSGCodeCV'] = "4326"
df['in_ReportingUnitName'] = gdfIn['in_ReportingUnitName']
df['in_ReportingUnitNativeID'] = gdfIn['in_ReportingUnitNativeID']
df['in_ReportingUnitProductVersion'] = ""
df['in_ReportingUnitTypeCV'] = "Comprehensive Aquifer Management Plan"
df['in_ReportingUnitUpdateDate'] = ""
df['in_StateCV'] = "ID"
df['in_Geometry'] = gdfIn['geometry']

# RegulatoryOverlay Info
df['in_OversightAgency'] = "Idaho Department of Water Resources"

df['in_RegulatoryDescription'] = """The 2008 Legislature approved House Bill 428 and House Bill 644 establishing the Statewide Comprehensive Aquifer Planning and Management Program (Idaho Code § 42-1779) and the Aquifer Planning and Management Fund (Idaho Code § 42-1780). This legislation authorized characterization and planning efforts for ten different basins from 2008 through 2018.

The Comprehensive Aquifer Planning and Management Program was designed to provide IWRB and IDWR with the necessary information to develop plans for managing groundwater and surface water resources into the future. The program had two phases: a technical component to characterize the surface water and groundwater resources of each basin a planning component to integrate the technical knowledge with an assessment of current and projected future water uses and constraints.  At the culmination of this program, the intent was to develop long-range plans for conjunctively managing the water resources of each basin by integrating hydrologic realities with the social needs. These water management plans were designed to address water supply and demand issues looking out 50 years into the future, and investigated strategies which will lead to sustainable water supplies and optimum use of the water resources."""

df['in_RegulatoryName'] = gdfIn['in_RegulatoryName']
df['in_RegulatoryOverlayNativeID'] = gdfIn['in_RegulatoryOverlayNativeID']
df['in_RegulatoryStatusCV'] = "Active"
df['in_RegulatoryStatute'] = ""
df['in_RegulatoryStatuteLink'] = gdfIn['in_RegulatoryStatuteLink']
df['in_StatutoryEffectiveDate'] = gdfIn['in_StatutoryEffectiveDate']
df['in_StatutoryEndDate'] = ""
df['in_RegulatoryOverlayTypeCV'] = "Comprehensive Aquifer Management Plan"
df['in_WaterSourceTypeCV'] = "Groundwater"

# De-duplicate and renumber the rows before handing off.
df = df.drop_duplicates().reset_index(drop=True)
outdf = df.copy()
print(len(outdf))
outdf.head()

### Groundwater Districts data

In [None]:
# Input File: Groundwater Districts shapefile (NaN cells become empty strings)
inputFile = "RawInputData/shapefiles/Groundwater_Districts/Groundwater_Districts.shp"
dfin1 = gpd.read_file(inputFile, encoding = "ISO-8859-1").replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin1:
    dfin1['WaDEUUID'] = "re" + dfin1.index.astype(str)
    # Save the tracked copy NEXT TO the shapefile. Writing it to the .shp path
    # itself replaces the shapefile with a zipped CSV, so the same .shp can no
    # longer be read by the shapefile cell further down or on a re-run.
    dfin1.to_csv('RawInputData/shapefiles/Groundwater_Districts/Groundwater_Districts_WaDEUUID.zip', compression=dict(method='zip', archive_name='Groundwater_District.csv'), index=False)

print(len(dfin1))
dfin1.head()

In [None]:
# Build the Groundwater Districts output dataframe in a single constructor call.
# Series values come from dfin1 (and supply the row index); scalar values are
# broadcast to every row. Key order below is the output column order.
df = pd.DataFrame({
    # Data Assessment UUID
    'WaDEUUID': dfin1['WaDEUUID'],

    # Date Info
    'in_Date': "9/29/2023",
    'in_Year': "2023",

    # Organization Info
    'in_OrganizationUUID': "IDre_RO1",

    # ReportingUnit Info
    'in_EPSGCodeCV': "4326",
    'in_ReportingUnitName': dfin1["NAME"],
    'in_ReportingUnitNativeID': "GWD" + dfin1["OBJECTID"].astype(str),
    'in_ReportingUnitProductVersion': "",
    'in_ReportingUnitTypeCV': "Groundwater Districts",
    'in_ReportingUnitUpdateDate': "",
    'in_StateCV': "ID",
    'in_Geometry': "",

    # RegulatoryOverlay Info
    'in_OversightAgency': "Idaho Department of Water Resources",
    'in_RegulatoryDescription': "manages water via water allocation and distribution processes",
    'in_RegulatoryName': dfin1["NAME"],
    'in_RegulatoryOverlayNativeID': "GWD" + dfin1["OBJECTID"].astype(str),
    'in_RegulatoryStatusCV': "Active",
    'in_RegulatoryStatute': "",
    'in_RegulatoryStatuteLink': "",
    'in_StatutoryEffectiveDate': "1/1/1995",
    'in_StatutoryEndDate': "",
    'in_RegulatoryOverlayTypeCV': "Groundwater Districts",
    'in_WaterSourceTypeCV': "Groundwater",
})

# Remove exact duplicate rows, renumber, and keep a snapshot for the final concat.
df = df.drop_duplicates()
df = df.reset_index(drop=True)
outdf1 = df.copy()
print(len(outdf1))
outdf1.head()

### Groundwater Management Areas data

In [None]:
# Input File: Groundwater Management Areas shapefile (NaN cells become empty strings)
inputFile = "RawInputData/shapefiles/Groundwater_Management_Areas/Groundwater_Management_Areas.shp"
dfin2 = gpd.read_file(inputFile, encoding = "ISO-8859-1").replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin2:
    dfin2['WaDEUUID'] = "re" + dfin2.index.astype(str)
    # Save the tracked copy NEXT TO the shapefile. Writing it to the .shp path
    # itself replaces the shapefile with a zipped CSV, so the same .shp can no
    # longer be read by the shapefile cell further down or on a re-run.
    dfin2.to_csv('RawInputData/shapefiles/Groundwater_Management_Areas/Groundwater_Management_Areas_WaDEUUID.zip', compression=dict(method='zip', archive_name='Groundwater_Management_Areas.csv'), index=False)

print(len(dfin2))
dfin2.head()

In [None]:
# Create the output dataframe for the Groundwater Management Areas layer.
# Per-row values come from dfin2; every other column is a constant.
df = pd.DataFrame()

# Data Assessment UUID
# Must come from dfin2 (this layer). Taking it from dfin1 gives this frame the
# Groundwater Districts row index, so the dfin2 columns below align to the
# wrong set of rows.
df['WaDEUUID'] = dfin2['WaDEUUID']

# Date Info
df['in_Date'] = "10/2/2023"
df['in_Year'] = "2023"

# Organization Info
df['in_OrganizationUUID'] = "IDre_RO1"

# ReportingUnit Info
df['in_EPSGCodeCV'] = "4326"
df['in_ReportingUnitName'] = dfin2["NAME"]
df['in_ReportingUnitNativeID'] = "GWMA" + dfin2["OBJECTID"].astype(str)
df['in_ReportingUnitProductVersion'] = ""
df['in_ReportingUnitTypeCV'] = "Groundwater Management Areas"
df['in_ReportingUnitUpdateDate'] = "4/27/2023"
df['in_StateCV'] = "ID"
df['in_Geometry'] = ""

# RegulatoryOverlay Info
df['in_OversightAgency'] = "Idaho Department of Water Resources"
df['in_RegulatoryDescription'] = "manages water via water allocation and distribution processes"
df['in_RegulatoryName'] = dfin2["NAME"]
df['in_RegulatoryOverlayNativeID'] = "GWMA" + dfin2["OBJECTID"].astype(str)
df['in_RegulatoryStatusCV'] = "Active"
df['in_RegulatoryStatute'] = ""
df['in_RegulatoryStatuteLink'] =""
df['in_StatutoryEffectiveDate'] = "1/1/1982"
df['in_StatutoryEndDate'] = ""
df['in_RegulatoryOverlayTypeCV'] = "Groundwater Management Areas"
df['in_WaterSourceTypeCV'] = "Groundwater"

# De-duplicate and renumber the rows before handing off.
df = df.drop_duplicates().reset_index(drop=True)
outdf2 = df.copy()
print(len(outdf2))
outdf2.head()

### Aquifer Recharge District data

In [None]:
# Input File: Lower Snake River Aquifer Recharge District shapefile (NaN cells become empty strings)
inputFile = "RawInputData/shapefiles/Lower_Snake_River_Aquifer_Recharge_District/Lower_Snake_River_Aquifer_Recharge_District.shp"
dfin3 = gpd.read_file(inputFile, encoding = "ISO-8859-1").replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin3:
    dfin3['WaDEUUID'] = "re" + dfin3.index.astype(str)
    # Save the tracked copy NEXT TO the shapefile. Writing it to the .shp path
    # itself replaces the shapefile with a zipped CSV, so the same .shp can no
    # longer be read by the shapefile cell further down or on a re-run.
    dfin3.to_csv('RawInputData/shapefiles/Lower_Snake_River_Aquifer_Recharge_District/Lower_Snake_River_Aquifer_Recharge_District_WaDEUUID.zip', compression=dict(method='zip', archive_name='Lower_Snake_River_Aquifer_Recharge_District.csv'), index=False)

# Create Name Column
# A scalar is broadcast to every row; a one-element list only works when the
# layer holds exactly one feature and raises a length mismatch otherwise.
dfin3["NAME"] = 'Lower Snake River Aquifer Recharge District'

print(len(dfin3))
dfin3.head()

In [None]:
# Build the Aquifer Recharge District output dataframe in a single constructor call.
# Series values come from dfin3 (and supply the row index); scalar values are
# broadcast to every row. Key order below is the output column order.
df = pd.DataFrame({
    # Data Assessment UUID
    'WaDEUUID': dfin3['WaDEUUID'],

    # Date Info
    'in_Date': "10/2/2023",
    'in_Year': "2023",

    # Organization Info
    'in_OrganizationUUID': "IDre_RO1",

    # ReportingUnit Info
    'in_EPSGCodeCV': "4326",
    'in_ReportingUnitName': dfin3["NAME"],
    'in_ReportingUnitNativeID': "ARD" + dfin3["OBJECTID"].astype(str),
    'in_ReportingUnitProductVersion': "",
    'in_ReportingUnitTypeCV': "Aquifer Recharge Districts",
    'in_ReportingUnitUpdateDate': "4/27/2023",
    'in_StateCV': "ID",
    'in_Geometry': "",

    # RegulatoryOverlay Info
    'in_OversightAgency': "Idaho Department of Water Resources",
    'in_RegulatoryDescription': "manages water via water allocation and distribution processes",
    'in_RegulatoryName': dfin3["NAME"],
    'in_RegulatoryOverlayNativeID': "ARD" + dfin3["OBJECTID"].astype(str),
    'in_RegulatoryStatusCV': "Active",
    'in_RegulatoryStatute': "",
    'in_RegulatoryStatuteLink': "",
    'in_StatutoryEffectiveDate': "7/1/1978",
    'in_StatutoryEndDate': "",
    'in_RegulatoryOverlayTypeCV': "Aquifer Recharge Districts",
    'in_WaterSourceTypeCV': "Groundwater",
})

# Remove exact duplicate rows, renumber, and keep a snapshot for the final concat.
df = df.drop_duplicates()
df = df.reset_index(drop=True)
outdf3 = df.copy()
print(len(outdf3))
outdf3.head()


## Concatenate DataFrames together

In [None]:
# Concatenate dataframes
# `outdf` on the right-hand side is the CAMP output built in the CAMP section.
# It has to be part of the list: rebinding `outdf` to only outdf1..outdf3
# discards the CAMP rows, so they never reach the export and the CAMP shape
# cell (which filters on a non-empty in_Geometry) ends up with nothing.
# NOTE: run the CAMP output cell before re-running this one so `outdf` starts
# from the CAMP rows again.
frames = [outdf, outdf1, outdf2, outdf3] # list all out dataframes here
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))
outdf

## Clean Data / data types

In [None]:
# Ensure Empty String

def ensureEmptyString(val):
    """Return *val* as a stripped string, mapping missing values to "".

    Missing means: a null scalar (None, NaN, NaT, pd.NA), an empty or
    whitespace-only string, or the literal text "nan". Any other value is
    converted with str() and stripped of surrounding whitespace.
    """
    # Test for nulls BEFORE converting: str(None) is "None" and str(pd.NaT) is
    # "NaT", so a null check made after str() can never succeed. is_scalar
    # keeps pd.isnull from returning an array for list-like input.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return ""
    text = str(val).strip()
    if text == "" or text == "nan":
        return ""
    return text

In [None]:
# Clean the column from outdf itself. Applying over `df` (the frame left over
# from the last per-layer output cell) aligns a different, shorter frame onto
# outdf by index and overwrites every non-matching row with NaN.
outdf['in_ReportingUnitName'] = outdf['in_ReportingUnitName'].apply(ensureEmptyString)
outdf['in_ReportingUnitName'].unique()

In [None]:
# Clean the column from outdf itself. Applying over `df` (the frame left over
# from the last per-layer output cell) aligns a different, shorter frame onto
# outdf by index and overwrites every non-matching row with NaN.
outdf['in_RegulatoryDescription'] = outdf['in_RegulatoryDescription'].apply(ensureEmptyString)
outdf['in_RegulatoryDescription'].unique()

In [None]:
# Clean the column from outdf itself. Applying over `df` (the frame left over
# from the last per-layer output cell) aligns a different, shorter frame onto
# outdf by index and overwrites every non-matching row with NaN.
outdf['in_RegulatoryName'] = outdf['in_RegulatoryName'].apply(ensureEmptyString)
outdf['in_RegulatoryName'].unique()

In [None]:
# Convert StatutoryEffectiveDate to a datetime column for the WaDE 2.0 structure.
# Unparseable values are coerced to NaT, the dates are rendered as mm/dd/YYYY
# text, and that text is parsed back into datetimes.
outdf['in_StatutoryEffectiveDate'] = pd.to_datetime(
    pd.to_datetime(outdf['in_StatutoryEffectiveDate'], errors = 'coerce').dt.strftime('%m/%d/%Y')
)
outdf['in_StatutoryEffectiveDate'].unique()

# Shapefile Data

### CAMP data

In [None]:
# The CAMP geometry is already carried in the attribute table (in_Geometry),
# so start the shape work from a copy of outdf instead of re-reading a file.
df_shape_out = outdf.copy()
print(len(df_shape_out))
df_shape_out.head()

In [None]:
# plot
# Keep only the rows that actually carry geometry (other rows hold "").
df_shape_out = df_shape_out[df_shape_out['in_Geometry'] != ""].reset_index(drop=True)
# Rebuild geometry objects from the text form of in_Geometry, tagged as EPSG:4326.
df_shape_out['Geometry'] = gpd.GeoSeries.from_wkt(df_shape_out['in_Geometry'].astype(str), crs="EPSG:4326")
gdf_shape_out = gpd.GeoDataFrame(df_shape_out, geometry=df_shape_out['Geometry'], crs="EPSG:4326") # convert to geodataframe
gdf_shape_out.plot()

### Groundwater Districts data

In [None]:
# Input File
shapeInputFile = "RawInputData/shapefiles/Groundwater_Districts/Groundwater_Districts.shp"
gdfin1 = gpd.read_file(shapeInputFile)

# Realign Geometry Projection
# Reproject the frame that was just read. Borrowing the geometry column from
# dfin1 couples this cell to a frame whose NaN values were replaced with "" and
# relies on the two frames having identical row order.
gdfin1 = gdfin1.to_crs(epsg=4326)

print(len(gdfin1))
gdfin1.head()

In [None]:
# plot shape info to map (visual check of the Groundwater Districts geometry)
gdfin1.plot()

In [None]:
# Build the geometry lookup table for Groundwater Districts:
# one row per feature, keyed by in_ReportingUnitNativeID.
df = pd.DataFrame()

columnsList = ['in_ReportingUnitNativeID', 'geometry']
goutdf1 = pd.DataFrame(index=gdfin1.index, columns=columnsList)

goutdf1['geometry'] = gdfin1['geometry']
# The native ID has to be built exactly as in the matching attribute dataframe above.
goutdf1['in_ReportingUnitNativeID'] = "GWD" + gdfin1["OBJECTID"].astype(str)

goutdf1 = goutdf1.drop_duplicates()
goutdf1 = goutdf1.reset_index(drop=True)
print(len(goutdf1))
goutdf1.head()

### Groundwater Management Areas data

In [None]:
# Input File
shapeInputFile = "RawInputData/shapefiles/Groundwater_Management_Areas/Groundwater_Management_Areas.shp"
gdfin2 = gpd.read_file(shapeInputFile)

# Realign Geometry Projection
# Reproject the frame that was just read. Borrowing the geometry column from
# dfin2 couples this cell to a frame whose NaN values were replaced with "" and
# relies on the two frames having identical row order.
gdfin2 = gdfin2.to_crs(epsg=4326)

print(len(gdfin2))
gdfin2.head()

In [None]:
# plot shape info to map (visual check of the Groundwater Management Areas geometry)
gdfin2.plot()

In [None]:
# Build the geometry lookup table for Groundwater Management Areas:
# one row per feature, keyed by in_ReportingUnitNativeID.
df = pd.DataFrame()

columnsList = ['in_ReportingUnitNativeID', 'geometry']
goutdf2 = pd.DataFrame(index=gdfin2.index, columns=columnsList)

goutdf2['geometry'] = gdfin2['geometry']
# The native ID has to be built exactly as in the matching attribute dataframe above.
goutdf2['in_ReportingUnitNativeID'] = "GWMA" + gdfin2["OBJECTID"].astype(str)

goutdf2 = goutdf2.drop_duplicates()
goutdf2 = goutdf2.reset_index(drop=True)
print(len(goutdf2))
goutdf2.head()

### Aquifer Recharge District data

In [None]:
# Input File
shapeInputFile = "RawInputData/shapefiles/Lower_Snake_River_Aquifer_Recharge_District/Lower_Snake_River_Aquifer_Recharge_District.shp"
gdfin3 = gpd.read_file(shapeInputFile)

# Realign Geometry Projection
# Reproject the frame that was just read. Borrowing the geometry column from
# dfin3 couples this cell to a frame whose NaN values were replaced with "" and
# relies on the two frames having identical row order.
gdfin3 = gdfin3.to_crs(epsg=4326)

print(len(gdfin3))
gdfin3.head()

In [None]:
# plot shape info to map (visual check of the Aquifer Recharge District geometry)
gdfin3.plot()

In [None]:
# Build the geometry lookup table for the Aquifer Recharge District:
# one row per feature, keyed by in_ReportingUnitNativeID.
df = pd.DataFrame()

columnsList = ['in_ReportingUnitNativeID', 'geometry']
# Index on gdfin3 (this layer). Indexing on gdfin2 creates a row for every
# Groundwater Management Area; the rows with no gdfin3 match stay empty and one
# of them survives drop_duplicates as a junk record.
goutdf3 = pd.DataFrame(columns=columnsList, index=gdfin3.index)

goutdf3['in_ReportingUnitNativeID'] = "ARD" + gdfin3["OBJECTID"].astype(str)  # in_ReportingUnitNativeID needs to match source from above equivalent dataframe
goutdf3['geometry'] = gdfin3['geometry']
goutdf3 = goutdf3.drop_duplicates().reset_index(drop=True)
print(len(goutdf3))
goutdf3.head()

### Concatenate goutdf shapefile info into single output

In [None]:
# Stack the per-layer geometry tables into one output table with a fresh 0..n-1 index.
frames = [goutdf1, goutdf2, goutdf3] # add goutdf dataframes here
goutdf = pd.concat(frames, ignore_index=True)

print(len(goutdf))
goutdf.head()

### Inspect Output Data & Export

In [None]:
# Show the dtype of every outdf column; the display limits are lifted only inside this block.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(outdf.dtypes)

In [None]:
# Show the dtype of every goutdf column; the display limits are lifted only inside this block.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(goutdf.dtypes)

In [None]:
# Export out to CSV.
# Each frame is written as a single CSV inside a zip archive, without the index column.
# NOTE(review): these paths are relative to the directory set by os.chdir at the top
# of the notebook, which already ends in "RawInputData" -- confirm the intended folder.
outdf.to_csv('RawInputData/Pre_Main.zip', compression=dict(method='zip', archive_name='Pre_Main.csv'), index=False)  # The output, save as a zip
goutdf.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.