# Pre-processing Overlay data for WaDE upload.
- Purpose: To preprocess state overlay data into one main file for simple DataFrame creation and extraction.

In [None]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
# Notebook display settings: show up to 999 columns and print floats with five decimals (no scientific notation).
pd.options.display.max_columns = 999
pd.options.display.float_format = lambda value: '%.5f' % value

In [None]:
# ---- working directory ----
# The relative paths used by the cells below are resolved against this folder.
workingDirString = "G:/Shared drives/WaDE Data/WaDE Data Folder/Oregon/Overlays"  # set working directory folder string here
os.chdir(workingDirString)
print('The working Directory is:', workingDirString)

## Overlay Area Data #1
- Administrative Basins

In [None]:
# Input File
inputFile = "RawInputData/shapefiles/oregon-water-resources-department-owrd-administrative-basins.zip"
dfin1 = gpd.read_file(inputFile)
dfin1 = dfin1.replace(np.nan, "")  # missing values become empty strings
dfin1['geometry'] = dfin1['geometry'].to_crs(epsg=4326)  # Realign Geometry Projection

# WaDE UUID tracker for data assessment:
# when the input has no WaDEUUID column, tag each row with "ov" + row index and save a zipped CSV copy.
if 'WaDEUUID' not in dfin1.columns:
    dfin1['WaDEUUID'] = "ov" + dfin1.index.astype(str)
    zipOptions = dict(method='zip', archive_name='oregon-water-resources-department-owrd-administrative-basins.csv')
    dfin1.to_csv('RawInputData/oregon-water-resources-department-owrd-administrative-basins.zip',
                 compression=zipOptions,
                 index=False)

print(len(dfin1))
dfin1.head(1)

In [None]:
# Assemble the WaDE overlay input table for area #1 (Administrative Basins)
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin1['WaDEUUID']

# Values taken from the source file that feed more than one output column
basinName = dfin1["BASIN_NAME"]
basinNativeID = "orAB" + dfin1["BASIN_NUM"].astype(str)

# Remaining columns, listed in output order. Plain strings are repeated on every row.
columnValues = {
    # Date Info
    'in_Date': "10/29/2024",
    'in_Year': "2024",

    # Organization Info
    'in_OrganizationUUID': "ORov_O1",

    # ReportingUnit Info
    'in_EPSGCodeCV': "4326",
    'in_ReportingUnitName': basinName,
    'in_ReportingUnitNativeID': basinNativeID,
    'in_ReportingUnitProductVersion': "9.6",
    'in_ReportingUnitTypeCV': "Administrative Basins",
    'in_ReportingUnitUpdateDate': "9/22/2021",
    'in_StateCV': "OR",
    'in_Geometry': dfin1['geometry'],

    # RegulatoryOverlay Info
    'in_OversightAgency': "Oregon Water Resources Department",
    'in_RegulatoryDescription': "Administrative rules which establish water management policies and objectives and which govern the appropriation and use of the surface and ground water.",
    'in_RegulatoryName': basinName,
    'in_RegulatoryOverlayNativeID': basinNativeID,
    'in_RegulatoryStatusCV': "Active",
    'in_RegulatoryStatute': "",
    'in_RegulatoryStatuteLink': "https://www.oregon.gov/owrd/programs/administrativebasins/pages/default.aspx",
    'in_StatutoryEffectiveDate': "1993-10-07",
    'in_StatutoryEndDate': "",
    'in_RegulatoryOverlayTypeCV': "Administrative Basins",
    'in_WaterSourceTypeCV': "Surface and Groundwater",
}
for columnName, columnValue in columnValues.items():
    df[columnName] = columnValue

# De-duplicate, renumber the rows, and keep a copy for the concatenation step
df = df.drop_duplicates().reset_index(drop=True)
outdf1 = df.copy()
print(len(outdf1))
outdf1.head()

## Overlay Area Data #2
- Restricted Groundwater Areas

In [None]:
# Input File
inputFile = "RawInputData/shapefiles/GW_Restricted_Areas.zip"
dfin2 = gpd.read_file(inputFile)
dfin2 = dfin2.replace(np.nan, "")  # missing values become empty strings
dfin2['geometry'] = dfin2['geometry'].to_crs(epsg=4326)  # Realign Geometry Projection

# WaDE UUID tracker for data assessment:
# when the input has no WaDEUUID column, tag each row with "ov" + row index and save a zipped CSV copy.
if 'WaDEUUID' not in dfin2.columns:
    dfin2['WaDEUUID'] = "ov" + dfin2.index.astype(str)
    zipOptions = dict(method='zip', archive_name='GW_Restricted_Areas.csv')
    dfin2.to_csv('RawInputData/GW_Restricted_Areas.zip',
                 compression=zipOptions,
                 index=False)

print(len(dfin2))
dfin2.head(1)

In [None]:
# Assemble the WaDE overlay input table for area #2 (Restricted Groundwater Areas)
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin2['WaDEUUID']

# Source column that feeds both the reporting unit name and the regulatory name
areaName = dfin2["gwra_area_"]

# Remaining columns, listed in output order. Plain strings are repeated on every row.
columnValues = {
    # Date Info
    'in_Date': "10/29/2024",
    'in_Year': "2024",

    # Organization Info
    'in_OrganizationUUID': "ORov_O1",

    # ReportingUnit Info
    'in_EPSGCodeCV': "4326",
    'in_ReportingUnitName': areaName,
    'in_ReportingUnitNativeID': "",  # left blank here; the WaDE custom ID cell further down fills blank IDs
    'in_ReportingUnitProductVersion': "",
    'in_ReportingUnitTypeCV': "Groundwater Restricted Areas",
    'in_ReportingUnitUpdateDate': "07/23/2023",
    'in_StateCV': "OR",
    'in_Geometry': dfin2['geometry'],

    # RegulatoryOverlay Info
    'in_OversightAgency': "Oregon Water Resources Department",
    'in_RegulatoryDescription': "To protect existing water rights by preventing excessive groundwater declines, restoring aquifer stability, and preserving aquifers with limited storage capacity for designated high public value uses.",
    'in_RegulatoryName': areaName,
    'in_RegulatoryOverlayNativeID': "",  # left blank here; the WaDE custom ID cell further down fills blank IDs
    'in_RegulatoryStatusCV': dfin2["gwra_statu"],
    'in_RegulatoryStatute': "",
    'in_RegulatoryStatuteLink': dfin2["source_lin"],
    'in_StatutoryEffectiveDate': dfin2["effective_"],
    'in_StatutoryEndDate': "",
    'in_RegulatoryOverlayTypeCV': "Groundwater Restricted Areas",
    'in_WaterSourceTypeCV': "Groundwater",
}
for columnName, columnValue in columnValues.items():
    df[columnName] = columnValue

# De-duplicate, renumber the rows, and keep a copy for the concatenation step
df = df.drop_duplicates().reset_index(drop=True)
outdf2 = df.copy()
print(len(outdf2))
outdf2.head(3)

## Concatenate DataFrames together

In [None]:
# Stack the per-area output tables into one frame
frames = [outdf1, outdf2]  # list all out dataframes here
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates()
outdf = outdf.reset_index(drop=True)
outdf = outdf.replace(np.nan, "")  # missing values become empty strings
print(len(outdf))

## Clean Data / data types

In [None]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return ``val`` as trimmed text, mapping missing or blank values to "".

    Args:
        val: Any cell value (string, number, None, NaN, ...).

    Returns:
        str: "" when ``val`` is a missing scalar (None, NaN, NaT), is blank
        after trimming, or is the literal text "nan"; otherwise ``str(val)``
        with surrounding whitespace removed.
    """
    # Test for missing values BEFORE converting to text: once stringified,
    # None becomes "None" and can no longer be recognised as missing.
    # is_scalar() guards against list-like input, for which isnull() returns an array.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return ""
    text = str(val).strip()
    if text == "" or text == "nan":
        return ""
    return text

In [None]:
# Normalize in_ReportingUnitName: trimmed text, with blank / "nan" entries turned into empty strings
outdf['in_ReportingUnitName'] = outdf['in_ReportingUnitName'].apply(ensureEmptyString)
outdf['in_ReportingUnitName'].unique()

In [None]:
# Normalize in_RegulatoryDescription: trimmed text, with blank / "nan" entries turned into empty strings
outdf['in_RegulatoryDescription'] = outdf['in_RegulatoryDescription'].apply(ensureEmptyString)
outdf['in_RegulatoryDescription'].unique()

In [None]:
# Normalize in_RegulatoryName: trimmed text, with blank / "nan" entries turned into empty strings
outdf['in_RegulatoryName'] = outdf['in_RegulatoryName'].apply(ensureEmptyString)
outdf['in_RegulatoryName'].unique()

In [None]:
# Update datatype of StatutoryEffectiveDate to fit WaDE 2.0 structure
# Step 1: parse the column to datetimes; values that cannot be parsed (such as empty strings) become NaT.
# NOTE(review): this column mixes the literal "1993-10-07" with values read from dfin2["effective_"].
# If those use a different date format, newer pandas versions may coerce them to NaT here --
# confirm against the .unique() output below.
outdf['in_StatutoryEffectiveDate'] = pd.to_datetime(outdf['in_StatutoryEffectiveDate'], errors = 'coerce')
# Step 2: render as MM/DD/YYYY text and parse again, so the column ends up as a datetime column.
outdf['in_StatutoryEffectiveDate'] = pd.to_datetime(outdf['in_StatutoryEffectiveDate'].dt.strftime('%m/%d/%Y'))
# Display the distinct values for a visual check.
outdf['in_StatutoryEffectiveDate'].unique()

In [None]:
# Creating WaDE Custom Reporting Unit Area native ID for easy area identification
# use Unique ReportingUnitName and ReportingUnitTypeCV
# ----------------------------------------------------------------------------------------------------

# Create temp in_ReportingUnitNativeID dataframe of unique reporting unit name / type pairs.
def assignIdValueFunc(colRowValue):
    """Build a WaDE custom ID: the text "wadeId" followed by ``str(colRowValue)``.

    Args:
        colRowValue: Value to append (the callers pass a running count).

    Returns:
        str: e.g. "wadeId1" for an input of 1.
    """
    return f"wadeId{colRowValue!s}"

# One row per distinct (reporting unit name, reporting unit type) pair, compared as trimmed text
dfTempID = pd.DataFrame({
    'in_ReportingUnitName': outdf['in_ReportingUnitName'].astype(str).str.strip(),
    'in_ReportingUnitTypeCV': outdf['in_ReportingUnitTypeCV'].astype(str).str.strip(),
}).drop_duplicates()

# Number the pairs 1..N and turn each number into a "wadeId<N>" string
dfTempCount = pd.DataFrame({"Count": range(1, len(dfTempID) + 1)}, index=dfTempID.index)
dfTempID['in_ReportingUnitNativeID'] = dfTempCount["Count"].map(assignIdValueFunc)

# Lookup table: name + type (concatenated with no separator) -> generated ID
dfTempID['linkKey'] = dfTempID['in_ReportingUnitName'] + dfTempID['in_ReportingUnitTypeCV']
IdDict = dict(zip(dfTempID['linkKey'], dfTempID['in_ReportingUnitNativeID']))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE custom reporting unit native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    """Return the existing native ID, or the generated WaDE ID when it is blank.

    Args:
        checkVal: Current native ID value; compared as trimmed text.
        valA: First half of the lookup key (trimmed text).
        valB: Second half of the lookup key (trimmed text).

    Returns:
        str: ``checkVal`` as trimmed text when it is not empty; otherwise the
        entry of the module-level ``IdDict`` stored under ``valA + valB``.

    Raises:
        KeyError: If ``checkVal`` is blank and the key is not in ``IdDict``.
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    return IdDict[str(valA).strip() + str(valB).strip()]

# Keep native IDs that are already set; fill blank ones with the generated WaDE ID for the row's name + type
outdf['in_ReportingUnitNativeID'] = [
    retrieveIdValueFunc(nativeId, unitName, unitType)
    for nativeId, unitName, unitType in zip(outdf['in_ReportingUnitNativeID'],
                                            outdf['in_ReportingUnitName'],
                                            outdf['in_ReportingUnitTypeCV'])
]
outdf['in_ReportingUnitNativeID'].unique()

In [None]:
# Creating WaDE Custom Regulatory Overlay Native ID for easy area identification
# use Unique RegulatoryName and RegulatoryOverlayTypeCV
# ----------------------------------------------------------------------------------------------------

# Create temp in_RegulatoryOverlayNativeID dataframe of unique regulatory name / overlay type pairs.
def assignIdValueFunc(colRowValue):
    """Build a WaDE custom ID: the text "wadeId" followed by ``str(colRowValue)``.

    Args:
        colRowValue: Value to append (the callers pass a running count).

    Returns:
        str: e.g. "wadeId1" for an input of 1.
    """
    return "".join(["wadeId", str(colRowValue)])

# One row per distinct (regulatory name, regulatory overlay type) pair, compared as trimmed text
dfTempID = pd.DataFrame({
    'in_RegulatoryName': outdf['in_RegulatoryName'].astype(str).str.strip(),
    'in_RegulatoryOverlayTypeCV': outdf['in_RegulatoryOverlayTypeCV'].astype(str).str.strip(),
}).drop_duplicates()

# Number the pairs 1..N and turn each number into a "wadeId<N>" string
dfTempCount = pd.DataFrame({"Count": range(1, len(dfTempID) + 1)}, index=dfTempID.index)
dfTempID['in_RegulatoryOverlayNativeID'] = dfTempCount["Count"].map(assignIdValueFunc)

# Lookup table: name + overlay type (concatenated with no separator) -> generated ID
dfTempID['linkKey'] = dfTempID['in_RegulatoryName'] + dfTempID['in_RegulatoryOverlayTypeCV']
IdDict = dict(zip(dfTempID['linkKey'], dfTempID['in_RegulatoryOverlayNativeID']))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE custom regulatory overlay native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    """Return the existing native ID, or the generated WaDE ID when it is blank.

    Args:
        checkVal: Current native ID value; compared as trimmed text.
        valA: First half of the lookup key (trimmed text).
        valB: Second half of the lookup key (trimmed text).

    Returns:
        str: ``checkVal`` as trimmed text when it is not empty; otherwise the
        entry of the module-level ``IdDict`` stored under ``valA + valB``.

    Raises:
        KeyError: If ``checkVal`` is blank and the key is not in ``IdDict``.
    """
    currentId = str(checkVal).strip()
    if currentId:
        return currentId
    lookupKey = "".join(str(part).strip() for part in (valA, valB))
    return IdDict[lookupKey]

# Keep native IDs that are already set; fill blank ones with the generated WaDE ID for the row's name + overlay type
outdf['in_RegulatoryOverlayNativeID'] = [
    retrieveIdValueFunc(nativeId, regulatoryName, overlayType)
    for nativeId, regulatoryName, overlayType in zip(outdf['in_RegulatoryOverlayNativeID'],
                                                     outdf['in_RegulatoryName'],
                                                     outdf['in_RegulatoryOverlayTypeCV'])
]
outdf['in_RegulatoryOverlayNativeID'].unique()

# Geometry Data
- For attaching geometry to overlay reporting unit area info.

#### Overlay Area #1 shapefile info

In [None]:
# Rows for overlay area #1, taken from the cleaned table above instead of re-reading the input file
isArea1 = outdf['in_ReportingUnitTypeCV'] == "Administrative Basins"
gdfin1 = outdf.loc[isArea1].copy()
gdfin1 = gpd.GeoDataFrame(gdfin1, geometry=gdfin1['in_Geometry'], crs="EPSG:4326")  # convert to geodataframe
print(len(gdfin1))
gdfin1.head()

In [None]:
# plot shape info to map (visual check of the overlay area #1 geometries)
gdfin1.plot()

In [None]:
# create geometry output for Regulatory Area #1 dataframe: one row per (native ID, geometry) pair
df = pd.DataFrame()  # resets the shared scratch frame; df is not used again in this cell

columnsList = ['in_ReportingUnitNativeID', 'geometry']
goutdf1 = pd.DataFrame(columns=columnsList, index=gdfin1.index)  # empty frame aligned to gdfin1's rows

goutdf1['in_ReportingUnitNativeID'] =  gdfin1["in_ReportingUnitNativeID"].astype(str)  # in_ReportingUnitNativeID needs to match the value in the main output dataframe above
goutdf1['geometry'] = gdfin1['geometry']
goutdf1 = goutdf1.drop_duplicates().reset_index(drop=True)  # drop repeated rows, renumber from 0
print(len(goutdf1))
goutdf1.head()

#### Overlay Area #2 shapefile info

In [None]:
# Rows for overlay area #2, taken from the cleaned table above instead of re-reading the input file
isArea2 = outdf['in_ReportingUnitTypeCV'] == "Groundwater Restricted Areas"
gdfin2 = outdf.loc[isArea2].copy()
gdfin2 = gpd.GeoDataFrame(gdfin2, geometry=gdfin2['in_Geometry'], crs="EPSG:4326")  # convert to geodataframe
print(len(gdfin2))
gdfin2.head()

In [None]:
# plot shape info to map (visual check of the overlay area #2 geometries)
gdfin2.plot()

In [None]:
# create geometry output for Regulatory Area #2 dataframe: one row per (native ID, geometry) pair
df = pd.DataFrame()  # resets the shared scratch frame; df is not used again in this cell

columnsList = ['in_ReportingUnitNativeID', 'geometry']
goutdf2 = pd.DataFrame(columns=columnsList, index=gdfin2.index)  # empty frame aligned to gdfin2's rows

goutdf2['in_ReportingUnitNativeID'] =  gdfin2["in_ReportingUnitNativeID"].astype(str)  # in_ReportingUnitNativeID needs to match the value in the main output dataframe above
goutdf2['geometry'] = gdfin2['geometry']
goutdf2 = goutdf2.drop_duplicates().reset_index(drop=True)  # drop repeated rows, renumber from 0
print(len(goutdf2))
goutdf2.head()

#### Concatenate goutdf shapefile info into single output

In [None]:
# Combine the per-area geometry tables into a single output
frames = [goutdf1, goutdf2]  # add geoutdf dataframes here
goutdf = pd.concat(frames)
goutdf = goutdf.reset_index(drop=True)

print(len(goutdf))
goutdf.head()

## Export Data

In [None]:
# Remove the geometry column from the main table before export; geometry is written out separately from goutdf.
try:
    outdf = outdf.drop(['in_Geometry'], axis=1)
except KeyError:
    # drop() raises KeyError when the column is absent (e.g. this cell was already run once).
    # Only that case is handled, so any other error still surfaces.
    print("No geometry to drop")

# Print the dtype of every column without pandas truncating the listing
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(outdf.dtypes)

In [None]:
# Print the dtype of every geometry-output column without pandas truncating the listing
showEverything = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*showEverything):
    print(goutdf.dtypes)

In [None]:
# Export out to CSV. Each table is written as a single CSV inside its own zip archive.
mainZip = dict(method='zip', archive_name='Pov_Main.csv')
outdf.to_csv('RawInputData/Pov_Main.zip', compression=mainZip, index=False)  # The output, save as a zip

geometryZip = dict(method='zip', archive_name='P_Geometry.csv')
goutdf.to_csv('RawInputData/P_Geometry.zip', compression=geometryZip, index=False)  # The output geometry.