# Preprocessing New Mexico Regulatory data for WaDE upload.
- Purpose: To preprocess the New Mexico regulatory data into one master file for simple DataFrame creation and extraction.

In [None]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import date
# Notebook display settings: show up to 999 columns of a DataFrame, and print
# floats with five fixed decimals instead of scientific notation.
pd.options.display.max_columns = 999
pd.options.display.float_format = lambda value: '%.5f' % value

In [None]:
# Working Directory
# Every relative path used by the cells below (input CSVs, shapefiles, output
# zips) is resolved against this folder, so switch into it once up front.
workingDir = "G:/Shared drives/WaDE Data/NewMexico/Regulatory/RawInputData"
os.chdir(workingDir)

## Interstate Stream Compact Regions

In [None]:
# Input File - InterstateStreamCompactRegions_input.zip (zipped CSV)
iscrInputFile = 'Interstate Stream Compact Regions/InterstateStreamCompactRegions_input.zip'
df_iscr = pd.read_csv(iscrInputFile, compression='zip')

# WaDE UUID tracker for data assessment.
# When the input has no WaDEUUID column yet, build one from the row index and
# write the file back so the column is already present on the next read.
if 'WaDEUUID' not in df_iscr.columns:
    df_iscr['WaDEUUID'] = "nmiscr" + df_iscr.index.astype(str)
    df_iscr.to_csv(
        iscrInputFile,
        index=False,
        compression={'method': 'zip', 'archive_name': 'InterstateStreamCompactRegions_input.csv'},
    )

print(df_iscr.shape[0])
df_iscr.head(1)

In [None]:
# create output regulatory dataframe for the Interstate Stream Compact Regions
df = pd.DataFrame()

# Data Assessment UUID
# Assigned first so the empty frame takes on df_iscr's row index; the constant
# strings below are then broadcast to every row.
df['WaDEUUID'] = df_iscr['WaDEUUID']

# Date Info
df['in_Date'] = date.today().strftime('%m/%d')
df['in_Year'] = date.today().strftime('%Y')

# Organization
df['in_OrganizationUUID'] = "NMwr_O1"

# ReportingUnit Info
df['in_EPSGCodeCV'] = "4326"
df['in_ReportingUnitName'] = df_iscr['Full_Name']
# Blank / missing OID_ values become 0 before the integer is appended to the prefix.
df['in_ReportingUnitNativeID'] = "ISCR" + df_iscr['OID_'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_ReportingUnitProductVersion'] = ""
df['in_ReportingUnitTypeCV'] = "Interstate River Compact"
df['in_ReportingUnitUpdateDate'] = ""
df['in_StateCV'] = "NM"
df['in_Geometry'] = ""

# RegulatoryOverlay Info
# Spelling fixed ("Unspecfied" -> "Unspecified") to match the placeholder
# string used by the rest of this notebook.
df['in_OversightAgency'] = "WaDE Unspecified"
df['in_RegulatoryDescription'] = df_iscr['RegulatoryDescription']
df['in_RegulatoryName'] = df_iscr['Full_Name']
df['in_RegulatoryOverlayNativeID'] = "ISCR" # prefix only; a row number is appended in a later cell
df['in_RegulatoryStatusCV'] = "Active"
df['in_RegulatoryStatute'] = ""
df['in_RegulatoryStatuteLink'] = df_iscr['URL']
df['in_StatutoryEffectiveDate'] = df_iscr['EffectiveDate']
df['in_StatutoryEndDate'] = ""
df['in_RegulatoryOverlayTypeCV'] = "Interstate River Compact"
df['in_WaterSourceTypeCV'] = "Surface Water"


# Drop exact duplicate rows, renumber from 0, and keep an independent copy
# because `df` is reused by the following sections.
df = df.drop_duplicates().reset_index(drop=True)
outdf_iscr = df.copy()
print(len(outdf_iscr))
outdf_iscr.head()

## OSE Water Right District Boundary

In [None]:
# Input File - OSEWaterRightDistrictBoundary_input.csv
oseInputFile = 'OSE Water Right District Boundary/OSEWaterRightDistrictBoundary_input.csv'
df_ose = pd.read_csv(oseInputFile)

# WaDE UUID tracker for data assessment.
# Uses its own "nmose" prefix: the previous "nmiscr" prefix was the one used
# for the Interstate Stream Compact Regions input, so both datasets produced
# identical tracker IDs (nmiscr0, nmiscr1, ...).
# NOTE(review): an input file that was already saved with a WaDEUUID column
# keeps the IDs it has; drop that column from the CSV to regenerate them.
if 'WaDEUUID' not in df_ose:
    df_ose['WaDEUUID'] = "nmose" + df_ose.index.astype(str)
    df_ose.to_csv(oseInputFile, index=False)

print(len(df_ose))
df_ose.head(1)

In [None]:
# create output regulatory dataframe for the OSE Water Right District Boundaries
df = pd.DataFrame()

# Data Assessment UUID
# Assigned first so the empty frame takes on df_ose's row index; the constant
# strings below are then broadcast to every row.
df['WaDEUUID'] = df_ose['WaDEUUID']

# Date Info
df['in_Date'] = date.today().strftime('%m/%d')
df['in_Year'] = date.today().strftime('%Y')

# Organization
df['in_OrganizationUUID'] = "NMwr_O1"

# ReportingUnit Info
df['in_EPSGCodeCV'] = "4326"
df['in_ReportingUnitName'] = df_ose['name']
# Blank / missing ose_dist_i values become 0 before the integer is appended to the prefix.
df['in_ReportingUnitNativeID'] = "OSEWRB" + df_ose['ose_dist_i'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_ReportingUnitProductVersion'] = ""
df['in_ReportingUnitTypeCV'] = "Water Rights District"
df['in_ReportingUnitUpdateDate'] = ""
df['in_StateCV'] = "NM"
df['in_Geometry'] = ""

# RegulatoryOverlay Info
# Spelling fixed ("Unspecfied" -> "Unspecified") to match the placeholder
# string used by the rest of this notebook.
df['in_OversightAgency'] = "WaDE Unspecified"
df['in_RegulatoryDescription'] = "District operated by a Water Master appointed by the Office of the State Engineer, who is charged with administering the state's water resources. The State Engineer has authority over the supervision, measurement, appropriation, and distribution of all surface and groundwater in New Mexico, including streams and rivers that cross state boundaries."
df['in_RegulatoryName'] = df_ose['name']
df['in_RegulatoryOverlayNativeID'] = "OSEWR" # prefix only; a row number is appended in a later cell
df['in_RegulatoryStatusCV'] = "Active"
df['in_RegulatoryStatute'] = ""
df['in_RegulatoryStatuteLink'] = df_ose['URL']
df['in_StatutoryEffectiveDate'] = "08/12/2021"
df['in_StatutoryEndDate'] = ""
df['in_RegulatoryOverlayTypeCV'] = "Water Rights District"
df['in_WaterSourceTypeCV'] = "Surface Water and Groundwater"

# Drop exact duplicate rows, renumber from 0, and keep an independent copy
# because `df` is reused by the following section.
df = df.drop_duplicates().reset_index(drop=True)
outdf_ose = df.copy()
print(len(outdf_ose))
outdf_ose.head()

## Special Conditions Water Right

In [None]:
# Input File - SpecialConditionsWaterRight_input.csv (read as ISO-8859-1 text)
scwrInputFile = "Special Conditions Water Right Regulations/SpecialConditionsWaterRight_input.csv"
df_scwr = pd.read_csv(scwrInputFile, encoding="ISO-8859-1")

# WaDE UUID tracker for data assessment.
# When the input has no WaDEUUID column yet, build one from the row index and
# write the file back so the column is already present on the next read.
if 'WaDEUUID' not in df_scwr.columns:
    df_scwr['WaDEUUID'] = "nmscwr" + df_scwr.index.astype(str)
    df_scwr.to_csv(scwrInputFile, index=False)

print(df_scwr.shape[0])
df_scwr.head(1)

In [None]:
# Assemble the Special Conditions Water Right output table in one step.
# Entries given as a df_scwr column carry per-record values; entries given as a
# plain string hold that same constant on every row.
df = pd.DataFrame({
    # Data Assessment UUID
    'WaDEUUID': df_scwr['WaDEUUID'],

    # Date Info
    'in_Date': date.today().strftime('%m/%d'),
    'in_Year': date.today().strftime('%Y'),

    # Organization
    'in_OrganizationUUID': "NMwr_O1",

    # ReportingUnit Info
    'in_EPSGCodeCV': "4326",
    'in_ReportingUnitName': df_scwr['Name'],
    # Blank / missing OID_ values become 0 before the integer is appended to the prefix.
    'in_ReportingUnitNativeID': "SCWR" + df_scwr['OID_'].replace("", 0).fillna(0).astype(int).astype(str),
    'in_ReportingUnitProductVersion': "",
    'in_ReportingUnitTypeCV': "Special Condition Water Right",
    'in_ReportingUnitUpdateDate': "",
    'in_StateCV': "NM",
    'in_Geometry': "",

    # RegulatoryOverlay Info
    'in_OversightAgency': "WaDE Unspecified",
    'in_RegulatoryDescription': df_scwr['requiremen'],
    'in_RegulatoryName': df_scwr['Name'],
    'in_RegulatoryOverlayNativeID': "SCWR",  # prefix only; a row number is appended in a later cell
    'in_RegulatoryStatusCV': "Active",
    'in_RegulatoryStatute': "",
    'in_RegulatoryStatuteLink': "WaDE Unspecified",
    'in_StatutoryEffectiveDate': df_scwr['effect_dat'],
    'in_StatutoryEndDate': "",
    'in_RegulatoryOverlayTypeCV': "Special Condition Water Right",
    'in_WaterSourceTypeCV': "Surface Water and Groundwater",
})

# Remove exact duplicate rows, renumber from 0, then keep an independent copy.
df = df.drop_duplicates()
df = df.reset_index(drop=True)
outdf_scwr = df.copy()
print(outdf_scwr.shape[0])
outdf_scwr.head()

## Clean Data and Concatenate DataFrames together into single output

In [None]:
# make custom in_RegulatoryOverlayNativeID
# Each frame so far holds only a constant prefix in this column; append the
# frame's own row label to it (e.g. "ISCR" -> "ISCR0", "ISCR1", ...).
for overlayFrame in (outdf_iscr, outdf_ose, outdf_scwr):
    overlayPrefix = overlayFrame['in_RegulatoryOverlayNativeID'].astype(str)
    overlayFrame['in_RegulatoryOverlayNativeID'] = overlayPrefix + overlayFrame.index.astype(str)

In [None]:
# Concatenate the three regulatory DataFrames into the single output table.
# reset_index(drop=True) renumbers the rows 0..N-1; without it each source
# frame contributes its own 0-based labels, so outdf would carry duplicate
# index values and any later index-aligned assignment would be ambiguous.
# This mirrors how the geometry frames are concatenated further down.
frames = [outdf_iscr, outdf_ose, outdf_scwr]
outdf = pd.concat(frames).reset_index(drop=True)
print(len(outdf))
outdf.head(1)

In [None]:
# Fixing empty string names
def fixEmptyString(val):
    """Return ``val`` as a stripped string, or "WaDE Unspecified" if it is empty.

    Parameters
    ----------
    val : any
        A single cell value (string, number, None, NaN, ...).

    Returns
    -------
    str
        "WaDE Unspecified" when ``val`` is a missing value (None / NaN / NaT /
        pd.NA) or is blank after stripping whitespace; otherwise ``str(val)``
        with leading and trailing whitespace removed.
    """
    # The missing-value test has to happen BEFORE str(): str(NaN) is "nan" and
    # str(None) is "None", and pd.isnull() of those strings is always False.
    # is_scalar guards against array-likes, for which isnull() returns an array.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return "WaDE Unspecified"

    outString = str(val).strip()
    if not outString:
        return "WaDE Unspecified"
    return outString

In [None]:
# Clean the reporting-unit names of the combined table itself. The source must
# be outdf: `df` at this point is only the last per-dataset frame built above,
# and assigning its values would overwrite rows of the other datasets by index.
outdf['in_ReportingUnitName'] = outdf['in_ReportingUnitName'].apply(fixEmptyString)
outdf['in_ReportingUnitName'].unique()

In [None]:
# Clean the regulatory descriptions of the combined table itself. The source
# must be outdf: `df` at this point is only the last per-dataset frame built
# above, and assigning its values would overwrite rows of the other datasets.
outdf['in_RegulatoryDescription'] = outdf['in_RegulatoryDescription'].apply(fixEmptyString)
outdf['in_RegulatoryDescription'].unique()

In [None]:
# Clean the regulatory names of the combined table itself. The source must be
# outdf: `df` at this point is only the last per-dataset frame built above,
# and assigning its values would overwrite rows of the other datasets by index.
outdf['in_RegulatoryName'] = outdf['in_RegulatoryName'].apply(fixEmptyString)
outdf['in_RegulatoryName'].unique()

# Shapefile Data

### Interstate Stream Compact Regions

In [None]:
# Read the zipped Interstate Stream Compact Regions shapefile with geopandas,
# then report the row count and preview the first record.
ISCRShapeFile = "Interstate Stream Compact Regions/shapefile/InterstateStreamCompactRegions.zip"
ISCRShape = gpd.read_file(ISCRShapeFile)
print(ISCRShape.shape[0])
ISCRShape.head(1)

In [None]:
# Quick visual check: draw the geometries that were just loaded.
ISCRShape.plot()

In [None]:
# Interstate Stream Compact Regions
# Geometry lookup table: one ReportingUnit native ID per shape.
# The frame used to be pre-built with an 'in_SiteNativeID' column that was
# never filled and ended up as an all-empty column in the export; it now
# declares the column that is actually populated.
columnsList = ['in_ReportingUnitNativeID', 'geometry']
df_ISCR_shp = pd.DataFrame(columns=columnsList, index=ISCRShape.index)

# NOTE(review): the ID here is built from the shapefile's row index, while the
# tabular ISCR cell builds it from the OID_ column -- confirm the two line up.
df_ISCR_shp['in_ReportingUnitNativeID'] = "ISCR" + ISCRShape.index.fillna(0).astype(int).astype(str)
df_ISCR_shp['geometry'] = ISCRShape['geometry']
df_ISCR_shp = df_ISCR_shp.drop_duplicates().reset_index(drop=True)
print(len(df_ISCR_shp))
df_ISCR_shp.head(3)

In [None]:
# # Interstate Stream Compact Regions

# ISCRShapeFile = "Interstate Stream Compact Regions/shapefile/InterstateStreamCompactRegions.zip"
# ISCRShape = gpd.read_file(ISCRShapeFile)

# columnList = ['in_ReportingUnitName', 
#               'in_ReportingUnitNativeID',
#               'in_ReportingUnitTypeCV',
#               'in_Geomerty']
# df_ISCR_shp = pd.DataFrame(columns=columnList, index=ISCRShape.index)

# df_ISCR_shp['in_ReportingUnitName'] = ISCRShape['Full_Name']
# df_ISCR_shp['in_ReportingUnitNativeID'] = ISCRShape.index.fillna(0).astype(int).astype(str)
# df_ISCR_shp['in_ReportingUnitTypeCV'] = "Interstate River Compact"
# df_ISCR_shp['in_Geomerty'] = ISCRShape['geometry']

# print(len(df_ISCR_shp))
# df_ISCR_shp.head(3)

### OSE Water Right District Boundary

In [None]:
# Read the OSE Water Right District Boundary shapefile with geopandas,
# then report the row count and preview the first record.
EWRDhapeFile = "OSE Water Right District Boundary/shapefile/OSEWaterRightDistrictBoundary.shp"
EWRDShape = gpd.read_file(EWRDhapeFile)
print(EWRDShape.shape[0])
EWRDShape.head(1)

In [None]:
# Quick visual check: draw the geometries that were just loaded.
EWRDShape.plot()

In [None]:
# OSE Water Right District Boundary
# Geometry lookup table: one ReportingUnit native ID per shape.
# The frame used to be pre-built with an 'in_SiteNativeID' column that was
# never filled and ended up as an all-empty column in the export; it now
# declares the column that is actually populated.
columnsList = ['in_ReportingUnitNativeID', 'geometry']
df_EWRD_shp = pd.DataFrame(columns=columnsList, index=EWRDShape.index)

# Same "OSEWRB" + ose_dist_i construction as the tabular OSE cell, so the IDs
# match as long as both files carry the same ose_dist_i values.
df_EWRD_shp['in_ReportingUnitNativeID'] = "OSEWRB" + EWRDShape['ose_dist_i'].replace("", 0).fillna(0).astype(int).astype(str)
df_EWRD_shp['geometry'] = EWRDShape['geometry']
df_EWRD_shp = df_EWRD_shp.drop_duplicates().reset_index(drop=True)
print(len(df_EWRD_shp))
df_EWRD_shp.head(3)

In [None]:
# # OSE Water Right District Boundary

# columnList = ['in_ReportingUnitName', 
#               'in_ReportingUnitNativeID',
#               'in_ReportingUnitTypeCV',
#               'in_Geomerty']
# df_EWRD_shp = pd.DataFrame(columns=columnList, index=EWRDShape.index)

# df_EWRD_shp['in_ReportingUnitName'] = EWRDShape['name']
# df_EWRD_shp['in_ReportingUnitNativeID'] = EWRDShape['ose_dist_i'].replace("", 0).fillna(0).astype(int).astype(str)
# df_EWRD_shp['in_ReportingUnitTypeCV'] = "Water Rights District"
# df_EWRD_shp['in_Geomerty'] = EWRDShape['geometry']

# print(len(df_EWRD_shp))
# df_EWRD_shp.head(3)

### Special Conditions Water Right

In [None]:
# Read the Special Conditions Water Right Regulations shapefile with geopandas,
# then report the row count and preview the first record.
SCWRShapeFile = "Special Conditions Water Right Regulations/shapefile/SpecialConditionsWaterRightRegulations.shp"
SCWRShape = gpd.read_file(SCWRShapeFile)
print(SCWRShape.shape[0])
SCWRShape.head(1)

In [None]:
# Quick visual check: draw the geometries that were just loaded.
SCWRShape.plot()

In [None]:
# Special Conditions Water Right
# Geometry lookup table: one ReportingUnit native ID per shape.
# The frame used to be pre-built with an 'in_SiteNativeID' column that was
# never filled and ended up as an all-empty column in the export; it now
# declares the column that is actually populated.
columnsList = ['in_ReportingUnitNativeID', 'geometry']
df_SCWR_shp = pd.DataFrame(columns=columnsList, index=SCWRShape.index)

# NOTE(review): the ID here is built from the shapefile's row index, while the
# tabular SCWR cell builds it from the OID_ column -- confirm the two line up.
df_SCWR_shp['in_ReportingUnitNativeID'] = "SCWR" + SCWRShape.index.fillna(0).astype(int).astype(str)
df_SCWR_shp['geometry'] = SCWRShape['geometry']
df_SCWR_shp = df_SCWR_shp.drop_duplicates().reset_index(drop=True)
print(len(df_SCWR_shp))
df_SCWR_shp.head(3)

In [None]:
# # Special Conditoins Water Right

# columnList = ['in_ReportingUnitName', 
#               'in_ReportingUnitNativeID',
#               'in_ReportingUnitTypeCV',
#               'in_Geomerty']
# df_SCWR_shp = pd.DataFrame(columns=columnList, index=SCWRShape.index)

# df_SCWR_shp['in_ReportingUnitName'] = SCWRShape['Name']
# df_SCWR_shp['in_ReportingUnitNativeID'] = SCWRShape.index.fillna(0).astype(int).astype(str)
# df_SCWR_shp['in_ReportingUnitTypeCV'] = "Special Condition Water Right"
# df_SCWR_shp['in_Geomerty'] = SCWRShape['geometry']

# print(len(df_SCWR_shp))
# df_SCWR_shp.head(3)

In [None]:
# Stack the three geometry lookup tables into one and renumber the rows from 0.
frames = [df_ISCR_shp, df_EWRD_shp, df_SCWR_shp]
df_shape_out = pd.concat(frames)
df_shape_out = df_shape_out.reset_index(drop=True)

print(df_shape_out.shape[0])
df_shape_out

### Inspect Output Data & Export

In [None]:
# Print the dtype of every outdf column. The option_context lifts pandas'
# row/column display limits for this block only, so nothing is truncated.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(outdf.dtypes)

In [None]:
# Print the dtype of every df_shape_out column. The option_context lifts
# pandas' row/column display limits for this block only.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df_shape_out.dtypes)

In [None]:
# Export out to CSV (each written as a single CSV inside a zip archive).
# archive_name is given explicitly for both files so the member inside the zip
# gets a proper ".csv" name; with a bare compression="zip" pandas derives the
# member name from the output path, which does not end in ".csv".
outdf.to_csv('Pre_nmMain.zip', compression=dict(method='zip', archive_name='Pre_nmMain.csv'), index=False)  # The output, save as a zip
df_shape_out.to_csv('P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.