# Data Assessment & Analytics - Overlay Data
Notes:
- change os directory location
- be aware of the number of provided native source data files

In [None]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import geoplot as gplt  # for plotting maps and geo-data
import geoplot.crs as gcrs  #used to pull in webdata related to maps and geo-data
import missingno as msno # creates a matrix chart to show missing values
import plotly.express as px
import plotly.graph_objects as go  # for subplot creation
from plotly.subplots import make_subplots  # for subplot creation
import matplotlib.pyplot as mplt  # use with gplt to save fig to pdf

# ---- cleanup ----
import re # string regular expression manipulation
from datetime import datetime # date and time manipulation
# Pandas display settings for the notebook:
#   - show up to 999 columns so wide DataFrames are not truncated
#   - render floats with five fixed decimals instead of scientific notation
pd.set_option('display.max_columns', 999)
pd.set_option('display.float_format', '{:.5f}'.format)

In [None]:
# ---- working directory ----
# Defaults to the folder the notebook was launched from; edit the assignment
# below to run from a different location.
workingDirString = os.getcwd()
os.chdir(workingDirString)
print(f'The working Directory is: {workingDirString}')

In [None]:
# ---- where to find input files ----
# Root folder for this dataset's inputs. The read_csv calls in this notebook
# append "/RawinputData/..." and "/ProcessedInputData/..." to this string.
InputFolderString = "G:/Shared drives/WaDE Data/WaDE Data Folder/Nebraska/Overlays" # set working directory folder string here

In [None]:
# Native Input Data
#################################################################
# ---- This needs to be custom per state ----

# Data Set 1: Natural_Resource_District_NRD_Boundaries_20250129
# Update the file name below to match the native source input file provided.
nativeInputFile1 = f"{InputFolderString}/RawinputData/Natural_Resource_District_NRD_Boundaries_20250129.zip"
dfin1 = pd.read_csv(nativeInputFile1)

In [None]:
# WaDE Processed Input Data
#################################################################
# Each table is read from the ProcessedInputData folder and has its NaN cells
# replaced with empty strings. The "*purge" frames hold the matching
# "*_missing.csv" files.
processedInputFolder = InputFolderString + "/ProcessedInputData"

# date and organization tables
dfd = pd.read_csv(f"{processedInputFolder}/date.csv").replace(np.nan, "")
dfo = pd.read_csv(f"{processedInputFolder}/organizations.csv", encoding="ISO-8859-1").replace(np.nan, "")

# reporting units
dfru = pd.read_csv(f"{processedInputFolder}/reportingunits.csv").replace(np.nan, "")
dfrupurge = pd.read_csv(f"{processedInputFolder}/reportingunits_missing.csv").replace(np.nan, "")

# overlays
dfov = pd.read_csv(f"{processedInputFolder}/overlays.csv").replace(np.nan, "")
dfovpurge = pd.read_csv(f"{processedInputFolder}/overlays_missing.csv").replace(np.nan, "")

# overlay <-> reporting unit records
dfovru = pd.read_csv(f"{processedInputFolder}/overlayreportingunits.csv").replace(np.nan, "")
dfovrupurge = pd.read_csv(f"{processedInputFolder}/overlayreportingunits_missing.csv").replace(np.nan, "")

# Reporting Unit Area Info
- reportingunits.csv

In [None]:
# Row count of reportingunits.csv, then a one-row preview as the cell output.
print(dfru.shape[0])
dfru.iloc[:1]

In [None]:
# Check what columns contain missing information.
# Alternative view (disabled): missingno matrix chart of the same frame.
#msno.matrix(dfru, figsize=(10,5), fontsize=10)
# NOTE(review): dfru is loaded with NaN replaced by "" — confirm whether this
# chart still reports those cells as missing.
msno.bar(dfru)

In [None]:
# Unique values for 'ReportingUnitName'
# Each sorted unique value is printed on its own line as a quoted,
# comma-terminated entry. The value is interpolated inside the f-string so a
# non-string value is formatted instead of raising TypeError on '+'.
for x in dfru['ReportingUnitName'].sort_values().unique():
    print(f'"{x}",')

In [None]:
# Unique values for 'ReportingUnitNativeID'
# IDs are cast to str before sorting, then printed one per line as a quoted,
# comma-terminated entry.
nativeIdValues = dfru['ReportingUnitNativeID'].astype(str).sort_values().unique()
for nativeId in nativeIdValues:
    print(f'"{nativeId}",')

In [None]:
# Unique values for 'ReportingUnitTypeCV'
# Each sorted unique value is printed on its own line as a quoted,
# comma-terminated entry. The value is interpolated inside the f-string so a
# non-string value is formatted instead of raising TypeError on '+'.
for x in dfru['ReportingUnitTypeCV'].sort_values().unique():
    print(f'"{x}",')

# Overlay Info
- overlays.csv

In [None]:
# Row count of overlays.csv, then a one-row preview as the cell output.
print(dfov.shape[0])
dfov.iloc[:1]

In [None]:
# Check what columns contain missing information.
# Alternative view (disabled): missingno matrix chart of the same frame.
#msno.matrix(dfov, figsize=(10,5), fontsize=10)
# NOTE(review): dfov is loaded with NaN replaced by "" — confirm whether this
# chart still reports those cells as missing.
msno.bar(dfov)

In [None]:
# Unique values for 'OversightAgency'
# Each sorted unique value is printed on its own line as a quoted,
# comma-terminated entry. The value is interpolated inside the f-string so a
# non-string value is formatted instead of raising TypeError on '+'.
for x in dfov['OversightAgency'].sort_values().unique():
    print(f'"{x}",')

In [None]:
# Unique values for 'OverlayName'
# Each sorted unique value is printed on its own line as a quoted,
# comma-terminated entry. The value is interpolated inside the f-string so a
# non-string value is formatted instead of raising TypeError on '+'.
for x in dfov['OverlayName'].sort_values().unique():
    print(f'"{x}",')

In [None]:
# Unique values for 'OverlayNativeID'
# IDs are cast to str before sorting, then printed one per line as a quoted,
# comma-terminated entry.
overlayIdValues = dfov['OverlayNativeID'].astype(str).sort_values().unique()
for overlayId in overlayIdValues:
    print(f'"{overlayId}",')

In [None]:
# Unique values for 'OverlayTypeCV'
# Each sorted unique value is printed on its own line as a quoted,
# comma-terminated entry. The value is interpolated inside the f-string so a
# non-string value is formatted instead of raising TypeError on '+'.
for x in dfov['OverlayTypeCV'].sort_values().unique():
    print(f'"{x}",')

In [None]:
# Unique values for 'WaterSourceTypeCV'
# Each sorted unique value is printed on its own line as a quoted,
# comma-terminated entry. The value is interpolated inside the f-string so a
# non-string value is formatted instead of raising TypeError on '+'.
for x in dfov['WaterSourceTypeCV'].sort_values().unique():
    print(f'"{x}",')

## Num of Record Summary

In [None]:
# Number of rows in the native source dataset.
sourceRowCount = len(dfin1)
print("Num of Source Entries (rows): ", sourceRowCount)
# Template for a second native dataset, if one is provided:
#print(f"Num of Source #2 Entries (rows): ", len(dfin2))

In [None]:
# Num of entries compiled into WaDE
# Prints a header line and one pipe-delimited row with the reporting unit
# and overlay row counts.
ruString = len(dfru)
roString = len(dfov)
print("Dataset  | Num of Identified Reporting Units | Num of Identified Overlays")
print(f"**Compiled WaDE Data** | {ruString} | {roString}")

## Markdown Table Creation

In [None]:
# date.csv
# Render the whole table as markdown text and print it.
dftmp = dfd.to_markdown()
print(dftmp)

In [None]:
# organizations.csv
# Render the whole table as markdown text and print it.
dftmp = dfo.to_markdown()
print(dftmp)

In [None]:
# reportingunits.csv
# Sample record: the row labelled 1, with the 'Geometry' column dropped,
# rendered as markdown. Raises KeyError if no row carries the label 1.
sampleRow = dfru.loc[[1]]
dftmp = sampleRow.drop(columns=['Geometry']).to_markdown()
print(dftmp)

In [None]:
# overlays.csv
# Sample record: the row labelled 1, rendered as markdown.
# Raises KeyError if no row carries the label 1.
sampleRow = dfov.loc[[1]]
dftmp = sampleRow.to_markdown()
print(dftmp)

In [None]:
# overlayreportingunits.csv
# Sample record: the row labelled 1, rendered as markdown.
# Raises KeyError if no row carries the label 1.
sampleRow = dfovru.loc[[1]]
dftmp = sampleRow.to_markdown()
print(dftmp)

## Why Removed Records Summary

In [None]:
# Tally of 'ReasonRemoved' values for records purged from the reporting unit input.
print("---- Reasons why records were removed from reporting unit source info ----")
if len(dfrupurge) == 0:
    print("...nothing removed.")
else:
    reasonCounts = dfrupurge['ReasonRemoved'].value_counts().astype(str)
    print(reasonCounts + " | removed from reportingunits.csv input")

In [None]:
# Tally of 'ReasonRemoved' values for records purged from the overlays input.
# The header text is corrected (it read "regula toryoverlays") and its padding
# now matches the reporting unit summary header.
print("---- Reasons why records were removed from regulatory overlays info ----")
if len(dfovpurge) != 0:
    print(dfovpurge['ReasonRemoved'].value_counts().astype(str) + " | removed from overlays.csv input")
else:
    print("...nothing removed.")

## Figures

In [None]:
# ReportingUnitName: histogram distribution of WaDE values
# Prints the per-value counts, shows the histogram, then exports it as a PNG.
print(dfru["ReportingUnitName"].value_counts())

fig = px.histogram(dfru, x="ReportingUnitName", text_auto=True)
fig.update_layout(
    bargap=0.2,
    title="Histogram of ReportingUnitName Entries in reportingunits.csv",
    xaxis_title="ReportingUnitName Value",
    yaxis_title="# of entries",
    font=dict(family="Arial Bold", size=12, color="Black"),
)
fig.show()

# Make sure the output folder exists so the export does not depend on a
# pre-created 'figures' directory.
os.makedirs('figures', exist_ok=True)
fig.write_image('figures/ReportingUnitName.png', engine="kaleido")

In [None]:
# ReportingUnitTypeCV: histogram distribution of WaDE values
# Prints the per-value counts, shows the histogram, then exports it as a PNG.
print(dfru["ReportingUnitTypeCV"].value_counts())

fig = px.histogram(dfru, x="ReportingUnitTypeCV", text_auto=True)
fig.update_layout(
    bargap=0.2,
    title="Histogram of ReportingUnitTypeCV Entries in reportingunits.csv",
    xaxis_title="ReportingUnitTypeCV Value",
    yaxis_title="# of entries",
    font=dict(family="Arial Bold", size=12, color="Black"),
)
fig.show()

# Make sure the output folder exists so the export does not depend on a
# pre-created 'figures' directory.
os.makedirs('figures', exist_ok=True)
fig.write_image('figures/ReportingUnitTypeCV.png', engine="kaleido")

In [None]:
# OversightAgency: histogram distribution of WaDE values
# Prints the per-value counts, shows the histogram, then exports it as a PNG.
print(dfov["OversightAgency"].value_counts())

fig = px.histogram(dfov, x="OversightAgency", text_auto=True)
fig.update_layout(
    bargap=0.2,
    title="Histogram of OversightAgency Entries in overlays.csv",
    xaxis_title="OversightAgency Value",
    yaxis_title="# of entries",
    font=dict(family="Arial Bold", size=12, color="Black"),
)
fig.show()

# Make sure the output folder exists so the export does not depend on a
# pre-created 'figures' directory.
os.makedirs('figures', exist_ok=True)
fig.write_image('figures/OversightAgency.png', engine="kaleido")

In [None]:
# OverlayName: histogram distribution of WaDE values
# Prints the per-value counts, shows the histogram, then exports it as a PNG.
print(dfov["OverlayName"].value_counts())

fig = px.histogram(dfov, x="OverlayName", text_auto=True)
fig.update_layout(
    bargap=0.2,
    title="Histogram of OverlayName Entries in overlays.csv",
    xaxis_title="OverlayName Value",
    yaxis_title="# of entries",
    font=dict(family="Arial Bold", size=12, color="Black"),
)
fig.show()

# Make sure the output folder exists so the export does not depend on a
# pre-created 'figures' directory.
os.makedirs('figures', exist_ok=True)
fig.write_image('figures/OverlayName.png', engine="kaleido")

In [None]:
# OverlayTypeCV: histogram distribution of WaDE values
# Prints the per-value counts, shows the histogram, then exports it as a PNG.
print(dfov["OverlayTypeCV"].value_counts())

fig = px.histogram(dfov, x="OverlayTypeCV", text_auto=True)
fig.update_layout(
    bargap=0.2,
    title="Histogram of OverlayTypeCV Entries in overlays.csv",
    xaxis_title="OverlayTypeCV Value",
    yaxis_title="# of entries",
    font=dict(family="Arial Bold", size=12, color="Black"),
)
fig.show()

# Make sure the output folder exists so the export does not depend on a
# pre-created 'figures' directory.
os.makedirs('figures', exist_ok=True)
fig.write_image('figures/OverlayTypeCV.png', engine="kaleido")

In [None]:
# Map poly info
# Draws the reporting unit polygons, coloured by ReportingUnitTypeCV, over a
# contiguous-USA web map and saves the figure as a PNG.

# Keep only the rows that carry geometry text.
dfruPoly = dfru.copy()
dfruPoly = dfruPoly[dfruPoly['Geometry'] != ""].reset_index(drop=True)

if len(dfruPoly) == 0:
    # Nothing to draw: report it explicitly rather than relying on a plotting error.
    print('No geometry data to plot')
else:
    try:
        # Make sure the output folder exists before saving the figure.
        os.makedirs('figures', exist_ok=True)

        contiguous_usa = gpd.read_file(gplt.datasets.get_path('contiguous_usa')) # use for background map in subplot
        ax = gplt.webmap(contiguous_usa, projection=gcrs.WebMercator()) # set subplot

        # Parse the WKT text into geometries and build the GeoDataFrame.
        dfruPoly['Geometry'] = gpd.GeoSeries.from_wkt(dfruPoly['Geometry'], crs="EPSG:4326")
        gdfruPoly = gpd.GeoDataFrame(dfruPoly, geometry=dfruPoly['Geometry'], crs="EPSG:4326") # convert to geodataframe
        #gdfruPoly['Geometry'] = gdfruPoly.simplify(0.001) # simplify the geometry. Lower the number the larger the exported file.
        gplt.choropleth(gdfruPoly, edgecolor='white', linewidth=1, hue='ReportingUnitTypeCV', legend=True, legend_kwargs={'loc': 'lower left'}, ax=ax)
        mplt.savefig(format="png", fname='figures/ReportingUnitMap.png')
    except Exception as err:
        # Still best-effort (the notebook keeps running), but the real cause is
        # reported instead of being labelled as missing geometry.
        print(f'Unable to plot geometry data: {type(err).__name__}: {err}')

# Custom Queries and Analysis for this Dataset

In [None]:
# TODO: add custom queries and analysis for this dataset here