# Data Assessment & Analytics - Regulatory Data
Notes:
- change os directory location
- be aware of the number of provided native source data files

In [None]:
# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd # the library that lets us read in shapefiles
import geoplot as gplt # for plotting maps
import geoplot.crs as gcrs #used to pull in webdata

# visualization
import missingno as msno # creates a matrix chart to show missing values
import plotly.express as px
import plotly.graph_objects as go # for subplot creation
from plotly.subplots import make_subplots # for subplot creation
import matplotlib.pyplot as mplt # use with gplt to save fig to pdf

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [None]:
# Working directory: every relative path used below resolves against this folder
workingDir = 'G:/Shared drives/WaDE Data/Idaho/Regulatory'  # change here
os.chdir(path=workingDir)

In [None]:
# # This needs to be custom per state

# # Input Data - Native Source Data
# #################################################################
# # Data 1: OSE POD Data
# df1 = pd.read_csv('RawinputData/OSE_PODs.zip', compression='zip')

In [None]:
# Input Data - Processed WaDE Input files
#################################################################
# Each csv is read first, then its NaN cells are swapped for empty strings.

# reporting units: kept records and purged records
dfru = pd.read_csv(filepath_or_buffer="ProcessedInputData/reportingunits.csv")
dfru = dfru.replace(to_replace=np.nan, value="")
dfrupurge = pd.read_csv(filepath_or_buffer="ProcessedInputData/reportingunits_missing.csv")
dfrupurge = dfrupurge.replace(to_replace=np.nan, value="")

# regulatory overlays: kept records and purged records
dfro = pd.read_csv(filepath_or_buffer="ProcessedInputData/regulatoryoverlays.csv")
dfro = dfro.replace(to_replace=np.nan, value="")
dfropurge = pd.read_csv(filepath_or_buffer="ProcessedInputData/regulatoryoverlays_missing.csv")
dfropurge = dfropurge.replace(to_replace=np.nan, value="")

# sites
dfs = pd.read_csv(filepath_or_buffer="ProcessedInputData/sites.csv")
dfs = dfs.replace(to_replace=np.nan, value="")

# Reporting Unit Info
- reportingunits.csv

In [None]:
# Row count of reportingunits.csv, followed by a one-row preview
print(dfru.shape[0])
dfru.head(n=1)

In [None]:
# Check what columns contain missing information.
# dfru was loaded with NaN replaced by "", and missingno only treats real nulls
# as missing, so the blanks are converted back to NaN for this chart only
# (dfru itself is left unchanged).
msno.matrix(dfru.replace("", np.nan), figsize=(10,5), fontsize=10)

In [None]:
# ReportingUnitName: value frequencies, then a histogram of the WaDE values
print(dfru["ReportingUnitName"].value_counts())

fig = px.histogram(data_frame=dfru, x="ReportingUnitName")
fig.update_layout(
    title="Histogram of ReportingUnitName Entries in reportingunits.csv",
    xaxis_title="ReportingUnitName Value",
    yaxis_title="# of entries",
    bargap=0.2,
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# export the figure as pdf through the kaleido engine
fig.write_image('DataAssessment/figures/ReportingUnitName.pdf', engine="kaleido")

In [None]:
# ReportingUnitTypeCV: value frequencies, then a histogram of the WaDE values
print(dfru["ReportingUnitTypeCV"].value_counts())

fig = px.histogram(data_frame=dfru, x="ReportingUnitTypeCV")
fig.update_layout(
    title="Histogram of ReportingUnitTypeCV Entries in reportingunits.csv",
    xaxis_title="ReportingUnitTypeCV Value",
    yaxis_title="# of entries",
    bargap=0.2,
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# export the figure as pdf through the kaleido engine
fig.write_image('DataAssessment/figures/ReportingUnitTypeCV.pdf', engine="kaleido")

In [None]:
# Map poly info
# Plots the reporting unit geometries on top of a web map and saves the figure to pdf.
# Best-effort: a failure is reported together with its cause instead of stopping the notebook.
try:
    contiguous_usa = gpd.read_file(gplt.datasets.get_path('contiguous_usa')) # use for background map in subplot
    ax = gplt.webmap(contiguous_usa, projection=gcrs.WebMercator()) # set subplot

    dfruPoly = dfru.copy()
    dfruPoly = dfruPoly[dfruPoly['Geometry'] != ""].reset_index(drop=True) # drop rows with a blank Geometry value
    dfruPoly['Geometry'] = gpd.GeoSeries.from_wkt(dfruPoly['Geometry'], crs="EPSG:4326") # parse WKT text into geometries
    gdfruPoly = gpd.GeoDataFrame(dfruPoly, geometry=dfruPoly['Geometry'], crs="EPSG:4326") # convert to geodataframe
    #gdfruPoly['Geometry'] = gdfruPoly.simplify(0.001) # simplify the geometry. Lower the number the larger the exported file.
    gplt.polyplot(gdfruPoly, ax=ax)
    mplt.savefig(format="pdf", fname='DataAssessment/figures/ReportingUnitMap.pdf')
except Exception as err:
    # Exception (not a bare except) lets KeyboardInterrupt / SystemExit propagate;
    # printing the cause keeps unrelated failures from being mistaken for "no data".
    print('No geometry data to plot')
    print('Reason:', repr(err))

# Regulatory Overlay Info
- regulatoryoverlays.csv

In [None]:
# Row count of regulatoryoverlays.csv, followed by a one-row preview
print(dfro.shape[0])
dfro.head(n=1)

In [None]:
# OversightAgency: value frequencies, then a histogram of the WaDE values
print(dfro["OversightAgency"].value_counts())

fig = px.histogram(data_frame=dfro, x="OversightAgency")
fig.update_layout(
    title="Histogram of OversightAgency Entries in regulatoryoverlays.csv",
    xaxis_title="OversightAgency Value",
    yaxis_title="# of entries",
    bargap=0.2,
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# export the figure as pdf through the kaleido engine
fig.write_image('DataAssessment/figures/OversightAgency.pdf', engine="kaleido")

In [None]:
# RegulatoryName: value frequencies, then a histogram of the WaDE values
print(dfro["RegulatoryName"].value_counts())

fig = px.histogram(data_frame=dfro, x="RegulatoryName")
fig.update_layout(
    title="Histogram of RegulatoryName Entries in regulatoryoverlays.csv",
    xaxis_title="RegulatoryName Value",
    yaxis_title="# of entries",
    bargap=0.2,
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# export the figure as pdf through the kaleido engine
fig.write_image('DataAssessment/figures/RegulatoryName.pdf', engine="kaleido")

In [None]:
# RegulatoryOverlayTypeCV: value frequencies, then a histogram of the WaDE values
print(dfro["RegulatoryOverlayTypeCV"].value_counts())

fig = px.histogram(data_frame=dfro, x="RegulatoryOverlayTypeCV")
fig.update_layout(
    title="Histogram of RegulatoryOverlayTypeCV Entries in regulatoryoverlays.csv",
    xaxis_title="RegulatoryOverlayTypeCV Value",
    yaxis_title="# of entries",
    bargap=0.2,
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# export the figure as pdf through the kaleido engine
fig.write_image('DataAssessment/figures/RegulatoryOverlayTypeCV.pdf', engine="kaleido")

# Site Info (related to Overlays)
- sites.csv

In [None]:
# Row count of sites.csv, followed by a one-row preview
print(dfs.shape[0])
dfs.head(n=1)

In [None]:
# RegulatoryOverlayUUIDs: value frequencies, then a histogram of the WaDE values
print(dfs["RegulatoryOverlayUUIDs"].value_counts())

fig = px.histogram(data_frame=dfs, x="RegulatoryOverlayUUIDs")
fig.update_layout(
    title="Histogram of RegulatoryOverlayUUIDs Entries in sites.csv",
    xaxis_title="RegulatoryOverlayUUIDs Value",
    yaxis_title="# of entries",
    #legend_title="Legend Title",
    bargap=0.2,
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# export the figure as pdf through the kaleido engine
fig.write_image('DataAssessment/figures/SiteRegulatoryOverlayUUIDs.pdf', engine="kaleido")

In [None]:
# map the site info (this would be lat & long Points only)
# Plots sites that carry a RegulatoryOverlayUUIDs value on top of a web map and saves the figure to pdf.
# Best-effort: a failure is reported together with its cause instead of stopping the notebook.
try:
    contiguous_usa = gpd.read_file(gplt.datasets.get_path('contiguous_usa')) # use for background map in subplot
    ax = gplt.webmap(contiguous_usa, projection=gcrs.WebMercator()) # set subplot

    dfsPoint = dfs.copy()
    dfsPoint = dfsPoint[dfsPoint['RegulatoryOverlayUUIDs'] != ""] # keep only sites with a non-blank overlay UUID value
    gdfsPoint = gpd.GeoDataFrame(dfsPoint, geometry=gpd.points_from_xy(dfsPoint.Longitude.astype(float), dfsPoint.Latitude.astype(float)), crs="EPSG:4326")
    gplt.pointplot(gdfsPoint, hue='PODorPOUSite', legend=True, legend_var='hue', ax=ax)
    mplt.savefig(format="pdf", fname='DataAssessment/figures/PointInRegMap.pdf')

except Exception as err:
    # Exception (not a bare except) lets KeyboardInterrupt / SystemExit propagate;
    # printing the cause keeps unrelated failures from being mistaken for "no data".
    print('No point data to plot')
    print('Reason:', repr(err))

# Merge all figure pdfs into single output pdf

In [None]:
# merge all figure pdfs into single output pdf
#
# Notes:
# 'merger' is used for merging multiple files into one and merger.append(path) will append
#  the files one by one until all pdfs are appended in the result file.
#
# NOTE(review): PyPDF2 3.x appears to have replaced PdfFileMerger with PdfMerger and
# import_bookmarks with import_outline -- confirm the installed version before upgrading.

from PyPDF2 import PdfFileMerger

# Folder holding the individual figure pdfs, relative to the current working directory
filePath = os.path.join(os.getcwd(), 'DataAssessment', 'figures')

# os.listdir returns names in arbitrary order; sort so the merged page order is reproducible
pdfsList = sorted(os.listdir(filePath))
print(pdfsList)

merger = PdfFileMerger(strict=False)
try:
    for file in pdfsList:
        if file.endswith(".pdf"):
            path_with_file = os.path.join(filePath, file)
            print(path_with_file)
            merger.append(path_with_file,  import_bookmarks=False )
    merger.write("DataAssessment/Figures Merged Copy.pdf")
finally:
    # always release the merger's open file handles, even if an append or the write fails
    merger.close()

# Removed Records compared to Source Data
- this is working just fine
- just want to comment out temporarily

In [None]:
# Explode the *_missing.csv purge records by WaDEUUID, concat together
#################################################################
# Uses the two purge dataframes this notebook loads (dfrupurge, dfropurge).
# The earlier version referenced dfwspurge / dfspurge / dfaapurge, which are
# never defined in this notebook and raised a NameError.
# NOTE(review): assumes both purge files carry the WaDEUUID, ReasonRemoved and
# IncompleteField columns -- confirm against the csv headers.

# Explode reportingunits_missing.csv records by WaDEUUID (one row per comma separated id)
dfrupurgeCopy = dfrupurge.assign(WaDEUUID=dfrupurge['WaDEUUID'].str.split(',')).explode('WaDEUUID').reset_index(drop=True)
dfrupurgeCopy = dfrupurgeCopy[['WaDEUUID','ReasonRemoved','IncompleteField']]

# Explode regulatoryoverlays_missing.csv records by WaDEUUID (one row per comma separated id)
dfropurgeCopy = dfropurge.assign(WaDEUUID=dfropurge['WaDEUUID'].str.split(',')).explode('WaDEUUID').reset_index(drop=True)
dfropurgeCopy = dfropurgeCopy[['WaDEUUID','ReasonRemoved','IncompleteField']]

# concat purge dataframes together, dropping repeated rows
frames = [dfrupurgeCopy, dfropurgeCopy]
dfWaDEUUID = pd.concat(frames)
dfWaDEUUID = dfWaDEUUID.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(dfWaDEUUID))
dfWaDEUUID.head(1)

In [None]:
# # this is custom to the provided data

# # attach purge dataframe to Native Source Data
# # df1: OSE POD Data
# #################################################################

# if 'ReasonRemoved' in df1:
#     df1 = df1.drop(['ReasonRemoved', 'IncompleteField'], axis=1)

# df1Copy = dfWaDEUUID.merge(df1, how='right', on='WaDEUUID')
# df1Copy = df1Copy.groupby('WaDEUUID').agg(lambda x: ','.join([str(elem) for elem in (list(set(x))) if elem!=''])).replace(np.nan, "").reset_index()
# df1Copy.to_csv('RawInputData/OSE_PODs.zip', compression=dict(method='zip', archive_name='OSE_PODs.csv'), index=False)

# Custom Queries and Analysis for this Dataset

In [None]:
# asdf