# Data Assessment & Analytics
Notes:
- change os directory location
- be aware of the number of provided native source data files
- be aware of what data is available

In [None]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import geoplot as gplt  # for plotting maps and geo-data
import geoplot.crs as gcrs  #used to pull in webdata related to maps and geo-data
import missingno as msno # creates a matrix chart to show missing values
import plotly.express as px
import plotly.graph_objects as go  # for subplot creation
from plotly.subplots import make_subplots  # for subplot creation
import matplotlib.pyplot as mplt  # use with gplt to save fig to pdf

# ---- cleanup ----
import re # string regular expression manipulation
from datetime import datetime # date and time manipulation
# Notebook display settings.
# Allow up to 999 columns to be shown when a DataFrame is rendered.
pd.options.display.max_columns = 999
# Render floats with five fixed decimals rather than scientific notation.
pd.options.display.float_format = '{:.5f}'.format

In [None]:
# ---- working directory ----
# Uses the current working directory as-is; assign a different folder path
# string to workingDirString here to run against another location.
workingDirString = os.getcwd()
os.chdir(workingDirString)
print('The working Directory is:', workingDirString)

In [None]:
# WaDE Processed Input Data
#################################################################
# Each table is read from its zipped CSV, then every NaN is swapped for an
# empty string.

# site info with ws
dfs_ws = pd.read_csv("data/sites_ws.zip")
dfs_ws = dfs_ws.replace(to_replace=np.nan, value="")

# just sub data of site with geometry (currently disabled)
#dfs_geo = pd.read_csv("data/sites_geo.zip").replace(np.nan, "")

# waterallocations info
dfaa = pd.read_csv("data/waterallocations.zip")
dfaa = dfaa.replace(to_replace=np.nan, value="")

## Num of Record Summary

In [None]:
# Row counts: sites flagged POD, sites flagged POU, and all water right records.
print("Num of Identified PODs: ", (dfs_ws['PODorPOUSite'] == 'POD').sum())
print("Num of Identified POUs: ", (dfs_ws['PODorPOUSite'] == 'POU').sum())
print("Num of Identified Water Right Records: ", dfaa.shape[0])

## Figures

In [None]:
# Preview the first row of the site table (dfs_ws).
dfs_ws.head(1)

In [None]:
# Preview the first row of the water allocations table (dfaa).
dfaa.head(1)

In [None]:
# ---- Histogram: number of POD sites vs. POU sites ----
# Print the per-category counts before drawing the chart.
print(dfs_ws["PODorPOUSite"].value_counts())

fig = px.histogram(dfs_ws, x="PODorPOUSite")
fig.update_layout(
    bargap=0.2,
    title="Histogram of Point-of-Diversion (POD) / Place-of-Use (POU) Entries per Site",
    yaxis_title="# of entries",
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# Save a static PNG copy of the figure using the kaleido engine.
fig.write_image("figures/PODorPOUSite.png", engine="kaleido")

In [None]:
# ---- Histogram: number of sites per WaterSourceTypeCV ----
# Print the per-category counts before drawing the chart.
print(dfs_ws["WaterSourceTypeCV"].value_counts())

fig = px.histogram(dfs_ws, x="WaterSourceTypeCV")
fig.update_layout(
    bargap=0.2,
    title="Histogram of Water Source Type Entries per Site",
    xaxis_title="Water Source Category",
    yaxis_title="# of entries",
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# Save a static PNG copy of the figure using the kaleido engine.
fig.write_image("figures/WaterSourceTypeCV.png", engine="kaleido")

In [None]:
# ---- Histogram: distribution of PrimaryBeneficialUseCategory WaDE values ----
# Print the per-category counts before drawing the chart.
print(dfaa["PrimaryBeneficialUseCategory"].value_counts())

fig = px.histogram(dfaa, x="PrimaryBeneficialUseCategory")
fig.update_layout(
    bargap=0.2,
    title="Histogram of WaDE Primary Beneficial Use Entries in of Water Rights Records",
    xaxis_title="Primary Beneficial Use Category",
    yaxis_title="# of entries",
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# Save a static PNG copy of the figure using the kaleido engine.
fig.write_image("figures/PrimaryBeneficialUseCategory.png", engine="kaleido")

In [None]:
# ---- AllocationPriorityDate #1: histogram distribution of WaDE values ----
print(dfaa["AllocationPriorityDate"].value_counts())

# Chart only the rows whose ExemptOfVolumeFlowPriority value is below 1,
# renumbered from zero.
dfaatemp = dfaa.loc[dfaa["ExemptOfVolumeFlowPriority"] < 1].reset_index(drop=True)

fig = px.histogram(dfaatemp, x="AllocationPriorityDate")
fig.update_layout(
    bargap=0.2,
    title="Histogram of Priority Date Entries in of Water Rights Records",
    xaxis_title="Priority Date Value",
    yaxis_title="# of entries",
    # legend_title="Legend Title",
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# Save a static PNG copy of the figure using the kaleido engine.
fig.write_image("figures/AllocationPriorityDate1.png", engine="kaleido")

In [None]:
# ---- AllocationPriorityDate #2: cumulative distribution of WaDE values ----
print(dfaa["AllocationPriorityDate"].value_counts())

# Chart only the rows whose ExemptOfVolumeFlowPriority value is below 1,
# renumbered from zero.
dfaatemp = dfaa.loc[dfaa["ExemptOfVolumeFlowPriority"] < 1].reset_index(drop=True)

# ecdfnorm=None is passed so the y axis is not normalised.
fig = px.ecdf(dfaatemp, x="AllocationPriorityDate", ecdfnorm=None)
fig.update_layout(
    bargap=0.2,
    title="Cumulative Distribution of Priority Date Entries in of Water Rights Records",
    xaxis_title="Priority Date Value",
    yaxis_title="# of entries",
    # legend_title="Legend Title",
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# Save a static PNG copy of the figure using the kaleido engine.
fig.write_image("figures/AllocationPriorityDate2.png", engine="kaleido")

In [None]:
# ---- AllocationLegalStatusCV: histogram distribution of WaDE values ----
# Print the per-category counts before drawing the chart.
print(dfaa["AllocationLegalStatusCV"].value_counts())

fig = px.histogram(dfaa, x="AllocationLegalStatusCV")
fig.update_layout(
    bargap=0.2,
    title="Histogram of Legal Status of Water Rights Records",
    xaxis_title="Legal Status Value",
    yaxis_title="# of entries",
    # legend_title="Legend Title",
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# Save a static PNG copy of the figure using the kaleido engine.
fig.write_image("figures/AllocationLegalStatusCV.png", engine="kaleido")

In [None]:
# ---- State: histogram distribution of WaDE values ----
# Print the per-state record counts so the text summary matches the column
# that is charted below (this cell previously printed AllocationLegalStatusCV).
print(dfaa.State.value_counts())

fig = px.histogram(dfaa, x="State")
fig.update_layout(bargap=0.2,
                  title="Histogram of Water Right Records entries per State",
                  xaxis_title="State",
                  yaxis_title="# of entries",
                    font=dict(
                        family="Arial Bold",
                        size=12,
                        color="Black")
                 )
fig.show()
# Save a static PNG copy of the figure using the kaleido engine.
fig.write_image('figures/State.png', engine="kaleido")

In [None]:
# ---- State: Table of Water Right Count per State ----
# Build a two-column frame: State, Count of WR Records.
# The column labels are set explicitly rather than renamed afterwards, because
# the default labels from value_counts().reset_index() differ between pandas
# 1.x ("index"/"State") and pandas 2.x ("State"/"count"); renaming by the old
# labels leaves no "State" column on 2.x.
dftemp = (
    dfaa['State']
    .value_counts()
    .rename_axis('State')
    .reset_index(name='Count of WR Records')
)

fig = go.Figure(data=[go.Table(header=dict(values=list(dftemp.columns),
                                           line_color='darkslategray',
                                           fill_color='royalblue',
                                           align='center',
                                           font=dict(color='white', size=16),
                                           height=30),
                               cells=dict(values=[dftemp['State'], dftemp['Count of WR Records']],
                                          line_color='darkslategray',
                                          fill=dict(color=['paleturquoise', 'white']),
                                          align='center',
                                          font=dict(size=12),
                                          height=30)
                              )])

fig.show()
# Save a static PNG copy of the table using the kaleido engine.
fig.write_image('figures/TState.png', engine="kaleido")

In [None]:
# ---- AllocationFlow_CFS: violin plot and histogram of WaDE values ----
# Best-effort cell: if anything in the plotting steps fails, report the reason
# and carry on with the rest of the notebook.

try:
    trace1 = go.Violin(x=dfaa['AllocationFlow_CFS'], points='outliers', name='Violin Plot')
    trace2 = go.Histogram(x=dfaa['AllocationFlow_CFS'], name='Historgram')

    # Two stacked panels: violin plot in row 1, histogram in row 2.
    fig = make_subplots(rows=2, cols=1)
    fig.add_trace(trace1, row=1, col=1)
    fig.add_trace(trace2, row=2, col=1)

    fig.update_layout(showlegend=False, bargap=0.2, title="Flow Distribution in Water Right Records", font=dict(family="Arial Bold", size=12,color="Black"))
    fig.update_xaxes(title_text="Flow Value", row=1, col=1)
    fig.update_xaxes(title_text="Flow Value", row=2, col=1)
    fig.update_yaxes(title_text="Num. of Records", row=2, col=1)
    fig.show()
    fig.write_image('figures/AllocationFlow_CFS.png', engine="kaleido")

except Exception as err:
    # Exception (not a bare except) so KeyboardInterrupt / SystemExit still
    # propagate; the cause is printed instead of being silently discarded.
    print(f'Could not plot AllocationFlow_CFS value. ({type(err).__name__}: {err})')

In [None]:
# ---- AllocationVolume_AF: violin plot and histogram of WaDE values ----
# Best-effort cell: if anything in the plotting steps fails, report the reason
# and carry on with the rest of the notebook.

try:
    trace1 = go.Violin(x=dfaa['AllocationVolume_AF'], points='outliers', name='Violin Plot')
    trace2 = go.Histogram(x=dfaa['AllocationVolume_AF'], name='Historgram')

    # Two stacked panels: violin plot in row 1, histogram in row 2.
    fig = make_subplots(rows=2, cols=1)
    fig.add_trace(trace1, row=1, col=1)
    fig.add_trace(trace2, row=2, col=1)

    fig.update_layout(showlegend=False, bargap=0.2, title="Volume Distribution in Water Right Records", font=dict(family="Arial Bold", size=12,color="Black"))
    fig.update_xaxes(title_text="Volume Value", row=1, col=1)
    fig.update_xaxes(title_text="Volume Value", row=2, col=1)
    fig.update_yaxes(title_text="Num. of Records", row=2, col=1)
    fig.show()
    fig.write_image('figures/AllocationVolume_AF.png', engine="kaleido")

except Exception as err:
    # Exception (not a bare except) so KeyboardInterrupt / SystemExit still
    # propagate; the cause is printed instead of being silently discarded.
    print(f'Could not plot AllocationVolume_AF value. ({type(err).__name__}: {err})')

In [None]:
# # ---- Map of Points sites ----

# dfstemp = dfs.copy()
# dfstemp = dfstemp[dfstemp['Geometry'] == ''].reset_index(drop=True)

# try:
#     contiguous_usa = gpd.read_file(gplt.datasets.get_path('contiguous_usa')) # use for background map in subplot
#     ax = gplt.webmap(contiguous_usa, projection=gcrs.WebMercator()) # set subplot
#     gdfstemp = gpd.GeoDataFrame(dfstemp, geometry=gpd.points_from_xy(dfstemp.Longitude.astype(float), dfstemp.Latitude.astype(float)), crs="EPSG:4326")
#     gplt.pointplot(gdfstemp, hue='PODorPOUSite', legend=True, legend_var='hue', ax=ax)
#     mplt.savefig(format="png", fname='figures/PointMap.png') 
# except:
#     print('No point data to plot')

In [None]:
# # ---- Map of Polygons ----

# dfstemp = dfs.copy()
# dfstemp = dfstemp[dfstemp['Geometry'] != ""].reset_index(drop=True)

# try:
#     contiguous_usa = gpd.read_file(gplt.datasets.get_path('contiguous_usa')) # use for background map in subplot
#     ax = gplt.webmap(contiguous_usa, projection=gcrs.WebMercator()) # set subplot

#     dfstemp['Geometry'] = gpd.GeoSeries.from_wkt(dfstemp['Geometry'], crs="EPSG:4326")
#     gdfstemp = gpd.GeoDataFrame(dfstemp, geometry=dfstemp['Geometry'], crs="EPSG:4326") # convert to geodataframe
#     gplt.polyplot(gdfstemp, ax=ax)
#     mplt.savefig(format="png", fname='figures/PolyMap.png')
# except:
#     print('No geometry data to plot')

# Removed Records compared to Source Data
- This section works as-is; it is commented out temporarily and kept for future use.

In [None]:
# # Explode purge.xlsx files by WaDEUUID, concat together
# #################################################################

# # Explode watersources_missing.xlsx records by WaDEUUID
# dfwspurgeCopy = dfwspurge.assign(WaDEUUID=dfwspurge['WaDEUUID'].str.split(',')).explode('WaDEUUID').reset_index(drop=True)
# dfwspurgeCopy = dfwspurgeCopy[['WaDEUUID','ReasonRemoved','IncompleteField']]

# # Explode sites_missing.xlsx records by WaDEUUID
# dfspurgeCopy = dfspurge.assign(WaDEUUID=dfspurge['WaDEUUID'].str.split(',')).explode('WaDEUUID').reset_index(drop=True)
# dfspurgeCopy = dfspurgeCopy[['WaDEUUID','ReasonRemoved','IncompleteField']]

# # Explode waterallocations_missing.xlsx records by WaDEUUID
# dfaapurgeCopy = dfaapurge.assign(WaDEUUID=dfaapurge['WaDEUUID'].str.split(',')).explode('WaDEUUID').reset_index(drop=True)
# dfaapurgeCopy = dfaapurgeCopy[['WaDEUUID','ReasonRemoved','IncompleteField']]

# # concat purge dataframes together
# frames = [dfwspurgeCopy, dfspurgeCopy, dfaapurgeCopy] 
# dfWaDEUUID = pd.concat(frames)
# dfWaDEUUID = dfWaDEUUID.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
# print(len(dfWaDEUUID))
# dfWaDEUUID.head(1)

# Custom Queries and Analysis for this Dataset

In [None]:
# N/A