# Data Assessment & Analytics
Notes:
- change os directory location
- be aware of the number of provided native source data files
- be aware of what data is available

In [None]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import plotly.express as px
import plotly.graph_objects as go  # for subplot creation
from plotly.subplots import make_subplots  # for subplot creation
import matplotlib.pyplot as mplt  # use with gplt to save fig to pdf

# ---- cleanup ----
import re # string regular expression manipulation
from datetime import datetime # date and time manipulation
# Show up to 999 columns when a DataFrame is displayed in the notebook.
pd.options.display.max_columns = 999
# Render floats with five fixed decimals instead of scientific notation.
pd.options.display.float_format = lambda value: '%.5f' % value

In [None]:
# ---- working directory ----
# Set the working directory folder string here; as written it is simply the
# directory the notebook was started from.
workingDirString = os.getcwd()
os.chdir(workingDirString)
print('The working Directory is:', workingDirString)

In [None]:
# WaDE Processed Input Data
#################################################################
# Each zipped CSV is read into a DataFrame, then every NaN is swapped for an
# empty string.

# site info with ws
dfs_ws = pd.read_csv("data/sites_ws.zip")
dfs_ws = dfs_ws.replace(np.nan, "")

# waterallocations info
dfaa = pd.read_csv("data/waterallocations.zip")
dfaa = dfaa.replace(np.nan, "")

# organizations info
dfo = pd.read_csv("data/organizations.zip")
dfo = dfo.replace(np.nan, "")

## Figures

In [None]:
# ---- Histogram: number of POD sites vs. POU sites ----
# Print the raw counts so the chart can be cross-checked against the numbers.
print(dfs_ws["PODorPOUSite"].value_counts())

fig = px.histogram(dfs_ws, x="PODorPOUSite").update_layout(
    title="Histogram of Point-of-Diversion (POD) / Place-of-Use (POU) Entries per Site",
    yaxis_title="# of entries",
    bargap=0.2,
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# Export a static PNG copy of the figure through the kaleido engine.
fig.write_image("figures/PODorPOUSite.png", engine="kaleido")

In [None]:
# ---- Histogram: number of sites per WaterSourceTypeCV ----
# Print the per-category counts alongside the chart.
print(dfs_ws["WaterSourceTypeCV"].value_counts())

fig = px.histogram(dfs_ws, x="WaterSourceTypeCV").update_layout(
    title="Histogram of Water Source Type Entries per Site",
    xaxis_title="Water Source Category",
    yaxis_title="# of entries",
    bargap=0.2,
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# Export a static PNG copy of the figure through the kaleido engine.
fig.write_image("figures/WaterSourceTypeCV.png", engine="kaleido")

In [None]:
# ---- Histogram: distribution of PrimaryBeneficialUseCategory WaDE values ----
# Print the per-category counts alongside the chart.
print(dfaa["PrimaryBeneficialUseCategory"].value_counts())

fig = px.histogram(dfaa, x="PrimaryBeneficialUseCategory").update_layout(
    title="Histogram of WaDE Primary Beneficial Use Entries in of Water Rights Records",
    xaxis_title="Primary Beneficial Use Category",
    yaxis_title="# of entries",
    bargap=0.2,
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# Export a static PNG copy of the figure through the kaleido engine.
fig.write_image("figures/PrimaryBeneficialUseCategory.png", engine="kaleido")

In [None]:
# ---- AllocationPriorityDate #1: histogram distribution of WaDE values ----
# The printed counts cover every record; the plot below uses the filtered subset.
print(dfaa["AllocationPriorityDate"].value_counts())

# Keep only rows whose ExemptOfVolumeFlowPriority value is below 1.
# NOTE(review): presumably this drops records flagged as exempt - confirm.
dfaatemp = dfaa.loc[dfaa["ExemptOfVolumeFlowPriority"] < 1].reset_index(drop=True)

fig = px.histogram(dfaatemp, x="AllocationPriorityDate").update_layout(
    title="Histogram of Priority Date Entries in of Water Rights Records",
    xaxis_title="Priority Date Value",
    yaxis_title="# of entries",
    bargap=0.2,
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# Export a static PNG copy of the figure through the kaleido engine.
fig.write_image("figures/AllocationPriorityDate1.png", engine="kaleido")

In [None]:
# ---- AllocationPriorityDate #2: cumulative distribution of WaDE values ----
# The printed counts cover every record; the plot below uses the filtered subset.
print(dfaa["AllocationPriorityDate"].value_counts())

# Keep only rows whose ExemptOfVolumeFlowPriority value is below 1.
# NOTE(review): presumably this drops records flagged as exempt - confirm.
dfaatemp = dfaa.loc[dfaa["ExemptOfVolumeFlowPriority"] < 1].reset_index(drop=True)

# ecdfnorm=None is passed so the y axis is not normalised.
fig = px.ecdf(dfaatemp, x="AllocationPriorityDate", ecdfnorm=None).update_layout(
    title="Cumulative Distribution of Priority Date Entries in of Water Rights Records",
    xaxis_title="Priority Date Value",
    yaxis_title="# of entries",
    bargap=0.2,
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# Export a static PNG copy of the figure through the kaleido engine.
fig.write_image("figures/AllocationPriorityDate2.png", engine="kaleido")

In [None]:
# ---- AllocationLegalStatusCV: histogram distribution of WaDE values ----
# Print the per-status counts alongside the chart.
print(dfaa["AllocationLegalStatusCV"].value_counts())

fig = px.histogram(dfaa, x="AllocationLegalStatusCV").update_layout(
    title="Histogram of Legal Status of Water Rights Records",
    xaxis_title="Legal Status Value",
    yaxis_title="# of entries",
    bargap=0.2,
    font={"family": "Arial Bold", "size": 12, "color": "Black"},
)
fig.show()
# Export a static PNG copy of the figure through the kaleido engine.
fig.write_image("figures/AllocationLegalStatusCV.png", engine="kaleido")

In [None]:
# ---- State: histogram distribution of WaDE values ----
# Print the per-state counts so they match the column plotted below
# (the cell previously printed the AllocationLegalStatusCV counts).
print(dfaa["State"].value_counts())

fig = px.histogram(dfaa, x="State")
fig.update_layout(
    bargap=0.2,
    title="Histogram of Water Right Records entries per State",
    xaxis_title="State",
    yaxis_title="# of entries",
    font=dict(family="Arial Bold", size=12, color="Black"),
)
fig.show()
# Export a static PNG copy of the figure through the kaleido engine.
fig.write_image('figures/State.png', engine="kaleido")

In [None]:
# ---- State: Table of Water Right Count per State ----
# Build a two-column frame: the state and how many records it has.
# rename_axis + reset_index(name=...) fixes both column names explicitly, so the
# result does not depend on how the installed pandas version labels the output
# of value_counts() (pandas >= 2.0 yields "State"/"count" rather than
# "index"/"State", which broke the earlier rename-based approach).
dftemp = (
    dfaa['State']
    .value_counts()
    .rename_axis('State')
    .reset_index(name='Count of WR Records')
)

fig = go.Figure(data=[go.Table(header=dict(values=list(dftemp.columns),
                                           line_color='darkslategray',
                                           fill_color='royalblue',
                                           align='center',
                                           font=dict(color='white', size=16),
                                           height=30),
                               cells=dict(values=[dftemp['State'], dftemp['Count of WR Records']],
                                          line_color='darkslategray',
                                          # one fill colour per table column
                                          fill=dict(color=['paleturquoise', 'white']),
                                          align='center',
                                          font=dict(size=12),
                                          height=30)
                              )])

fig.show()
# Export a static PNG copy of the table through the kaleido engine.
fig.write_image('figures/TState.png', engine="kaleido")

In [None]:
# ---- Organization: Table #1 of Organization by WaDE data type ----

# reset_index() turns the row index into a regular 'index' column, which is
# shown as the first column of the table.
dftemp = dfo.reset_index()

fig = go.Figure(
    data=[
        go.Table(
            columnwidth=[20, 70, 30, 100, 80],
            header={
                "values": list(dftemp.columns),
                "line_color": "darkslategray",
                "fill_color": "royalblue",
                "align": "center",
                "font": {"color": "white", "size": 16},
                "height": 30,
            },
            cells={
                "values": [
                    dftemp["index"],
                    dftemp["WaDEDataType"],
                    dftemp["State"],
                    dftemp["OrganizationName"],
                    dftemp["OrganizationWebsite"],
                ],
                "line_color": "darkslategray",
                # one alignment per table column
                "align": ["center", "left", "center", "left", "left"],
                "font": {"size": 12},
                "height": 30,
            },
        )
    ]
)
# Fixed canvas size instead of automatic sizing.
fig.update_layout(autosize=False, width=1000, height=3100)

fig.show()
# Export a static PNG copy of the table through the kaleido engine.
fig.write_image("figures/T1Organization.png", engine="kaleido")

In [None]:
# ---- Organization: Table #2 of Organization by WaDE data type ----
# One row per (OrganizationName, State): the distinct WaDEDataType values are
# listed in a single cell and counted; OrganizationWebsite is dropped.


def _join_unique(values):
    """Return the distinct non-empty values as one sorted, ', '-separated string."""
    # Sorting keeps the text stable between runs (plain set order is not).
    return ', '.join(sorted({str(elem) for elem in values if elem != ''}))


def _count_unique(values):
    """Return how many distinct non-empty values there are."""
    return len({str(elem) for elem in values if elem != ''})


grouped = dfo.drop(columns=['OrganizationWebsite']).groupby(['OrganizationName', 'State'])

# Count the distinct values directly rather than by re-splitting the joined
# text: this gives 0 for a group with no data type, and stays correct even if
# a value contains a comma.
type_counts = grouped['WaDEDataType'].agg(_count_unique)

dftemp = grouped.agg(_join_unique)
dftemp['Num WaDE Data Contributions'] = type_counts  # aligned on the group keys
# First reset restores the group keys as columns; second adds the 'index' column.
dftemp = dftemp.reset_index().reset_index()

fig = go.Figure(data=[go.Table(columnwidth=[15, 50, 30, 120, 50],
                               header=dict(values=list(dftemp.columns),
                                           line_color='darkslategray',
                                           fill_color='royalblue',
                                           align='center',
                                           font=dict(color='white', size=16),
                                           height=30),
                               cells=dict(values=[dftemp['index'], dftemp['OrganizationName'], dftemp['State'], dftemp['WaDEDataType'], dftemp['Num WaDE Data Contributions']],
                                          line_color='darkslategray',
                                          # one alignment per table column
                                          align=['center', 'left', 'left', 'left', 'center'],
                                          font=dict(size=12),
                                          height=30)
                              )])
# Fixed canvas size instead of automatic sizing.
fig.update_layout(autosize=False, width=1000, height=2200)

fig.show()
# Export a static PNG copy of the table through the kaleido engine.
fig.write_image('figures/T2Organization.png', engine="kaleido")

In [None]:
# ---- AllocationFlow_CFS: violin plot + histogram distribution of WaDE values ----

# Best-effort cell: if the figure cannot be built or exported, report why and
# carry on. Only Exception is caught so that KeyboardInterrupt / SystemExit
# still stop the notebook.
try:
    trace1 = go.Violin(x=dfaa['AllocationFlow_CFS'], points='outliers', name='Violin Plot')
    trace2 = go.Histogram(x=dfaa['AllocationFlow_CFS'], name='Historgram')

    # Violin plot on the top row, histogram on the bottom row.
    fig = make_subplots(rows=2, cols=1)
    fig.add_trace(trace1, row=1, col=1)
    fig.add_trace(trace2, row=2, col=1)

    fig.update_layout(showlegend=False, bargap=0.2, title="Flow Distribution in Water Right Records", font=dict(family="Arial Bold", size=12,color="Black"))
    fig.update_xaxes(title_text="Flow Value", row=1, col=1)
    fig.update_xaxes(title_text="Flow Value", row=2, col=1)
    fig.update_yaxes(title_text="Num. of Records", row=2, col=1)
    fig.show()
    # Export a static PNG copy of the figure through the kaleido engine.
    fig.write_image('figures/AllocationFlow_CFS.png', engine="kaleido")

except Exception as err:
    # Include the underlying error so the failure can be diagnosed.
    print(f'Could not plot AllocationFlow_CFS value: {err!r}')

In [None]:
# ---- AllocationVolume_AF: violin plot + histogram distribution of WaDE values ----

# Best-effort cell: if the figure cannot be built or exported, report why and
# carry on. Only Exception is caught so that KeyboardInterrupt / SystemExit
# still stop the notebook.
try:
    trace1 = go.Violin(x=dfaa['AllocationVolume_AF'], points='outliers', name='Violin Plot')
    trace2 = go.Histogram(x=dfaa['AllocationVolume_AF'], name='Historgram')

    # Violin plot on the top row, histogram on the bottom row.
    fig = make_subplots(rows=2, cols=1)
    fig.add_trace(trace1, row=1, col=1)
    fig.add_trace(trace2, row=2, col=1)

    fig.update_layout(showlegend=False, bargap=0.2, title="Volume Distribution in Water Right Records", font=dict(family="Arial Bold", size=12,color="Black"))
    fig.update_xaxes(title_text="Volume Value", row=1, col=1)
    fig.update_xaxes(title_text="Volume Value", row=2, col=1)
    fig.update_yaxes(title_text="Num. of Records", row=2, col=1)
    fig.show()
    # Export a static PNG copy of the figure through the kaleido engine.
    fig.write_image('figures/AllocationVolume_AF.png', engine="kaleido")

except Exception as err:
    # Include the underlying error so the failure can be diagnosed.
    print(f'Could not plot AllocationVolume_AF value: {err!r}')