# Data Assessment & Analytics
Notes:
- Change the OS working directory location before running.
- Be aware of the number of provided native source data files.

In [None]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import geoplot as gplt  # for plotting maps and geo-data
import geoplot.crs as gcrs  #used to pull in webdata related to maps and geo-data
import missingno as msno # creates a matrix chart to show missing values
import plotly.express as px
import plotly.graph_objects as go  # for subplot creation
from plotly.subplots import make_subplots  # for subplot creation
import matplotlib.pyplot as mplt  # use with gplt to save fig to pdf

# ---- cleanup ----
import re # string regular expression manipulation
from datetime import datetime # date and time manipulation
# Pandas display settings: show every column of a DataFrame in Jupyter, and
# print floats with five fixed decimals instead of scientific notation.
pd.set_option('display.max_columns', 999)
pd.set_option('display.float_format', '{:.5f}'.format)

In [None]:
# ---- working directory ----
# Pick the folder to run from (edit the assignment below to point somewhere
# else), switch into it, and report which folder is in use.
workingDirString = os.getcwd()
os.chdir(workingDirString)
print(f'The working Directory is: {workingDirString}')

In [None]:
# ---- where to find input files ----
# Root folder of the input files; the read_csv calls in this notebook append
# "/RawinputData/..." and "/ProcessedInputData/..." to it.
InputFolderString = (
    "G:/Shared drives/WaDE Data/WaDE Data Folder"
    "/GreatLakes/WaterUse_AggregatedArea"
)

In [None]:
# Native Input Data
#################################################################
# ---- This needs to be custom per state ----
rawInputFolder = f"{InputFolderString}/RawinputData"

# Data Set 1: WSWC Great Lakes Data 2012-2022
dfin1 = pd.read_csv(f"{rawInputFolder}/WSWC Great Lakes Data 2012-2022.zip")

# Data Set 2: Exploded_GLCompact_Basins_Jurisdictions
dfin2 = pd.read_csv(f"{rawInputFolder}/Exploded_GLCompact_Basins_Jurisdictions.zip")

In [None]:
# Input Data - Processed WaDE Input files
#################################################################
def _read_processed(file_name):
    """Load one CSV from ProcessedInputData with NaN cells replaced by ""."""
    csv_path = InputFolderString + "/ProcessedInputData/" + file_name
    return pd.read_csv(csv_path).replace(np.nan, "")


dfm = _read_processed("methods.csv")

dfv = _read_processed("variables.csv")

dfo = _read_processed("organizations.csv")

# The "*_missing.csv" files hold the records removed from each input.
dfws = _read_processed("watersources.csv")
dfwspurge = _read_processed("watersources_missing.csv")

dfru = _read_processed("reportingunits.csv")
dfrupurge = _read_processed("reportingunits_missing.csv")

dfag = _read_processed("aggregatedamounts.csv")
dfagpurge = _read_processed("aggregatedamounts_missing.csv")

# Water Source Info (watersources.csv)

In [None]:
# Row count of watersources.csv, then a preview of its first record.
print(dfws.shape[0])
dfws.head(n=1)

In [None]:
# Check what columns contain missing information.
# dfws was loaded with NaN replaced by "", and missingno only counts real
# nulls, so blanks are turned back into NaN before charting; otherwise every
# column would be reported as complete.
msno.bar(dfws.replace("", np.nan))

In [None]:
# Unique values for 'WaterSourceTypeCV'
# Printed one per line as a quoted, comma-terminated entry.
sourceTypes = dfws['WaterSourceTypeCV'].sort_values().unique()
for sourceType in sourceTypes:
    print('"' + sourceType + '",')

# Reporting Unit Info (reportingunits.csv)

In [None]:
# Row count of reportingunits.csv, then a preview of its first record.
print(dfru.shape[0])
dfru.head(n=1)

In [None]:
# Check what columns contain missing information.
# dfru was loaded with NaN replaced by "", and missingno only counts real
# nulls, so blanks are turned back into NaN before charting; otherwise every
# column would be reported as complete.
msno.bar(dfru.replace("", np.nan))

In [None]:
# Unique values for 'ReportingUnitName'
# Printed one per line as a quoted, comma-terminated entry.
unitNames = dfru['ReportingUnitName'].sort_values().unique()
for unitName in unitNames:
    print('"' + unitName + '",')

In [None]:
# Unique values for 'ReportingUnitTypeCV'
# Printed one per line as a quoted, comma-terminated entry.
unitTypes = dfru['ReportingUnitTypeCV'].sort_values().unique()
for unitType in unitTypes:
    print('"' + unitType + '",')

# Aggregated Amounts Info (aggregatedamounts.csv)

In [None]:
# Row count of aggregatedamounts.csv, then a preview of its first record.
print(dfag.shape[0])
dfag.head(n=1)

In [None]:
# Check what columns contain missing information.
# dfag was loaded with NaN replaced by "", and missingno only counts real
# nulls, so blanks are turned back into NaN before charting; otherwise every
# column would be reported as complete.
msno.bar(dfag.replace("", np.nan))

In [None]:
# Unique values for 'AllocationCropDutyAmount'
# Values are converted to text while printing, one quoted entry per line.
dutyAmounts = dfag['AllocationCropDutyAmount'].sort_values().unique()
for dutyAmount in dutyAmounts:
    print(f'"{dutyAmount}",')

In [None]:
# Unique values for 'CommunityWaterSupplySystem'
# Printed one per line as a quoted, comma-terminated entry.
supplySystems = dfag['CommunityWaterSupplySystem'].sort_values().unique()
for supplySystem in supplySystems:
    print('"' + supplySystem + '",')

In [None]:
# Unique values for 'CropTypeCV'
# Printed one per line as a quoted, comma-terminated entry.
cropTypes = dfag['CropTypeCV'].sort_values().unique()
for cropType in cropTypes:
    print('"' + cropType + '",')

In [None]:
# Unique values for 'CustomerTypeCV'
# Printed one per line as a quoted, comma-terminated entry.
customerTypes = dfag['CustomerTypeCV'].sort_values().unique()
for customerType in customerTypes:
    print('"' + customerType + '",')

In [None]:
# Unique values for 'IrrigationMethodCV'
# Printed one per line as a quoted, comma-terminated entry.
irrigationMethods = dfag['IrrigationMethodCV'].sort_values().unique()
for irrigationMethod in irrigationMethods:
    print('"' + irrigationMethod + '",')

In [None]:
# Unique values for 'PopulationServed'
# Values are formatted into the string instead of concatenated, so numeric
# entries print instead of raising TypeError (same approach as the
# 'AllocationCropDutyAmount' cell, which wraps each value in str()).
for x in dfag['PopulationServed'].sort_values().unique():
    print(f'"{x}",')

In [None]:
# Unique values for 'BeneficialUseCategory'
# Join every cell into one comma separated string, split it back into single
# entries, trim surrounding whitespace, and keep the sorted distinct values.
joinedUses = ','.join(dfag['BeneficialUseCategory'].astype(str))
uniqueList = sorted({use.strip() for use in joinedUses.split(',')})
uniqueList

## Markdown Table Creation

In [None]:
# methods.csv
# Print the first record (without the long MethodDescription column) as a
# markdown table. If there is no row labelled 0, e.g. the table is empty,
# print the whole table instead. Only KeyError is caught so that unrelated
# failures are not silently retried.
dfmShort = dfm.drop(['MethodDescription'], axis=1)
try:
    dftmp = dfmShort.loc[[0]].to_markdown()
except KeyError:
    dftmp = dfmShort.to_markdown()
print(dftmp)

In [None]:
# variables.csv
# Print the first record as a markdown table. If there is no row labelled 0,
# e.g. the table is empty, print the whole table instead. Only KeyError is
# caught so that unrelated failures are not silently retried.
try:
    dftmp = dfv.loc[[0]].to_markdown()
except KeyError:
    dftmp = dfv.to_markdown()
print(dftmp)

In [None]:
# organizations.csv
# Print the first record as a markdown table. If there is no row labelled 0,
# e.g. the table is empty, print the whole table instead. Only KeyError is
# caught so that unrelated failures are not silently retried.
try:
    dftmp = dfo.loc[[0]].to_markdown()
except KeyError:
    dftmp = dfo.to_markdown()
print(dftmp)

In [None]:
# watersources.csv
# Print the first record as a markdown table. If there is no row labelled 0,
# e.g. the table is empty, print the whole table instead. Only KeyError is
# caught so that unrelated failures are not silently retried.
try:
    dftmp = dfws.loc[[0]].to_markdown()
except KeyError:
    dftmp = dfws.to_markdown()
print(dftmp)

In [None]:
# reportingunits.csv
# Print the first record (without the Geometry column) as a markdown table.
# If there is no row labelled 0, e.g. the table is empty, print the whole
# table instead. Only KeyError is caught so that unrelated failures are not
# silently retried.
dfruShort = dfru.drop(['Geometry'], axis=1)
try:
    dftmp = dfruShort.loc[[0]].to_markdown()
except KeyError:
    dftmp = dfruShort.to_markdown()
print(dftmp)

In [None]:
# aggregatedamounts.csv
# Print the first record as a markdown table. If there is no row labelled 0,
# e.g. the table is empty, print the whole table instead. Only KeyError is
# caught so that unrelated failures are not silently retried.
try:
    dftmp = dfag.loc[[0]].to_markdown()
except KeyError:
    dftmp = dfag.to_markdown()
print(dftmp)

## Num of Record Summary

In [None]:
# Num of entries of source data
# One line per native source file, numbered in load order.
for sourceNum, sourceDf in enumerate((dfin1, dfin2), start=1):
    print(f"Num of Source #{sourceNum} Entries (rows): ", "|", len(sourceDf))

In [None]:
# Num of entries compiled into WaDE
# Prints a header row and one data row, pipe separated.
ruString = dfru.shape[0]
agString = dfag.shape[0]
print("Dataset  | Num of Identified Reporting Unit Areas | Num of Identified Aggregated Amount Records")
print(f"**Compiled WaDE Data** | {ruString} | {agString}")

## Why Removed Records Summary

In [None]:
# Tally the ReasonRemoved values recorded in watersources_missing.csv.
print("---- Reasons why records were removed from water source info ----")
if len(dfwspurge) == 0:
    print("...nothing removed.")
else:
    wsReasonCounts = dfwspurge['ReasonRemoved'].value_counts().astype(str)
    print(wsReasonCounts + " | removed from watersources.csv input")

In [None]:
# Tally the ReasonRemoved values recorded in reportingunits_missing.csv.
print(" ---- Reasons why records were removed from reportingunits info ---- ")
if len(dfrupurge) == 0:
    print("...nothing removed.")
else:
    ruReasonCounts = dfrupurge['ReasonRemoved'].value_counts().astype(str)
    print(ruReasonCounts + " | removed from reportingunits.csv input")

In [None]:
# Tally the ReasonRemoved values recorded in aggregatedamounts_missing.csv.
print(" ---- Reasons why records were removed from aggregatedamounts amount info ---- ")
if len(dfagpurge) == 0:
    print("...nothing removed.")
else:
    agReasonCounts = dfagpurge['ReasonRemoved'].value_counts().astype(str)
    print(agReasonCounts + " | removed from aggregatedamounts.csv input")

## Figures

In [None]:
# ---- merge watersources.csv to aggregatedamounts.csv to reportingunits.csv ----

dfagtemp = dfag.copy()

# Number of aggregated amount rows per ReportingUnitUUID, attached to every row
ruRecordCounts = dfagtemp['ReportingUnitUUID'].value_counts()
dfagtemp['RecordCountByRu'] = dfagtemp['ReportingUnitUUID'].map(ruRecordCounts)

# WaterSourceUUID may hold a comma separated list: one row per listed UUID
dfagtemp['WaterSourceUUID'] = dfagtemp['WaterSourceUUID'].str.split(',')
dfagtemp = dfagtemp.explode('WaterSourceUUID').reset_index(drop=True)

# Bring WaterSourceTypeCV over from watersources.csv (left join keeps every amount row)
wsTypeLookup = dfws[['WaterSourceUUID', 'WaterSourceTypeCV']]
dfagtemp_ws = dfagtemp.merge(wsTypeLookup, on='WaterSourceUUID', how='left')

# Attach the count and water source type to each reporting unit (left join)
dfrutemp = dfru.copy()
agColumnsForRu = dfagtemp_ws[['ReportingUnitUUID', 'RecordCountByRu', 'WaterSourceTypeCV']]
dfrutemp_ws = dfrutemp.merge(agColumnsForRu, on='ReportingUnitUUID', how='left')

print(dfrutemp_ws.shape[0])
dfrutemp_ws.head(n=1)

In [None]:
# ---- Histogram: Num of entries via WaterSourceTypeCV ----
# Counts come from dfagtemp_ws, the aggregated amount rows joined to their
# water source type.
print(dfagtemp_ws.WaterSourceTypeCV.value_counts())

fig = px.histogram(dfagtemp_ws, x="WaterSourceTypeCV", text_auto=True)
fig.update_layout(
    bargap=0.2,
    title="Histogram of WaterSourceTypeCV by ReportingUnit Areas",
    xaxis_title="WaterSourceTypeCV Value",
    yaxis_title="# of entries",
    font=dict(family="Arial Bold", size=12, color="Black"),
)
fig.show()

# write_image does not create missing folders, so make sure it exists first.
os.makedirs('figures', exist_ok=True)
fig.write_image('figures/WaterSourceTypeCV.png', engine="kaleido")

In [None]:
# ---- Histogram: Num of StateCV entries in reportingunits.csv ----
print(dfru.StateCV.value_counts())

fig = px.histogram(dfru, x="StateCV")
fig.update_layout(
    bargap=0.2,
    title="Histogram of StateCV Entries in reportingunit.csv",
    xaxis_title="StateCV Value",
    yaxis_title="# of entries",
    font=dict(family="Arial Bold", size=12, color="Black"),
)
fig.show()

# write_image does not create missing folders, so make sure it exists first.
os.makedirs('figures', exist_ok=True)
fig.write_image('figures/StateCV.pdf', engine="kaleido")

In [None]:
# Amount: violin plot (top) and histogram (bottom) of the Amount values
# Plotting stays best effort, but the actual error is reported instead of
# being hidden behind a generic message.
try:
    trace1 = go.Violin(x=dfag['Amount'], points='outliers', name='Violin Plot')
    trace2 = go.Histogram(x=dfag['Amount'], name='Histogram')

    fig = make_subplots(rows=2, cols=1)
    fig.add_trace(trace1, row=1, col=1)
    fig.add_trace(trace2, row=2, col=1)

    fig.update_layout(
        showlegend=False,
        bargap=0.2,
        title="Amount Distribution in aggregatedamounts.csv",
        font=dict(family="Arial Bold", size=12, color="Black"),
    )
    fig.update_xaxes(title_text="Amount Value", row=1, col=1)
    fig.update_xaxes(title_text="Amount Value", row=2, col=1)
    fig.update_yaxes(title_text="Num. of Records", row=2, col=1)
    fig.show(renderer="png")

    # write_image does not create missing folders, so make sure it exists first.
    os.makedirs('figures', exist_ok=True)
    fig.write_image('figures/Amount.pdf', engine="kaleido")
except Exception as err:
    print(f'Could not plot Amount value. ({type(err).__name__}: {err})')

In [None]:
# PrimaryUseCategoryCV: histogram distribution of WaDE values
print(dfag.PrimaryUseCategoryCV.value_counts())

fig = px.histogram(dfag, x="PrimaryUseCategoryCV")
fig.update_layout(
    bargap=0.2,
    title="Histogram of PrimaryUseCategoryCV Entries in aggregatedamounts.csv",
    xaxis_title="PrimaryUseCategoryCV Value",
    yaxis_title="# of entries",
    font=dict(family="Arial Bold", size=12, color="Black"),
)
fig.show()

# write_image does not create missing folders, so make sure it exists first.
os.makedirs('figures', exist_ok=True)
fig.write_image('figures/PrimaryUseCategoryCV.pdf', engine="kaleido")

In [None]:
# ReportYearCV: histogram distribution of WaDE values
print(dfag.ReportYearCV.value_counts())

fig = px.histogram(dfag, x="ReportYearCV")
fig.update_layout(
    bargap=0.2,
    title="Histogram of ReportYearCV Entries in aggregatedamounts.csv",
    xaxis_title="ReportYearCV Value",
    yaxis_title="# of entries",
    font=dict(family="Arial Bold", size=12, color="Black"),
)
fig.show()

# write_image does not create missing folders, so make sure it exists first.
os.makedirs('figures', exist_ok=True)
fig.write_image('figures/ReportYearCV.pdf', engine="kaleido")

## map

In [None]:
# Polygon Map

# Only reporting units that carry a geometry string can be drawn.
dfruPoly = dfru.copy()
dfruPoly = dfruPoly[dfruPoly['Geometry'] != ""].reset_index(drop=True)

if dfruPoly.empty:
    print('No geometry data to plot')
else:
    # Plotting stays best effort, but a real failure is reported as such
    # instead of being labelled as missing geometry.
    try:
        contiguous_usa = gpd.read_file(gplt.datasets.get_path('contiguous_usa'))  # background map
        ax = gplt.webmap(contiguous_usa, projection=gcrs.WebMercator())  # set subplot

        # Parse the WKT strings and convert to a geodataframe
        dfruPoly['Geometry'] = gpd.GeoSeries.from_wkt(dfruPoly['Geometry'], crs="EPSG:4326")
        gdfruPoly = gpd.GeoDataFrame(dfruPoly, geometry=dfruPoly['Geometry'], crs="EPSG:4326")
        gplt.choropleth(gdfruPoly, edgecolor='lightgray', linewidth=0.5, hue='ReportingUnitTypeCV', legend=True, ax=ax)

        # Make sure the output folder exists before saving into it.
        os.makedirs('figures', exist_ok=True)
        mplt.savefig(format="png", fname='figures/PolyMap.png')
    except Exception as err:
        print(f'Could not plot polygon map ({type(err).__name__}: {err})')

# Removed Records compared to Source Data
- this is working just fine
- just want to comment out temporarily

In [None]:
# # Explode purge.xlsx files by WaDEUUID, concat together
# #################################################################

# # Explode watersources_missing.csv records by WaDEUUID
# dfwspurgeCopy = dfwspurge.assign(WaDEUUID=dfwspurge['WaDEUUID'].str.split(',')).explode('WaDEUUID').reset_index(drop=True)
# dfwspurgeCopy = dfwspurgeCopy[['WaDEUUID','ReasonRemoved','IncompleteField']]

# # Explode reportingunits_missing.csv records by WaDEUUID
# dfrupurgeCopy = dfrupurge.assign(WaDEUUID=dfrupurge['WaDEUUID'].str.split(',')).explode('WaDEUUID').reset_index(drop=True)
# dfrupurgeCopy = dfrupurgeCopy[['WaDEUUID','ReasonRemoved','IncompleteField']]

# # Explode aggregatedamounts_missing.csv records by WaDEUUID
# dfagpurgeCopy = dfagpurge.assign(WaDEUUID=dfagpurge['WaDEUUID'].str.split(',')).explode('WaDEUUID').reset_index(drop=True)
# dfagpurgeCopy = dfagpurgeCopy[['WaDEUUID','ReasonRemoved','IncompleteField']]

# # concat purge dataframes together
# frames = [dfwspurgeCopy, dfrupurgeCopy, dfagpurgeCopy] 
# dfWaDEUUID = pd.concat(frames)
# dfWaDEUUID = dfWaDEUUID.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
# print(len(dfWaDEUUID))
# dfWaDEUUID.head(1)

# Custom Queries and Analysis for this Dataset

In [None]:
# N/A