# Data Assessment & Analytics
Notes:
- change os directory location
- be aware of the number of provided native source data files

In [None]:
# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd # the library that lets us read in shapefiles
import geoplot as gplt # for plotting maps
import geoplot.crs as gcrs #used to pull in webdata


# visualization
import missingno as msno # creates a matrix chart to show missing values
import plotly.express as px
import plotly.graph_objects as go # for subplot creation
from plotly.subplots import make_subplots # for subplot creation
import matplotlib.pyplot as mplt # use with gplt to save fig to pdf

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [None]:
# Set Working Directory
# The later cells read and write files through relative paths, so they are
# resolved against the folder set here. Edit the path for your own machine.
workingDir = "G:/Shared drives/WaDE Data/Nebraska/SS_ReservoirsObservationSites" # change here
os.chdir(workingDir)

In [None]:
# This needs to be custom per state

# Input Data
#################################################################
# Data 1: Stream Gage Data
# The folder is spelled "RawInputData" so that it matches the path the purge
# cells at the end of this notebook write the same zip back to. A differently
# cased spelling only works on case-insensitive filesystems.
df1 = pd.read_csv("RawInputData/P_caStreamObsRecords.zip", compression='zip')

In [None]:
# Input Data - Processed WaDE Input files
#################################################################
def _readProcessed(csvPath):
    """Read one processed WaDE csv and show missing values as empty strings."""
    table = pd.read_csv(csvPath)
    return table.replace(np.nan, "")


# Each table comes as a pair: the kept records and the removed ("_missing") records.

# water sources
dfws = _readProcessed("ProcessedInputData/watersources.csv")
dfwspurge = _readProcessed("ProcessedInputData/watersources_missing.csv")

# sites
dfs = _readProcessed("ProcessedInputData/sites.csv")
dfspurge = _readProcessed("ProcessedInputData/sites_missing.csv")

# site specific amounts
dfssro = _readProcessed("ProcessedInputData/sitespecificamounts.csv")
dfssropurge = _readProcessed("ProcessedInputData/sitespecificamounts_missing.csv")

# Water Source Info
- watersources.csv

In [None]:
# Number of rows in watersources.csv, then a one-row preview of its columns.
print(dfws.shape[0])
dfws.head(n=1)

In [None]:
# Check what columns contain missing information.
# Draws the missingno matrix chart for the water source table (dfws).
msno.matrix(dfws, figsize=(10,5), fontsize=10)

In [None]:
# Unique values for 'WaterSourceTypeCV'
# Each distinct value is printed on its own line, wrapped in double quotes
# and followed by a comma. Formatting the value inside the f-string (rather
# than concatenating with +) also works when a value is not a string.
for x in dfws['WaterSourceTypeCV'].sort_values().unique():
    print(f'"{x}",')

In [None]:
# WaterSourceTypeCV: histogram distribution of WaDE values
# Prints the count of each value, shows the histogram, and saves it as a pdf
# under DataAssessment/figures.
print(dfws.WaterSourceTypeCV.value_counts())

fig = px.histogram(dfws, x="WaterSourceTypeCV")
fig.update_layout(bargap=0.2,
                  # the title names the file the data was actually read from (watersources.csv)
                  title="Histogram of WaterSourceTypeCV Entries in watersources.csv",
                  xaxis_title="WaterSourceTypeCV Value",
                  yaxis_title="# of entries",
                  font=dict(
                      family="Arial Bold",
                      size=12,
                      color="Black")
                  )
fig.show()
fig.write_image('DataAssessment/figures/WaterSourceTypeCV.pdf', engine="kaleido")

# Site Info
- sites.csv

In [None]:
# Number of rows in sites.csv, then a one-row preview of its columns.
print(dfs.shape[0])
dfs.head(n=1)

In [None]:
# Check what columns contain missing information.
# Draws the missingno matrix chart for the sites table (dfs).
msno.matrix(dfs, figsize=(10,5), fontsize=10)

In [None]:
# Unique values for 'CoordinateMethodCV'
# Each distinct value is printed on its own line, wrapped in double quotes
# and followed by a comma. Formatting the value inside the f-string (rather
# than concatenating with +) also works when a value is not a string.
for x in dfs['CoordinateMethodCV'].sort_values().unique():
    print(f'"{x}",')

In [None]:
# Unique values for 'SiteTypeCV'
# Each distinct value is printed on its own line, wrapped in double quotes
# and followed by a comma. Formatting the value inside the f-string (rather
# than concatenating with +) also works when a value is not a string.
for x in dfs['SiteTypeCV'].sort_values().unique():
    print(f'"{x}",')

In [None]:
# SiteTypeCV: histogram distribution of WaDE values
# Print the per-value counts first, then draw and export the chart.
siteTypeCounts = dfs["SiteTypeCV"].value_counts()
print(siteTypeCounts)

# text styling applied to the whole figure
siteTypeFont = dict(family="Arial Bold", size=12, color="Black")

fig = px.histogram(dfs, x="SiteTypeCV")
fig.update_layout(
    title="Histogram of SiteTypeCV Entries in sites.csv",
    xaxis_title="SiteTypeCV Value",
    yaxis_title="# of entries",
    bargap=0.2,
    font=siteTypeFont,
)
fig.show()
fig.write_image('DataAssessment/figures/SiteTypeCV.pdf', engine="kaleido")

In [None]:
# PODorPOUSite: histogram distribution of WaDE values
# Print the per-value counts first, then draw and export the chart.
podPouCounts = dfs["PODorPOUSite"].value_counts()
print(podPouCounts)

# text styling applied to the whole figure
podPouFont = dict(family="Arial Bold", size=12, color="Black")

fig = px.histogram(dfs, x="PODorPOUSite")
fig.update_layout(
    title="Histogram of PODorPOUSite Entries in sites.csv",
    xaxis_title="PODorPOUSite Value",
    yaxis_title="# of entries",
    bargap=0.2,
    font=podPouFont,
)
fig.show()
fig.write_image('DataAssessment/figures/PODorPOUSite.pdf', engine="kaleido")

In [None]:
# map the site info (this would be lat & long Points only)
# Plots every site as a point (from its Longitude / Latitude columns) over a
# web basemap, coloured by PODorPOUSite, and saves the figure as a pdf.
# The map is best-effort: a failure is reported and the notebook keeps going.
try:
    contiguous_usa = gpd.read_file(gplt.datasets.get_path('contiguous_usa')) # use for background map in subplot
    ax = gplt.webmap(contiguous_usa, projection=gcrs.WebMercator()) # set subplot

    dfsPoint = dfs.copy()
    sitePoints = gpd.points_from_xy(dfsPoint.Longitude.astype(float), dfsPoint.Latitude.astype(float))
    gdfsPoint = gpd.GeoDataFrame(dfsPoint, geometry=sitePoints, crs="EPSG:4326")
    gplt.pointplot(gdfsPoint, hue='PODorPOUSite', legend=True, legend_var='hue', ax=ax)
    mplt.savefig(format="pdf", fname='DataAssessment/figures/PointMap.pdf')

except Exception as err:
    # Catch Exception rather than everything, so Ctrl-C / kernel interrupt still
    # works, and show the real cause (bad coordinate, failed basemap download, ...)
    # instead of hiding it behind the generic message.
    print('No point data to plot')
    print(f'  reason: {type(err).__name__}: {err}')

In [None]:
# Map poly info
# Plots the sites that carry a WKT 'Geometry' value as simplified polygons over
# a web basemap and saves the figure as a pdf.
# The map is best-effort: a failure is reported and the notebook keeps going.
try:
    contiguous_usa = gpd.read_file(gplt.datasets.get_path('contiguous_usa')) # use for background map in subplot
    ax = gplt.webmap(contiguous_usa, projection=gcrs.WebMercator()) # set subplot

    dfsPoly = dfs.copy()
    dfsPoly = dfsPoly[dfsPoly['Geometry'] != ""].reset_index(drop=True) # keep only rows that have a geometry string
    dfsPoly['Geometry'] = gpd.GeoSeries.from_wkt(dfsPoly['Geometry'], crs="EPSG:4326")

    # Name the existing 'Geometry' column as the active geometry. Passing the
    # Series itself makes geopandas add a separate active 'geometry' column, so
    # the simplified shapes written to 'Geometry' below would never reach the plot.
    gdfsPoly = gpd.GeoDataFrame(dfsPoly, geometry='Geometry', crs="EPSG:4326") # convert to geodataframe
    gdfsPoly['Geometry'] = gdfsPoly.simplify(0.001) # simplify the geometry. Lower the number the larger the exported file.
    gplt.polyplot(gdfsPoly, ax=ax)
    mplt.savefig(format="pdf", fname='DataAssessment/figures/PolyMap.pdf')
except Exception as err:
    # Catch Exception rather than everything, so Ctrl-C / kernel interrupt still
    # works, and show the real cause instead of hiding it behind the generic message.
    print('No geometry data to plot')
    print(f'  reason: {type(err).__name__}: {err}')

In [None]:
# Preview the polygon rows. dfsPoly is only assigned inside the try block of the
# "Map poly info" cell, so that cell has to have run far enough to create it.
dfsPoly.head()

# Site Specific Amount Info
- sitespecificamounts.csv

In [None]:
# Number of rows in sitespecificamounts.csv, then a one-row preview of its columns.
print(dfssro.shape[0])
dfssro.head(n=1)

In [None]:
# # Unique values for 'AllocationBasisCV'
# for x in dfssro['AllocationBasisCV'].sort_values().unique():
#     print(f'"' + str(x) + '",')

In [None]:
# Unique values for 'BeneficialUseCategory'
# A single cell may hold several comma-separated categories, so all cells are
# glued into one string, split on commas, trimmed, de-duplicated and sorted.
allUsesText = ','.join(dfssro['BeneficialUseCategory'].astype(str))
uniqueList = sorted({piece.strip() for piece in allUsesText.split(',')})
uniqueList

In [None]:
# Check what columns contain missing information.
# Draws the missingno matrix chart for the site specific amounts table (dfssro).
msno.matrix(dfssro, figsize=(10,5), fontsize=10)

In [None]:
# # Amount: Boxplot distribution of WaDE values
# try:
#     trace1 = go.Violin(x=dfssro['Amount'], points='outliers', name='Violin Plot')
#     trace2 = go.Histogram(x=dfssro['Amount'], name='Historgram')

#     fig = make_subplots(rows=2, cols=1)
#     fig.add_trace(trace1, row=1, col=1)
#     fig.add_trace(trace2, row=2, col=1)

#     fig.update_layout(showlegend=False, bargap=0.2, title="Amount Distribution in sitespecificamounts.csv", font=dict(family="Arial Bold", size=12,color="Black"))
#     fig.update_xaxes(title_text="Amount Value", row=1, col=1)
#     fig.update_xaxes(title_text="Amount Value", row=2, col=1)
#     fig.update_yaxes(title_text="Num. of Records", row=2, col=1)
#     fig.show()
#     fig.write_image('DataAssessment/figures/Amount.pdf', engine="kaleido")

# except: print('Could not plot Amount value.')

In [None]:
# PrimaryUseCategory: histogram distribution of WaDE values
# Print the per-value counts first, then draw and export the chart.
primaryUseCounts = dfssro["PrimaryUseCategory"].value_counts()
print(primaryUseCounts)

# text styling applied to the whole figure
primaryUseFont = dict(family="Arial Bold", size=12, color="Black")

fig = px.histogram(dfssro, x="PrimaryUseCategory")
fig.update_layout(
    title="Histogram of PrimaryUseCategory Entries in sitespecificamounts.csv",
    xaxis_title="PrimaryUseCategory Value",
    yaxis_title="# of entries",
    bargap=0.2,
    font=primaryUseFont,
)
fig.show()
fig.write_image('DataAssessment/figures/PrimaryUseCategory.pdf', engine="kaleido")

In [None]:
# ReportYearCV: histogram distribution of WaDE values
# Print the per-value counts first, then draw and export the chart.
reportYearCounts = dfssro["ReportYearCV"].value_counts()
print(reportYearCounts)

# text styling applied to the whole figure
reportYearFont = dict(family="Arial Bold", size=12, color="Black")

fig = px.histogram(dfssro, x="ReportYearCV")
fig.update_layout(
    title="Histogram of ReportYearCV Entries in sitespecificamounts.csv",
    xaxis_title="ReportYearCV Value",
    yaxis_title="# of entries",
    bargap=0.2,
    font=reportYearFont,
)
fig.show()
fig.write_image('DataAssessment/figures/ReportYearCV.pdf', engine="kaleido")

In [None]:
# merge all figure pdfs into single output pdf

# Notes:
# 'merger' collects the input files one at a time: each merger.append(path) adds
# that file after the ones already appended, and merger.write(...) saves the
# combined result.
# NOTE(review): PdfFileMerger / import_bookmarks are the older PyPDF2 names; newer
# PyPDF2 releases appear to have renamed them (PdfMerger / import_outline) -
# confirm against the installed version.

from PyPDF2 import PdfFileMerger

# folder holding the individual figure pdfs, relative to the working directory
filePath = str(os.getcwd()) + '/DataAssessment/figures'

# os.listdir returns names in arbitrary order; sorting makes the page order of
# the merged pdf the same on every run.
pdfsList = sorted(os.listdir(filePath))
print(pdfsList)

merger = PdfFileMerger(strict=False)
try:
    for pdfName in pdfsList:
        if pdfName.endswith(".pdf"):
            path_with_file = os.path.join(filePath, pdfName)
            print(path_with_file)
            merger.append(path_with_file, import_bookmarks=False)
    merger.write("DataAssessment/Figures Merged Copy.pdf")
finally:
    # release the merger even when an append or the write fails
    merger.close()

# Removed Records compared to Source Data
- this is working just fine
- just want to comment out temporarily

In [None]:
# Explode the purge ("_missing") tables by WaDEUUID, concat together
#################################################################

def _explodeByWaDEUUID(purgeTable):
    """Give each comma-separated WaDEUUID its own row; keep only the purge columns."""
    uuidLists = purgeTable['WaDEUUID'].str.split(',')
    perUuid = purgeTable.assign(WaDEUUID=uuidLists).explode('WaDEUUID')
    perUuid = perUuid.reset_index(drop=True)
    return perUuid[['WaDEUUID','ReasonRemoved','IncompleteField']]


# one exploded copy per purge table: water sources, sites, site specific amounts
dfwspurgeCopy = _explodeByWaDEUUID(dfwspurge)
dfspurgeCopy = _explodeByWaDEUUID(dfspurge)
dfssropurgeCopy = _explodeByWaDEUUID(dfssropurge)

# stack the three purge tables, drop exact duplicate rows, blank out missing values
frames = [dfwspurgeCopy, dfspurgeCopy, dfssropurgeCopy]
dfWaDEUUID = pd.concat(frames)
dfWaDEUUID = dfWaDEUUID.drop_duplicates()
dfWaDEUUID = dfWaDEUUID.reset_index(drop=True)
dfWaDEUUID = dfWaDEUUID.replace(np.nan, "")
print(len(dfWaDEUUID))
dfWaDEUUID.head(1)

In [None]:
# this is custom to the provided data

# attach purge dataframe to Native Source Data
# Data 1: Stream Gage Data
#################################################################

def _joinDistinctValues(values):
    """Join the distinct, non-missing, non-blank values of one group with commas.

    Missing values (NaN) are skipped so they do not end up as the literal text
    "nan", and values keep their first-seen order so the output is the same on
    every run.
    """
    distinct = dict.fromkeys(v for v in values if not pd.isna(v) and v != '')
    return ','.join(str(v) for v in distinct)


# Drop purge columns left over from a previous run of this cell (the result is
# written back to the same zip file name below). errors='ignore' drops whichever
# of the two columns exist instead of failing when only one is present.
df1 = df1.drop(columns=['ReasonRemoved', 'IncompleteField'], errors='ignore')

# right merge: keep every native record, attach a removal reason where one exists
df1Copy = dfWaDEUUID.merge(df1, how='right', on='WaDEUUID')

# collapse to one row per WaDEUUID
df1Copy = df1Copy.groupby('WaDEUUID').agg(_joinDistinctValues).replace(np.nan, "").reset_index()
df1Copy.to_csv('RawInputData/P_caStreamObsRecords.zip', compression=dict(method='zip', archive_name='P_caStreamObsRecords.csv'), index=False)

In [None]:
# this is custom to the provided data

# attach purge dataframe to Native Source Data
# Data 2: Reservoir Level Data
#################################################################

def _joinDistinctValues(values):
    """Join the distinct, non-missing, non-blank values of one group with commas.

    Missing values (NaN) are skipped so they do not end up as the literal text
    "nan", and values keep their first-seen order so the output is the same on
    every run.
    """
    distinct = dict.fromkeys(v for v in values if not pd.isna(v) and v != '')
    return ','.join(str(v) for v in distinct)


if 'df2' not in globals():
    # Only run when a second native source file was actually loaded as df2;
    # otherwise report it instead of stopping the notebook with a NameError.
    print('df2 is not defined - no Reservoir Level Data was loaded, skipping.')
else:
    # Drop purge columns left over from a previous run of this cell.
    # errors='ignore' drops whichever of the two columns exist instead of
    # failing when only one is present.
    df2 = df2.drop(columns=['ReasonRemoved', 'IncompleteField'], errors='ignore')

    # right merge: keep every native record, attach a removal reason where one exists
    df2Copy = dfWaDEUUID.merge(df2, how='right', on='WaDEUUID')

    # collapse to one row per WaDEUUID
    df2Copy = df2Copy.groupby('WaDEUUID').agg(_joinDistinctValues).replace(np.nan, "").reset_index()
    df2Copy.to_csv('RawInputData/P_caReservoirObsRecords.zip', compression=dict(method='zip', archive_name='P_caReservoirObsRecords.csv'), index=False)

# Custom Queries and Analysis for this Dataset

In [None]:
# ???