In [None]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns #use sns.__version__ to see version should be 0.11.1
import matplotlib.pyplot as plt
import datetime as dt

# Global matplotlib styling applied to the figures in this notebook.
plt.rcParams.update({
    "axes.spines.right": False,  # hide the right axis line
    "axes.spines.top": False,    # hide the top axis line
    "font.size": 14,             # default font size
})

# Increase the default DPI of saved figures to improve resolution.
#plt.rcParams['figure.dpi']= 50 # for preview
plt.rcParams["savefig.dpi"] = 300 # for saving

# Routine water quality data

In [2]:
# Load Kateri's processed water quality CSV into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../../data/raw/ul_data_wqp_processed_2020-03-05.csv",
)

In [3]:
# Keep only the four columns used by the rest of the notebook.
df = df[[
    "MonitoringLocationIdentifier",
    "datetime",
    "CharacteristicName",
    "ResultMeasureValue",
]].copy()

In [68]:
# Reshape from long to wide: one row per (location, datetime) pair and one
# column per characteristic, then blank out the name of the column axis.
wq = df.pivot_table(
    values="ResultMeasureValue",
    index=["MonitoringLocationIdentifier", "datetime"],
    columns="CharacteristicName",
).rename_axis(columns="")

In [69]:
# Move the (location, datetime) index back into regular columns and parse
# the datetime column into a datetime type.
wq = wq.reset_index().assign(
    datetime=lambda frame: pd.to_datetime(frame["datetime"])
)

In [70]:
# Average every parameter per location and per calendar day.
# Grouping approach: https://pbpython.com/pandas-grouper-agg.html
# Offsets: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
daily_bins = pd.Grouper(key="datetime", freq="D")
value_columns = wq.columns[2:]  # everything after the location and datetime columns
wq = (
    wq.groupby(["MonitoringLocationIdentifier", daily_bins])[value_columns]
    .mean()
    .reset_index()
)
wq["datetime"] = pd.to_datetime(wq["datetime"])

In [71]:
# Preview the first five rows of the daily per-location table.
wq.head(n=5)

Unnamed: 0,MonitoringLocationIdentifier,datetime,"Alkalinity, total",Aluminum,Ammonia-nitrogen,Arsenic,Barium,Bicarbonate,"Biochemical oxygen demand, standard conditions",Boron,...,Total Kjeldahl nitrogen,Total dissolved solids,Total fixed solids,Total suspended solids,Total volatile solids,Turbidity,Volatile suspended solids,Weather condition (WMO code 4501) (choice list),Zinc,pH
0,NALMS-7458,2012-06-27,,,,,,,,,...,,,,,,,,0.0,,
1,NALMS-7458,2015-07-03,,,,,,,,,...,,,,,,,,0.0,,8.0
2,USGS-401327111462601,2016-08-10,163.0,,,,,,,,...,,1037.386667,,,,110.0,,,,
3,USGS-401432111454301,2016-08-10,165.0,,,,,,,,...,,1034.04,,,,74.0,,,,
4,USGS-401613111463301,2016-08-10,164.0,,,,,,,,...,,980.683333,,,,100.0,,,,


In [72]:
# Display the (rows, columns) dimensions of the daily per-location table.
wq.shape

(1243, 77)

In [73]:
# Summarise the sampling dates (count, unique dates, first/last date).
wq["datetime"].describe()

count                    1243
unique                    214
top       2018-05-16 00:00:00
freq                       20
first     1978-08-31 00:00:00
last      2019-09-23 00:00:00
Name: datetime, dtype: object

Two reasons to collapse spatial variability
* Social media data does not have spatial information
* Water quality parameters do not have a consistent temporal or spatial pattern of measurements

In [10]:
# Next step: collapse spatial variability by dropping the location column and
# averaging (or taking the maximum of) the daily values.
# First check how many unique dates there are.
wq["datetime"].describe()

count                    1243
unique                    214
top       2018-05-16 00:00:00
freq                       20
first     1978-08-31 00:00:00
last      2019-09-23 00:00:00
Name: datetime, dtype: object

In [11]:
# Keep observations from 2016 onward (timestamps strictly later than 2015-12-31).
from_2016 = wq["datetime"] > "2015-12-31"
wq_16 = wq[from_2016]
wq_16["datetime"].describe()

In [12]:
# Drop the location column by name rather than by position. A positional
# slice (iloc[:, 1:]) removes one more column every time the cell is re-run,
# silently discarding datetime and then the parameters. errors="ignore" makes
# a re-run a no-op once the column is already gone.
wq_16 = wq_16.drop(columns="MonitoringLocationIdentifier", errors="ignore")

In [13]:
# Preview the first five rows of the post-2015 table without locations.
wq_16.head(n=5)

Unnamed: 0,datetime,"Alkalinity, total",Aluminum,Ammonia-nitrogen,Arsenic,Barium,Bicarbonate,"Biochemical oxygen demand, standard conditions",Boron,Bromide,...,Total Kjeldahl nitrogen,Total dissolved solids,Total fixed solids,Total suspended solids,Total volatile solids,Turbidity,Volatile suspended solids,Weather condition (WMO code 4501) (choice list),Zinc,pH
2,2016-08-10,163.0,,,,,,,,0.398,...,,1037.386667,,,,110.0,,,,
3,2016-08-10,165.0,,,,,,,,0.395,...,,1034.04,,,,74.0,,,,
4,2016-08-10,164.0,,,,,,,,0.379,...,,980.683333,,,,100.0,,,,
5,2016-08-10,166.0,,,,,,,,0.395,...,,1050.723333,,,,59.0,,,,
13,2016-09-26,,,,,,,,,,...,,,,,,,,,,


In [14]:
# Take the daily average across all locations.
# Select the value columns from wq_16 itself and leave out the grouping key.
# The previous selection (wq.columns[1:]) came from a different frame and
# included "datetime"; on pandas versions that average datetime columns the
# result then carries "datetime" both as index and as column, which makes the
# following reset_index() fail.
# NOTE(review): a lone time-based Grouper may also emit rows for days that have
# no samples (all-NaN rows) — confirm that this is intended downstream.
wq_16 = wq_16.groupby(
    pd.Grouper(key="datetime", freq="D")
)[wq_16.columns.drop("datetime")].mean()

In [15]:
# Turn the datetime index back into a regular column, then show summary
# statistics for the daily averages.
wq_16 = wq_16.reset_index(drop=False)
wq_16.describe()

Unnamed: 0,"Alkalinity, total",Aluminum,Ammonia-nitrogen,Arsenic,Barium,Bicarbonate,"Biochemical oxygen demand, standard conditions",Boron,Bromide,Cadmium,...,Total Kjeldahl nitrogen,Total dissolved solids,Total fixed solids,Total suspended solids,Total volatile solids,Turbidity,Volatile suspended solids,Weather condition (WMO code 4501) (choice list),Zinc,pH
count,41.0,14.0,53.0,27.0,26.0,3.0,0.0,35.0,1.0,4.0,...,20.0,64.0,0.0,63.0,39.0,38.0,24.0,0.0,11.0,42.0
mean,196.534002,450.118152,0.137085,9.505821,86.302973,225.177778,,321.041172,0.39175,0.081125,...,2.418558,1136.164226,,69.300348,12.542847,56.815465,25.958808,,9.288606,8.50793
std,23.029922,292.484772,0.206261,3.832338,13.355922,11.42635,,96.127825,,0.036456,...,2.672706,275.475429,,46.341656,10.4446,30.049953,14.828525,,3.007211,0.18338
min,159.0,7.275,0.01,2.28,60.55,212.0,,109.0,0.39175,0.0585,...,0.310167,386.0,,7.0,3.12,6.4,11.023699,,5.04,8.210972
25%,179.5,331.246429,0.0245,7.47225,79.409091,221.6,,277.15,0.39175,0.058875,...,1.0065,1027.29072,,35.39125,6.460167,34.516667,15.418086,,6.344,8.385891
50%,193.181818,410.403409,0.0415,9.239091,85.658766,231.2,,337.375,0.39175,0.0655,...,1.478292,1203.392857,,60.866667,8.270625,56.325,17.80432,,9.98,8.478534
75%,204.0,682.583333,0.184875,11.543636,91.529545,231.766667,,378.45,0.39175,0.08775,...,1.812906,1289.308141,,93.216667,14.159091,68.544097,34.145894,,11.35,8.604687
max,257.0,904.25,0.998667,17.65,131.496667,232.333333,,475.0,0.39175,0.135,...,10.776667,1770.0,,237.0,52.0,160.0,71.462264,,13.9,8.992857


In [16]:
# Keep the date plus the parameters of interest in an independent copy.
predictor_columns = [
    "datetime",
    "Depth, Secchi disk depth",
    "Turbidity",
    "Total suspended solids",
    "Chlorophyll a, uncorrected for pheophytin",
    "Chlorophyll a, corrected for pheophytin",
    "Chlorophyll a, free of pheophytin",
]
pred = wq_16[predictor_columns].copy()

In [None]:
# https://www.youtube.com/watch?v=3d_8nQpSCgE&t=637s 