In [1]:
# Import libraries
import pandas as pd
import numpy as np
import datetime as dt

Naming convention for dataframes:

* sd - Secchi disk depth

* t - turbidity

* tss - total suspended solids

* c - chlorophyll

* tcc - total cell count

* tb - total biovolume

* ccc - cyano cell count

* cb - cyano biovolume

# Data Wrangling

## Secchi depth

In [2]:
# Import pre-interpolated data and post-interpolated data
# pre_sd  - Secchi depth samples, read from CSV
# post_sd - post-interpolation table, read from the "merge" Excel workbook
pre_sd = pd.read_csv("../../data/processed/secchi_depth.csv")
post_sd = pd.read_excel("../../data/processed/Secchi_merge.xlsx")

In [3]:
# assign explanatory variables in a list for easy use later

In [4]:
# Preview the first rows of the pre-interpolation Secchi depth samples
pre_sd.head()

Unnamed: 0,mlid,location,datetime,lat,long,secchi_depth_meters
0,USGS-401327111462601,UTAH LAKE HAB STUDY SITE 3,20160810,40.224119,-111.773939,
1,USGS-401432111454301,UTAH LAKE HAB STUDY SITE 4,20160810,40.242311,-111.761811,
2,USGS-401613111463301,UTAH LAKE HAB STUDY SITE 1,20160810,40.270319,-111.775881,
3,USGS-401658111491601,UTAH LAKE HAB STUDY SITE 2,20160810,40.2827,-111.8212,
4,UTAHDWQ_WQX-4917305,Utah Lake at American Fork Marina near boat ramp,20160926,40.34238,-111.800839,


In [5]:
# Preview the first rows of the post-interpolation Secchi depth table
post_sd.head()

Unnamed: 0,OID,OBJECTID_1,COUNT,AREA,MEAN,STD,SUM,X25.,X50.,X75.
0,0,20160420,27254,0.036135,0.326584,0.065224,8900.717586,0.292608,0.326837,0.379791
1,1,20160525,27254,0.036135,0.2,0.0,5450.800081,0.2,0.2,0.2
2,2,20160630,27254,0.036135,0.2,0.0,5450.800081,0.2,0.2,0.2
3,3,20170510,27254,0.036135,0.266793,0.010521,7271.177156,0.256585,0.267754,0.272464
4,4,20170511,27254,0.036135,0.271572,0.011862,7401.413938,0.265434,0.269956,0.278423


In [6]:
# Keep only the samples that have a recorded Secchi depth
pre_sd = pre_sd.loc[pre_sd['secchi_depth_meters'].notna()]

In [7]:
# (rows, columns) of the Secchi depth samples
pre_sd.shape

(230, 6)

In [8]:
# Number of distinct sampling dates in the pre-interpolation data
pre_sd.datetime.nunique()

38

In [9]:
# Number of distinct dates (OBJECTID_1) in the post-interpolation table
post_sd.OBJECTID_1.nunique()

33

In [10]:
# Samples whose date has no row in the post-interpolation table
# (original note: dates with only 1 value, so spatial interpolation is not needed).
# pandas equivalent of SQL "NOT IN":
# https://stackoverflow.com/questions/19960077/how-to-filter-pandas-dataframe-using-in-and-not-in-like-in-sql
interpolated_dates_sd = post_sd['OBJECTID_1']
single_sd = pre_sd.loc[~pre_sd['datetime'].isin(interpolated_dates_sd)]
single_sd.head()

Unnamed: 0,mlid,location,datetime,lat,long,secchi_depth_meters
257,UTAHDWQ_WQX-4917450,UTAH LAKE AT MIDDLE OF PROVO BAY,20170918,40.189139,-111.699931,0.2
258,UTAHDWQ_WQX-4917450,UTAH LAKE AT MIDDLE OF PROVO BAY,20171010,40.189139,-111.699931,0.38
260,UTAHDWQ_WQX-4917450,UTAH LAKE AT MIDDLE OF PROVO BAY,20180517,40.189139,-111.699931,0.25
261,UTAHDWQ_WQX-4917450,UTAH LAKE AT MIDDLE OF PROVO BAY,20180611,40.189139,-111.699931,0.1
324,UTAHDWQ_WQX-4917600,UTAH LAKE GOSHEN BAY SOUTHWEST END,20171017,40.060235,-111.874384,0.2


In [11]:
# Append rows that were not used in interpolation.
# Each such sample becomes one row: OID is flagged "n", OBJECTID_1 is the
# sample date, and the measured value is used for MEAN, SUM and the
# quartile columns. Columns not listed here are left missing (NaN) by concat.
sd_value = single_sd['secchi_depth_meters']  # by label, not by position
single_rows_sd = pd.DataFrame({
    'OID': 'n',
    'OBJECTID_1': single_sd['datetime'],
    'MEAN': sd_value,
    'SUM': sd_value,
    'X25.': sd_value,
    'X50.': sd_value,
    'X75.': sd_value,
})
# A single concat replaces DataFrame.append in a loop: append was removed in
# pandas 2.0 and copied the whole frame on every iteration.
post_sd = pd.concat([post_sd, single_rows_sd], ignore_index=True)

In [12]:
# Inspect the last 10 rows to check the appended (OID == "n") rows
post_sd.tail(10)

Unnamed: 0,OID,OBJECTID_1,COUNT,AREA,MEAN,STD,SUM,X25.,X50.,X75.
28,28,20190617,27254.0,0.0361352,0.650252,0.152027,17721.959504,0.549797,0.630033,0.737957
29,29,20190618,27254.0,0.0361352,0.345823,0.043238,9425.064244,0.305284,0.348735,0.389007
30,30,20190708,27254.0,0.0361352,0.294211,0.0362082,8018.421409,0.270048,0.298638,0.304595
31,31,20190812,27254.0,0.0361352,0.291811,0.0537108,7953.004422,0.273472,0.278827,0.290563
32,32,20190923,27254.0,0.0361352,0.25621,0.0470815,6982.756469,0.220447,0.251533,0.270792
33,n,20170918,,,0.2,,0.2,0.2,0.2,0.2
34,n,20171010,,,0.38,,0.38,0.38,0.38,0.38
35,n,20180517,,,0.25,,0.25,0.25,0.25,0.25
36,n,20180611,,,0.1,,0.1,0.1,0.1,0.1
37,n,20171017,,,0.2,,0.2,0.2,0.2,0.2


## Turbidity

In [13]:
# Import pre-interpolated data and post-interpolated data
# pre_t  - turbidity samples, read from CSV
# post_t - post-interpolation table, read from the "merge" Excel workbook
pre_t = pd.read_csv("../../data/processed/turbidity.csv")
post_t = pd.read_excel("../../data/processed/turbidity_merge.xlsx")

In [14]:
# assign explanatory variables in a list for easy use later

In [15]:
# Preview the first rows of the pre-interpolation turbidity samples
pre_t.head()

Unnamed: 0,mlid,location,datetime,lat,long,turbidity
0,USGS-401327111462601,UTAH LAKE HAB STUDY SITE 3,20160810,40.224119,-111.773939,110.0
1,USGS-401432111454301,UTAH LAKE HAB STUDY SITE 4,20160810,40.242311,-111.761811,74.0
2,USGS-401613111463301,UTAH LAKE HAB STUDY SITE 1,20160810,40.270319,-111.775881,100.0
3,USGS-401658111491601,UTAH LAKE HAB STUDY SITE 2,20160810,40.2827,-111.8212,59.0
4,UTAHDWQ_WQX-4917305,Utah Lake at American Fork Marina near boat ramp,20160926,40.34238,-111.800839,


In [16]:
# Preview the first rows of the post-interpolation turbidity table
post_t.head()

Unnamed: 0,OID,OBJECTID_1,COUNT,AREA,MIN,MAX,RANGE,MEAN,STD,SUM,X25.,X50.,X75.
0,0,20160810,27254,0.036135,59.000137,109.995926,50.995789,85.229099,8.579957,2322834.0,79.266926,86.921856,90.624678
1,1,20170510,27254,0.036135,33.504688,130.998306,97.493618,51.014825,18.29207,1390358.0,42.205771,45.639376,48.808829
2,2,20170511,27254,0.036135,29.902393,54.098595,24.196201,38.008367,3.353599,1035880.0,36.238186,38.380987,38.875797
3,3,20170615,27254,0.036135,19.306004,75.298866,55.992863,32.834742,10.1718,894878.1,28.327065,29.883014,32.734902
4,4,20170711,27254,0.036135,44.313015,208.996475,164.68346,79.25874,31.058095,2160118.0,61.766941,66.322838,89.202103


In [17]:
# Keep only the samples that have a recorded turbidity value
pre_t = pre_t.loc[pre_t['turbidity'].notna()]

In [18]:
# (rows, columns) of the turbidity samples
pre_t.shape

(225, 6)

In [19]:
# Number of distinct sampling dates in the pre-interpolation data
pre_t.datetime.nunique()

38

In [20]:
# Number of distinct dates (OBJECTID_1) in the post-interpolation table
post_t.OBJECTID_1.nunique()

32

In [21]:
# Samples whose date has no row in the post-interpolation table
# (original note: dates with only 1 value, so spatial interpolation is not needed).
# pandas equivalent of SQL "NOT IN":
# https://stackoverflow.com/questions/19960077/how-to-filter-pandas-dataframe-using-in-and-not-in-like-in-sql
interpolated_dates_t = post_t['OBJECTID_1']
single_t = pre_t.loc[~pre_t['datetime'].isin(interpolated_dates_t)]
single_t.head()

Unnamed: 0,mlid,location,datetime,lat,long,turbidity
257,UTAHDWQ_WQX-4917450,UTAH LAKE AT MIDDLE OF PROVO BAY,20170918,40.189139,-111.699931,79.0
258,UTAHDWQ_WQX-4917450,UTAH LAKE AT MIDDLE OF PROVO BAY,20171010,40.189139,-111.699931,8.91
260,UTAHDWQ_WQX-4917450,UTAH LAKE AT MIDDLE OF PROVO BAY,20180517,40.189139,-111.699931,25.6
261,UTAHDWQ_WQX-4917450,UTAH LAKE AT MIDDLE OF PROVO BAY,20180611,40.189139,-111.699931,105.0
324,UTAHDWQ_WQX-4917600,UTAH LAKE GOSHEN BAY SOUTHWEST END,20171017,40.060235,-111.874384,46.2


In [22]:
# Append rows that were not used in interpolation.
# Each such sample becomes one row: OID is flagged "n", OBJECTID_1 is the
# sample date, and the measured value is used for MEAN, SUM and the
# quartile columns. Columns not listed here (COUNT, AREA, MIN, MAX, RANGE,
# STD) are left missing (NaN) by concat.
t_value = single_t['turbidity']  # by label, not by position
single_rows_t = pd.DataFrame({
    'OID': 'n',
    'OBJECTID_1': single_t['datetime'],
    'MEAN': t_value,
    'SUM': t_value,
    'X25.': t_value,
    'X50.': t_value,
    'X75.': t_value,
})
# A single concat replaces DataFrame.append in a loop: append was removed in
# pandas 2.0 and copied the whole frame on every iteration.
post_t = pd.concat([post_t, single_rows_t], ignore_index=True)

In [23]:
# Inspect the last 10 rows to check the appended (OID == "n") rows
post_t.tail(10)

Unnamed: 0,OID,OBJECTID_1,COUNT,AREA,MIN,MAX,RANGE,MEAN,STD,SUM,X25.,X50.,X75.
28,28,20190618,27254.0,0.0361352,37.2001,64.549,27.3489,51.943398,5.37159,1415665.0,50.108074,51.7372,53.588611
29,29,20190708,27254.0,0.0361352,33.4397,165.494,132.054,72.326562,28.5274,1971188.0,50.302506,61.098532,88.363346
30,30,20190812,27254.0,0.0361352,48.1126,87.5472,39.4347,60.476896,5.32161,1648237.0,57.758058,60.466404,62.093691
31,31,20190923,27254.0,0.0361352,46.7046,104.049,57.3443,66.235042,8.68276,1805170.0,62.19749,64.621334,67.690212
32,n,20170918,,,,,,79.0,,79.0,79.0,79.0,79.0
33,n,20171010,,,,,,8.91,,8.91,8.91,8.91,8.91
34,n,20180517,,,,,,25.6,,25.6,25.6,25.6,25.6
35,n,20180611,,,,,,105.0,,105.0,105.0,105.0,105.0
36,n,20171017,,,,,,46.2,,46.2,46.2,46.2,46.2
37,n,20160107,,,,,,6.4,,6.4,6.4,6.4,6.4


## Total suspended solids

In [24]:
# Import pre-interpolated data and post-interpolated data
# pre_tss  - total suspended solids samples, read from CSV
# post_tss - post-interpolation table, read from the "merge" Excel workbook
pre_tss = pd.read_csv("../../data/processed/total_suspended_solids.csv")
post_tss = pd.read_excel("../../data/processed/total_suspended_solids_merge.xlsx")

In [25]:
# Preview the first rows of the pre-interpolation total suspended solids samples
pre_tss.head()

Unnamed: 0,mlid,location,datetime,lat,long,total_suspended_soilds
0,USGS-401327111462601,UTAH LAKE HAB STUDY SITE 3,20160810,40.224119,-111.773939,
1,USGS-401432111454301,UTAH LAKE HAB STUDY SITE 4,20160810,40.242311,-111.761811,
2,USGS-401613111463301,UTAH LAKE HAB STUDY SITE 1,20160810,40.270319,-111.775881,
3,USGS-401658111491601,UTAH LAKE HAB STUDY SITE 2,20160810,40.2827,-111.8212,
4,UTAHDWQ_WQX-4917305,Utah Lake at American Fork Marina near boat ramp,20160926,40.34238,-111.800839,


In [26]:
# Preview the first rows of the post-interpolation total suspended solids table
post_tss.head()

Unnamed: 0,OID,OBJECTID_1,COUNT,AREA,MIN,MAX,RANGE,MEAN,STD,SUM,X25.,X50.,X75.
0,0,20160210,27254,0.036135,5.128242,10.706867,5.578625,7.420204,1.13973,202230.2,6.724619,7.171308,8.14137
1,1,20160309,27254,0.036135,41.844074,176.386063,134.541988,107.057277,22.441072,2917739.0,93.791235,108.807423,117.412632
2,2,20160310,27254,0.036135,17.918865,252.526886,234.608021,85.200913,54.750808,2322066.0,44.364865,62.771431,119.2225
3,3,20160420,27254,0.036135,29.600904,127.997116,98.396212,51.209973,17.415235,1395677.0,43.227881,48.478968,51.38736
4,4,20160524,27254,0.036135,34.875626,120.215744,85.340118,75.706134,8.364905,2063295.0,75.114706,77.440876,78.273655


In [27]:
# Keep only the samples that have a recorded total suspended solids value
# (the column name is spelled 'total_suspended_soilds' in the data file)
pre_tss = pre_tss.loc[pre_tss['total_suspended_soilds'].notna()]

In [28]:
# (rows, columns) of the total suspended solids samples
pre_tss.shape

(352, 6)

In [29]:
# Number of distinct sampling dates in the pre-interpolation data
pre_tss.datetime.nunique()

63

In [30]:
# Number of distinct dates (OBJECTID_1) in the post-interpolation table
post_tss.OBJECTID_1.nunique()

56

In [31]:
# Samples whose date has no row in the post-interpolation table
# (original note: dates with only 1 value, so spatial interpolation is not needed).
# pandas equivalent of SQL "NOT IN":
# https://stackoverflow.com/questions/19960077/how-to-filter-pandas-dataframe-using-in-and-not-in-like-in-sql
interpolated_dates_tss = post_tss['OBJECTID_1']
single_tss = pre_tss.loc[~pre_tss['datetime'].isin(interpolated_dates_tss)]
single_tss.head()

Unnamed: 0,mlid,location,datetime,lat,long,total_suspended_soilds
215,UTAHDWQ_WQX-4917433,Utah Lake SP @ Marina,20160204,40.238428,-111.738826,10.0
257,UTAHDWQ_WQX-4917450,UTAH LAKE AT MIDDLE OF PROVO BAY,20170918,40.189139,-111.699931,113.0
258,UTAHDWQ_WQX-4917450,UTAH LAKE AT MIDDLE OF PROVO BAY,20171010,40.189139,-111.699931,14.4
261,UTAHDWQ_WQX-4917450,UTAH LAKE AT MIDDLE OF PROVO BAY,20180611,40.189139,-111.699931,90.0
324,UTAHDWQ_WQX-4917600,UTAH LAKE GOSHEN BAY SOUTHWEST END,20171017,40.060235,-111.874384,58.0


In [32]:
# Append rows that were not used in interpolation.
# Each such sample becomes one row: OID is flagged "n", OBJECTID_1 is the
# sample date, and the measured value is used for MEAN, SUM and the
# quartile columns. Columns not listed here (COUNT, AREA, MIN, MAX, RANGE,
# STD) are left missing (NaN) by concat.
# The column name is spelled 'total_suspended_soilds' in the data file.
tss_value = single_tss['total_suspended_soilds']  # by label, not by position
single_rows_tss = pd.DataFrame({
    'OID': 'n',
    'OBJECTID_1': single_tss['datetime'],
    'MEAN': tss_value,
    'SUM': tss_value,
    'X25.': tss_value,
    'X50.': tss_value,
    'X75.': tss_value,
})
# A single concat replaces DataFrame.append in a loop: append was removed in
# pandas 2.0 and copied the whole frame on every iteration.
post_tss = pd.concat([post_tss, single_rows_tss], ignore_index=True)

In [33]:
# Inspect the last 10 rows to check the appended (OID == "n") rows
post_tss.tail(10)

Unnamed: 0,OID,OBJECTID_1,COUNT,AREA,MIN,MAX,RANGE,MEAN,STD,SUM,X25.,X50.,X75.
53,53,20190708,27254.0,0.0361352,30.0315,143.995,113.963,63.067505,25.2824,1718842.0,43.043705,53.330881,77.487083
54,54,20190812,27254.0,0.0361352,41.8111,76.343,34.5319,54.65718,5.32136,1489627.0,52.16559,54.857952,55.796984
55,55,20190923,27254.0,0.0361352,41.41,85.9991,44.5891,56.248971,7.01037,1533009.0,52.325653,55.27165,58.119693
56,n,20160204,,,,,,10.0,,10.0,10.0,10.0,10.0
57,n,20170918,,,,,,113.0,,113.0,113.0,113.0,113.0
58,n,20171010,,,,,,14.4,,14.4,14.4,14.4,14.4
59,n,20180611,,,,,,90.0,,90.0,90.0,90.0,90.0
60,n,20171017,,,,,,58.0,,58.0,58.0,58.0,58.0
61,n,20160107,,,,,,7.0,,7.0,7.0,7.0,7.0
62,n,20180620,,,,,,124.2,,124.2,124.2,124.2,124.2


## Chlorophyll

In [34]:
# Import pre-interpolated data and post-interpolated data
# pre_c  - chlorophyll samples, read from CSV
# post_c - post-interpolation table, read from the "merge" Excel workbook
pre_c = pd.read_csv("../../data/processed/chlorophyll.csv")
post_c = pd.read_excel("../../data/processed/chlorophyll_merge.xlsx")

In [35]:
# Preview the first rows of the pre-interpolation chlorophyll samples
pre_c.head()

Unnamed: 0,mlid,location,datetime,lat,long,chlorophyll
0,USGS-401327111462601,UTAH LAKE HAB STUDY SITE 3,20160810,40.224119,-111.773939,
1,USGS-401432111454301,UTAH LAKE HAB STUDY SITE 4,20160810,40.242311,-111.761811,
2,USGS-401613111463301,UTAH LAKE HAB STUDY SITE 1,20160810,40.270319,-111.775881,
3,USGS-401658111491601,UTAH LAKE HAB STUDY SITE 2,20160810,40.2827,-111.8212,
4,UTAHDWQ_WQX-4917305,Utah Lake at American Fork Marina near boat ramp,20160926,40.34238,-111.800839,


In [36]:
# Preview the first rows of the post-interpolation chlorophyll table
post_c.head()

Unnamed: 0,OID,OBJECTID_1,COUNT,AREA,MIN,MAX,RANGE,MEAN,STD,SUM,X25.,X50.,X75.
0,0,20160630,27254,0.036135,12.600169,44.398483,31.798314,20.023672,6.375036,545725.2,16.660503,18.61759,19.578767
1,1,20170510,27254,0.036135,3.889101,65.859085,61.969984,18.346686,11.235891,500020.6,10.193552,16.977897,22.789905
2,2,20170511,27254,0.036135,4.360165,4.839949,0.479784,4.576494,0.081685,124727.8,4.537379,4.559062,4.606094
3,3,20170615,27254,0.036135,2.729604,76.986504,74.256899,11.977235,11.18179,326427.6,7.357743,9.357888,11.70163
4,4,20170711,27254,0.036135,44.539921,250.619919,206.079998,81.362256,30.33314,2217447.0,66.965586,72.332157,82.581816


In [37]:
# Keep only the samples that have a recorded chlorophyll value
pre_c = pre_c.loc[pre_c['chlorophyll'].notna()]

In [38]:
# (rows, columns) of the chlorophyll samples
pre_c.shape

(164, 6)

In [39]:
# Number of distinct sampling dates in the pre-interpolation data
pre_c.datetime.nunique()

37

In [40]:
# Number of distinct dates (OBJECTID_1) in the post-interpolation table
post_c.OBJECTID_1.nunique()

31

In [41]:
# Samples whose date has no row in the post-interpolation table
# (original note: dates with only 1 value, so spatial interpolation is not needed).
# pandas equivalent of SQL "NOT IN":
# https://stackoverflow.com/questions/19960077/how-to-filter-pandas-dataframe-using-in-and-not-in-like-in-sql
interpolated_dates_c = post_c['OBJECTID_1']
single_c = pre_c.loc[~pre_c['datetime'].isin(interpolated_dates_c)]
single_c.head()

Unnamed: 0,mlid,location,datetime,lat,long,chlorophyll
257,UTAHDWQ_WQX-4917450,UTAH LAKE AT MIDDLE OF PROVO BAY,20170918,40.189139,-111.699931,123.766667
258,UTAHDWQ_WQX-4917450,UTAH LAKE AT MIDDLE OF PROVO BAY,20171010,40.189139,-111.699931,13.56
260,UTAHDWQ_WQX-4917450,UTAH LAKE AT MIDDLE OF PROVO BAY,20180517,40.189139,-111.699931,85.2
261,UTAHDWQ_WQX-4917450,UTAH LAKE AT MIDDLE OF PROVO BAY,20180611,40.189139,-111.699931,218.0
324,UTAHDWQ_WQX-4917600,UTAH LAKE GOSHEN BAY SOUTHWEST END,20171017,40.060235,-111.874384,34.956667


In [42]:
# Append rows that were not used in interpolation.
# Each such sample becomes one row: OID is flagged "n", OBJECTID_1 is the
# sample date, and the measured value is used for MEAN, SUM and the
# quartile columns. Columns not listed here (COUNT, AREA, MIN, MAX, RANGE,
# STD) are left missing (NaN) by concat.
c_value = single_c['chlorophyll']  # by label, not by position
single_rows_c = pd.DataFrame({
    'OID': 'n',
    'OBJECTID_1': single_c['datetime'],
    'MEAN': c_value,
    'SUM': c_value,
    'X25.': c_value,
    'X50.': c_value,
    'X75.': c_value,
})
# A single concat replaces DataFrame.append in a loop: append was removed in
# pandas 2.0 and copied the whole frame on every iteration.
post_c = pd.concat([post_c, single_rows_c], ignore_index=True)

In [43]:
# Inspect the last 10 rows to check the appended (OID == "n") rows
post_c.tail(10)

Unnamed: 0,OID,OBJECTID_1,COUNT,AREA,MIN,MAX,RANGE,MEAN,STD,SUM,X25.,X50.,X75.
27,27,20190618,27254.0,0.0361352,4.85,4.85,0.0,4.85,0.0,132181.897401,4.85,4.85,4.85
28,28,20190708,27254.0,0.0361352,4.23212,146.687,142.455,16.362065,23.5941,445931.73116,7.555949,10.544593,14.083227
29,29,20190812,27254.0,0.0361352,3.8801,21.2696,17.3895,9.710701,3.21818,264655.443435,7.713109,9.214616,10.909076
30,30,20190923,27254.0,0.0361352,12.6073,85.1844,72.5771,24.904932,10.6207,678759.025649,19.881626,22.203969,24.943311
31,n,20170918,,,,,,123.766667,,123.766667,123.766667,123.766667,123.766667
32,n,20171010,,,,,,13.56,,13.56,13.56,13.56,13.56
33,n,20180517,,,,,,85.2,,85.2,85.2,85.2,85.2
34,n,20180611,,,,,,218.0,,218.0,218.0,218.0,218.0
35,n,20171017,,,,,,34.956667,,34.956667,34.956667,34.956667,34.956667
36,n,20160525,,,,,,22.28,,22.28,22.28,22.28,22.28


## Total cell count

In [44]:
# Import pre-interpolated data and post-interpolated data
# pre_tcc  - total cell count samples, read from CSV
# post_tcc - post-interpolation table, read from the "merge" Excel workbook
pre_tcc = pd.read_csv("../../data/processed/total_cell_count.csv")
post_tcc = pd.read_excel("../../data/processed/total_cell_count_merge.xlsx")

In [45]:
# Preview the first rows of the pre-interpolation total cell count samples
pre_tcc.head()

Unnamed: 0,mlid,location,datetime,lat,long,cell_count,sample_depth
0,4917512,Utah Lake American Fork Marina,20160726,0.0,,7162.994,Composite
1,4917512,Utah Lake American Fork Marina,20160802,0.0,,295.6425,Composite
2,4917512,Utah Lake American Fork Marina,20160823,0.0,,1140335.0,Composite
3,4917486,Utah Lake Saratoga Private Dock,20160720,0.0,,0.0,Composite
4,4917486,Utah Lake Saratoga Private Dock,20160726,0.0,,0.0,Composite


In [46]:
# Preview the first rows of the post-interpolation total cell count table
post_tcc.head()

Unnamed: 0,OID,OBJECTID_1,COUNT,AREA,MIN,MAX,RANGE,MEAN,STD,SUM,X25.,X50.,X75.
0,0,20160713,27254,0.036135,34194030.0,36007970.0,1813944.0,34953290.0,178704.3,952616900000.0,34888420.0,34968330.0,35006710.0
1,1,20160714,27254,0.036135,504.6424,673381.4,672876.8,316713.4,123880.0,8631707000.0,261408.1,342062.4,384145.0
2,2,20160715,27254,0.036135,1760.878,49429180.0,49427420.0,6291926.0,7687363.0,171480100000.0,1479650.0,3153114.0,8431640.0
3,3,20160720,27254,0.036135,3681.267,20095070.0,20091390.0,2565454.0,3468971.0,69918890000.0,359125.8,1111689.0,3617549.0
4,4,20160726,27254,0.036135,29.6744,73628.7,73599.03,34632.51,12116.27,943874500.0,28176.28,33860.95,39602.25


In [47]:
# Keep only the samples that have a recorded cell count
pre_tcc = pre_tcc.loc[pre_tcc['cell_count'].notna()]

In [48]:
# (rows, columns) of the total cell count samples
pre_tcc.shape

(364, 7)

In [49]:
# Number of distinct sampling dates in the pre-interpolation data
pre_tcc.datetime.nunique()

78

In [50]:
# Number of distinct dates (OBJECTID_1) in the post-interpolation table
post_tcc.OBJECTID_1.nunique()

67

In [51]:
# Samples whose date has no row in the post-interpolation table
# (original note: dates with only 1 value, so spatial interpolation is not needed).
# pandas equivalent of SQL "NOT IN":
# https://stackoverflow.com/questions/19960077/how-to-filter-pandas-dataframe-using-in-and-not-in-like-in-sql
interpolated_dates_tcc = post_tcc['OBJECTID_1']
single_tcc = pre_tcc.loc[~pre_tcc['datetime'].isin(interpolated_dates_tcc)]
single_tcc.head()

Unnamed: 0,mlid,location,datetime,lat,long,cell_count,sample_depth
22,4917600,Goshen Bay,20180918,40.060235,-111.874384,81687.611358,Surface
117,4917702,Sandy Beach,20180703,40.171116,-111.745876,114651.093165,Composite
148,4917446,Provo Bay,20190612,40.18039,-111.71756,44543.502998,Composite
149,4917446,Provo Bay,20190627,40.18039,-111.71756,33274.298242,Composite
154,4917446,Provo Bay Ski Dock,20180719,40.18039,-111.71756,915194.065181,Composite


In [52]:
# Append rows that were not used in interpolation.
# Each such sample becomes one row: OID is flagged "n", OBJECTID_1 is the
# sample date, and the measured value is used for MEAN, SUM and the
# quartile columns. Columns not listed here (COUNT, AREA, MIN, MAX, RANGE,
# STD) are left missing (NaN) by concat.
# The value is taken by label: 'cell_count' is the second-to-last column
# here (sample_depth is last), which the positional row[-2] relied on.
tcc_value = single_tcc['cell_count']
single_rows_tcc = pd.DataFrame({
    'OID': 'n',
    'OBJECTID_1': single_tcc['datetime'],
    'MEAN': tcc_value,
    'SUM': tcc_value,
    'X25.': tcc_value,
    'X50.': tcc_value,
    'X75.': tcc_value,
})
# A single concat replaces DataFrame.append in a loop: append was removed in
# pandas 2.0 and copied the whole frame on every iteration.
post_tcc = pd.concat([post_tcc, single_rows_tcc], ignore_index=True)

In [53]:
# Inspect the last 10 rows to check the appended (OID == "n") rows
post_tcc.tail(10)

Unnamed: 0,OID,OBJECTID_1,COUNT,AREA,MIN,MAX,RANGE,MEAN,STD,SUM,X25.,X50.,X75.
68,n,20180703,,,,,,114651.093165,,114651.093165,114651.093165,114651.093165,114651.093165
69,n,20190612,,,,,,44543.502998,,44543.502998,44543.502998,44543.502998,44543.502998
70,n,20190627,,,,,,33274.298242,,33274.298242,33274.298242,33274.298242,33274.298242
71,n,20180719,,,,,,915194.065181,,915194.065181,915194.065181,915194.065181,915194.065181
72,n,20190618,,,,,,498662.948894,,498662.948894,498662.948894,498662.948894,498662.948894
73,n,20180710,,,,,,784674.0,,784674.0,784674.0,784674.0,784674.0
74,n,20190419,,,,,,47501.0,,47501.0,47501.0,47501.0,47501.0
75,n,20190613,,,,,,904.312411,,904.312411,904.312411,904.312411,904.312411
76,n,20170824,,,,,,176367.5784,,176367.5784,176367.5784,176367.5784,176367.5784
77,n,20180823,,,,,,115229.123016,,115229.123016,115229.123016,115229.123016,115229.123016


## Total biovolume

In [54]:
# Import pre-interpolated data and post-interpolated data
# pre_tb  - total biovolume samples, read from CSV
# post_tb - post-interpolation table, read from the "merge" Excel workbook
pre_tb = pd.read_csv("../../data/processed/total_biovolume.csv")
post_tb = pd.read_excel("../../data/processed/total_biovolume_merge.xlsx")

In [55]:
# Preview the first rows of the pre-interpolation total biovolume samples
pre_tb.head()

Unnamed: 0,mlid,location,datetime,lat,long,biovolume,sample_depth
0,4917512,Utah Lake American Fork Marina,20160726,0.0,,,Composite
1,4917512,Utah Lake American Fork Marina,20160802,0.0,,,Composite
2,4917512,Utah Lake American Fork Marina,20160823,0.0,,,Composite
3,4917486,Utah Lake Saratoga Private Dock,20160720,0.0,,,Composite
4,4917486,Utah Lake Saratoga Private Dock,20160726,0.0,,,Composite


In [56]:
# Preview the first rows of the post-interpolation total biovolume table
post_tb.head()

Unnamed: 0,OID,OBJECTID_1,COUNT,AREA,MIN,MAX,RANGE,MEAN,STD,SUM,X25.,X50.,X75.
0,0,20180503,27254,0.036135,1728651.0,13855320.0,12126670.0,5401473.0,1395017.0,147211700000.0,4936006.0,5265765.0,5531623.0
1,1,20180516,27254,0.036135,246792.5,57376800.0,57130010.0,7612261.0,8052065.0,207464600000.0,3797031.0,6045109.0,9200914.0
2,2,20180523,27254,0.036135,1653163.0,12058330.0,10405170.0,5520645.0,1782491.0,150459700000.0,4479446.0,4847454.0,5998409.0
3,3,20180531,27254,0.036135,2243411.0,129786700.0,127543300.0,30888440.0,24611020.0,841833600000.0,16735200.0,22429990.0,33644110.0
4,4,20180606,27254,0.036135,153012800.0,6417919000.0,6264906000.0,3309140000.0,202470400.0,90187290000000.0,3263644000.0,3326082000.0,3388209000.0


In [57]:
# Keep only the samples that have a recorded biovolume
pre_tb = pre_tb.loc[pre_tb['biovolume'].notna()]

In [58]:
# (rows, columns) of the total biovolume samples
pre_tb.shape

(221, 7)

In [59]:
# Number of distinct sampling dates in the pre-interpolation data
pre_tb.datetime.nunique()

49

In [60]:
# Number of distinct dates (OBJECTID_1) in the post-interpolation table
post_tb.OBJECTID_1.nunique()

39

In [61]:
# Samples whose date has no row in the post-interpolation table
# (original note: dates with only 1 value, so spatial interpolation is not needed).
# pandas equivalent of SQL "NOT IN":
# https://stackoverflow.com/questions/19960077/how-to-filter-pandas-dataframe-using-in-and-not-in-like-in-sql
interpolated_dates_tb = post_tb['OBJECTID_1']
single_tb = pre_tb.loc[~pre_tb['datetime'].isin(interpolated_dates_tb)]
single_tb.head()

Unnamed: 0,mlid,location,datetime,lat,long,biovolume,sample_depth
22,4917600,Goshen Bay,20180918,40.060235,-111.874384,14061840.0,Surface
117,4917702,Sandy Beach,20180703,40.171116,-111.745876,25173980.0,Composite
148,4917446,Provo Bay,20190612,40.18039,-111.71756,13337060.0,Composite
149,4917446,Provo Bay,20190627,40.18039,-111.71756,9800569.0,Composite
154,4917446,Provo Bay Ski Dock,20180719,40.18039,-111.71756,192386000.0,Composite


In [62]:
# Append rows that were not used in interpolation.
# Each such sample becomes one row: OID is flagged "n", OBJECTID_1 is the
# sample date, and the measured value is used for MEAN, SUM and the
# quartile columns. Columns not listed here (COUNT, AREA, MIN, MAX, RANGE,
# STD) are left missing (NaN) by concat.
# The value is taken by label: 'biovolume' is the second-to-last column
# here (sample_depth is last), which the positional row[-2] relied on.
tb_value = single_tb['biovolume']
single_rows_tb = pd.DataFrame({
    'OID': 'n',
    'OBJECTID_1': single_tb['datetime'],
    'MEAN': tb_value,
    'SUM': tb_value,
    'X25.': tb_value,
    'X50.': tb_value,
    'X75.': tb_value,
})
# A single concat replaces DataFrame.append in a loop: append was removed in
# pandas 2.0 and copied the whole frame on every iteration.
post_tb = pd.concat([post_tb, single_rows_tb], ignore_index=True)

In [63]:
# Inspect the last 10 rows to check the appended (OID == "n") rows
post_tb.tail(10)

Unnamed: 0,OID,OBJECTID_1,COUNT,AREA,MIN,MAX,RANGE,MEAN,STD,SUM,X25.,X50.,X75.
39,n,20180918,,,,,,14061840.0,,14061840.0,14061840.0,14061840.0,14061840.0
40,n,20180703,,,,,,25173980.0,,25173980.0,25173980.0,25173980.0,25173980.0
41,n,20190612,,,,,,13337060.0,,13337060.0,13337060.0,13337060.0,13337060.0
42,n,20190627,,,,,,9800569.0,,9800569.0,9800569.0,9800569.0,9800569.0
43,n,20180719,,,,,,192386000.0,,192386000.0,192386000.0,192386000.0,192386000.0
44,n,20190618,,,,,,148137500.0,,148137500.0,148137500.0,148137500.0,148137500.0
45,n,20180710,,,,,,198630000.0,,198630000.0,198630000.0,198630000.0,198630000.0
46,n,20190419,,,,,,26191190.0,,26191190.0,26191190.0,26191190.0,26191190.0
47,n,20190613,,,,,,899563.4,,899563.4,899563.4,899563.4,899563.4
48,n,20180823,,,,,,25824120.0,,25824120.0,25824120.0,25824120.0,25824120.0


## Cyano cell count

In [64]:
# Import pre-interpolated data and post-interpolated data
# pre_ccc  - cyano cell count samples, read from CSV
# post_ccc - post-interpolation table, read from the "merge" Excel workbook
pre_ccc = pd.read_csv("../../data/processed/cyano_cell_count.csv")
post_ccc = pd.read_excel("../../data/processed/cyano_cell_count_merge.xlsx")

In [65]:
# Preview the first rows of the pre-interpolation cyano cell count samples
pre_ccc.head()

Unnamed: 0,mlid,location,datetime,lat,long,cell_count,sample_depth
0,4917512,Utah Lake American Fork Marina,20160726,0.0,,7162.994,Composite
1,4917512,Utah Lake American Fork Marina,20160802,0.0,,295.6425,Composite
2,4917512,Utah Lake American Fork Marina,20160823,0.0,,1140335.0,Composite
3,4917486,Utah Lake Saratoga Private Dock,20160720,0.0,,0.0,Composite
4,4917485,Utah Lake Saratoga Springs Public Marina,20160720,0.0,,90995.93,Composite


In [66]:
# Preview the first rows of the post-interpolation cyano cell count table
post_ccc.head()

Unnamed: 0,OID,OBJECTID_1,COUNT,AREA,MIN,MAX,RANGE,MEAN,STD,SUM,X25.,X50.,X75.
0,0,20160713,27254,0.036135,34194030.0,36007970.0,1813944.0,34953290.0,178704.3,952616900000.0,34888420.0,34968330.0,35006710.0
1,1,20160714,27254,0.036135,504.6424,673381.4,672876.8,316713.4,123880.0,8631707000.0,261408.1,342062.4,384145.0
2,2,20160715,27254,0.036135,1760.878,49429180.0,49427420.0,6291926.0,7687363.0,171480100000.0,1479650.0,3153114.0,8431640.0
3,3,20160720,27254,0.036135,3681.267,20095070.0,20091390.0,2565454.0,3468971.0,69918890000.0,359125.8,1111689.0,3617549.0
4,4,20160726,27254,0.036135,3108.889,73630.95,70522.06,37417.44,11374.62,1019775000.0,31224.99,36254.35,42943.99


In [67]:
# Keep only the samples that have a recorded cell count
pre_ccc = pre_ccc.loc[pre_ccc['cell_count'].notna()]

In [68]:
# (rows, columns) of the cyano cell count samples
pre_ccc.shape

(445, 7)

In [69]:
# Number of distinct sampling dates in the pre-interpolation data
pre_ccc.datetime.nunique()

98

In [70]:
# Number of distinct dates (OBJECTID_1) in the post-interpolation table
post_ccc.OBJECTID_1.nunique()

72

In [71]:
# Samples whose date has no row in the post-interpolation table
# (original note: dates with only 1 value, so spatial interpolation is not needed).
# pandas equivalent of SQL "NOT IN":
# https://stackoverflow.com/questions/19960077/how-to-filter-pandas-dataframe-using-in-and-not-in-like-in-sql
interpolated_dates_ccc = post_ccc['OBJECTID_1']
single_ccc = pre_ccc.loc[~pre_ccc['datetime'].isin(interpolated_dates_ccc)]
single_ccc.head()

Unnamed: 0,mlid,location,datetime,lat,long,cell_count,sample_depth
19,4917600,Goshen Bay,20180918,40.060235,-111.874384,55967.948278,Surface
25,4917708,Lincoln Beach,20191016,40.142595,-111.802026,56239.0,Composite
125,4917702,Sandy Beach,20180703,40.171116,-111.745876,60848.231748,Composite
155,4917446,Provo Bay,20190612,40.18039,-111.71756,17427.669363,Composite
156,4917446,Provo Bay,20190627,40.18039,-111.71756,6291.564261,Composite


In [72]:
# Append rows that were not used in interpolation.
# Each such sample becomes one row: OID is flagged "n", OBJECTID_1 is the
# sample date, and the measured value is used for MEAN, SUM and the
# quartile columns. Columns not listed here (COUNT, AREA, MIN, MAX, RANGE,
# STD) are left missing (NaN) by concat.
# The value is taken by label: 'cell_count' is the second-to-last column
# here (sample_depth is last), which the positional row[-2] relied on.
# NOTE(review): single_ccc can contain several samples for the same date,
# so this adds one row per sample rather than one per date - confirm that
# duplicate dates are intended downstream.
ccc_value = single_ccc['cell_count']
single_rows_ccc = pd.DataFrame({
    'OID': 'n',
    'OBJECTID_1': single_ccc['datetime'],
    'MEAN': ccc_value,
    'SUM': ccc_value,
    'X25.': ccc_value,
    'X50.': ccc_value,
    'X75.': ccc_value,
})
# A single concat replaces DataFrame.append in a loop: append was removed in
# pandas 2.0 and copied the whole frame on every iteration.
post_ccc = pd.concat([post_ccc, single_rows_ccc], ignore_index=True)

In [73]:
# Inspect the last 10 rows to check the appended (OID == "n") rows
post_ccc.tail(10)

Unnamed: 0,OID,OBJECTID_1,COUNT,AREA,MIN,MAX,RANGE,MEAN,STD,SUM,X25.,X50.,X75.
143,n,20200910,,,,,,23760910.0,,23760910.0,23760910.0,23760910.0,23760910.0
144,n,20200915,,,,,,114969.4,,114969.4,114969.4,114969.4,114969.4
145,n,20200810,,,,,,66837.64,,66837.64,66837.64,66837.64,66837.64
146,n,20200910,,,,,,72646040.0,,72646040.0,72646040.0,72646040.0,72646040.0
147,n,20200915,,,,,,455230.8,,455230.8,455230.8,455230.8,455230.8
148,n,20200915,,,,,,27088.95,,27088.95,27088.95,27088.95,27088.95
149,n,20200930,,,,,,2389045.0,,2389045.0,2389045.0,2389045.0,2389045.0
150,n,20200915,,,,,,14845460.0,,14845460.0,14845460.0,14845460.0,14845460.0
151,n,20200910,,,,,,10163670.0,,10163670.0,10163670.0,10163670.0,10163670.0
152,n,20200915,,,,,,49092.61,,49092.61,49092.61,49092.61,49092.61


## Cyano biovolume

In [74]:
# Import pre-interpolated data and post-interpolated data
# pre_cb  - cyano biovolume samples, read from CSV
# post_cb - post-interpolation table, read from the "merge" Excel workbook
pre_cb = pd.read_csv("../../data/processed/cyano_biovolume.csv")
post_cb = pd.read_excel("../../data/processed/cyano_biovolume_merge.xlsx")

In [75]:
# Preview the first rows of the pre-interpolation cyano biovolume samples
pre_cb.head()

Unnamed: 0,mlid,location,datetime,lat,long,biovolume,sample_depth
0,4917512,Utah Lake American Fork Marina,20160726,0.0,,,Composite
1,4917512,Utah Lake American Fork Marina,20160802,0.0,,,Composite
2,4917512,Utah Lake American Fork Marina,20160823,0.0,,,Composite
3,4917486,Utah Lake Saratoga Private Dock,20160720,0.0,,,Composite
4,4917485,Utah Lake Saratoga Springs Public Marina,20160720,0.0,,,Composite


In [76]:
# Preview the first rows of the post-interpolation cyano biovolume table
post_cb.head()

Unnamed: 0,OID,OBJECTID_1,COUNT,AREA,MIN,MAX,RANGE,MEAN,STD,SUM,X25.,X50.,X75.
0,0,20180516,27254,0.036135,10387.95,582321.1,571933.2,149396.5,102328.5,4071651000.0,86665.19,129636.6,180294.2
1,1,20180523,27254,0.036135,783.6783,8067.041,7283.362,4644.62,1501.253,126584500.0,3376.814,4474.981,6012.594
2,2,20180531,27254,0.036135,317627.4,914243.6,596616.2,582404.7,57726.66,15872860000.0,566541.2,570521.6,592505.1
3,3,20180606,27254,0.036135,90471290.0,6172645000.0,6082174000.0,3154541000.0,196564800.0,85973860000000.0,3110373000.0,3170989000.0,3231304000.0
4,4,20180612,27254,0.036135,680759.6,8927466.0,8246706.0,2130096.0,1396311.0,58053630000.0,1303399.0,1896582.0,2397550.0


In [77]:
# Keep only the samples that have a recorded biovolume
pre_cb = pre_cb.loc[pre_cb['biovolume'].notna()]

In [78]:
# (rows, columns) of the cyano biovolume samples
pre_cb.shape

(308, 7)

In [79]:
# Number of distinct sampling dates in the pre-interpolation data
pre_cb.datetime.nunique()

69

In [80]:
# Number of distinct dates (OBJECTID_1) in the post-interpolation table
post_cb.OBJECTID_1.nunique()

44

In [81]:
# Samples whose date has no row in the post-interpolation table
# (original note: dates with only 1 value, so spatial interpolation is not needed).
# pandas equivalent of SQL "NOT IN":
# https://stackoverflow.com/questions/19960077/how-to-filter-pandas-dataframe-using-in-and-not-in-like-in-sql
interpolated_dates_cb = post_cb['OBJECTID_1']
single_cb = pre_cb.loc[~pre_cb['datetime'].isin(interpolated_dates_cb)]
single_cb.head()

Unnamed: 0,mlid,location,datetime,lat,long,biovolume,sample_depth
19,4917600,Goshen Bay,20180918,40.060235,-111.874384,6485007.0,Surface
25,4917708,Lincoln Beach,20191016,40.142595,-111.802026,18999570.0,Composite
125,4917702,Sandy Beach,20180703,40.171116,-111.745876,7284319.0,Composite
155,4917446,Provo Bay,20190612,40.18039,-111.71756,5336104.0,Composite
156,4917446,Provo Bay,20190627,40.18039,-111.71756,2146658.0,Composite


In [82]:
# Append rows that were not used in interpolation.
# Each such sample becomes one row: OID is flagged "n", OBJECTID_1 is the
# sample date, and the measured value is used for MEAN, SUM and the
# quartile columns. Columns not listed here (COUNT, AREA, MIN, MAX, RANGE,
# STD) are left missing (NaN) by concat.
# The value is taken by label: 'biovolume' is the second-to-last column
# here (sample_depth is last), which the positional row[-2] relied on.
# NOTE(review): single_cb can contain several samples for the same date,
# so this adds one row per sample rather than one per date - confirm that
# duplicate dates are intended downstream.
cb_value = single_cb['biovolume']
single_rows_cb = pd.DataFrame({
    'OID': 'n',
    'OBJECTID_1': single_cb['datetime'],
    'MEAN': cb_value,
    'SUM': cb_value,
    'X25.': cb_value,
    'X50.': cb_value,
    'X75.': cb_value,
})
# A single concat replaces DataFrame.append in a loop: append was removed in
# pandas 2.0 and copied the whole frame on every iteration.
post_cb = pd.concat([post_cb, single_rows_cb], ignore_index=True)

In [83]:
# Inspect the last 10 rows to check the appended (OID == "n") rows
post_cb.tail(10)

Unnamed: 0,OID,OBJECTID_1,COUNT,AREA,MIN,MAX,RANGE,MEAN,STD,SUM,X25.,X50.,X75.
114,n,20200910,,,,,,3538889000.0,,3538889000.0,3538889000.0,3538889000.0,3538889000.0
115,n,20200915,,,,,,13184090.0,,13184090.0,13184090.0,13184090.0,13184090.0
116,n,20200810,,,,,,5857255.0,,5857255.0,5857255.0,5857255.0,5857255.0
117,n,20200910,,,,,,8768633000.0,,8768633000.0,8768633000.0,8768633000.0,8768633000.0
118,n,20200915,,,,,,29575160.0,,29575160.0,29575160.0,29575160.0,29575160.0
119,n,20200915,,,,,,4421043.0,,4421043.0,4421043.0,4421043.0,4421043.0
120,n,20200930,,,,,,577924200.0,,577924200.0,577924200.0,577924200.0,577924200.0
121,n,20200915,,,,,,3080952000.0,,3080952000.0,3080952000.0,3080952000.0,3080952000.0
122,n,20200910,,,,,,1911153000.0,,1911153000.0,1911153000.0,1911153000.0,1911153000.0
123,n,20200915,,,,,,7900873.0,,7900873.0,7900873.0,7900873.0,7900873.0


## Tweets

In [84]:
# Import daily and weekly tweet count
# Both tables are read from CSV; their `time` column is parsed to datetimes
# in a later cell.
tweets_daily = pd.read_csv("../../data/processed/daily_tweet_count.csv")
tweets_weekly = pd.read_csv("../../data/processed/weekly_tweet_count.csv")

In [85]:
# Parse the `time` column of both tweet tables (YYYY-MM-DD strings) into datetimes
tweets_daily['time'] = pd.to_datetime(tweets_daily['time'], format='%Y-%m-%d')
tweets_weekly['time'] = pd.to_datetime(tweets_weekly['time'], format='%Y-%m-%d')

In [86]:
# Check column dtypes and non-null counts after the datetime conversion
tweets_daily.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1823 entries, 0 to 1822
Data columns (total 4 columns):
time              1823 non-null datetime64[ns]
water quality?    1823 non-null float64
neg               1823 non-null float64
pos               1823 non-null float64
dtypes: datetime64[ns](1), float64(3)
memory usage: 57.1 KB


In [87]:
# Preview the first rows of the daily tweet counts
tweets_daily.head()

Unnamed: 0,time,water quality?,neg,pos
0,2016-01-01,0.0,0.0,0.0
1,2016-01-02,0.0,0.0,0.0
2,2016-01-03,0.0,0.0,0.0
3,2016-01-04,0.0,0.0,0.0
4,2016-01-05,0.0,0.0,0.0


In [88]:
# List the column names of the daily tweet table
tweets_daily.columns

Index(['time', 'water quality?', 'neg', 'pos'], dtype='object')

# Data Visualization

In [89]:
# Show the installed plotly version
import plotly
plotly.__version__

'4.9.0'

In [90]:
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.io as pio # to display low-level dictionary graphs
from plotly.subplots import make_subplots # for secondary axis

In [91]:
# Parse the YYYYMMDD date key (OBJECTID_1) of every post-interpolation table
# into a datetime so the tables can share a time axis when plotted.
interpolated_tables = (
    post_sd, post_t, post_tss, post_c,
    post_tcc, post_tb, post_ccc, post_cb,
)
for interp_table in interpolated_tables:
    interp_table['OBJECTID_1'] = pd.to_datetime(interp_table['OBJECTID_1'], format='%Y%m%d')

In [94]:
# Write every table used by the regression step to its CSV, without the index.
# NOTE(review): "turbidity__final.csv" has a double underscore unlike its
# siblings; the name is kept as-is because downstream readers may rely on it.
regression_exports = [
    (post_sd, "../../data/processed/regression/secchi_depth_final.csv"),
    (post_t, "../../data/processed/regression/turbidity__final.csv"),
    (post_tss, "../../data/processed/regression/total_suspended_solids_final.csv"),
    (post_c, "../../data/processed/regression/chlorophyll_final.csv"),
    (post_tcc, "../../data/processed/regression/total_cell_count_final.csv"),
    (post_tb, "../../data/processed/regression/total_biovolume_final.csv"),
    (post_ccc, "../../data/processed/regression/cyano_cell_count_final.csv"),
    (post_cb, "../../data/processed/regression/cyano_biovolume_final.csv"),
    (tweets_daily, "../../data/processed/regression/tweets_daily_final.csv"),
]

for export_table, csv_path in regression_exports:
    export_table.to_csv(csv_path, index=False)


In [248]:
# add secondary axis https://stackoverflow.com/questions/57296683/adding-a-secondary-axis-in-plotly-python

def _water_quality_trace(frame, visible=True):
    """Build the marker trace for one post-interpolation water-quality table.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with an ``OBJECTID_1`` date column and an ``X75.`` column
        (presumably the 75th percentile of the interpolated surface --
        confirm against the interpolation step).
    visible : bool
        Whether the trace is drawn when the figure is first rendered.

    Returns
    -------
    plotly.graph_objs.Scatter
        Red markers bound to the primary (left) y-axis.
    """
    return go.Scatter(
        x=list(frame.OBJECTID_1),
        y=list(frame['X75.']),
        mode='markers',
        marker=dict(color='red', size=10),
        yaxis='y1',
        showlegend=True,
        name='Water Quality',
        visible=visible,
    )


# Index of the button highlighted on first render. `active` only highlights the
# button; it does not apply its args, so the matching trace visibility is set
# explicitly below to keep the plot and the highlighted button in agreement.
active_button = 1

# One table per button, in button order (post_c is uncorrected for pheophytin).
wq_frames = [post_sd, post_t, post_tss, post_c, post_tcc, post_tb, post_ccc, post_cb]
button_labels = [
    'Secchi Disk Depth (m)',
    'Turbidity (NTRU)',
    'Total suspended solids (mg/L)',
    r'Chlorophyll a (mu g/L)',
    'Phytoplankton cell count (cells/mL)',
    'Phytoplankton biovolume (cubic mu m/mL)',
    'Cyanobacteria cell count (cells/mL)',
    'Cyanobacteria biovolume (cubic mu m/mL)',
]

sd, t, tss, c, tcc, tb, ccc, cb = [
    _water_quality_trace(frame, visible=(position == active_button))
    for position, frame in enumerate(wq_frames)
]

# Daily negative-tweet counts on the secondary (right) y-axis; always visible.
tweets = go.Scatter(x=list(tweets_daily['time']),
                    y=list(tweets_daily['neg']),
                    name='Daily Tweet Count',
                    mode='markers',
                    marker=dict(color='cyan', size=10),
                    yaxis='y2')

data = [sd, t, tss, c, tcc, tb, ccc, cb, tweets]

# Each button shows exactly one water-quality trace plus the tweet trace.
# The mask has one entry per trace in `data`, in the same order.
updatemenus = [
    dict(type="buttons",
         active=active_button,
         buttons=[
             dict(label=label,
                  method='update',
                  args=[{'visible': [position == shown
                                     for position in range(len(wq_frames))] + [True]}])
             for shown, label in enumerate(button_labels)
         ])
]

layout = dict(title="Water Quality Parameters", showlegend=True,
              updatemenus=updatemenus, plot_bgcolor="white",
              yaxis=dict(side='left'),
              yaxis2=dict(title="Tweets/day",
                          side="right",
                          overlaying='y1'))

# The figure is created here, from `data`, which already contains the tweet
# trace. (An earlier `fig.add_trace(...)` call ran before `fig` existed and
# failed on a fresh kernel; it has been removed.)
fig = go.Figure(data=data, layout=layout)

fig.update_traces(mode='markers', opacity=0.7, marker_size=6.5)
fig.update_xaxes(showline=True, linewidth=2, linecolor='grey')
fig.update_yaxes(showline=True, linewidth=2, linecolor='grey')

# Legend position
fig.update_layout(legend=dict(
    yanchor="top",
    xanchor="right",
    x=1.307
))

# Figure dimensions
fig.update_layout(
    autosize=False,
    width=1030,
    height=500,
    margin=dict(
        l=150,
        r=0,
        b=0,
        t=100,
        pad=0
    )
)
pio.show(fig)




In [166]:
# add secondary axis https://stackoverflow.com/questions/57296683/adding-a-secondary-axis-in-plotly-python

# (table, legend name) for every water-quality series, in button order.
# Raw strings keep the LaTeX `\mu` literal without an invalid-escape warning.
wq_series = [
    (post_sd, 'Secchi Disk Depth (m)'),
    (post_t, 'Turbidity (NTRU)'),
    (post_tss, 'Total suspended solids (mg/L)'),
    (post_c, r'Chlorophyll a ($ \mu g / L$)'),  # uncorrected for pheophytin
    (post_tcc, 'Phytoplankton cell count (cells/mL)'),
    (post_tb, r'Phytoplankton biovolume ($ \mu m^3 / mL$)'),
    (post_ccc, 'Cyanobacteria cell count (cells/mL)'),
    (post_cb, r'Cyanobacteria biovolume ($ \mu m^3 / mL$)'),
]
wq_button_labels = [
    'Secchi Disk Depth (m)',
    'Turbidity (NTRU)',
    'Total suspended solids (mg/L)',
    r'Chlorophyll a (mu g/L)',
    'Phytoplankton cell count (cells/mL)',
    'Phytoplankton biovolume (cubic mu m/mL)',
    'Cyanobacteria cell count (cells/mL)',
    'Cyanobacteria biovolume (cubic mu m/mL)',
]

# Button highlighted on first render. `active` does not apply the button's
# args, so the initial trace visibility is set to match it explicitly.
wq_active_button = 0

fig = go.Figure()

# One pink marker trace per water-quality table on the primary (left) y-axis.
for series_position, (series_table, series_name) in enumerate(wq_series):
    fig.add_trace(go.Scatter(x=list(series_table.OBJECTID_1),
                             y=list(series_table['X75.']),
                             name=series_name,
                             mode='markers',
                             marker=dict(color='pink', size=10),
                             yaxis='y1',
                             visible=(series_position == wq_active_button)))

# Daily negative-tweet counts on the secondary y-axis; always visible.
fig.add_trace(go.Scatter(x=list(tweets_daily['time']),
                         y=list(tweets_daily['neg']),
                         name='Daily Tweet Count',
                         mode='markers',
                         marker=dict(color='cyan', size=10),
                         yaxis='y2')
             )

# Each button shows one water-quality trace plus the tweet trace. Every mask
# has one entry per trace (8 water-quality + 1 tweets); the first three masks
# were previously one entry short.
fig.update_layout(updatemenus=[
    dict(type="buttons",
         active=wq_active_button,
         x=0.57,
         y=1.2,
         buttons=[
             dict(label=button_label,
                  method='update',
                  args=[{'visible': [series_position == shown_position
                                     for series_position in range(len(wq_series))] + [True]}])
             for shown_position, button_label in enumerate(wq_button_labels)
         ])
])

# The tweet trace targets y2, so that axis must be declared: overlay it on the
# primary axis and draw it on the right, as in the previous figure.
fig.update_layout(
    title_text="Water Quality Parameters",
    xaxis_domain=[0.05, 1.0],
    yaxis=dict(side='left'),
    yaxis2=dict(title="Tweets/day", side="right", overlaying='y')
)

fig.update_traces(mode='markers', marker_line_width=1, marker_size=10)
fig.update_xaxes(showline=True, linewidth=2, linecolor='grey')
fig.update_yaxes(showline=True, linewidth=2, linecolor='grey')
pio.show(fig)


