In [None]:
import xml.etree.cElementTree as et
import pandas as pd
import numpy as np
import requests

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.graphics import tsaplots

from pandas.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_squared_error

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.core.debugger import set_trace
#from statsmodels.stats.proportion import proportion_ztest

# Step 1: Acquire Data From CSV File

In [None]:
# Load the beach readings: the first CSV row supplies the column names and
# the first CSV column becomes the row index.
beach_complete = pd.read_csv('beach_complete.csv',delimiter=',',header=0,index_col=0)

In [None]:
# Keep only the rows that have a value in every column; the raw frame is left untouched.
beach_clean = beach_complete.dropna()

In [None]:
# pd.to_numeric returns converted copies instead of modifying the frame, so
# the result must be bound back to the name for the conversion to take effect.
beach_clean = beach_clean.apply(pd.to_numeric)

In [None]:
# Row-wise mean over the beach columns, stored as an 'average' column.
# .assign() builds a new frame instead of writing into the one returned by
# dropna(), and an 'average' column left over from an earlier run of this
# cell is excluded so it is never averaged back in.
beach_clean = beach_clean.assign(
    average=beach_clean.drop(columns='average', errors='ignore').mean(axis=1)
)

In [None]:
# Preview the first ten cleaned rows, including the 'average' column.
beach_clean.head(10)

In [None]:
fig, ax = plt.subplots(6, 2, figsize=(20,40))
cols = beach_clean.columns

# Fill the 6x2 grid column by column: frame columns 0-5 go down the left
# side, columns 6-10 down the right side. The bottom-right panel stays empty
# and the twelfth frame column (presumably 'average') is not plotted.
for position in range(11):
    grid_col, grid_row = divmod(position, 6)
    column_name = cols[position]
    panel = ax[grid_row][grid_col]
    panel.hist(beach_clean[column_name].values, bins=50, histtype='stepfilled')
    panel.set(xlabel='E.coli counts', ylabel='Frequency', title=column_name)

In [None]:
# Narrow the analysis to a single beach; the double brackets keep the result
# a one-column DataFrame rather than a Series.
sunnyside = beach_clean[['Sunnyside-2']]

In [None]:
# Summarise the selected column: row count, dtype and non-null count.
sunnyside.info()

In [None]:
# Preview while the original row labels are still the index.
sunnyside.head()

In [None]:
# Move the row labels into a regular column and switch to a 0..n-1 integer
# index; later cells compare that integer index against train_size.
# NOTE(review): not idempotent - running this cell a second time adds another
# column holding the previous integer index.
sunnyside = sunnyside.reset_index()

In [None]:
# Preview again: the former index now appears as an ordinary column.
sunnyside.head()

# Step 2: ACF and PACF plots

In [None]:
def plot_acf_pacf(list_tuples, lags=50, figsize=(20,40)):
    """Draw the ACF and PACF of several series, one series per row.

    Each row of the figure shows the autocorrelation plot on the left and the
    partial autocorrelation plot on the right.

    Parameters
    ----------
    list_tuples : sequence
        One entry per series. Element 0 of an entry is the data handed to
        ``plot_acf`` / ``plot_pacf``; element 1 is a string appended to the
        plot titles.
    lags : int, optional
        Number of lags passed to both plotting functions (default 50).
    figsize : tuple, optional
        Size of the whole figure (default ``(20, 40)``).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    n_series = len(list_tuples)
    if n_series == 0:
        # A grid with zero rows cannot be created; there is nothing to draw.
        return

    # One row per entry instead of a fixed 11 rows, so the grid always matches
    # the input. squeeze=False keeps `ax` two-dimensional for a single series.
    fig, ax = plt.subplots(n_series, 2, figsize=figsize, squeeze=False)
    for row, entry in enumerate(list_tuples):
        series, name = entry[0], entry[1]
        str_title_acf = "Autocorrelation " + name
        str_title_pacf = "Partial Autocorrelation  " + name
        plot_acf(series, lags=lags, title=str_title_acf, ax=ax[row][0])
        plot_pacf(series, lags=lags, title=str_title_pacf, ax=ax[row][1])

    plt.show();

# `list_tuples` is not defined anywhere earlier in the notebook, so this cell
# fails on a fresh kernel. Build it here as one (series, name) pair per beach.
# NOTE(review): reconstructed from how plot_acf_pacf reads its argument -
# confirm this matches the pairs originally used. The derived 'average'
# column is left out so only the individual beaches are plotted.
list_tuples = [
    (beach_clean[name], name)
    for name in beach_clean.columns
    if name != 'average'
]
plot_acf_pacf(list_tuples)

In [None]:
# fig, ax = plt.subplots(1, 2, sharey=True, figsize=(15,4))
# plot_acf(sunnyside, lags=50, title='Sunnyside ACF', ax=ax[0])
# plot_pacf(sunnyside, lags=50, title='Sunnyside PACF', ax=ax[1])
# plt.show();

For all 11 beaches, the partial autocorrelation appears to be very small by lag 6, so for now we assume a lag of 6 for both the AR and MA parameters.

# Step 3: Setup Train and Test

In [None]:
# Chronological 70/30 split. The test size is taken as the remainder so the
# two parts always add up to the full series: truncating 0.7*n and 0.3*n
# independently can leave trailing rows that get labelled and forecast as
# test data but fall outside the train_size:train_size+test_size slices used
# for the plot and the RMSE.
n_obs = sunnyside['Sunnyside-2'].size
train_size = int(0.7 * n_obs)
test_size = n_obs - train_size

In [None]:
# Show the number of rows assigned to the training and test parts.
train_size, test_size

In [None]:
# Split marker: 0 for training rows, 1 for every row whose integer index is
# at or beyond train_size.
sunnyside['Label'] = (sunnyside.index >= train_size).astype('int64')

In [None]:
# Show both ends of the frame to check the labels: 0 at the start, 1 at the end.
sunnyside.head(), sunnyside.tail()

# Step 4: ARIMA Model 

In [None]:
# Float placeholders for the forecasts and their standard errors. The values
# written into these columns later are floats; starting from integer columns
# relies on silent upcasting, which recent pandas versions deprecate.
sunnyside['Predictions'] = 0.0
sunnyside['Errors'] = 0.0

In [None]:
# Inspect the training observations (rows labelled 0) as a plain NumPy array,
# the same selection the ARIMA cell passes as `endog`.
sunnyside['Sunnyside-2'][sunnyside['Label'] == 0].values

In [None]:
# Walk-forward (expanding-window) evaluation: for every test row, fit the
# model on all observations that precede it and store the one-step-ahead
# forecast together with its standard error.
#
# The history is taken by position instead of relabelling each forecast row
# as training data, so the 'Label' column keeps marking the split and the
# cell gives the same result when re-run. This matches the relabelling
# approach because the test rows form one contiguous block at the end.
#
# NOTE(review): the notebook text settles on lag 6 for the AR and MA terms,
# but the order used here is (5,1,1) - confirm which is intended.
# NOTE(review): `statsmodels.tsa.arima_model.ARIMA` is no longer usable in
# statsmodels 0.13+; its replacement `statsmodels.tsa.arima.model.ARIMA`
# returns forecasts in a different shape, so migrating means updating the
# forecast handling below as well.
observed = sunnyside['Sunnyside-2'].values
labels = sunnyside['Label'].values

for pos, row in enumerate(sunnyside.index):
    if labels[pos] != 1:
        continue
    model = ARIMA(endog=observed[:pos], order=(5,1,1)).fit()
    # Forecast once and reuse the result: element 0 holds the point forecast,
    # element 1 the standard error.
    one_step = model.forecast()
    sunnyside.loc[row, 'Predictions'] = one_step[0][0]
    sunnyside.loc[row, 'Errors'] = one_step[1][0]

In [None]:
# Parse the date column so calendar parts can be read from it.
sunnyside['sampleDate'] = pd.to_datetime(sunnyside['sampleDate'])

# One entry per calendar year, keyed by the row index of that year's first
# sample; used as tick positions and tick labels in the forecast plot.
years = sunnyside['sampleDate'].dt.year.drop_duplicates()
years

In [None]:
# Training data, held-out data and the one-step forecasts on a shared axis,
# with a tick at the first sample of each year.
test_rows = slice(train_size, train_size + test_size)
actual_series = sunnyside['Sunnyside-2']

fig, ax = plt.subplots(1, 1, figsize=(10,6))
ax.plot(sunnyside.index[:train_size], actual_series[:train_size], 'b-', label='Train')
ax.plot(sunnyside.index[test_rows], actual_series[test_rows], 'r-', label='Test')
ax.plot(sunnyside.index[test_rows], sunnyside['Predictions'][test_rows], 'm-', label='Predicted')
plt.xticks(years.index, years)
ax.legend()
plt.show();

# Step 5: ARIMA Model Test Results 

In [None]:
# Root-mean-squared error of the forecasts over the test slice.
test_rows = slice(train_size, train_size + test_size)
actual = sunnyside['Sunnyside-2'][test_rows]
predicted = sunnyside['Predictions'][test_rows]

rmse = np.sqrt(mean_squared_error(actual, predicted))
rmse