In [None]:
# General Import

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
from datetime import datetime
import datetime
import plotly.graph_objects as go
import plotly.express as px
import folium
from folium import plugins
import warnings
import seaborn as sns
plt.style.use('ggplot')

import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
# TSA from Statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt

from sklearn.metrics import mean_squared_error

def fxn():
    """Emit a single ``DeprecationWarning`` carrying the text "deprecated"."""
    warnings.warn(message="deprecated", category=DeprecationWarning)


# Call fxn() with all warnings silenced.  The "ignore" filter is installed
# inside catch_warnings(), so it is in effect only within this ``with`` block.
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore")
    fxn()
    
#### Load the data

# df1 = pd.read_csv('/kaggle/input/eda-sensors/df1.csv')
# df2 = pd.read_csv('/kaggle/input/eda-sensors/df2.csv')
df3 = pd.read_csv('/kaggle/input/eda-sensors/df3.csv')
df4 = pd.read_csv('/kaggle/input/eda-sensors/df4.csv')
df5 = pd.read_csv('/kaggle/input/eda-sensors/df5.csv')


#### Resample the data

# Keep only the timestamp and the `liv` reading from df5, and parse the
# timestamps into real datetimes so the frame can be resampled.
timeSeries = df5.loc[:, ["datetime", "liv"]].assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'])
)

# Hourly series: the maximum `liv` inside each 1-hour bin, with `datetime`
# turned back into an ordinary column.
ts = (
    timeSeries
    .set_index('datetime')
    .resample('1H')
    .max()
    .reset_index()
)


# Traditional Time series Forecasting

## Time series Analysis

### Stationary Time Series Definition

Time series are stationary if they do not have trend or seasonal effects. Summary statistics calculated on the time series are consistent over time, like the mean or the variance of the observations.

### Why Stationarity is important in Time Series?

When a time series is stationary, it can be easier to model. Statistical modeling methods assume or require the time series to be stationary to be effective.

### But ...

Statistical time series methods and even modern machine learning methods will benefit from the clearer signal in the data.

But…

We turn to machine learning methods when the classical methods fail. When we want more or better results. We cannot know how to best model unknown nonlinear relationships in time series data and some methods may result in better performance when working with non-stationary observations or some mixture of stationary and non-stationary views of the problem.

The suggestion here is to treat properties of a time series being stationary or not as another source of information that can be used in feature engineering and feature selection on your time series problem when using machine learning methods.

#### Check TimeSeries Stationarity


* Look at Plots: You can review a time series plot of your data and visually check if there are any obvious trends or seasonality.

* Summary Statistics: You can review the summary statistics for your data for seasons or random partitions and check for obvious or significant differences.

* Statistical Tests: You can use statistical tests to check if the expectations of stationarity are met or have been violated.


We can check stationarity using the following methods:
* Summary Statistics: You can split your time series into two (or more) partitions and compare the mean and variance of each group. If they differ and the difference is statistically significant, the time series is likely non-stationary.
* Plotting Rolling Statistics: Choose a window (say, 12 observations), compute the rolling mean and rolling standard deviation over it, and check whether both stay roughly constant over time.
* Dickey-Fuller Test: The test results comprise a Test Statistic and Critical Values for different confidence levels. If the test statistic is less than the critical value, we reject the null hypothesis of a unit root and can say that the time series is stationary.
* KPSS Test: This is another test for checking the stationarity of a time series (slightly less popular than the Dickey-Fuller test). Its null and alternative hypotheses are the opposite of those of the ADF test, which often creates confusion: in KPSS the null hypothesis is that the series is stationary, whereas in ADF the null hypothesis is that it is not.
* Autocorrelation (ACF) and partial autocorrelation (PACF) plots: These plots graphically summarize the strength of the relationship between an observation in a time series and the observations at prior time steps.

In [None]:
from pandas import read_csv
from matplotlib import pyplot
# adfuller library 
from statsmodels.tsa.stattools import adfuller
#  kpss library
from statsmodels.tsa.stattools import kpss

def summary_statistics(series):
    """Print the mean and variance of each half of ``series``.

    The values are split at ``round(len / 2)`` and the mean and variance of
    the two parts are printed so they can be compared by eye.

    Args:
        series: Object exposing a ``.values`` array (e.g. a pandas Series).
    """
    values = series.values
    midpoint = round(len(values) / 2)
    first_half = values[:midpoint]
    second_half = values[midpoint:]

    print('Summary Statistics')
    print('mean1={:f}, mean2={:f}'.format(first_half.mean(), second_half.mean()))
    print('variance1={:f}, variance2={:f}'.format(first_half.var(), second_half.var()))
    print('')


# check_adfuller
def check_adfuller(series):
    """Run the augmented Dickey-Fuller test on ``series`` and print the outcome.

    The test is called with ``autolag='AIC'``.  The test statistic, the
    p-value and the critical values are printed; nothing is returned.

    Args:
        series: The observations to test, passed straight to ``adfuller``.
    """
    print ('Results of adfuller Test:')
    outcome = adfuller(series, autolag='AIC')
    # (label, position in the adfuller result tuple)
    report = (
        ('Test statistic: ', 0),
        ('p-value: ', 1),
        ('Critical Values:', 4),
    )
    for label, position in report:
        print(label, outcome[position])
    print('')
    

#define KPSS
def check_kpss(series):
    """Run the KPSS test on ``series`` and print the outcome.

    The test is called with ``regression='c'`` and ``nlags='auto'``.  The
    test statistic, the p-value and the critical values are printed; nothing
    is returned.

    Args:
        series: The observations to test, passed straight to ``kpss``.
    """
    print ('Results of KPSS Test:')
    outcome = kpss(series, regression='c', nlags='auto')
    # (label, position in the kpss result tuple)
    report = (
        ('Test statistic: ', 0),
        ('p-value: ', 1),
        ('Critical Values:', 3),
    )
    for label, position in report:
        print(label, outcome[position])
    print('')


# check_mean_std
def check_mean_std(series, window=12):
    """Plot a series together with its rolling mean and rolling std.

    Three traces are drawn on one plotly figure against ``series['datetime']``:
    the original ``liv`` values, their rolling mean and their rolling
    standard deviation.  The figure is shown; nothing is returned.

    Args:
        series: DataFrame with a ``datetime`` column (x axis) and a ``liv``
            column (values).  It is not modified.
        window: Number of observations in each rolling window (default 12).
    """
    # Keep the rolling statistics in local variables instead of temporarily
    # adding (and later dropping) helper columns on the caller's frame: the
    # input stays untouched even if plotting fails part-way through.
    rolling = series['liv'].rolling(window)
    roll_mean = rolling.mean()
    roll_std = rolling.std()

    # Create traces
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=series['datetime'], y=series['liv'], name='Original'))
    fig.add_trace(go.Scatter(x=series['datetime'], y=roll_mean, name='Rolling Mean'))
    fig.add_trace(go.Scatter(x=series['datetime'], y=roll_std, name='Rolling Std'))
    fig.update_layout(title='Check Stationarity with Rolling Mean and Rolling Std ',xaxis_title='Datetime')
    fig.show()

# Examine the patterns of ACF and PACF (along with the time series plot and histogram)

def tsplot(ts, lags=None, title='', figsize=(14, 8)):
    """Draw a 2x2 diagnostic figure for ``ts.liv`` and run the stationarity checks.

    The figure holds the series itself (top left), its histogram (top right),
    its ACF (bottom left) and its PACF (bottom right).  Afterwards the summary
    statistics, the ADF test and the KPSS test are printed for ``ts.liv`` and
    the rolling mean/std plot is shown for ``ts``.

    Args:
        ts: Frame with a ``liv`` column; it is also handed to
            ``check_mean_std``.
        lags: Forwarded to ``plot_acf`` / ``plot_pacf``.
        title: Title of the time-series panel.
        figsize: Size of the matplotlib figure.

    Returns:
        The time-series, ACF and PACF axes, in that order.
    """
    y = ts.liv
    fig = plt.figure(figsize=figsize)
    grid_shape = (2, 2)
    ts_ax, hist_ax, acf_ax, pacf_ax = (
        plt.subplot2grid(grid_shape, cell)
        for cell in ((0, 0), (0, 1), (1, 0), (1, 1))
    )

    # Top row: raw series and its distribution.
    y.plot(ax=ts_ax)
    ts_ax.set_title(title)
    y.plot(ax=hist_ax, kind='hist', bins=25)
    hist_ax.set_title('Histogram')

    # Bottom row: correlograms, with the x axis starting at lag 0.
    smt.graphics.plot_acf(y, lags=lags, ax=acf_ax)
    smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)
    for corr_ax in (acf_ax, pacf_ax):
        corr_ax.set_xlim(0)
    sns.despine()
    fig.tight_layout()

    # Printed / interactive stationarity checks.
    summary_statistics(y)
    check_adfuller(y)
    check_kpss(y)
    check_mean_std(ts)

    return ts_ax, acf_ax, pacf_ax


In [None]:
# Run the diagnostic plots and the printed stationarity checks on the hourly series `ts`.
tsplot(ts)

Data transformations (detrending, demeaning, differencing, ARMA structure, power transformations, constancy-of-variance considerations, elimination of pulses, level shifts, seasonal pulses and local time trends) are used to convert an observed series into a white-noise series/process. The parameters of this white-noise process (i.e. of the errors left after these transformations), namely the mean, the variance and the covariance at all lags, should be constant over all sub-intervals of time.

### Make TimeSeries Stationary
* Differencing
* Power based Transformations

### Differencing 

#### First-Order Differencing

In [None]:
# Rebuild the hourly-maximum frame from `timeSeries` for this experiment.
ts_diff_first = (
    timeSeries
    .set_index('datetime')
    .resample('1H')
    .max()
    .reset_index()
)
# 1st order differencing: replace `liv` by its change from the previous row,
# then drop the rows left with missing values (the first row has no predecessor).
ts_diff_first = ts_diff_first.assign(liv=ts_diff_first['liv'].diff()).dropna()
# visualization
tsplot(ts_diff_first)

#### Seasonal Differencing 

In [None]:
# Rebuild the hourly-maximum frame from `timeSeries` for this experiment.
ts_diff_seas = (
    timeSeries
    .set_index('datetime')
    .resample('1H')
    .max()
    .reset_index()
)
# Seasonal differencing: replace `liv` by its change from the row 24 steps
# earlier, then drop the rows left with missing values (the first 24 rows
# have no such partner).
ts_diff_seas = ts_diff_seas.assign(liv=ts_diff_seas['liv'].diff(24)).dropna()
# visualization
tsplot(ts_diff_seas)

# Now, forecast the water level for next 24 hours.
* Apply Seasonal Differencing to make the time series stationary
* Train and Test Splitting of time series
* Choose a Method, Train and Test it
* Invert the difference back to the original level
* Compute the MSE metric

#### the following traditional forecasting models are applied:
* Constant Naive model
* Seasonal Naive Model
* ES - Exponential Smoothing
* ARIMA - Autoregressive Integrated Moving Average Model

#### --> For ARIMA, look at the ACF and PACF plots to quickly choose the model parameters

In [None]:
# Work on the `liv` column of the hourly frame `ts` as a single Series.
TS = ts.liv

### Train & Test Splitting

In [None]:
# Hold out the final 24 observations as the test set; everything before
# them is the training set.
TRAIN, TEST = TS.iloc[:-24], TS.iloc[-24:]

### Constant Naive Model

In [None]:
# Constant naive forecast: repeat the last training value for all 24 steps.
pred_constant = [TRAIN.values[-1]] * 24

# MSE
mse = mean_squared_error(TEST.values, pred_constant)
print('MSE', mse, sep='\n')

# plot (only positions from 2150 onward are drawn)
x = list(range(2150, len(TS)))

# Tail of the training data, then target vs. prediction over the test span.
plt.plot(x[:-len(TEST)], TRAIN[2150:], color='green')
for values, colour, legend_name in (
    (TEST.values, 'red', 'target'),
    (pred_constant, 'blue', 'prediction'),
):
    plt.plot(x[-len(TEST):], values, color=colour, label=legend_name)
plt.legend()

### Seasonal Naive Model

In [None]:
# Seasonal naive forecast: reuse the last 24 training values as the
# prediction for the 24 test steps.
pred_seasonal = TRAIN.values[-24:]

# MSE
mse = mean_squared_error(TEST.values, pred_seasonal)
print('MSE', mse, sep='\n')

# plot (only positions from 2150 onward are drawn)
x = list(range(2150, len(TS)))

# Tail of the training data, then target vs. prediction over the test span.
plt.plot(x[:-len(TEST)], TRAIN[2150:], color='green')
for values, colour, legend_name in (
    (TEST.values, 'red', 'target'),
    (pred_seasonal, 'blue', 'prediction'),
):
    plt.plot(x[-len(TEST):], values, color=colour, label=legend_name)
plt.legend()

### make data stationary

In [None]:
# create a n-th order differenced series
def difference(dataset, interval=1):
    """Return the lag-``interval`` differences of ``dataset`` as a list.

    Entry ``k`` of the result is ``dataset[k + interval] - dataset[k]``, so
    the result has ``len(dataset) - interval`` entries (none when ``interval``
    is at least ``len(dataset)``).

    Args:
        dataset: Indexable sequence of values supporting subtraction.
        interval: Distance between the two observations being subtracted.
    """
    return [
        dataset[pos] - dataset[pos - interval]
        for pos in range(interval, len(dataset))
    ]

 
# invert differenced forecast
def inverse_difference(last_ob, value):
    """Undo one differencing step.

    Args:
        last_ob: The observation the difference was taken against.
        value: The differenced value.

    Returns:
        ``value + last_ob``.
    """
    restored = value + last_ob
    return restored

In [None]:
# The first 24 observations have no partner 24 steps earlier; keep them so
# the head of the series can be restored when the differencing is inverted.
v_0 = TS.iloc[:24]
diff_24 = difference(TS, 24)
# Differenced values used for fitting / the last 24 used for evaluation.
train, test = diff_24[:-24], diff_24[-24:]

### Exponential Smoothing

In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Fit an additive-seasonal exponential smoothing model (period 24) on the
# differenced training values.
model_es = ExponentialSmoothing(train, seasonal="add", seasonal_periods=24)
model_es_fit = model_es.fit()

# Forecast the next 24 differenced values.
pred_es = model_es_fit.forecast(24)

# Append the forecast (resp. the held-out differences) to the training
# differences so both arrays line up with TRAIN position by position.
diff_predicted_es = np.hstack((train, pred_es))
diff_target = np.hstack((train, test))


# Total inversion of the differencing: position i of the differenced series
# is added back onto TRAIN[i], and the first 24 raw observations (v_0) are
# put in front.
predicted_es = np.hstack((
    v_0,
    [inverse_difference(TRAIN[i], delta) for i, delta in enumerate(diff_predicted_es)],
))
target = np.hstack((
    v_0,
    [inverse_difference(TRAIN[i], delta) for i, delta in enumerate(diff_target)],
))

# Only the last 24 values belong to the forecast horizon.
y_pred = predicted_es[-24:]
y_true = target[-24:]

# MSE
mse = mean_squared_error(y_true, y_pred)
print('MSE', mse, sep='\n')


# plot (only positions from 2150 onward are drawn)
x = list(range(2150, len(TS)))

plt.plot(x[:-len(TEST)], TRAIN[2150:], color='green')
for values, colour, legend_name in (
    (y_true, 'red', 'target'),
    (y_pred, 'blue', 'prediction'),
):
    plt.plot(x[-len(TEST):], values, color=colour, label=legend_name)
plt.legend()

### ARIMA

#### Configure ARIMA

The parameters (p,d,q) of the ARIMA model are defined as follows:

* --> p: The number of lag observations included in the model, also called the lag order.
* --> d: The number of times that the raw observations are differenced, also called the degree of differencing.
* --> q: The size of the moving average window, also called the order of moving average.

        ** Note: since the time series has already been differenced, d is set to 0.

To select the lag values for the Autoregression (AR) and Moving Average (MA) parameters, p and q respectively, we review Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) plots.

In [None]:
# Open a new figure, then draw the ACF and the PACF of the seasonally
# differenced `liv` values and display the result.
plt.figure()
plot_acf(ts_diff_seas['liv'])
plot_pacf(ts_diff_seas['liv'])
plt.show()

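* The ACF shows significant correlation up to a lag of about 6 hours.
* The PACF shows significant correlation up to a lag of perhaps 2-3 hours.

A good starting point for the p and q values is therefore 6 or 3; the model below uses order (p, d, q) = (6, 0, 3).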

In [None]:
# The legacy `statsmodels.tsa.arima_model.ARIMA` class is no longer usable in
# recent statsmodels releases (0.13+); `statsmodels.tsa.arima.model.ARIMA`
# is its replacement.
from statsmodels.tsa.arima.model import ARIMA

# fit model: order (6, 0, 3) on the already-differenced training values.
# The new `fit()` has no `disp` argument.
model_arima = ARIMA(train, order=(6,0,3))
model_arima_fit = model_arima.fit()

# forecast: the new API returns the point forecasts directly rather than a
# (forecast, stderr, conf_int) tuple, so no `[0]` indexing is needed.
pred_arima = np.asarray(model_arima_fit.forecast(steps=24))

# stack the forecast behind the training differences so the array lines up
# with TRAIN position by position
diff_predicted_arima = np.hstack((train, pred_arima))

# invert the differencing: position i is added back onto TRAIN[i], and the
# first 24 raw observations (v_0) are put in front
predicted_arima = np.hstack((
    v_0,
    [inverse_difference(TRAIN[i], delta) for i, delta in enumerate(diff_predicted_arima)],
))


# Only the last 24 values belong to the forecast horizon.
y_pred_arima = predicted_arima[-24:]

# MSE (y_true comes from the Exponential Smoothing cell above)
mse = mean_squared_error(y_true, y_pred_arima)
print('MSE')
print(mse)


# plot (only positions from 2150 onward are drawn)
x = list(range(2150, len(TS)))

plt.plot(x[:-len(TEST)], TRAIN[2150:], color='green')
plt.plot(x[-len(TEST):], y_true, color='red', label ='target')
plt.plot(x[-len(TEST):], y_pred_arima, color='blue', label ='prediction')
plt.legend()