## The Data
### We are using population time series data
Time series analysis comprises methods for analyzing time series data in order to extract meaningful statistics and other characteristics of the data. Time series forecasting is the use of a model to predict future values based on previously observed values.

In [None]:


# Load packages
import numpy as np                               # vectors and matrices
import pandas as pd                              # tables and data manipulations
import matplotlib.pyplot as plt                  # plots
import seaborn as sns                            # more plots
import warnings                                  # do not disturbe mode
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')

from dateutil.relativedelta import relativedelta # working with dates with style
from scipy.optimize import minimize              # for function minimization

import statsmodels.formula.api as smf            # statistics and econometrics
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs

from itertools import product                    # some useful functions
from tqdm import tqdm_notebook

# Importing everything from forecasting quality metrics
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_squared_error, mean_squared_log_error

In [None]:
from dateutil.relativedelta import relativedelta # working with dates with style
from scipy.optimize import minimize              # for function minimization

import statsmodels.formula.api as smf            # statistics and econometrics
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs

from itertools import product                    # some useful functions
from tqdm import tqdm_notebook
import matplotlib
# Global plot styling: label sizes for the axes/ticks and black text.
matplotlib.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'text.color': 'k',
})




In [None]:
# MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error between y_true and y_pred.

        y_true - observed values (used as the denominator)
        y_pred - predicted values
    """
    relative_error = (y_true - y_pred) / y_true
    absolute_relative_error = np.abs(relative_error)
    return np.mean(absolute_relative_error) * 100
    
def tsplot(y, lags=None, figsize=(12, 7), style='bmh'):
    """
        Draw a three-panel figure: the series on top (titled with the
        Dickey-Fuller p-value) and its ACF / PACF side by side below.

        y - timeseries (converted to a pandas Series if it is not one)
        lags - how many lags to include in ACF, PACF calculation
        figsize - size of the figure
        style - matplotlib style applied while drawing
    """
    series = y if isinstance(y, pd.Series) else pd.Series(y)

    with plt.style.context(style):
        plt.figure(figsize=figsize)
        grid = (2, 2)
        # top row spans both columns; bottom row holds ACF (left) and PACF (right)
        series_ax = plt.subplot2grid(grid, (0, 0), colspan=2)
        acf_panel = plt.subplot2grid(grid, (1, 0))
        pacf_panel = plt.subplot2grid(grid, (1, 1))

        series.plot(ax=series_ax)
        # element 1 of the adfuller result is used as the p-value in the title
        adf_p_value = sm.tsa.stattools.adfuller(series)[1]
        title = 'Time Series Analysis Plots\n Dickey-Fuller: p={0:.5f}'.format(adf_p_value)
        series_ax.set_title(title)

        smt.graphics.plot_acf(series, lags=lags, ax=acf_panel)
        smt.graphics.plot_pacf(series, lags=lags, ax=pacf_panel)
        plt.tight_layout()

## [Work-Source](https://github.com/Alro10/deep-learning-time-series/blob/master/notebooks/SARIMA.ipynb)

In [None]:
# Load the CSV, using the 'date' column (parsed as dates) as the index.
ads = pd.read_csv("../input/population-time-series-data/POP.csv", index_col=['date'], parse_dates=['date'])



In [None]:
ads.head()

In [None]:
# Drop the two realtime_* columns. The frame is rebound instead of mutated
# in place, and errors='ignore' keeps the cell re-runnable once the columns
# are already gone (the in-place version raised KeyError on a second run).
col = ["realtime_start", "realtime_end"]
ads = ads.drop(columns=col, errors='ignore')

In [None]:
ads.head()

In [None]:
# Only the last expression of a cell is rendered, so the shape is printed
# explicitly and describe() stays as the displayed output.
print(ads.shape)
ads.describe()

In [None]:
ads.info()

In [None]:
# Count missing values per column (author's note: no missing values)
ads.isnull().sum()

In [None]:
# Line plot of the whole frame (title typo "Montly" corrected).
plt.figure(figsize=(18, 6))
plt.plot(ads)
plt.title("Monthly Value")
plt.show()

In [None]:

# There is a strong trend
tsplot(ads.value,lags=10)



In [None]:
# Take the natural log of the 'value' column and inspect it with 60 lags.
ads_log=np.log(ads.value)
tsplot(ads_log,lags=60)

In [None]:
# Lag-24 difference: subtract the value observed 24 rows earlier.
ads_log_diff = ads_log - ads_log.shift(24)
# The first 24 entries are NaN after the shift, so they are sliced off before plotting.
tsplot(ads_log_diff[24:], lags=60)

In [None]:
# Lag-24 difference followed by a lag-1 difference, derived directly from
# ads_log so that re-running this cell always yields the same series
# (the previous self-referential update differenced once more on every run).
ads_log_diff = ads_log.diff(24).diff(1)
# The first 24+1 entries are NaN after the two differences; skip them.
tsplot(ads_log_diff[24+1:], lags = 60)

### We can use the decomposition method to plot the three distinct components of the time series: trend, seasonality and noise
We can see that there is no seasonality.

In [None]:
# Set the default figure size through the already-imported pyplot module
# (pylab's rcParams is the same matplotlib.rcParams object), then plot the
# additive decomposition of the series.
plt.rcParams['figure.figsize'] = 18, 8
decomposition = sm.tsa.seasonal_decompose(ads, model='additive')
fig = decomposition.plot()
plt.show()

### Function to search for the parameters p, q, P, Q of the SARIMA model

In [None]:
# Search grid for the SARIMA orders (p, d, q)(P, D, Q, s)
ps = range(2, 5)   # candidate non-seasonal AR orders p
d = 1              # non-seasonal differencing order
qs = range(2, 5)   # candidate non-seasonal MA orders q
Ps = range(0, 2)   # candidate seasonal AR orders P
D = 1              # seasonal differencing order
Qs = range(0, 2)   # candidate seasonal MA orders Q
s = 24             # season length

# creating list with all the possible (p, q, P, Q) combinations
parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)
# a bare expression in the middle of a cell is discarded, so print the count
print(len(parameters_list))


def optimizeSARIMA(y, parameters_list, d, D, s):
    """Fit one SARIMAX model per parameter combination and rank them by AIC.

        y - time series
        parameters_list - list with (p, q, P, Q) tuples
        d - integration order in ARIMA model
        D - seasonal integration order
        s - length of season

        Returns a DataFrame with columns 'parameters' and 'aic', sorted by
        ascending AIC (the lower the better). Combinations whose fit raises
        an exception are skipped; if no combination can be fitted the
        returned frame is empty but still has both columns.
    """
    results = []

    for param in tqdm_notebook(parameters_list):
        # Some combinations fail to converge; skip those. Catch Exception
        # rather than using a bare except so that KeyboardInterrupt can still
        # stop a long-running search.
        try:
            model = sm.tsa.statespace.SARIMAX(
                y,
                order=(param[0], d, param[1]),
                seasonal_order=(param[2], D, param[3], s),
            ).fit(disp=-1)
        except Exception:
            continue
        results.append([param, model.aic])

    # Passing the column names to the constructor keeps this valid even when
    # `results` is empty (assigning .columns afterwards raised ValueError).
    result_table = pd.DataFrame(results, columns=['parameters', 'aic'])
    # sorting in ascending order, the lower AIC is - the better
    return result_table.sort_values(by='aic', ascending=True).reset_index(drop=True)



In [None]:
%%time
# Run the AIC search over every entry of parameters_list; %%time reports the cell's run time.
warnings.filterwarnings("ignore") 
result_table = optimizeSARIMA(ads, parameters_list, d, D, s)

In [None]:
# Parameters and AIC: print the lowest AIC and leave the table preview as the
# cell's last expression so that both are shown (previously head() was discarded).
print("Lowest AIC:", result_table.aic.min())
result_table.head()

In [None]:
# Set the parameters from the first row of result_table and refit on ads.value.
# NOTE(review): an earlier note here named (2,1,2)(0,1,1,24) as the best model;
# that came from a past run — confirm against the current result_table.
p, q, P, Q = result_table.parameters[0]

best_model=sm.tsa.statespace.SARIMAX(ads.value, order=(p, d, q), 
                                        seasonal_order=(P, D, Q, s)).fit(disp=-1)
print(best_model.summary())

In [None]:
# Second model with hard-coded orders (3, 1, 2)(0, 1, 1, 24) and the
# stationarity / invertibility constraints switched off.
# NOTE(review): the orders are literals rather than taken from result_table —
# confirm they match the parameters selected by the search.
mod=sm.tsa.statespace.SARIMAX(ads, order=(3, 1, 2), 
                                        seasonal_order=(0, 1, 1, 24),
                             enforce_stationarity=False,
                             enforce_invertibility=False)

#Fitting the model
result=mod.fit()

In [None]:
print(result.summary().tables[1])

In [None]:
result.plot_diagnostics(figsize=(16, 8))
plt.show()

## Model diagnostics to investigate any unusual issues
#### The residuals are approximately normally distributed, which suggests that our model is good

In [None]:

# Inspect the residuals of best_model, skipping the first 24+1 entries.
tsplot(best_model.resid[24+1:], lags=60)

In [None]:
def plotSARIMA(series, model, n_steps):
    """Plots actual values against fitted values plus an out-of-sample forecast

        series - single-column DataFrame with the timeseries
        model - fitted SARIMA model (must expose fittedvalues and predict)
        n_steps - number of steps to predict in the future

        Reads the module-level `s` (season length) and `d` (differencing
        order): the first s+d fitted values are masked and excluded from the
        error shown in the title. The input `series` is not modified.
    """
    burn_in = s + d

    # adding model values
    data = series.copy()
    data.columns = ['actual']
    data['sarima_model'] = model.fittedvalues
    # making a shift on s+d steps, because these values were unobserved by the
    # model due to the differencing. Positional .iloc avoids chained
    # assignment, and np.nan replaces the np.NaN alias removed in NumPy 2.0.
    data.iloc[:burn_in, data.columns.get_loc('sarima_model')] = np.nan

    # forecasting exactly n_steps forward; `end` is inclusive, so the last
    # forecast position is n_obs + n_steps - 1 (previously one extra step)
    n_obs = data.shape[0]
    if n_steps > 0:
        future = model.predict(start=n_obs, end=n_obs + n_steps - 1)
        # Series.append was removed in pandas 2.0; pd.concat is the replacement
        forecast = pd.concat([data['sarima_model'], future])
    else:
        forecast = data['sarima_model']

    # calculate error, again having shifted on s+d steps from the beginning
    error = mean_absolute_percentage_error(data['actual'].iloc[burn_in:],
                                           data['sarima_model'].iloc[burn_in:])

    plt.figure(figsize=(15, 7))
    plt.title("Mean Absolute Percentage Error: {0:.2f}%".format(error))
    plt.plot(forecast, color='r', label="model")
    # shade the out-of-sample (forecast) region
    plt.axvspan(data.index[-1], forecast.index[-1], alpha=0.5, color='lightgrey')
    plt.plot(data.actual, label="actual")
    plt.legend()
    

In [None]:
# Plot best_model against the data with a forecast of 120 steps
plotSARIMA(ads, best_model, 120)

In [None]:

# Dynamic prediction from 2010-01-01 onward, with its confidence interval,
# drawn over the observed data from 1990.
prediction_start = pd.to_datetime('2010-01-01')
pred = result.get_prediction(start=prediction_start, dynamic=True)
pred_ci = pred.conf_int()

ax = ads['1990':].plot(label='observed')
pred.predicted_mean.plot(ax=ax, label='Forecast', alpha=.7, figsize=(14, 7))

# shade the band between the first and second columns of the interval frame
ci_lower = pred_ci.iloc[:, 0]
ci_upper = pred_ci.iloc[:, 1]
ax.fill_between(pred_ci.index, ci_lower, ci_upper, color='k', alpha=.2)

ax.set_xlabel('Date ')
ax.set_ylabel('value of Pop')
plt.legend()
plt.show()

### Forecast visualization

In [None]:

# Out-of-sample forecast of 120 steps with its confidence interval,
# drawn over the full observed series.
pred_u = result.get_forecast(steps=120)
pred_ci = pred_u.conf_int()

ax = ads.plot(label='observed', figsize=(14, 7))
pred_u.predicted_mean.plot(ax=ax, label='Forecast')

# shade the band between the first and second columns of the interval frame
ci_lower = pred_ci.iloc[:, 0]
ci_upper = pred_ci.iloc[:, 1]
ax.fill_between(pred_ci.index, ci_lower, ci_upper, color='k', alpha=.25)

ax.set_xlabel('Date ')
ax.set_ylabel('value of Pop')
plt.legend()
plt.show()

In [None]:
#predict Value
pred_ci