# ARIMA (autoregressive integrated moving average) Model

## Import, handle missing values, create a clean dataframe

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from math import floor, sqrt

# Load the Iran COVID-19 workbook (2021-11 snapshot); the first column is parsed
# as dates and becomes the index.
# A raw string is required here: "\D", "\M", "\C" and "\I" are invalid escape
# sequences in a normal literal and trigger a Deprecation/SyntaxWarning.
ic = pd.read_excel(r"D:\DS\Mine\Corona\I21-11.xlsx", index_col=0, parse_dates=[0])

# Fill gaps: vaccinations are linearly interpolated, the stringency index
# carries its last known value forward.
ic['total_vaccinations_f'] = ic['total_vaccinations'].interpolate(method='linear')
ic['stringency_index_f'] = ic['stringency_index'].ffill()  # fillna(method=...) is deprecated

# Keep only the columns used downstream. The explicit copy makes `icf` an
# independent frame, so the assignment below neither warns nor touches `ic`.
icf = ic.loc[:, ['datei', 'total_cases', 'new_cases', 'new_cases_smoothed',
                 'total_deaths', 'new_deaths', 'new_deaths_smoothed',
                 'total_vaccinations_f', 'stringency_index_f']].copy()

# Re-base the day counter by a fixed offset of 43880
# (presumably an Excel serial day number - TODO confirm).
icf["datei"] = icf["datei"] - 43880
pd.options.display.float_format = '{:,.2f}'.format
# pd.set_option('display.max_rows', 2000)
icf

## Checking stationarity of the data 'new_deaths'

In [None]:
# Plot the raw daily 'new_deaths' series as a first visual check of stationarity.
icf.plot(y='new_deaths')

In [None]:
from statsmodels.graphics.tsaplots import plot_acf
# icn: the undifferenced 'new_deaths' series, reused by the following cells.
icn = icf.loc[:,'new_deaths']
# Autocorrelation plot of the raw series.
plot_acf(icn);

In [None]:
# First-order difference of the series. The first element has no predecessor
# and is therefore NaN, so it is dropped by position.
icd = icn.diff(periods=1).iloc[1:]
icd

In [None]:
# Autocorrelation plot of the once-differenced series.
plot_acf(icd);

In [None]:
# Line plot of the once-differenced series.
icd.plot()

In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the differenced series; the first two entries
# of the result are reported as the test statistic and the p-value.
result = adfuller(icd)
adf_statistic, adf_p_value = result[0], result[1]
print('ADF Statistic: %f' % adf_statistic)
print('p-value: %f' % adf_p_value)

**>> With d = 1 (first-order differencing), the data become stationary (ADF p-value < 0.05).**

## Defining constants, splitting train and test for ARIMA

In [None]:
# Sizes of the whole series and of its training section (first 90% of records).
length = len(icn)
train_len = floor(0.90 * length)
print('Out of', length, 'samples,', train_len, 'samples were devoted to training section and the rest (i.e.',
      length - train_len, 'samples) to test section.')

# Prediction length: number of days to predict.
prl = 10

# xd: full series as an array; xr: training section;
# xs: the `prl` actual values right after the training section;
# xsp: placeholder for the predictions that correspond to xs.
xd = icn.values
xr, xs = xd[:train_len], xd[train_len:train_len + prl]
xsp = []

### Single set of parameters for an initial model

In [None]:
# Fit one hand-picked ARIMA order on the training section and forecast the
# next `prl` days, printing each forecast next to the actual value.
from statsmodels.tsa.arima.model import ARIMA

# (p, d, q) of the initial model. It is defined once and reused for both the
# fit and the printed message, so the report always names the order that was
# actually fitted.
initial_order = (10, 1, 2)

print('On', icf.index[train_len].strftime('%Y-%m-%d'), 'a prediction of', prl, 'days is as follows:')
arim = ARIMA(xr, order=initial_order)
arim_fit = arim.fit()

# print(arim_fit.aic)
xsp = arim_fit.forecast(steps=prl)
print('Using (p, d, q) = {}:'.format(initial_order))
# zip stops at the shorter sequence, so fewer than `prl` actuals cannot raise.
for actual, predicted in zip(xs, xsp):
    print('Actual =', actual, '- predicted =', "{:2.1f}".format(predicted))

In [None]:
# Display the estimated parameters of the fitted initial model.
arim_fit.params


In [None]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from math import sqrt, floor

# Error summary of the forecast `xsp` against the actual values `xs`.
# Each row: printed label, number format, and the metric to evaluate.
error_report = (
    ('MAE:  Mean Absolute Error =           ', "{:2.2f}", mean_absolute_error),
    ('MAPE: Mean Absolute Percentage Error =', "{:2.1%}", mean_absolute_percentage_error),
    ('RMSE: Root Mean Squared Error =       ', "{:2.2f}",
     lambda actual, predicted: sqrt(mean_squared_error(actual, predicted))),
)
for report_label, report_format, report_metric in error_report:
    print(report_label, report_format.format(report_metric(xs, xsp)))

## Search to find the best parameters for ARIMA

### Constants

In [None]:
# Spacing, in records, between successive forecast origins used by the parameter search.
interval = 30    #interval: is going to be used for predictions at every 'interval' records
# Index of the first forecast origin; models are fitted on the records before it.
startp = 200     #start position: to start forecasting after this point

### Parameters sets

In [None]:
import itertools  # Cartesian product of the candidate values

# Candidate values for each component of the ARIMA order.
# (A sparser hand-picked list for p, e.g. [1, 2, 3, 7, 10, 12, 14], gives a smaller grid.)
pv = range(1, 20)   # p
dv = range(0, 3)    # d
qv = range(0, 3)    # q

# Every (p, d, q) combination; p varies slowest and q fastest.
pdq = [*itertools.product(pv, dv, qv)]
print(len(pdq), 'set of parameters are going to be compared:')


### Fitting models and RMSE comparison within the training section

In [None]:
# Rolling-origin search over every (p, d, q) in `pdq`.
# For each parameter set, a model is fitted on xr[:origin] at several origins
# inside the training section, the next `prl` values are predicted, and the
# RMSE against the actual values is stored. The set with the lowest mean RMSE
# is kept in `best_cfg`.
import warnings
import itertools
import math
from statistics import mean, stdev
warnings.filterwarnings('ignore')

lowest_rmse, best_cfg = float("inf"), None
# Number of forecast origins that fit between `startp` and the end of the
# training section (clamped so a short series yields an empty search, not a crash).
cols = max(0, math.floor((train_len - startp - prl) / interval))
# rmse[i][j]: RMSE of parameter set i at origin j. Cells start as NaN so that a
# failed fit is recorded as "missing" rather than as a perfect score of 0.
rmse = np.full([len(pdq), cols], np.nan)
print('Search to find best parameters using', len(pdq), 'parameters sets in', cols, 'points of dataset:')
np.set_printoptions(precision=0)
fit_model = [None]*len(pdq)

for i, cfg in enumerate(pdq):
    print('Parameters set', i, ': (p, d, q) =', cfg)
    for j in range(cols):
        origin = startp + j*interval
        try:
            fit_model[i] = ARIMA(xr[:origin], order=cfg).fit()
            xs = xr[origin:origin+prl]
            xsp = fit_model[i].predict(start=origin, end=origin+prl-1)
            print('On', origin, 'Actuals =', xs, 'Predictions =', xsp)
            rmse[i][j] = sqrt(mean_squared_error(xs, xsp))
        except Exception as e:
            # Best effort: report the failure and move on; the cell stays NaN.
            print(e)

    # Score this parameter set on the origins where it could be fitted.
    scored = rmse[i][~np.isnan(rmse[i])]
    if scored.size == 0:
        # Nothing succeeded, so this set must not compete for the best score.
        print("  > RMSE: n/a (no successful fit)")
        continue
    cfg_rmse = float(scored.mean())
    if cfg_rmse < lowest_rmse:
        lowest_rmse, best_cfg = cfg_rmse, cfg
    print("  > RMSE:","{:2.2f}".format(cfg_rmse))

print('********************************* Best Score *********************************')
print('Lowest RMSE', "{:2.2f}".format(lowest_rmse), 'was acheived using parameters set (p, d, q) =', best_cfg)
# print(rmse)


### Prediction in test section using best set of parameters found above and calculations of errors:

In [None]:
# Forecast the first `prl` days of the test section with the best (p, d, q)
# found by the search, then report MAE, MAPE and RMSE against the actuals.
xs = xd[train_len : train_len + prl]
forecast_start_date = icf.index[train_len].strftime('%Y-%m-%d')
print('On', forecast_start_date, 'a prediction of', prl, 'days is as follows:')
try:
    print('Using the best found set of parameters (p, d, q) =', best_cfg, ':')
    best_fit = ARIMA(xr, order=best_cfg).fit()
    xsp = best_fit.predict(start=train_len, end=train_len+prl-1)
    for day in range(prl):
        print('Actual =', xs[day], '- predicted =', "{:2.1f}".format(xsp[day]))

    test_mae = mean_absolute_error(xs, xsp)
    print('\nMAE:  Mean Absolute Error =           ', "{:2.2f}".format(test_mae))
    test_mape = mean_absolute_percentage_error(xs, xsp)
    print('MAPE: Mean Absolute Percentage Error =', "{:2.1%}".format(test_mape))
    test_rmse = sqrt(mean_squared_error(xs, xsp))
    print('RMSE: Root Mean Squared Error =       ', "{:2.2f}".format(test_rmse))
except Exception as e:
    # Any failure is printed rather than raised.
    print(e)

## Conclusion

**>> In this notebook, we have established a technique for finding the best set of parameters for an ARIMA model that predicts daily new COVID-19 deaths. Briefly, the technique fits an ARIMA model at several points within the training section and then uses each fitted model to predict the values that immediately follow.**

**Several attempts were made to implement the search and assessment techniques described above. The technique helps avoid overfitting because each model is tested on data that lie outside its own training boundaries.**

**The model shows acceptable performance over the 10 days following the last available data point. It is based solely on previous samples of the studied feature. Also, because only local data are used, it cannot predict new peaks in cases/deaths caused by outbreaks of new variants elsewhere in the world. This limits the application of ARIMA to short-term predictions (such as 1 or 2 weeks).**