# Forecasting with Autoregressive Integrated Moving Average (ARIMA)

This notebook follows the approach described in this [paper](https://doi.org/10.1016/j.asoc.2021.107161).

First, we install `pmdarima`, which provides the `auto_arima` model-selection routine.

In [None]:
!pip3 install pmdarima

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import matplotlib
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.tools import diff
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from pmdarima.arima import auto_arima

from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Load the confirmed-cases table from the CSV file.
dataset_confirmed = pd.read_csv('../input/covid-confirmed-global/confirmed_global.csv')
# Preview the first rows (shown as the cell output).
dataset_confirmed.head()

# Select Data

In this case, we use the USA as an example.

In [None]:
# Keep only the rows labelled 'US', then discard the descriptive columns so
# that only the remaining columns (presumably one per date) are left.
is_us = dataset_confirmed['Country/Region'] == 'US'
dataset_confirmed_usa = dataset_confirmed.loc[is_us].drop(
    columns=['Province/State', 'Country/Region', 'Lat', 'Long']
)
dataset_confirmed_usa.head()

In [None]:
# Flip the table so every former column label becomes a row, and name the
# single resulting column 'cases'.
dataset_confirmed_usa = dataset_confirmed_usa.transpose()
dataset_confirmed_usa.columns = ['cases']
dataset_confirmed_usa.head()

In [None]:
# Replace the row labels with a proper daily DatetimeIndex. The copy keeps
# `dataset_confirmed_usa` itself untouched.
index = pd.date_range(start='20200122', end='20210410', freq='d')
dataset_confirmed_complex = dataset_confirmed_usa.copy().set_index(index)
dataset_confirmed_complex

The data will be split into two parts: one for training and one for testing.

In [None]:
def _rows_between(frame, first_day, last_day):
    """Return the rows of *frame* whose index lies in [first_day, last_day].

    Both bounds are inclusive and are compared against ``frame.index``.
    """
    in_window = (frame.index >= first_day) & (frame.index <= last_day)
    return frame.loc[in_window]


# Window boundaries, defined once so the three subsets cannot drift apart.
STUDY_START = "2020-01-22"
TRAIN_END = "2020-07-24"
TEST_START = "2020-07-25"
STUDY_END = "2020-09-21"

# Full study window (used for visualization), followed by the consecutive
# training and testing windows that together cover it.
filtered_dates_all = _rows_between(dataset_confirmed_complex, STUDY_START, STUDY_END)
filtered_dates_train = _rows_between(dataset_confirmed_complex, STUDY_START, TRAIN_END)
filtered_dates_predict = _rows_between(dataset_confirmed_complex, TEST_START, STUDY_END)

In [None]:
# Number of rows in the testing window.
len(filtered_dates_predict)

We also need to check whether there are any missing values.

In [None]:
# Print, for every column, the share of missing entries as a percentage
# rounded to two decimals.
total_rows = len(dataset_confirmed_complex)
for col in dataset_confirmed_complex.columns:
    missing = dataset_confirmed_complex[col].isnull().sum()
    print(col, f"{round(100 * missing / total_rows, 2)!s}%")

In [None]:
# Draw the training and testing windows on the same axes; the second call
# overrides the line color with red so the two segments can be told apart.
sns.lineplot(y="cases", x=filtered_dates_train.index, data=filtered_dates_train)
sns.lineplot(y="cases", x=filtered_dates_predict.index, data=filtered_dates_predict,  color='red')

# Tilt the date tick labels slightly.
plt.xticks(rotation=15)
plt.title('Plot USA 22/01 - 21/09')
plt.show()

* Blue line: training data
* Red line: testing data

In [None]:
#Visualisasi Data latih
# Visualize the training data, with a colorbar that maps the viridis
# colormap onto the date range covered by the training index.
df = filtered_dates_train.copy()

# Named `date_mappable` rather than `sm` so that the `statsmodels.api as sm`
# import alias is not overwritten.
date_mappable = plt.cm.ScalarMappable(
    cmap='viridis',
    norm=plt.Normalize(vmin=df.index.min().value,
                       vmax=df.index.max().value),
)
# The mappable is not backed by any plotted data, so give it an empty array
# through the public API instead of assigning the private `_A` attribute.
date_mappable.set_array([])

train_ax = df.plot(legend=False, colormap='viridis', figsize=(15, 10))

# Pass the axes explicitly: the mappable is attached to no axes, so the
# colorbar cannot infer on its own where to draw.
cbar = plt.colorbar(date_mappable, ax=train_ax)

# Pin the tick positions before relabelling them, so each date label stays
# attached to the tick value it was computed from.
tick_values = cbar.get_ticks()
cbar.set_ticks(tick_values)
cbar.set_ticklabels(pd.to_datetime(tick_values).strftime(date_format='%b %Y'))

# Identify Data Trends

A core step of time series analysis is checking whether the data contain a trend.

In [None]:
# Autocorrelation and partial autocorrelation of the raw training series for
# the first 40 lags. The trailing semicolons suppress the cell's text output.
# NOTE(review): `label="90"` looks like a leftover plotting keyword; its
# purpose is unclear — confirm whether it is needed.
plot_acf(df["cases"], lags = 40, label = "90");
plot_pacf(df["cases"], lags = 40, label = "90");

In [None]:
# Store the first- and second-order differences of the case counts in the
# columns `d1` and `d2`.
for order in (1, 2):
    df[f"d{order}"] = diff(df["cases"], k_diff=order)

## First Differencing

In [None]:
# ACF / PACF of the first-differenced series, skipping the first row, for
# which no first difference exists.
first_diff = df["d1"].iloc[1:].to_numpy().squeeze()
plot_acf(first_diff, lags=40);
plot_pacf(first_diff, lags=40);

## Second Differencing

In [None]:
# ACF / PACF of the second-differenced series, skipping the first two rows,
# for which no second difference exists.
second_diff = df["d2"].iloc[2:].to_numpy().squeeze()
plot_acf(second_diff, lags=40);
plot_pacf(second_diff, lags=40);

In [None]:
# Enlarge figures from here on (this changes the global matplotlib default).
plt.rcParams["figure.figsize"] = (20, 10)

# Multiplicative decomposition of the training series.
result = seasonal_decompose(df['cases'], model='multiplicative')
fig = result.plot()
# plt.show() does not take a figure argument; it displays all open figures.
plt.show()

# Build the basic model with Auto ARIMA

In [None]:
# Hold out the last 20% of the rows in `df` as an evaluation slice.
size = int(len(df)*0.8)
train_data= df.iloc[:size]
test_data =df.iloc[size:]
len(test_data)
# NOTE(review): the order search below runs on the full `df['cases']`, not on
# `train_data`, so the held-out slice is seen during model selection —
# confirm this is intended.
# Stepwise, non-seasonal search with p and q each ranging from 0 up to 7;
# `d=None` leaves the differencing order to auto_arima.
step_fit = auto_arima(df['cases'], start_p=0, start_q=0, max_p=7, max_q=7,
          seasonal=False, # for SARIMA models seasonality is set to True
          d=None, trace=True, enforce_stationarity =False, enforce_invertibility = False,
          error_action='ignore', suppress_warnings=True, maxiter = 50, stepwise=True)
step_fit.summary()


According to `auto_arima`, the best model is ARIMA(1,2,1).

In [None]:
# Fit ARIMA(1,2,1) on the full `df["cases"]` series (cast to float) under
# IPython's `%%time` cell magic, and show the fit summary.
# NOTE(review): `ARIMA` is imported from `statsmodels.tsa.arima_model`, which
# is presumably only usable with older statsmodels releases — confirm the
# installed version before upgrading.
get_ipython().run_cell_magic('time', '', 'model_base = ARIMA(df["cases"].astype(float), order =(1,2,1))\nresults_base = model_base.fit()\nresults_base.summary()')

# Predict the test data

In [None]:
# Positions of the held-out slice within `df` (both ends inclusive).
start = len(train_data)
end = len(train_data) + len(test_data) - 1

# NOTE: `results_base` was fitted on the whole of df["cases"], which includes
# these positions, so the values below are in-sample predictions.
predictions_base = results_base.predict(
    start=start, end=end, dynamic=False, typ='levels'
).rename('BASE_model Predictions')

# Pair the two series by order instead of using `series[i]`: integer keys on
# a date-indexed Series are deprecated as positional lookups in pandas.
for predicted_value, expected_value in zip(predictions_base, test_data['cases']):
    print(f"predicted={predicted_value:<11.10}, expected={expected_value}")

plt.rc('axes', axisbelow=True)
fig = plt.figure(figsize=(15, 9))
Test_data = plt.plot(test_data['cases'], "o", color="#ff7f0e", label="Test data (USA)")
predicted = plt.plot(predictions_base, color='#1f77b4', label='Predictions(Basic model)', linewidth=2)
# Both plot calls set a label; draw the legend so the labels are shown.
plt.legend()

# Evaluation of the basic model

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Args:
        y_true: Observed values (any array-like).
        y_pred: Predicted values, paired element-wise with ``y_true``.

    Returns:
        ``mean(|(y_true - y_pred) / y_true|) * 100``. Where ``y_true``
        contains zeros the result is ``inf`` or ``nan``, following NumPy's
        division semantics.
    """
    # Flatten to 1-D floats so that, for example, an (n, 1) column and an
    # (n,) vector are paired element-wise instead of broadcasting to (n, n).
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [None]:
# Error metrics of the base model on the held-out slice.
MSE = mean_squared_error(test_data['cases'], predictions_base)
MAE = mean_absolute_error(test_data['cases'], predictions_base)
# RMSE is the square root of MSE. Computing it directly avoids the
# `squared=False` keyword, which scikit-learn deprecated in 1.4 and later
# removed.
RMSE = np.sqrt(MSE)
MAPE = mean_absolute_percentage_error(test_data['cases'], predictions_base)
print('-------------------- Error Data Testing -----------------')
print("MSE: ", MSE)
print("MAE: ", MAE)
print("RMSE: ", RMSE)
print("MAPE: ", MAPE)

#  Development of the basic model based on the ACF and PACF

The `auto_arima` model reaches a MAPE of 0.14%, which is already quite good. Nevertheless, we also try to build a model based on the ACF and PACF plots, which leads us to ARIMA(7,2,1).

In [None]:
# Fit ARIMA(7,2,1) on the full `df['cases']` series (cast to float).
model = ARIMA(df['cases'].astype(float), order=(7,2,1))
results = model.fit(start_ar_lags=8) # error on the start_ar_lags parameter: it is already deprecated => start_ar_lags=8
# Predict from the position just past the last row of `df` up to 60
# positions further (61 values in total).
fcast=results.predict(len(df),len(df)+60,typ='levels').rename('ARIMA(7,2,1) Forecast')
# Residual diagnostics: ACF and PACF of the fitted model's residuals for the
# first 20 lags, each drawn on its own figure.
fig, ax = plt.subplots(figsize=(6, 4),dpi=100)
plot_acf(results.resid, lags =20,ax=ax,color ='#1f77b4',linewidth =0.1)
fig, ax = plt.subplots(figsize=(6, 4),dpi=100)
plot_pacf(results.resid, lags =20,ax=ax,color ='#1f77b4',linewidth =0.1)

# Plot predictions based on developed model and actual data

In [None]:
# Positions of the held-out slice within `df` (both ends inclusive).
start = len(train_data)
end = len(train_data) + len(test_data) - 1

# NOTE: `results` was fitted on the whole of df['cases'], which includes
# these positions, so the values below are in-sample predictions.
predictions = results.predict(
    start=start, end=end, dynamic=False, typ='levels'
).rename('Selected model Predictions')

# Pair the two series by order instead of using `series[i]`: integer keys on
# a date-indexed Series are deprecated as positional lookups in pandas.
for predicted_value, expected_value in zip(predictions, test_data['cases']):
    print(f"predicted={predicted_value:<11.10}, expected={expected_value}")

plt.rc('axes', axisbelow=True)
fig = plt.figure(figsize=(10, 9))
Test_data = plt.plot(test_data['cases'], "o", color="#ff7f0e", label="Test data (USA)")
predicted = plt.plot(predictions, color='#1f77b4', label='Predictions(ARIMA 7,2,1)', linewidth=2)
# Both plot calls set a label; draw the legend so the labels are shown.
plt.legend()

# Evaluation metrics for developed model

In [None]:
# Error metrics of the ARIMA(7,2,1) model on the held-out slice.
MSE = mean_squared_error(test_data['cases'], predictions)
MAE = mean_absolute_error(test_data['cases'], predictions)
# RMSE is the square root of MSE. Computing it directly avoids the
# `squared=False` keyword, which scikit-learn deprecated in 1.4 and later
# removed.
RMSE = np.sqrt(MSE)
MAPE = mean_absolute_percentage_error(test_data['cases'], predictions)
print('-------------------- Error Data Testing -----------------')
print("MSE: ", MSE)
print("MAE: ", MAE)
print("RMSE: ", RMSE)
print("MAPE: ", MAPE)

* ARIMA based on auto_arima **(1,2,1)**: MAPE = **0.14%**
* ARIMA based on the ACF & PACF plots **(7,2,1)**: MAPE = **0.1%**

# Diagnosing the developed model with kde/q–q plots

In [None]:
# Kernel density estimate of the fitted model's residuals, drawn on a
# dedicated 10x5-inch axes.
fig, ax = plt.subplots(figsize=(10, 5), dpi=100)
results.resid.plot.kde(ax=ax)

# Forecasting time-series data based on the selected model

Using the best model, ARIMA(7,2,1), we forecast the testing period.

In [None]:
# Refit the ARIMA(7,2,1) model with default fit options and plot its
# predictions for positions 10 through 243 of the series on `ax`, together
# with the in-sample data (`plot_insample = True`).
figs, ax = plt.subplots(figsize=(10,5),dpi = 100)
model_fit = model.fit()
# `figs` is rebound here to whatever plot_predict returns.
figs = model_fit.plot_predict(10, 243, dynamic = False, plot_insample = True, ax=ax)
plt.show()

In [None]:
# First and last positions (inclusive) of the series to predict.
PREDICT_START, PREDICT_END = 10, 243

predictions = model_fit.predict(
    PREDICT_START, PREDICT_END, dynamic=False, typ='levels'
).rename('Selected model Predictions')

# Keep only the predictions that lie past the rows of `df`, i.e. the part to
# compare against the testing window. The offset is derived from len(df)
# instead of being hard-coded (it equals 175 when `df` has 185 rows).
predictions_new = predictions.iloc[len(df) - PREDICT_START:].copy()

# Pair the two series by order instead of using `series[i]`: integer keys on
# a date-indexed Series are deprecated as positional lookups in pandas.
for predicted_value, expected_value in zip(predictions_new, filtered_dates_predict['cases']):
    print(f"predicted={predicted_value}, expected={expected_value}")

plt.rc('axes', axisbelow=True)
fig = plt.figure(figsize=(15, 9))
Test_data = plt.plot(filtered_dates_predict['cases'], "o", color="#ff7f0e", label="Testing data (USA) 25 Juli 2020 - 21 September 2020")
predicted = plt.plot(predictions_new, color='#1f77b4', label='Predictions(ARIMA 7,2,1)', linewidth=2)
# Both plot calls set a label; draw the legend so the labels are shown.
plt.legend()