In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

import itertools
import seaborn as sns
import plotly.express as px
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, acf, pacf,arma_order_select_ic
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA

import warnings
warnings.simplefilter('ignore')
%matplotlib inline


In [2]:
# Raw case records; the get_time_series_* helpers read this frame as a module-level global.
df = pd.read_csv(filepath_or_buffer="COVID19BE_CASES_AGESEX.csv")

In [3]:
def get_time_series_province(province):
    """Return the daily case totals for a single province.

    Reads the module-level ``df`` without modifying it: rows containing any
    missing value are discarded, the rows whose ``PROVINCE`` equals
    ``province`` are kept, ``DATE`` is parsed as ``%Y-%m-%d`` and the numeric
    columns are summed per day.

    Parameters
    ----------
    province : str
        Value compared for equality against the ``PROVINCE`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``DATE`` (datetime), holding the per-day sum of every
        numeric column that remains once ``REGION``, ``AGEGROUP`` and ``SEX``
        are dropped. Empty when no row matches ``province``.

    Raises
    ------
    KeyError
        If ``df`` lacks one of the columns used here.
    ValueError
        If a ``DATE`` value does not match ``%Y-%m-%d``.
    """
    complete_rows = df.dropna(axis=0)
    in_province = complete_rows[complete_rows["PROVINCE"] == province]
    # Build a new frame instead of assigning into the filtered slice, so there
    # is no chained assignment and no need to silence pandas' warning globally.
    daily_rows = (
        in_province
        .assign(DATE=pd.to_datetime(in_province["DATE"], format="%Y-%m-%d"))
        .drop(columns=["REGION", "AGEGROUP", "SEX"])
    )
    # numeric_only=True keeps text columns such as PROVINCE out of the result
    # regardless of the pandas version's default for groupby sums.
    return daily_rows.groupby("DATE").sum(numeric_only=True)

def get_time_series_total():
    """Return the daily case totals across all provinces.

    Reads the module-level ``df`` without modifying it: rows containing any
    missing value are discarded, ``DATE`` is parsed as ``%Y-%m-%d`` and the
    numeric columns are summed per day.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``DATE`` (datetime), holding the per-day sum of every
        numeric column that remains once ``REGION``, ``AGEGROUP`` and ``SEX``
        are dropped.

    Raises
    ------
    KeyError
        If ``df`` lacks one of the columns used here.
    ValueError
        If a ``DATE`` value does not match ``%Y-%m-%d``.
    """
    # NOTE(review): dropna runs before REGION/AGEGROUP/SEX are removed, so a
    # row with a gap in any of those columns is excluded from the totals.
    # Kept as-is to preserve the existing numbers - confirm this is intended.
    complete_rows = df.dropna(axis=0)
    # Build a new frame instead of assigning into the dropna() result, so no
    # pandas warning has to be silenced globally.
    daily_rows = (
        complete_rows
        .assign(DATE=pd.to_datetime(complete_rows["DATE"], format="%Y-%m-%d"))
        .drop(columns=["REGION", "AGEGROUP", "SEX"])
    )
    # numeric_only=True keeps text columns such as PROVINCE out of the result
    # regardless of the pandas version's default for groupby sums.
    return daily_rows.groupby("DATE").sum(numeric_only=True)


In [4]:
# Country-wide daily series (one row per date) built from the module-level `df`.
result = get_time_series_total()


In [5]:
# split into train and test sets
train_size = int(len(result) * 0.95)
test_size = len(result) - train_size
train, test = result[0:train_size], result[train_size:len(result)]
print(len(train), len(test))

291 16


In [6]:
def create_features(df,label=None):
    """Derive calendar features from the datetime index of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds datetimes. It is copied, never modified.
    label : optional
        Accepted for interface compatibility; not used.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df`` with the columns ``hour``, ``dayofweek``,
        ``quarter``, ``month``, ``year``, ``dayofyear``, ``dayofmonth`` and
        ``weekofyear`` (ISO week number), in that order.

    Raises
    ------
    AttributeError
        If the index of ``df`` is not datetime-like (``.dt`` is unavailable).
    """
    frame = df.copy()
    frame['Date'] = frame.index
    stamps = frame['Date'].dt

    frame['hour'] = stamps.hour
    frame['dayofweek'] = stamps.dayofweek
    frame['quarter'] = stamps.quarter
    frame['month'] = stamps.month
    frame['year'] = stamps.year
    frame['dayofyear'] = stamps.dayofyear
    frame['dayofmonth'] = stamps.day

    # `.dt.weekofyear` is deprecated and no longer exists in recent pandas;
    # `.dt.isocalendar().week` is the replacement. It comes back as a nullable
    # unsigned integer, so cast it to the dtype of the other calendar fields to
    # keep the feature columns homogeneous.
    if hasattr(stamps, 'isocalendar'):
        iso_week = stamps.isocalendar().week
        frame['weekofyear'] = iso_week.astype(frame['dayofyear'].dtype)
    else:
        # Older pandas without isocalendar(): the original accessor is all there is.
        frame['weekofyear'] = stamps.weekofyear

    return frame[['hour','dayofweek','quarter','month','year',
                  'dayofyear','dayofmonth','weekofyear']]

In [7]:
# Calendar features for each split, joined column-wise onto that split's own columns.
train_features = pd.DataFrame(create_features(train))
test_features = pd.DataFrame(create_features(test))
features_and_target_train = pd.concat((train, train_features), axis="columns")
features_and_target_test = pd.concat((test, test_features), axis="columns")
# Module-level encoder instance that FunLabelEncoder fits and applies.
le = LabelEncoder()
def FunLabelEncoder(df):
    """Label-encode every object-dtype column of ``df`` in place.

    Each object column is converted to ``str`` and replaced by the codes
    produced by the module-level encoder ``le`` (``fit`` then ``transform``).
    Columns of any other dtype are left alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Notes
    -----
    ``le`` is refitted for every encoded column, so after the call it only
    holds the classes of the last one.
    """
    # Iterate over the columns of the frame being encoded. Looping over the
    # global `result` frame instead would skip text columns that `result` does
    # not have and raise KeyError for columns missing from `df`.
    for c in list(df.columns):
        if df.dtypes[c] == object:
            as_text = df[c].astype(str)
            le.fit(as_text)
            df[c] = le.transform(as_text)
    return df
# Run the encoder over the training frame (FunLabelEncoder modifies and returns
# the same object), then print the frame.
features_and_target_train= FunLabelEncoder(features_and_target_train)
print(features_and_target_train)

            CASES  hour  dayofweek  quarter  month  year  dayofyear  \
DATE                                                                  
2020-03-01     19     0          6        1      3  2020         61   
2020-03-02     19     0          0        1      3  2020         62   
2020-03-03     34     0          1        1      3  2020         63   
2020-03-04     46     0          2        1      3  2020         64   
2020-03-05     81     0          3        1      3  2020         65   
...           ...   ...        ...      ...    ...   ...        ...   
2020-12-12   1575     0          5        4     12  2020        347   
2020-12-13    731     0          6        4     12  2020        348   
2020-12-14   3793     0          0        4     12  2020        349   
2020-12-15   3199     0          1        4     12  2020        350   
2020-12-16   2809     0          2        4     12  2020        351   

            dayofmonth  weekofyear  
DATE                                
20

In [8]:
# Model inputs: the same four calendar columns from each split; y1 keeps CASES
# as a one-column frame taken from the training split.
x_train = features_and_target_train.loc[:, ['month', 'dayofyear', 'dayofmonth', 'weekofyear']]
y1 = features_and_target_train.loc[:, ['CASES']]
x_test = features_and_target_test.loc[:, ['month', 'dayofyear', 'dayofmonth', 'weekofyear']]

In [None]:
#Mean absolute percentage error
def mape(y1, y_pred): 
    """Mean absolute percentage error of ``y_pred`` against ``y1``, in percent.

    Both inputs are flattened to one dimension before they are compared, so a
    one-column frame of shape ``(n, 1)`` and a flat forecast of shape ``(n,)``
    are paired value by value instead of being broadcast into an ``(n, n)``
    matrix (which silently produced a wrong score).

    Parameters
    ----------
    y1 : array-like
        Actual values; used as the denominator.
    y_pred : array-like
        Predicted values. Must hold as many values as ``y1``, or exactly one.

    Returns
    -------
    numpy.float64
        ``mean(|y1 - y_pred| / |y1|) * 100``. The result is ``inf`` or ``nan``
        when ``y1`` contains zeros, because the division is not guarded.

    Raises
    ------
    ValueError
        If the two inputs hold a different number of values and neither is a
        single value.
    """
    actual = np.asarray(y1, dtype=float).ravel()
    forecast = np.asarray(y_pred, dtype=float).ravel()
    if actual.size != forecast.size and 1 not in (actual.size, forecast.size):
        raise ValueError(
            "mape: y1 and y_pred must hold the same number of values "
            "(got %d and %d)" % (actual.size, forecast.size)
        )
    return np.mean(np.abs((actual - forecast) / actual)) * 100

#Arima modeling for ts
def arima(ts,test):
    """Pick an ARIMA order for ``ts`` by AIC, then forecast ``len(test)`` steps.

    Every ``(p, d, q)`` with each term in ``0..5`` (216 combinations) is
    fitted. Combinations whose fit raises are skipped, and the fit with the
    lowest AIC wins (ties go to the combination tried last). The winning fit
    is plotted with ``plot_predict``, its forecast is plotted together with
    ``test``, and its summary and the MAPE against ``test`` are printed.

    The code uses the ``ARIMA`` name imported at the top of the file and
    treats element ``[0]`` of ``forecast()`` as the forecast values; it also
    expects ``plot_predict`` to be a method of the fit result.

    Parameters
    ----------
    ts : pandas.DataFrame or pandas.Series
        Training series handed to ``ARIMA``.
    test : pandas.DataFrame, pandas.Series or array-like
        Held-out observations. Its length sets the forecast horizon, its
        values are compared with the forecast, and for pandas objects its
        index positions the forecast on the plot.

    Returns
    -------
    The forecast values for the ``len(test)`` steps following ``ts``.

    Raises
    ------
    ValueError
        If none of the candidate orders could be fitted.
    """
    # NOTE(review): AIC is compared across different differencing orders d;
    # those fits are made on differently-differenced series - confirm that
    # ranking them against each other is intended.
    best_aic = float("inf")
    best_fit = None

    #Determining the best parameters
    for order in itertools.product(range(0, 6), repeat=3):
        try:
            candidate = ARIMA(ts, order=order).fit()
            candidate_aic = candidate.aic
        except Exception:
            # Many orders are not estimable for a given series; skip those.
            # Unlike a bare `except`, this still lets Ctrl-C stop the search.
            continue
        if candidate_aic <= best_aic:
            best_aic = candidate_aic
            best_fit = candidate

    if best_fit is None:
        raise ValueError(
            "arima: none of the candidate (p, d, q) orders could be fitted to ts"
        )

    #Modeling
    # The winning fit is reused directly; fitting the same order again would
    # only repeat the work.
    best_fit.plot_predict(start=int(len(ts) * 0.7), end=int(len(ts) * 1.2))
    pred = best_fit.forecast(steps=len(test))[0]

    #Plotting results
    f, ax = plt.subplots()
    # Put the forecast on the same x positions as the observations; plotting
    # it without x values would place it at 0..n-1, far from a date axis.
    if isinstance(test, (pd.Series, pd.DataFrame)):
        x_positions = test.index
    else:
        x_positions = np.arange(len(pred))
    ax.plot(x_positions, pred, c='green', label='predictions')
    ax.plot(test, c='red', label='real values')
    ax.legend()
    ax.set_title('True vs predicted values')

    #Printing the error metrics
    print(best_fit.summary())
    # Flatten the observations so a one-column frame lines up value by value
    # with the flat forecast instead of broadcasting to an (n, n) matrix.
    print('\nMean absolute percentage error: %f'%mape(np.ravel(test),pred))
    return (pred)


# Fit on the training rows and forecast the held-out rows; arima() tries up to
# 216 (p, d, q) orders, so expect this cell to take a while.
pred=arima(train,test)

In [None]:
# Show the forecast values returned by arima() for the test window.
print(pred)