In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.tools.eval_measures import rmse
from sklearn.preprocessing import MinMaxScaler
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns 



Using TensorFlow backend.


In [10]:
# SARS data: read the raw records and collapse them to one row of totals per date
sars = pd.read_csv('../data/sars_chn_agg.csv')
sars['Date'] = pd.to_datetime(sars['Date'])

# sum every remaining column within each date, then give the three totals fixed names
sarsov = sars.groupby('Date').sum()
sarsov.index = pd.to_datetime(sarsov.index)
sarsov.columns = ['Confirmed', 'Deaths', 'Recovered']

# `s` is kept as a second name for the same frame
s = sarsov
# sarsov

In [9]:
# COVID-19 data: load the three wide-format time-series CSVs (confirmed, deaths,
# recovered) and reduce each one to a single total per timeline entry.
# NOTE(review): the first four columns of every CSV are skipped as metadata; only
# 'Country/Region' is referenced by name here - confirm the other three against the files.

# --- confirmed cases: raw frame, then transposed frame without the four metadata rows
cv_c = pd.read_csv('../data/time_series_19-covid-Confirmed.csv')
cov_c = cv_c.T[4:]
# label each transposed column with the Country/Region value of its source row
cov_c.columns=list(cv_c['Country/Region'])
# --- deaths: same reshaping
cv_d = pd.read_csv('../data/time_series_19-covid-Deaths.csv')
cov_d = cv_d.T[4:]
cov_d.columns=list(cv_d['Country/Region'])
# --- recovered: same reshaping
cv_reco = pd.read_csv('../data/time_series_19-covid-Recovered.csv')
cov_reco = cv_reco.T[4:]
cov_reco.columns=list(cv_reco['Country/Region'])

# --- recovered totals
# case_r is an alias of cov_reco, so the assignment below also adds the column to cov_reco
case_r=cov_reco
# column-wise sum over all rows of the raw frame, one value per non-metadata column;
# the column is called 'Confirmed' here as well and only gets its final name after the concat
case_r['Confirmed'] = list(cv_reco[cv_reco.columns[4:]].sum().values)
# keep only the total, and name the index 'Timeline'
case_r = case_r[['Confirmed']].reset_index().rename(columns={'index':'Timeline'}).set_index('Timeline')
# parse the former CSV column headers as datetimes
case_r.index=pd.to_datetime(case_r.index)

# --- death totals (same steps; cov_d gains a 'Confirmed' column through the alias)
case_d=cov_d
case_d['Confirmed'] = list(cv_d[cv_d.columns[4:]].sum().values)
case_d = case_d[['Confirmed']].reset_index().rename(columns={'index':'Timeline'}).set_index('Timeline')
case_d.index=pd.to_datetime(case_d.index)

# --- confirmed totals (same steps; cov_c gains a 'Confirmed' column through the alias)
case_c=cov_c
case_c['Confirmed'] = list(cv_c[cv_c.columns[4:]].sum().values)
case_c = case_c[['Confirmed']].reset_index().rename(columns={'index':'Timeline'}).set_index('Timeline')
case_c.index=pd.to_datetime(case_c.index)

# combine the three totals side by side on the shared index and give them their real names
covid19ov = pd.concat([case_c,case_d,case_r],axis=1)
covid19ov.columns=['Confirmed','Deaths','Recovered']
# covid19ov

In [12]:
# define LSTM model in function to make predictions
def LSTM_train(df, timesteps, n_features):
    """Train a univariate LSTM on ``df`` and back-test it on the last ``timesteps`` rows.

    The last ``timesteps`` rows are held out. A MinMaxScaler is fitted on the
    remaining rows only, the network is trained on sliding windows of the
    scaled training data, and the held-out window is then forecast
    recursively: every prediction is appended to the input window (dropping
    its oldest entry) and used to predict the next step.

    Parameters
    ----------
    df : pandas.DataFrame
        Single-column frame holding the series to learn. Its index is reused
        for the predictions and as the x-axis of the plot; the original notes
        expect it to be a datetime index.
    timesteps : int
        Length of the input window, and also the number of held-out rows that
        are forecast. Must be at least 1.
    n_features : int
        Number of values per time step. Only 1 is supported, because the
        network has a single output unit whose value is fed back into the
        input window.

    Returns
    -------
    tuple
        ``(model, df_test, rmse)``: the fitted Keras model, ``df`` with an added
        ``'Prediction'`` column (NaN outside the held-out window), and the RMSE
        between the first column of ``df`` and the prediction over the held-out
        window.

    Raises
    ------
    ValueError
        If ``timesteps`` is smaller than 1, ``n_features`` is not 1, or ``df``
        has too few rows to leave at least one training window after the
        hold-out split.
    """
    # fail fast instead of after a full training run
    if timesteps < 1:
        raise ValueError('timesteps must be a positive integer, got %r' % (timesteps,))
    if n_features != 1:
        raise ValueError('only n_features=1 is supported, got %r' % (n_features,))
    if len(df) <= 2 * timesteps:
        raise ValueError(
            'df needs more than 2 * timesteps = %d rows, got %d' % (2 * timesteps, len(df)))

    # hold out the last `timesteps` rows; the scaler only ever sees the training rows
    train = df[:-timesteps]
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(train)

    # each sample is a window of `timesteps` values, the target is the value right after it
    generator = TimeseriesGenerator(data=train_scaled, targets=train_scaled,
                                    length=timesteps, batch_size=6)

    model = Sequential()
    model.add(LSTM(200, activation='relu', input_shape=(timesteps, n_features)))
    model.add(Dropout(0.1))
    model.add(Dense(1))
    # 'acc' carries no meaning for a regression target; it is kept so the training log is unchanged
    model.compile(optimizer='adam', loss='mse', metrics=['mae', 'acc'])
    model.fit_generator(generator, epochs=180)

    # recursive forecast, seeded with the last training window
    window = train_scaled[-timesteps:].reshape((1, timesteps, n_features))
    pred_list = []
    for _ in range(timesteps):
        next_value = model.predict(window)[0]
        pred_list.append(next_value)
        # slide the window: drop the oldest step, append the newest prediction
        window = np.append(window[:, 1:, :], [[next_value]], axis=1)

    df_predict = pd.DataFrame(scaler.inverse_transform(pred_list),
                              index=df[-timesteps:].index, columns=['Prediction'])
    df_test = pd.concat([df, df_predict], axis=1)

    # The frame is plotted in wide form (index on x, one line per column).
    # hue/style/size must not be passed together with wide-form data, and the
    # (20, 5) originally given as `size` is the figure size.
    sns.set(style='white')
    palette = sns.color_palette('hot_r', df_test.shape[1])
    fig, ax = plt.subplots(figsize=(20, 5))
    sns.lineplot(data=df_test, palette=palette, ax=ax)
    ax.set_title('LSTM Prediction vs. Actual Data')
    ax.legend(loc='best', fontsize='medium')
    ax.tick_params(axis='x', labelsize=10, labelrotation=45)
    ax.tick_params(axis='y', labelsize=10)

    # compare 1-D series so the RMSE is a plain scalar
    actual = df_test.iloc[-timesteps:, 0].astype(float)
    predicted = df_test['Prediction'].iloc[-timesteps:].astype(float)
    pred_actual_rmse = float(rmse(actual, predicted))
    print('RMSE of prediction: %f.' % pred_actual_rmse)
    return model, df_test, pred_actual_rmse

# define function to calculate metrics
def avg_performance(func,run_time,val_ind):
    """Average one element of a function's result over repeated runs.

    Parameters
    ----------
    func : callable or sequence
        A zero-argument callable that is invoked once per run and returns an
        indexable result (for example ``lambda: LSTM_train(df, 12, 1)``).
        A non-callable value is treated as an already computed result and is
        reused for every run.
    run_time : int
        Number of runs to average over. Must be at least 1.
    val_ind : int
        Position of the value to average inside each result.

    Returns
    -------
    numpy.float64
        Mean of ``result[val_ind]`` over all runs.

    Raises
    ------
    ValueError
        If ``run_time`` is smaller than 1.
    """
    if run_time < 1:
        raise ValueError('run_time must be a positive integer, got %r' % (run_time,))
    # build the list by appending; indexing into an empty list would raise IndexError
    results = [func() if callable(func) else func for _ in range(run_time)]
    return np.mean([result[val_ind] for result in results])


# define function to project future values
def LSTM_project(df,timesteps,n_features,future_days):
    """Train a univariate LSTM on all of ``df`` and project ``future_days`` days ahead.

    The whole frame is used for training (after MinMax scaling). The forecast
    is produced recursively: starting from the last ``timesteps`` observed
    values, every prediction is appended to the input window (dropping its
    oldest entry) and used to predict the following day.

    Parameters
    ----------
    df : pandas.DataFrame
        Single-column frame holding the series to learn. The last index value
        must support adding a day offset (e.g. a pandas Timestamp), because
        the projected rows are dated from it.
    timesteps : int
        Length of the input window. Must be at least 1 and smaller than
        ``len(df)``.
    n_features : int
        Number of values per time step. Only 1 is supported, because the
        network has a single output unit whose value is fed back into the
        input window.
    future_days : int
        Number of days to project. The projected rows are dated
        ``df.index[-1] + 1 day`` through ``df.index[-1] + future_days days``.

    Returns
    -------
    tuple
        ``(model, df_proj)``: the fitted Keras model and ``df`` extended by
        ``future_days`` rows, with the forecast in a ``'Prediction'`` column
        (NaN on the observed rows; the original columns are NaN on the
        projected rows).

    Raises
    ------
    ValueError
        If ``timesteps`` or ``future_days`` is smaller than 1, ``n_features``
        is not 1, or ``df`` does not have more than ``timesteps`` rows.
    """
    # fail fast instead of after a full training run
    if timesteps < 1:
        raise ValueError('timesteps must be a positive integer, got %r' % (timesteps,))
    if future_days < 1:
        raise ValueError('future_days must be a positive integer, got %r' % (future_days,))
    if n_features != 1:
        raise ValueError('only n_features=1 is supported, got %r' % (n_features,))
    if len(df) <= timesteps:
        raise ValueError(
            'df needs more than timesteps = %d rows, got %d' % (timesteps, len(df)))

    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(df)

    # each sample is a window of `timesteps` values, the target is the value right after it
    generator = TimeseriesGenerator(data=train_scaled, targets=train_scaled,
                                    length=timesteps, batch_size=6)

    model = Sequential()
    model.add(LSTM(200, activation='relu', input_shape=(timesteps, n_features)))
    model.add(Dropout(0.1))
    model.add(Dense(1))
    # 'acc' carries no meaning for a regression target; it is kept so the training log is unchanged
    model.compile(optimizer='adam', loss='mse', metrics=['mae', 'acc'])
    model.fit_generator(generator, epochs=180)

    # recursive forecast: one prediction per requested future day
    window = train_scaled[-timesteps:].reshape((1, timesteps, n_features))
    pred_list = []
    for _ in range(future_days):
        next_value = model.predict(window)[0]
        pred_list.append(next_value)
        # slide the window: drop the oldest step, append the newest prediction
        window = np.append(window[:, 1:, :], [[next_value]], axis=1)

    # the k-th prediction belongs to the k-th day after the last observation
    last_date = df.index[-1]
    future_index = pd.DatetimeIndex(
        [last_date + pd.DateOffset(days=step) for step in range(1, future_days + 1)])

    df_predict = pd.DataFrame(scaler.inverse_transform(pred_list),
                              index=future_index, columns=['Prediction'])
    df_proj = pd.concat([df, df_predict], axis=1)

    # The frame is plotted in wide form (index on x, one line per column).
    # hue/style/size must not be passed together with wide-form data, and the
    # (20, 5) originally given as `size` is the figure size.
    sns.set(style='white')
    palette = sns.color_palette('hot_r', df_proj.shape[1])
    fig, ax = plt.subplots(figsize=(20, 5))
    sns.lineplot(data=df_proj, palette=palette, ax=ax)
    ax.set_title('LSTM Projection vs. Actual Data')
    ax.legend(loc='best', fontsize='medium')
    ax.tick_params(axis='x', labelsize=10, labelrotation=45)
    ax.tick_params(axis='y', labelsize=10)
    return model, df_proj
