In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the global confirmed-cases CSV; later cells filter it by the "Country/Region" column.
confirmed_csv = '../input/time_series_covid19_confirmed_global.csv'
data = pd.read_csv(confirmed_csv)

## US

In [None]:
# Keep only the US rows, then collapse them into one national series.
is_us = data["Country/Region"] == "US"
us_data_raw = data.loc[is_us]
# Columns from position 4 onward are summed per column and their labels parsed as dates,
# so they are presumably the per-date counts (first four look like metadata - confirm against the CSV).
us_data = us_data_raw.iloc[:, 4:].sum().to_frame(name="confirmed")
us_data.index = pd.to_datetime(us_data.index)
us_data.head()

In [None]:
# Per-date sum of the US rows; the next cell differences this series to obtain daily counts.
us_data.plot(title="US Confirmed cases")

In [None]:
# First differences give the day-over-day change; the first row has nothing to
# difference against (NaN), so it is dropped.
us_data_daily = us_data.diff().iloc[1:]
us_data_daily.plot(title="US Daily Confirmed cases")

In [None]:
# Split position: everything except the final 14 rows is used for training.
x = us_data_daily.shape[0] - 14
x

In [None]:
# Chronological split: the first x rows train the model, the remaining rows are held out.
train, test = us_data_daily.iloc[:x], us_data_daily.iloc[x:]
train

In [None]:
# Held-out tail of the daily series (rows from position x onward).
test

In [None]:
## min-max scale the series so the network trains on values in a small, fixed range
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [None]:
scaler.fit(train) # learn the min and max from the training split only (test is not used for fitting)

In [None]:
# Apply the scaling fitted on train to both splits. MinMaxScaler maps each value to
# (value - train_min) / (train_max - train_min), so test values may fall outside [0, 1].
scaled_train, scaled_test = (scaler.transform(split) for split in (train, test))
print(scaled_train[-5:])

In [None]:
## feed in batches [t1,t2,t3] --> t4
from keras.preprocessing.sequence import TimeseriesGenerator

In [None]:
# Shape of the scaled training array returned by scaler.transform(train).
scaled_train.shape

In [None]:
## window length and feature count for the training generator and the model
n_input = 14  ## number of past steps fed to the model per sample
n_features = 1  ## univariate series: one value per time step
## each sample is n_input consecutive scaled values; the target is taken from the same array
generator = TimeseriesGenerator(
    data=scaled_train,
    targets=scaled_train,
    length=n_input,
    batch_size=1,
)

In [None]:
# Number of rows in the scaled training array.
len(scaled_train)

In [None]:
# Number of batches in the training generator (it was built with batch_size=1).
len(generator)

In [None]:
## the generator above takes n_input (14) consecutive points as input and predicts the next point in scaled_train
## smaller batch size leads to better training for time series

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Activation

In [None]:
# LSTM over the (n_input, n_features) window, dropout, then a small dense head
# ending in a single output unit; trained with Adam on mean squared error.
model = Sequential([
    LSTM(150, activation="relu", input_shape=(n_input, n_features)),
    Dropout(0.2),
    Dense(75, activation='relu'),
    Dense(units=1),
])
model.compile(loss="mse", optimizer="adam")

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Validation series = the last n_input scaled training points (the seed window for the
# first validation sample) followed by the scaled test points.
# The window is taken relative to the end of scaled_train and the reshape infers the row
# count, so this keeps working when the CSV covers a different number of days; the
# previous hard-coded 70:84 slice and reshape(28, 1) were only valid for 84 training rows.
validation_set = np.concatenate([scaled_train[-n_input:], scaled_test]).reshape(-1, 1)
validation_set

In [None]:
## validation windows reuse the n_input defined alongside the training generator;
## it is deliberately not re-assigned here, so the model input shape, both generators and
## the forecast seed window cannot drift apart if the window length is changed
validation_gen = TimeseriesGenerator(validation_set, validation_set, length=n_input, batch_size=1)

In [None]:
# Shapes of the first validation batch: (inputs, target).
first_inputs, first_target = validation_gen[0]
first_inputs.shape, first_target.shape

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
# Stop when val_loss has not improved for 20 epochs and restore the best weights seen.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
)

In [None]:
# Train for up to 100 epochs of 10 steps each, validating on validation_gen and
# letting early_stop end training early.
# NOTE(review): fit_generator is deprecated in newer Keras/TensorFlow releases in favour
# of model.fit - confirm against the installed version before upgrading.
fit_options = dict(
    validation_data=validation_gen,
    epochs=100,
    steps_per_epoch=10,
    callbacks=[early_stop],
)
model.fit_generator(generator, **fit_options)

In [None]:
# One column per metric recorded during training, one row per epoch.
loss_history = pd.DataFrame(model.history.history)
loss_history.plot(title="loss vs epochs curve")

In [None]:
### evaluation batch
## n_input (14) history steps ---> step n_input + 1
## the last n_input points of the training data predict point 1 of the test data

## forecast

In [None]:
## forecasts are collected here, one entry per predicted step
test_prediction = []

## seed window: the final n_input scaled training points,
## reshaped to (1, n_input, n_features) for model.predict
first_eval_batch = scaled_train[-n_input:]
current_batch = np.reshape(first_eval_batch, (1, n_input, n_features))

In [None]:
# Expected to be (1, n_input, n_features), per the reshape in the previous cell.
current_batch.shape

In [None]:
## how far in future we can predict
# Reset the rolling state inside this cell so that re-running it starts from the seed
# window again instead of appending a second set of forecasts to test_prediction and
# continuing from an already-advanced window (which would break the length-matched
# assignment into df_forecast later on).
test_prediction = []
current_batch = first_eval_batch.reshape(1, n_input, n_features)

# Recursive forecast over the test horizon plus 60 further steps: each prediction is
# fed back in as the newest point of the input window.
for _ in range(len(test) + 60):
    current_pred = model.predict(current_batch)[0]
    test_prediction.append(current_pred)
    # slide the window: drop the oldest step, append the new prediction
    current_batch = np.append(current_batch[:, 1:, :], [[current_pred]], axis=1)

In [None]:
### map the scaled forecasts back to the original units of the training data
scaled_forecasts = np.asarray(test_prediction)
true_prediction = scaler.inverse_transform(scaled_forecasts)
true_prediction[:, 0]

In [None]:
# Forecast index = the test dates followed by the next 60 calendar days.
# Built in one call instead of appending to the index one day at a time in a loop.
future_dates = pd.date_range(start=test.index[-1] + pd.DateOffset(1), periods=60, freq="D")
time_series_array = test.index.append(future_dates)
time_series_array

In [None]:
# Empty frame over the forecast dates; both columns are filled in below.
df_forecast = pd.DataFrame(
    index=time_series_array,
    columns=["confirmed", "confirmed_predicted"],
)
df_forecast

In [None]:
# Sanity check: the row count of the frame must equal the number of forecasts.
df_forecast.shape[0], true_prediction.shape[0]

In [None]:
# Assign whole columns rather than writing through .loc[:, col]: the frame was created
# empty, so its columns are object dtype, and on pandas >= 2.0 a .loc[:, col] write is
# done in place and keeps that object dtype, which later makes .plot() fail with
# "no numeric data to plot". Direct assignment replaces the column and yields floats.
df_forecast["confirmed_predicted"] = true_prediction[:, 0]
# Aligned on the date index: actuals exist only for the test dates, the rest become NaN.
df_forecast["confirmed"] = test["confirmed"]

In [None]:
# Rows whose date falls between 2020-04-10 and 2020-05-10.
df_forecast.loc["2020-04-10":"2020-05-10"]

In [None]:
# Full forecast frame: actuals (where available) next to the predicted values.
df_forecast

In [None]:
#plt.ylim([80000,85000])
# Plot the columns of df_forecast (actuals and predictions) against the forecast dates.
df_forecast.plot(title="Predictions for next 2 month")

In [None]:
# Mean absolute percentage error, scored on the first 5 forecast rows only.
actual_head = np.array(df_forecast["confirmed"][:5])
predicted_head = np.array(df_forecast["confirmed_predicted"][:5])
MAPE = np.mean(np.abs(actual_head - predicted_head) / actual_head)
print("MAPE is " + str(MAPE*100) + " %")

In [None]:
# Sum of squared errors over the same first 5 forecast rows.
residuals = np.array(df_forecast["confirmed"][:5]) - np.array(df_forecast["confirmed_predicted"][:5])
sum_errs = np.sum(residuals ** 2)
sum_errs

In [None]:
# Residual standard deviation over the 5 scored rows, dividing by 5 - 2.
n_scored, n_lost = 5, 2
stdev = np.sqrt(1 / (n_scored - n_lost) * sum_errs)
stdev

In [None]:
# half-width of the prediction band: 1.96 residual standard deviations
# (roughly a 95% band if the errors are normally distributed)
z_score = 1.96
interval = z_score * stdev
interval

In [None]:
# Lower and upper band: the prediction shifted down/up by the same constant interval.
predicted = df_forecast["confirmed_predicted"]
df_forecast["confirm_min"] = predicted.sub(interval)
df_forecast["confirm_max"] = predicted.add(interval)
df_forecast

In [None]:
# Store 1 - MAPE (rounded to 2 decimals) on every row as a simple accuracy figure.
model_accuracy = round(1 - MAPE, 2)
df_forecast["Model Accuracy"] = model_accuracy
df_forecast

In [None]:
from datetime import datetime
# Stamp every row with a fixed execution date string.
# NOTE(review): the date is hard-coded and the datetime import above is unused -
# presumably this was meant to record the actual run date; confirm.
execution_date = "2020-04-29"
df_forecast["Execution date"] = execution_date
df_forecast

In [None]:
# Prepend the training history so actuals and forecasts share one frame.
# `train` is used instead of a hard-coded "2020-04-16" cut-off so the history always ends
# exactly where the test window begins, whatever date range the CSV covers.
# Slicing df_forecast from the first test date makes the cell safe to re-run: history
# prepended by an earlier run is dropped before concatenating again.
forecast_only = df_forecast.loc[test.index[0]:]
df_forecast_n = pd.concat([train, forecast_only], axis=0, join='outer')
df_forecast = df_forecast_n

In [None]:
# Plot the first four columns - presumably confirmed, confirmed_predicted, confirm_min
# and confirm_max, given the order in which they were added; verify after the concat.
df_forecast.iloc[:,:4].plot()

In [None]:
# Final figure: actuals, forecast, and the shaded band between confirm_min and confirm_max.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Results")
dates = df_forecast.index
for column in ("confirmed", "confirmed_predicted"):
    ax.plot(dates, df_forecast[column], label=column)
ax.fill_between(
    dates,
    df_forecast["confirm_min"],
    df_forecast["confirm_max"],
    color="indigo",
    alpha=0.09,
    label="Confidence Interval",
)
ax.legend()
plt.show()