In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
# Global plot styling shared by every figure in this notebook.
sns.set_style('whitegrid')
# Applied after the seaborn style; presumably its settings win wherever the two overlap — confirm.
plt.style.use("fivethirtyeight")
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

# For reading stock data from yahoo
from pandas_datareader.data import DataReader

# For time stamps
from datetime import datetime

# Destination file for the trained model (passed to model.save in the last cell).
model_save_path = './PredictModel/vietnam_covid_predict.hdf5'
# Input CSV files. Every per-dataset list built later in the notebook
# (df, df_filtered, new_df, x_train, ...) is indexed in this same order.
DATA_PATH = [
    './Data/1_Hà Nội_covid_data.csv', 
    './Data/2_TP HCM_covid_data.csv',
    './Data/3_Hải Phòng_covid_data.csv',
    './Data/4_Đà Nẵng_covid_data.csv',
    './Data/55_Cần Thơ_covid_data.csv',
]

# Data visualization

In [None]:
# Number of trailing rows discarded from every CSV before any other processing.
# NOTE(review): why exactly 26 rows are dropped is not visible in this notebook — confirm.
TAIL_ROWS_TO_DROP = 26
# Timestamp layout of the first CSV column.
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

# Load every CSV listed in DATA_PATH into `df`, one cleaned frame per file, in the same order.
df = []
for csv_path in DATA_PATH:
    frame = pd.read_csv(csv_path)
    # Drop the last rows as they appear in the file (i.e. before sorting). Taking a
    # copy of the slice avoids mutating the freshly read frame in place.
    frame = frame.iloc[:-TAIL_ROWS_TO_DROP].copy()
    # Parse the first column in one vectorized call. This yields a real datetime64
    # column (instead of an object column of datetime values), which sorts, filters
    # and plots natively.
    first_col = frame.columns[0]
    frame[first_col] = pd.to_datetime(frame[first_col], format=DATE_FORMAT)
    # Order chronologically and remove the unnamed extra column found in the CSVs.
    frame = frame.sort_values(by=['date']).drop(columns=['Unnamed: 9'])
    df.append(frame)

In [None]:
# Preview the last 20 rows of every loaded dataset.
# Iterating over DATA_PATH keeps the previews in sync with the configured files
# instead of hard-coding one cell per index. `display` is the rich-output helper
# that Jupyter makes available in every cell.
for csv_path, frame in zip(DATA_PATH, df):
    print(csv_path)
    display(frame.tail(20))

In [None]:
# Summary statistics for every loaded dataset.
# One loop over DATA_PATH replaces the per-index cells, so adding or removing a
# file in DATA_PATH needs no further edits here.
for csv_path, frame in zip(DATA_PATH, df):
    print(csv_path)
    display(frame.describe())

In [None]:
# Keep only the rows dated on or after the cutoff below, one filtered frame per dataset.
cutoff = datetime.strptime('1/1/2021 00:00', '%m/%d/%Y %H:%M')
df_filtered = [
    df[i][df[i]['date'] >= cutoff]
    for i in range(len(DATA_PATH))
]

In [None]:
# Line chart of daily cases for every filtered dataset.
# A single loop replaces the per-index cells; the file path is used as the title
# so the figures can be told apart.
for csv_path, frame in zip(DATA_PATH, df_filtered):
    frame.plot(x='date', y='case_by_day', kind='line', title=csv_path)
plt.show()

# Predicting cases by day

In [None]:
# Re-index every filtered frame by its 'date' column.
new_df = [df_filtered[i].set_index('date') for i in range(len(DATA_PATH))]

In [None]:
# Daily case counts over time, one figure per dataset.
# The series is indexed by date, so time belongs on the x axis and the case count
# on the y axis (the two labels were previously swapped).
for csv_path, frame in zip(DATA_PATH, new_df):
    plt.figure(figsize=(16,6))
    plt.title(f'Case By Day - {csv_path}')
    plt.plot(frame['case_by_day'])
    plt.xlabel('Time', fontsize=18)
    plt.ylabel('Case', fontsize=18)
    plt.show()

In [None]:
# Single-column frames holding only 'case_by_day', one per dataset.
data = [new_df[i].filter(['case_by_day']) for i in range(len(DATA_PATH))]
# The same values as NumPy arrays.
dataset = [frame.values for frame in data]
# Split point per dataset: everything except the final 30 rows is used for training.
training_data_len = [len(values) - 30 for values in dataset]
for split_at in training_data_len:
    print(split_at)

In [None]:
# Scale the data
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# One scaler object is re-fitted on each dataset in turn, so every series is scaled
# with its own fit. After this cell `scaler` only holds the parameters fitted on the
# last dataset in DATA_PATH.
scaler = MinMaxScaler(feature_range=(0,1))
scaled_data = [scaler.fit_transform(dataset[i]) for i in range(len(DATA_PATH))]

scaled_data

In [None]:
# Number of consecutive past observations fed to the model for each prediction.
DATA_LEN = 60


def _sliding_windows(series, width):
    """Return one `width`-long slice of column 0 of `series` for every row index >= `width`."""
    return [series[stop - width: stop, 0] for stop in range(width, len(series))]


train_data, test_data = [], []
x_train, y_train = [], []
x_test, y_test = [], []
for city in range(len(DATA_PATH)):
    split_at = training_data_len[city]
    scaled = scaled_data[city]

    # Training portion: every scaled row before the split point.
    fit_rows = scaled[0:int(split_at), :]
    # Each input is DATA_LEN consecutive scaled values; its target is the value right after.
    fit_inputs = np.array(_sliding_windows(fit_rows, DATA_LEN))
    fit_targets = np.array(fit_rows[DATA_LEN:, 0])
    # Add a trailing feature axis: (samples, DATA_LEN) -> (samples, DATA_LEN, 1).
    fit_inputs = fit_inputs.reshape(fit_inputs.shape[0], fit_inputs.shape[1], 1)

    # Test portion: starts DATA_LEN rows before the split so the first held-out row
    # already has a full input window.
    eval_rows = scaled[split_at - DATA_LEN:, :]
    eval_inputs = np.array(_sliding_windows(eval_rows, DATA_LEN))
    # Test targets come from the raw (unscaled) dataset, unlike the training targets.
    eval_targets = np.array(dataset[city][split_at:, :])
    eval_inputs = eval_inputs.reshape(eval_inputs.shape[0], eval_inputs.shape[1], 1)

    train_data.append(fit_rows)
    test_data.append(eval_rows)
    x_train.append(fit_inputs)
    y_train.append(fit_targets)
    x_test.append(eval_inputs)
    y_test.append(eval_targets)

x_train, y_train, x_test, y_test

In [None]:
from functools import reduce

# Pool the per-dataset samples into single arrays so one model is trained on all of them.
# np.concatenate stacks the samples along axis 0. Using `+` on NumPy arrays would
# add them element-wise (or fail on mismatched shapes) instead of joining them.
X_train = np.concatenate(x_train, axis=0)
Y_train = np.concatenate(y_train, axis=0)
X_test = np.concatenate(x_test, axis=0)
# Validation targets must live in the same scaled space as Y_train. y_test holds the
# raw, unscaled values (used for the RMSE on inverse-transformed predictions), so the
# pooled validation targets are taken from scaled_data instead.
Y_test = np.concatenate(
    [scaled_data[i][training_data_len[i]:, :] for i in range(len(DATA_PATH))],
    axis=0,
)

In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.utils.vis_utils import plot_model
#from tensorflow.keras.utils import plot_model

# Two stacked LSTM layers followed by two dense layers; the network maps a window of
# DATA_LEN single-feature steps to one output value.
model = Sequential([
    LSTM(30, return_sequences=True, input_shape=(DATA_LEN, 1)),
    LSTM(20, return_sequences=False),
    Dense(10),
    Dense(1),
])

# Adam optimizer, mean squared error loss.
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
# Print the layer-by-layer summary of the model.
# Graphical alternative: tf.keras.utils.plot_model(model, show_shapes=True)
model.summary()

In [None]:
# Fit the network on the pooled training samples; the pooled test samples are passed
# as validation data so their loss is reported after every epoch.
model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_test, Y_test),
    epochs=50,
    batch_size=8,
)

In [None]:
# Predict the held-out rows of every dataset and report the RMSE in original units.
predictions = []
for i in range(len(DATA_PATH)):
    # The shared `scaler` was re-fitted once per dataset and therefore only remembers
    # the last one. Fit a scaler on this dataset so the inverse transform uses the
    # same parameters that produced scaled_data[i].
    series_scaler = MinMaxScaler(feature_range=(0, 1)).fit(dataset[i])
    predicted = series_scaler.inverse_transform(model.predict(x_test[i]))
    predictions.append(predicted)

    # Root mean squared error against the raw (unscaled) held-out values.
    rmse = np.sqrt(np.mean((predicted - y_test[i]) ** 2))
    print(rmse)

In [None]:
# Loss of the trained model on the pooled test samples (same batch size as used for fitting).
val_loss = model.evaluate(X_test, Y_test, batch_size=8)

In [None]:
# Split every single-column frame at its training length and attach the model's
# predictions to the held-out part.
train = []
valid = []
for i in range(len(DATA_PATH)):
    split_at = training_data_len[i]
    train.append(data[i][:split_at])
    # Work on an explicit copy so the new column (and any later edits) are never
    # written into a slice of data[i].
    held_out = data[i][split_at:].copy()
    # Predictions come back with a trailing axis of length 1; flatten to one value per row.
    held_out.insert(1, 'case_by_day_predict', np.ravel(predictions[i]))
    valid.append(held_out)

In [None]:
# Case counts cannot be negative: floor both columns of every validation frame
# ('case_by_day' and 'case_by_day_predict') at zero. A vectorized clip replaces the
# cell-by-cell loop and returns new frames rather than writing into existing ones.
valid = [frame.clip(lower=0) for frame in valid]

In [None]:
# For every dataset: plot the training series together with the held-out actual and
# predicted values, then show the held-out table underneath.
# One loop over DATA_PATH replaces the per-index cells; the file path in the title
# identifies which dataset a figure belongs to. `display` is the rich-output helper
# that Jupyter makes available in every cell.
for csv_path, fit_part, held_out in zip(DATA_PATH, train, valid):
    plt.figure(figsize=(16,6))
    plt.title(f'Model - {csv_path}')
    plt.xlabel('Date', fontsize=18)
    plt.ylabel('Case by day', fontsize=18)
    plt.plot(fit_part['case_by_day'])
    plt.plot(held_out[['case_by_day', 'case_by_day_predict']])
    plt.legend(['Train', 'Val', 'Predictions'], loc='lower right')
    plt.show()

    # Actual vs. predicted daily cases for the held-out rows.
    display(held_out)

In [None]:
import os

# Make sure the output folder exists so saving does not depend on it having been
# created by hand beforehand.
os.makedirs(os.path.dirname(model_save_path) or '.', exist_ok=True)
# Persist the model without the optimizer state.
model.save(model_save_path, include_optimizer=False)