# Deep Learning Time Series COVID-19 Cases Prediction

### Import libraries and packages

In [145]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import pandas_profiling
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Flatten
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
import requests

## Data Sourcing

### Data API 

#### By country over time

In [10]:
def fetch_time_series(feature='stringency', start_date='2020-02-14', end_date='2021-02-14'):
    """
    Get stringency time series for each countries requesting API.
    Returns json dict with TS between start_date and end_date like 'YYYY-MM-DD'.
    """
    url = f'https://covidtrackerapi.bsg.ox.ac.uk/api/v2/{feature}/date-range/{start_date}/{end_date}'
    response = requests.get(url)
    if response.status_code != 200:
        return ''
    data = response.json()
    return data

In [3]:
countries_time_series_api = fetch_time_series()

In [4]:
[(k, [c for c in v if c == 'VNM'])  for k, v in countries_time_series_api.items()  if k == 'countries' ]

[('countries', ['VNM'])]

In [5]:
[([([([(vee)  for kaaa, veee  in vee.items() if kaaa in ['date_value', 'confirmed']  ])  for kaa, vee  in ve.items() if kaa =='USA'   ])  for ka, ve  in v.items() ])  for k, v in countries_time_series_api.items() if k=='data'   ]

[[[[{'date_value': '2020-02-14',
     'country_code': 'USA',
     'confirmed': 14,
     'deaths': 0,
     'stringency_actual': 5.56,
     'stringency': 5.56,
     'stringency_legacy': 7.14,
     'stringency_legacy_disp': 7.14},
    {'date_value': '2020-02-14',
     'country_code': 'USA',
     'confirmed': 14,
     'deaths': 0,
     'stringency_actual': 5.56,
     'stringency': 5.56,
     'stringency_legacy': 7.14,
     'stringency_legacy_disp': 7.14}]],
  [[{'date_value': '2020-02-15',
     'country_code': 'USA',
     'confirmed': 14,
     'deaths': 0,
     'stringency_actual': 5.56,
     'stringency': 5.56,
     'stringency_legacy': 7.14,
     'stringency_legacy_disp': 7.14},
    {'date_value': '2020-02-15',
     'country_code': 'USA',
     'confirmed': 14,
     'deaths': 0,
     'stringency_actual': 5.56,
     'stringency': 5.56,
     'stringency_legacy': 7.14,
     'stringency_legacy_disp': 7.14}]],
  [[{'date_value': '2020-02-16',
     'country_code': 'USA',
     'confirmed': 14,
 

In [6]:
[(k, [(ka, [(kaa, vee)  for kaa, vee  in ve.items() if kaa =='USA'   ])  for ka, ve  in v.items() ])  for k, v in data_api.items() if k=='data'   ]

[('data',
  [('2020-02-14',
    [('USA',
      {'date_value': '2020-02-14',
       'country_code': 'USA',
       'confirmed': 14,
       'deaths': 0,
       'stringency_actual': 5.56,
       'stringency': 5.56,
       'stringency_legacy': 7.14,
       'stringency_legacy_disp': 7.14})]),
   ('2020-02-15',
    [('USA',
      {'date_value': '2020-02-15',
       'country_code': 'USA',
       'confirmed': 14,
       'deaths': 0,
       'stringency_actual': 5.56,
       'stringency': 5.56,
       'stringency_legacy': 7.14,
       'stringency_legacy_disp': 7.14})]),
   ('2020-02-16',
    [('USA',
      {'date_value': '2020-02-16',
       'country_code': 'USA',
       'confirmed': 14,
       'deaths': 0,
       'stringency_actual': 5.56,
       'stringency': 5.56,
       'stringency_legacy': 7.14,
       'stringency_legacy_disp': 7.14})]),
   ('2020-02-17',
    [('USA',
      {'date_value': '2020-02-17',
       'country_code': 'USA',
       'confirmed': 14,
       'deaths': 0,
       'stringen

#### Country data for a specific day

In [4]:
def fetch_data(country='USA', date='2020-02-14'):
    """
    Get stringency data for one country {ALPHA-3} requesting API.
    Returns json dict with data for country like 'AAA' and specific date and like 'YYYY-MM-DD'.
    """
    url = f'https://covidtrackerapi.bsg.ox.ac.uk/api/v2/stringency/actions/{country}/{date}'
    response = requests.get(url)
    if response.status_code != 200:
        return ''
    data = response.json()
    return data

In [5]:
country_data_api = fetch_data()

In [76]:
[[';'.join([str(kk) for kk, vv in d.items()]) for i, d in enumerate(v) if type(d) == dict and i == 0] for v in country_data_api.values()][0]

['policy_type_code;policy_type_display;policyvalue;policyvalue_actual;flagged;is_general;notes;flag_value_display_field;policy_value_display_field']

In [77]:
[[';'.join([str(vv) for kk, vv in d.items()]) for d in v if type(d) == dict] for v in country_data_api.values()][0]

['C1;School closing;0;0;None;None;None;General;No measures',
 'C2;Workplace closing;0;0;None;None;None;General;No measures',
 'C3;Cancel public events;0;0;None;None;None;General;No Measures',
 'C4;Restrictions on gatherings;0;0;None;None;None;General;No restrictions',
 'C5;Close public transport;0;0;None;None;None;General;No Measures',
 'C6;Stay at home requirements;0;0;None;None;None;General;No measures',
 'C7;Restrictions on internal movement;0;0;None;None;;General;No Measures',
 'C8;International travel controls;2;2;None;None;None;Quarantine',
 'E1;Income support;0;0;None;None;None;All workers;No income support',
 'E2;Debt/contract relief;0;0;None;None;None;None',
 'E3;Fiscal measures;0;0;None;None;;USD Value',
 'E4;International support;0;0;None;None;None;USD Value',
 'H1;Public information campaigns;0;0;None;None;None;General;None',
 'H2;Testing policy;0;0;None;None;None;No policy',
 'H3;Contact tracing;1;1;None;None;None;Limited',
 'H4;Emergency investment in healthcare;0;0;None;

In [105]:
# [';'.join([str(vv) for vv in v]) for v in country_data_api.values()][-1]
[';'.join([str(kk) for kk in v]) for k, v in country_data_api.items()][-1]

'date_value;country_code;confirmed;deaths;stringency_actual;stringency'

In [106]:
[[';'.join([str(vv) for kk, vv in d.items()]) for d in v] for v in country_data_api.values()][-1]

AttributeError: 'str' object has no attribute 'items'

### Read CSV

In [146]:
!curl -L 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/timeseries/c1m_school_closing.csv' > ../raw_data/c1m_school_closing.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1020k  100 1020k    0     0  2580k      0 --:--:-- --:--:-- --:--:-- 2624k


In [147]:
ts_c1m_csv = pd.read_csv('../raw_data/c1m_school_closing.csv') #,index_col=)

ts_c1m_csv.head()

Unnamed: 0.1,Unnamed: 0,country_code,country_name,region_code,region_name,jurisdiction,01Jan2020,02Jan2020,03Jan2020,04Jan2020,...,28Aug2022,29Aug2022,30Aug2022,31Aug2022,01Sep2022,02Sep2022,03Sep2022,04Sep2022,05Sep2022,06Sep2022
0,1,ABW,Aruba,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
1,2,AFG,Afghanistan,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,3,AGO,Angola,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
3,4,ALB,Albania,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,5,AND,Andorra,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [148]:
ts_all_csv = pd.read_csv('../raw_data/OxCGRT_timeseries_all.csv') #,index_col=)

ts_all_csv.head()

Unnamed: 0,country_code,country_name,region_code,region_name,jurisdiction,01Jan2020,02Jan2020,03Jan2020,04Jan2020,05Jan2020,...,26Aug2022,27Aug2022,28Aug2022,29Aug2022,30Aug2022,31Aug2022,01Sep2022,02Sep2022,03Sep2022,04Sep2022
0,ABW,Aruba,,,NAT_TOTAL,,,,,,...,25.93,25.93,25.93,25.93,25.93,25.93,25.93,25.93,,
1,AFG,Afghanistan,,,NAT_TOTAL,,,,,,...,,,,,,,,,,
2,AGO,Angola,,,NAT_TOTAL,,,,,,...,28.11,28.11,28.11,28.11,28.11,28.11,28.11,28.11,,
3,ALB,Albania,,,NAT_TOTAL,,,,,,...,,,,,,,,,,
4,AND,Andorra,,,NAT_TOTAL,,,,,,...,,,,,,,,,,


In [149]:
ts_cases_csv = pd.read_csv('../raw_data/confirmed_cases.csv') #,index_col=)

ts_cases_csv.head()

Unnamed: 0.1,Unnamed: 0,country_code,country_name,region_code,region_name,jurisdiction,01Jan2020,02Jan2020,03Jan2020,04Jan2020,...,27Aug2022,28Aug2022,29Aug2022,30Aug2022,31Aug2022,01Sep2022,02Sep2022,03Sep2022,04Sep2022,05Sep2022
0,1,ABW,Aruba,,,NAT_TOTAL,,,,,...,42750.0,42750.0,42792.0,42792.0,42848.0,42848.0,42848.0,42848.0,42848.0,42848.0
1,2,AFG,Afghanistan,,,NAT_TOTAL,,,,,...,191967.0,192463.0,192906.0,193004.0,193250.0,193520.0,193520.0,193912.0,193912.0,193912.0
2,3,AGO,Angola,,,NAT_TOTAL,,,,,...,102636.0,102636.0,102636.0,102636.0,102636.0,102636.0,102636.0,102636.0,102636.0,102636.0
3,4,ALB,Albania,,,NAT_TOTAL,,,,,...,328299.0,328515.0,328571.0,329017.0,329352.0,329615.0,329862.0,330062.0,330062.0,330062.0
4,5,AND,Andorra,,,NAT_TOTAL,,,,,...,46027.0,46027.0,46027.0,46027.0,46027.0,46027.0,46027.0,46027.0,46027.0,46027.0


In [150]:
ts_cases_csv.shape


(263, 985)

## Exploratory Data Analyzis

In [151]:
ts_cases_csv.isna()

Unnamed: 0.1,Unnamed: 0,country_code,country_name,region_code,region_name,jurisdiction,01Jan2020,02Jan2020,03Jan2020,04Jan2020,...,27Aug2022,28Aug2022,29Aug2022,30Aug2022,31Aug2022,01Sep2022,02Sep2022,03Sep2022,04Sep2022,05Sep2022
0,False,False,False,True,True,False,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,True,True,False,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,True,True,False,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,True,True,False,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,True,True,False,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258,False,False,False,True,True,False,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
259,False,False,False,True,True,False,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
260,False,False,False,True,True,False,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
261,False,False,False,True,True,False,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False


In [152]:
# get VietNam country dataset
vn_data = ts_cases_csv.loc[ts_cases_csv['country_code'] == 'VNM'].copy()

vn_data.head()

Unnamed: 0.1,Unnamed: 0,country_code,country_name,region_code,region_name,jurisdiction,01Jan2020,02Jan2020,03Jan2020,04Jan2020,...,27Aug2022,28Aug2022,29Aug2022,30Aug2022,31Aug2022,01Sep2022,02Sep2022,03Sep2022,04Sep2022,05Sep2022
257,258,VNM,Vietnam,,,NAT_TOTAL,,,,,...,11401597.0,11403302.0,11405711.0,11408952.0,11411679.0,11414359.0,11415907.0,11417503.0,11417503.0,11417503.0


In [153]:
lmr_list = ['VNM']
lmr_data = ts_cases_csv[ts_cases_csv['country_code']=='']

In [154]:
ts_cases_csv.shape

(263, 985)

In [155]:
lmr_data.columns # .tail(7)


Index(['Unnamed: 0', 'country_code', 'country_name', 'region_code',
       'region_name', 'jurisdiction', '01Jan2020', '02Jan2020', '03Jan2020',
       '04Jan2020',
       ...
       '27Aug2022', '28Aug2022', '29Aug2022', '30Aug2022', '31Aug2022',
       '01Sep2022', '02Sep2022', '03Sep2022', '04Sep2022', '05Sep2022'],
      dtype='object', length=985)

In [156]:
lmr_data.shape

(0, 985)

In [None]:
import pandas_profiling

In [None]:
%%time
# vn_data.profile_report()

### Time Series Analysis

In [159]:
ts_cases.head()

Dates,01Jan2020,02Jan2020,03Jan2020,04Jan2020,05Jan2020,06Jan2020,07Jan2020,08Jan2020,09Jan2020,10Jan2020,...,27Aug2022,28Aug2022,29Aug2022,30Aug2022,31Aug2022,01Sep2022,02Sep2022,03Sep2022,04Sep2022,05Sep2022
country_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42750.0,42750.0,42792.0,42792.0,42848.0,42848.0,42848.0,42848.0,42848.0,42848.0
AFG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,191967.0,192463.0,192906.0,193004.0,193250.0,193520.0,193520.0,193912.0,193912.0,193912.0
AGO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,102636.0,102636.0,102636.0,102636.0,102636.0,102636.0,102636.0,102636.0,102636.0,102636.0
ALB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,328299.0,328515.0,328571.0,329017.0,329352.0,329615.0,329862.0,330062.0,330062.0,330062.0
AND,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,46027.0,46027.0,46027.0,46027.0,46027.0,46027.0,46027.0,46027.0,46027.0,46027.0


In [158]:
ts_cases = ts_cases_csv.drop(columns=['country_name','region_code','region_name','jurisdiction','Unnamed: 0'])
ts_cases = ts_cases.groupby('country_code').agg('sum')
ts_cases.transpose()
ts_cases.columns.name = 'Dates'
ts_cases = ts_cases.fillna(0)
# ts_cases.index = pd.to_datetime(ts_cases.index)

In [117]:
ts_cases = ts_cases.transpose()
ts_cases.head()

country_code,ABW,AFG,AGO,ALB,AND,ARE,ARG,AUS,AUT,AZE,...,USA,UZB,VEN,VIR,VNM,VUT,YEM,ZAF,ZMB,ZWE
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01Jan2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
02Jan2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
03Jan2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
04Jan2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
05Jan2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [121]:
ts_cases.shape

country_code,ABW,AFG,AGO,ALB,AND,ARE,ARG,AUS,AUT,AZE,...,USA,UZB,VEN,VIR,VNM,VUT,YEM,ZAF,ZMB,ZWE
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [162]:
vn_ts_cases = vn_data.drop(columns=['country_name','region_code','region_name','jurisdiction','Unnamed: 0'])
vn_ts_cases = vn_ts_cases.groupby('country_code').agg('sum')
vn_ts_cases.transpose()
vn_ts_cases.columns.name = 'Dates'
vn_ts_cases = vn_ts_cases.fillna(0)
# ts_cases.index = pd.to_datetime(ts_cases.index)

In [164]:
vn_ts_cases

Dates,01Jan2020,02Jan2020,03Jan2020,04Jan2020,05Jan2020,06Jan2020,07Jan2020,08Jan2020,09Jan2020,10Jan2020,...,27Aug2022,28Aug2022,29Aug2022,30Aug2022,31Aug2022,01Sep2022,02Sep2022,03Sep2022,04Sep2022,05Sep2022
country_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
VNM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11401597.0,11403302.0,11405711.0,11408952.0,11411679.0,11414359.0,11415907.0,11417503.0,11417503.0,11417503.0


In [165]:
vn_ts_cases = vn_ts_cases.transpose()

In [166]:
vn_ts_cases

country_code,VNM
Dates,Unnamed: 1_level_1
01Jan2020,0.0
02Jan2020,0.0
03Jan2020,0.0
04Jan2020,0.0
05Jan2020,0.0
...,...
01Sep2022,11414359.0
02Sep2022,11415907.0
03Sep2022,11417503.0
04Sep2022,11417503.0


## Recurrent Neural Network (sequences data) modeling

### Samples/Sequences, Observations, Features

X.shape = (n_SEQUENCES, n_OBSERVATIONS, n_FEATURES)

y = RNN(X)

❗️ Notation $X_{i,j}^{t}$

 $_{i}$ is the sample/sequence
 
 $_{j}$ is the feature measured
 
 $^{t}$ is the time at which the observation is seen

In [167]:
n_seq = ts_cases.shape[0] - 1 # nb of countries (samples)
n_obs = 15 # 15 days of observations
n_feat = 1 # 1feature: covid cases

X = np.array(ts).astype(np.float32)

y = np.array([y_a, y_b, y_c]).astype(np.float32)

#### Create sequences (`X`,`y`)

def **subsample_sequence(df, length)** -> pd.DataFrame:

function that given a full dataframe `df`:
- Create a sub sequences df, with long length?

In [183]:
def subsample_sequence(df, length) -> pd.DataFrame:
    """
    Given the initial dataframe `df`, return a shorter dataframe sequence of length `length`.
    This shorter sequence should be selected at random
    """
    last_possible = df.shape[0] - length
    
    random_start = np.random.randint(0, last_possible)
    df_sample = df[random_start: random_start+length]
    
    return df_sample

In [184]:
# Test it 
assert subsample_sequence(vn_ts_cases, 10).shape  == (10, 1)
assert subsample_sequence(vn_ts_cases, 400).shape == (400, 1)
subsample_sequence(vn_ts_cases, 10).shape, subsample_sequence(vn_ts_cases, 400).shape

((10, 1), (400, 1))

#### Pad `X` missing values with mean values

def **split_subsample_sequence(df,  length, sequence='VNM', df_mean=None)** -> tuple:

function that given a full dataframe `df`:
- Create a sub sequences df
- Stores the value of the pollution of the last day as your variable array `y`
- Stores all features of previous days as a variable `X`
- Returns (`X`, `y`)

In [190]:
def split_subsample_sequence(df,  length, sequence='VNM', df_mean=None) -> tuple:
    '''
    Create one single random (X_sample, y_sample)
    containing one sequence each of length `length`'''
    # Trick to save time during potential recursive calls
    if df_mean is None:
        df_mean = df.mean()
    df_subsample = subsample_sequence(df, length)
    y_sample = df_subsample.iloc[length -1][sequence]
    # Case y_sample is NaN: redraw !
    if y_sample != y_sample: # A value is not equal to itself only for NaN
            X_sample, y_sample = split_subsample_sequence(df, length, df_mean) # Recursive call !!!
            return np.array(X_sample), np.array(y_sample)    
    X_sample = df_subsample[0:length -1]
    # Case X_sample has some NaNs
    if X_sample.isna().sum().sum() !=0:
        X_sample = X_sample.fillna(compute_means(X_sample, df_mean))
        X_sample = X_sample.values

    return np.array(X_sample), np.array(y_sample)

In [191]:
# Test it
(X_sample, y_sample) = split_subsample_sequence(vn_ts_cases, 10)
assert X_sample.shape == (9,1)
assert y_sample.shape == ()
X_sample.shape, y_sample.shape

((9, 1), ())

#### Generates an entire dataset of multiple subsamples with shape $(X, y)$

def **get_X_y(df, n_sequences, length)** -> tuple:

function to generates an entire dataset of multiple subsamples suitable for RNN, that is, $(X, y)$ of shape:

```python
X.shape = (n_sequences, length, n_features)
y.shape = (n_sequences, )
```

In [192]:
def get_X_y(df, n_sequences, length) -> tuple:
    '''Return a list of samples (X, y)'''
    X, y = [], []

    for i in range(n_sequences):
        (xi, yi) = split_subsample_sequence(df, length)
        X.append(xi)
        y.append(yi)
        
    X = np.array(X)
    y = np.array(y)

    return X, y

Generate your dataset $(X, y)$ of `200` sequences, each of `20` observations + the covid cases at the 21-st day to predict

In [194]:
X, y = get_X_y(vn_ts_cases, 200, 21)
print(X.shape)
print(y.shape)

(200, 20, 1)
(200,)


In [195]:
# Check your code below
assert X.shape == (200, 20, 1)
assert y.shape == (200, )
assert np.isnan(X).sum() == 0

### Split train

In [65]:
# -1. Train splitting
# Let's keep the last 40% of the values out for testing purposes
train_size = 0.6
index = round(train_size * ts_cases.shape[0])

X_train = ts_cases.iloc[:index]
X_test = ts_cases.iloc[index:]

Let's not cross-validate in this challenge to start with 🤯 
- Separate `df` into `df_train` and `df_test` such that the first 80% of the dataframe is in the training, and the last 20% in the test set.
- Then generate (`X_train`, `y_train`) from `df_train` and (`X_test`, `y_test`) from `df_test`
- Ensure that `X_train.shape == (200, 20, 1)`

In [197]:
len_ = int(0.8*ts_cases.shape[0])
df_train = ts_cases[:len_]
df_test = ts_cases[len_:]
df_train.shape, df_test.shape

((149, 979), (38, 979))

In [198]:
X_train, y_train = get_X_y(df_train, 200, 21)
X_test, y_test = get_X_y(df_test, 40, 21)

KeyError: 'VNM'

In [66]:
X_train.shape, X_test.shape

((112, 979), (75, 979))

### Normalization layer

In [67]:
# 0. The Normalization Layer
normalizer = Normalization()  # Instantiate a "normalizer" layer
normalizer.adapt(X_train) # "Fit" it on the train set

2022-09-06 23:08:59.935778: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### RNN model architecture

In [68]:
# 1. The Architecture
rnn_model = Sequential()
rnn_model.add(normalizer) # Using the Normalization layer to standardize the datapoints during the forward pass
rnn_model.add(SimpleRNN(units=20, activation='tanh'))  # , input_shape=(?,?)))
# rnn_model.add(layers.Dense(10, activation = 'relu'))
# model.add(layers.Dropout(0.3))
rnn_model.add(layers.Dense(1, activation = 'linear'))

ValueError: Input 0 of layer "simple_rnn" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 979)

### Compile model with 'rmsprop'

In [169]:
# 2. Compiling with 'rmsprop' rather than 'adam' (recommended)
rnn_model.compile(loss='mse',
              optimizer='rmsprop')  # Recommended optimizer for RNNs

In [170]:
rnn_model.summary()

ValueError: This model has not yet been built. Build the model first by calling `build()` or by calling the model on a batch of data.

### Train model

In [172]:
# 3. Training
es = EarlyStopping(monitor = 'val_accuracy',
                   patience = 5,
                   verbose = 0,
                   restore_best_weights = True)

rnn_model.fit(X_train,
          y_train, validation_data = (X_train, y_train),
          batch_size = 16,
          epochs = 5,
          callbacks = [es],
          verbose=1)

NameError: name 'y_train' is not defined

### Evaluate model

In [173]:
# 4. Evaluating
# REGRESSION compilation
model.compile(loss='mse', 
              optimizer='adam', 
              metrics=['mae'])

# The fit
model.fit(X, y,
         batch_size=16,
         epochs=10, verbose=0)

# The prediction (one per sequence/city)
model.predict(X)

NameError: name 'model' is not defined

## Time Series Forecasting