# Deep Learning Time Series COVID-19 Cases Prediction

### **Import libraries** and **packages**

In [1]:
import numpy as np
import pandas as pd
import scipy
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import requests
import pandas_profiling
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.layers import Dense, SimpleRNN, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Data Sourcing

### Data API 

#### By country over time

In [2]:
def fetch_time_series(feature='stringency', start_date='2020-02-14', end_date='2021-02-14'):
    """
    Get stringency time series for each countries requesting API.
    Returns json dict with TS between start_date and end_date like 'YYYY-MM-DD'.
    """
    url = f'https://covidtrackerapi.bsg.ox.ac.uk/api/v2/{feature}/date-range/{start_date}/{end_date}'
    response = requests.get(url)
    if response.status_code != 200:
        return ''
    data = response.json()
    return data

In [3]:
countries_time_series_api = fetch_time_series()

In [4]:
[(k, [c for c in v if c == 'VNM'])  for k, v in countries_time_series_api.items()  if k == 'countries' ]

[('countries', ['VNM'])]

In [5]:
[([([([(vee)  for kaaa, veee  in vee.items() if kaaa in ['date_value', 'confirmed']  ])  for kaa, vee  in ve.items() if kaa =='VNM'   ])  for ka, ve  in v.items() ])  for k, v in countries_time_series_api.items() if k=='data'   ]

[[[[{'date_value': '2020-02-14',
     'country_code': 'VNM',
     'confirmed': 16,
     'deaths': 0,
     'stringency_actual': 45.37,
     'stringency': 45.37,
     'stringency_legacy': 54.76,
     'stringency_legacy_disp': 54.76},
    {'date_value': '2020-02-14',
     'country_code': 'VNM',
     'confirmed': 16,
     'deaths': 0,
     'stringency_actual': 45.37,
     'stringency': 45.37,
     'stringency_legacy': 54.76,
     'stringency_legacy_disp': 54.76}]],
  [[{'date_value': '2020-02-15',
     'country_code': 'VNM',
     'confirmed': 16,
     'deaths': 0,
     'stringency_actual': 47.22,
     'stringency': 47.22,
     'stringency_legacy': 58.33,
     'stringency_legacy_disp': 58.33},
    {'date_value': '2020-02-15',
     'country_code': 'VNM',
     'confirmed': 16,
     'deaths': 0,
     'stringency_actual': 47.22,
     'stringency': 47.22,
     'stringency_legacy': 58.33,
     'stringency_legacy_disp': 58.33}]],
  [[{'date_value': '2020-02-16',
     'country_code': 'VNM',
     'c

In [6]:
[(k, [(ka, [(kaa, vee)  for kaa, vee  in ve.items() if kaa =='USA'   ])  for ka, ve  in v.items() ])  for k, v in countries_time_series_api.items() if k=='data'   ]

[('data',
  [('2020-02-14',
    [('USA',
      {'date_value': '2020-02-14',
       'country_code': 'USA',
       'confirmed': 14,
       'deaths': 0,
       'stringency_actual': 5.56,
       'stringency': 5.56,
       'stringency_legacy': 7.14,
       'stringency_legacy_disp': 7.14})]),
   ('2020-02-15',
    [('USA',
      {'date_value': '2020-02-15',
       'country_code': 'USA',
       'confirmed': 14,
       'deaths': 0,
       'stringency_actual': 5.56,
       'stringency': 5.56,
       'stringency_legacy': 7.14,
       'stringency_legacy_disp': 7.14})]),
   ('2020-02-16',
    [('USA',
      {'date_value': '2020-02-16',
       'country_code': 'USA',
       'confirmed': 14,
       'deaths': 0,
       'stringency_actual': 5.56,
       'stringency': 5.56,
       'stringency_legacy': 7.14,
       'stringency_legacy_disp': 7.14})]),
   ('2020-02-17',
    [('USA',
      {'date_value': '2020-02-17',
       'country_code': 'USA',
       'confirmed': 14,
       'deaths': 0,
       'stringen

#### Country data for a specific day

In [7]:
def fetch_data(country='USA', date='2020-08-14'):
    """
    Get stringency data for one country {ALPHA-3} requesting API.
    Returns json dict with data for country like 'AAA' and specific date and like 'YYYY-MM-DD'.
    """
    url = f'https://covidtrackerapi.bsg.ox.ac.uk/api/v2/stringency/actions/{country}/{date}'
    response = requests.get(url)
    if response.status_code != 200:
        return ''
    data = response.json()
    return data

In [8]:
country_data_api = fetch_data()

In [9]:
[[';'.join([str(kk) for kk, vv in d.items()]) for i, d in enumerate(v) if type(d) == dict and i == 0] for v in country_data_api.values()][0]

['policy_type_code;policy_type_display;policyvalue;policyvalue_actual;flagged;is_general;notes;flag_value_display_field;policy_value_display_field']

In [10]:
[[';'.join([str(vv) for kk, vv in d.items()]) for d in v if type(d) == dict] for v in country_data_api.values()][0]

['C1;School closing;3;3;False;False;According to the New York Times, New York City is the only major school district in the country to open for students part-time this fall. \r\n\r\nSee the article in the New York Times, archived: http://archive.vn/OkUNQ;General;Require closing (all levels)',
 'C2;Workplace closing;2;2;False;False;None;General;Require closing some sectors',
 'C3;Cancel public events;2;2;False;False;None;General;Required',
 'C4;Restrictions on gatherings;4;4;False;False;None;General;Restrictions on gatherings of 10 or fewer people',
 'C5;Close public transport;1;1;False;False;None;General;Recommended',
 'C6;Stay at home requirements;1;1;True;True;None;General;Recommend staying home',
 'C7;Restrictions on internal movement;2;2;False;False;None;General;Required',
 'C8;International travel controls;3;3;None;None;None;Ban',
 'E1;Income support;2;2;False;False;None;All workers;>50% lost income',
 'E2;Debt/contract relief;1;1;None;None;None;Narrow',
 'E3;Fiscal measures;0;0;N

In [11]:
# [';'.join([str(vv) for vv in v]) for v in country_data_api.values()][-1]
[';'.join([str(kk) for kk in v]) for k, v in country_data_api.items()][-1]

'date_value;country_code;confirmed;deaths;stringency_actual;stringency'

### CSV data

#### **Read URL**, **Get CSV files** and **store CSV in local**  *(optional do it at begining or to refresh CSV data)*

##### **get_database_to_csv()** function

In [12]:
def get_database_to_csv(url, csv_list, path='', db_grid=[]) -> list:
    """
    function that take in parameter:
     - a root URL (string) to get the CSV data,
     - a list of CSV files,
     - a path (string) to store CSV in local,
     - a grid (list of list) to add in the CSV filename, URL, local path.     
    and returns the gird updated with the CSVs of the list
    
    """

    ### Create a database grid (list of list) with all CSVs and associated URLs
    # print('db_grid', db_grid)
    #### Data project directory (if empty do not store CSV in local)
    # print('path', path)
    ### Website CSV datasets URL
    # print('url', url)
    #### List of CSVs of Website to retrieve
    # print('csv_list', csv_list)

    #### Length of grid aka number of CSVs already stored in grid
    len_grid = len(db_grid)

    for l in range(len(csv_list)):
        # g = l + len_grid
        sub_list = []       
        sub_list.append(csv_list[l]) ## 1st pos°: CSV filename
        sub_list.append(url + csv_list[l]) ## 2nd pos°: URL + CSV
        if len(data_dir) > 0: ## store CSV in local
            sub_list.append(data_dir + csv_list[l]) ## 3rd pos°: local data path + CSV
            !curl -L "{url + csv_list[l]}" > {data_dir + csv_list[l]} ## curl <URL>/<CSV> => <path>
        print('sub_list', sub_list)
        db_grid.append(sub_list)

    ### Return a database grid (list of list) with all CSVs and associated URLs
    return db_grid

#### **Get database to csv**

In [13]:
#### Data project directory
data_dir = '../raw_data/'

### Oxford Master data time series URL
url_root_oxford = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/timeseries/'

#### List of CSVs of Oxford database Feel free to add more feature...
csv_list = ['confirmed_cases.csv', 'confirmed_deaths.csv', 'government_response_index_avg.csv', 'stringency_index_avg.csv', 
            'containment_health_index_avg.csv', 'economic_support_index.csv', 'c1m_school_closing.csv', 
            'c2m_workplace_closing.csv', 'c3m_cancel_public_events.csv', 'c4m_restrictions_on_gatherings.csv', 
            'c5m_close_public_transport.csv', 'c6m_stay_at_home_requirements.csv', 'c7m_movementrestrictions.csv',
            'c8ev_internationaltravel.csv', 'e1_income_support.csv', 'e2_debtrelief.csv', 'h1_public_information_campaigns.csv',
            'h2_testing_policy.csv', 'h3_contact_tracing.csv', 'h6m_facial_coverings.csv',
            'h7_vaccination_policy.csv', 'h8m_protection_of_elderly_ppl.csv'
           ]
# print(csv_list)
    
### Vacinations Dataset URLs
url_root_vaccinations = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/'

#### List of CSVs of Vaccinations database
csv_list_vax = ['vaccinations.csv', 'vaccinations-by-age-group.csv']
# print(csv_list_vax)

### Create a database grid (list of list) with all CSVs and associated URLs
# db_grid = [[]]
# print(db_grid)
### Insert into database grid all CSVs and associated URLs from Oxford website
db_grid = get_database_to_csv(url_root_oxford, csv_list, data_dir) ## uncomment to store CSV
### Insert into database grid all CSVs and associated URLs from vaccinations website
db_grid = get_database_to_csv(url_root_vaccinations, csv_list_vax, data_dir, db_grid)
# print('db_grid', db_grid)

# Stack all csl in the list
csv_list += csv_list_vax

# transform list into dict:
csv = dict(zip(csv_list, [v[0] for v in enumerate(csv_list)])) # if v[1] == 'containment_health_index_avg.csv'
csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1994k  100 1994k    0     0  4073k      0 --:--:-- --:--:-- --:--:-- 4121k
sub_list ['confirmed_cases.csv', 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/timeseries/confirmed_cases.csv', '../raw_data/confirmed_cases.csv']
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1584k  100 1584k    0     0  2597k      0 --:--:-- --:--:-- --:--:-- 2631k
sub_list ['confirmed_deaths.csv', 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/timeseries/confirmed_deaths.csv', '../raw_data/confirmed_deaths.csv']
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1927k  10

{'confirmed_cases.csv': 0,
 'confirmed_deaths.csv': 1,
 'government_response_index_avg.csv': 2,
 'stringency_index_avg.csv': 3,
 'containment_health_index_avg.csv': 4,
 'economic_support_index.csv': 5,
 'c1m_school_closing.csv': 6,
 'c2m_workplace_closing.csv': 7,
 'c3m_cancel_public_events.csv': 8,
 'c4m_restrictions_on_gatherings.csv': 9,
 'c5m_close_public_transport.csv': 10,
 'c6m_stay_at_home_requirements.csv': 11,
 'c7m_movementrestrictions.csv': 12,
 'c8ev_internationaltravel.csv': 13,
 'e1_income_support.csv': 14,
 'e2_debtrelief.csv': 15,
 'h1_public_information_campaigns.csv': 16,
 'h2_testing_policy.csv': 17,
 'h3_contact_tracing.csv': 18,
 'h6m_facial_coverings.csv': 19,
 'h7_vaccination_policy.csv': 20,
 'h8m_protection_of_elderly_ppl.csv': 21,
 'vaccinations.csv': 22,
 'vaccinations-by-age-group.csv': 23}

### *Get database to csv one by one*


#### Target URLs
url_cases = url_root_oxford + 'confirmed_cases.csv'
url_deaths = url_root_oxford + 'confirmed_deaths.csv'

#### Index URLs
url_id1= url_root_oxford + csv_i1 ## index_strigency
url_id2 = url_root_oxford + csv_i2 ## index_gov_resp
url_id3_health = url_root_oxford + csv_i3 ## index_health
url_id4 = url_root_oxford + csv_i4 ## index_economic

#### C[1-8] URLs
url_c1 = url_root_oxford + csv_c1 ## 'C1;School closing;'
url_c2 = url_root_oxford + csv_c1 ## 'C2;Workplace closing;'
url_c3 = url_root_oxford + csv_c3 ## 'C3;Cancel public events;'
url_c4 = url_root_oxford + csv_c4 ## 'C4;Restrictions on gatherings;'
url_c5 = url_root_oxford + csv_c5 ## 'C5;Close public transport'
url_c6 = url_root_oxford + csv_c6 ## 'C6;Stay at home requirements;'
url_c7 = url_root_oxford + csv_c7 ## 'C7;Restrictions on internal movement;'
url_c8 = url_root_oxford + csv_c8 ##  'C8;International travel controls;'
 
#### E[1-4] URLs
url_e1 = url_root_oxford + csv_e1 ## 'E1;Income support;'
url_e2 = url_root_oxford + csv_e2 ## 'E2;Debt/contract relief;'
url_e3 = url_root_oxford + csv_e3 ## 'E3;Fiscal measures;'
url_e4 = url_root_oxford + csv_e4 ## 'E4;International support;'
 
#### H[1-8] URLs
url_h1 = url_root_oxford + csv_h1 ## 'H1;Public information campaigns;'
url_h2 = url_root_oxford + csv_h2 ## 'H2;Testing policy'
url_h3 = url_root_oxford + csv_h3 ## 'H3;Contact tracing;'
url_h3 = url_root_oxford + csv_h4 ## 'H4;Emergency investment in healthcare;0;0;None;None;None;USD Value',
url_h3 = url_root_oxford + csv_h5 ## 'H5;Investment in vaccines;0;0;None;None;None;USD Value',
url_h3 = url_root_oxford + csv_h6 ## 'H6;Facial Coverings;0;0;None;None;None;General;No policy',
url_h3 = url_root_oxford + csv_h7 ## 'H7;Vaccination policy;0;0;None;None;None;Government funded;No availability',
url_h3 = url_root_oxford + csv_h8 ## 'H8;Protection of elderly people;1;1;True;True;None;General;Recommended protections',

#### V[1-4] URLs
url_v1 = url_root_oxford + csv_h1 ## 'V1;Vaccine Prioritisation;-2;-2;None;None;None',
url_v2 = url_root_oxford + csv_h2 ## 'V2;Vaccine Availability;-2;-2;None;None;None',
url_v3 = url_root_oxford + csv_h3 ## 'V3;Vaccine Financial Support;-2;-2;None;None;None',
url_v4 = url_root_oxford + csv_h4 ## 'V4;Mandatory Vaccination;-2;-2;None;None;None']

### Vacinations Dataset URLs
url_root_vaccinations = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/'
url_v5  = url_root_vaccinations + csv_v5 ## '/vaccinations.csv'
url_v6 = url_root_vaccinations + csv_v6 ## 'vaccinations-by-age-group.csv'


#### DataFrame Index
!curl -L "{url_i1}" > {data_dir}{csv_i1}
!curl -L "{url_i2}" > {data_dir}{csv_i2}
!curl -L "{url_i3}" > {data_dir}{csv_i3}
!curl -L "{url_i4}" > {data_dir}{csv_i4}
!curl -L "{url_c1}" > {data_dir}{csv_c1}
!curl -L "{url_c2}" > {data_dir}{csv_c2}
!curl -L "{url_c3}" > {data_dir}{csv_c3}
!curl -L "{url_c4}" > {data_dir}{csv_c4}
!curl -L "{url_c5}" > {data_dir}{csv_c5}
!curl -L "{url_c6}" > {data_dir}{csv_c6}
!curl -L "{url_c7}" > {data_dir}{csv_c7}
!curl -L "{url_c8}" > {data_dir}{csv_c8}

df_raw_school_closing=pd.read_csv('../raw_data/c1m_school_closing.csv')
df_raw_workplace_closing=pd.read_csv('../raw_data/c2m_workplace_closing.csv')
df_raw_cancel_public_event=pd.read_csv('../raw_data/c3m_cancel_public_events.csv')
df_raw_restriction_on_gathering=pd.read_csv('../raw_data/c4m_restrictions_on_gatherings.csv')
df_raw_stay_at_home=pd.read_csv('../raw_data/c6m_stay_at_home_requirements.csv')
df_raw_international_travel=pd.read_csv('../raw_data/c6m_stay_at_home_requirements.csv')
df_raw_goverment_response=pd.read_csv('../raw_data/government_response_index_avg.csv')
df_raw_facial_covering=pd.read_csv('../raw_data/h6m_facial_coverings.csv')
df_raw_vacination_policy=pd.read_csv('../raw_data/h7_vaccination_policy.csv')



#### Vaccination
!curl -L "{url_vaccination}" > {data_dir}{vaccination_csv}
!curl -L "{url_ages}" > {data_dir}{ages_csv}

#### CSV target
!curl -L "{url_cases}" > {data_dir}{cases_csv}
!curl -L "{url_deaths}" > {data_dir}{deaths_csv}



### **Read CSV** and **Set raw dataframe**

In [15]:
db_grid[csv['containment_health_index_avg.csv']]

['containment_health_index_avg.csv',
 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/timeseries/containment_health_index_avg.csv',
 '../raw_data/containment_health_index_avg.csv']

In [None]:
df_raw_gov_response = pd.read_csv(data_dir + 'government_response_index_avg.csv')
df_raw_health = pd.read_csv(data_dir + 'containment_health_index_avg.csv')
df_raw_economic = pd.read_csv(data_dir + 'economic_support_index.csv')

#### Vaccination
df_raw_vaccination = pd.read_csv(data_dir + 'vaccinations.csv')
df_raw_ages = pd.read_csv(data_dir + 'vaccinations-by-age-group.csv')


#### Data Frame target
df_raw_cases = pd.read_csv(data_dir + 'confirmed_deaths.csv')
df_raw_deaths = pd.read_csv(data_dir + 'confirmed_deaths.csv')

#### Data multiple
# df_raw_confirmed_cases=pd.read_csv(url_cases)
# df_raw_death=pd.read_csv(data_dir + 'confirmed_deaths.csv')
df_raw_school_closing=pd.read_csv(data_dir + 'c1m_school_closing.csv')
df_raw_workplace_closing=pd.read_csv(data_dir + 'c2m_workplace_closing.csv')
df_raw_cancel_public_event=pd.read_csv(data_dir + 'c3m_cancel_public_events.csv')
df_raw_restriction_on_gathering=pd.read_csv(data_dir + 'c4m_restrictions_on_gatherings.csv')
df_raw_stay_at_home=pd.read_csv(data_dir + 'c6m_stay_at_home_requirements.csv')
df_raw_international_travel=pd.read_csv(data_dir + 'c6m_stay_at_home_requirements.csv')
df_raw_goverment_response=pd.read_csv(data_dir + 'government_response_index_avg.csv')
df_raw_facial_covering=pd.read_csv(data_dir + 'h6m_facial_coverings.csv')
df_raw_vacination_policy=pd.read_csv(data_dir + 'h7_vaccination_policy.csv')

In [None]:
df_raw_cases.head()

In [None]:
df_raw_cases.shape

## Exploratory Data Analyzis

In [None]:
df_raw_cases.isna()

In [None]:
# get VietNam country dataset
vn_data = df_raw_cases.loc[df_raw_cases['country_code'] == 'VNM'].copy()

vn_data.head()

In [None]:
# %%time
# vn_data.profile_report()

### Time Series Analysis

In [None]:
df_raw_cases.head()

In [None]:
ts_cases = df_raw_cases.drop(columns=['country_name','region_code','region_name','jurisdiction','Unnamed: 0'])
ts_cases = ts_cases.groupby('country_code').agg('sum')
ts_cases.transpose()
ts_cases.columns.name = 'Dates'
ts_cases = ts_cases.fillna(0)
# ts_cases.index = pd.to_datetime(ts_cases.index)

In [None]:
ts_cases = ts_cases.transpose()
ts_cases.head()

In [None]:
ts_cases.shape

In [None]:
vn_ts_cases = vn_data.drop(columns=['country_name','region_code','region_name','jurisdiction','Unnamed: 0'])
vn_ts_cases = vn_ts_cases.groupby('country_code').agg('sum')
vn_ts_cases.transpose()
vn_ts_cases.columns.name = 'Dates'
vn_ts_cases = vn_ts_cases.fillna(0)
# ts_cases.index = pd.to_datetime(ts_cases.index)

In [None]:
vn_ts_cases

In [None]:
vn_ts_cases = vn_ts_cases.transpose()

In [None]:
vn_ts_cases

In [25]:
# Auto reload imported module every time a jupyter cell is executed (handy for olist.order.py updates)
%load_ext autoreload
%autoreload 2
from covid_time_series_prediction.indicator import Indicator
from indicator.data import Olist
Indicator().ping()
indicator = Indicator()
data = indicator.get_data()
data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


ModuleNotFoundError: No module named 'covid_time_series_prediction'

In [24]:
country_index, country_indicator = Indicator().get_country_data(country_name='France')
country_index, country_indicator

ModuleNotFoundError: No module named 'covid_time_series_prediction'

## Recurrent Neural Network (sequences data) modeling

### Samples/Sequences, Observations, Features

X.shape = (n_SEQUENCES, n_OBSERVATIONS, n_FEATURES)

y = RNN(X)

❗️ Notation $X_{i,j}^{t}$

 $_{i}$ is the sample/sequence
 
 $_{j}$ is the feature measured
 
 $^{t}$ is the time at which the observation is seen

In [None]:
n_seq = 50 # ts_cases.shape[0] - 1 # nb of countries (samples)
n_obs = 15 # 15 days of training periiod (observations)
n_feat = 1 # 1feature: covid cases
n_pred = 1 # nb of days of prediction

X = np.array(ts).astype(np.float32)

y = np.array([y_a, y_b, y_c]).astype(np.float32)



### Prerequisites:

- **retrieve dataset** from Alberto

    - **clean dataset**: 
        
        - **drop first lines == 0** *(before Covid arrived)*
        
        - **check Nan**: 
- **strategy 1 country by country** sequences split as follow:

- **strategy 2 one sequence per country**:
    - **split X train, set** 
    - **Pad sequences**
    - **create one csv per country**

#### Create sequences (`X`,`y`)

def **subsample_sequence(df, length)** -> pd.DataFrame:

function that given a full dataframe `df`:
- Create a sub sequences df, with long length?

In [None]:
def subsample_sequence(df, length) -> pd.DataFrame:
    """
    Given the initial dataframe `df`, return a shorter dataframe sequence of length `length` (eg n_obs).
    This shorter sequence should be selected at random
    """
    last_possible = df.shape[0] - length
    # How to split sequences? we could do it manually...
    random_start = np.random.randint(0, last_possible)
    df_sample = df[random_start: random_start+length]
    
    return df_sample

In [None]:
# Test it 
assert subsample_sequence(vn_ts_cases, 10).shape  == (10, 1)
assert subsample_sequence(vn_ts_cases, 400).shape == (400, 1)
subsample_sequence(vn_ts_cases, 10).shape, subsample_sequence(vn_ts_cases, 400).shape

#### Pad `X` missing values with mean values

def **split_subsample_sequence(df,  length, sequence='VNM', df_mean=None)** -> tuple:

function that given a full dataframe `df`:
- Create a sub sequences df
- Stores the value of the covid deaths* (or cases) of the last day as your variable array `y`
- Stores all features of previous days as a variable `X`
- Returns (`X`, `y`)

* *'VNM_covd_deaths'* preference for death prediction, then switch on cases *'VNM_covid_cases'*

In [None]:
# length of a sequence
len_seq = n_obs + n_pred
# Ex: 16 = 15 + 1
len_seq

In [None]:
def split_subsample_sequence(df,  length, sequence='VNM', y_size=1) -> tuple:
    '''
    Create one single random (X_sample, y_sample)
    containing one sequence each of length `length`
    ToDo: Adapt the y size=-1'''
    # Trick to save time during potential recursive calls
    # if df_mean is None:
    #     df_mean = df.mean()
    df_subsample = subsample_sequence(df, length)
    y_sample = df_subsample.iloc[length - y_size][sequence] # ['VNM'] ['VNM_covid_cases', 'VNM_covd_deaths'] 
    # Case y_sample is NaN: redraw !
    # if y_sample != y_sample: # A value is not equal to itself only for NaN
    #         X_sample, y_sample = split_subsample_sequence(df, length, df_mean) # Recursive call !!!
    #         return np.array(X_sample), np.array(y_sample)    
    X_sample = df_subsample[0:length - y_size]
    # Case X_sample has some NaNs
    # if X_sample.isna().sum().sum() !=0:
    #    X_sample = X_sample.fillna(compute_means(X_sample, df_mean))
    #    X_sample = X_sample.values

    return np.array(X_sample), np.array(y_sample)

In [None]:
# Test it
(X_sample, y_sample) = split_subsample_sequence(vn_ts_cases, length=len_seq, y_size=n_pred)
assert X_sample.shape == (n_obs,n_feat)
assert y_sample.shape == ()
X_sample.shape, y_sample.shape

#### Generates an entire dataset of multiple subsamples with shape $(X, y)$

def **get_X_y(df, n_sequences, length)** -> tuple:

function to generates an entire dataset of multiple subsamples suitable for RNN, that is, $(X, y)$ of shape:

```python
X.shape = (n_sequences, length, n_features)
y.shape = (n_sequences, )
```

In [None]:
def get_X_y(df, n_sequences, length) -> tuple:
    '''Return a list of samples (X, y)'''
    X, y = [], []

    for i in range(n_sequences):
        (xi, yi) = split_subsample_sequence(df, length)
        X.append(xi)
        y.append(yi)
        
    X = np.array(X)
    y = np.array(y)

    return X, y

Generate your dataset $(X, y)$ of `50` sequences, each of `15` observations + the covid cases at the 16-th day to predict

n_seq = 50 # ts_cases.shape[0] - 1 # nb of countries (samples)
n_obs = 15 # 15 days of training periiod (observations)
n_feat = 1 # 1feature: covid cases
n_pred = 1 # nb of days of prediction
len_seq = 16 # length of a sequence (len_seq = n_obs + n_pred/ Ex: 16 = 15 + 1)

In [None]:
X, y = get_X_y(vn_ts_cases, n_sequences=n_seq, length=len_seq)
print(X.shape)
print(y.shape)

In [None]:
# Check your code below
assert X.shape == (50, 15, 1)
assert y.shape == (50, )
assert np.isnan(X).sum() == 0

### How to split sequences?

- randomly or

- manually

### Samples/Sequences, Observations, Features

### Split train

In [None]:
# -1. Train splitting
# Let's keep the last 20% of the values out for testing purposes
train_size = 0.8 ## 80% of dataset to train
index = round(train_size * ts_cases.shape[0])

X_train = ts_cases.iloc[:index]
X_test = ts_cases.iloc[index:]

Let's not cross-validate in this challenge to start with 🤯 
- Separate `df` into `df_train` and `df_test` such that the first 80% of the dataframe is in the training, and the last 20% in the test set.
- Then generate (`X_train`, `y_train`) from `df_train` and (`X_test`, `y_test`) from `df_test`
- Ensure that `X_train.shape == (50, 15, 1)`

len_ = int(0.8*ts_cases.shape[0])
df_train = ts_cases[:len_] ; df_test = ts_cases[len_:]
df_train.shape, df_test.shape

In [None]:
n_seq_test = n_seq // 3 ; n_seq_test

In [None]:
X_train, y_train = get_X_y(df_train, n_seq, len_seq)
X_test, y_test = get_X_y(df_test, n_seq_test, len_seq)

In [None]:
X_train.shape, X_test.shape

### Normalization layer

In [None]:
# 0. The Normalization Layer
normalizer = Normalization()  # Instantiate a "normalizer" layer
normalizer.adapt(X_train) # "Fit" it on the train set

### RNN model architecture

In [None]:
# 1. The Architecture
rnn_model = Sequential()
rnn_model.add(normalizer) # Using the Normalization layer to standardize the datapoints during the forward pass
rnn_model.add(SimpleRNN(units=20, activation='tanh', return_sequences=True))  ## , input_shape=(?,?))) without a Normalizer layer
rnn_model.add(SimpleRNN(units=20, activation='tanh'))  ## 
rnn_model.add(Dense(10, activation = 'relu')) ## add 1 or more 'relu' layers
# model.add(layers.Dropout(0.3)) ## if RNN model over-fit
rnn_model.add(Dense(n_pred, activation = 'linear'))

### Compile model with 'rmsprop'

In [None]:
# 2. Compiling with 'rmsprop' rather than 'adam' (recommended)
rnn_model.compile(loss='mse',
              optimizer='rmsprop',
                 metrics='mape')  # Recommended optimizer for RNNs

In [None]:
rnn_model.summary()

### Train model

In [None]:
# 3. Training
es = EarlyStopping(monitor = 'val_loss',
                   patience = 10,
                   verbose = 0,
                   restore_best_weights = True)
# The fit
rnn_model.fit(X_train,
          y_train, 
          validation_split=0.1, # Auto split for validation data
              ## validation_data = (X_val, y_val), # To be created manually if needed
          batch_size = 16,
          epochs = 200,
          callbacks = [es],
          verbose=1)

### Evaluate model

In [None]:
# 4. Evaluating
# The prediction (one per sequence/city)
y_pred = rnn_model.predict(X_test) 

In [None]:
y_pred.shape

## Time Series Forecasting

In [None]:
# Check your code below
assert y_pred.shape == (n_seq_test, n_pred)