# Deep Learning Time Series COVID-19 Cases Prediction

## Project Setup

### Dependencies importing

In [1]:
# Auto reload imported module every time a jupyter cell is executed
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import scipy
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import requests
import pandas_profiling
from typing import overload
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, GRU, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import RMSprop 
from covid_time_series_prediction.ml_logic import preprocessor
# from ml_logic.country_data import country_output
from covid_time_series_prediction.ml_logic.preprocessor import train_test_set, scaler

## Data sourcing

### Data API 

#### By country over time

In [None]:
def fetch_time_series(feature='stringency', start_date='2020-02-14', end_date='2021-02-14'):
    """
    Get stringency time series for each countries requesting API.
    Returns json dict with TS between start_date and end_date like 'YYYY-MM-DD'.
    """
    url = f'https://covidtrackerapi.bsg.ox.ac.uk/api/v2/{feature}/date-range/{start_date}/{end_date}'
    response = requests.get(url)
    if response.status_code != 200:
        return ''
    data = response.json()
    return data

In [None]:
countries_time_series_api = fetch_time_series()

In [None]:
[(k, [c for c in v if c == 'VNM'])  for k, v in countries_time_series_api.items()  if k == 'countries' ]

In [None]:
[([([([(vee)  for kaaa, veee  in vee.items() if kaaa in ['date_value', 'confirmed']  ])  for kaa, vee  in ve.items() if kaa =='VNM'   ])  for ka, ve  in v.items() ])  for k, v in countries_time_series_api.items() if k=='data'   ]

In [None]:
[(k, [(ka, [(kaa, vee)  for kaa, vee  in ve.items() if kaa =='USA'   ])  for ka, ve  in v.items() ])  for k, v in countries_time_series_api.items() if k=='data'   ]

#### Country data for a specific day

In [None]:
def fetch_data(country='USA', date='2020-08-14'):
    """
    Get stringency data for one country {ALPHA-3} requesting API.
    Returns json dict with data for country like 'AAA' and specific date and like 'YYYY-MM-DD'.
    """
    url = f'https://covidtrackerapi.bsg.ox.ac.uk/api/v2/stringency/actions/{country}/{date}'
    response = requests.get(url)
    if response.status_code != 200:
        return ''
    data = response.json()
    return data

In [None]:
country_data_api = fetch_data()

In [None]:
[[';'.join([str(kk) for kk, vv in d.items()]) for i, d in enumerate(v) if type(d) == dict and i == 0] for v in country_data_api.values()][0]

In [None]:
[[';'.join([str(vv) for kk, vv in d.items()]) for d in v if type(d) == dict] for v in country_data_api.values()][0]

In [None]:
# [';'.join([str(vv) for vv in v]) for v in country_data_api.values()][-1]
[';'.join([str(kk) for kk in v]) for k, v in country_data_api.items()][-1]

### Raw data

In [None]:
#### Data project directory
data_dir = '../data/raw_data/'

#### **Read URL**, **Get CSV files** and **store CSV in local**  *(optional do it at begining or to refresh CSV data)*

##### **get_database_to_csv()** function

In [None]:
def get_database_to_csv(url, csv_list, path='', db_grid=[]) -> list:
    """
    function that take in parameter:
     - a root URL (string) to get the CSV data,
     - a list of CSV files,
     - a path (string) to store CSV in local,
     - a grid (list of list) to add in the CSV filename, URL, local path.     
    and returns the gird updated with the CSVs of the list
    
    """

    ### Create a database grid (list of list) with all CSVs and associated URLs
    # print('db_grid', db_grid)
    #### Data project directory (if empty do not store CSV in local)
    # print('path', path)
    ### Website CSV datasets URL
    # print('url', url)
    #### List of CSVs of Website to retrieve
    # print('csv_list', csv_list)

    #### Length of grid aka number of CSVs already stored in grid
    len_grid = len(db_grid)

    for l in range(len(csv_list)):
        sub_list = []       
        sub_list.append(csv_list[l]) ## 1st pos°: CSV filename
        sub_list.append(url + csv_list[l]) ## 2nd pos°: URL + CSV
        if len(data_dir) > 0: ## store CSV in local
            sub_list.append(data_dir + csv_list[l]) ## 3rd pos°: local data path + CSV
            !curl -L "{url + csv_list[l]}" > {data_dir + csv_list[l]} ## curl <URL>/<CSV> => <path>
        # print('sub_list', sub_list)
        db_grid.append(sub_list)

    ### Return a database grid (list of list) with all CSVs and associated URLs
    return db_grid

#### **Get database to csv** with **get_database_to_csv()** function

In [None]:
### Oxford Master data time series URL
url_root_oxford = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/timeseries/'

#### List of CSVs of Oxford database Feel free to add more feature...
## index_strigency missed!
## 'E3;Fiscal measures;' missed!
## 'E4;International support;' missed!
## 'H3;Contact tracing;' missed!
## 'H4;Emergency investment in healthcare!
## 'H5;Investment in vaccines missed!
## 'V1;Vaccine Prioritisation missed!
## 'V2;Vaccine Availability missed!
## 'V3;Vaccine Financial Support!
## 'V4;Mandatory Vaccination missed!
csv_list = ['confirmed_cases.csv', 'confirmed_deaths.csv',
            'government_response_index_avg.csv', 'stringency_index_avg.csv', 
            'containment_health_index_avg.csv', 'economic_support_index.csv',
            'c1m_school_closing.csv', 'c2m_workplace_closing.csv',
            'c3m_cancel_public_events.csv', 'c4m_restrictions_on_gatherings.csv', 
            'c5m_close_public_transport.csv', 'c6m_stay_at_home_requirements.csv',
            'c7m_movementrestrictions.csv', 'c8ev_internationaltravel.csv',
            'e1_income_support.csv', 'e2_debtrelief.csv',
            'h1_public_information_campaigns.csv', 'h2_testing_policy.csv',
            'h3_contact_tracing.csv', 'h6m_facial_coverings.csv',
            'h7_vaccination_policy.csv', 'h8m_protection_of_elderly_ppl.csv'
           ] ## ; print('csv_list', csv_list, 'len(csv_list)', len(csv_list))
    
### Vacinations Dataset URLs
url_root_vaccinations = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/'

#### List of CSVs of Vaccinations database
csv_list_vax = ['vaccinations.csv', 'vaccinations-by-age-group.csv'] ## ; print('csv_list', csv_list_vax, 'len(csv_list)', len(csv_list_vax))

### Create a database grid all CSVs and associated URLs from Oxford website
db_grid = get_database_to_csv(url_root_oxford, csv_list, data_dir)
### Insert into database grid all CSVs and associated URLs from vaccinations website
db_grid = get_database_to_csv(url_root_vaccinations, csv_list_vax, data_dir, db_grid)
# print('db_grid', db_grid)

# Stack all csl in the list
csv_list += csv_list_vax ## ; print('csv_list', csv_list)

# transform list into dict:
csv = dict(zip(csv_list, [v[0] for v in enumerate(csv_list)])) ## ; print ('csv', csv) ## if v[1] == 'containment_health_index_avg.csv' 

## Data loading

## Data cleaning

### **Read CSV** and **Set raw dataframe**

### **Read raw CSV** and **Set dataframe**

In [None]:
df_raw_gov_response = pd.read_csv(data_dir + 'government_response_index_avg.csv')
df_raw_health = pd.read_csv(data_dir + 'containment_health_index_avg.csv')
df_raw_economic = pd.read_csv(data_dir + 'economic_support_index.csv')

#### Vaccination
df_raw_vaccination = pd.read_csv(data_dir + 'vaccinations.csv')
df_raw_ages = pd.read_csv(data_dir + 'vaccinations-by-age-group.csv')


#### Data Frame target
df_raw_cases = pd.read_csv(data_dir + 'confirmed_cases.csv')
df_raw_deaths = pd.read_csv(data_dir + 'confirmed_deaths.csv')

#### Data multiple
df_raw_school_closing=pd.read_csv(data_dir + 'c1m_school_closing.csv')
df_raw_workplace_closing=pd.read_csv(data_dir + 'c2m_workplace_closing.csv')
df_raw_cancel_public_event=pd.read_csv(data_dir + 'c3m_cancel_public_events.csv')
df_raw_restriction_on_gathering=pd.read_csv(data_dir + 'c4m_restrictions_on_gatherings.csv')
df_raw_stay_at_home=pd.read_csv(data_dir + 'c6m_stay_at_home_requirements.csv')
df_raw_international_travel=pd.read_csv(data_dir + 'c6m_stay_at_home_requirements.csv')
df_raw_goverment_response=pd.read_csv(data_dir + 'government_response_index_avg.csv')
df_raw_facial_covering=pd.read_csv(data_dir + 'h6m_facial_coverings.csv')
df_raw_vacination_policy=pd.read_csv(data_dir + 'h7_vaccination_policy.csv')


### **Read out CSV** and **Set dataframe**

In [None]:
#### CSV Data out project directory
csv_dir = '../data/csv_out/'
# ! unzip {csv_dir}usa_index
# ! unzip {csv_dir}usa_indicator
# ! rm ECG_data.zip
df_usa_index =  pd.read_csv(csv_dir + 'usa_index.csv')
df_usa_indicator =  pd.read_csv(csv_dir + 'usa_indicator.csv')
df_usa_index, df_usa_indicator


## DataFrames setting

In [None]:
df_usa_alb_index.head(), df_usa_alb_indicator.head()

In [None]:

# Sumedha csv test
df_ts_usa_index =  df_usa_index.copy()
df_ts_usa_indicator =  df_usa_indicator.copy()


## Time series analysing

### Time Series Analysis *(optional)*

In [None]:
ts_cases = df_raw_cases.drop(columns=['country_name','region_code','region_name','jurisdiction','Unnamed: 0'])
ts_cases = ts_cases.groupby('country_code').agg('sum')
ts_cases.transpose()
ts_cases.columns.name = 'Dates'
ts_cases = ts_cases.fillna(0)
# ts_cases.index = pd.to_datetime(ts_cases.index)

In [None]:
ts_cases = ts_cases.transpose()
ts_cases.head()

In [None]:
ts_cases.shape

In [None]:
vn_ts_cases

In [None]:
vn_ts_cases = vn_ts_cases.transpose()

In [None]:
df_raw_cases.isna()

In [None]:
# get VietNam country dataset
vn_data = df_raw_cases.loc[df_raw_cases['country_code'] == 'VNM'].copy()

vn_data.head()

In [None]:
# %%time
# vn_data.profile_report()

## TENSORFLOW & RNN MODEL

### Recurrent Neural Network (sequences data) modeling

### Samples/Sequences, Observations, Features

X.shape = (n_SEQUENCES, n_OBSERVATIONS, n_FEATURES)

y = RNN(X)

❗️ Notation $X_{i,j}^{t}$

 $_{i}$ is the sample/sequence
 
 $_{j}$ is the feature measured
 
 $^{t}$ is the time at which the observation is seen

In [3]:
# Alberto train set
# 3. Training
def train_rnn_model(model, patience=2, epochs=200):
    es = EarlyStopping(monitor = 'val_loss',
                    patience = patience,
                    verbose = 1,
                    restore_best_weights = True)
    # The fit
    history =  model.fit(X_train, y_train, 
            validation_split=0.1, # Auto split for validation data
                ## validation_data = (X_val, y_val), # To be created manually if needed
            batch_size = 16,
            epochs = epochs,
            callbacks = [es],
            verbose=1)
    return history


# print(type(overfit_es), overfit_es)
# if overfit_es:
#     print("early stopping")
#     history = train_rnn_model(patience=overfit_es)
# else:
# print("No early stopping")
# history = train_rnn_model()



### Prerequisites:

- **retrieve dataset** from Sumedha & Alberto

    - **clean dataset**: 
        
        - **drop first lines == 0** *(before Covid arrived)*
        
        - **check Nan**: 
- **strategy 1 country by country** sequences split as follow:

- **strategy 2 one sequence per country**:
    - **split X train, set** 
    - **Pad sequences**
    - **create one csv per country**

## Training strategies:
- Get NB dataset (cleaned) from Alberto & Sumedha
- 1/ Indicator in precentage %
- 2/ Indicator as categorical labels
- Run same RNN model in parallel with Kim & Thomas
- Identify best dataset
- Parameters to fit:
    - increase **nb of sequences**
    - train series modulation (ex: [50, 150, 200, 300, 400 nb of days = n_obs]) < take time to compute
    - **learning_rate** in Optimizer(parameters)
    - model layers architecture (**simple** -> complex) (less data -> more data) (print(loss) function check lecture)
        > LSTM
        > Dense
       (> LSTM
        > LSTM
        > Dense)
     >> **try to overfit** the model with the loss (train over val) or (early_stopping)
     >> **(X_val, y_val)**

### Feature Selection

### Time Series **Analysis** & **Preparation** to training

### Train Splitting

Split the dataset into training, validation, and test datasplit the dataset into training, validation, and test data

In [4]:
# Alberto train set
X_train, y_train, X_val, y_val, X_test, y_test = train_test_set('United States', split_train=0.7, split_val=0.9)


In [12]:
# Alberto train set
X_train, y_train = get_X_y_2(X_train, y_train, 
                            X_len=n_obs, y_len=n_pred, n_sequences=n_seq)

X_test, y_test = get_X_y_2(X_test, y_test, 
                           X_len=n_obs, y_len=n_pred, n_sequences=n_seq_test)
X_val, y_val = get_X_y_2(X_val, y_val, 
                         X_len=n_obs, y_len=n_pred, n_sequences=n_seq_val)

# np.ndim(X_train), np.ndim(y_train), np.ndim(X_val), np.ndim(y_val), np.ndim(X_test), np.ndim(y_test)

ValueError: high <= 0

In [None]:
np.ndim(X_train), np.ndim(y_train), np.ndim(X_val), np.ndim(y_val), np.ndim(X_test), np.ndim(y_test)

In [11]:
# Alberto train set
n_seq = 200 ## nb of sequences (samples)
n_obs = 150 # 150 days of training period (observations)
n_feat = X_train.shape[1] # 20 feature:
n_pred = 1 # nb of days where we can predict new daily deaths
n_seq_val = n_seq // 7 # number of sequences in test set ?
n_seq_test = n_seq // 10 # number of sequences in test set ?
n_seq_test, n_feat

(20, 20)

In [10]:
n_seq = 200 # ts_cases.shape[0] - 1 # nb of countries (samples)
n_obs = 150 # 15 days of training period (observations)
n_feat = df_ts_usa_index.shape[1] # 1 # 1feature: covid cases
n_pred = 1 # nb of days of prediction
# length of a sequence
len_seq = n_obs + n_pred
# Ex: 16 = 15 + 1
len_seq, n_feat

NameError: name 'df_ts_usa_index' is not defined

Let's not cross-validate in this challenge to start with 🤯 
- Separate `df` into `df_train` and `df_test` such that the first 80% of the dataframe is in the training, and the last 20% in the test set.
- Then generate (`X_train`, `y_train`) from `df_train` and (`X_test`, `y_test`) from `df_test`
- Ensure that `X_train.shape == (50, 15, 1)`

len_ = int(0.8*ts_cases.shape[0])
df_train = ts_cases[:len_] ; df_test = ts_cases[len_:]
df_train.shape, df_test.shape

In [None]:
X_train, y_train = get_X_y(df_train, n_seq, len_seq, feature=y_seq)
X_test, y_test = get_X_y(df_test, n_seq_test, len_seq, feature=y_seq)

In [None]:
X_train, y_train = get_X_y(df_train, n_seq, len_seq, feature=y_seq)
X_test, y_test = get_X_y(df_test, n_seq_test, len_seq, feature=y_seq)

In [None]:
X_train.shape, X_test.shape

#### Create sequences (`X`,`y`)

##### Generates an entire dataset of multiple subsamples with shape $(X, y)$

##### **subsample_sequence(df, length)**

function that given a full dataframe `df`:
- Create a sub sequences df, with long length?

In [None]:
np.random.randint(0, 259)

In [7]:
def subsample_sequence_2(X, y, X_len, y_len) -> pd.DataFrame:
    """
    Given the initial dataframe `df`, return a shorter dataframe sequence of length `length` (eg n_obs).
    This shorter sequence should be selected at random
    """
    last_possible = X.shape[0] - X_len - y_len
    # How to split sequences? we could do it manually...
    random_start = np.random.randint(0, last_possible)
    # X start and y end
    X_sample = X[random_start : random_start + X_len]
    y_sample = y[random_start : random_start + X_len + y_len]
    
    return np.array(X_sample), np.array(y_sample)



In [None]:
def subsample_sequence(df, length) -> pd.DataFrame:
    """
    Given the initial dataframe `df`, return a shorter dataframe sequence of length `length` (eg n_obs).
    This shorter sequence should be selected at random
    """
    last_possible = df.shape[0] - length
    # How to split sequences? we could do it manually...
    random_start = np.random.randint(0, last_possible)
    df_sample = df[random_start: random_start+length]
    
    return df_sample

In [None]:
# Test it 
# assert subsample_sequence(vn_ts_cases, 10).shape  == (10, 1)
# assert subsample_sequence(vn_ts_cases, 400).shape == (400, 1)
subsample_sequence(df_ts_usa_index, 10).shape, subsample_sequence(df_ts_usa_index, 400).shape

##### **split_subsample_sequence(df,  length, sequence='VNM', df_mean=None)**

function that given a full dataframe `df`:
- Create a sub sequences df
- Stores the value of the covid deaths* (or cases) of the last day as your variable array `y`
- Stores all features of previous days as a variable `X`
- Returns (`X`, `y`)

In [None]:
def split_subsample_sequence_2(X, y,  length, y_size=1) -> tuple:
    '''
    Create one single random (X_sample, y_sample)
    containing one sequence each of length `length`
    ToDo: Adapt the y size=-1'''
    # Trick to save time during potential recursive calls
    # if df_mean is None:
    #     df_mean = df.mean()
    X_subsample, y_subsample = subsample_sequence_2(X, y, X_len=n_obs, y_len=n_pred)
   

    return np.array(X_sample), np.array(y_sample)

In [None]:
def split_subsample_sequence(df,  length, feature='VNM', y_size=1) -> tuple:
    '''
    Create one single random (X_sample, y_sample)
    containing one sequence each of length `length`
    ToDo: Adapt the y size=-1'''
    # Trick to save time during potential recursive calls
    # if df_mean is None:
    #     df_mean = df.mean()
    df_subsample = subsample_sequence(df, length)
    y_sample = df_subsample.iloc[length - y_size][feature] # ['VNM'] ['VNM_covid_cases', 'VNM_covd_deaths'] 
    # Case y_sample is NaN: redraw !
    # if y_sample != y_sample: # A value is not equal to itself only for NaN
    #         X_sample, y_sample = split_subsample_sequence(df, length, df_mean) # Recursive call !!!
    #         return np.array(X_sample), np.array(y_sample)    
    X_sample = df_subsample[0:length - y_size]
    # Case X_sample has some NaNs
    # if X_sample.isna().sum().sum() !=0:
    #    X_sample = X_sample.fillna(compute_means(X_sample, df_mean))
    #    X_sample = X_sample.values

    return np.array(X_sample), np.array(y_sample)

In [None]:
# Test it
print(y_seq)
(X_sample, y_sample) = split_subsample_sequence(df_ts_usa_index, feature=y_seq, length=len_seq, y_size=n_pred)
X_sample.shape, y_sample.shape

##### **get_X_y(df, n_sequences, length)**

function to generates an entire dataset of multiple subsamples suitable for RNN, that is, $(X, y)$ of shape:

```python
X.shape = (n_sequences, length, n_features)
y.shape = (n_sequences, )
```

In [6]:
def get_X_y_2(X, y, X_len, y_len, n_sequences) -> tuple:
    '''Return a list of samples (X, y)'''
    X_list, y_list = [], []

    for i in range(n_sequences):
        (xi, yi) = subsample_sequence_2(X, y, X_len=X_len, y_len=y_len)
        X_list.append(xi)
        y_list.append(yi)
        
    X = np.array(X_list)
    y = np.array(y_list)

    return X, y

In [None]:
def get_X_y(df, n_sequences, length, feature='VNM') -> tuple:
    '''Return a list of samples (X, y)'''
    X, y = [], []

    for i in range(n_sequences):
        (xi, yi) = split_subsample_sequence(df, length, feature=feature)
        X.append(xi)
        y.append(yi)
        
    X = np.array(X)
    y = np.array(y)

    return X, y

Generate your dataset $(X, y)$ of `50` sequences, each of `15` observations + the covid cases at the 16-th day to predict

n_seq = 50 # ts_cases.shape[0] - 1 # nb of countries (samples)
n_obs = 15 # 15 days of training periiod (observations)
n_feat = 1 # 1feature: covid cases
n_pred = 1 # nb of days of prediction
len_seq = 16 # length of a sequence (len_seq = n_obs + n_pred/ Ex: 16 = 15 + 1)

In [None]:
X, y = get_X_y(df_ts_usa_index, n_sequences=n_seq, length=len_seq, feature=y_seq)
print(X.shape) ; print(y.shape)

In [None]:
##### Test it
(X_sample, y_sample) = split_subsample_sequence(vn_ts_cases, length=len_seq, y_size=n_pred)
###### assert X_sample.shape == (n_obs,n_feat)
###### assert y_sample.shape == ()
X_sample.shape, y_sample.shape

In [None]:
# Check your code below
assert X.shape == (50, 15, 1)
assert y.shape == (50, )
assert np.isnan(X).sum() == 0

### How to split sequences?



- randomly or

- manually

### Samples/Sequences, Observations, Features

##### **train_rnn_model(model, patience=2, epochs=200):**

function to generates an entire dataset of multiple subsamples suitable for RNN, that is, $(X, y)$ of shape:

```python
X.shape = (n_sequences, length, n_features)
y.shape = (n_sequences, )
```

In [None]:
def train_rnn(X_val=0, y_val=0):
    [print(f'validation_data=(X_val, y_val),') if (X_val!=0 or y_val!=0) else print(f'validation_split=0.1,')]
    return True

train_rnn(),train_rnn((1),(0))
    

In [None]:
# 3. Training
def train_rnn_model(model, patience=20, epochs=200, X_val=0, y_val=0):
    """ function that train a RNN model with hyperparameters:
    - patience by default 2 to early stop
    - epochs by default 200 to train over several epochs
    - valisation data by default (X_val, y_val)=(0, 0) in case of auto split
    """
    es = EarlyStopping(monitor = 'val_loss',
                    patience = patience,
                    verbose = 0,
                    restore_best_weights = True)
    # The fit
    history =  model.fit(X_train,
            y_train, 
             # Auto split for validation data
            [print(f'validation_data=(X_val, y_val),') if (X_val!=0 or y_val!=0) else print(f'validation_split=0.1,')]
            batch_size = 16,
            epochs = epochs,
            callbacks = [es],
            verbose=1)
    return history


print(type(overfit_es), overfit_es)
# if overfit_es:
#     print("early stopping")
#     history = train_rnn_model(patience=overfit_es)
# else:
# print("No early stopping")
# history = train_rnn_model()

In [None]:
# 3. Training
def train_rnn_model(model, patience=20, epochs=200):
    es = EarlyStopping(monitor = 'val_loss',
                    patience = patience,
                    verbose = 0,
                    restore_best_weights = True)
    # The fit
    history =  model.fit(X_train,
            y_train, 
            validation_split=0.1, # Auto split for validation data
                ## validation_data = (X_val, y_val), # To be created manually if needed
            batch_size = 16,
            epochs = epochs,
            callbacks = [es],
            verbose=1)
    return history


print(type(overfit_es), overfit_es)
# if overfit_es:
#     print("early stopping")
#     history = train_rnn_model(patience=overfit_es)
# else:
# print("No early stopping")
# history = train_rnn_model()

### Normalization

#### Normalization layer

In [None]:
# Alberto model #4 test 
# Normalization layer not necessary as X_train already train
# normalizer = Normalization()  # Instantiate a "normalizer" layer
# normalizer.adapt(X_train) # "Fit" it on the train set
# 1. The Architecture
"""   - 3rd model layers architecture (simple -> complex) (less data -> more data) (print(loss) function check lecture)
> LSTM
"""
rnn_model_4 = Sequential()
# rnn_model_4.add(normalizer) # Using the Normalization layer to standardize the datapoints during the forward pass
# Input len(train) (input_shape=(?,?))
rnn_model_4.add(LSTM(units=n_feat, activation='relu'))  ## , input_shape=(?,?))) without a Normalizer layer
# output return sequences = True
rnn_model_4.add(Dense(10, activation = 'relu')) ## add 1 or more 'relu' layers
# Output 10 only, no more RNN just dropout()
# rnn_model_3.add(layers.Dropout(0.3)) ## if RNN model over-fit
rnn_model_4.add(Dense(n_pred, activation = 'linear'))
#ValueError: Input 0 of layer "lstm_1" is incompatible with the layer:
#     >>> expected ndim=3, found ndim=2. Full shape received: (None, 20)#
# 2. Compiling with 'rmsprop' rather than 'adam' (recommended)
optimizer = RMSprop(
                learning_rate=0.001,
                rho=0.9,
                momentum=0.0,
                epsilon=1e-07,
                centered=False
            )
rnn_model_4.compile(loss='mse',
              optimizer= optimizer, # optimizer='rmsprop'    <- adapt learning rate
                 metrics='mape')  # Recommended optimizer for RNNs
rnn_model_4.summary()

# 3. Training
history = train_rnn_model(model=rnn_model_4, epochs=200, patience=3)
plt.plot(history.history['mape'])
plt.plot(history.history['val_mape'])
plt.show();

# 4. Evaluating
# The prediction (one per sequence/city)
y_pred = rnn_model.predict(X_test) 
print(y_pred.shape)
# Distribution of the predictions
pd.DataFrame(y_pred).mean().sort_values(ascending=False)

In [None]:
# 0. The Normalization Layer
normalizer = Normalization()  # Instantiate a "normalizer" layer
normalizer.adapt(X_train) # "Fit" it on the train set

### RNN model #3 architecture 

In [None]:
# 1. The Architecture
"""   - 3rd model layers architecture (simple -> complex) (less data -> more data) (print(loss) function check lecture)
> LSTM
"""
rnn_model_3 = Sequential()
rnn_model_3.add(normalizer) # Using the Normalization layer to standardize the datapoints during the forward pass
# Input len(train) (input_shape=(?,?))
rnn_model_3.add(LSTM(units=30, activation='tanh'))  ## , input_shape=(?,?))) without a Normalizer layer
# output return sequences = True
rnn_model_3.add(Dense(10, activation = 'relu')) ## add 1 or more 'relu' layers
# Output 10 only, no more RNN just dropout()
# rnn_model_3.add(layers.Dropout(0.3)) ## if RNN model over-fit
rnn_model_3.add(Dense(n_pred, activation = 'linear'))

### Model #1 evaluating

In [None]:
# 4. Evaluating
# The prediction (one per sequence/city)
y_pred = rnn_model.predict(X_test) 
print(y_pred.shape)
# Distribution of the predictions
pd.DataFrame(y_pred).mean().sort_values(ascending=False)

### Time Series Forecasting with model #1

### Compile model #3 with 'rmsprop'

In [None]:
# 2. Compiling with 'rmsprop' rather than 'adam' (recommended)
optimizer = RMSprop(
                learning_rate=0.001,
                rho=0.9,
                momentum=0.0,
                epsilon=1e-07,
                centered=False
            )
rnn_model_3.compile(loss='mse',
              optimizer= optimizer, # optimizer='rmsprop'    <- adapt learning rate
                 metrics='mape')  # Recommended optimizer for RNNs
rnn_model_3.summary()

In [None]:
rnn_model.summary()

### Train model #2

In [None]:
history = train_rnn_model(rnn_model, patience=5, epochs=200)
plt.plot(history.history['mape'])
plt.plot(history.history['val_mape'])
plt.show();
type(history)

In [None]:
trai### Train model #1n_series = [50, 150, 200, 300, 400]
overfit_es =   [2, 6, 6, 5, 6 ]
print('type(overfit_es), overfit_es', type(overfit_es), overfit_es)
# if overfit_es:
#     print("early stopping")
#     history = train_rnn_model(patience=overfit_es)
# else:
# print("No early stopping")
for i in range(len(train_series)):
    history = train_rnn_model(model=rnn_model_2, epochs=train_series[i], patience=overfit_es[i])
    plt.plot(history.history['mape'])
    plt.plot(history.history['val_mape'])
    plt.show();

### RNN model #3 architecture 

#### 🚀 The **LSTM (= Long Short Term Memory)** with their ability to *avoid the vanishing gradient problem*, should be preferred over a SimpleRNN.

In [None]:
# 1. The Architecture
"""   - 3rd model layers architecture (simple -> complex) (less data -> more data) (print(loss) function check lecture)
> LSTM
"""
rnn_model_3 = Sequential()
rnn_model_3.add(normalizer) # Using the Normalization layer to standardize the datapoints during the forward pass
# Input len(train) (input_shape=(?,?))
rnn_model_3.add(LSTM(units=30, activation='tanh'))  ## , input_shape=(?,?))) without a Normalizer layer
# output return sequences = True
rnn_model_3.add(Dense(10, activation = 'relu')) ## add 1 or more 'relu' layers
# Output 10 only, no more RNN just dropout()
# rnn_model_3.add(layers.Dropout(0.3)) ## if RNN model over-fit
rnn_model_3.add(Dense(n_pred, activation = 'linear'))

### Compile model #3 with 'rmsprop'

In [None]:
# 2. Compiling with 'rmsprop' rather than 'adam' (recommended)
optimizer = RMSprop(
                learning_rate=0.001,
                rho=0.9,
                momentum=0.0,
                epsilon=1e-07,
                centered=False
            )
rnn_model_3.compile(loss='mse',
              optimizer= optimizer, # optimizer='rmsprop'    <- adapt learning rate
                 metrics='mape')  # Recommended optimizer for RNNs
rnn_model_3.summary()

### Train model #2

### RNN model #3 architecture 

In [None]:
# 1. The Architecture
"""   - 3rd model layers architecture (simple -> complex) (less data -> more data) (print(loss) function check lecture)
> LSTM
"""
rnn_model_3 = Sequential()
rnn_model_3.add(normalizer) # Using the Normalization layer to standardize the datapoints during the forward pass
# Input len(train) (input_shape=(?,?))
rnn_model_3.add(LSTM(units=30, activation='tanh'))  ## , input_shape=(?,?))) without a Normalizer layer
# output return sequences = True
rnn_model_3.add(Dense(10, activation = 'relu')) ## add 1 or more 'relu' layers
# Output 10 only, no more RNN just dropout()
# rnn_model_3.add(layers.Dropout(0.3)) ## if RNN model over-fit
rnn_model_3.add(Dense(n_pred, activation = 'linear'))

### Compile model #3 with 'rmsprop'

In [None]:
# 2. Compiling with 'rmsprop' rather than 'adam' (recommended)
optimizer = RMSprop(
                learning_rate=0.001,
                rho=0.9,
                momentum=0.0,
                epsilon=1e-07,
                centered=False
            )
rnn_model_3.compile(loss='mse',
              optimizer= optimizer, # optimizer='rmsprop'    <- adapt learning rate
                 metrics='mape')  # Recommended optimizer for RNNs
rnn_model_3.summary()

### Train model #3

In [None]:
# 3. Training
from typing import overload

def train_rnn_model(rnn_model_3, patience=2, epochs=200, (X_val, y_val)=(0, 0)):
    es = EarlyStopping(monitor = 'val_loss',
                    patience = patience,
                    verbose = 0,
                    restore_best_weights = True)
    # The fit
    history =  rnn_model_3.fit(X_train, y_train, 
            validation_split=0.1, # Auto split for validation data
                ## validation_data = (X_val, y_val), # To be created manually if needed
            batch_size = 16,
            epochs = epochs,
            callbacks = [es],
            verbose=1)
    return history


print(type(overfit_es), overfit_es)
# if overfit_es:
#     print("early stopping")
#     history = train_rnn_model(patience=overfit_es)
# else:
# print("No early stopping")
# history = train_rnn_model()

### Train model #2

In [None]:
# min(history.history['mape'])

# print("adjust early stopping")
# overfit_es = [d[0]+1 for d in enumerate(history.history['mape']) if d[1] == min(history.history['mape'])][0]
# overfit_es

In [None]:
min(history.history['mape']), max(history.history['mape']), history.history['mape'] # blue line

In [None]:
max(history.history['val_mape']), history.history['val_mape'] # orange line

### Model #1 evaluating

In [None]:
# 4. Evaluating
# The prediction (one per sequence/city)
y_pred = rnn_model.predict(X_test) 
print(y_pred.shape)
# Distribution of the predictions
pd.DataFrame(y_pred).mean().sort_values(ascending=False)

### Time Series Forecasting with model #1

In [None]:
# Check your code below
assert y_pred.shape == (n_seq_test, n_pred)
# Distribution of the real values y_train
pd.DataFrame(y_train).mean().sort_values(ascending=False)

In [None]:
# Distribution of the real values y_train
pd.DataFrame(y_train).mean().sort_values(ascending=False)

In [None]:
trai### Train model #1n_series = [50, 150, 200, 300, 400]
overfit_es =   [2, 6, 6, 5, 6 ]
print('type(overfit_es), overfit_es', type(overfit_es), overfit_es)
# if overfit_es:
#     print("early stopping")
#     history = train_rnn_model(patience=overfit_es)
# else:
# print("No early stopping")
for i in range(len(train_series)):
    history = train_rnn_model(model=rnn_model_2, epochs=train_series[i], patience=overfit_es[i])
    plt.plot(history.history['mape'])
    plt.plot(history.history['val_mape'])
    plt.show();