# 3 LSTM
The intuition behind using an LSTM is that time series forecasting, at its core, is a sequence-to-sequence prediction problem, which falls under the domain of recurrent neural networks (RNNs).

In [80]:
%%capture
import pandas as pd
import numpy as np
import datetime as dt

from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences # extends sequences (vectors) to have the same length
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import model_from_json
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional

import pickle # to im- and export the tokanizer

from tqdm import tqdm_notebook as tqdm

## 3.1 Pre-processing
One requirement for getting reliable results when using RNNs for time series predictions is that the interval between consecutive steps is always the same. In our training data some rows are missing, but we are told that a "missing" row can be treated as a demand of 0.

Afterwards, we have to split our data into training, test, and validation sets. The rules state that we can use up to 14 consecutive days to predict the following 5 days. At 96 data points per day, that comes out to an input size of 1344 and an output size of 480.

In [56]:
# Load the raw training data; the following cells read its geohash6 and datetime columns.
df = pd.read_csv('training.csv')

In [57]:
# Every geohash should have one row per 15-minute bucket of the observation
# window. Buckets that are absent from the data are appended as rows with a
# demand of 0, by comparing each geohash's datetimes with the full date_range.
period_start = pd.Timestamp('2019-03-01 00:00:00')
period_end = pd.Timestamp('2019-04-30 23:45:00')

# All expected datetimes for our timeframe
all_dt = pd.date_range(start=period_start, end=period_end, freq='15min')

missing_rows = []
# A single groupby pass replaces one full-frame boolean mask per geohash.
# sort=False keeps the geohashes in order of first appearance.
# NOTE: groupby skips rows whose geohash6 is missing (NaN).
for ghash, geo_datetimes in tqdm(df.groupby('geohash6', sort=False)['datetime']):
    missing_dates = all_dt.difference(pd.DatetimeIndex(geo_datetimes))
    missing_rows.extend(
        {'geohash6': ghash,
         # Day numbering is 1-based relative to the start of the window.
         'day': (date - period_start).days + 1,
         'timestamp': date.strftime('%H:%M'),
         'demand': 0,
         'datetime': date}
        for date in missing_dates
    )

# Combine the dataset with the missing rows
df = pd.concat([df, pd.DataFrame(missing_rows)], sort=False)

HBox(children=(IntProgress(value=0, max=1329), HTML(value='')))




In [35]:
# Reload the gap-filled dataset from disk, replacing the in-memory `df`.
# NOTE(review): this looks like a shortcut to skip recomputing the gap filling;
# in top-to-bottom order it runs before the cell that writes this file — confirm
# the intended cell order.
df = pd.read_csv('Grab_train_filled.csv')

In [58]:
# Persist the current `df` to CSV, leaving out the index column.
df.to_csv('Grab_train_filled.csv', index=False)

In [59]:
# Rows up to and including day 54 form the training set; the remaining week
# is held back for test and validation.
is_train_day = df['day'] <= 54
df_train = df.loc[is_train_day]

In [None]:
# Next we have to construct the input and the training output. Since demand is already in float format, no further transformation is needed.
# For the input we extract 1344-grams, and for the output the 480-grams that follow them.

In [72]:
# Build a standalone frame for the single geohash 'qp03wc', parse its datetime
# column, and order it chronologically (ascending, i.e. oldest entries first).
df_temp = (
    df_train[df_train.geohash6 == 'qp03wc']
    .assign(datetime=lambda frame: pd.to_datetime(frame.datetime))
    .sort_values(by='datetime')
)

In [85]:
# Scratch check: slicing with [-2:] keeps the last two elements of a list.
[0.05485798, 0.08620924, 0.05073921,0.20420064, 0.19579046,0.23865169][-2:]

[0.19579046, 0.23865169]

In [90]:
def generate_ngrams(a, in_len=1344, out_len=480):
    """Lazily produce sliding (input, output) windows over a sequence.

    Each window spans ``in_len + out_len`` consecutive elements of ``a``; the
    first ``in_len`` elements are the model input and the remaining
    ``out_len`` elements are the target. Consecutive windows are shifted by
    one element.

    Parameters
    ----------
    a : sequence
        Sliceable sequence with a length (e.g. a list or a 1-D numpy array).
    in_len : int, default 1344
        Number of elements in the input part of each window.
    out_len : int, default 480
        Number of elements in the output part of each window.

    Returns
    -------
    iterator of (tuple, tuple)
        One ``(input, output)`` pair per window. The iterator is empty when
        ``a`` is shorter than ``in_len + out_len``.

    Raises
    ------
    ValueError
        If ``in_len`` or ``out_len`` is smaller than 1.
    """
    if in_len < 1 or out_len < 1:
        raise ValueError('in_len and out_len must both be at least 1')

    window = in_len + out_len
    n_windows = max(len(a) - window + 1, 0)

    # Yield flat (input, output) pairs; windows are built only when consumed,
    # so the full set is never held in memory at once.
    return (
        (tuple(a[start:start + in_len]), tuple(a[start + in_len:start + window]))
        for start in range(n_windows)
    )

TypeError: 'zip' object is not subscriptable

In [92]:
# Peek at the first generated window only, to sanity-check its structure;
# nothing is printed when no window can be generated.
_no_window = object()
first_window = next(iter(generate_ngrams(df_temp.demand.values)), _no_window)
if first_window is not _no_window:
    print(np.array(first_window))

[[(0.054857976109078575, 0.08620923632207511, 0.05073921349776716, 0.07517419504194835, 0.0628671348329678, 0.056764744507038174, 0.06941742824376898, 0.07985279156700817, 0.11919184760686005, 0.1159551497239064, 0.15035665798618356, 0.1387425491754965, 0.19982220521977828, 0.2411690307725056, 0.18732088721679327, 0.19369278254981587, 0.2678029108129704, 0.27262347210628696, 0.2525619436203057, 0.2477031454053984, 0.25367902953233457, 0.26770051395207106, 0.2409874212154621, 0.3251657683296361, 0.2991415230139928, 0.30805251001349643, 0.2852877011631137, 0.3246462992150791, 0.3456920965862432, 0.3380084227252602, 0.3495032648898867, 0.4550280947185475, 0.3847340798199181, 0.4185358848975457, 0.3863651595209561, 0.3501536940470569, 0.4723451777698381, 0.40241660950359615, 0.35404456365267684, 0.4054735193530796, 0.4390616658030713, 0.4286807546037928, 0.4388545185832316, 0.4794528792315969, 0.4747358350547026, 0.4583502322492236, 0.3983974724878533, 0.4558792565336193, 0.443888209419057