## Task II - Using Recurrent Neural Networks (RNNs) for forecasting

In [13]:
import numpy as np

In [2]:
import pandas as pd

#Reading the data
X_KV = pd.read_csv('../X_KV.csv')   #JNSM_KV_flashcrowd_1 
Y_KV = pd.read_csv('../Y_KV.csv')   
X_VoD = pd.read_csv('../X_VoD.csv') #JNSM_VoD_flashcrowd_1 
Y_VoD = pd.read_csv('../Y_VoD.csv') 

X_KV_2 = pd.read_csv('../X_KV_2.csv')    #JNSM_KV_flashcrowd_2
Y_KV_2 = pd.read_csv('../Y_KV_2.csv')    
X_VoD_2 = pd.read_csv('../X_VoD_2.csv')  #JNSM_VoD_flashcrowd_2
Y_VoD_2 = pd.read_csv('../Y_VoD_2.csv') 

#Concatenate datasets
X_KV = pd.concat([X_KV, X_KV_2], ignore_index=True)
Y_KV = pd.concat([Y_KV, Y_KV_2], ignore_index=True)
X_VoD = pd.concat([X_VoD, X_VoD_2], ignore_index=True)
Y_VoD = pd.concat([Y_VoD, Y_VoD_2], ignore_index=True)

#Remove the first two columns that index the samples and retrieve all other values by using iloc()
X_KV = X_KV.iloc[:,2:]  
Y_KV = Y_KV.iloc[:, 2:] 
X_VoD = X_VoD.iloc[:,2:]
Y_VoD = Y_VoD.iloc[:, 2:]

In [5]:
%store -r X_KV_train X_KV_test Y_KV_train Y_KV_test
%store -r X_VoD_train X_VoD_test Y_VoD_train Y_VoD_test

### Task II.2 
Create a new training set and a new test set with dimensions dimensions (#samples, #times_steps, #features) and samples of structure ([x(t−l), ..., x(t)], [y(t), ..., y(t+h)]).

In [58]:
#For KV service
#We want dimensions (#samples, #times_steps, #features) as input for LSTM network

def X_KV(samples, l, h):
    if (l>=0 and h>0) or (l>0 and h>=0): 
        matrix = np.empty((samples.shape[0]-l-h,(l+1),samples.shape[1]))
        for i in range(0, samples.shape[0]-h-l):
            for j in range(0,l+1): 
                matrix[i][j] = samples[i+j]
        return matrix
    if l==0 and h==0: 
        return samples

def Y_KV(targets, l, h):
    targets = np.array(targets)
    
    if (l>=0 and h>0) or (l>0 and h>=0): 
        matrix = np.empty((targets.shape[0]-l-h,h+1))   
        for i in range(0, targets.shape[0]-h-l):
            matrix[i] = np.concatenate([[targets[j]] for j in range(i,i+h+1)])
        return matrix
    if l==0 and h==0: 
        return targets

In [59]:
#For VoD service

#We want dimensions (#samples, #times_steps, #features) as input for LSTM network
def X_VoD(samples, l, h, s):
    if s<=0:
        print('The step size must be >=1.')
        
    if (l>=0 and h>0) or (l>0 and h>=0): 
        matrix = np.empty((samples.shape[0]-s*l-s*h,(l+1),samples.shape[1]))
        for i in range(0, samples.shape[0]-s*l-s*h):
            for j in range(0,l+1):
                matrix[i][j] = samples[i+j*s]
        return matrix
    if l==0 and h==0: 
        return samples
    
def Y_VoD(targets, l, h, s):
    targets = np.array(targets)
    if s<= 0: 
        print('The step size must be >=1.')
        
    if (l>=0 and h>0) or (l>0 and h>=0): 
        matrix = np.empty((targets.shape[0]-s*l-s*h,h+1))   
        for i in range(0, targets.shape[0]-s*h-s*l):
            matrix[i] = np.concatenate([[targets[j]] for j in range(i,i+s*h+1,s)])
        return matrix
    if l==0 and h==0: 
        return targets

### Task II.3 

In [47]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense

def train_and_predict(x_train, y_train, x_test, y_test):
    
    # design network
    model = Sequential()
    model.add(LSTM(50, return_sequences = False, input_shape=(x_train.shape[1], x_train.shape[2])))
    model.add(Dense(11))
    model.compile(loss='mae', optimizer='adam', metrics=["mse"])
    
    # fit network
    history = model.fit(x_train, y_train, epochs=50, batch_size=72, validation_data=(x_test, y_test), verbose=2, shuffle=False)
    
    # model prediction
    prediction = model.predict(x_test)
    
    return prediction

In [63]:
#KV service

#Predictions given different lag values and fixed h=10 
def pred_KV(l):
    return train_and_predict(X_KV(X_KV_train,l,10), Y_KV(Y_KV_train,l,10), X_KV(X_KV_test,l,10), Y_KV(Y_KV_test,l,10))

#Compute predictions for l=0,1,...,10 to save computational time later on because we only have to train the 11 networks once
pred = []
for l in range(0,11):
    pred.append(pred_KV(l))

Epoch 1/50
183/183 - 2s - loss: 54.1001 - val_loss: 51.8165
Epoch 2/50
183/183 - 1s - loss: 49.7181 - val_loss: 47.4719
Epoch 3/50
183/183 - 1s - loss: 45.8102 - val_loss: 43.8261
Epoch 4/50
183/183 - 1s - loss: 42.3413 - val_loss: 40.4511
Epoch 5/50
183/183 - 1s - loss: 39.0610 - val_loss: 37.2104
Epoch 6/50
183/183 - 1s - loss: 35.8793 - val_loss: 34.0430
Epoch 7/50
183/183 - 1s - loss: 32.7504 - val_loss: 30.9122
Epoch 8/50
183/183 - 1s - loss: 29.6478 - val_loss: 27.8016
Epoch 9/50
183/183 - 1s - loss: 26.5414 - val_loss: 24.5854
Epoch 10/50
183/183 - 1s - loss: 22.7009 - val_loss: 19.7343
Epoch 11/50
183/183 - 1s - loss: 17.3993 - val_loss: 13.8002
Epoch 12/50
183/183 - 1s - loss: 11.2712 - val_loss: 8.5995
Epoch 13/50
183/183 - 1s - loss: 8.3577 - val_loss: 7.4873
Epoch 14/50
183/183 - 1s - loss: 7.4121 - val_loss: 6.8096
Epoch 15/50
183/183 - 1s - loss: 6.6664 - val_loss: 6.1701
Epoch 16/50
183/183 - 1s - loss: 5.9473 - val_loss: 5.5090
Epoch 17/50
183/183 - 1s - loss: 5.2098 - 

In [87]:
from sklearn.metrics import mean_absolute_error

pred_KV = np.array(pred) #Turn into array to access the predictions
%store pred_KV

#Function that outputs the NMAE for a given horizon value
#Outputs array of shape (,11), at each position there is the nmae for the respective lag value
def errors_KV(h):
    if h in range(0,11):
        errors = np.empty(11)
        for l in range(0,11):
            errors[l] = (1/np.mean(Y_KV(Y_KV_test,l,10)[:,:h+1]))*mean_absolute_error(pred_KV[l][:,:h+1],Y_KV(Y_KV_test,l,10)[:,:h+1])
        return errors
    else: 
        print('Horizon value needs to be 0,...,10')

Stored 'pred_KV' (ndarray)


In [90]:
#VoD service 

#Choosing s = 30

#Predictions given different lag values and fixed h=10 
def pred_VoD(l):
    return train_and_predict(X_VoD(X_VoD_train,l,10,30), Y_VoD(Y_VoD_train,l,10,30), X_VoD(X_VoD_test,l,10,30), Y_VoD(Y_VoD_test,l,10,30))


#Compute predictions for l=0,1,...,10 to save computational time later on because we only have to train the 11 networks once
pred = []
for l in range(0,11):
    pred.append(pred_VoD(l))
    
pred_VoD = np.array(pred) #Turn into array to access the predictions at different lag values
%store pred_VoD

Epoch 1/50
333/333 - 3s - loss: 20.2908 - val_loss: 16.8230
Epoch 2/50
333/333 - 2s - loss: 13.4877 - val_loss: 10.1741
Epoch 3/50
333/333 - 2s - loss: 7.5303 - val_loss: 6.1221
Epoch 4/50
333/333 - 2s - loss: 4.7052 - val_loss: 4.4591
Epoch 5/50
333/333 - 2s - loss: 3.1426 - val_loss: 3.4580
Epoch 6/50
333/333 - 2s - loss: 2.2947 - val_loss: 2.8857
Epoch 7/50
333/333 - 2s - loss: 1.9804 - val_loss: 2.7129
Epoch 8/50
333/333 - 2s - loss: 1.9162 - val_loss: 2.6446
Epoch 9/50
333/333 - 2s - loss: 1.8871 - val_loss: 2.5795
Epoch 10/50
333/333 - 2s - loss: 1.8630 - val_loss: 2.5374
Epoch 11/50
333/333 - 2s - loss: 1.8497 - val_loss: 2.5198
Epoch 12/50
333/333 - 2s - loss: 1.8410 - val_loss: 2.5062
Epoch 13/50
333/333 - 2s - loss: 1.8358 - val_loss: 2.4973
Epoch 14/50
333/333 - 2s - loss: 1.8318 - val_loss: 2.4945
Epoch 15/50
333/333 - 2s - loss: 1.8290 - val_loss: 2.4927
Epoch 16/50
333/333 - 2s - loss: 1.8272 - val_loss: 2.4905
Epoch 17/50
333/333 - 2s - loss: 1.8254 - val_loss: 2.4866
Ep

In [102]:
#Function that outputs the NMAE for a given horizon value
#Outputs array of shape (,11), at each position there is the nmae for the respective lag value
def errors_VoD(h):
    if h in range(0,11):
        errors = np.empty(11)
        for l in range(0,11):
            errors[l] = (1/np.mean(Y_VoD(Y_VoD_test,l,10,30)[:,:h+1]))*mean_absolute_error(pred_VoD[l][:,:h+1],Y_VoD(Y_VoD_test,l,10,30)[:,:h+1])
        return errors
    else: 
        print('Horizon value needs to be 0,...,10')

array([0.11458934, 0.11460219, 0.11535051, 0.11451209, 0.11345963,
       0.11337845, 0.11297954, 0.11330321, 0.11318932, 0.1136309 ,
       0.11253321])

In [103]:
#Matrices for each service that display the NMAE given h = 0,1,...,10 (rows) and l = 0,1,...,10 (columns)

error_matrix_KV = []
error_matrix_VoD = []
for h in range(0,11):
    error_matrix_KV.append(errors_KV(h))
    error_matrix_VoD.append(errors_VoD(h))

In [105]:
#Instaling tabulate to obtain the Latex Script
pip install tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [112]:
#Obtain Latex code 

import tabulate
from tabulate import tabulate

np.set_printoptions(precision=0)
error_matrix_KV = np.array(error_matrix_KV)*10
error_matrix_VoD = np.array(error_matrix_VoD)*10

print(tabulate(error_matrix_KV, tablefmt="latex", floatfmt=".0f"))
print(tabulate(error_matrix_VoD, tablefmt="latex", floatfmt=".0f"))

\begin{tabular}{rrrrrrrrrrr}
\hline
 2163 & 2206 & 2209 & 2275 & 2274 & 2294 & 2288 & 2231 & 2312 & 2419 & 2232 \\
 2154 & 2182 & 2197 & 2251 & 2255 & 2277 & 2260 & 2223 & 2299 & 2395 & 2215 \\
 2147 & 2170 & 2184 & 2230 & 2237 & 2257 & 2241 & 2212 & 2289 & 2390 & 2199 \\
 2148 & 2165 & 2177 & 2215 & 2219 & 2243 & 2227 & 2215 & 2275 & 2383 & 2189 \\
 2161 & 2171 & 2176 & 2206 & 2203 & 2232 & 2219 & 2221 & 2263 & 2384 & 2184 \\
 2178 & 2185 & 2187 & 2204 & 2202 & 2225 & 2216 & 2219 & 2252 & 2390 & 2182 \\
 2198 & 2201 & 2201 & 2212 & 2205 & 2223 & 2213 & 2218 & 2241 & 2396 & 2185 \\
 2219 & 2219 & 2219 & 2225 & 2212 & 2224 & 2217 & 2219 & 2230 & 2401 & 2187 \\
 2240 & 2237 & 2236 & 2237 & 2222 & 2231 & 2220 & 2222 & 2222 & 2402 & 2191 \\
 2260 & 2257 & 2254 & 2251 & 2236 & 2244 & 2229 & 2228 & 2219 & 2407 & 2194 \\
 2279 & 2273 & 2271 & 2265 & 2249 & 2257 & 2240 & 2239 & 2222 & 2413 & 2196 \\
\hline
\end{tabular}
\begin{tabular}{rrrrrrrrrrr}
\hline
 1131 & 1135 & 1142 & 1123 & 1104 & 10