<a href="https://colab.research.google.com/github/TruongPhanNT/PowerBIVN/blob/master/Final_(dataset_full).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Forecast Time Series by Using Common Machine Learning Algorithms

## Nội dung chính


1.   Dự báo trên tập dataset 1 lấy từ Yahoo Finance: Mã stock là Rough Rice Nov 20 (ZR=F)
2.   Fill dữ liệu bằng pchip/ cubic
3.   Xây dựng các mô hình dự báo Bayes Regressor, LSTM, và ARIMA
4.   Split tập training, test set lần lượt theo tỉ lệ 70/30, 80/20 và 90/10
5.   Thử nghiệm và so sánh các mô hình để chọn ra mô hình tối ưu

# 1. Install necessary libraries

In [None]:
!pip install stockai



In [None]:
!pip install pytictoc



# 2. Load libraries

In [None]:
from stockai import Stock
import numpy as np
from numpy import concatenate
from math import sqrt
 
from pandas import DataFrame
from pandas import concat
import pandas as pd 
 
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
 
from sklearn.linear_model import BayesianRidge
from statsmodels.tsa.arima_model import ARIMA
 
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
 
from matplotlib import pyplot
 
import scipy
 
import dateutil
import datetime as dt
import time
from pytictoc import TicToc
t = TicToc() #create instance of class
 
%matplotlib inline

In [None]:
# Raise pandas' display limits so printed frames show up to 100 columns
# and 50 rows before being truncated.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 50)

# 3. Load & convert dataset
 
*   List item
*   List item

## 3.1 Load Market Data from Yahoo Finance

In [None]:
# Rough rice futures: pull the price history for ticker 'ZR=F' via stockai.
td = Stock('ZR=F')
prices_list = td.get_historical_prices('2005-01-01', '2021-01-01')

# Turn the returned records into the working frame and preview it.
df = pd.DataFrame.from_dict(prices_list)
print(df.head(10))

# Count rows holding at least one missing field and express it as a share
# of all rows.
_n_rows = df.shape[0]
_n_missing_rows = df.isnull().any(axis=1).sum()
pc = 100 * _n_missing_rows / _n_rows

print('Rows:', _n_rows)
print('Rows missing value:', _n_missing_rows)
print('Pecentage of missing value rows (%):', pc)

         date  close   high   open    low  volume  adjclose
0  1104728400  686.0  726.0  726.0  725.5     3.0     686.0
1  1104814800  690.0  690.0  690.0  690.0     3.0     690.0
2  1104901200  694.0  694.0  694.0  694.0     3.0     694.0
3  1104987600  706.0  706.0  706.0  706.0     3.0     706.0
4  1105074000  711.0  711.0  711.0  711.0     3.0     711.0
5  1105333200  706.0  706.0  706.0  706.0     3.0     706.0
6  1105419600  693.0  693.0  693.0  693.0     3.0     693.0
7  1105506000  689.0  689.0  689.0  689.0     3.0     689.0
8  1105592400  696.0  696.0  696.0  696.0     3.0     696.0
9  1105678800  695.0  695.0  695.0  695.0     3.0     695.0
Rows: 4049
Rows missing value: 77
Pecentage of missing value rows (%): 1.9017041244751791


**Vì phần trăm dữ liệu missing >5% nên không dùng phương pháp drop được. Tức là df_fill==dropna không dùng**


## 3.2 Define function to fill missing values

In [None]:
def fill_missing_values(df, fill_method='ffill', fill_order=5):
    """Return a new frame with the missing values of ``df`` filled or dropped.

    Args:
        df: pandas DataFrame that may contain NaN entries.
        fill_method: Name of the strategy to apply:
            ``'0'`` replaces NaN with zero;
            ``'mean'`` replaces NaN with the mean of each numeric column;
            ``'dropna'`` removes every row that holds a NaN;
            ``'ffill'`` / ``'bfill'`` propagate the previous / next value;
            ``'linear'``, ``'quadratic'``, ``'cubic'``, ``'nearest'`` call
            ``DataFrame.interpolate`` without an order;
            ``'polynomial'``, ``'piecewise_polynomial'``, ``'pchip'``,
            ``'slinear'`` call ``DataFrame.interpolate`` with ``order``.
        fill_order: Value forwarded as ``order`` for the last group.

    Returns:
        A new DataFrame; ``df`` itself is left untouched.

    Raises:
        ValueError: If ``fill_method`` is not one of the names above
            (previously this surfaced as an UnboundLocalError).
    """
    if fill_method == '0':
        return df.fillna(0)
    if fill_method == 'mean':
        # numeric_only keeps this working when the frame also carries
        # non-numeric columns, which a plain mean() rejects on pandas >= 2.
        return df.fillna(df.mean(numeric_only=True))
    if fill_method == 'dropna':
        return df.dropna()
    if fill_method == 'ffill':
        # dedicated methods replace the deprecated fillna(method=...) form
        return df.ffill()
    if fill_method == 'bfill':
        return df.bfill()
    if fill_method in ('linear', 'quadratic', 'cubic', 'nearest'):
        return df.interpolate(method=fill_method)
    if fill_method in ('polynomial', 'piecewise_polynomial', 'pchip', 'slinear'):
        return df.interpolate(method=fill_method, order=fill_order)
    raise ValueError('Unsupported fill_method: %r' % (fill_method,))

# 4. Split dataset into training set, test set

## 4.1 Feature Engineering for Time Series

In [None]:
# reframe a series as a supervised-learning table
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Build a lag/lead table from a sequence of observations.

    Args:
        data: A list (treated as one variable) or a 2-D array-like whose
            second dimension is the number of variables.
        n_in: Number of lagged steps (t-n_in ... t-1) placed first.
        n_out: Number of forecast steps (t, t+1, ...) placed after the lags.
        dropnan: When true, rows containing NaN (created by shifting) are
            removed.

    Returns:
        DataFrame whose columns are named ``var<k>(t-<i>)``, ``var<k>(t)``
        and ``var<k>(t+<i>)``.
    """
    if type(data) is list:
        n_vars = 1
    else:
        n_vars = data.shape[1]
    frame = DataFrame(data)
    var_ids = range(1, n_vars + 1)

    shifted_frames = []
    labels = []

    # lagged inputs, oldest lag first
    for lag in range(n_in, 0, -1):
        shifted_frames.append(frame.shift(lag))
        labels.extend(f'var{v}(t-{lag})' for v in var_ids)

    # current step followed by the future steps
    for lead in range(n_out):
        shifted_frames.append(frame.shift(-lead))
        suffix = '(t)' if lead == 0 else f'(t+{lead})'
        labels.extend(f'var{v}{suffix}' for v in var_ids)

    # glue the shifted copies side by side and label the columns
    agg = concat(shifted_frames, axis=1)
    agg.columns = labels

    # shifting leaves NaN at the edges; optionally discard those rows
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [None]:
# split dataset into training and test sets (default: 90% train / 10% test)
def split_train_test(dataset, traing_ratio=.9, sliding_size=4):
    """Scale a dataset, window it, and cut it chronologically into train/test.

    Args:
        dataset: Object exposing ``.values`` (a DataFrame in this notebook).
        traing_ratio: Fraction of the windowed rows assigned to training.
        sliding_size: Number of lagged steps used as model inputs.

    Returns:
        ``(train_X, train_y, test_X, test_y, scaler)`` where the X arrays
        hold the first ``sliding_size`` columns of each window, the y arrays
        hold the last column, and ``scaler`` is the fitted MinMaxScaler.
    """
    # cast to float32 and squeeze every column into [0, 1]
    # NOTE(review): the scaler is fitted on the full series, i.e. including
    # rows that later land in the test split.
    raw = dataset.values.astype('float32')
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(raw)

    # one row per window: the lagged columns first, the step-t columns last
    windows = series_to_supervised(scaled, sliding_size, 1).values

    # leading rows go to training, the remainder to testing
    n_train = int(windows.shape[0] * traing_ratio)
    train = windows[:n_train, :]
    test = windows[n_train:, :]

    # inputs are the first `sliding_size` columns, the target is the last one
    train_X, train_y = train[:, :sliding_size], train[:, -1]
    test_X, test_y = test[:, :sliding_size], test[:, -1]
    return train_X, train_y, test_X, test_y, scaler

In [None]:
# keep only the columns needed for prediction
def drop_non_predict_columns (df):
    """Reduce a price frame to one ``val`` column indexed by a date string.

    Args:
        df: DataFrame with a ``date`` column and an ``adjclose`` column.

    Returns:
        DataFrame indexed by ``date`` (cast to ``str`` with any ``-``
        characters removed), holding ``adjclose`` under the name ``val``,
        sorted by that index.
    """
    # The .str accessor is required to strip dashes inside each value;
    # Series.replace('-', '') would only match cells that equal '-' exactly.
    date_keys = df['date'].astype(str).str.replace('-', '', regex=False)

    ts_df = pd.DataFrame({'date': date_keys, 'val': df['adjclose']})

    # index by date and order the rows by it (string comparison)
    ts_df = ts_df.set_index('date').sort_index()
    return ts_df

# 5. Model

## 5.1 Bayes Ridge 

(đây là một loại thuật toán thuộc ANN: cần đọc thêm bài báo: https://onlinelibrary.wiley.com/doi/abs/10.1111/jbg.12468) --> Trong báo cáo lướt qua các loại Bayes: traditional, Lasso, .... nhưng trong bài báo. Và nêu ra ưu điểm, khuyết điểm của các loại phương pháp này. Điều này rất có ích để khi ra có kết quả đánh giá hình vẽ, hay số liệu mà giải thích được. Đề nghị bổ sung vào báo cáo.

In [None]:
def Model_BayesianRidge(train_X, train_y, test_X, test_y, scaler, sliding_size=4):
    """Fit a BayesianRidge regressor and score it on the test split.

    Args:
        train_X, train_y: Training inputs and targets.
        test_X, test_y: Test inputs and targets.
        scaler: Object whose ``inverse_transform`` maps values back to the
            original scale (the scaler returned by ``split_train_test`` in
            this notebook).
        sliding_size: Number of columns ``test_X`` is reshaped to.

    Returns:
        ``(model, rmse, mape, inv_yhat, inv_y, running_time)`` where the
        ``inv_*`` arrays are on the original scale, ``mape`` is a percentage
        and ``running_time`` is the fit duration in seconds.
    """
    # time the fit only: TicToc prints the elapsed time, perf_counter keeps it
    t.tic()
    fit_started = time.perf_counter()

    # Bayesian ridge regression with default hyper-parameters
    model = BayesianRidge()
    model.fit(train_X, train_y)

    t.toc()
    running_time = time.perf_counter() - fit_started

    # forecast the test inputs and shape the result as a single column
    predicted = model.predict(test_X)
    yhat = predicted.reshape(predicted.shape[0], 1)
    test_X = test_X.reshape((test_X.shape[0], sliding_size))

    # `-1:-1` is an empty column slice, so the concatenation below simply
    # yields the given column in the layout handed to the scaler
    no_columns = test_X[:, -1:-1]

    def to_original_scale(column):
        stacked = concatenate((no_columns, column), axis=1)
        return scaler.inverse_transform(stacked)[:, -1]

    inv_yhat = to_original_scale(yhat)
    inv_y = to_original_scale(test_y.reshape((len(test_y), 1)))

    # error metrics on the original scale
    rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
    mape = np.mean(np.fabs((inv_y - inv_yhat) / inv_y)) * 100

    return model, rmse, mape, inv_yhat, inv_y, running_time

## 5.2 LSTM

In [None]:
def Model_SingleLSTM(train_X, train_y, test_X, test_y, scaler, sliding_size=4):
    """Train a single-layer LSTM and score it on the test split.

    Args:
        train_X, train_y: Training inputs (2-D) and targets.
        test_X, test_y: Test inputs (2-D) and targets.
        scaler: Object whose ``inverse_transform`` maps values back to the
            original scale.
        sliding_size: Number of time steps per sample; used for the network
            input shape and for flattening the test inputs again.

    Returns:
        ``(model, rmse, mape, inv_yhat, inv_y, running_time)`` where the
        ``inv_*`` arrays are on the original scale, ``mape`` is a percentage
        and ``running_time`` covers reshaping, model construction and
        training, in seconds.
    """
    t.tic()  # TicToc prints the elapsed time at toc()
    started = time.perf_counter()

    # Keras expects [samples, time steps, features]; add a trailing feature axis
    trainX = np.reshape(train_X, (train_X.shape[0], train_X.shape[1], 1))
    testX = np.reshape(test_X, (test_X.shape[0], test_X.shape[1], 1))

    # 16 LSTM units feeding one dense output neuron
    model = Sequential([
        LSTM(16, input_shape=(sliding_size, 1)),
        Dense(1),
    ])
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(trainX, train_y, epochs=100, batch_size=20, verbose=2)

    t.toc()
    running_time = time.perf_counter() - started

    # forecast the test inputs and shape the result as a single column
    predicted = model.predict(testX)
    yhat = predicted.reshape(predicted.shape[0], 1)
    testX = testX.reshape((testX.shape[0], sliding_size))

    # `-1:-1` is an empty column slice, so the concatenation below simply
    # yields the given column in the layout handed to the scaler
    no_columns = testX[:, -1:-1]

    def to_original_scale(column):
        stacked = concatenate((no_columns, column), axis=1)
        return scaler.inverse_transform(stacked)[:, -1]

    inv_yhat = to_original_scale(yhat)
    inv_y = to_original_scale(test_y.reshape((len(test_y), 1)))

    # error metrics on the original scale
    rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
    mape = np.mean(np.fabs((inv_y - inv_yhat) / inv_y)) * 100

    return model, rmse, mape, inv_yhat, inv_y, running_time

## 5.3 ARIMA

In [None]:
def Model_ARIMA(train_data, test_data):
    """Walk-forward one-step ARIMA(4,1,0) forecast over the test period.

    For every test observation the model is refitted on the history seen so
    far, asked for a one-step forecast, and the true value is then appended
    to the history.

    Args:
        train_data: Object whose ``['val'].values`` gives the training series
            (a DataFrame in this notebook).
        test_data: Same layout, holding the observations to forecast.

    Returns:
        ``(model, rmse, mape, model_predictions, test_values, running_time)``
        where ``model`` is the last ARIMA instance built, ``mape`` is a
        percentage, ``model_predictions`` is a list of floats,
        ``test_values`` is the array of actual test values and
        ``running_time`` is the loop duration in seconds.

    Raises:
        ValueError: If the test set is empty (there would be no fitted model
            to summarise and nothing to score).
    """
    history = list(train_data['val'].values)
    actual = test_data['val'].values
    n_test_observations = len(actual)
    if n_test_observations == 0:
        raise ValueError('test_data must contain at least one observation')

    model_predictions = []

    t.tic()  # TicToc prints the elapsed time at toc()
    start_time = time.perf_counter()

    print(range(n_test_observations))
    for time_point in range(n_test_observations):
        model = ARIMA(history, order=(4, 1, 0))
        model_fit = model.fit(disp=0)
        output = model_fit.forecast()
        # output[0] may be a length-1 array rather than a scalar; flatten it
        # so the predictions form a flat sequence of floats.
        yhat = float(np.ravel(output[0])[0])
        model_predictions.append(yhat)
        # walk forward: reveal the true value before the next refit
        history.append(actual[time_point])
        print(time_point)
    print(model_fit.summary())

    t.toc()
    running_time = time.perf_counter() - start_time

    # Keep predictions 1-D: subtracting an (N, 1) array from the (N,) actuals
    # would broadcast to an N x N matrix and distort the MAPE.
    predicted = np.asarray(model_predictions, dtype=float)
    rmse = sqrt(mean_squared_error(actual, predicted))
    mape = np.mean(np.fabs((actual - predicted) / actual)) * 100

    return model, rmse, mape, model_predictions, actual, running_time

# 6 Run & Test model

### 6.1 Use pchip method

### 6.1.1 Split training set, test set

In [None]:
# Fill the gaps with pchip interpolation, then keep only the date-indexed
# value column.
fill_df = fill_missing_values(df, fill_method='pchip', fill_order=5)
dataset = drop_non_predict_columns(fill_df)

# Windowed, scaled splits (4 lags) for Bayes Ridge / LSTM at 70, 80 and 90 %.
train_X7, train_y7, test_X7, test_y7, scaler7 = split_train_test(dataset, traing_ratio=.7, sliding_size=4)
train_X8, train_y8, test_X8, test_y8, scaler8 = split_train_test(dataset, traing_ratio=.8, sliding_size=4)
train_X9, train_y9, test_X9, test_y9, scaler9 = split_train_test(dataset, traing_ratio=.9, sliding_size=4)

# ARIMA consumes the unscaled series, cut by row position at the same ratios.
# NOTE(review): these cuts are taken on the full series, while the windowed
# splits above are taken after the leading lag rows were dropped, so the two
# test sets may differ slightly in length.
_cut7 = int(len(dataset) * 0.7)
_cut8 = int(len(dataset) * 0.8)
_cut9 = int(len(dataset) * 0.9)
a_train_data7, a_test_data7 = dataset[0:_cut7], dataset[_cut7:]
a_train_data8, a_test_data8 = dataset[0:_cut8], dataset[_cut8:]
a_train_data9, a_test_data9 = dataset[0:_cut9], dataset[_cut9:]

### 6.1.2 Run models

In [None]:
# Bayes Ridge on the pchip-filled data, one run per split (70/30, 80/20, 90/10).
# Each call returns (model, rmse, mape, predictions, actuals, fit time).
b7_model, b7_rmse, b7_mape, b7_inv_yhat, b7_inv_y, running_time7 = Model_BayesianRidge (train_X7,train_y7,test_X7, test_y7, scaler7)
b8_model, b8_rmse, b8_mape, b8_inv_yhat, b8_inv_y, running_time8 = Model_BayesianRidge (train_X8,train_y8,test_X8, test_y8, scaler8)
b9_model, b9_rmse, b9_mape, b9_inv_yhat, b9_inv_y, running_time9 = Model_BayesianRidge (train_X9,train_y9,test_X9, test_y9, scaler9)

Elapsed time is 0.035021 seconds.
Elapsed time is 0.003250 seconds.
Elapsed time is 0.001919 seconds.


In [None]:
# LSTM on the pchip-filled data, one run per split (70/30, 80/20, 90/10).
# Each call returns (model, rmse, mape, predictions, actuals, running time).
lstm7_model, lstm7_rmse, lstm7_mape, lstm7_inv_yhat, lstm7_inv_y, lstm_running_time7 = Model_SingleLSTM (train_X7,train_y7,test_X7, test_y7, scaler7)
lstm8_model, lstm8_rmse, lstm8_mape, lstm8_inv_yhat, lstm8_inv_y, lstm_running_time8 = Model_SingleLSTM (train_X8,train_y8,test_X8, test_y8, scaler8)
lstm9_model, lstm9_rmse, lstm9_mape, lstm9_inv_yhat, lstm9_inv_y, lstm_running_time9 = Model_SingleLSTM (train_X9,train_y9,test_X9, test_y9, scaler9)

Epoch 1/100
142/142 - 2s - loss: 0.0255
Epoch 2/100
142/142 - 0s - loss: 0.0014
Epoch 3/100
142/142 - 0s - loss: 5.2710e-04
Epoch 4/100
142/142 - 0s - loss: 4.8139e-04
Epoch 5/100
142/142 - 0s - loss: 4.3577e-04
Epoch 6/100
142/142 - 0s - loss: 4.1205e-04
Epoch 7/100
142/142 - 0s - loss: 3.7872e-04
Epoch 8/100
142/142 - 0s - loss: 3.6295e-04
Epoch 9/100
142/142 - 0s - loss: 3.6304e-04
Epoch 10/100
142/142 - 0s - loss: 3.5274e-04
Epoch 11/100
142/142 - 0s - loss: 3.4210e-04
Epoch 12/100
142/142 - 0s - loss: 3.3531e-04
Epoch 13/100
142/142 - 0s - loss: 3.3662e-04
Epoch 14/100
142/142 - 0s - loss: 3.2742e-04
Epoch 15/100
142/142 - 0s - loss: 3.2528e-04
Epoch 16/100
142/142 - 0s - loss: 3.2877e-04
Epoch 17/100
142/142 - 0s - loss: 3.1869e-04
Epoch 18/100
142/142 - 0s - loss: 3.1627e-04
Epoch 19/100
142/142 - 0s - loss: 3.1976e-04
Epoch 20/100
142/142 - 0s - loss: 3.1466e-04
Epoch 21/100
142/142 - 0s - loss: 3.0686e-04
Epoch 22/100
142/142 - 0s - loss: 3.0549e-04
Epoch 23/100
142/142 - 0s -

In [None]:
# ARIMA on the pchip-filled data, one run per split (70/30, 80/20, 90/10).
# Each call returns (model, rmse, mape, predictions, actuals, running time).
a7_model, a7_rmse, a7_mape, a7_inv_yhat, a7_inv_y, a7_running_time = Model_ARIMA (a_train_data7, a_test_data7)
a8_model, a8_rmse, a8_mape, a8_inv_yhat, a8_inv_y, a8_running_time = Model_ARIMA (a_train_data8, a_test_data8)
a9_model, a9_rmse, a9_mape, a9_inv_yhat, a9_inv_y, a9_running_time = Model_ARIMA (a_train_data9, a_test_data9)

range(0, 1215)
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273

### 6.1.3 Evaluation models

In [None]:
# Collect the pchip results into one table. The frame is built in a single
# constructor call because DataFrame.append no longer exists in pandas >= 2.0.
# Row order (and therefore the index 0..8) is Bayes Ridge, LSTM, ARIMA, each
# at 70/30, 80/20, 90/10.
_pchip_results = [
    ('70/30', 'Bayes Ridge', b7_rmse, b7_mape, running_time7),
    ('80/20', 'Bayes Ridge', b8_rmse, b8_mape, running_time8),
    ('90/10', 'Bayes Ridge', b9_rmse, b9_mape, running_time9),
    ('70/30', 'LSTM', lstm7_rmse, lstm7_mape, lstm_running_time7),
    ('80/20', 'LSTM', lstm8_rmse, lstm8_mape, lstm_running_time8),
    ('90/10', 'LSTM', lstm9_rmse, lstm9_mape, lstm_running_time9),
    ('70/30', 'ARIMA', a7_rmse, a7_mape, a7_running_time),
    ('80/20', 'ARIMA', a8_rmse, a8_mape, a8_running_time),
    ('90/10', 'ARIMA', a9_rmse, a9_mape, a9_running_time),
]

df_evaluation = pd.DataFrame(
    [
        {'Dataset': 'Dataset 1st', 'Split Ratio': ratio, 'Fill method': 'pchip',
         'Prediction': predictor, 'RMSE': rmse, 'MAPE': mape, 'Running time': seconds}
        for ratio, predictor, rmse, mape, seconds in _pchip_results
    ],
    columns=['Dataset', 'Split Ratio', 'Fill method', 'Prediction', 'RMSE', 'MAPE', 'Running time'],
)

# rank within each split ratio: lowest RMSE first, MAPE as tie-breaker
df_evaluation.sort_values(by=['Dataset', 'Split Ratio', 'RMSE', 'MAPE'], inplace=True, ascending=[True, True, True, True])

cols = ['Fill method', 'Split Ratio', 'Prediction', 'RMSE', 'MAPE', 'Running time']

print(df_evaluation[cols])

  Fill method Split Ratio   Prediction       RMSE       MAPE  Running time
0       pchip       70/30  Bayes Ridge  24.623078   1.084206      0.035241
6       pchip       70/30        ARIMA  24.663887  14.221685    235.044034
3       pchip       70/30         LSTM  25.200141   1.150771     39.661231
4       pchip       80/20         LSTM  27.922620   1.090634     45.238572
1       pchip       80/20  Bayes Ridge  28.087481   1.067426      0.003627
7       pchip       80/20        ARIMA  28.109467  12.591679    159.971276
5       pchip       90/10         LSTM  36.556462   1.201204     34.314475
2       pchip       90/10  Bayes Ridge  36.682361   1.206034      0.002787
8       pchip       90/10        ARIMA  36.718093  12.454391     82.314205


### 6.2 Use cubic method

### 6.2.1 Split training set, test set

In [None]:
# Fill the gaps with cubic interpolation.
fill_df_f2 = fill_missing_values(df, fill_method='cubic',fill_order=5)
# Keep only the date-indexed value column. This must read the cubic-filled
# frame (fill_df_f2); passing fill_df would rerun the pchip data instead.
dataset_f2 = drop_non_predict_columns (fill_df_f2)
# Windowed, scaled splits for Bayes Ridge / LSTM at 70, 80 and 90 %.
train_X7_f2, train_y7_f2, test_X7_f2, test_y7_f2, scaler7_f2 = split_train_test (dataset_f2, traing_ratio=.7, sliding_size=4)
train_X8_f2, train_y8_f2, test_X8_f2, test_y8_f2, scaler8_f2 = split_train_test (dataset_f2, traing_ratio=.8, sliding_size=4)
train_X9_f2, train_y9_f2, test_X9_f2, test_y9_f2, scaler9_f2 = split_train_test (dataset_f2, traing_ratio=.9, sliding_size=4)

# ARIMA consumes the unscaled series, cut by row position at the same ratios.
a_train_data7_f2, a_test_data7_f2 = dataset_f2[0:int(len(dataset_f2)*0.7)], dataset_f2[int(len(dataset_f2)*0.7):]
a_train_data8_f2, a_test_data8_f2 = dataset_f2[0:int(len(dataset_f2)*0.8)], dataset_f2[int(len(dataset_f2)*0.8):]
a_train_data9_f2, a_test_data9_f2 = dataset_f2[0:int(len(dataset_f2)*0.9)], dataset_f2[int(len(dataset_f2)*0.9):]

### 6.2.2 Run models

In [None]:
# Bayes Ridge on the cubic-method splits, one run per ratio (70/30, 80/20, 90/10).
# Each call returns (model, rmse, mape, predictions, actuals, fit time).
b7_model_f2, b7_rmse_f2, b7_mape_f2, b7_inv_yhat_f2, b7_inv_y_f2, running_time7_f2 = Model_BayesianRidge (train_X7_f2,train_y7_f2,test_X7_f2, test_y7_f2, scaler7_f2)
b8_model_f2, b8_rmse_f2, b8_mape_f2, b8_inv_yhat_f2, b8_inv_y_f2, running_time8_f2 = Model_BayesianRidge (train_X8_f2,train_y8_f2,test_X8_f2, test_y8_f2, scaler8_f2)
b9_model_f2, b9_rmse_f2, b9_mape_f2, b9_inv_yhat_f2, b9_inv_y_f2, running_time9_f2 = Model_BayesianRidge (train_X9_f2,train_y9_f2,test_X9_f2, test_y9_f2, scaler9_f2)

In [None]:
# LSTM on the cubic-method splits, one run per ratio (70/30, 80/20, 90/10).
# Each call returns (model, rmse, mape, predictions, actuals, running time).
lstm7_model_f2, lstm7_rmse_f2, lstm7_mape_f2, lstm7_inv_yhat_f2, lstm7_inv_y_f2, lstm_running_time7_f2 = Model_SingleLSTM (train_X7_f2,train_y7_f2,test_X7_f2, test_y7_f2, scaler7_f2)
lstm8_model_f2, lstm8_rmse_f2, lstm8_mape_f2, lstm8_inv_yhat_f2, lstm8_inv_y_f2, lstm_running_time8_f2 = Model_SingleLSTM (train_X8_f2,train_y8_f2,test_X8_f2, test_y8_f2, scaler8_f2)
lstm9_model_f2, lstm9_rmse_f2, lstm9_mape_f2, lstm9_inv_yhat_f2, lstm9_inv_y_f2, lstm_running_time9_f2 = Model_SingleLSTM (train_X9_f2,train_y9_f2,test_X9_f2, test_y9_f2, scaler9_f2)

In [None]:
# ARIMA on the cubic-method splits, one run per ratio (70/30, 80/20, 90/10).
# Each call returns (model, rmse, mape, predictions, actuals, running time).
a7_model_f2, a7_rmse_f2, a7_mape_f2, a7_inv_yhat_f2, a7_inv_y_f2, a7_running_time_f2 = Model_ARIMA (a_train_data7_f2, a_test_data7_f2)
a8_model_f2, a8_rmse_f2, a8_mape_f2, a8_inv_yhat_f2, a8_inv_y_f2, a8_running_time_f2 = Model_ARIMA (a_train_data8_f2, a_test_data8_f2)
a9_model_f2, a9_rmse_f2, a9_mape_f2, a9_inv_yhat_f2, a9_inv_y_f2, a9_running_time_f2 = Model_ARIMA (a_train_data9_f2, a_test_data9_f2)

### 6.2.3 Evaluation models

In [None]:
# Collect the cubic results into one table. The frame is built in a single
# constructor call because DataFrame.append no longer exists in pandas >= 2.0.
# Row order (and therefore the index 0..8) is Bayes Ridge, LSTM, ARIMA, each
# at 70/30, 80/20, 90/10.
_cubic_results = [
    ('70/30', 'Bayes Ridge', b7_rmse_f2, b7_mape_f2, running_time7_f2),
    ('80/20', 'Bayes Ridge', b8_rmse_f2, b8_mape_f2, running_time8_f2),
    ('90/10', 'Bayes Ridge', b9_rmse_f2, b9_mape_f2, running_time9_f2),
    ('70/30', 'LSTM', lstm7_rmse_f2, lstm7_mape_f2, lstm_running_time7_f2),
    ('80/20', 'LSTM', lstm8_rmse_f2, lstm8_mape_f2, lstm_running_time8_f2),
    ('90/10', 'LSTM', lstm9_rmse_f2, lstm9_mape_f2, lstm_running_time9_f2),
    ('70/30', 'ARIMA', a7_rmse_f2, a7_mape_f2, a7_running_time_f2),
    ('80/20', 'ARIMA', a8_rmse_f2, a8_mape_f2, a8_running_time_f2),
    ('90/10', 'ARIMA', a9_rmse_f2, a9_mape_f2, a9_running_time_f2),
]

df_evaluation_f2 = pd.DataFrame(
    [
        {'Dataset': 'Dataset 1st', 'Split Ratio': ratio, 'Fill method': 'cubic',
         'Prediction': predictor, 'RMSE': rmse, 'MAPE': mape, 'Running time': seconds}
        for ratio, predictor, rmse, mape, seconds in _cubic_results
    ],
    columns=['Dataset', 'Split Ratio', 'Fill method', 'Prediction', 'RMSE', 'MAPE', 'Running time'],
)

# rank within each split ratio: lowest RMSE first, MAPE as tie-breaker
df_evaluation_f2.sort_values(by=['Dataset', 'Split Ratio', 'RMSE', 'MAPE'], inplace=True, ascending=[True, True, True, True])

cols = ['Fill method', 'Split Ratio', 'Prediction', 'RMSE', 'MAPE', 'Running time']

print(df_evaluation_f2[cols])

## 6.3 Combination

In [None]:
# Stack the pchip and cubic score tables and rank them together:
# per split ratio, lowest RMSE first, MAPE as tie-breaker.
_rank_keys = ['Dataset', 'Split Ratio', 'RMSE', 'MAPE']
df_evaluation_f1f2 = pd.concat([df_evaluation, df_evaluation_f2], ignore_index=True)
df_evaluation_f1f2 = df_evaluation_f1f2.sort_values(by=_rank_keys, ascending=True)

cols = ['Fill method', 'Split Ratio', 'Prediction', 'RMSE', 'MAPE', 'Running time']

# show at most the first 50 ranked rows
print(df_evaluation_f1f2[cols].head(50))

### 6.3.1 Fill: pchip, Split: 70/30

In [None]:
# 70/30 split: actual values against the three forecasts
pyplot.figure(figsize=(20, 6))

# the first three series are drawn against 1-based step numbers;
# the ARIMA forecast is drawn against its own positions
pyplot.plot(range(1, b7_inv_y.shape[0] + 1), b7_inv_y,
            linestyle='solid', linewidth=3, color='black', label='Actual')
pyplot.plot(range(1, lstm7_inv_yhat.shape[0] + 1), lstm7_inv_yhat,
            linestyle='--', color='purple', label='Prediction - LSTM')
pyplot.plot(range(1, b7_inv_yhat.shape[0] + 1), b7_inv_yhat,
            linestyle='--', color='red', label='Prediction - Bayes Ridge')
pyplot.plot(a7_inv_yhat, linestyle='--', color='cyan', label='Prediction - ARIMA')

pyplot.legend(loc=1, prop={'size': 12})
pyplot.show()


**Zoom in**

In [None]:
# 70/30 split, zoomed to steps 340-360
pyplot.figure(figsize=(20, 6))

# the first three series are drawn against 1-based step numbers;
# the ARIMA forecast is drawn against its own positions
pyplot.plot(range(1, b7_inv_y.shape[0] + 1), b7_inv_y,
            linestyle='solid', linewidth=3, color='black', label='Actual')
pyplot.plot(range(1, lstm7_inv_yhat.shape[0] + 1), lstm7_inv_yhat,
            linestyle='--', color='purple', label='Prediction - LSTM')
pyplot.plot(range(1, b7_inv_yhat.shape[0] + 1), b7_inv_yhat,
            linestyle='-', color='red', label='Prediction - Bayes Ridge')
pyplot.plot(a7_inv_yhat, linestyle='--', color='cyan', label='Prediction - ARIMA')

pyplot.legend(loc=1, prop={'size': 12})

# restrict both axes to the window of interest
pyplot.xlim([340, 360])
pyplot.ylim([900, 1430])

pyplot.show()

In [None]:
# 80/20 split: actual values against the three forecasts
pyplot.figure(figsize=(20, 6))

# the first three series are drawn against 1-based step numbers;
# the ARIMA forecast is drawn against its own positions
pyplot.plot(range(1, b8_inv_y.shape[0] + 1), b8_inv_y,
            linestyle='solid', linewidth=3, color='black', label='Actual')
pyplot.plot(range(1, lstm8_inv_yhat.shape[0] + 1), lstm8_inv_yhat,
            linestyle='--', color='purple', label='Prediction - LSTM')
pyplot.plot(range(1, b8_inv_yhat.shape[0] + 1), b8_inv_yhat,
            linestyle='--', color='red', label='Prediction - Bayes Ridge')
pyplot.plot(a8_inv_yhat, linestyle='--', color='cyan', label='Prediction - ARIMA')

pyplot.legend(loc=1, prop={'size': 12})
pyplot.show()

In [None]:
# 80/20 split, zoomed to steps 340-360
pyplot.figure(figsize=(20,6))

pyplot.plot([x for x in range(1, b8_inv_y.shape[0]+1)], b8_inv_y, linestyle='solid',linewidth=3, color='black', label='Actual')
pyplot.plot([x for x in range(1, lstm8_inv_yhat.shape[0]+1)], lstm8_inv_yhat, linestyle='--', color='purple', label='Prediction - LSTM')
pyplot.plot([x for x in range(1, b8_inv_yhat.shape[0]+1)], b8_inv_yhat, linestyle='--', color='red', label='Prediction - Bayes Ridge')
# use the 80/20 ARIMA forecast so every curve in this figure comes from the
# same split (a7_inv_yhat belongs to the 70/30 run)
pyplot.plot(a8_inv_yhat, linestyle='--', color='cyan', label='Prediction - ARIMA')

pyplot.legend(loc=1, prop={'size': 12})
# restrict both axes to the window of interest
pyplot.xlim([340,360])
pyplot.ylim([900,1430])

pyplot.show()

### 6.3.2 Fill: pchip, Split: 80/20

In [None]:
# 80/20 split: actual values against the three forecasts
pyplot.figure(figsize=(20, 6))

# the first three series are drawn against 1-based step numbers;
# the ARIMA forecast is drawn against its own positions
pyplot.plot(range(1, b8_inv_y.shape[0] + 1), b8_inv_y,
            linestyle='solid', linewidth=3, color='black', label='Actual')
pyplot.plot(range(1, lstm8_inv_yhat.shape[0] + 1), lstm8_inv_yhat,
            linestyle='--', color='purple', label='Prediction - LSTM')
pyplot.plot(range(1, b8_inv_yhat.shape[0] + 1), b8_inv_yhat,
            linestyle='-', color='red', label='Prediction - Bayes Ridge')
pyplot.plot(a8_inv_yhat, linestyle='--', color='cyan', label='Prediction - ARIMA')

pyplot.legend(loc=1, prop={'size': 12})
pyplot.show()

In [None]:
# 90/10 split: actual values against the three forecasts
pyplot.figure(figsize=(20, 6))

# the first three series are drawn against 1-based step numbers;
# the ARIMA forecast is drawn against its own positions
pyplot.plot(range(1, b9_inv_y.shape[0] + 1), b9_inv_y,
            linestyle='solid', linewidth=3, color='black', label='Actual')
pyplot.plot(range(1, lstm9_inv_yhat.shape[0] + 1), lstm9_inv_yhat,
            linestyle='--', color='purple', label='Prediction - LSTM')
pyplot.plot(range(1, b9_inv_yhat.shape[0] + 1), b9_inv_yhat,
            linestyle='-', color='red', label='Prediction - Bayes Ridge')
pyplot.plot(a9_inv_yhat, linestyle='--', color='cyan', label='Prediction - ARIMA')

pyplot.legend(loc=1, prop={'size': 12})
pyplot.show()

In [None]:
# 90/10 split, zoomed to steps 340-360
pyplot.figure(figsize=(20, 6))

# the first three series are drawn against 1-based step numbers;
# the ARIMA forecast is drawn against its own positions
pyplot.plot(range(1, b9_inv_y.shape[0] + 1), b9_inv_y,
            linestyle='solid', linewidth=3, color='black', label='Actual')
pyplot.plot(range(1, lstm9_inv_yhat.shape[0] + 1), lstm9_inv_yhat,
            linestyle='--', color='purple', label='Prediction - LSTM')
pyplot.plot(range(1, b9_inv_yhat.shape[0] + 1), b9_inv_yhat,
            linestyle='-', color='red', label='Prediction - Bayes Ridge')
pyplot.plot(a9_inv_yhat, linestyle='--', color='cyan', label='Prediction - ARIMA')

pyplot.legend(loc=1, prop={'size': 12})

# restrict both axes to the window of interest
pyplot.xlim([340, 360])
pyplot.ylim([900, 1430])

pyplot.show()