In [1]:
import pandas as pd
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt

In [2]:
ts = pd.read_excel("data/stock_prices.xlsx", usecols = ["Date", "Adj Close", "Volume"])

In [None]:
# Check for stationarity

In [3]:
result = adfuller(ts["Adj Close"].dropna())
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1]) # Non-stationary -> need to differentiate

ADF Statistic: -1.522240
p-value: 0.522482


In [4]:
ts["Close_Diff"] = ts["Adj Close"].diff()

In [5]:
result = adfuller(ts["Close_Diff"].dropna())
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1]) # Stationary

ADF Statistic: -11.104181
p-value: 0.000000


In [None]:
# Build model and implement rolling window (= ExpandingWindowSplitter) for hyperparameter tuning

In [124]:
# Train, test split
series = ts["Close_Diff"].dropna()
train_size = int(round(len(series) * 0.90, 0))

train = series[:train_size]
y_test = list(series[train_size:])

# Train, val split
train_size2 = int(round(len(train) * 0.90, 0))
y_train = list(train[:train_size2])
y_val = list(train[train_size2:])

In [155]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error

def run_rolling_forecast(train, val, params):
    history = [x for x in train]
    predictions = list()
    for t in range(len(val)):
        model = ARIMA(history, order=params)
        model_fit = model.fit(return_params=False)
        output = model_fit.forecast()
        yhat = output[0]
        predictions.append(yhat)
        obs = val[t]
        history.append(obs)
        print("%d/%d predicted=%f, expected=%f" % (t, len(val), yhat, obs))
    rmse = mean_squared_error(val, predictions, squared = True)
    mse = mean_squared_error(val, predictions, squared = False)
    mae = mean_absolute_error(val, predictions)

    return rmse, mse, mae

In [156]:
# p: The number of lag observations included in the model, also called the lag order.
# d: The number of times that the raw observations are differenced, also called the degree of differencing.
# q: The size of the moving average window, also called the order of moving average.

para = [(5,1,0), (5,1,1), (5,0,0)]

In [157]:
history = []
for p in para:
    rmse, mse,  mae = run_rolling_forecast(y_train, y_val, p)
    s = "Params: " + str(p) + ";RMSE: " + str(rmse) + ";MSE:" + str(mse) + ";MAE:" + str(mae)
    history.append(s)

0/41 predicted=-9.311467, expected=-1.059998
1/41 predicted=4.815049, expected=-7.029999
2/41 predicted=5.741235, expected=-2.800003
3/41 predicted=-8.035018, expected=3.740005
4/41 predicted=-3.391265, expected=-2.369995
5/41 predicted=4.508938, expected=3.259995
6/41 predicted=-0.448058, expected=-9.779999
7/41 predicted=-1.603150, expected=-1.529999
8/41 predicted=-2.972477, expected=-3.270004
9/41 predicted=-3.726472, expected=-8.899994
10/41 predicted=-1.578441, expected=0.720001
11/41 predicted=-5.134909, expected=-0.150009
12/41 predicted=-4.300181, expected=-1.979996
13/41 predicted=0.047606, expected=-9.190002
14/41 predicted=-2.845783, expected=-12.440002
15/41 predicted=-6.406999, expected=-0.799988
16/41 predicted=-7.894794, expected=2.219986
17/41 predicted=-3.904508, expected=4.450012
18/41 predicted=0.246065, expected=17.689987
19/41 predicted=0.216169, expected=-5.369995
20/41 predicted=6.051371, expected=-6.959991
21/41 predicted=2.996522, expected=1.509995
22/41 predi



0/41 predicted=-7.287503, expected=-1.059998
1/41 predicted=4.623433, expected=-7.029999
2/41 predicted=9.761109, expected=-2.800003
3/41 predicted=-2.984469, expected=3.740005
4/41 predicted=-4.165346, expected=-2.369995
5/41 predicted=3.334134, expected=3.259995




6/41 predicted=1.918371, expected=-9.779999
7/41 predicted=2.444238, expected=-1.529999
8/41 predicted=0.011577, expected=-3.270004
9/41 predicted=-0.694359, expected=-8.899994
10/41 predicted=3.807320, expected=0.720001
11/41 predicted=0.459617, expected=-0.150009
12/41 predicted=0.576828, expected=-1.979996
13/41 predicted=3.836133, expected=-9.190002
14/41 predicted=3.350806, expected=-12.440002
15/41 predicted=1.430975, expected=-0.799988
16/41 predicted=-1.988609, expected=2.219986
17/41 predicted=0.794846, expected=4.450012
18/41 predicted=4.281822, expected=17.689987
19/41 predicted=-0.126790, expected=-5.369995
20/41 predicted=4.667058, expected=-6.959991
21/41 predicted=2.246720, expected=1.509995
22/41 predicted=-6.633220, expected=3.580002
23/41 predicted=-2.089405, expected=-5.400009
24/41 predicted=4.993241, expected=-9.419998
25/41 predicted=2.832595, expected=-4.259995
26/41 predicted=-1.861741, expected=-3.740005
27/41 predicted=0.188135, expected=-3.470001
28/41 predic

In [158]:
# Find best parameters
for item in history:
    print(item)

Params: (5, 1, 0);RMSE: 52.0023748129438;MSE:7.211267212698736;MAE:5.658442127316466
Params: (5, 1, 1);RMSE: 51.98123663786513;MSE:7.209801428462862;MAE:5.778140741560425
Params: (5, 0, 0);RMSE: 52.125588525296315;MSE:7.219805296910459;MAE:5.789548340994598


In [118]:
# Hypothetically now, select best later on
best_param = (5, 1, 1)

In [133]:
y_train_final = y_train + y_val

In [146]:
mse, mae = run_rolling_forecast(y_train_final, y_test, best_param)

0/46 predicted=-0.081510, expected=6.400009
1/46 predicted=-2.502363, expected=5.589996
2/46 predicted=1.401737, expected=45.399994
3/46 predicted=-5.105873, expected=-10.639999
4/46 predicted=9.596390, expected=5.570007
5/46 predicted=1.673913, expected=-0.270004
6/46 predicted=-12.102407, expected=4.250000
7/46 predicted=-3.280263, expected=9.040009
8/46 predicted=-0.539708, expected=-5.270004
9/46 predicted=3.002764, expected=0.550003
10/46 predicted=-0.071737, expected=-10.770004




11/46 predicted=-0.573315, expected=-3.750000




12/46 predicted=-0.119062, expected=-0.199997
13/46 predicted=-0.796680, expected=0.379990
14/46 predicted=2.570833, expected=-8.769989
15/46 predicted=4.812174, expected=12.989990
16/46 predicted=-2.840226, expected=-4.159988




17/46 predicted=2.200620, expected=5.279999
18/46 predicted=2.715225, expected=1.849991
19/46 predicted=-1.534805, expected=-1.399994
20/46 predicted=1.136315, expected=-12.770004




21/46 predicted=2.544484, expected=-2.250000
22/46 predicted=-2.296746, expected=0.190002
23/46 predicted=-1.443927, expected=1.100006




24/46 predicted=3.399298, expected=-6.080002
25/46 predicted=4.306621, expected=4.319992
26/46 predicted=-0.873671, expected=-10.879990
27/46 predicted=2.344883, expected=-2.680008
28/46 predicted=1.323194, expected=-0.449997




29/46 predicted=-1.497516, expected=1.440002




30/46 predicted=1.860322, expected=-5.550003
31/46 predicted=3.959836, expected=0.819992
32/46 predicted=0.139705, expected=-1.109985
33/46 predicted=-0.191435, expected=1.049988




34/46 predicted=1.370306, expected=0.559998
35/46 predicted=0.941306, expected=5.420013
36/46 predicted=-0.245803, expected=-2.279999
37/46 predicted=1.770358, expected=8.239990
38/46 predicted=-1.112391, expected=-0.229996
39/46 predicted=0.114019, expected=-0.550003
40/46 predicted=1.509461, expected=2.740005




41/46 predicted=-1.875854, expected=0.769989
42/46 predicted=-0.369880, expected=-2.269989
43/46 predicted=1.639367, expected=-2.809998
44/46 predicted=0.251667, expected=-11.910004
45/46 predicted=1.656701, expected=4.169998


