In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
# matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later
# removed the old name, so pick whichever one this installation provides
# (falling back to the default style if neither is available).
plt.style.use(next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'default',
))
import seaborn as sns

from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import make_forecasting_frame
from sklearn.ensemble import AdaBoostRegressor
from tsfresh.utilities.dataframe_functions import impute

# Fix needed to pandas datareader: it looks up pd.core.common.is_list_like,
# so the name is patched in BEFORE pandas_datareader is imported.
pd.core.common.is_list_like = pd.api.types.is_list_like
import pandas_datareader.data as web
import datetime

# Silence all warnings to keep the notebook output readable
import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm

# Collect the stock data (ticker `F`, loaded from IEX)

In [None]:
# Date range of the download: calendar year 2016
start = datetime.datetime(year=2016, month=1, day=1)
end = datetime.datetime(year=2017, month=1, day=1)

# Need to use iex instead of google as the data source
x = web.DataReader("F", "iex", start=start, end=end)
x.head()

In [None]:
# Summarise the downloaded frame (columns, dtypes, non-null counts)
x.info()

In [None]:
# Plot every column except "volume" on one shared axis
x.drop(columns=["volume"]).plot(figsize=(15, 6))
plt.show()

So, we loaded one year (2016) of daily prices for the `F` ticker. Now, we want to predict the `high` column.

# Create forecasting frame

In [None]:
# Turn the "high" series into a rolled frame (df_shift) plus the target series (y),
# using a maximum time shift of 20 steps.
df_shift, y = make_forecasting_frame(
    x["high"],
    kind="price",
    max_timeshift=20,
    rolling_direction=1,
)

In [None]:
# Peek at the first rows of the rolled frame
df_shift.head()

In [None]:
# (rows, columns) of the rolled frame
df_shift.shape

`df_shift` is ready to be passed into the feature extraction process in tsfresh 

In [None]:
%%capture
# Extract tsfresh features from the rolled frame: rows are grouped by "id", ordered by
# "time", and computed on "value". `impute` is passed as the imputation function, and
# %%capture (which must stay on the first line of the cell) hides the cell's output.
X = extract_features(df_shift, column_id="id", column_sort="time", column_value="value", impute_function=impute,
                     show_warnings=False)

In [None]:
# Keep only the columns with more than one distinct value (i.e. drop constant
# features), printing the shape before and after the filter.
print(X.shape)
X = X.loc[:, X.nunique() != 1]
print(X.shape)

In [None]:
# Extra feature: the target shifted down by one row, i.e. the previous value of y
X["feature_last_value"] = y.shift(periods=1)

In [None]:
# Drop the first row of both features and target: after the shift above it has
# no previous value (NaN in "feature_last_value").
X, y = X.iloc[1:], y.iloc[1:]

In [None]:
# Peek at the first rows of the final feature matrix
X.head()

# Fit AdaBoost

In [None]:
# Walk-forward evaluation: for every position i >= isp the regressor is refit on
# all rows before i and then asked to predict row i, so each prediction only
# uses data that precedes it.

# A fixed random_state makes the scores and feature importances reproducible
# across "Restart & Run All".
ada = AdaBoostRegressor(n_estimators=10, random_state=0)

# Positions before isp are never predicted and stay NaN.
# (np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.)
y_pred = [np.nan] * len(y)

isp = 100   # index of where to start the predictions
assert isp > 0

for i in tqdm(range(isp, len(y))):

    # Explicit positional slicing on both features and target
    ada.fit(X.iloc[:i], y.iloc[:i])
    # X.iloc[[i]] is a one-row DataFrame, so the column names seen during fit are kept
    y_pred[i] = ada.predict(X.iloc[[i]])[0]

# Align the predictions with the target's index
y_pred = pd.Series(data=y_pred, index=y.index)

In [None]:
# Put predictions and true values side by side, then give the columns readable names
ys = pd.concat([y_pred, y], axis=1)
ys = ys.rename(columns={0: 'pred', 'value': 'true'})

# Parse the index into datetimes
ys.index = pd.to_datetime(ys.index)
ys.head()

In [None]:
# Predicted vs. true series on one axis
ys.plot(figsize=(15, 8), title='Predicted and True Price')
plt.show()

Looks not too bad. The `pred` curve is the output of the AdaBoost regressor, the `true` curve is the true `high` value.

Now, we will also inspect the last value before the prediction as a benchmark, denoted by `y-1`.

In [None]:
# Naive benchmark: the previous true price, stored as column 'y-1'
ys['y-1'] = ys['true'].shift(periods=1)
ys.loc[:, ['y-1', 'true']].plot(figsize=(15, 8), title='Benchmark Prediction and True Price')
plt.show()

In [None]:
# np.diff(y)[k] is y[k+1] - y[k], so slicing the differences from isp-1 scores the
# benchmark on the same positions (isp onwards) that the model is scored on.
print("MAE y-1: \t{}".format(np.abs(np.diff(y))[isp - 1:].mean()))
print("MAE ada: \t{}".format((y_pred - y).abs()[isp:].mean()))

However, we are not yet beating the y-1 benchmark, so we need to invest more time into building dedicated features or use a better model.

We can also inspect the relevance of the extracted features

In [None]:
# Feature importances of the regressor as left by its final fit in the loop,
# labelled by feature name; show the ten largest.
importances = pd.Series(ada.feature_importances_, index=X.columns)
importances.sort_values(ascending=False).iloc[:10]

So, the maximum value ("feature__maximum") over the rolling window (up to the last 20 values, per `max_timeshift=20`) had the highest importance to predict the next value of the `high` column.