In [7]:
import pandas as pd
import numpy as np
import quandl
from matplotlib import pyplot 
quandl.ApiConfig.api_key = 'YOUR_API_KEY_REPLACE_THIS'

from keras.models import Sequential
from keras.layers import Dense, LSTM, Flatten
from keras.optimizers import Adam
from sklearn.metrics import mean_squared_error
from math import sqrt

ModuleNotFoundError: No module named 'quandl'

#### Let's construct an imaginary tech portfolio split evenly between Apple, Amazon, Microsoft, Facebook, Google and Netflix. We will grab data from the beginning of 2015 to min(now, end_of_2018). That will give us 3-4 years worth of data.

In [3]:
ticker_symbols = ["AAPL","AMZN","MSFT","FB","GOOGL","NFLX"]
closing_prices = pd.DataFrame()

for tckr in ticker_symbols:
    closing_prices = pd.concat([closing_prices, 
        quandl.get("WIKI/" + tckr, start_date="2014-12-31", end_date="2018-12-31", collapse="daily")['Close']
        .rename(tckr)], axis=1)
    
    #Might as well calculate price as we go
    if 'portfolio' in closing_prices:
        closing_prices['portfolio'] += closing_prices[tckr] * 1 / len(ticker_symbols)
    else:
        closing_prices['portfolio'] = closing_prices[tckr] * 1 / len(ticker_symbols)
        
closing_prices = closing_prices[ticker_symbols + ['portfolio']] #reorder columns
closing_prices.head(10)

NameError: name 'quandl' is not defined

In [3]:
closing_prices.isnull().sum()

AAPL         1
AMZN         1
MSFT         0
FB           0
GOOGL        0
NFLX         0
portfolio    1
dtype: int64

#### Not bad. Unfortunately, the missing value appears to take place on a Monday, so we will impute with the midpoint between the  preceding Friday and the following Tuesday (open to suggestions on a more elegant way to code this...)


In [4]:
from datetime import timedelta

In [5]:
for key in ['AAPL','AMZN','portfolio']:
    missing_date = closing_prices.index[closing_prices[key].isnull()]
    closing_prices[key][missing_date] = \
    (closing_prices[key][missing_date + timedelta(days=1)][0] + \
    closing_prices[key][missing_date - timedelta(days=3)][0]) / 2

closing_prices.isnull().sum()

AAPL         0
AMZN         0
MSFT         0
FB           0
GOOGL        0
NFLX         0
portfolio    0
dtype: int64

#### Let's split off the last ~1 year as test data. Assume a year has 52 * 5 trading days. The rest will be training data.

In [6]:
train, test = closing_prices[:-260], closing_prices[-260:]
closing_prices.shape, train.shape, test.shape

((795, 7), (535, 7), (260, 7))

In [7]:
#Make moving window prediction
def create_pred_col(df, shift = -1):
    df['preds'] = df['portfolio'].shift(shift)
    df = df[:-1]
    return df
    
train = create_pred_col(train)
test = create_pred_col(test)
train.shape,test.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


((534, 8), (259, 8))

In [87]:
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
output_notebook()

In [9]:
p = figure(plot_width=500, plot_height=250)
x = np.linspace(1,train.shape[0])
p.line(x, train['portfolio'].values, color='blue')
p.line(x, train['preds'].values, color='red')
show(p)



#### Reshape train/test set for LSTM

In [23]:
train = train.drop(['portfolio'], axis=1)
test = test.drop(['portfolio'], axis=1)

In [24]:
train_x, train_y = train.loc[:,train.columns != 'preds'], train['preds']
train_x = train_x.values.reshape(train_x.shape[0],1,train_x.shape[1])
train_y = train_y.values.reshape(train_y.shape[0],1,1)

In [25]:
test_x, test_y = test.loc[:,test.columns != 'preds'], test['preds']
test_x = test_x.values.reshape(test_x.shape[0],1,test_x.shape[1])
test_y = test_y.values.reshape(test_y.shape[0],1,1)

In [26]:
train_x.shape, train_y.shape, test_x.shape, test_y.shape

((534, 1, 6), (534, 1, 1), (259, 1, 6), (259, 1, 1))

#### Design network

In [118]:
model = Sequential()
#model.reset_states() #Resets state in case you are running more than one different model with the same name in this notebook; feel free to comment out this line

model.add(LSTM(20, input_shape = (train_x.shape[1],train_x.shape[2]), return_sequences = True))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam') #mse numbers are far too big here, and there doesn't seem to be much performance loss from using mae instead


In [119]:
res = model.fit(train_x, train_y, validation_data=(test_x,test_y), epochs=2000, batch_size=32, verbose=1, shuffle=False)

Train on 534 samples, validate on 259 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/20

In [115]:
p = figure(plot_width=500, plot_height=250)
x = np.linspace(1,len(res.history['loss']),len(res.history['loss']))
p.line(x, res.history['loss'], color='blue')
p.line(x, res.history['val_loss'], color='red')
show(p)

#### Test predictions

In [116]:
yhat = model.predict(test_x)
yhat = yhat.reshape(yhat.shape[0])
y = test_y.reshape(test_y.shape[0])

In [117]:
p = figure(plot_width=500, plot_height=250)
x = np.linspace(1,test_y.shape[0],test_y.shape[0])
p.line(x, yhat, color='blue')
p.line(x, y, color='red')
show(p)

#### So it fails epically with a lookback of 1 timestep
