In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Dense

n_timesteps = 60
n_days_to_predict = 7
test_set_size = 0.2

#strings
test_set_key = 'test'
training_set_key = 'train'
open_key = 'Open'
adam_optimizer = 'adam'
mse_loss = 'mean_squared_error'

#model params
optimizer = adam_optimizer
loss = mse_loss
dropout_rate = 0.2
lstm_layer_size = 50
dense_layer_size = 1

In [2]:
def to_array(df):
    return df.values

def process_data(df, set_key):
    data = df[open_key]

    #split
    if set_key == training_set_key:
        data = to_array(data)[:int(len(data)*(1-test_set_size))]
    else:
        data = to_array(data)[int(len(data)*(1-test_set_size)):]

    data = data.reshape(
        (data.shape[0], 1)
    )

    #scale
    scaler = MinMaxScaler(feature_range=(0,1))
    scaled_data = scaler.fit_transform(data)

    #split
    X, y = [], []
    for i in range(n_timesteps, scaled_data.shape[0]-n_days_to_predict-1):
        X.append(scaled_data[i-n_timesteps:i, 0])
        y.append(scaled_data[i:i+n_days_to_predict, 0])

    X, y = np.array(X), np.array(y)
    X = np.reshape(X, (X.shape[0], X.shape[1], 1))

    return X, y, scaler

def build_model(n_cols):
    model = Sequential()
    model.add(LSTM(units=lstm_layer_size, return_sequences=True, input_shape=(n_cols, 1)))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(units=lstm_layer_size, return_sequences=True))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(units=lstm_layer_size, return_sequences=True))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(units=lstm_layer_size))
    model.add(Dropout(dropout_rate))
    model.add(Dense(units=n_days_to_predict))
    model.compile(optimizer=optimizer, loss=loss)

    return model

In [3]:
#load data
df = pd.read_csv('../data/train.csv')

df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,1986-03-13,0.056367,0.064656,0.056367,0.061893,1031788800,0.0,0.0
1,1986-03-14,0.061893,0.065209,0.061893,0.064103,308160000,0.0,0.0
2,1986-03-17,0.064103,0.065761,0.064103,0.065209,133171200,0.0,0.0
3,1986-03-18,0.065209,0.065761,0.062998,0.063551,67766400,0.0,0.0
4,1986-03-19,0.063551,0.064103,0.061893,0.062446,47894400,0.0,0.0


In [4]:
#scale the data
X_train, y_train, train_scaler = process_data(df, training_set_key)

#TODO fix y_test
X_test, y_test, test_scaler = process_data(df, test_set_key)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6937, 60, 1), (6937, 7), (1684, 60, 1), (1684, 7))

In [5]:
#build model
brain = build_model(X_train.shape[1])

brain.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 60, 50)            10400     
_________________________________________________________________
dropout (Dropout)            (None, 60, 50)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 60, 50)            20200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 60, 50)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 60, 50)            20200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 60, 50)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 50)                2

In [6]:
#training
brain.fit(X_train, y_train, epochs=100, batch_size=32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x21e14ed6b50>

In [7]:
#predict - tego bedziemy uzywac w apce
y_hat = brain.predict(X_test)
mean_squared_error(y_test, y_hat)

#0.00022679094560793073

0.00042778512889173005

In [27]:
import pickle

pickle.dump(brain, open('../data/brain.pickle', 'wb'))

TypeError: cannot pickle '_thread.RLock' object