<a href="https://colab.research.google.com/github/Roflz/stock-prediction/blob/main/predict_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# Installs/upgrades packages needed for running stock prediction program using Google Colab
# After installing/upgrading, Go to Runtime -> Restart runtime to apply changes

!pip install --upgrade pandas-datareader

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandas-datareader
  Downloading pandas_datareader-0.10.0-py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109 kB 2.8 MB/s 
Installing collected packages: pandas-datareader
  Attempting uninstall: pandas-datareader
    Found existing installation: pandas-datareader 0.9.0
    Uninstalling pandas-datareader-0.9.0:
      Successfully uninstalled pandas-datareader-0.9.0
Successfully installed pandas-datareader-0.10.0


In [48]:
# Creates the LSTM model to make predictions
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_datareader as pdr
from datetime import date
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout

# parameters
ticker = "DOW" # stock ticker
years = 10 # years of stock history to get
split = 0.8 # what percent of dataset will be training
lookback = 100 # number of previous days the algorithm uses per training iteration
epochs = 70 
batchsize = 1

# set dates
end_date = date.today()
start_date = end_date - relativedelta(years=years)

# function to help create the datasets
# for features (x), appends the last <lookback> prices
# for labels (y), appends the next price
def create_dataset(df):
    x = []
    y = []
    for i in range(lookback, df.shape[0]):
        x.append(df[i-lookback:i, 0])
        y.append(df[i, 0])
    x = np.array(x)
    y = np.array(y)
    return x,y

# flattens a list of lists into 1 list
def flatten(l):
    return [item for sublist in l for item in sublist]

# Request data via Yahoo public API
print(f"Gathering data from {start_date} to {end_date}")
df = pdr.get_data_yahoo(ticker, start=start_date, end=end_date)
print(df)

# upload files and read csv (optional)
# df = files.upload()
# df = pd.read_csv(df)

# set dimensions of dataset
df = df['Open'].values
df = df.reshape(-1, 1)

# save clean data before configuring it for model training
dataset = np.array(df)
dataset_train = np.array(df[:int(df.shape[0]*split)])
dataset_test = np.array(df[int(df.shape[0]*split):])

# split the data into training and testing sets
# training set is taking 1st 80% of data points (Oldest 80% of data points)
# test set is taking last 20% of data points minus the lookback (Most recent 20% of data points)
training_input = dataset_train
test_input = dataset[ len(dataset_train) - lookback : ]


# scale data between 0 and 1
scaler = MinMaxScaler(feature_range=(0,1))
training_input = scaler.fit_transform(training_input)
test_input = scaler.transform(test_input)

# create datasets using function
# train is training set data
# test is data to test model on
# x is a chunk of the dataset where each position in the array contains the last 50 datapoints
# y is normal, its just all the y values of the dataset
# x dataset's format looks like this: [[50 values], [50 values],..., [50 values], [50 values]]
# y dataset's format looks like this: [value1, value2, value3, value4,..., valueX]
x_train, y_train = create_dataset(training_input)
x_test, y_test = create_dataset(test_input)

# initialize model as a sequential one with 96 units in the output’s dimensionality
# use return_sequences=True to make the LSTM layer with three-dimensional input and input_shape to shape our dataset
# Making the dropout fraction 0.2 drops 20% of the layers
# Finally add a dense layer with a value of 1 because we want to output one value
model = Sequential()
model.add(LSTM(units=96, return_sequences=True, input_shape=(x_train.shape[1], 1)))
model.add(Dropout(0.2))
model.add(LSTM(units=96,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=96,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=96))
model.add(Dropout(0.2))
model.add(Dense(units=1))

# reshape data into 3d array for LSTM because it is sequential_3 which is expecting 3 dimensions, not 2
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

# now compile the model
# used loss='mean_squared_error' because it is a regression problem
# use the adam optimizer to update network weights iteratively based on training data
model.compile(loss='mean_squared_error', optimizer='adam')

# Save model and start training!
# Every epoch refers to one cycle through the full training dataset
# batch size refers to the number of training examples utilized in one iteration
model.fit(x_train, y_train, epochs=epochs, batch_size=batchsize)
model.save('stock_prediction.h5')

Gathering data from 2012-08-06 to 2022-08-06
                 High        Low       Open      Close     Volume  Adj Close
Date                                                                        
2019-03-20  53.500000  49.500000  52.750000  49.799999  2350800.0  41.921730
2019-03-21  50.000000  48.200001  49.990002  48.980000  1764700.0  41.231449
2019-03-22  49.950001  48.160000  48.799999  48.599998   844700.0  40.911568
2019-03-25  49.400002  48.000000  48.599998  49.150002   440900.0  41.374557
2019-03-26  49.750000  48.180000  49.000000  48.849998   504700.0  41.122017
...               ...        ...        ...        ...        ...        ...
2022-08-01  52.599998  51.430000  52.580002  52.459999  5913400.0  52.459999
2022-08-02  52.669998  51.540001  52.299999  51.549999  5797800.0  51.549999
2022-08-03  52.000000  51.335999  52.000000  51.490002  5139300.0  51.490002
2022-08-04  51.700001  50.860001  51.310001  50.990002  6963700.0  50.990002
2022-08-05  51.770000  50.73000

In [102]:
# Visualize the model plotted against test dataset
import plotly.graph_objects as go

# load fitted model
model = load_model('stock_prediction.h5')

# predict and rescale data 
test_predictions = model.predict(x_test)
test_predictions = scaler.inverse_transform(test_predictions)
y_test_scaled = scaler.inverse_transform(y_test.reshape(-1, 1))

# visualize data
# plot training set
trace1 = go.Scatter(
    x = list(np.arange(0, len(dataset_train))),
    y = dataset_train.reshape(-1),
    mode = 'lines',
    name = 'Data'
)
# plot test predictions
trace2 = go.Scatter(
    x = list(np.arange(len(dataset_train), len(dataset))),
    y = test_predictions.reshape(-1),
    mode = 'lines',
    name = 'Prediction'
)
# plot test dataset
trace3 = go.Scatter(
    x = list(np.arange(len(dataset_train), len(dataset))),
    y = dataset_test.reshape(-1),
    mode='lines',
    name = 'Ground Truth'
)
layout = go.Layout(
    title = "Model Test",
    xaxis = {'title' : "Days"},
    yaxis = {'title' : "Open"}
)
fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
fig.show()

In [103]:
## Calculate Errors
import math

# flattens a list of lists into 1 list
def flatten(l):
    return [item for sublist in l for item in sublist]

def mean_absolute_error(actual, predicted):
  return abs(sum(flatten(actual)) - sum(flatten(predicted))) / predicted.size

def mean_squared_error(actual, predicted):
  return math.sqrt(abs(sum(flatten(y_test_scaled)) - sum(flatten(predicted)))**2 / predicted.size)

def mean_absolute_percentage_error(actual, predicted):
  error = sum(abs(actual - predicted) / abs(actual)) / predicted.size * 100
  return error[0]

# calculate Mean Absolute Error
MAE = mean_absolute_error(y_test_scaled, test_predictions)
print(f"Mean Absolute Error: {MAE}")

# calculate Mean Squared Error
MSE = mean_squared_error(y_test_scaled, test_predictions)
print(f"Mean Squared Error: {MSE}")

# calculate Mean Absolute Percentage Error
MAPE = mean_absolute_percentage_error(y_test_scaled, test_predictions)
print(f"Mean Absolute Percentage Error: {MAPE}")

# calculate Mean Absolute Scaled Error
# compares current model error with previous model error by a ratio (<1 means new model better, >1 means old model better)
# add this code when the time comes to make models more precise
# MASE = MAE_current / MAE_previous


Mean Absolute Error: 0.32514250905890213
Mean Squared Error: 4.251790017711038
Mean Absolute Percentage Error: 2.5762483799157496


In [104]:
# Make predictions

def predict(num_predictions: int, model, data, lookback: int ):
  """Gets and prints the spreadsheet's header columns

  Parameters
  ----------
  num_predictions : int
    The number of predictions to make
  model
    the model to use to make predictions
  data: ndarray
    dataset used to create the model in 2d format (<datapoints>, 1)
    must contain atleast <lookback> datapoints in its 1st dimension
  lookback: int
    number of previous days the model uses per prediction

  Returns
  -------
  prediction_list
      a (<num_predictions>, 1) shape array containing predictions
  """
  prediction_list = data

  for _ in range(num_predictions):
    x = prediction_list[ -lookback : ]
    x = np.reshape(x, (x.shape[1], x.shape[0], 1))
    out = model.predict(x)
    prediction_list = np.append(prediction_list, out, axis = 0)

  return prediction_list[ -num_predictions - 1: ]

# load fitted model
model = load_model('stock_prediction.h5')

# set number of days out do predict
num_predictions = 30

# create predictions:
# make each next day prediction based on the model and
# using the last <lookback> data points
#   1. make nparray to contain predictions
#   2. fill the array with <lookback> data points from dataset
#   3. scale data with same scaler from the model
#   4. for each prediction:
#     a. reshape input data to be 3d, (1, <lookback>, 1)
#     b. make next prediction using model
#     c. add output prediction to the end of the predictions array
#   5. shorten output array to just future predictions
#   6. rescale

# 1,2
predictions = np.array(dataset[ -lookback : ])
# 3
predictions = scaler.transform(predictions)
# 4,5
predictions = predict(num_predictions, model, predictions, lookback)
# 6
predictions = scaler.inverse_transform(predictions)

# visualize data
# x and y inputs must be 1 dimensional
# plot full dataset
trace1 = go.Scatter(
    x = list(np.arange(0, len(dataset))),
    y = dataset.reshape(-1),
    mode = 'lines',
    name = 'Data'
)
# plot predictions
trace2 = go.Scatter(
    x = list(np.arange(len(dataset) - 1, len(dataset) + num_predictions)),
    y = predictions.reshape(-1),
    mode = 'lines',
    name = 'Prediction'
)
# plot test predictions
trace3 = go.Scatter(
    x = list(np.arange(len(dataset_train), len(dataset))),
    y = test_predictions.reshape(-1),
    mode = 'lines',
    name = 'Test Predictions'
)
layout = go.Layout(
    title = "Predictions",
    xaxis = {'title' : "Days"},
    yaxis = {'title' : "Open"}
)
fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
fig.show()

In [None]:
# Calculate Prediction Intervals
#
# A prediction interval for a single future observation is an interval
# that will, with a specified degree of confidence, contain a future
# randomly selected observation from a distribution.

In [None]:
# Calculate Confidence Intervals