In [11]:
"""
Description: This program recurrent neural network called Long Short Term Memory
             (LSTM) to predict the next days closing stock price of a selected Stock using the
             past 30 day closing stock price.
"""

#Import the libraries
import math
import pandas_datareader as web
import numpy as np
import plotly.graph_objs as go

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_percentage_error
from keras.models import Sequential,load_model
from keras.layers import Dense, LSTM
from datetime import date,timedelta



In [12]:
# Set stock ticker
stock_ticker = 'AAPL'
# Get current date
today = date.today().strftime("%Y-%m-%d")
# Get tomorrows date
tomorrow = (date.today() + timedelta(days=1)).strftime("%Y-%m-%d")

# Get the past stock ticker values (between start and end dates)
stock_data_frame = web.DataReader(stock_ticker, data_source='yahoo', start ='2014-01-01', end=today)
# Show the data
stock_data_frame


Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-02,19.893929,19.715000,19.845715,19.754642,234684800.0,17.516605
2014-01-03,19.775000,19.301071,19.745001,19.320715,392467600.0,17.131842
2014-01-06,19.528570,19.057142,19.194643,19.426071,412610800.0,17.225264
2014-01-07,19.498571,19.211430,19.440001,19.287144,317209200.0,17.102070
2014-01-08,19.484285,19.238930,19.243214,19.409286,258529600.0,17.210381
...,...,...,...,...,...,...
2022-01-24,162.300003,154.699997,160.020004,161.619995,162706700.0,161.619995
2022-01-25,162.759995,157.020004,158.979996,159.779999,115798400.0,159.779999
2022-01-26,164.389999,157.820007,163.500000,159.690002,108275300.0,159.690002
2022-01-27,163.839996,158.279999,162.449997,159.220001,116691400.0,159.220001


In [13]:
# Filter data frame to contain only Closing Stock Values
closing_data = stock_data_frame.filter(['Close'])

# Convert the dataframe to a numpy array
close_dataset = closing_data.values

# Set the training data set to 90% of original data values
training_data_length = math.ceil(len(close_dataset) *0.90)


In [14]:
# Scale the data (Normalise Data) - This creates new values that maintain the general distribution
# and ratios of the source data, while keeping values within a scale applied across all numeric
# columns used in the model.

# set the scaler to scale values between 0 and 1
scaler = MinMaxScaler(feature_range=(0,1))

# scale stock closing price into the values between 0 and 1
scaled_data = scaler.fit_transform(close_dataset)


In [15]:
# Create the training data sets

# Create the scaled training data set from the first 90% of overall data
scaled_training_data = scaled_data[0:training_data_length, :]

# Split the data into independent and dependent training variables

# independent training variables
training_set_a = []

# dependent training variables (value to predict)
training_set_b = []

# Populate training sets
# The model uses past 30 data values(training_set_a) to predict each training_set_b value

for i in range(30,len(scaled_training_data)):
    # Append past 30 values to the training_set_a data set9
    training_set_a.append(scaled_training_data[i-30:i,0])
    # append 31st value that the model is to predict
    training_set_b .append(scaled_training_data[i,0])
    # print data just to check the values
    if i <= 30:
        print(training_set_a)
        print(training_set_b )
        print()


[array([0.01160666, 0.00896335, 0.00960514, 0.00875885, 0.00950289,
       0.00799304, 0.00721419, 0.00782118, 0.01014033, 0.01252693,
       0.01185033, 0.0088959 , 0.01072339, 0.01125423, 0.01227021,
       0.01007071, 0.01103448, 0.00146199, 0.00021103, 0.        ,
       0.00017839, 0.00038072, 0.00196018, 0.00278691, 0.0027695 ,
       0.00432938, 0.00635484, 0.00787122, 0.0078625 , 0.00971392])]
[0.009618184520078371]



In [16]:
# Convert training_set_a and training_set_b data sets to numby arrays
# So that they can be used by the LSTM
training_set_a,training_set_b = np.array(training_set_a), np.array(training_set_b )

In [17]:
#Reshape the data: Since LSTM expects 3d data model set (currently is 2d: training_set_a and training_set_b)
# The LSTM expected input is: number of samples, number of steps and number of features
# training_set_a.shape[0] - number of data rows(samples)
# training_set_a.shape[1] - number of columns( number of steps) i.e last 60 days used to predict next day price
# number of features = 1 since we want only one predicted price

training_set_a = np.reshape(training_set_a,(training_set_a.shape[0],training_set_a.shape[1],1))

# check reshaped data set
training_set_a.shape


(1801, 30, 1)

In [18]:
#Build the LSTM model
model = Sequential()

# Add layers to the model
# First LSTM layer with 100 neurons that takes input and returns sequences into another LSTM layer
model.add(LSTM(100,return_sequences=True,input_shape=(training_set_a.shape[1],1)))
# Second and last LSTM layer with 50 neurons, doesnt return sequences
model.add(LSTM(50,return_sequences=False))
# Dense layer with 10 neurons
model.add(Dense(10))
# final layer with the result of prediction
model.add(Dense(1))

In [19]:
# Compile the model, adding the optimizer and loss function
#optimizer: an algorithm/method used to minimize an error functions or to maximize the efficiency of training
#           Main goal is to improve upon the loss function.
# loss function is used to measure how well the model did on training
model.compile(optimizer='adam',loss='mean_squared_error')


In [20]:
# Train the model:
# The batch size is a number of samples processed before the model is updated
# The epoch is the number of complete passes through the training dataset
# validation_split - The part of the data used to validate the model: in this case is 15%
model.fit(training_set_a,training_set_b,batch_size=16,epochs=32,validation_split=0.15)

Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<keras.callbacks.History at 0x1700c379850>

In [21]:
#Create the testing data set that is 10% of overall data
test_data = scaled_data[training_data_length-30: , :]
#Create testing data sets
# testing_set_a will contain first 30 testing values used to predict next value
testing_set_a = []
# testing_set_b will contain the value that is to be predicted
testing_set_b = close_dataset[training_data_length: , :]

for i in range(30,len(test_data)):
    # append past 30 values
    testing_set_a.append(test_data[i-30:i,0])


In [22]:
#Convert the testing_set_a to a numpy array
testing_set_a = np.array(testing_set_a)

#Reshape the testing_set_a into a 3d data model
testing_set_a = np.reshape(testing_set_a,(testing_set_a.shape[0],testing_set_a.shape[1],1))

In [23]:
# Use model to predict the values using testing_set_a data. If the values correspond to the testing_set_b
# then the predictions of the model are accurate.
predictions = model.predict(testing_set_a)
# unscale the values into actual USD values
predictions = scaler.inverse_transform(predictions )


In [24]:
# Check the model accuracy by taking the Root Mean Squared Error (RMSE)
# The lower the RMSE value, the better the fit and accuracy of the model is
rmse = np.sqrt(np.mean(predictions - testing_set_b)**2)
# check the RMSE value
rmse

3.2736844969500463

In [25]:
# Calculate the absolute percentage error for the predictions
mape = mean_absolute_percentage_error(testing_set_b,predictions)
# Check the percentage error
print(mape*100)

2.3235646616118077


In [26]:
# Plot the data
# training data set
training_data_set = closing_data[0:training_data_length]
# validation data set
validation_data_set = closing_data[training_data_length:]
# predicted values data set
validation_data_set['Predictions'] = predictions

print(training_data_set)

# Create graph objects

trainingSet =  go.Scatter(
                x=training_data_set.index,
                y=training_data_set['Close'],
                mode='lines',
                connectgaps=True,
                name = "Training Data"
)
validationSet =  go.Scatter(
                x=validation_data_set.index,
                y=validation_data_set['Close'],
                mode='lines',
                connectgaps=True,
                name = "Actual Values"
)
testingSet =  go.Scatter(
                x=validation_data_set.index,
                y=validation_data_set['Predictions'],
                mode='lines',
                connectgaps=True,
                name = "Predicted Values"
)
# set graph layout
graphLayout = go.Layout(
    title = "Model",
    xaxis = {'title' : "Date"},
    yaxis = {'title' : "Closing Price USD"},
    autosize= False,
    width=900,
    height=600
)
# Create graph based on the above data
model_testing_figure= go.Figure(data=[trainingSet,testingSet,validationSet],layout=graphLayout)
# Display graph
model_testing_figure.show()

                 Close
Date                  
2014-01-02   19.754642
2014-01-03   19.320715
2014-01-06   19.426071
2014-01-07   19.287144
2014-01-08   19.409286
...                ...
2021-04-06  126.209999
2021-04-07  127.900002
2021-04-08  130.360001
2021-04-09  133.000000
2021-04-12  131.240005

[1831 rows x 1 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_data_set['Predictions'] = predictions


In [27]:
# Show the actual and predicted prices
validation_data_set


Unnamed: 0_level_0,Close,Predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-04-13,134.429993,130.353928
2021-04-14,132.029999,131.666946
2021-04-15,134.500000,131.230606
2021-04-16,134.160004,131.751663
2021-04-19,134.839996,132.009262
...,...,...
2022-01-24,161.619995,158.221619
2022-01-25,159.779999,156.722855
2022-01-26,159.690002,155.345520
2022-01-27,159.220001,154.758347


In [28]:
# Use the model to predict tomorrows closing price of a given stock
#Get the stock data
stock_quote = web.DataReader(stock_ticker,data_source='yahoo',start='2014-01-01',end=tomorrow)
# filter the stock data to contain only closing stock price
closing_prices = stock_quote.filter(['Close'])
#Get the last 30 day closing price values
last_30_values = closing_prices[-30:].values
# Scale the data to be values between 0 and 1
last_30_values_scaled = scaler.transform(last_30_values)
# create list and append the past 30 scaled values
data_set = [last_30_values_scaled]
#Convert data_set to a numpy array
data_set = np.array(data_set)
# Reshape the data so it can be used by the model
X_test = np.reshape(data_set,(data_set.shape[0],data_set.shape[1],1))
# Predict the next day closing price
pred_price = model.predict(data_set)
# Undo the scaling
pred_price = scaler.inverse_transform(pred_price)
# check the predicted value
print(pred_price[0])


[160.49709]


In [29]:
# Display stock chart together with the predicted value for tomorrow closing price.

# stock data
datasets =  go.Scatter(
                x=closing_prices.index,
                y=closing_prices['Close'],
                mode='lines',
                connectgaps=True,
                name = "Past Stock Price"
)
# predicted price
predicted_closing_price=  go.Scatter(
                x=[tomorrow],
                y=pred_price[0],
                mode='markers',
                name = "Predicted Next Day Closing Price"
)

#create graph
prediction_figure= go.Figure(data=[datasets,predicted_closing_price],layout=graphLayout)
prediction_figure.show()