<a href="https://colab.research.google.com/github/Roflz/stock-prediction/blob/main/predict_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:

# Installs/upgrades packages needed for running stock prediction program using Google Colab
# After installing/upgrading, Go to Runtime -> Restart runtime to apply changes

!pip install --upgrade pandas-datareader

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_datareader as pdr
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout
from google.colab import files

# function to help create the datasets
# for features (x), appends the last 50 prices
# for labels (y), appends the next price
def create_dataset(df):
    x = []
    y = []
    for i in range(50, df.shape[0]):
        x.append(df[i-50:i, 0])
        y.append(df[i, 0])
    x = np.array(x)
    y = np.array(y)
    return x,y

# flattens a list of lists into 1 list
def flatten(l):
    return [item for sublist in l for item in sublist]

# Request data via Yahoo public API
# currently reads in 5 years of data as default
df = pdr.get_data_yahoo('NVDA')
print(df)

# upload files and read csv (optional)
# df = files.upload()
# df = pd.read_csv(df)

# get number of trading days
df.shape

# set dimensions of dataset
df = df['Open'].values
df = df.reshape(-1, 1)

# split the data into training and testing sets
# training set is taking 1st 20% of data points (Oldest 20% of data points)
# test set is taking last 80% of data points (Most recent 80% of data points)
dataset_train = np.array(df[:int(df.shape[0]*0.8)])
dataset_test = np.array(df[int(df.shape[0]*0.8):])

# scale data between 0 and 1
scaler = MinMaxScaler(feature_range=(0,1))
dataset_train = scaler.fit_transform(dataset_train)
dataset_test = scaler.transform(dataset_test)

# create datasets using function
x_train, y_train = create_dataset(dataset_train)
x_test, y_test = create_dataset(dataset_test)

# initialize model as a sequential one with 96 units in the output’s dimensionality
# use return_sequences=True to make the LSTM layer with three-dimensional input and input_shape to shape our dataset
# Making the dropout fraction 0.2 drops 20% of the layers
# Finally add a dense layer with a value of 1 because we want to output one value
model = Sequential()
model.add(LSTM(units=96, return_sequences=True, input_shape=(x_train.shape[1], 1)))
model.add(Dropout(0.2))
model.add(LSTM(units=96,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=96,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=96))
model.add(Dropout(0.2))
model.add(Dense(units=1))

# reshape data into 3d array for LSTM because it is sequential_3 which is expecting 3 dimensions, not 2
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

# now compile the model
# used loss='mean_squared_error' because it is a regression problem
# use the adam optimizer to update network weights iteratively based on training data
model.compile(loss='mean_squared_error', optimizer='adam')

# Save model and start training!
# Every epoch refers to one cycle through the full training dataset
# batch size refers to the number of training examples utilized in one iteration
model.fit(x_train, y_train, epochs=50, batch_size=32)
model.save('stock_prediction.h5')

                  High         Low        Open       Close       Volume  \
Date                                                                      
2017-08-07   43.092499   42.000000   42.097500   43.087502   71741200.0   
2017-08-08   43.639999   42.177502   43.472500   42.575001   75533600.0   
2017-08-09   43.052502   41.917500   42.107498   43.027500   53078400.0   
2017-08-10   43.165001   41.082500   43.040001   41.185001  132808800.0   
2017-08-11   39.750000   38.227501   39.285000   38.990002  149719600.0   
...                ...         ...         ...         ...          ...   
2022-07-28  181.399994  174.399994  179.750000  179.839996   47464600.0   
2022-07-29  182.440002  176.919998  178.130005  181.630005   43520200.0   
2022-08-01  188.460007  179.899994  181.820007  184.410004   47646900.0   
2022-08-02  189.380005  180.919998  181.220001  185.259995   48952700.0   
2022-08-03  189.679993  181.369995  181.839996  188.929993   41773400.0   

             Adj Close  

In [None]:
# load fitted model
model = load_model('stock_prediction.h5')

# visualize data
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
y_test_scaled = scaler.inverse_transform(y_test.reshape(-1, 1))

fig, ax = plt.subplots(figsize=(16,8))
ax.set_facecolor('#000041')
ax.plot(y_test_scaled, color='red', label='Original price')
plt.plot(predictions, color='cyan', label='Predicted price')
plt.legend()

In [63]:
import math

# flattens a list of lists into 1 list
def flatten(l):
    return [item for sublist in l for item in sublist]

def mean_absolute_error(actual, predicted):
  return abs(sum(flatten(actual)) - sum(flatten(predicted))) / predictions.size

def mean_squared_error(actual, predicted):
  return math.sqrt(abs(sum(flatten(y_test_scaled)) - sum(flatten(predictions)))**2 / predictions.size)

def mean_absolute_percentage_error(actual, predicted):
  error = sum(abs(actual - predicted) / abs(actual)) / predictions.size * 100
  return error[0]

# calculate Mean Absolute Error
MAE = mean_absolute_error(y_test_scaled, predictions)
print(f"Mean Absolute Error: {MAE}")

# calculate Mean Squared Error
MSE = mean_squared_error(y_test_scaled, predictions)
print(f"Mean Squared Error: {MSE}")

# calculate Mean Absolute Percentage Error
MAPE = mean_absolute_percentage_error(y_test_scaled, predictions)
print(f"Mean Absolute Percentage Error: {MAPE}")

# calculate Mean Absolute Scaled Error
# compares current model error with previous model error by a ratio (<1 means new model better, >1 means old model better)
# add this code when the time comes to make models more precise
MASE = MAE_current / MAE_previous


Mean Absolute Error: 16.447479323585434
Mean Squared Error: 233.76260259535445
Mean Absolute Percentage Error: 8.538594516407883
