# Imports

In [1]:
import os

import numpy as np
import pandas as pd
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras.layers import Dense, LSTM, Dropout
from keras.models import Sequential
from keras.utils import plot_model
from pandas import read_csv
from tqdm import tqdm  # For the loading bar effect

Using TensorFlow backend.


# Global Constants

In [2]:
TRAINING_DIRECTORY = "./trainingData/"  # Remember to add a "/" at the end
TESTING_DIRECTORY = "./testData/"  # Remember to add a "/" at the end
OUTPUT_DIRECTORY = "./output/"  # Remember to add a "/" at the end

NO_ENTRIES_TAKING_AVG = 10  # No entries to consider when taking average
FORECAST_STEP = 5  # No entries to predict ahead of time

NO_EPOCHS = 100
VAL_SPLIT = 0.2
DROPOUT_RATE = 0.1

MODEL_PLOT_NAME = "Forecaster"  # A ".png" will be appended to this at the end
MODEL_OUTPUT_NAME = f"Forecaster_FC{FORECAST_STEP}"  # Name of the model's file (without .h5)
MODEL_CHECKPOINT_NAME = f"ForecasterCP_FC{FORECAST_STEP}"  # Name of the model's checkpoint file (without .hdf5)


# 1. Obtaining Data

In [3]:
# FUNCTIONS
def get_data_values(stock_directory, root_directory):
    max_stock_price = -float("inf")
    min_stock_price = float("inf")
    
    stock_symbol = stock_directory
    stock_dir = root_directory + stock_symbol + "/"

    # Load stock prices from the CSV file
    stock_data = read_csv(stock_dir + stock_symbol + ".csv", header=0, squeeze=True)

    # Drop the columns "High", "Low", "Open", "Adj Close" and "Volume", i.e. only leave the "Date" and "Close" column.
    stock_data = stock_data.drop(stock_data.columns[list(range(1, 4)) + list(range(5, 7))], axis=1)

    # Load sentiment data
    sentiment_data = read_csv(stock_dir + stock_symbol + "_Sentiments.csv", header=0, squeeze=True)

    # Get common dates
    common_dates = list(np.intersect1d(sentiment_data["Date"], stock_data["Date"]))
    combined_dataframe = pd.DataFrame(columns=["Close", "Sentiment"])

    # Get common data and save
    common_dates_index = 0

    temp_stock_values = []
    temp_saved_prices_indexes = []

    for index in range(stock_data.shape[0]):
        temp_stock_values.append(stock_data.iloc[index]["Close"])

        # Save values with date in the commonDates array to the combinedDataframe
        if stock_data.iloc[index]["Date"] in common_dates:
            # Get the stock value
            stock_val = stock_data.iloc[index]["Close"]  # Close value

            # Save this stock value's index to savedPricesIndexes
            temp_saved_prices_indexes.append(index)

            # Check and compare with maximum / minimum stock values
            if stock_val < min_stock_price:
                min_stock_price = stock_val

            if stock_val > max_stock_price:
                max_stock_price = stock_val

            # Assign values to the combined dataframe
            combined_dataframe.at[common_dates_index, "Close"] = stock_val
            combined_dataframe.at[common_dates_index, "Sentiment"] = sentiment_data.iloc[common_dates_index]["Sentiment"]  # Sentiment value

            # Increment commonDatesIndex by 1
            common_dates_index += 1

    return combined_dataframe, temp_stock_values, temp_saved_prices_indexes, max_stock_price, min_stock_price

# Get all training data
trainingDirs = next(os.walk(TRAINING_DIRECTORY))[1]  # All the subdirectories

trainingDFs = []
stockValues = []
savedPricesIndexes = []

maxStockPrice = -float("inf")
minStockPrice = float("inf")

for directory in tqdm(trainingDirs, desc="Getting stock and sentiment files"):
    combinedDataframe, tempStockValues, tempSavedPricesIndexes, maxi, mini = get_data_values(directory, TRAINING_DIRECTORY)
    
    trainingDFs.append(combinedDataframe)
    stockValues.append(tempStockValues)
    savedPricesIndexes.append(tempSavedPricesIndexes)

    maxStockPrice = maxi if maxi > maxStockPrice else maxStockPrice
    minStockPrice = mini if mini < minStockPrice else minStockPrice


Getting stock and sentiment files: 100%|██████████| 1/1 [00:03<00:00,  3.93s/it]


# 2. Preprocessing

In [4]:
def gen_x_and_y(training_dataframes):
    x = []  # This is our training list
    y = []  # This is our resultant list

    for index, stock_dataframe in enumerate(training_dataframes):
        # Convert dataframe to np.array
        # NOTE: The first element represents the price, while the second represents the sentiment
        stock_arr = stock_dataframe.values

        # Init lists to store data
        stock_prices = []
        sentiment_scores = []

        for entry in stock_arr:  # Each row is a new entry
            # Append the original price to stockPrices
            stock_prices.append(entry[0])

            # Append the sentiment score to sentimentScores
            sentiment_scores.append(entry[1])

        # Calculate moving average for stock prices
        stock_ma = []
        for j in range(len(stock_prices)):
            stock_ma.append(sum(stock_prices[j:j + NO_ENTRIES_TAKING_AVG]) / NO_ENTRIES_TAKING_AVG)

        # Calculate moving average for sentiments
        sentiment_ma = []
        for j in range(len(sentiment_scores)):
            sentiment_ma.append(sum(sentiment_scores[j:j + NO_ENTRIES_TAKING_AVG]) / NO_ENTRIES_TAKING_AVG)

        # Gather future prediction
        last_n_unavailable = 0

        for j in savedPricesIndexes[index]:
            try:
                # Calculate normalized values
                normalised_stock = (stockValues[index][j + FORECAST_STEP] - minStockPrice) / (maxStockPrice - minStockPrice)
                y.append([normalised_stock])  # Must follow the same shape

            except IndexError:
                last_n_unavailable += 1  # Increment the number of elements which cannot be found

        # Gather sentiment and stock value
        for j in range(len(sentiment_ma) - last_n_unavailable):
            # Calculate normalized values
            normalised_stock = (stock_ma[j] - minStockPrice) / (maxStockPrice - minStockPrice)
            normalised_sentiment = (sentiment_ma[j] + 1) / 2

            x.append([normalised_stock, normalised_sentiment])
    
    return np.array(x).reshape((np.array(x).shape[0], 1, 2)), np.array(y)

X_train, Y_train = gen_x_and_y(trainingDFs)

print("Will train model on {} data points.".format(X_train.shape[0]))

Will train model on 117 data points.


# 3. Model Creation & Training

In [5]:
# Base model
model = Sequential()
model.add(LSTM(32, input_shape=(1,2), return_sequences=True))
model.add(Dropout(DROPOUT_RATE))
model.add(LSTM(16, return_sequences=True))
model.add(Dropout(DROPOUT_RATE))
model.add(LSTM(8, return_sequences=True))
model.add(Dropout(DROPOUT_RATE))
model.add(LSTM(4, return_sequences=True))
model.add(Dropout(DROPOUT_RATE))
model.add(LSTM(2))
model.add(Dropout(DROPOUT_RATE))
model.add(Dense(1))

# Loss Function
model.compile(loss="mse", optimizer='adam', metrics=["mae"])

# Callbacks
checkpoint_maker = ModelCheckpoint(monitor="val_loss",
                                   filepath=OUTPUT_DIRECTORY + MODEL_CHECKPOINT_NAME + ".hdf5",
                                   verbose=1, save_best_only=True)
stop_early = EarlyStopping(monitor="val_loss", patience=80, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor="val_loss", patience=5, verbose=1, factor=0.85)  # Reduce learning rate if there is no improvement.

# Save model image
plot_model(model, OUTPUT_DIRECTORY + MODEL_PLOT_NAME + ".png", show_layer_names=True, show_shapes=True)

In [6]:
model.fit(X_train, Y_train, epochs=NO_EPOCHS, verbose=1, validation_split=VAL_SPLIT, callbacks=[checkpoint_maker, stop_early, reduce_lr], shuffle=False)

Train on 93 samples, validate on 24 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.49698, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 2/100

Epoch 00002: val_loss improved from 0.49698 to 0.49139, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 3/100

Epoch 00003: val_loss improved from 0.49139 to 0.48582, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 4/100

Epoch 00004: val_loss improved from 0.48582 to 0.48021, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 5/100

Epoch 00005: val_loss improved from 0.48021 to 0.47460, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 6/100

Epoch 00006: val_loss improved from 0.47460 to 0.46901, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 7/100

Epoch 00007: val_loss improved from 0.46901 to 0.46339, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 8/100

Epoch 00008: val_loss improved from 0.46339 to 0.45778, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 9/100

Epoch 00009: val_lo


Epoch 00031: val_loss improved from 0.31863 to 0.31043, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 32/100

Epoch 00032: val_loss improved from 0.31043 to 0.30199, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 33/100

Epoch 00033: val_loss improved from 0.30199 to 0.29327, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 34/100

Epoch 00034: val_loss improved from 0.29327 to 0.28435, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 35/100

Epoch 00035: val_loss improved from 0.28435 to 0.27526, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 36/100

Epoch 00036: val_loss improved from 0.27526 to 0.26601, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 37/100

Epoch 00037: val_loss improved from 0.26601 to 0.25669, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 38/100

Epoch 00038: val_loss improved from 0.25669 to 0.24735, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 39/100

Epoch 00039: val_loss improved from 0.24735 to 0.23811, saving 


Epoch 00061: val_loss improved from 0.14174 to 0.14128, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 62/100

Epoch 00062: val_loss improved from 0.14128 to 0.14079, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 63/100

Epoch 00063: val_loss improved from 0.14079 to 0.14044, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 64/100

Epoch 00064: val_loss improved from 0.14044 to 0.14006, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 65/100

Epoch 00065: val_loss improved from 0.14006 to 0.13988, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 66/100

Epoch 00066: val_loss improved from 0.13988 to 0.13975, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 67/100

Epoch 00067: val_loss improved from 0.13975 to 0.13972, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 68/100

Epoch 00068: val_loss improved from 0.13972 to 0.13963, saving model to ./output/ForecasterCP_FC5.hdf5
Epoch 69/100

Epoch 00069: val_loss did not improve from 0.13963
Epoch 70/100


Epoch 00095: val_loss did not improve from 0.13963
Epoch 96/100

Epoch 00096: val_loss did not improve from 0.13963
Epoch 97/100

Epoch 00097: val_loss did not improve from 0.13963
Epoch 98/100

Epoch 00098: val_loss did not improve from 0.13963

Epoch 00098: ReduceLROnPlateau reducing learning rate to 0.00037714955396950245.
Epoch 99/100

Epoch 00099: val_loss did not improve from 0.13963
Epoch 100/100

Epoch 00100: val_loss did not improve from 0.13963


<keras.callbacks.History at 0x10e8a2eb8>

In [7]:
model.save(OUTPUT_DIRECTORY + MODEL_OUTPUT_NAME + ".h5")

# 4. Model Evaluation

In [8]:
# Get all testing data
testingDirs = next(os.walk(TESTING_DIRECTORY))[1]  # All the subdirectories

testingDFs = []
stockValues = []
savedPricesIndexes = []

maxStockPrice = -float("inf")
minStockPrice = float("inf")

for directory in tqdm(testingDirs, desc="Getting testing stock and sentiment files"):
    combinedDataframe, tempStockValues, tempSavedPricesIndexes, maxi, mini = get_data_values(directory, TESTING_DIRECTORY)
    
    testingDFs.append(combinedDataframe)
    stockValues.append(tempStockValues)
    savedPricesIndexes.append(tempSavedPricesIndexes)

    maxStockPrice = maxi if maxi > maxStockPrice else maxStockPrice
    minStockPrice = mini if mini < minStockPrice else minStockPrice

Getting testing stock and sentiment files: 100%|██████████| 2/2 [00:01<00:00,  1.54it/s]


In [9]:
X_test, Y_test = gen_x_and_y(testingDFs)
print("Will test model on {} data points.".format(X_test.shape[0]))


Will test model on 263 data points.


In [10]:
X_test.shape

(263, 1, 2)

In [11]:
totalError = 0
highestPercentageError = -float("inf")
lowestPercentageError = float("inf")

for i in range(X_test.shape[0]):
    Y_pred = model.predict(np.array([X_test[i]]))
    percentageError = (abs(Y_pred[0][0] - Y_test[i][0]) / Y_test[i][0]) * 100
    
    print(f"{Y_test[i][0]:.10f} {Y_pred[0][0]:.10f} {percentageError:.10f}%")
    
    highestPercentageError = percentageError if highestPercentageError < percentageError else highestPercentageError
    lowestPercentageError = percentageError if lowestPercentageError > percentageError else lowestPercentageError
    
    totalError += percentageError

print()
print(f"Mean Percentage Error: {totalError / X_test.shape[0]:.10f}%")
print(f"Highest Percentage Error: {highestPercentageError:.10f}%")
print(f"Lowest Percentage Error: {lowestPercentageError:.10f}%")
print(f"Standard deviation: {max(abs(lowestPercentageError - totalError / X_test.shape[0]), abs(highestPercentageError - totalError / X_test.shape[0])):5g}%")

0.0249521341 0.3723124564 1392.1066679270%
0.0277246104 0.3740393221 1249.1238164377%
0.0232312831 0.3769199252 1522.4671021545%
0.0230401167 0.3780537844 1540.8501276625%
0.0553536663 0.3788244426 584.3710050972%
0.0387188848 0.3802673221 882.1236437435%
0.0434033785 0.3817211688 779.4734004312%
0.0429254148 0.3849523664 796.7935855619%
0.0372848407 0.3883454800 941.5640052075%
0.0476098989 0.3908050358 720.8482785731%
0.0558317065 0.3940671384 605.8124548978%
0.1061185369 0.3965035081 273.6420797354%
0.0998087466 0.4010450244 301.8135062368%
0.0900573500 0.4030264914 347.5220416562%
0.1180688247 0.4036943018 241.9143901043%
0.2062141573 0.4048881531 96.3435286565%
0.2411089404 0.4085056186 69.4278187393%
0.3505736248 0.4106309414 17.1311565761%
0.3485659095 0.4124384820 18.3243888077%
0.3925430261 0.4149697423 5.7131867537%
0.4263861839 0.4167949855 2.2494158450%
0.4237093290 0.4195920527 0.9717218904%
0.4186424165 0.4202520251 0.3844829253%
0.4836520110 0.4232425988 12.4902638411%
0

0.4586042555 0.4094347656 10.7215511743%
0.4617591506 0.4069552422 11.8685051282%
0.5086042216 0.4026439190 20.8335475984%
0.5075525867 0.4011756480 20.9588014143%
0.3882409512 0.4000574648 3.0436031065%
0.4057361917 0.3998885751 1.4412361383%
0.2753345839 0.3974028826 44.3345317970%
0.2680687898 0.3977864683 48.3896982453%
0.2069789279 0.3965130448 91.5716970685%
0.2362332804 0.3850357234 62.9896189284%
0.2526768095 0.3708152175 46.7547489709%
0.1458890565 0.3560692370 144.0685034668%
0.2218929350 0.3413960040 53.8561847496%
0.3997131988 0.3246702850 18.7741896094%
0.3845124709 0.3066940904 20.2381941958%

Mean Percentage Error: 77.4081815675%
Highest Percentage Error: 1540.8501276625%
Lowest Percentage Error: 0.1580820043%
Standard deviation: 1463.44%
