# LSTM Model analyzing 60 days of stock data at 5 min intervals

__Important__: Run `Generate_Dataset_for_5min.ipynb` before running this notebook to generate the required local datasets.

# Library Imports
### Python 3.10 with requirements.txt

In [1]:
import numpy as np
import pandas as pd
from os.path import exists
import yaml
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from tslearn.clustering import TimeSeriesKMeans
import tensorflow as tf

2022-12-03 18:11:06.890083: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-03 18:11:07.006133: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-03 18:11:07.009959: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-03 18:11:07.009984: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudar

# Dataset generation

The dataset is downloaded from yfinance and saved in the specified folder. The default is `./5minData`.

In [2]:
# Read the notebook settings from config.yml into the global `config` object,
# which later cells index by key ('path', 'S&P500', 'tickers').
with open('config.yml') as file:
    config = yaml.safe_load(file)

In [3]:
def get_sp500():
    """Return the ticker symbols from Wikipedia's list of S&P 500 companies.

    The page is fetched over the network; the first HTML table that pandas
    parses from it is used and its ``Symbol`` column is returned as a list.
    """
    sp500_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    constituents = pd.read_html(sp500_url)[0]
    return constituents['Symbol'].to_list()

In [4]:
def kmeans_dataset_from_tickers(tickers_list, frame_size=50):
    """Build the clustering dataset from the locally pickled per-ticker data.

    Every ticker that has a ``<config['path']>/<ticker>.pkl`` file is loaded
    once, normalized as ``(df - df.mean()) / df.std()`` over its full length,
    and its first ``frame_size`` rows are kept. Tickers without a file are
    skipped.

    Args:
        tickers_list: Iterable of ticker symbols to look up.
        frame_size: Number of leading rows of each normalized frame used for
            clustering. Defaults to 50, the previously hard-coded value.

    Returns:
        A tuple ``(stacked, kept_tickers, min_len_minus_one)`` where
        ``stacked`` is ``np.stack`` of the per-ticker arrays, ``kept_tickers``
        lists the tickers that had a file (same order as ``stacked``), and
        ``min_len_minus_one`` is the smallest loaded frame length minus one.

    Raises:
        ValueError: If none of the tickers has a pickle file. ``np.stack``
            also raises ``ValueError`` when the per-ticker slices differ in
            shape (e.g. a frame with fewer than ``frame_size`` rows).
    """
    arrays_to_stack = []
    new_t_list = []
    dataset_size_list = []
    for t in tickers_list:
        pkl_path = config['path'] + '/' + t + '.pkl'
        if not exists(pkl_path):
            continue
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were produced locally by the dataset-generation notebook.
        df = pd.read_pickle(pkl_path)
        dataset_size_list.append(len(df))
        new_t_list.append(t)
        # Normalize with statistics of the whole series, then keep the head.
        df = (df - df.mean()) / df.std()
        arrays_to_stack.append(df.iloc[0:frame_size].to_numpy())

    if not arrays_to_stack:
        raise ValueError(
            "No pickled datasets found under '" + str(config['path'])
            + "' for the requested tickers"
        )

    # Use the smallest stock dataset size (minus one) for the model inputs.
    min_frame_size = min(dataset_size_list)
    return np.stack(arrays_to_stack), new_t_list, min_frame_size - 1

In [5]:
def train_kmeans(data):
    """Cluster each feature with plain KMeans and save one scatter plot per feature.

    For every index ``f`` along the last axis of ``data``, the ``data[:, :, f]``
    slice is standardized, KMeans is fitted for k = 1..10, the elbow of the
    inertia curve is located with ``KneeLocator``, and a final KMeans with
    that many clusters is fitted. The first two standardized columns are
    scattered, colored by cluster label, and written to
    ``plots/feature<f>kmeans.png``.

    Args:
        data: 3-D array indexed as ``data[:, :, f]``
            (presumably stocks x time steps x features; confirm with caller).

    Raises:
        ValueError: If no elbow can be located for a feature.
    """
    # The first five colors are the original palette; five more are added so
    # that any elbow in 1..10 can be colored without a lookup failure.
    palette = [
        "#008fd5", "#fc4f30", "#d66cb2", "#772a98", "#e69666",
        "#6d904f", "#8b8b8b", "#e5ae38", "#17becf", "#810f7c",
    ]
    cluster_range = range(1, 11)
    for f in range(data.shape[2]):
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(data[:, :, f])

        # Inertia for each candidate cluster count, used to find the elbow.
        sse = []
        for k in cluster_range:
            model = KMeans(init='random', n_clusters=k, n_init=10, max_iter=300)
            model.fit(scaled_features)
            sse.append(model.inertia_)

        knee = KneeLocator(cluster_range, sse, curve='convex', direction='decreasing')
        if knee.elbow is None:
            raise ValueError('No elbow found in the KMeans inertia curve for feature ' + str(f))

        model = KMeans(n_clusters=knee.elbow)
        model.fit(scaled_features)
        km_colors = [palette[label % len(palette)] for label in model.labels_]

        # Use a fresh figure per feature and close it afterwards; drawing on
        # the implicit pyplot figure would pile every feature onto one plot.
        fig, ax = plt.subplots()
        ax.scatter(scaled_features[:, 0], scaled_features[:, 1], c=km_colors)
        fig.savefig('plots/feature' + str(f) + 'kmeans.png')
        plt.close(fig)

In [6]:
def train_tseries_kmeans(data, tickers_list):
    """Cluster the stocks per feature with soft-DTW TimeSeriesKMeans.

    For every index ``f`` along the last axis of ``data``, the ``data[:, :, f]``
    slice is standardized, ``TimeSeriesKMeans`` is fitted for k = 1..10, the
    elbow of the inertia curve is located with ``KneeLocator``, and a final
    model with that many clusters is fitted and pickled to
    ``saved_models/kmeans<f>``.

    Args:
        data: 3-D array indexed as ``data[:, :, f]``; row ``i`` along axis 0
            must correspond to ``tickers_list[i]``.
        tickers_list: Ticker symbols, in the same order as the rows of ``data``.

    Returns:
        Nested dict ``{feature index: {cluster label: [tickers]}}``. Cluster
        labels are the values taken from ``model.labels_``.

    Raises:
        ValueError: If no elbow can be located for a feature.
    """
    map_stock_to_label = {}
    cluster_range = range(1, 11)
    for f in range(data.shape[2]):
        print('Feature: ' + str(f))
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(data[:, :, f])

        # Inertia for each candidate cluster count, used to find the elbow.
        sse = []
        for k in cluster_range:
            print('Number of Clusters: ' + str(k))
            model = TimeSeriesKMeans(n_clusters=k, metric="softdtw", max_iter=10, n_jobs=-1, verbose=1)
            model.fit(scaled_features)
            sse.append(model.inertia_)

        knee = KneeLocator(cluster_range, sse, curve='convex', direction='decreasing')
        print('Elbow: ' + str(knee.elbow))
        if knee.elbow is None:
            # Fail with a clear message instead of passing None as n_clusters.
            raise ValueError('No elbow found in the soft-DTW inertia curve for feature ' + str(f))

        # Original author's note: grouped to an elbow of 2 on testing.
        model = TimeSeriesKMeans(n_clusters=knee.elbow, metric="softdtw", max_iter=10, n_jobs=-1, verbose=1)
        model.fit(scaled_features)
        model.to_pickle('saved_models/kmeans'+str(f))

        # Group the tickers of this feature by the cluster each row landed in.
        groups = map_stock_to_label.setdefault(f, {})
        for idx, label in enumerate(model.labels_):
            groups.setdefault(label, []).append(tickers_list[idx])
    return map_stock_to_label

# Functions for Plotting Model Outputs

In [7]:
def plot_groups(label_dict):
    """Save one line plot per (feature, group) of the grouped tickers' series.

    For each group, every ticker with a pickle under ``config['path']`` is
    loaded, normalized as ``(df - df.mean()) / df.std()``, and the first 50
    rows of column ``feature`` are drawn on the current pyplot figure. The
    figure is written to ``plots/feature<feature>Group<group>Plot.png`` and
    then closed.

    Args:
        label_dict: Nested dict ``{feature: {group: [tickers]}}``.
    """
    for feature, groups in label_dict.items():
        for group, tickers_list in groups.items():
            for t in tickers_list:
                pkl_path = config['path'] + '/' + t + '.pkl'
                if not exists(pkl_path):
                    continue
                raw = pd.read_pickle(pkl_path)
                normalized = (raw - raw.mean()) / raw.std()
                plt.plot(normalized.to_numpy()[0:50, feature])
                plt.title(f'Group {group} for feature {feature}')
            plt.savefig(f'plots/feature{feature}Group{group}Plot.png')
            plt.close()

In [8]:
def plot_prediction(val_data, prediction, filename, feature):
    """Plot actual vs. predicted series for one feature, one subplot per row.

    Args:
        val_data: Array of actual values, indexed as ``val_data[i, :, feature]``.
        prediction: Array of predicted values, indexed the same way.
        filename: File name (inside ``plots/``) the figure is saved to.
        feature: Index of the feature (last axis) to plot.
    """
    n_rows = val_data.shape[0]
    # squeeze=False always returns a 2-D array of Axes. Without it a single
    # row yields a bare Axes object and indexing it fails.
    fig, axs = plt.subplots(n_rows, figsize=(20, n_rows * 6), squeeze=False)
    for i in range(n_rows):
        ax = axs[i, 0]

        # Actual series in red, model output in green.
        ax.plot(val_data[i, :, feature], color='red', label='Actual value')
        ax.plot(prediction[i, :, feature], color='green', label='Predicted value')
        ax.legend()

        ax.set_title('LSTM Model Grouped by Feature: ' + str(feature))
        ax.set_ylabel('Normalized Value, Feature: ' + str(feature))
        ax.set_xlabel('Samples')

    # Save and close this specific figure rather than whichever is current.
    fig.savefig('plots/' + filename)
    plt.close(fig)

In [9]:
def lstm_dataset_from_mapped_labels(label_dict):
    """Build the per-(feature, group) arrays used to train the LSTM.

    For each group, every ticker with a pickle under ``config['path']`` is
    loaded once, normalized as ``(df - df.mean()) / df.std()``, and extended
    with four time-of-day / time-of-year sine and cosine columns derived from
    its index. The slice ``iloc[-frame_size:-1]`` of each frame is kept,
    where ``frame_size`` is the shortest frame length in the group (so
    ``frame_size - 1`` rows, excluding the final row), and the slices are
    stacked.

    Args:
        label_dict: Nested dict ``{feature: {group: [tickers]}}``.

    Returns:
        Nested dict ``{feature: {group: stacked array}}``.

    Raises:
        ValueError: If a group has no ticker with a pickle file.
    """
    # Periods, in seconds, of the daily and yearly cycles.
    day = 24 * 60 * 60
    year = 365.2425 * day

    data = {}
    for feature in label_dict:
        data[feature] = {}
        for group in label_dict[feature]:
            tickers_list = label_dict[feature][group]

            # Load each available frame exactly once.
            frames = []
            for t in tickers_list:
                pkl_path = config['path'] + '/' + t + '.pkl'
                if exists(pkl_path):
                    frames.append(pd.read_pickle(pkl_path))
            if not frames:
                raise ValueError(
                    'No pickled datasets found for feature ' + str(feature)
                    + ' group ' + str(group)
                )

            # Use the smallest stock dataset size for the model inputs.
            frame_size = min(len(df) for df in frames)

            arrays_to_stack = []
            for df in frames:
                date_time = pd.to_datetime(df.index, format='%Y-%m-%d %H:%M:%S%z')
                timestamp_s = date_time.map(pd.Timestamp.timestamp)
                # Normalize df before adding the time signals back, so the
                # sine/cosine columns keep their [-1, 1] range.
                df = (df - df.mean()) / df.std()
                df['Day sin'] = np.sin(timestamp_s * (2 * np.pi / day))
                df['Day cos'] = np.cos(timestamp_s * (2 * np.pi / day))
                df['Year sin'] = np.sin(timestamp_s * (2 * np.pi / year))
                df['Year cos'] = np.cos(timestamp_s * (2 * np.pi / year))
                arrays_to_stack.append(df.iloc[-frame_size:-1].to_numpy())
            data[feature][group] = np.stack(arrays_to_stack)
    return data

# Model Functions For Training
They take the custom dataset dictionary as an input.

In [20]:
def create_lstm_model(label_dict, frame_size):
    """Build and compile the LSTM used for all groups.

    The feature count is read from the last axis of ``label_dict[0][0]``.
    The network is a 32-unit LSTM followed by a zero-initialized Dense layer
    whose output is reshaped to ``(frame_size // 2, features)``.

    Args:
        label_dict: Nested dict ``{feature: {group: array}}``; only the
            ``[0][0]`` entry is inspected.
        frame_size: Frame length; half of it (integer division) is the
            number of output steps.

    Returns:
        The compiled ``tf.keras.Sequential`` model (Adam optimizer, MSE loss).
    """
    n_features = label_dict[0][0].shape[2]
    n_out_steps = frame_size // 2

    layers = [
        tf.keras.layers.LSTM(32, return_sequences=False),
        tf.keras.layers.Dense(n_out_steps * n_features, kernel_initializer=tf.initializers.zeros()),
        tf.keras.layers.Reshape([n_out_steps, n_features]),
    ]
    lstm = tf.keras.Sequential(layers)
    lstm.compile(optimizer='adam', loss='MSE')
    return lstm

In [21]:
def train_lstm_model(label_dict, model, frame_size):
    """Fit ``model`` on every sufficiently large group, saving model and plot.

    Groups with fewer than 10 rows along axis 0 are skipped. For the others
    the rows are split 80 / 10 / 10 into train / validation / test, the first
    ``frame_size // 2`` steps are the input and the next ``frame_size // 2``
    steps are the target. After fitting (at most 50 epochs, early stopping on
    ``val_loss`` with patience 4) the model is saved to
    ``saved_models/lstm<feature><group>`` and the test predictions are
    plotted to ``plots/lstmFeature<feature>Group<group>.png``.

    NOTE(review): the same ``model`` object is fitted for each group in turn,
    so each saved model presumably continues from the weights left by the
    previous groups. Confirm this is intended.

    Args:
        label_dict: Nested dict ``{feature: {group: array}}``.
        model: Compiled Keras model to fit.
        frame_size: Frame length used to derive the input/target window size.
    """
    max_epochs = 50
    in_size = frame_size // 2

    for feature, groups in label_dict.items():
        for group, data in groups.items():
            print('Feature: ' + str(feature) + ' Group: ' + str(group))
            samples = data.shape[0]
            if samples < 10:
                continue

            # 80 - 10 - 10 split boundaries along the row axis.
            train_end = int(samples * 0.80)
            val_end = train_end + int(samples * 0.10)
            test_end = val_end + int(samples * 0.10)

            # First half-window is the input, second half-window the target.
            inputs = data[:, :in_size, :]
            targets = data[:, in_size:2 * in_size, :]

            x_train, y_train = inputs[:train_end], targets[:train_end]
            x_val, y_val = inputs[train_end:val_end], targets[train_end:val_end]
            x_test, y_test = inputs[val_end:test_end], targets[val_end:test_end]

            early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, mode='min')
            model.fit(
                x=x_train,
                y=y_train,
                validation_data=(x_val, y_val),
                epochs=max_epochs,
                callbacks=[early_stopping],
            )
            model.save(f'saved_models/lstm{feature}{group}')

            p = model.predict(x_test)
            filename = f'lstmFeature{feature}Group{group}.png'
            plot_prediction(y_test, p, filename, feature)


# Running the above functions
The functions above are run below. Data is collected according to a list of tickers; pre-processing and feature extraction then happen in `kmeans_dataset_from_tickers`. The resulting groupings are then used to train separate models.

In [12]:
# Assemble the tickers to analyze: the S&P 500 constituents (when enabled in
# the config) followed by any extra tickers listed in the config.
tick_list = []
if config['S&P500']:
    tick_list.extend(get_sp500())
tick_list.extend(config['tickers'])
# Drop repeated tickers while keeping first-seen order, so a symbol listed in
# both sources is not loaded (and later split into train/val/test) twice.
tick_list = list(dict.fromkeys(tick_list))

In [13]:
# Build the clustering dataset. tick_list is rebound to only the tickers that
# have a local pickle, and min_f_size is the smallest dataset length minus one.
model_dataset, tick_list, min_f_size = kmeans_dataset_from_tickers(tick_list)

In [14]:
# Optional: plain KMeans clustering with per-feature scatter plots (left disabled).
# train_kmeans(model_dataset)

In [15]:
# Cluster the stocks per feature; the result maps feature -> cluster label -> tickers.
mapped_labels = train_tseries_kmeans(model_dataset, tick_list)

Feature: 0
Number of Clusters: 1
21903.199 --> 11021.597 --> 11033.927 --> 11028.259 --> 11028.884 --> 11028.610 --> 11028.651 --> 11028.634 --> 11028.634 --> 
Number of Clusters: 2
3336.319 --> 4725.624 --> 4629.020 --> 4620.442 --> 4612.143 --> 4612.170 --> 4612.145 --> 4612.154 --> 4612.152 --> 4612.153 --> 
Number of Clusters: 3
4765.255 --> 5208.536 --> 5270.862 --> 5295.600 --> 5305.323 --> 5306.104 --> 5307.222 --> 5307.471 --> 5312.192 --> 5317.290 --> 
Number of Clusters: 4
5011.331 --> 5833.447 --> 5883.100 --> 5910.200 --> 5920.343 --> 5924.827 --> 5926.372 --> 5926.574 --> 5926.245 --> 5926.010 --> 
Number of Clusters: 5
5327.518 --> 5944.560 --> 6024.060 --> 6121.217 --> 6224.313 --> 6249.492 --> 6257.077 --> 6259.315 --> 6261.110 --> 6262.351 --> 
Number of Clusters: 6
5091.872 --> 5977.742 --> 6195.050 --> 6252.714 --> 6274.109 --> 6287.438 --> 6303.080 --> 6319.777 --> 6343.153 --> 6371.732 --> 
Number of Clusters: 7
5917.427 --> 6340.573 --> 6444.843 --> 6467.239 --> 6

4739.265 --> 6217.706 --> 6457.881 --> 6521.276 --> 6563.993 --> 6582.808 --> 6593.939 --> 6604.190 --> 6614.866 --> 6623.298 --> 
Elbow: 2
3918.788 --> 4439.775 --> 4549.623 --> 4581.363 --> 4597.666 --> 4605.919 --> 4614.201 --> 4614.149 --> 4614.187 --> 4614.179 --> 
Feature: 5
Number of Clusters: 1
15890.326 --> 11129.427 --> 11033.959 --> 11051.063 --> 11039.219 --> 11045.549 --> 11041.973 --> 11041.882 --> 11041.926 --> 11041.914 --> 
Number of Clusters: 2
2944.872 --> 4339.899 --> 4479.586 --> 4595.150 --> 4774.356 --> 4804.079 --> 4823.179 --> 4859.422 --> 4948.321 --> 5043.115 --> 
Number of Clusters: 3
2497.969 --> 4054.765 --> 3829.490 --> 3853.652 --> 3806.686 --> 3843.022 --> 3883.906 --> 3912.909 --> 3938.181 --> 3966.627 --> 
Number of Clusters: 4
4201.753 --> 4164.313 --> 4008.373 --> 3934.766 --> 3905.679 --> 3874.107 --> 3860.309 --> 3859.240 --> 3858.836 --> 3858.131 --> 
Number of Clusters: 5
4001.703 --> 4052.335 --> 4012.434 --> 4028.790 --> 4014.568 --> 4035.462 

In [16]:
# Save one plot per (feature, cluster) with the normalized series of its tickers.
plot_groups(mapped_labels)

In [17]:
# Build the LSTM inputs per (feature, cluster). Note that model_dataset is
# rebound here from the stacked clustering array to a nested dict of arrays.
model_dataset = lstm_dataset_from_mapped_labels(mapped_labels)

In [22]:
# Build and compile the LSTM; its output length is derived from min_f_size.
lstm_model = create_lstm_model(model_dataset, min_f_size)

In [23]:
# Fit the model on each (feature, cluster) group with at least 10 stocks,
# saving the model and a prediction plot for every group trained.
train_lstm_model(model_dataset, lstm_model, min_f_size)

Feature: 0 Group: 0
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50




INFO:tensorflow:Assets written to: saved_models/lstm00/assets


INFO:tensorflow:Assets written to: saved_models/lstm00/assets


Feature: 0 Group: 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50




INFO:tensorflow:Assets written to: saved_models/lstm01/assets


INFO:tensorflow:Assets written to: saved_models/lstm01/assets


Feature: 1 Group: 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50




INFO:tensorflow:Assets written to: saved_models/lstm11/assets


INFO:tensorflow:Assets written to: saved_models/lstm11/assets


Feature: 1 Group: 0
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50




INFO:tensorflow:Assets written to: saved_models/lstm10/assets


INFO:tensorflow:Assets written to: saved_models/lstm10/assets


Feature: 2 Group: 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50




INFO:tensorflow:Assets written to: saved_models/lstm21/assets


INFO:tensorflow:Assets written to: saved_models/lstm21/assets


Feature: 2 Group: 0
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50




INFO:tensorflow:Assets written to: saved_models/lstm20/assets


INFO:tensorflow:Assets written to: saved_models/lstm20/assets


Feature: 3 Group: 0
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50




INFO:tensorflow:Assets written to: saved_models/lstm30/assets


INFO:tensorflow:Assets written to: saved_models/lstm30/assets


Feature: 3 Group: 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50




INFO:tensorflow:Assets written to: saved_models/lstm31/assets


INFO:tensorflow:Assets written to: saved_models/lstm31/assets


Feature: 4 Group: 0
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50




INFO:tensorflow:Assets written to: saved_models/lstm40/assets


INFO:tensorflow:Assets written to: saved_models/lstm40/assets


Feature: 4 Group: 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50




INFO:tensorflow:Assets written to: saved_models/lstm41/assets


INFO:tensorflow:Assets written to: saved_models/lstm41/assets


Feature: 5 Group: 0
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50




INFO:tensorflow:Assets written to: saved_models/lstm50/assets


INFO:tensorflow:Assets written to: saved_models/lstm50/assets


Feature: 5 Group: 2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50




INFO:tensorflow:Assets written to: saved_models/lstm52/assets


INFO:tensorflow:Assets written to: saved_models/lstm52/assets


Feature: 5 Group: 1
