In [1]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.backend import clear_session
from keras.layers import Input, Reshape, LSTM, Dense, Dropout
from keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Make the project's helper modules in ../utils importable.
sys.path.insert(0, '../utils')
import KMeansDateRange
import get_open_close
import timeseries

# k is the number of different industries/sectors in the stock market;
# it is passed to the k-means step as the number of clusters.
k = 11

In [2]:
# Slow cell: this call takes a few minutes to finish.
# Populates `df` for the date range 2012-05-18 .. 2020-04-01.
df = get_open_close.populate_values_efficient(
    "2012-05-18",
    "2020-04-01",
)

In [3]:
# Slow cell: this call takes a few minutes to finish.
# Runs the automated k-means helper over 2012-05-18 .. 2020-04-01 with k clusters.
clusters = KMeansDateRange.kmeans_automated(
    "2012-05-18",
    "2020-04-01",
    k=k,
)

In [4]:
%%time
mean_absolute_loss = 0
mean_absolute_training_loss = 0

"""Randomly sampling 20 stocks from each cluster"""
for cluster in range(0, k):
    
    cluster_mae, cluster_training_mae = 0, 0
    sampled_stocks = clusters.loc[clusters['clusters'] == cluster].sample(n=20, random_state=1337)
    
    encoder_inputs = Input(shape=(14, 2))
    encoder_layer1 = LSTM(units=64, activation='sigmoid', return_sequences=True, return_state=True)
    encoder_output1, _, _ = encoder_layer1(encoder_inputs)
    encoder_layer2 = LSTM(units=64, activation='sigmoid', return_sequences=True, return_state=True)
    encoder_outputs, state_h, state_c = encoder_layer2(encoder_output1)
    encoder_states = [state_h, state_c]

    
    decoder_inputs = Input(shape=(1, 1))
    decoder_layer1 = LSTM(units=64, activation='sigmoid', return_sequences=True, return_state=True)
    decoder_output1, _, _ = decoder_layer1(decoder_inputs, initial_state=encoder_states)
    decoder_layer2 = LSTM(units=64, activation='sigmoid', return_sequences=False, return_state=True)
    decoder_outputs, _, _ = decoder_layer2(decoder_output1)

    decoder_dense = Dense(units=1, activation="linear")
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer="adam", loss="mae")
    
    stock_x_test, stock_train = {}, {}
    
    """Fit the data of each stock within a cluster on the RNN"""
    for index, stock in enumerate(sampled_stocks['symbols']):
        stock_data = df[stock]
        
        x_train, x_test = train_test_split(stock_data, test_size=0.3, shuffle=False)
        
        scaler = StandardScaler()
        x_train= scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)
        stock_x_test[stock] = x_test

        x_train, y_train = timeseries.get_timeseries_X_y(x_train)
        stock_train[stock] = [x_train, y_train]
        
        model.fit([x_train, y_train], y_train, validation_split=0.3, verbose=0,
                            initial_epoch=index*20, epochs=(index + 1)*20)
        
    for stock in stock_x_test:
        x_test, y_test = timeseries.get_timeseries_X_y(stock_x_test[stock])
        loss = model.evaluate([x_test, y_test], y_test)
        cluster_mae = cluster_mae + loss
        
        training_data = stock_train[stock]
        x_train, y_train = training_data[0], training_data[1]
        training_loss = model.evaluate([x_train, y_train], y_train)
        cluster_training_mae = cluster_training_mae + training_loss

    cluster_mae = cluster_mae/(sampled_stocks.shape[0])
    cluster_training_mae = cluster_training_mae/(sampled_stocks.shape[0])
    mean_absolute_loss = mean_absolute_loss + cluster_mae
    mean_absolute_training_loss = mean_absolute_training_loss + cluster_training_mae

print("Training Loss of Sequence-to-Sequence LSTM: ", mean_absolute_training_loss/k)
print("Loss of Sequence-to-Sequence LSTM: ", mean_absolute_loss/k)

2022-12-13 15:37:15.081881: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.








Training Loss of Sequence-to-Sequence LSTM:  0.012425803051287819
Loss of Sequence-to-Sequence LSTM:  0.22182658400428903
CPU times: user 1h 18min 35s, sys: 15min 35s, total: 1h 34min 10s
Wall time: 35min 40s
