In [None]:
import datetime
import math
import os
from typing import Iterable, Iterator, Union

import chart_studio.plotly as py
import cufflinks as cf
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import ta
import tensorflow as tf
import tqdm

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler

# Render matplotlib figures directly below the cell that creates them.
%matplotlib inline

# Initialise plotly's notebook integration and switch cufflinks to its offline mode
# so the interactive figures below can be shown inside the notebook.
init_notebook_mode(connected=True)
cf.go_offline()

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Cap TensorFlow's allocation on the first GPU with memory_limit=5120
  # (TensorFlow documents this value in MB, i.e. roughly 5 GB).
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)

# Directory scanned by next_free_model_name() and used for saved models / training histories.
MODELS_DIR = "Models"
# Directory the dataset CSV files are read from.
DATA_DIR = "Data"


def next_free_model_name(base_name: str) -> str:
    """
    Return the next unused model name of the form ``<base_name>_NNN``.

    MODELS_DIR is scanned for files named exactly ``<base_name>_<digits>.h5``
    (the extension is compared case-insensitively). The result is ``base_name``
    followed by the highest number found plus one, zero-padded to three digits,
    or ``<base_name>_000`` when no such file exists.

    Files that merely contain ``base_name`` somewhere in their name, or whose
    suffix after the last underscore is not purely numeric, are ignored instead
    of being counted (or crashing the integer conversion).

    Raises:
        OSError: if MODELS_DIR cannot be listed (propagated from ``os.listdir``).
    """
    prefix = base_name + "_"
    taken_numbers = []
    for file_name in os.listdir(MODELS_DIR):
        stem, extension = os.path.splitext(file_name)
        if extension.lower() != ".h5" or not stem.startswith(prefix):
            continue
        suffix = stem[len(prefix):]
        # Only "<base_name>_<digits>" counts as an earlier version of this model.
        if suffix.isdecimal():
            taken_numbers.append(int(suffix))

    next_number = max(taken_numbers) + 1 if taken_numbers else 0
    return prefix + str(next_number).zfill(3)


def infinite_iterator(data: Iterable) -> Iterator:
    """
    Yield the items of ``data`` in order, starting over each time the end is reached.

    ``data`` is iterated again on every pass, so it should be a re-iterable
    container such as a list, tuple or string.

    If a pass over ``data`` produces no items (empty container, or a one-shot
    iterator that has already been consumed) the generator stops instead of
    looping forever without ever yielding.
    """
    while True:
        produced_any = False
        for item in data:
            produced_any = True
            yield item
        if not produced_any:
            # Nothing left to cycle through; another pass would spin forever.
            return
            

def generate_input_sequences(data: Union[pd.DataFrame, np.ndarray], input_length=30, output_length=1) -> np.ndarray:
    """
    Cut ``data`` into overlapping input windows for a sequence model.

    Window ``i`` holds rows ``i .. i + input_length - 1`` with all columns.
    The number of windows is ``rows - input_length - output_length``, the same
    count generate_output_sequences() uses, so both results line up row by row.

    Args:
        data: 2-D DataFrame/array of shape (rows, features). A 1-D array is
            treated as a single feature column.
        input_length: number of consecutive rows in each window.
        output_length: number of rows reserved after each window for the targets.

    Returns:
        Float array of shape (windows, input_length, features).

    Raises:
        ValueError: if ``data`` has fewer than ``input_length + output_length`` rows.
    """
    if isinstance(data, pd.DataFrame):
        price_array = data.to_numpy()
    else:
        price_array = np.asarray(data)
    if price_array.ndim == 1:
        price_array = price_array.reshape((-1, 1))

    sequence_count = price_array.shape[0] - input_length - output_length
    if sequence_count < 0:
        raise ValueError(
            f"Need at least {input_length + output_length} rows to build a sequence, "
            f"got {price_array.shape[0]}"
        )

    # Row indices of every window at once: shape (sequence_count, input_length).
    window_rows = np.arange(sequence_count)[:, None] + np.arange(input_length)[None, :]
    return price_array[window_rows].astype(float)


def generate_output_sequences(data: Union[pd.DataFrame, np.ndarray], outputs: Union[list, int, str],
                              input_length=30, output_length=1) -> np.ndarray:
    """
    Build the target values that follow each input window.

    Target ``i`` holds rows ``i + input_length .. i + input_length + output_length - 1``
    of the selected columns. The number of targets is
    ``rows - input_length - output_length``, the same count
    generate_input_sequences() uses.

    Args:
        data: 2-D DataFrame or array of shape (rows, features).
        outputs: columns to predict. For a DataFrame: a column label or a list
            of labels; for an array: a column index or a list of indices.
            A single label/index is treated as one output column.
        input_length: length of the input windows the targets belong to.
        output_length: number of consecutive rows in each target.

    Returns:
        Float array of shape (targets, columns) when ``output_length == 1``,
        otherwise (targets, output_length, columns).

    Raises:
        ValueError: if ``data`` has fewer than ``input_length + output_length`` rows.
    """
    if isinstance(data, pd.DataFrame):
        price_array = data[outputs].to_numpy()
    else:
        price_array = np.asarray(data)[:, outputs]
    # A single label / index selects a 1-D column; keep an explicit column axis.
    if price_array.ndim == 1:
        price_array = price_array.reshape((-1, 1))

    sequence_count = price_array.shape[0] - input_length - output_length
    if sequence_count < 0:
        raise ValueError(
            f"Need at least {input_length + output_length} rows to build a sequence, "
            f"got {price_array.shape[0]}"
        )

    # Row indices of every target at once: shape (sequence_count, output_length).
    target_rows = (np.arange(sequence_count)[:, None] + input_length
                   + np.arange(output_length)[None, :])
    output_sequences = price_array[target_rows].astype(float)
    if output_length == 1:
        # Single-step targets are returned without the length-1 time axis.
        output_sequences = output_sequences[:, 0, :]
    return output_sequences


def scale_closing_data(data: np.ndarray, max_value: float) -> np.ndarray:
    """
    Divide the last column of every company's array by ``max_value``.

    Every row is scaled, including the first one. The arrays inside ``data``
    are modified in place and ``data`` itself is returned.

    Args:
        data: iterable of per-company 2-D arrays whose last column is the value to scale.
        max_value: divisor applied to the last column.

    Returns:
        The same ``data`` object that was passed in.
    """
    for company in data:
        company[:, -1] = company[:, -1] / max_value
    return data


def flatten_data(data: np.ndarray) -> np.ndarray:
    """
    Stack the arrays contained in ``data`` on top of each other along axis 0,
    reducing the dimensionality by 1 (the outermost dimension is merged away).

    All parts are joined with a single ``np.concatenate`` call instead of
    growing the result with repeated ``np.append`` calls, which would copy the
    accumulated data once per element.

    Returns:
        The concatenated array, or ``None`` when ``data`` contains no elements.
    """
    parts = [np.asarray(single_read) for single_read in data]
    if not parts:
        return None
    return np.concatenate(parts, axis=0)


def scale_3d_price_data(price_data: np.ndarray) -> tuple:
    """
    Scale every company's 2-D array with its own sklearn.preprocessing.MinMaxScaler.

    Returns a tuple ``(scaled, scalers)``: an object array holding the scaled
    per-company arrays and the list of fitted scalers, in the same order as
    ``price_data``, so values can be mapped back with ``inverse_transform``.
    """
    fitted_scalers = [MinMaxScaler().fit(company) for company in price_data]
    scaled_companies = [
        company_scaler.transform(company)
        for company_scaler, company in zip(fitted_scalers, price_data)
    ]
    return (np.array(scaled_companies, dtype=object), fitted_scalers)

# New York Stock Exchange Data
The dataset selected for this project, [New York Stock Exchange Data], contains a large amount of data split across 4 files.  
____
* **fundamentals.csv**  
Metrics extracted from annual SEC 10-K filings (2012-2016); should be enough to derive most of the popular fundamental indicators.
----
* **securities.csv**  
general description of each company with division on sectors  
----
* **prices-split-adjusted.csv**  
Same as prices.csv, but with adjustments for stock splits applied.
----
* **prices.csv**  
Raw, as-is daily prices. Most of the data spans from 2010 to the end of 2016; for companies that are newer on the stock market the date range is shorter. There were approx. 140 stock splits in that period, and this set does not account for them.
----

[New York Stock Exchange Data]: <https://www.kaggle.com/dgawlik/nyse?select=prices.csv>

In [None]:
# Load the three dataset tables used in this notebook
import pandas as pd

fundamentals = pd.read_csv(f"{DATA_DIR}/fundamentals.csv", index_col=0)
securities = pd.read_csv(f"{DATA_DIR}/securities.csv")
prices = pd.read_csv(f"{DATA_DIR}/prices-split-adjusted.csv")

# Preview the split-adjusted prices of a single company
prices.loc[prices["symbol"] == "MSFT"].head()

In [None]:
# The fundamentals table is wide; raise the display limit so no column is elided.
pd.set_option("display.max_columns", 80)
fundamentals.loc[fundamentals["Ticker Symbol"] == "MSFT"]

In [None]:
# Company description row(s) for the same ticker
securities.loc[securities["Ticker symbol"] == "MSFT"].head()

In [None]:
# Count the distinct ticker symbols present in the price table
unique_company_count = len(prices["symbol"].unique())
print(f"Number of unique companies: { unique_company_count : >4}")

# Selected stock indicators
____
* __On-Balance Volume__ *(OBV)*  
Indicator of stock momentum based on close price and stock volume
____
* __Accumulation/Distribution Index__ *(ADI)*  
Money flow indicator that considers stock volume as well as the closing price relative to the price range.
____
* __Aroon Indicator__ *(AI / AIU / AID)*  
Trend-following indicator that describes the strength of the current trend and the likelihood that it will continue.  
May be divided into up and down indexes.
____
* __Relative Strength Index__ *(RSI)*  
Measurement of the magnitude of price changes; indicates overbought or oversold conditions in the price of a stock.
____
* __Volume Weighted Average Price__ *(VWAP)*  
Average price weighted by stock volume.
____
* __Simple Moving Average 7 Days__ *(SMA7)*  
Moving average over 7 day period
____
* __Simple Moving Average 14 Days__ *(SMA14)*  
Moving average over 14 day period
____

In [None]:
# Company whose indicators are explored in the cells below
ticker = "MSFT"
stock_price = prices.loc[prices["symbol"] == ticker]

In [None]:
# On-Balance Volume of the selected company, plotted over time
OBV = ta.volume.on_balance_volume(stock_price["close"], stock_price["volume"])

fig = px.line(
    stock_price,
    x=stock_price["date"],
    y=OBV,
    labels={"y": "On-Balance Volume"},
).update_layout(template="simple_white")
fig

In [None]:
# Accumulation/Distribution Index of the selected company, plotted over time
indicator = ta.volume.AccDistIndexIndicator(
    stock_price["high"],
    stock_price["low"],
    stock_price["close"],
    stock_price["volume"],
)
ADI = indicator.acc_dist_index()

fig = px.line(
    stock_price,
    x=stock_price["date"],
    y=ADI,
    labels={"y": "Accumulation & Distribution Index"},
).update_layout(template="simple_white")
fig

In [None]:
# Aroon indicator plus its up / down components for the selected company
indicator = ta.trend.AroonIndicator(stock_price["close"])
AI  = indicator.aroon_indicator()
AIU = indicator.aroon_up()
AID = indicator.aroon_down()

# Long-format frame (date, value, name) so plotly draws one coloured line per series.
# Built with pd.concat because DataFrame.append no longer exists in pandas 2.x.
AI_df = pd.concat([
    pd.DataFrame({"date": stock_price["date"], "value": series, "name": label})
    for label, series in (("AI", AI), ("AIU", AIU), ("AID", AID))
])

fig = px.line(AI_df, x="date", y="value", color="name")
fig.update_layout(template="simple_white")
fig



In [None]:
# 14-day Relative Strength Index of the selected company, plotted over time
indicator = ta.momentum.RSIIndicator(close=stock_price["close"], window=14)
RSI = indicator.rsi()

fig = px.line(
    stock_price,
    x=stock_price["date"],
    y=RSI,
    labels={"y": "Relative Strength Index"},
).update_layout(template="simple_white")
fig

In [None]:
# Volume-weighted average price and two simple moving averages of the closing price
indicator = ta.volume.VolumeWeightedAveragePrice(stock_price["high"], stock_price["low"],
                                                 stock_price["close"], stock_price["volume"], window=14)
VWAP = indicator.volume_weighted_average_price()

indicator = ta.trend.SMAIndicator(stock_price["close"], window=7)
SMA7 = indicator.sma_indicator()

indicator = ta.trend.SMAIndicator(stock_price["close"], window=14)
SMA14 = indicator.sma_indicator()

# Long-format frame (date, value, name) so plotly draws one coloured line per series.
# Built with pd.concat because DataFrame.append no longer exists in pandas 2.x.
VMAP_df = pd.concat([
    pd.DataFrame({"date": stock_price["date"], "value": series, "name": label})
    for label, series in (("VWAP", VWAP), ("SMA7", SMA7), ("SMA14", SMA14))
])

fig = px.line(VMAP_df, x="date", y="value", color="name")
fig.update_layout(template="simple_white")
fig



In [None]:
# Feature table for the single-company model: four indicators plus the closing
# price as the last column. AID, AIU, SMA7 and SMA14 are computed above but left out.
indicators = pd.DataFrame({
    "OBV": OBV,
    "ADI": ADI,
    "AI": AI,
    "VWAP": VWAP,
    "close": stock_price["close"],
}).dropna()  # drop the rows where an indicator has no value yet

indicators.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler

# NOTE(review): the scaler is fitted on the full table, i.e. before the
# train/validation/test split made further down.
transformer = MinMaxScaler()
transformer.fit(indicators)
scaled_values = transformer.transform(indicators)
scaled_indicators = pd.DataFrame(scaled_values, columns=indicators.columns)
scaled_indicators.head()

In [None]:
sequence_length = 14
predicted_values = ["close"]

# Chronological 70% / 20% / 10% split. Slices are end-exclusive, so each range
# starts exactly where the previous one stops and no row is skipped between them.
row_count = scaled_indicators.shape[0]
train_end = round(0.7 * row_count)
val_end = round(0.9 * row_count)

train_data_length = (0, train_end)
val_data_length = (train_end, val_end)
test_data_length = (val_end, row_count)


def _sequences_for(bounds):
    """Return (inputs, targets) built from the rows of scaled_indicators in [start, stop)."""
    start, stop = bounds
    subset = scaled_indicators[start:stop]
    inputs = generate_input_sequences(data=subset, input_length=sequence_length)
    targets = generate_output_sequences(data=subset, input_length=sequence_length,
                                        outputs=predicted_values)
    return inputs, targets


train_x, train_y = _sequences_for(train_data_length)
val_x, val_y = _sequences_for(val_data_length)
test_x, test_y = _sequences_for(test_data_length)

print(f"Train_x: {train_x.shape}")
print(f"Train_y: {train_y.shape}")
print("---------------------------------")
print(f"Val_x:   {val_x.shape}")
print(f"Val_y:   {val_y.shape}")
print("---------------------------------")
print(f"Test_x:  {test_x.shape}")
print(f"Test_y:  {test_y.shape}")

In [None]:
# Training hyperparameters for the single-company model
epochs, batch_size = 40, 64

In [None]:
import keras
from keras import models
from keras import layers
from keras import metrics

model_name = "First Model"

# Two stacked LSTM layers followed by a dense head that ends in a single tanh unit.
model = models.Sequential(
    [
        layers.LSTM(units=50, activation="relu", name="LSTM_1", dropout=0.05,
                    input_shape=(train_x.shape[1], train_x.shape[2]),
                    return_sequences=True),
        layers.LSTM(units=100, activation="relu", name="LSTM_2", dropout=0.1),
        layers.Dense(units=200, activation="relu", name="Dense_1"),
        layers.Dropout(0.15, name="Dropout_1"),
        layers.Dense(units=400, activation="relu", name="Dense_2"),
        layers.Dropout(0.25, name="Dropout_2"),
        layers.Dense(units=50, activation="relu", name="Dense_3"),
        layers.Dense(1, activation="tanh", name="classifier"),
    ],
    name=model_name,
)

model.compile(optimizer="Adam",
              loss="mae",
              metrics=["mean_absolute_percentage_error"])
model.summary()

history = model.fit(
    train_x,
    train_y,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(val_x, val_y),
)

# Store the trained model and its training history side by side under the next free name.
model_save_name = next_free_model_name(model_name)
model_save_path = f"{MODELS_DIR}/{model_save_name}.h5"
model.save(model_save_path)

history_save_path = f"{MODELS_DIR}/{model_save_name}.csv"
history_dataframe = pd.DataFrame(history.history)
history_dataframe.to_csv(history_save_path)

In [None]:
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler
import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (16,6)

predictions = model.predict(test_x)

# Fit a scaler on the closing price column alone so the single-column
# predictions and targets can be mapped back to price units.
scaler = MinMaxScaler().fit(indicators[["close"]].to_numpy())
unscaled_predictions = scaler.inverse_transform(predictions)
unscaled_test_data = scaler.inverse_transform(test_y)

sample_axis = np.arange(len(predictions))
plt.plot(sample_axis, unscaled_predictions, label="Guess")
plt.plot(sample_axis, unscaled_test_data, label="Test")
plt.legend()
plt.show()

## Expanding dataset using multiple companies
Data of all companies is stored in a 3D structure (company × time step × feature), where the first dimension selects the company.  
Each company is scaled independently with `MinMaxScaler`, i.e. to the range [0, 1].

In [None]:
import ta
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler

companies_to_process = 100


def compute_company_indicators(company_prices: pd.DataFrame) -> np.ndarray:
    """
    Build the feature matrix of one company.

    Columns are OBV, ADI, AI, VWAP and the closing price (last column). Rows in
    which any value is NaN are dropped.

    Keeping the computation inside a function stops the per-company
    intermediates from overwriting the notebook-level ``ticker``,
    ``stock_price``, ``OBV``, ... variables defined for the single-company
    analysis above.
    """
    high, low = company_prices["high"], company_prices["low"]
    close, volume = company_prices["close"], company_prices["volume"]

    features = pd.DataFrame({
        "OBV": ta.volume.on_balance_volume(close, volume),
        "ADI": ta.volume.AccDistIndexIndicator(high, low, close, volume).acc_dist_index(),
        "AI": ta.trend.AroonIndicator(close).aroon_indicator(),
        "VWAP": ta.volume.VolumeWeightedAveragePrice(
            high, low, close, volume, window=14).volume_weighted_average_price(),
        "close": close,
    })
    return features.dropna().to_numpy()


keys = prices.symbol.unique()
company_data = [
    compute_company_indicators(prices[prices["symbol"] == company_ticker])
    for company_ticker in keys[:companies_to_process]
]

# One (time steps x features) array per company; companies may differ in length.
price_data = np.array(company_data, dtype=object)
scaled_price_data, scalers = scale_3d_price_data(price_data)

In [None]:
sequence_length = 14
predicted_values = -1  # index of the closing price column (last column)
data_split = (0.7, 0.2, 0.1)

# Split by company. The boundaries are rounded from the cumulative fractions so
# that rounding each part separately cannot drop a company or overrun the data.
company_count = scaled_price_data.shape[0]
train_end = min(company_count, round(data_split[0] * company_count))
val_end = min(company_count, round((data_split[0] + data_split[1]) * company_count))
test_end = min(company_count, round(sum(data_split) * company_count))

train_data_length = (0, train_end)
val_data_length = (train_end, val_end)
test_data_length = (val_end, test_end)

input_data, output_data = [], []
for company in scaled_price_data:
    input_data.append(generate_input_sequences(data=company, input_length=sequence_length))
    output_data.append(generate_output_sequences(data=company, input_length=sequence_length,
                                                 outputs=predicted_values))

train_x_segmented = np.array(input_data[train_data_length[0]:train_data_length[1]], dtype=object)
train_y_segmented = np.array(output_data[train_data_length[0]:train_data_length[1]], dtype=object)
train_scalers = scalers[train_data_length[0]:train_data_length[1]]

val_x_segmented = np.array(input_data[val_data_length[0]:val_data_length[1]], dtype=object)
val_y_segmented = np.array(output_data[val_data_length[0]:val_data_length[1]], dtype=object)
val_scalers = scalers[val_data_length[0]:val_data_length[1]]

# The test set stays segmented per company; it is evaluated one company at a time.
test_x_segmented = np.array(input_data[test_data_length[0]:test_data_length[1]], dtype=object)
test_y_segmented = np.array(output_data[test_data_length[0]:test_data_length[1]], dtype=object)
test_scalers = scalers[test_data_length[0]:test_data_length[1]]

# Training and validation sequences of all companies are merged into one array each.
train_x = flatten_data(train_x_segmented)
train_y = flatten_data(train_y_segmented)

val_x = flatten_data(val_x_segmented)
val_y = flatten_data(val_y_segmented)

print(f"Segmented train_x: {train_x_segmented.shape}")
print(f"Segmented train_y: {train_y_segmented.shape}")
print("---------------------------------")
print(f"Segmented val_x:   {val_x_segmented.shape}")
print(f"Segmented val_y:   {val_y_segmented.shape}")
print("---------------------------------")
print(f"Segmented test_x:  {test_x_segmented.shape}")
print(f"Segmented test_y:  {test_y_segmented.shape}")
print("---------------------------------")
print(f"Train_x:  {train_x.shape}")
print(f"Train_y:  {train_y.shape}")
print("---------------------------------")
print(f"Val_x:   {val_x.shape}")
print(f"Val_y:   {val_y.shape}")



In [None]:
# Training hyperparameters for the multi-company model
epochs, batch_size = 10, 512

In [None]:
import keras
from keras import models
from keras import layers
from keras import metrics

model_name = "Second Model"

# Same architecture as the single-company model, trained on the multi-company data.
network_layers = [
    layers.LSTM(units=50, activation="relu", name="LSTM_1", dropout=0.05,
                input_shape=(train_x.shape[1], train_x.shape[2]),
                return_sequences=True),
    layers.LSTM(units=100, activation="relu", name="LSTM_2", dropout=0.1),
    layers.Dense(units=200, activation="relu", name="Dense_1"),
    layers.Dropout(0.15, name="Dropout_1"),
    layers.Dense(units=400, activation="relu", name="Dense_2"),
    layers.Dropout(0.25, name="Dropout_2"),
    layers.Dense(units=50, activation="relu", name="Dense_3"),
    layers.Dense(1, activation="tanh", name="classifier"),
]
model = models.Sequential(network_layers, name=model_name)

model.compile(optimizer="Adam",
              loss="mae",
              metrics=["mean_absolute_percentage_error"])
model.summary()

history = model.fit(
    train_x,
    train_y,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(val_x, val_y),
)

# Store the trained model and its training history side by side under the next free name.
model_save_name = next_free_model_name(model_name)
model_save_path = f"{MODELS_DIR}/{model_save_name}.h5"
model.save(model_save_path)

history_save_path = f"{MODELS_DIR}/{model_save_name}.csv"
history_dataframe = pd.DataFrame(history.history)
history_dataframe.to_csv(history_save_path)

In [None]:
import keras

# Reload the model that the training cell just saved. A hard-coded file name
# such as "Second Model_001.h5" only exists after a specific number of earlier
# runs and would otherwise load a model from a previous run.
# To evaluate an older checkpoint instead, point this at its file inside MODELS_DIR.
model = keras.models.load_model(model_save_path)

# NOTE(review): at this point test_x still holds the single-company test windows
# built for the first model; the multi-company test data is in test_x_segmented.
print(test_x.shape)

In [None]:
# Shapes of the targets and predictions currently held in the notebook namespace
print(test_y.shape, predictions.shape, sep="\n")

In [None]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Evaluate the model on one held-out company, using that company's own scaler.
selected_company = 6
test_x = test_x_segmented[selected_company]
test_y = test_y_segmented[selected_company]
scaler = test_scalers[selected_company]

# The scaler was fitted on all feature columns, so the single-column predictions
# and targets are repeated across every column before inverse_transform; the
# closing price is the last column. The column count is taken from the input
# windows instead of being hard-coded.
feature_count = test_x.shape[-1]

predictions = model.predict(test_x)
inversed_predictions = scaler.inverse_transform(np.repeat(predictions, feature_count, axis=1))[:,-1]
inversed_test = scaler.inverse_transform(np.repeat(test_y, feature_count, axis=1))[:,-1]

fig = px.line(x=np.arange(len(inversed_predictions)), y=inversed_predictions)
fig.add_trace(go.Scatter(x=np.arange(len(inversed_predictions)), y=inversed_test, mode="lines",
                         showlegend=False, name="test_data"))
fig

In [None]:
# Shape of the x axis used in the plot above: one point per prediction
(len(inversed_predictions),)