In [271]:
import ta
import os
import math
import tqdm
import keras
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import cufflinks as cf
import tensorflow as tf
import plotly.express as px
import matplotlib.pyplot as plt
import chart_studio.plotly as py
import plotly.graph_objects as go

from keras import models
from keras import layers
from keras import metrics
from sklearn.preprocessing import MaxAbsScaler
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

%matplotlib inline

init_notebook_mode(connected=True)
cf.go_offline()

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only allocate 1GB of memory on the first GPU
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)

MODELS_DIR = "Models"
DATA_DIR = "Data"


def next_free_model_name(base_name: str) -> str:
    model_files = [name for name in os.listdir(MODELS_DIR) if name.split(".")[-1].lower() == "h5"]
    similar_models_numbers = [int(name.split("_")[-1].split(".")[0]) for name in model_files if name.find(base_name) >= 0]
    
    if len(similar_models_numbers) > 0:
        max_value = str(max(similar_models_numbers)+1).zfill(3)
        new_name = base_name + "_" + max_value
    else:
        new_name = base_name + "_000"
    return new_name


def infinite_iterator(data: list or tuple or str) -> list or tuple or char or str:
    while True:
        for item in data:
            yield item
            

def generate_input_sequences(data: pd.DataFrame or np.ndarray, input_length=30, output_length=1) -> np.ndarray:
    if type(data) == pd.DataFrame:
        price_array = data.to_numpy()
    else:
        price_array= data
    input_sequences = np.zeros((price_array.shape[0] - input_length - output_length,
                                input_length, price_array.shape[1]), dtype=float)
    for index in range(input_sequences.shape[0]):
        input_sequences[index] = price_array[index:index+input_length, :]
    return input_sequences


def generate_output_sequences(data: pd.DataFrame or np.ndarray, outputs: list,
                              input_length=30, output_length=1) -> np.ndarray:
    if type(data) == pd.DataFrame:
        selected_dataframe = data[outputs]
        price_array = selected_dataframe.to_numpy()
    else:
        price_array = data[:,outputs]
        if price_array.ndim == 1:
            price_array = price_array.reshape((-1,1))
        
    if output_length == 1:
        output_sequences = np.zeros((price_array.shape[0] - input_length - output_length,
                                     price_array.shape[1]), dtype=float)
        for index in range(output_sequences.shape[0]):
            output_sequences[index] = price_array[index+input_length : index+input_length+output_length, :] 
    else:
        output_sequences = np.zeros((price_array.shape[0] - input_length - output_length,
                                     output_length, price_array.shape[1]), dtype=float)
        for index in range(output_sequences.shape[0]):
            output_sequences[index] = price_array[index+input_length : index+input_length+output_length, :]
    return output_sequences


def scale_closing_data(data: np.ndarray, max_value: float) -> np.ndarray:
    for company in data:
#         company[1:,-1] = (company[1:,-1] - company[:-1,-1]) / max_value
        company[1:,-1] = company[1:,-1] / max_value
    return data


def flatten_data(data: np.ndarray) -> np.ndarray:
    """
    Stacks mutlidimensional arrays on top of each other reducing their dimensionality by 1 (starting from the higgest dimension)
    """
    new_data = None
    for single_read in data:
        if new_data is None:
            new_data = single_read
        else:
            new_data = np.append(new_data, single_read, axis=0)   
    return new_data


def scale_3d_price_data(price_data: np.ndarray) -> tuple:
    """
    Scale 3d tensor 3rd dimension using sklearn.preprocessing.MinMaxScaler
    """
    scalers = []
    scaled_price_data = []
    for company in price_data:
        scaler = MinMaxScaler()
        scaler.fit(company)
        scalers.append(scaler)
        scaled_price_data.append(scaler.transform(company))
    
    return (np.array(scaled_price_data, dtype=object), scalers)


def plot_result_verification(results: pd.DataFrame) -> None:
    fig = px.line(results, x="x", y=["guess", "verification"], animation_frame="symbol",
                  color_discrete_sequence=px.colors.qualitative.G10,
                  title="Results Verificaition",
                  labels={"guess": "predicted closing price", 
                          "verification": "verification closing price",
                          "x": "test sample number"},
                  template="plotly_white"
                )

    for frame_index, frame in enumerate(fig.frames):
        company_data = results[results["symbol"] == test_tickers[frame_index]]
        lower_y_boundary = min(company_data["guess"].min(), company_data["verification"].min())
        upper_y_boundary = max(company_data["guess"].max(), company_data["verification"].max())
        frame["layout"] = {"yaxis": {"range": [lower_y_boundary, upper_y_boundary], "autorange": False}}


    fig.update_layout(
        title = dict(
            y = 0.95,
            x = 0.5,
            xanchor = "center",
            yanchor = "top",
        ),
        updatemenus=[dict(buttons=[dict(visible=False)])],
        sliders=[dict(x=0, len=1)],
        showlegend=True
    )
    fig.update_yaxes(tickprefix="$", title="closing price")
    fig.show()
    
    
def generate_verification_dataframe(model: keras.Model, test_x_segmented: np.ndarray,
                                    test_y_segmented: np.ndarray, test_tickers: list) -> pd.DataFrame:
    results = pd.DataFrame()
    for selected_company in range(len(test_tickers)):
        ticker = test_tickers[selected_company]
        test_x = test_x_segmented[selected_company]
        test_y = test_y_segmented[selected_company]
        scaler = test_scalers[selected_company]

        single_result = pd.DataFrame()
        single_result["guess"] = scaler.inverse_transform(np.repeat(model.predict(test_x), 6, axis=1))[:,-1]
        single_result["verification"] = scaler.inverse_transform(np.repeat(test_y, 6, axis=1))[:,-1]
        single_result["x"] = np.arange(single_result.shape[0])
        single_result["symbol"] = np.repeat(ticker, single_result.shape[0])

        results = results.append(single_result, ignore_index=True)
    return results


def plot_losses(history: pd.DataFrame) -> None:
    loss = ["loss", "val_loss"]
    fig = px.line(history, y=loss, title="Losses During Training",
                  template="plotly_white")
    fig.update_layout(
        title = dict(
            y = 0.9,
            x = 0.5,
            xanchor = "center",
            yanchor = "top",
        ),
    )
    fig.update_yaxes(title="losses [-]")
    fig.show()

1 Physical GPUs, 1 Logical GPUs


# New York Stock Exchange Data
Dataset seleceted for project [New York Stock Exchange Data] contains large amount of data separated into 4 files.  
____
* **fundamentals.csv**  
metrics extracted from annual SEC 10K fillings (2012-2016), should be enough to derive most of popular fundamental indicators.
----
* **securities.csv**  
general description of each company with division on sectors  
----
* **prices-split-adjusted.csv**  
same as prices, but there have been added adjustments for splits.
----
* **prices.csv**  
raw, as-is daily prices. Most of data spans from 2010 to the end 2016, for companies new on stock market date range is shorter. There have been approx. 140 stock splits in that time, this set doesn't account for that.
----

[New York Stock Exchange Data]: <https://www.kaggle.com/dgawlik/nyse?select=prices.csv>

In [137]:
fundamentals = pd.read_csv(DATA_DIR + "/" + "fundamentals.csv", index_col=0)
securities   = pd.read_csv(DATA_DIR + "/" + "securities.csv")
prices       = pd.read_csv(DATA_DIR + "/" + "prices-split-adjusted.csv")
prices[prices["symbol"] == "MSFT"].head()

Unnamed: 0,date,symbol,open,close,low,high,volume
544,2010-01-04,MSFT,30.620001,30.950001,30.59,31.1,38409100.0
1012,2010-01-05,MSFT,30.85,30.959999,30.639999,31.1,49749600.0
1480,2010-01-06,MSFT,30.879999,30.77,30.52,31.08,58182400.0
1948,2010-01-07,MSFT,30.629999,30.450001,30.190001,30.700001,50559700.0
2416,2010-01-08,MSFT,30.280001,30.66,30.24,30.879999,51197400.0


In [138]:
pd.set_option("display.max_columns", 80)
fundamentals[fundamentals["Ticker Symbol"] == "MSFT"]

Unnamed: 0,Ticker Symbol,Period Ending,Accounts Payable,Accounts Receivable,Add'l income/expense items,After Tax ROE,Capital Expenditures,Capital Surplus,Cash Ratio,Cash and Cash Equivalents,Changes in Inventories,Common Stocks,Cost of Revenue,Current Ratio,Deferred Asset Charges,Deferred Liability Charges,Depreciation,Earnings Before Interest and Tax,Earnings Before Tax,Effect of Exchange Rate,Equity Earnings/Loss Unconsolidated Subsidiary,Fixed Assets,Goodwill,Gross Margin,Gross Profit,Income Tax,Intangible Assets,Interest Expense,Inventory,Investments,Liabilities,Long-Term Debt,Long-Term Investments,Minority Interest,Misc. Stocks,Net Borrowings,Net Cash Flow,Net Cash Flow-Operating,Net Cash Flows-Financing,Net Cash Flows-Investing,Net Income,Net Income Adjustments,Net Income Applicable to Common Shareholders,Net Income-Cont. Operations,Net Receivables,Non-Recurring Items,Operating Income,Operating Margin,Other Assets,Other Current Assets,Other Current Liabilities,Other Equity,Other Financing Activities,Other Investing Activities,Other Liabilities,Other Operating Activities,Other Operating Items,Pre-Tax Margin,Pre-Tax ROE,Profit Margin,Quick Ratio,Research and Development,Retained Earnings,Sale and Purchase of Stock,"Sales, General and Admin.",Short-Term Debt / Current Portion of Long-Term Debt,Short-Term Investments,Total Assets,Total Current Assets,Total Current Liabilities,Total Equity,Total Liabilities,Total Liabilities & Equity,Total Revenue,Treasury Stock,For Year,Earnings Per Share,Estimated Shares Outstanding
1107,MSFT,2013-06-30,10182000000.0,-1807000000.0,288000000.0,28.0,-4257000000.0,0.0,206.0,3804000000.0,-802000000.0,67306000000.0,20385000000.0,271.0,0.0,3469000000.0,3755000000.0,27052000000.0,27052000000.0,-8000000.0,0.0,9991000000.0,14655000000.0,74.0,57464000000.0,5189000000.0,3083000000.0,0.0,1938000000.0,-17802000000.0,1841000000.0,12601000000.0,10844000000.0,0.0,0.0,3537000000.0,-3134000000.0,28833000000.0,-8148000000.0,-23811000000.0,21863000000.0,4590000000.0,21863000000.0,21863000000.0,19118000000.0,0.0,26764000000.0,34.0,2392000000.0,3388000000.0,24236000000.0,1743000000.0,-10000000.0,-1752000000.0,10000000000.0,-607000000.0,0.0,35.0,34.0,28.0,266.0,10411000000.0,9895000000.0,-4429000000.0,20289000000.0,2999000000.0,73218000000.0,142431000000.0,101466000000.0,37417000000.0,78944000000.0,63487000000.0,142431000000.0,77849000000.0,0.0,2013.0,2.61,8376628000.0
1108,MSFT,2014-06-30,13569000000.0,-1120000000.0,61000000.0,25.0,-5485000000.0,0.0,188.0,8669000000.0,-161000000.0,68366000000.0,27078000000.0,250.0,0.0,4736000000.0,5212000000.0,27820000000.0,27820000000.0,-139000000.0,0.0,13011000000.0,20127000000.0,69.0,59755000000.0,5746000000.0,6981000000.0,0.0,2660000000.0,-7324000000.0,2562000000.0,20645000000.0,14597000000.0,0.0,0.0,6962000000.0,4865000000.0,32502000000.0,-8665000000.0,-18833000000.0,22074000000.0,4592000000.0,22074000000.0,22074000000.0,21485000000.0,127000000.0,27759000000.0,32.0,3422000000.0,4392000000.0,30056000000.0,3708000000.0,-39000000.0,-6024000000.0,11594000000.0,-657000000.0,0.0,32.0,31.0,25.0,245.0,11381000000.0,17710000000.0,-6709000000.0,20488000000.0,2000000000.0,77040000000.0,172384000000.0,114246000000.0,45625000000.0,89784000000.0,82600000000.0,172384000000.0,86833000000.0,0.0,2014.0,2.66,8298496000.0
1109,MSFT,2015-06-30,12385000000.0,1456000000.0,346000000.0,15.0,-5944000000.0,0.0,194.0,5595000000.0,-272000000.0,68465000000.0,33038000000.0,247.0,0.0,3390000000.0,5957000000.0,18507000000.0,18507000000.0,-73000000.0,0.0,14731000000.0,16939000000.0,65.0,60542000000.0,6314000000.0,4835000000.0,0.0,2902000000.0,-12868000000.0,-79000000.0,27808000000.0,12053000000.0,0.0,0.0,13661000000.0,-3074000000.0,29668000000.0,-9668000000.0,-23001000000.0,12193000000.0,10005000000.0,12193000000.0,12193000000.0,17908000000.0,10011000000.0,18161000000.0,19.0,3117000000.0,5461000000.0,29778000000.0,2522000000.0,362000000.0,-4189000000.0,13544000000.0,408000000.0,0.0,20.0,23.0,13.0,241.0,12046000000.0,9096000000.0,-13809000000.0,20324000000.0,7484000000.0,90931000000.0,174472000000.0,122797000000.0,49647000000.0,80083000000.0,94389000000.0,174472000000.0,93580000000.0,0.0,2015.0,1.49,8183221000.0
1110,MSFT,2016-06-30,13036000000.0,-530000000.0,-431000000.0,23.0,-8343000000.0,0.0,191.0,6510000000.0,600000000.0,68178000000.0,32780000000.0,235.0,0.0,7917000000.0,6622000000.0,19751000000.0,19751000000.0,-67000000.0,0.0,18356000000.0,17872000000.0,62.0,52540000000.0,2953000000.0,3733000000.0,0.0,2251000000.0,-14417000000.0,-938000000.0,40783000000.0,10431000000.0,0.0,0.0,18283000000.0,915000000.0,33325000000.0,-8393000000.0,-23950000000.0,16798000000.0,11981000000.0,16798000000.0,16798000000.0,18277000000.0,1110000000.0,20182000000.0,24.0,3642000000.0,5892000000.0,33417000000.0,1537000000.0,-369000000.0,-1190000000.0,13640000000.0,-1208000000.0,0.0,23.0,27.0,20.0,231.0,11988000000.0,2282000000.0,-15301000000.0,19260000000.0,12904000000.0,106730000000.0,193694000000.0,139660000000.0,59357000000.0,71997000000.0,121697000000.0,193694000000.0,85320000000.0,0.0,2016.0,2.12,7923585000.0


In [139]:
securities[securities["Ticker symbol"] == "MSFT"].head()

Unnamed: 0,Ticker symbol,Security,SEC filings,GICS Sector,GICS Sub Industry,Address of Headquarters,Date first added,CIK
306,MSFT,Microsoft Corp.,reports,Information Technology,Systems Software,"Redmond, Washington",1994-06-01,789019


In [140]:
print(f"Number of unique companies in the dataset: { len(prices['symbol'].unique()) : >4}")

Number of unique companies in the dataset:  501


# Selected stock indicators
____
* __On-Balance Volume__ *(OBV)*  
Indicator of stock momentum based on close price and stock volume
____
* __Accumulation/Distribution Index__ *(ADI)*  
money flow indicator considers stock volume as well as closing price in regard to price range.
____
* __Aroon Indicator__ *(AI / AIU / AID)*  
Trend following indicator, describes strength of the current trend and likely hood that it will continue.  
May be divided into up and down indexes.
____
* __Relative Strength Index__ *(RSI)*  
Measurement of the of price change magnitude, indicates overbought or oversold conditions in the price of a stock.
____
* __Volume Weighted Average Price__ *(VWAP)*
Average price calculated in regard to stock volume.
____
* __Simple Moving Average 7 Days__ *(SMA7)*  
Moving average over 7 day period
____
* __Simple Moving Average 14 Days__ *(SMA14)*  
Moving average over 14 day period
____

In [245]:
stock_price = prices[prices["symbol"] == "MSFT"]
for symbol in prices['symbol'].unique()[:7]:
    stock_price = stock_price.append(prices[prices["symbol"] == symbol], ignore_index=True)

In [None]:
fig = px.area(stock_price, x="date", y="close", color="symbol",
              animation_group="close", animation_frame="symbol", 
              hover_data={"date": "|%d.%m.%Y"}, 
              color_discrete_sequence=px.colors.qualitative.G10,
              title = "Closing prices",
              template="plotly_white",
              range_y = [stock_price.close.min(), stock_price.close.max()],
              labels={"close": "close price"}
            )

fig.update_layout(
    title = dict(
        y = 0.95,
        x = 0.5,
        xanchor = "center",
        yanchor = "top",
    ),
    updatemenus=[dict(buttons=[dict(visible=False)])],
    sliders=[dict(x=0, len=1)],
    showlegend=False
)
fig.update_yaxes(tickprefix="$")
fig.show()

In [None]:
stock_price["OBV"] = ta.volume.on_balance_volume(stock_price["close"],
                                                   stock_price["volume"])
fig = px.area(stock_price, x="date", y="OBV", color="symbol",
              facet_col="symbol", facet_col_wrap=2, height=600, 
              hover_data={"date": "|%d.%m.%Y"}, template="plotly_white",
              color_discrete_sequence=px.colors.qualitative.G10,
              title = "On Balance Voulme",
              labels={"OBV": "OBV [-]"}
            )

fig.update_layout(
    title = dict(
        y = 0.95,
        x = 0.5,
        xanchor = "center",
        yanchor = "top",
    )
)
fig.show()

In [None]:
indicator = ta.volume.AccDistIndexIndicator(stock_price["high"], stock_price["low"],
                                            stock_price["close"], stock_price["volume"])
stock_price["ADI"] = indicator.acc_dist_index()
fig = px.area(stock_price, x="date", y="ADI", color="symbol",
              facet_col="symbol", facet_col_wrap=2, height=600, 
              hover_data={"date": "|%d.%m.%Y"}, template="plotly_white",
              color_discrete_sequence=px.colors.qualitative.G10,
              title="Accumulation & Distribution Index",
              labels={"ADI": "ADI [-]"}
            )
          
fig.update_layout(
    title = dict(
        y = 0.95,
        x = 0.5,
        xanchor = "center",
        yanchor = "top",
    )
)
fig.show()

In [None]:
indicator = ta.trend.AroonIndicator(stock_price["close"])
stock_price["AI"] = indicator.aroon_indicator()
stock_price["AIU"] = indicator.aroon_up()
stock_price["AID"] = indicator.aroon_down()

fig = px.area(stock_price, x="date", y="AI", color="symbol",
              animation_group="AI", animation_frame="symbol", 
              hover_data={"date": "|%d.%m.%Y"}, template="simple_white",
              color_discrete_sequence=px.colors.qualitative.G10,
              title = "Accumulation & Distribution Index",
              labels = {"AI": "Accumulation & Distribution Index"}
            )

fig.update_layout(
    title = dict(
        y = 0.95,
        x = 0.5,
        xanchor = "center",
        yanchor = "top",
    ),
    updatemenus=[dict(buttons=[dict(visible=False)])],
    sliders=[dict(x=0, len=1)],
    showlegend=False
)
fig.update_yaxes(ticksuffix="%")
fig.show()

In [None]:
indicator = ta.momentum.RSIIndicator(close=stock_price["close"], window=14)
stock_price["RSI"] = indicator.rsi()

fig = px.area(stock_price, x="date", y="RSI", color="symbol",
              animation_group="RSI", animation_frame="symbol", 
              hover_data={"date": "|%d.%m.%Y"}, template="simple_white",
              color_discrete_sequence=px.colors.qualitative.G10,
              title="Relative Strength Index",
              labels={"RSI": "Relative Strength Index"},
              range_y = [0, 100]
            )

fig.update_layout(
    title = dict(
        y = 0.95,
        x = 0.5,
        xanchor = "center",
        yanchor = "top",
    ),
    updatemenus=[dict(buttons=[dict(visible=False)])],
    sliders=[dict(x=0, len=1)],
    showlegend=False
)
fig.update_yaxes(ticksuffix="%")
fig.show()

In [None]:
indicator = ta.volume.VolumeWeightedAveragePrice(stock_price["high"], stock_price["low"], 
                                     stock_price["close"],stock_price["volume"], window=14)

stock_price["VWAP"] = indicator.volume_weighted_average_price()

indicator = ta.trend.SMAIndicator(stock_price["close"], window=7)
stock_price["SMA7"] = indicator.sma_indicator()

indicator = ta.trend.SMAIndicator(stock_price["close"], window=14)
stock_price["SMA14"] = indicator.sma_indicator()

fig = px.area(stock_price, x="date", y="VWAP", color="symbol",
              animation_group="VWAP", animation_frame="symbol", 
              hover_data={"date": "|%d.%m.%Y"}, template="simple_white",
              color_discrete_sequence=px.colors.qualitative.G10,
              title="Volume Weighted Average Price",
              labels={"VWAP": "Volume Weighted Average Price"},
              range_y = [stock_price.VWAP.min(), stock_price.VWAP.max()]
            )
fig.update_layout(
    title = dict(
        y = 0.95,
        x = 0.5,
        xanchor = "center",
        yanchor = "top",
    ),
    updatemenus=[dict(buttons=[dict(visible=False)])],
    sliders=[dict(x=0, len=1)],
    showlegend=False
)
fig.update_yaxes(ticksuffix="%")
fig.show()

In [252]:
single_company = stock_price["symbol"] == "MSFT"
indicators = pd.DataFrame()

indicators["OBV"] = stock_price["OBV"][single_company]
indicators["ADI"] = stock_price["ADI"][single_company]
indicators["AI"] = stock_price["AI"][single_company]
# indicators["AID"] = stock_price["AID"][single_company]
# indicators["AIU"] = stock_price["AIU"][single_company]
indicators["VWAP"] = stock_price["VWAP"][single_company]
indicators["RSI"] = stock_price["RSI"][single_company]
# indicators["SMA7"] = stock_price["SMA7"][single_company]
# indicators["SMA14"] = stock_price["SMA14"][single_company]
indicators["close"] = stock_price["close"][single_company]

indicators = indicators.dropna()
indicators.head()

Unnamed: 0,OBV,ADI,AI,VWAP,RSI,close
24,-217647100.0,-184558900.0,-56.0,28.918897,28.882689,27.719999
25,-158451300.0,-191582200.0,-56.0,28.792828,33.642631,28.01
26,-207042600.0,-203730000.0,-56.0,28.665757,33.476222,27.99
27,-141048900.0,-190531100.0,-56.0,28.567642,35.70252,28.120001
28,-222166100.0,-153352200.0,-56.0,28.47047,33.916011,27.93


In [253]:
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
transformer = MinMaxScaler().fit(indicators)
scaled_indicators = pd.DataFrame(transformer.transform(indicators), columns=indicators.columns)
scaled_indicators.head()

Unnamed: 0,OBV,ADI,AI,VWAP,RSI,close
0,0.821259,0.147287,0.208333,0.127246,0.163888,0.115981
1,0.839087,0.145149,0.208333,0.124011,0.235852,0.123122
2,0.824453,0.141451,0.208333,0.120751,0.233336,0.12263
3,0.844328,0.145469,0.208333,0.118234,0.266995,0.125831
4,0.819898,0.156788,0.208333,0.115741,0.239985,0.121152


In [254]:
sequence_length = 14
predicted_values = ["close"]

train_data_length = (0, round(0.7 * scaled_indicators.shape[0]))
val_data_length = (round(0.7 * scaled_indicators.shape[0])+1, round(0.9 * scaled_indicators.shape[0]))
test_data_length = (round(0.9 * scaled_indicators.shape[0])+1, scaled_indicators.shape[0])

train_x = generate_input_sequences(data=scaled_indicators[train_data_length[0]:train_data_length[1]],
                                   input_length=sequence_length)
train_y = generate_output_sequences(data=scaled_indicators[train_data_length[0]:train_data_length[1]],
                                    input_length=sequence_length, outputs=predicted_values)

val_x = generate_input_sequences(data=scaled_indicators[val_data_length[0]:val_data_length[1]],
                                 input_length=sequence_length)
val_y = generate_output_sequences(data=scaled_indicators[val_data_length[0]:val_data_length[1]],
                                  input_length=sequence_length, outputs=predicted_values)

test_x = generate_input_sequences(data=scaled_indicators[test_data_length[0]:test_data_length[1]],
                                  input_length=sequence_length)
test_y = generate_output_sequences(data=scaled_indicators[test_data_length[0]:test_data_length[1]],
                                   input_length=sequence_length, outputs=predicted_values)

print(f"Train_x: {train_x.shape}")
print(f"Train_y: {train_y.shape}")
print("---------------------------------")
print(f"Val_x:   {val_x.shape}")
print(f"Val_y:   {val_y.shape}")
print("---------------------------------")
print(f"Test_x:  {test_x.shape}")
print(f"Test_y:  {test_y.shape}")

Train_x: (1202, 14, 6)
Train_y: (1202, 1)
---------------------------------
Val_x:   (331, 14, 6)
Val_y:   (331, 1)
---------------------------------
Test_x:  (158, 14, 6)
Test_y:  (158, 1)


In [256]:
epochs = 40
batch_size = 64

model_name = "first_model"

model = models.Sequential(name=model_name)

model.add(layers.LSTM(units=50, activation="relu", name="LSTM_1", dropout=0.05, 
                      input_shape=(train_x.shape[1], train_x.shape[2]), 
                      return_sequences=True))

model.add(layers.LSTM(units=100, activation="relu", name="LSTM_2", dropout=0.1))

model.add(layers.Dense(units=200, activation="relu", name="Dense_1"))
model.add(layers.Dropout(0.15, name="Dropout_1"))

model.add(layers.Dense(units=400, activation="relu", name="Dense_2"))
model.add(layers.Dropout(0.25, name="Dropout_2"))

model.add(layers.Dense(units=50, activation="relu", name="Dense_3"))

model.add(layers.Dense(1, activation="tanh", name="classifier"))

model.compile(optimizer="Adam", 
              loss="mae", 
              metrics=["mean_absolute_percentage_error"])
model.summary()

history =  model.fit(train_x, train_y,
                     epochs=epochs,
                     batch_size=batch_size, 
                     validation_data=(val_x, val_y))


model_save_name = next_free_model_name(model_name)
model_save_path = MODELS_DIR + "/" + model_save_name + ".h5"
model.save(model_save_path)

history_save_path = MODELS_DIR + "/" + model_save_name + ".csv"
history_dataframe = pd.DataFrame(history.history)
history_dataframe.to_csv(history_save_path)

Model: "first_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
LSTM_1 (LSTM)                (None, 14, 50)            11400     
_________________________________________________________________
LSTM_2 (LSTM)                (None, 100)               60400     
_________________________________________________________________
Dense_1 (Dense)              (None, 200)               20200     
_________________________________________________________________
Dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
Dense_2 (Dense)              (None, 400)               80400     
_________________________________________________________________
Dropout_2 (Dropout)          (None, 400)               0         
_________________________________________________________________
Dense_3 (Dense)              (None, 50)                

In [None]:
loaded_history = pd.read_csv(MODELS_DIR + "/" + "first_model_000.csv", index_col=0)
plot_losses(loaded_history)

model = keras.models.load_model(MODELS_DIR + "/" + "first_model_000.h5")
predictions = model.predict(test_x)
scaler = MinMaxScaler().fit(indicators["close"].to_numpy().reshape(-1,1))

results = pd.DataFrame()
results["guesses"] = scaler.inverse_transform(predictions)[:,0]
results["verification"] = scaler.inverse_transform(test_y)[:,0]
results["x"] = np.arange(results.shape[0])

fig = px.line(results, x="x", y=["guesses", "verification"], 
              color_discrete_sequence=px.colors.qualitative.G10,
              labels = {"y": "close price", "x": "test sample number"},
              template="plotly_white",
              title="Results Verification"
            )
fig.update_layout(
    title = dict(
        y = 0.9,
        x = 0.5,
        xanchor = "center",
        yanchor = "top",
    ),
)
fig.update_yaxes(tickprefix="$", title="closing price")
fig.show()

## Expanding training data using multiple companies
Data of all companies is stored in 3D tensor where 3rd dimension represents different company.  
Data is scaled to range (-1,1).

In [260]:
companies_to_process = 100
tickers = prices.symbol.unique()[:companies_to_process]

company_data = []
for ticker in tickers:
    stock_price = prices[prices["symbol"] == ticker]
    new_indicators = pd.DataFrame()
    
    new_indicators["OBV"] = ta.volume.on_balance_volume(stock_price["close"], stock_price["volume"])
    
    indicator = ta.volume.AccDistIndexIndicator(stock_price["high"], stock_price["low"],
                                            stock_price["close"], stock_price["volume"])
    new_indicators["ADI"] = indicator.acc_dist_index()
    
    indicator = ta.trend.AroonIndicator(stock_price["close"])
    new_indicators["AI"]  = indicator.aroon_indicator()
    
    indicator = ta.momentum.RSIIndicator(close=stock_price["close"], window=14)
    new_indicators["RSI"] = indicator.rsi()
    
    indicator = ta.volume.VolumeWeightedAveragePrice(stock_price["high"], stock_price["low"], 
                                     stock_price["close"],stock_price["volume"], window=14)
    new_indicators["VWAP"] = indicator.volume_weighted_average_price()
    
    new_indicators["close"] = stock_price["close"]
    new_indicators = new_indicators.to_numpy()
    
    company_data.append(new_indicators)

company_data = np.array(company_data, dtype=object)
price_data=company_data

new_price_data = []
for company in price_data:
    new_comapny = company[~np.isnan(company).any(axis=1)]
    new_price_data.append(new_comapny)

price_data = np.array(new_price_data, dtype=object)
scaled_price_data, scalers = scale_3d_price_data(price_data)

In [261]:
sequence_length = 14
predicted_values = -1 # last value in 
data_split = (0.7, 0.2, 0.1)

# price_data = np.array([0,1,2,3,4,5,6,7,8,9,10,11])
train_data_length = (0, round(data_split[0] * scaled_price_data.shape[0]))
val_data_length = (train_data_length[1], train_data_length[1] + round(data_split[1] * scaled_price_data.shape[0]))
test_data_length = (val_data_length[1], val_data_length[1] + round(data_split[2] * scaled_price_data.shape[0]))

input_data, output_data = [], []
for company in scaled_price_data:
    input_data.append(generate_input_sequences(data=company, input_length=sequence_length))
    output_data.append(generate_output_sequences(data=company, input_length=sequence_length, 
                                                 outputs=predicted_values))

train_x_segmented = np.array(input_data[train_data_length[0]:train_data_length[1]], dtype=object)
train_y_segmented = np.array(output_data[train_data_length[0]:train_data_length[1]], dtype=object)
train_scalers = scalers[train_data_length[0]:train_data_length[1]]
train_tickers = tickers[train_data_length[0]:train_data_length[1]]

val_x_segmented = np.array(input_data[val_data_length[0]:val_data_length[1]], dtype=object)
val_y_segmented = np.array(output_data[val_data_length[0]:val_data_length[1]], dtype=object)
val_scalers = scalers[val_data_length[0]:val_data_length[1]]
val_tickers = tickers[val_data_length[0]:val_data_length[1]]

test_x_segmented = np.array(input_data[test_data_length[0]:test_data_length[1]], dtype=object)
test_y_segmented = np.array(output_data[test_data_length[0]:test_data_length[1]], dtype=object)
test_scalers = scalers[test_data_length[0]:test_data_length[1]]
test_tickers = tickers[test_data_length[0]:test_data_length[1]]
train_x = flatten_data(train_x_segmented)
train_y = flatten_data(train_y_segmented)

val_x = flatten_data(val_x_segmented)
val_y = flatten_data(val_y_segmented)

print(f"Segmented train_x: {train_x_segmented.shape}")
print(f"Segmented rain_y: {train_y_segmented.shape}")
print("---------------------------------")
print(f"Segmented val_x:   {val_x_segmented.shape}")
print(f"Segmented val_y:   {val_y_segmented.shape}")
print("---------------------------------")
print(f"Segmented test_x:  {test_x_segmented.shape}")
print(f"Segmented test_y:  {test_y_segmented.shape}")
print("---------------------------------")
print(f"Train_x:  {train_x.shape}")
print(f"Train_y:  {train_y.shape}")
print("---------------------------------")
print(f"Val_x:   {val_x.shape}")
print(f"Val_y:   {val_y.shape}")


Segmented train_x: (70,)
Segmented rain_y: (70,)
---------------------------------
Segmented val_x:   (20, 1723, 14, 6)
Segmented val_y:   (20, 1723, 1)
---------------------------------
Segmented test_x:  (10, 1723, 14, 6)
Segmented test_y:  (10, 1723, 1)
---------------------------------
Train_x:  (119099, 14, 6)
Train_y:  (119099, 1)
---------------------------------
Val_x:   (34460, 14, 6)
Val_y:   (34460, 1)


In [267]:
epochs = 30
batch_size = 256

In [268]:
model_name = "second_model"

model = models.Sequential(name=model_name)

model.add(layers.LSTM(units=50, activation="relu", name="LSTM_1", dropout=0.1, 
                      input_shape=(train_x.shape[1], train_x.shape[2]), 
                      return_sequences=True))

model.add(layers.LSTM(units=100, activation="relu", name="LSTM_2", dropout=0.1))

model.add(layers.Dense(units=200, activation="relu", name="Dense_1"))
model.add(layers.Dropout(0.15, name="Dropout_1"))

model.add(layers.Dense(units=400, activation="relu", name="Dense_2"))
model.add(layers.Dropout(0.25, name="Dropout_2"))

model.add(layers.Dense(units=50, activation="relu", name="Dense_3"))

model.add(layers.Dense(1, activation="tanh", name="classifier"))

model.compile(optimizer="Adam", 
              loss="mae", 
              metrics=["mean_absolute_percentage_error"])
model.summary()

history =  model.fit(train_x, train_y,
                     epochs=epochs,
                     batch_size=batch_size, 
                     validation_data=(val_x, val_y))

model_save_name = next_free_model_name(model_name)
model_save_path = MODELS_DIR + "/" + model_save_name + ".h5"
model.save(model_save_path)

history_save_path = MODELS_DIR + "/" + model_save_name + ".csv"
history_dataframe = pd.DataFrame(history.history)
history_dataframe.to_csv(history_save_path)

Model: "second_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
LSTM_1 (LSTM)                (None, 14, 50)            11400     
_________________________________________________________________
LSTM_2 (LSTM)                (None, 100)               60400     
_________________________________________________________________
Dense_1 (Dense)              (None, 200)               20200     
_________________________________________________________________
Dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
Dense_2 (Dense)              (None, 400)               80400     
_________________________________________________________________
Dropout_2 (Dropout)          (None, 400)               0         
_________________________________________________________________
Dense_3 (Dense)              (None, 50)               

In [None]:
model = keras.models.load_model(MODELS_DIR + "/" + "second_model_000.h5")
loaded_history = pd.read_csv(MODELS_DIR + "/" + "second_model_000.csv", index_col=0)
plot_losses(loaded_history)
results = generate_verification_dataframe(model, test_x_segmented, test_y_segmented, test_tickers)
plot_result_verification(results)

In [275]:
model_name = "third_model"

model = models.Sequential(name=model_name)

model.add(layers.LSTM(units=50, activation="relu", name="LSTM_1", dropout=0.1, 
                      input_shape=(train_x.shape[1], train_x.shape[2]), 
                      return_sequences=True))

model.add(layers.LSTM(units=100, activation="relu", name="LSTM_2", dropout=0.1))

# model.add(layers.LSTM(units=200, activation="relu", name="LSTM_3", dropout=0.15))

model.add(layers.Dense(units=200, activation="relu", name="Dense_1"))
model.add(layers.Dropout(0.20, name="Dropout_1"))

model.add(layers.Dense(units=400, activation="relu", name="Dense_2"))
model.add(layers.Dropout(0.30, name="Dropout_2"))

model.add(layers.Dense(units=50, activation="relu", name="Dense_3"))

model.add(layers.Dense(1, activation="sigmoid", name="classifier"))

model.compile(optimizer="Adam", 
              loss="mae", 
              metrics=["mean_absolute_percentage_error"])
model.summary()

history =  model.fit(train_x, train_y,
                     epochs=epochs,
                     batch_size=batch_size, 
                     validation_data=(val_x, val_y))

model_save_name = next_free_model_name(model_name)
model_save_path = MODELS_DIR + "/" + model_save_name + ".h5"
model.save(model_save_path)

history_save_path = MODELS_DIR + "/" + model_save_name + ".csv"
history_dataframe = pd.DataFrame(history.history)
history_dataframe.to_csv(history_save_path)

Model: "third_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
LSTM_1 (LSTM)                (None, 14, 50)            11400     
_________________________________________________________________
LSTM_2 (LSTM)                (None, 100)               60400     
_________________________________________________________________
Dense_1 (Dense)              (None, 200)               20200     
_________________________________________________________________
Dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
Dense_2 (Dense)              (None, 400)               80400     
_________________________________________________________________
Dropout_2 (Dropout)          (None, 400)               0         
_________________________________________________________________
Dense_3 (Dense)              (None, 50)                

In [None]:
model = keras.models.load_model(MODELS_DIR + "/" + "third_model_000.h5")
loaded_history = pd.read_csv(MODELS_DIR + "/" + "third_model_000.csv", index_col=0)
plot_losses(loaded_history)
results = generate_verification_dataframe(model, test_x_segmented, test_y_segmented, test_tickers)
plot_result_verification(results)

In [277]:
companies_to_process = 500
tickers = prices.symbol.unique()[:companies_to_process]

company_data = []
for ticker in tickers:
    stock_price = prices[prices["symbol"] == ticker]
    new_indicators = pd.DataFrame()
    
    new_indicators["OBV"] = ta.volume.on_balance_volume(stock_price["close"], stock_price["volume"])
    
    indicator = ta.volume.AccDistIndexIndicator(stock_price["high"], stock_price["low"],
                                            stock_price["close"], stock_price["volume"])
    new_indicators["ADI"] = indicator.acc_dist_index()
    
    indicator = ta.trend.AroonIndicator(stock_price["close"])
    new_indicators["AI"]  = indicator.aroon_indicator()
    
    indicator = ta.momentum.RSIIndicator(close=stock_price["close"], window=14)
    new_indicators["RSI"] = indicator.rsi()
    
    indicator = ta.volume.VolumeWeightedAveragePrice(stock_price["high"], stock_price["low"], 
                                     stock_price["close"],stock_price["volume"], window=14)
    new_indicators["VWAP"] = indicator.volume_weighted_average_price()
    
    new_indicators["close"] = stock_price["close"]
    new_indicators = new_indicators.to_numpy()
    
    company_data.append(new_indicators)

company_data = np.array(company_data, dtype=object)
price_data=company_data

new_price_data = []
for company in price_data:
    new_comapny = company[~np.isnan(company).any(axis=1)]
    new_price_data.append(new_comapny)

price_data = np.array(new_price_data, dtype=object)
scaled_price_data, scalers = scale_3d_price_data(price_data)

In [278]:
sequence_length = 14
predicted_values = -1 # last value in 
data_split = (0.7, 0.2, 0.1)

# price_data = np.array([0,1,2,3,4,5,6,7,8,9,10,11])
train_data_length = (0, round(data_split[0] * scaled_price_data.shape[0]))
val_data_length = (train_data_length[1], train_data_length[1] + round(data_split[1] * scaled_price_data.shape[0]))
test_data_length = (val_data_length[1], val_data_length[1] + round(data_split[2] * scaled_price_data.shape[0]))

input_data, output_data = [], []
for company in scaled_price_data:
    input_data.append(generate_input_sequences(data=company, input_length=sequence_length))
    output_data.append(generate_output_sequences(data=company, input_length=sequence_length, 
                                                 outputs=predicted_values))

train_x_segmented = np.array(input_data[train_data_length[0]:train_data_length[1]], dtype=object)
train_y_segmented = np.array(output_data[train_data_length[0]:train_data_length[1]], dtype=object)
train_scalers = scalers[train_data_length[0]:train_data_length[1]]
train_tickers = tickers[train_data_length[0]:train_data_length[1]]

val_x_segmented = np.array(input_data[val_data_length[0]:val_data_length[1]], dtype=object)
val_y_segmented = np.array(output_data[val_data_length[0]:val_data_length[1]], dtype=object)
val_scalers = scalers[val_data_length[0]:val_data_length[1]]
val_tickers = tickers[val_data_length[0]:val_data_length[1]]

test_x_segmented = np.array(input_data[test_data_length[0]:test_data_length[1]], dtype=object)
test_y_segmented = np.array(output_data[test_data_length[0]:test_data_length[1]], dtype=object)
test_scalers = scalers[test_data_length[0]:test_data_length[1]]
test_tickers = tickers[test_data_length[0]:test_data_length[1]]
train_x = flatten_data(train_x_segmented)
train_y = flatten_data(train_y_segmented)

val_x = flatten_data(val_x_segmented)
val_y = flatten_data(val_y_segmented)

print(f"Segmented train_x: {train_x_segmented.shape}")
print(f"Segmented rain_y: {train_y_segmented.shape}")
print("---------------------------------")
print(f"Segmented val_x:   {val_x_segmented.shape}")
print(f"Segmented val_y:   {val_y_segmented.shape}")
print("---------------------------------")
print(f"Segmented test_x:  {test_x_segmented.shape}")
print(f"Segmented test_y:  {test_y_segmented.shape}")
print("---------------------------------")
print(f"Train_x:  {train_x.shape}")
print(f"Train_y:  {train_y.shape}")
print("---------------------------------")
print(f"Val_x:   {val_x.shape}")
print(f"Val_y:   {val_y.shape}")


Segmented train_x: (350,)
Segmented rain_y: (350,)
---------------------------------
Segmented val_x:   (100, 1723, 14, 6)
Segmented val_y:   (100, 1723, 1)
---------------------------------
Segmented test_x:  (50,)
Segmented test_y:  (50,)
---------------------------------
Train_x:  (601539, 14, 6)
Train_y:  (601539, 1)
---------------------------------
Val_x:   (172300, 14, 6)
Val_y:   (172300, 1)


In [283]:
epochs = 30
batch_size=512
model_name = "max_data_large_model"

model = models.Sequential(name=model_name)

model.add(layers.LSTM(units=50, activation="relu", name="LSTM_1", dropout=0.1, 
                      input_shape=(train_x.shape[1], train_x.shape[2]), 
                      return_sequences=True))

model.add(layers.LSTM(units=100, activation="relu", name="LSTM_2", dropout=0.1, return_sequences=True))

model.add(layers.LSTM(units=200, activation="relu", name="LSTM_3", dropout=0.2))

model.add(layers.Dense(units=200, activation="relu", name="Dense_1"))
model.add(layers.Dropout(0.20, name="Dropout_1"))

model.add(layers.Dense(units=400, activation="relu", name="Dense_2"))
model.add(layers.Dropout(0.30, name="Dropout_2"))

model.add(layers.Dense(units=50, activation="relu", name="Dense_3"))

model.add(layers.Dense(1, activation="sigmoid", name="classifier"))

model.compile(optimizer="Adam", 
              loss="mae", 
              metrics=["mean_absolute_percentage_error"])
model.summary()

history =  model.fit(train_x, train_y,
                     epochs=epochs,
                     batch_size=batch_size, 
                     validation_data=(val_x, val_y))

model_save_name = next_free_model_name(model_name)
model_save_path = MODELS_DIR + "/" + model_save_name + ".h5"
model.save(model_save_path)

history_save_path = MODELS_DIR + "/" + model_save_name + ".csv"
history_dataframe = pd.DataFrame(history.history)
history_dataframe.to_csv(history_save_path)

Model: "max_data_large_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
LSTM_1 (LSTM)                (None, 14, 50)            11400     
_________________________________________________________________
LSTM_2 (LSTM)                (None, 14, 100)           60400     
_________________________________________________________________
LSTM_3 (LSTM)                (None, 200)               240800    
_________________________________________________________________
Dense_1 (Dense)              (None, 200)               40200     
_________________________________________________________________
Dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
Dense_2 (Dense)              (None, 400)               80400     
_________________________________________________________________
Dropout_2 (Dropout)          (None, 400)      

In [None]:
model = keras.models.load_model(MODELS_DIR + "/" + "max_data_large_model_000.h5")
loaded_history = pd.read_csv(MODELS_DIR + "/" + "max_data_large_model_000.csv", index_col=0)
plot_losses(loaded_history)
results = generate_verification_dataframe(model, test_x_segmented, test_y_segmented, test_tickers)
plot_result_verification(results)