# **Prediction of Stock Prices with Deep Learning**

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import display, Javascript, clear_output
import os
import shutil
from datetime import datetime

!pip install --upgrade tensorflow==2.4.0-rc4
import tensorflow as tf

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
tf.keras.backend.set_floatx('float64')

!pip install --upgrade scikit-learn==0.24.0rc1

!pip install yahooquery
from yahooquery import Ticker

!rm -rf stocks
!git clone https://github.com/Talendar/stocks_prices_prediction stocks

%load_ext autoreload
%autoreload 2

from stocks.aux.stocks_data import MultiStocksDataset
from stocks.aux.eval import *
from stocks.aux.normalization import *
from stocks.aux.tf_callbacks import ClearCallback

Collecting tensorflow==2.4.0-rc4
  Downloading tensorflow-2.4.0rc4-cp37-cp37m-manylinux2010_x86_64.whl (394.7 MB)
[K     |████████████████████████████████| 394.7 MB 25 kB/s s eta 0:00:01��████                      | 124.7 MB 49.8 MB/s eta 0:00:06     |█████████████▌                  | 166.4 MB 48.5 MB/s eta 0:00:05        | 171.2 MB 48.5 MB/s eta 0:00:05     |██████████████                  | 172.6 MB 48.5 MB/s eta 0:00:05     |███████████████                 | 184.1 MB 62.6 MB/s eta 0:00:04�█▌ | 376.1 MB 34.7 MB/s eta 0:00:01
Collecting grpcio~=1.32.0
  Downloading grpcio-1.32.0-cp37-cp37m-manylinux2014_x86_64.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 33.9 MB/s eta 0:00:01
[?25hCollecting numpy~=1.19.2
  Downloading numpy-1.19.4-cp37-cp37m-manylinux2010_x86_64.whl (14.5 MB)
[K     |████████████████████████████████| 14.5 MB 40.4 MB/s eta 0:00:01
[?25hCollecting six~=1.15.0
  Downloading six-1.15.0-py2.py3-none-any.whl (10 kB)
Collecting tensorflow-estimator<2.5

Collecting scikit-learn==0.24.0rc1
  Downloading scikit_learn-0.24.0rc1-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 661 kB/s eta 0:00:01
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.23.2
    Uninstalling scikit-learn-0.23.2:
      Successfully uninstalled scikit-learn-0.23.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
autogluon-core 0.0.15b20201207 requires scikit-learn<0.24,>=0.22.0, but you have scikit-learn 0.24.0rc1 which is incompatible.[0m
Successfully installed scikit-learn-0.24.0rc1
Collecting yahooquery
  Downloading yahooquery-2.2.8.tar.gz (45 kB)
[K     |████████████████████████████████| 45 kB 146 kB/s eta 0:00:011
Collecting requests-futures==1.0.0
  Downloading requests-futures-1.0.0.tar.gz (10 kB)
Coll

## **0) Base settings**

In [5]:
NUM_SESSIONS = 10  # number of previous trading sessions the model will analyse in order to make a prediction
NORMALIZE_LABELS = True  # when False, no label normalization is handed to the dataset

# fractions of the data used for training, validation and testing
TRAIN_PC, VAL_PC, TEST_PC = 0.75, 0.15, 0.1
# Compared with a tolerance: an exact `== 1` check on floats rejects valid
# splits such as (0.7, 0.2, 0.1), whose floating point sum is 0.9999999999999999.
assert abs((TRAIN_PC + VAL_PC + TEST_PC) - 1) < 1e-9, \
    "TRAIN_PC, VAL_PC and TEST_PC must add up to 1"

LABELS_NAMES = [    # values that the model will try to predict
    "open", "low", "high",
]

INTERVAL = "1d"  # interval between samples requested from the data provider
NAME_LIST = set([   # a set, so that a symbol listed twice is only loaded once
    # South America
    "^BVSP",                                        # Brazil
    # North America
    "^DJI", "^GSPC", "^IXIC", "^NYA", "^RUT",       # US
    "^MXX",                                         # Mexico
    "^GSPTSE",                                      # Canada
    # Europe
    "^FTSE", "^FCHI", "^GDAXI", "^IBEX", "^AEX", "^ATX",
    "^N100",  "^BFX", "^OMX",
    # Asia
    "000001.SS", "^HSI", "399001.SZ", "^TWII",      # China
    "^N225",                                        # Japan
    "^KS11",                                        # S. Korea
    "^BSESN", "^NSEI",                              # India
    "TA35.TA",                                      # Israel
    # Oceania
    "^AORD",                                        # Australia
])

# loading stocks with specific params:
STOCKS = {
}

# loading stocks from NAME_LIST with default info (same params for all).
# The names are sorted because the iteration order of a set of strings changes
# between kernel sessions; sorting keeps the order of STOCKS reproducible.
STOCKS.update({name: {"start": "2007-01-01",
                      "end": "2020-12-08",
                      "period": None}
               for name in sorted(NAME_LIST)})

## **1) Preparing the data**

#### **1.1) Fetching and pre-processing the data**

In [6]:
# symbols whose fraction of zero-volume sessions exceeds this value are skipped
MAX_ZERO_VOL_PC = 0.5

# symbol -> history accepted for the dataset. Kept under its own name so that
# `multi_data` is only ever bound to a fully built MultiStocksDataset.
stocks_hist = {}

for name, info in STOCKS.items():
    hist = Ticker(name).history(
        start=info["start"],
        end=info["end"],
        period=info["period"],
        interval=INTERVAL
    ).reset_index("symbol", drop=True)
    hist = hist[["high", "close", "open", "low", "volume"]]

    # an empty history would make the ratio below a division by zero
    if len(hist) == 0:
        print(f"[WARNING] No data was returned for {name}! Skipping symbol...")
        continue

    zero_vols_pc = hist["volume"].isin([0]).sum() / len(hist)
    if zero_vols_pc > MAX_ZERO_VOL_PC:
        print(f"[WARNING] {100*zero_vols_pc : .2f}% of the "
              f"volumes of {name} are 0! Skipping symbol...")
    else:
        stocks_hist[name] = hist


multi_data = MultiStocksDataset(
    stocks=stocks_hist,
    num_sessions=NUM_SESSIONS,
    labels_names=LABELS_NAMES,
    batch_size=len(STOCKS) * 5,
    data_split_pc=(TRAIN_PC, VAL_PC, TEST_PC),
    feature_normalization=(min_max_norm, min_max_denorm),
    # labels are only normalized when NORMALIZE_LABELS is set
    label_normalization=(min_max_norm, min_max_denorm)
                        if NORMALIZE_LABELS else None,
)

TypeError: from_generator() got an unexpected keyword argument 'output_signature'

#### **1.2) Inspecting the data**

In [None]:
visualize = "all"  # "all" or list with specific symbols
total_sessions = 0
for symbol, sdata in multi_data.stocks:
    raw = sdata.raw
    sessions_count = len(raw)
    total_sessions += sessions_count

    # every symbol is counted, but only the selected ones are displayed
    if visualize != "all" and symbol not in visualize:
        continue

    zero_vols_pc = raw["volume"].isin([0]).sum() / sessions_count
    hashes = "#" * 35
    print(f"\n\n{hashes}   {symbol}   {hashes}\n\n"
          f". Period: from {raw.index[0]} to {raw.index[-1]}\n"
          f". Trading sessions: {sessions_count}\n"
          f". Zero volumes: {100*zero_vols_pc : .2f}%\n"
          f". Data:\n")
    display(raw)

    print("\n. Statistics:\n")
    display(raw.describe())

    print("\n. Plot:\n")
    # each symbol is drawn in a randomly picked color
    line_color = np.random.rand(1, 3)
    ax = raw["open"].plot(figsize=(12, 5), color=line_color)
    ax.set_title(f"{symbol} opening prices\n", fontsize=16, color="#ffffff")
    ax.set_ylabel("Opening prices", fontsize="14", color="#ffffff")
    ax.set_xlabel("Date", fontsize="14", color="#ffffff")
    plt.show()
    print("\n\n" + "#" * 80 + "\n\n")

print(f"Total number of trading sessions: {total_sessions}")

In [None]:
# first and last index value of each data split, per symbol
for symbol, data in multi_data.stocks:
    print(f">>>>> {symbol}")

    train_dates = data.raw_train["features"].index
    print(". Training range: %s to %s" % (train_dates[0], train_dates[-1]))

    val_dates = data.raw_val["features"].index
    print(". Validation range: %s to %s" % (val_dates[0], val_dates[-1]))

    test_dates = data.raw_test["features"].index
    print(". Test range: %s to %s\n" % (test_dates[0], test_dates[-1]))

In [None]:
# last rows of the normalized training features of every symbol
hashes = "#" * 20
print(hashes + " Normalized training data " + hashes)
for symbol, data in multi_data.stocks:
    print(f"\n>>>>> {symbol}")
    train_features = data.norm_train["features"]
    display(train_features.tail())

In [None]:
# last rows of the normalized validation features of every symbol
hashes = "#" * 20
print(hashes + " Normalized validation data " + hashes)
for symbol, data in multi_data.stocks:
    print(f"\n>>>>> {symbol}")
    val_features = data.norm_val["features"]
    display(val_features.tail())

In [None]:
# last rows of the normalized test features of every symbol
hashes = "#" * 20
print(hashes + " Normalized test data " + hashes)
for symbol, data in multi_data.stocks:
    print(f"\n>>>>> {symbol}")
    test_features = data.norm_test["features"]
    display(test_features.tail())

In [None]:
# shapes of the first (inputs, labels) batch yielded by the training dataset
train_batches = iter(multi_data.tf_datasets["train"])
sample_x, sample_y = next(train_batches)
for kind, sample in (("input", sample_x), ("label", sample_y)):
    print(f"Sample {kind} shape: {sample.shape}")

## **2) Defining a model**

In [None]:
class MultiLSTM(tf.keras.Model):
    """Chained LSTM model predicting the opening, lowest and highest prices.

    Three stacked-LSTM predictors are evaluated in sequence: the opening price
    is predicted from the inputs alone, the lowest price additionally receives
    the predicted opening price, and the highest price receives both the
    predicted opening and lowest prices. The earlier predictions are appended
    as extra features to every step of the sequence produced by a dedicated
    "pre" LSTM layer.

    Args:
        num_sessions: number of time steps of each input sequence. It is used
            to repeat the intermediate predictions along the time axis, so it
            has to match the sequence length of the inputs.
        load_path: optional directory previously written by `custom_save`.
            When given, the sub-models and `num_sessions` are restored from it
            and the `num_sessions` argument is ignored.
    """

    def __init__(self, num_sessions=NUM_SESSIONS, load_path=None):
        super(MultiLSTM, self).__init__()
        if load_path is not None:
            self.custom_load(load_path)
        else:
            self._num_sessions = num_sessions
            self._open_predictor = self._build_predictor()

            self._pre_lowest = self._build_pre_layer()
            self._lowest_predictor = self._build_predictor()

            self._pre_highest = self._build_pre_layer()
            self._highest_predictor = self._build_predictor()

    @staticmethod
    def _build_predictor():
        """Builds the stacked-LSTM network shared by the three price predictors."""
        return tf.keras.models.Sequential([
            tf.keras.layers.LSTM(32, return_sequences=True),
            tf.keras.layers.Dropout(0.25),
            tf.keras.layers.LSTM(32, return_sequences=True),
            tf.keras.layers.Dropout(0.25),
            tf.keras.layers.LSTM(32, return_sequences=False),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dense(1, activation="relu"),
        ])

    @staticmethod
    def _build_pre_layer():
        """Builds the LSTM layer that pre-processes the inputs of a chained predictor."""
        return tf.keras.models.Sequential([
            tf.keras.layers.LSTM(32, return_sequences=True)
        ])

    def call(self, inputs, training=None):
        """Predicts the opening, lowest and highest prices for a batch.

        Args:
            inputs: batch of input sequences; each sequence is expected to
                have `num_sessions` time steps.
            training: forwarded to the predictors (enables dropout and batch
                normalization updates when True).

        Returns:
            A tensor of shape (batches, 3) holding, in this order, the
            predicted opening, lowest and highest prices.
        """
        # calculating the opening price
        open_price = self._open_predictor(inputs, training=training)

        # cloning prices: expanding shape from (batches, 1) to (batches, NUM_SESSIONS, 1)
        open_price_exp = tf.tile(tf.expand_dims(open_price, -1),
                                 [1, self._num_sessions, 1])

        # calculating the lowest price
        lowest_price = self._lowest_predictor(
            # appends the opening price to the end of each item in the input sequence
            tf.concat([self._pre_lowest(inputs), open_price_exp], -1),
            training=training,
        )

        # cloning prices: expanding shape from (batches, 1) to (batches, NUM_SESSIONS, 1)
        lowest_price_exp = tf.tile(tf.expand_dims(lowest_price, -1),
                                   [1, self._num_sessions, 1])

        # calculating the highest price
        highest_price = self._highest_predictor(
            # appends the opening price and the lowest price to the end of each item in the input sequence
            tf.concat(
                [self._pre_highest(inputs), open_price_exp, lowest_price_exp], -1),
            training=training,
        )

        # returning the concatenation of the opening price, lowest price and highest price
        return tf.concat([open_price, lowest_price, highest_price], -1)

    def custom_save(self, dir_path):
        """Saves `num_sessions` and every sub-model under `dir_path`.

        Layout written:
            info.txt       -> "num_sessions <value>"
            predictors/    -> open_predictor, low_predictor, high_predictor
            pre_layers/    -> pre_low, pre_high

        Each sub-model is saved twice: once under its plain name and once with
        an ".h5" suffix.
        """
        pred_dir = os.path.join(dir_path, "predictors")
        pre_dir = os.path.join(dir_path, "pre_layers")
        for directory in (dir_path, pred_dir, pre_dir):
            os.makedirs(directory, exist_ok=True)

        with open(os.path.join(dir_path, "info.txt"), "w") as file:
            file.write("num_sessions %d" % self._num_sessions)

        sub_models = (
            # predictors
            (self._open_predictor, pred_dir, "open_predictor"),
            (self._lowest_predictor, pred_dir, "low_predictor"),
            (self._highest_predictor, pred_dir, "high_predictor"),
            # pre-layers: written to their own directory, which used to be
            # created but left empty
            (self._pre_lowest, pre_dir, "pre_low"),
            (self._pre_highest, pre_dir, "pre_high"),
        )
        for sub_model, target_dir, name in sub_models:
            sub_model.save(os.path.join(target_dir, name))
            sub_model.save(os.path.join(target_dir, name + ".h5"))

    def custom_load(self, dir_path):
        """Restores `num_sessions` and every sub-model saved by `custom_save`."""
        with open(os.path.join(dir_path, "info.txt"), "r") as file:
            self._num_sessions = int(file.readline().split()[1])

        pred_dir = os.path.join(dir_path, "predictors")
        pre_dir = os.path.join(dir_path, "pre_layers")

        # loading predictors
        self._open_predictor = tf.keras.models.load_model(os.path.join(pred_dir, "open_predictor"))
        self._lowest_predictor = tf.keras.models.load_model(os.path.join(pred_dir, "low_predictor"))
        self._highest_predictor = tf.keras.models.load_model(os.path.join(pred_dir, "high_predictor"))

        # loading pre-layers
        self._pre_lowest = self._load_pre_layer(pre_dir, pred_dir, "pre_low")
        self._pre_highest = self._load_pre_layer(pre_dir, pred_dir, "pre_high")

    @staticmethod
    def _load_pre_layer(pre_dir, legacy_dir, name):
        """Loads a pre-layer from `pre_dir`, falling back to `legacy_dir`.

        Older saves stored the pre-layers next to the predictors, so that
        location is still accepted when `pre_dir` does not contain the model.
        """
        path = os.path.join(pre_dir, name)
        if not os.path.exists(path):
            path = os.path.join(legacy_dir, name)
        return tf.keras.models.load_model(path)

In [None]:
# sanity check: output shape of a freshly built model on one training batch
first_train_batch = next(iter(multi_data.tf_datasets["train"]))
x, y = first_train_batch
untrained_model = MultiLSTM()
untrained_model(x).shape

## **3) Training the model**

In [None]:
# loading all validation data into memory (so we don't have to use a generator)
val_batches = list(multi_data.tf_datasets["val"])

# stitching the batches back together along the batch axis
val_inputs = tf.concat([batch_x for batch_x, _ in val_batches], axis=0)
val_outputs = tf.concat([batch_y for _, batch_y in val_batches], axis=0)

print(val_inputs.shape, val_outputs.shape)

In [None]:
# every run gets its own timestamped output directory
save_path = f"saved_model_{datetime.today().strftime('%Y-%m-%d-%H-%M-%S')}"
checkpoint_path = os.path.join(save_path, "checkpoint_best")
os.makedirs(save_path, exist_ok=True)
tf.keras.backend.clear_session()

# building and compiling
model = MultiLSTM()
model.compile(loss=tf.losses.MeanSquaredError(),
              optimizer=tf.optimizers.Adam(learning_rate=1e-3),
              metrics=[])

# training
epochs = 20
history = model.fit(
    multi_data.tf_datasets["train"].shuffle(buffer_size=multi_data.size["train"]),
    epochs=epochs,
    validation_data=(val_inputs, val_outputs),
    callbacks=[ClearCallback(),
               # Only the weights are checkpointed: the checkpoint is read back
               # with `load_weights` below, and the complete model is saved
               # separately once training is over.
               ModelCheckpoint(filepath=checkpoint_path,
                               monitor='val_loss',
                               verbose=1,
                               save_best_only=True,
                               save_weights_only=True)],
)

# restoring the checkpoint of the best model
model.load_weights(checkpoint_path)

# saving and archiving the model
model.save(os.path.join(save_path, "full_save"))
model.custom_save(save_path)
shutil.make_archive(save_path, "zip", save_path)

# visualizing loss history
print("\n\n")
plt.rc('xtick',labelsize=12, color="#DCDCDC")
plt.rc('ytick',labelsize=12, color="#DCDCDC")
plt.rcParams.update({'legend.fontsize': 14, 'legend.handlelength': 2})

# The x axis follows the number of recorded epochs, so the plot stays valid
# even when fewer than `epochs` entries were recorded.
epochs_run = range(len(history.history["loss"]))

plt.figure(figsize=(10, 6))
plt.plot(epochs_run, history.history["loss"], 'r--')
plt.plot(epochs_run, history.history["val_loss"], 'b-')
plt.legend(['Training Loss', 'Validation Loss'])

plt.title("Loss History\n", fontsize=17, color="#E0E0E0")
plt.ylabel('MSE\n', fontsize=14, color="#E0E0E0")
plt.xlabel('\nEpoch', fontsize=14, color="#E0E0E0")
plt.show();

In [None]:
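# Optional: instead of training a new model, restore a previously saved one.
# Uncomment the lines below and point `load_path` at the saved model directory.
#model = MultiLSTM(load_path="saved_model_2020-12-10-01-42-11")
#model.compile(loss=tf.losses.MeanSquaredError(),
              #optimizer=tf.optimizers.Adam(learning_rate=1e-3),
              #metrics=[])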

## **4) Evaluating on all the stocks**

In [None]:
# loss of the model on each split of the combined dataset
split_losses = {
    split: model.evaluate(multi_data.tf_datasets[split])
    for split in ("train", "val", "test")
}

loss = pd.Series(split_losses.values(), index=split_losses.keys())
print("\n\n>>> Loss value (on the normalized data):")
loss

## **5) Evaluating on individual stocks**

In [None]:
# results[symbol][mode] and predictions[symbol][mode] for each data split
results, predictions = {}, {}

# Only the symbols that made it into the dataset are evaluated: STOCKS also
# lists the symbols that were skipped while fetching the data, and those are
# presumably not available through `multi_data[...]`.
symbols = [stk for stk, _ in multi_data.stocks]

for i, stk in enumerate(symbols):
    clear_output(wait=True)
    print(f"[{100*(i+1) / len(symbols) : .2f}%] Evaluating stock {i+1} of {len(symbols)}... ", end="")

    results[stk], predictions[stk] = {}, {}
    for mode in ["train", "val", "test"]:
        # NOTE: `eval` here is the helper star-imported from stocks.aux.eval
        # (it shadows the builtin of the same name)
        predictions[stk][mode], results[stk][mode] = eval(model,
                                                          multi_data[stk], mode)
    print("done!")

#### **5.1) Training data**

In [None]:
# symbol whose individual results are printed and plotted in the cells below
symbol = "^BVSP"

In [None]:
# metrics and a prediction plot for the selected symbol on the training split
print("     Training Data\n")
train_results = results[symbol]["train"]
eval_print(train_results)

train_predictions = predictions[symbol]["train"]
train_labels = multi_data[symbol].raw_train["labels"]
eval_plot(
    train_predictions,
    train_labels,
    start_date="random",
    plot_samples=100,
    title="Performance on the Training Set",
)

#### **5.2) Validation data**

In [None]:
# metrics and a prediction plot for the selected symbol on the validation split
print("     Validation Data\n")
val_results = results[symbol]["val"]
eval_print(val_results)

val_predictions = predictions[symbol]["val"]
val_labels = multi_data[symbol].raw_val["labels"]
eval_plot(
    val_predictions,
    val_labels,
    start_date="random",
    plot_samples=100,
    title="Performance on the Validation Set",
)

#### **5.3) Test data**

In [None]:
# metrics and a prediction plot for the selected symbol on the test split
print("    Test Data\n")
test_results = results[symbol]["test"]
eval_print(test_results)

test_predictions = predictions[symbol]["test"]
test_labels = multi_data[symbol].raw_test["labels"]
eval_plot(
    test_predictions,
    test_labels,
    start_date="random",
    plot_samples=100,
    title="Performance on the Test Set",
)