<a href="https://colab.research.google.com/github/StevenVuong/self_learning/blob/master/Financial_data_playaround.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Goal: Apply basic algorithms (LSTM, 1D-CNN) and fbprophet to financial data

#### Loading and Preparing Data

##### Module Installations and Imports

In [0]:
!pip install yfinance # https://pypi.org/project/yfinance/

In [0]:
# Standard Imports
import pandas as pd
import numpy as np
import datetime

# YFinance API
import yfinance as yf

# Display Imports
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual, widgets
import pprint
import matplotlib.pyplot as plt

%matplotlib inline

In [0]:
try:
  # Use the %tensorflow_version magic if in colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [0]:
import tensorflow as tf

keras = tf.keras

##### Some Utility Classes/Functions

In [0]:
# Ref: https://stackoverflow.com/questions/8924173/how-do-i-print-bold-text-in-python
class color:
    """ANSI escape sequences for styling text written with print()."""

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'

    # Resets every attribute set by the codes above
    END = '\033[0m'


# Quick visual check that the escape codes are honoured by the output area
print(''.join((color.BOLD, 'I am BOLD!', color.END)))

In [0]:
def format_date(date:datetime.datetime) -> str:
  """
  Formats a date as a zero-padded 'YYYY-MM-DD' string.

  Only the `year`, `month` and `day` attributes are read, so both
  `datetime.datetime` and `datetime.date` objects are accepted; any time of
  day is dropped.

  Args:
    date: the date (or datetime) to format.

  Returns:
    The date as 'YYYY-MM-DD', with the year padded to four digits.
  """
  # Integer format specs throughout: the year is an int, and padding it to
  # four digits keeps the result a well-formed date string for years < 1000.
  return f"{date.year:04d}-{date.month:02d}-{date.day:02d}"

In [0]:
def dataframe_plots(stock_df: pd.DataFrame):
  """
  Draws three quick-look figures for a stock dataframe, one after another:
  -  High and Low
  -  Open and Adjusted Close
  -  Volume
  Each figure covers whatever time window the dataframe holds.

  Args:
    stock_df: dataframe exposing `High`, `Low`, `Open`, `Adj_Close` and
      `Volume` columns.
  """
  # One row per figure: (title, columns to draw, add legend?, tighten layout?)
  figures = (
    ("High and Low", ("High", "Low"), True, False),
    ("Open and Adjusted Close", ("Open", "Adj_Close"), True, True),
    ("Volume", ("Volume",), False, True),
  )

  for title, columns, with_legend, tighten in figures:
    plt.subplot(2, 1, 1)

    # Each series is labelled with its own column name
    for column in columns:
      getattr(stock_df, column).plot(label=column)

    if with_legend:
      plt.legend(loc="best")
    plt.title(title)

    if tighten:
      plt.tight_layout()
    plt.show()

In [0]:
def process_stock_df(dataframe:pd.DataFrame) -> pd.DataFrame:
  """
  Returns a copy of a stock dataframe prepared for analysis.

  - The "Adj Close" column is renamed to "Adj_Close" (no space), so it can be
    reached with attribute access.
  - `hi_lo_diff` is added as High minus Low.
  - `op_adcl_diff` is added as Adj_Close minus Open.

  The dataframe passed in is left untouched.
  """
  renamed = dataframe.rename(columns={"Adj Close": "Adj_Close"})

  return renamed.assign(
    hi_lo_diff=renamed.High - renamed.Low,
    op_adcl_diff=renamed.Adj_Close - renamed.Open,
  )

In [0]:
def dataframe_plots_diff(stock_df_adj: pd.DataFrame):
  """
  Plots the following for a processed stock dataframe, one figure each:
  -  hi_lo_diff   (titled "High - Low Difference")
  -  op_adcl_diff (titled "Open - Adjusted Close")
  -  Volume
  For the given time Window

  Args:
    stock_df_adj: dataframe exposing `hi_lo_diff`, `op_adcl_diff` and
      `Volume` columns (the columns `process_stock_df` adds, plus Volume).
  """
  # All three figures read from the argument. Previously the last two used the
  # notebook-level `stock_df`, so the function ignored its input for them (and
  # raised NameError when that global did not exist).
  plt.subplot(2,1,1)
  stock_df_adj.hi_lo_diff.plot()
  plt.title("High - Low Difference")
  plt.show()

  plt.subplot(2,1,1)
  stock_df_adj.op_adcl_diff.plot()
  plt.title("Open - Adjusted Close")
  plt.show()

  plt.subplot(2,1,1)
  stock_df_adj.Volume.plot()
  plt.title("Volume")
  plt.show()

##### Configurables: 
-  Days back to get data from
-  Stock Tags to examine

In [0]:
# Upper bound of the data window: the moment this cell runs. This is a
# datetime object; format_date() turns it into a 'YYYY-MM-DD' string on demand.
end_date = datetime.datetime.now()

# Tickers to download. Other candidates: "MSFT", "GOOGL", "AMZN"
stock_tag_list = ["BABA"]

print("Stock Tag Data to be obtained:")
pprint.pprint(stock_tag_list)

In [0]:
# Default look-back (in days) and the matching start-date string
days_back = 1000
date_str_past = format_date(end_date - datetime.timedelta(days=days_back))

print("Slide To adjust data taken from number of Days Past")
@interact(x=widgets.IntSlider(min=0, max=1000, step=1, value=days_back))
def adjust_time_period(x):
    """
    Slider callback: sets the look-back window to `x` days before `end_date`.

    Updates the notebook-level `days_back` and `date_str_past`, then prints
    the resulting start and end dates.
    """
    # Without this declaration the assignments below only create locals, and
    # moving the slider would not change the values used elsewhere.
    global days_back, date_str_past

    days_back = x

    date_str_now = format_date(end_date)
    date_str_past = format_date(end_date - datetime.timedelta(days=days_back))

    print(f"Start Period: {date_str_past}, End Period: {date_str_now}")

##### Loading the stocks

In [0]:
# Maps each stock tag to the dataframe downloaded for it
stock_df_dict = {}

for stock_tag in stock_tag_list:

  print(f"Loading financial data for: {stock_tag}")

  # Keywords make the window bounds explicit: from date_str_past to end_date
  stock_data = yf.download(stock_tag, start=date_str_past, end=end_date)
  stock_df_dict[stock_tag] = stock_data

##### Examine data for desired Stock Tag

In [0]:
# Ticker examined below; it is looked up in stock_df_dict, so it should be one
# of the tags that were downloaded.
stock_tag_to_analyse = "BABA"

In [0]:
# Index directly rather than using .get(): a tag that was never downloaded then
# fails here with a KeyError naming the tag, instead of passing None on to
# process_stock_df and failing there with an unrelated-looking AttributeError.
stock_df = process_stock_df(stock_df_dict[stock_tag_to_analyse])

In [0]:
# Heading with the ticker name wrapped in the bold / reset codes from `color`
print("Data For: " + color.BOLD + stock_tag_to_analyse + color.END)
# First rows of the processed dataframe (shown as the cell's output)
stock_df.head()

In [0]:
#dataframe_plots(stock_df)

In [0]:
#dataframe_plots_diff(stock_df)

#### Now the Real Analysis Begins
Goal: 
-  LSTM Forecasting for last three columns of df
  -  Ensemble the three and see how they do against test set
-  1D CNN Forecasting for last three columns of df
  - Again, Ensemble
  - Maybe we can increase the number of inputs to our NN, so there are 3 inputs instead  

##### Start with LSTM RNN Forecasting

In [0]:
## ToDo:
# We will start with an LSTM, see how we go from there..
# NN Inputs: Difference between Open & Adjusted Close, Volume, Difference between High and Low
# Also look at what the big dogs are investing in and copy their trades to a given degree?
# Do we want to filter out noise in our network?

In [0]:
# Split to train and test data, take 10% as test
train_split = 0.9

# Calendar days of the look-back window given to training
days_train = round(days_back * train_split)
# The test window is the remainder of the look-back period. Deriving it from
# days_train instead (as a fraction of the training days) makes it too short
# and leaves part of the look-back window unused.
days_test = days_back - days_train

print("Train/Test Split = {:.1f}:{:.1f}".format(train_split, (1-train_split)))

In [0]:
# Test window: the last `days_test` days, ending today
datetime_test_start_date = end_date - datetime.timedelta(days=days_test)
test_start_date = format_date(datetime_test_start_date)
test_end_date = format_date(end_date)

# Train window: the `days_train` days that end where the test window starts
datetime_train_start_date = datetime_test_start_date - datetime.timedelta(days=days_train)
train_start_date = format_date(datetime_train_start_date)
train_end_date = test_start_date

print("Training Time Period: {} to {}".format(train_start_date, train_end_date))
print("Testing Time Period: {} to {}".format(test_start_date, test_end_date))

In [0]:
# Getting Data for Train and Test: one download per window, with the window
# bounds passed by keyword so start and end cannot be mixed up.
df_train = yf.download(stock_tag_to_analyse, start=train_start_date, end=train_end_date)

df_test = yf.download(stock_tag_to_analyse, start=test_start_date, end=test_end_date)

In [0]:
# Apply the same rename and difference columns to both splits
df_train, df_test = (process_stock_df(frame) for frame in (df_train, df_test))

print("Number of days train: {}, Number of days test: {}".format(len(df_train), len(df_test)))

##### Okay, apparently we had to split into train and test series; now let's try actual LSTM RNN Forecasting

In [0]:
# Set what we want to train on: the High - Low difference column of each split,
# taken out of the dataframe as an array of values
x_train = df_train.hi_lo_diff.values
x_test = df_test.hi_lo_diff.values
# Notes on the candidate input columns:
# adjcl_diff needs to be multiplied by 100
# hi_lo_diff can be taken as is
# volume yet to work

In [0]:
def sequential_window_dataset(series, window_size):
  """
  Builds a tf.data pipeline of (input, target) windows that walk through
  `series` in order.
  Ref: https://www.tensorflow.org/api_docs/python/tf/data/Dataset

  Each window holds `window_size + 1` consecutive values and the next window
  starts `window_size` values later; a trailing incomplete window is dropped.
  A window is split into an input (all but its last value) and a target (all
  but its first value), i.e. the input shifted one step ahead.

  Args:
    series: 1-D sequence of values.
    window_size: number of time steps in each input window.

  Returns:
    A dataset yielding (input, target) pairs in batches of one.
  """
  span = window_size + 1  # input steps plus the extra step the target needs

  def to_input_and_target(chunk):
    # Target is the input advanced by one time step
    return chunk[:-1], chunk[1:]

  # Trailing axis of size 1: one feature per time step
  values = tf.expand_dims(series, axis=-1)
  windows = tf.data.Dataset.from_tensor_slices(values).window(
      span, shift=window_size, drop_remainder=True)
  # Each window is itself a dataset; batching it whole turns it into one tensor
  chunks = windows.flat_map(lambda window: window.batch(span))
  pairs = chunks.map(to_input_and_target)
  # Batches of one; prefetch so the next batch is prepared during the current one
  return pairs.batch(1).prefetch(1)

class ResetStatesCallback(keras.callbacks.Callback):
  """
  Keras callback that resets the model's states at the start of every epoch.

  Callbacks reference: https://keras.io/callbacks/
  """

  def on_epoch_begin(self, epoch, logs=None):
    """
    Calls `reset_states()` on the model being trained.

    Args:
      epoch: index of the epoch about to start (unused).
      logs: metrics dict passed by Keras (unused). Defaults to None so the
        signature matches the base Callback hook and can be called without it.
    """
    self.model.reset_states()

In [0]:
# Start from a clean Keras session and fixed seeds
keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

window_size = 15 # Number of entries to consider as one dataset
train_set = sequential_window_dataset(x_train, window_size)
valid_set = sequential_window_dataset(x_test, window_size)

# Two stacked stateful LSTMs that emit a value at every time step. A stateful
# layer needs a fixed batch size, hence batch_input_shape=[1, None, 1]
# (batch of 1, any number of time steps, 1 feature).
model = keras.models.Sequential([
  keras.layers.LSTM(128, return_sequences=True, stateful=True,
                    batch_input_shape=[1, None, 1]),
  keras.layers.LSTM(128, return_sequences=True, stateful=True),
  keras.layers.Dense(1),
  keras.layers.Lambda(lambda x: x ) # 1:1 mapping for the time being
])

# Learning-rate sweep: starts at 1e-8 and grows tenfold every 20 epochs
lr_schedule = keras.callbacks.LearningRateScheduler(
    lambda epoch: 1e-8 * 10**(epoch / 20)
    )
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
optimizer = keras.optimizers.SGD(learning_rate=1e-8, momentum=0.9)

model.compile(loss=keras.losses.Huber(),
              optimizer=optimizer,
              metrics=["mae"]) #metric?

reset_states = ResetStatesCallback()
model_checkpoint = keras.callbacks.ModelCheckpoint(
    "my_checkpoint.h5", save_best_only=True
)
early_stopping = keras.callbacks.EarlyStopping(patience=50)

history = model.fit(
    train_set, 
    epochs=500, # To be changed
    validation_data=valid_set,
    callbacks=[early_stopping, model_checkpoint, lr_schedule, reset_states]
    )

##### Plots for Model

In [0]:
# Training loss against the learning rate recorded in `history`, with a
# logarithmic x axis
plt.semilogx(history.history["lr"], history.history["loss"])
# View limits: learning rate 1e-8 to 1e-4 on x, loss 0 to 30 on y
plt.axis([1e-8, 1e-4, 0, 30])

##### Predict with Model

In [0]:
# Replace the in-memory model with the one saved to "my_checkpoint.h5".
# NOTE(review): the convolutional section further down checkpoints to the same
# file name, so this loads whichever model was trained last — confirm the
# cells are run in order.
model = keras.models.load_model("my_checkpoint.h5")

In [0]:
# Full series: training values followed by test values
all_data = np.concatenate((x_train, x_test), axis=None)

# Add a batch axis in front and a feature axis at the end before predicting
rnn_forecast = model.predict(all_data[np.newaxis, :, np.newaxis])
# The model is trained on targets shifted one step ahead of its inputs, so the
# output at step t is the forecast for step t + 1. The forecasts for the test
# values therefore sit one step earlier than the test values themselves, and
# the very last output (a forecast beyond the data) is dropped.
rnn_forecast = rnn_forecast[0, -len(x_test) - 1:-1, 0]

In [0]:
# Forecast against the actual test values; labels and a legend tell the two
# curves apart.
plt.plot(rnn_forecast, label="rnn_forecast")
plt.plot(x_test, label="test")
plt.legend(loc="best")
plt.show()

In [0]:
# Mean absolute error between the test values and the RNN forecast, converted
# to a NumPy value
keras.metrics.mean_absolute_error(x_test, rnn_forecast).numpy()

##### Fully Convolutional Forecasting Now

In [0]:
def seq2seq_window_dataset(series, window_size, batch_size=32,
                           shuffle_buffer=1000):
    """
    Builds a shuffled tf.data pipeline of overlapping (input, target) windows.

    Windows of `window_size + 1` consecutive values are taken at every
    position (shift of one); a trailing incomplete window is dropped. Each
    window is split into an input (all but its last value) and a target (all
    but its first value), i.e. the input shifted one step ahead.

    Args:
        series: 1-D sequence of values.
        window_size: number of time steps in each input window.
        batch_size: number of windows per batch.
        shuffle_buffer: buffer size used when shuffling the windows.

    Returns:
        A dataset yielding batches of (input, target) pairs.
    """
    span = window_size + 1  # input steps plus the extra step the target needs

    # Trailing axis of size 1: one feature per time step
    dataset = tf.data.Dataset.from_tensor_slices(tf.expand_dims(series, axis=-1))
    dataset = (
        dataset.window(span, shift=1, drop_remainder=True)
        .flat_map(lambda chunk: chunk.batch(span))
        .shuffle(shuffle_buffer)
        .map(lambda chunk: (chunk[:-1], chunk[1:]))
    )
    return dataset.batch(batch_size).prefetch(1)


def model_forecast(model, series, window_size):
    """
    Runs `model.predict` over every full window of `series`.

    Windows of `window_size` consecutive elements are taken at every position
    (shift of one), dropping a trailing incomplete window, and are fed to the
    model in batches of 32.

    Args:
        model: object with a `predict` method that accepts a tf.data dataset.
        series: the values to slide the window over.
        window_size: number of time steps per window.

    Returns:
        Whatever `model.predict` returns for the batched windows.
    """
    windows = (
        tf.data.Dataset.from_tensor_slices(series)
        .window(window_size, shift=1, drop_remainder=True)
        .flat_map(lambda chunk: chunk.batch(window_size))
    )
    return model.predict(windows.batch(32).prefetch(1))

In [0]:
# Start from a clean Keras session and fixed seeds
keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

window_size = 32
train_set = seq2seq_window_dataset(x_train, window_size,
                                   batch_size=128)
valid_set = seq2seq_window_dataset(x_test, window_size,
                                   batch_size=128)

# Stack of causal, dilated 1-D convolutions followed by a 1x1 convolution that
# maps back to one value per time step.
model = keras.models.Sequential()
model.add(keras.layers.InputLayer(input_shape=[None, 1]))
# NOTE(review): the same number is used for both `filters` and `dilation_rate`.
# With kernel_size=2 and causal padding, each layer combines step t with step
# t - dilation_rate; every dilation here is >= window_size (32), so within a
# training window the earlier tap always lands on the zero padding. Consider
# small dilations (e.g. 1, 2, 4, ...) chosen separately from the filter counts.
for dilation_rate in (32, 64, 128, 256, 128, 64, 32):
    model.add(
      keras.layers.Conv1D(filters=dilation_rate,
                          kernel_size=2,
                          strides=1,
                          dilation_rate=dilation_rate,
                          padding="causal",
                          activation="relu")
    )

model.add(keras.layers.Conv1D(filters=1, kernel_size=1))
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
optimizer = keras.optimizers.Adam(learning_rate=3e-4)
model.compile(loss=keras.losses.Huber(),
              optimizer=optimizer,
              metrics=["mae"])

In [0]:
# Layer-by-layer overview of the convolutional model held in `model`
model.summary()

In [0]:
# Checkpoint file is only rewritten when the model improves (save_best_only)
model_checkpoint = keras.callbacks.ModelCheckpoint(
    "my_checkpoint.h5",
    save_best_only=True,
)
early_stopping = keras.callbacks.EarlyStopping(patience=50)

history = model.fit(
    train_set,
    epochs=500,
    validation_data=valid_set,
    callbacks=[early_stopping, model_checkpoint],
)

In [0]:
# One forecast sequence per window of `window_size` values, at every position
cnn_forecast = model_forecast(model, all_data[..., np.newaxis], window_size)
# Targets are the inputs shifted one step ahead, so the last output of the
# window starting at i is the forecast for element i + window_size. The final
# window thus forecasts one step beyond the data: drop it, and take the
# len(x_test) windows before it so the forecasts line up with x_test.
cnn_forecast = cnn_forecast[-len(x_test) - 1:-1, -1, 0]

In [0]:
# CNN forecast against the actual test values, with a legend to tell them apart
plt.plot(cnn_forecast, label="cnn_forecast")
plt.plot(x_test, label="test")
plt.legend(loc="best")
plt.show()

In [0]:
# Mean absolute error between the CNN forecast and the test values, converted
# to a NumPy value
keras.metrics.mean_absolute_error(cnn_forecast, x_test).numpy()

In [0]:
# Inspect the training dataset object (shown as the cell's output)
train_set

##### Try Another Model (UNet with skip connections)

In [0]:
# Small U-Net-style 1-D network: filters widen 32 -> 64 -> 128, narrow back to
# 64, and a skip connection joins the two 64-filter feature maps.
inputs = keras.Input(shape=(None, 1), name="input")

conv1 = keras.layers.Conv1D(filters=32, kernel_size=2, strides=1, padding='same', activation='relu', name="f32")(inputs) 
conv2 = keras.layers.Conv1D(filters=64, kernel_size=2, strides=1, padding='same', activation='relu', name="f64")(conv1) 

conv3 = keras.layers.Conv1D(filters=128, kernel_size=2, strides=1, padding='same', activation='relu', name="f128")(conv2) 

conv4 = keras.layers.Conv1D(filters=64, kernel_size=2, strides=1, padding='same', activation='relu', name="f64_b")(conv3) 
# Skip connection: stack the feature maps along the channel (last) axis.
# axis=0 is the batch axis; concatenating there doubles the batch size, so the
# outputs no longer match the targets one-to-one.
conv5 = keras.layers.concatenate([conv2, conv4], axis=-1)

conv6 = keras.layers.Conv1D(filters=32, kernel_size=2, strides=1, padding='same', activation='relu')(conv5) 
conv7 = keras.layers.Conv1D(filters=1, kernel_size=1)(conv6)

modell = keras.Model(inputs=inputs, outputs=conv7)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
optimizer = keras.optimizers.Adam(learning_rate=3e-4)
modell.compile(loss=keras.losses.Huber(),
              optimizer=optimizer,
              metrics=["mae"])

In [0]:
# Summarise the skip-connection model built in this section (`modell`), not
# the earlier `model`
modell.summary()

In [0]:
# Draw the graph of the skip-connection model (`modell`), not the earlier `model`
keras.utils.plot_model(modell)

In [0]:
# Train on the same windowed datasets, stopping early via the existing
# `early_stopping` callback; nothing is checkpointed here.
history = modell.fit(
    train_set,
    epochs=500,
    validation_data=valid_set,
    callbacks=[early_stopping],
)  # does not seem to train?

#### Separate: Ticker reveals more Financial Info

In [0]:
# Ticker handle for MSFT and its price history over the longest period offered
yf_ticker = yf.Ticker("MSFT")
hist = yf_ticker.history(period="max")

In [0]:
# Show the ticker's `actions` attribute
# (presumably dividends and stock splits — confirm against the yfinance docs)
yf_ticker.actions

Idea:
 -  Face analysis web scraper
 -  Something to view financial statement; quarterly stuff, beginning, end etc.. Balance sheet
 -  Straight to CNN, skip the LSTM and take in a lot more data