In [None]:
import keras
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Dropout, LSTM, Input, Activation, concatenate
from keras import optimizers
import pandas as pd
from sklearn import preprocessing
import numpy as np
# Fix the NumPy and TensorFlow global random seeds so repeated runs start from the same RNG state
np.random.seed(4)
tf.random.set_seed(4)

In [None]:
# Number of consecutive rows (days) in each input window; also the time dimension of the LSTM input
history_days = 50

In [None]:
def csv_to_dataset(csv_file_path):
  """Load one daily-price CSV and turn it into windowed model inputs and targets.

  The 'date' column and the row with index label 0 are dropped; every remaining
  column is min-max scaled over the whole file. Column 0 is treated as the open
  price and column 3 as the close price (the file is assumed to be laid out as
  open, high, low, close, volume).

  Args:
    csv_file_path: Path of the CSV file to read.

  Returns:
    A 5-tuple:
      ohlcv_histories_normalised: (n, history_days, n_columns) scaled windows.
      moving_averages_normalised: (n, 1) min-max scaled mean of the scaled
        close price over each window.
      next_day_open_values_normalised: (n, 1) scaled open of the row that
        follows each window (the training target).
      next_day_open_values: (n, 1) the same targets in original units.
      y_normaliser: MinMaxScaler fitted on the whole open column, so its
        inverse_transform maps scaled opens/predictions back to prices.
    where n = number_of_rows - history_days.

  Raises:
    ValueError: If the file has no more than `history_days` usable rows.
  """
  file_data = pd.read_csv(csv_file_path)
  file_data = file_data.drop('date', axis=1)
  file_data = file_data.drop(0, axis=0)
  print("File data DataFrame:", file_data.shape)
  print(file_data.head())
  file_data = file_data.values

  n_windows = len(file_data) - history_days
  if n_windows <= 0:
    raise ValueError(
        f"{csv_file_path} has {len(file_data)} usable rows; "
        f"more than {history_days} are required to build a window"
    )

  normalizing_scaler = preprocessing.MinMaxScaler()
  normalized_data = normalizing_scaler.fit_transform(file_data)
  print()
  print("Normalized data")
  print(normalized_data[0:5,:])

  # Data is in order of: Open stock value, high value, low, close, and volume - ohlcv
  # One (history_days x n_columns) window per start row; each window is one model input
  ohlcv_histories_normalised = np.array(
      [normalized_data[i : i + history_days] for i in range(n_windows)]
  )
  print()
  print("Normalized inputs", ohlcv_histories_normalised.shape)

  # Scaled open price of the row right after each window: the value the model predicts
  next_day_open_values_normalised = normalized_data[history_days:, 0:1].copy()
  print("Next day open values scaled:", next_day_open_values_normalised.shape)

  # The same targets in original (unscaled) units
  next_day_open_values = file_data[history_days:, 0:1].copy()
  print("Next day open values unscaled:", next_day_open_values.shape)

  # Fit on the full open column so this scaler uses the same min/max as the one
  # that produced the scaled targets; fitting only on rows after the first window
  # would skew inverse_transform whenever the extreme open lies in that window.
  y_normaliser = preprocessing.MinMaxScaler()
  y_normaliser.fit(file_data[:, 0:1])

  # Moving average technical indicator: mean scaled close (column 3, not open) per window
  moving_averages = ohlcv_histories_normalised[:, :, 3].mean(axis=1, keepdims=True)
  moving_averages_scaler = preprocessing.MinMaxScaler() # Scale with min-max scaler
  moving_averages_normalised = moving_averages_scaler.fit_transform(moving_averages)

  assert ohlcv_histories_normalised.shape[0] == next_day_open_values_normalised.shape[0] == moving_averages_normalised.shape[0]
  return ohlcv_histories_normalised, moving_averages_normalised, next_day_open_values_normalised, next_day_open_values, y_normaliser

In [None]:
def multiple_csv_to_dataset(test_set_name):
  """Build a training set from several CSV files and a test set from one.

  Every file in the current directory whose name ends with 'daily.csv', except
  `test_set_name`, is loaded with csv_to_dataset and the results are stacked
  along the first axis. Each file is scaled independently by csv_to_dataset.

  Args:
    test_set_name: File name of the CSV used as the test set; it is excluded
      from the training files.

  Returns:
    (ohlcv_train, mov_avg_train, open_prices_train,
     ohlcv_test, mov_avg_test, open_prices_test,
     unscaled_open_prices_test, y_normaliser)
    where y_normaliser is the target scaler of the test file.

  Raises:
    ValueError: If no training file is found in the current directory.
  """
  import os

  # os.listdir gives names in arbitrary order; sort so the stacked training
  # rows come out in the same order on every run and every machine.
  training_files = sorted(
      name for name in os.listdir('./')
      if name.endswith('daily.csv') and name != test_set_name
  )
  if not training_files:
    raise ValueError(
        "no '*daily.csv' training files found in the current directory "
        f"(excluding {test_set_name!r})"
    )

  # Collect per-file arrays and concatenate once instead of growing arrays in the loop
  history_parts, average_parts, target_parts = [], [], []
  for csv_file_path in training_files:
    print(csv_file_path)
    histories, averages, targets, _, _ = csv_to_dataset(csv_file_path)
    history_parts.append(histories)
    average_parts.append(averages)
    target_parts.append(targets)

  ohlcv_train = np.concatenate(history_parts, 0)
  mov_avg_train = np.concatenate(average_parts, 0)
  open_prices_train = np.concatenate(target_parts, 0)

  ohlcv_test, mov_avg_test, open_prices_test, unscaled_open_prices_test, y_normaliser = csv_to_dataset(test_set_name)

  return ohlcv_train, mov_avg_train, open_prices_train, ohlcv_test, mov_avg_test, open_prices_test, unscaled_open_prices_test, y_normaliser

In [None]:
# Build the windowed dataset from the Microsoft daily prices
(ohlcv_histories,
 moving_averages,
 next_day_open_values,
 unscaled_open_prices,
 y_normaliser) = csv_to_dataset('MSFT_daily.csv')

# The first 80% of the windows are used for training, the remainder for testing
train_split = 0.8
n = int(ohlcv_histories.shape[0] * train_split)

ohlcv_train, ohlcv_test = ohlcv_histories[:n], ohlcv_histories[n:]
mov_avg_train, mov_avg_test = moving_averages[:n], moving_averages[n:]
open_prices_train, open_prices_test = next_day_open_values[:n], next_day_open_values[n:]

# Only the test part of the unscaled targets is kept, for error reporting in price units
unscaled_open_prices_test = unscaled_open_prices[n:]



# Alternative to everything above in this cell: multiple_csv_to_dataset already returns train/test splits.
# With it, the training set is Microsoft, Netflix, and Facebook stock prices and Google stock prices is the test set.
#ohlcv_train, mov_avg_train, open_prices_train, ohlcv_test, mov_avg_test, open_prices_test, unscaled_open_prices_test, y_normaliser = multiple_csv_to_dataset('GOOGL_daily.csv')

In [None]:
# Build Model v2 - more complex layers, 2 inputs
# Two sets of input into model - previous stock prices over time and the technical indicator (moving average)
lstm_input = Input(shape=(history_days, 5), name='lstm_input')
dense_input = Input(shape=(mov_avg_train.shape[1],), name='tech_input')

# First branch of model has layers for first input, stock prices from data
x = LSTM(50, name='lstm_0')(lstm_input)
x = Dropout(0.2, name='lstm_dropout_0')(x)
lstm_branch = Model(inputs=lstm_input, outputs=x)

# Second branch - Moving Average technical indicator input
y = Dense(20, name='tech_dense_0')(dense_input)
y = Activation("relu", name='tech_relu_0')(y)
y = Dropout(0.2, name='tech_dropout_0')(y)
moving_averages_branch = Model(inputs=dense_input, outputs=y)

# Combine two branches
combined_branches = concatenate([lstm_branch.output, moving_averages_branch.output], name='concatenate')
z = Dense(64, activation="sigmoid", name='dense_pooling')(combined_branches)
z = Dense(1, activation="linear", name='dense_out')(z)

# Model takes inputs from both branches, outputs a single value
model = Model(inputs=[lstm_branch.input, moving_averages_branch.input], outputs=z)
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated and rejected by newer Keras releases
adam = optimizers.Adam(learning_rate=0.0005)
model.compile(optimizer=adam, loss='mse')

In [None]:
# Train on both inputs at once; 10% of the training samples are held out for validation
model.fit(
    x=[ohlcv_train, mov_avg_train],
    y=open_prices_train,
    batch_size=32,
    epochs=50,
    shuffle=True,
    validation_split=0.1,
)

In [None]:
# Loss of the compiled model (MSE) on the test windows, measured on the scaled targets
test_inputs = [ohlcv_test, mov_avg_test]
evaluation = model.evaluate(test_inputs, open_prices_test)
print(evaluation)

In [None]:
# Evaluate model in original price units
# Predict the test-set opens, then undo the target scaling
open_prices_test_predicted = y_normaliser.inverse_transform(
    model.predict([ohlcv_test, mov_avg_test])
)

# Same for every training window
open_prices_predicted = y_normaliser.inverse_transform(
    model.predict([ohlcv_train, mov_avg_train])
)

assert unscaled_open_prices_test.shape == open_prices_test_predicted.shape

# Mean squared error in price units, then divided by the test price range and expressed per hundred
test_errors = unscaled_open_prices_test - open_prices_test_predicted
real_mse = np.mean(np.square(test_errors))
test_price_range = np.max(unscaled_open_prices_test) - np.min(unscaled_open_prices_test)
scaled_mse = real_mse / test_price_range * 100
print(scaled_mse)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Real vs. predicted open prices on the test set, drawn on one large figure
fig = plt.gcf()
fig.set_size_inches(22, 15, forward=True)

# Slice bounds for the plotted range; end = -1 stops just before the final element
start, end = 0, -1
real = plt.plot(unscaled_open_prices_test[start:end], label='real')
pred = plt.plot(open_prices_test_predicted[start:end], label='predicted')

plt.legend(['Real', 'Predicted'])
plt.show()

In [None]:
# Persist the trained model to an .h5 file
model.save('multiple_input_one_dataset_model.h5')

In [None]:
# Spot-check: predict one test window at a time and print the first few results
# in price units. The original cell printed an undefined name (`predicted`).
samples_to_show = 10
window_pairs = zip(ohlcv_test[start: end], mov_avg_test[start: end])
for x, (ohlcv, ind) in enumerate(window_pairs):
    # Only the first few samples are reported, so stop instead of predicting the rest
    if x >= samples_to_show:
        break

    # Open price (column 0) of the last day in the window, mapped back to price units.
    # NOTE: exact only if y_normaliser uses the same min/max as the scaler that produced column 0.
    normalised_price_today = np.array([[ohlcv[-1][0]]])
    price_today = float(np.squeeze(y_normaliser.inverse_transform(normalised_price_today)))

    # Add an explicit batch axis of size 1 to each input rather than nesting Python lists
    prediction = model.predict([ohlcv[np.newaxis], ind[np.newaxis]], verbose=0)
    predicted_price_tomorrow = float(np.squeeze(y_normaliser.inverse_transform(prediction)))

    print(f"open today: {price_today:.2f}  predicted open tomorrow: {predicted_price_tomorrow:.2f}")