In [None]:
import pandas as pd
import csv
import datetime
import yfinance as yf
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from bs4 import BeautifulSoup as bs
import requests
from pyspark.sql.functions import sum,max,min,mean,count
import datetime as dt
import pyspark
from pyspark.sql import SparkSession
import findspark
import yaml
from yaml.loader import SafeLoader
from os.path import abspath

# Absolute path of the Hive warehouse directory handed to Spark below.
warehouse_location = abspath('spark-warehouse')

# Load the notebook settings; the 'spark' section is read here and the
# 'postgres' section further down.
with open('cfg.yml') as f:
    config = yaml.load(f, Loader=SafeLoader)

findspark.init()

# Build (or reuse) a Hive-enabled Spark session from the configured master and jars.
spark_cfg = config['spark']
spark = (
    SparkSession.builder
    .master(spark_cfg['spark_master'])
    .appName('gather')
    .enableHiveSupport()
    .config('spark.sql.warehouse.dir', warehouse_location)
    .config(spark_cfg['spark_jars'], spark_cfg['spark_jars_path'])
    .config('spark.cores.max', '2')
    .config('spark.executor.cores', '2')
    .getOrCreate()
)
spark.sparkContext.setLogLevel("WARN")
spark

# JDBC connection settings for the Postgres database, read from the loaded config.
url = config['postgres']['url']
properties = {
    'user': config['postgres']['user'],
    # Fixed: the password used to be read from the 'user' key, so the user name
    # was sent as the password.
    # NOTE(review): assumes cfg.yml stores it under postgres.password — confirm.
    'password': config['postgres']['password'],
    'url': url,
    'driver': config['postgres']['driver'],
}

# Training the Model on a Single Stock

## Retrieve data from database

In [None]:
def return_data(ticker_list, from_date, to_date):
    """Load the joined finance and sentiment rows for each ticker as pandas frames.

    Reads the `sentiment` and `company_data` tables over JDBC, de-duplicates them,
    joins them on `date` and `ticker`, replaces nulls with 0, and registers the
    result as the temp view `dataset`.

    Parameters
    ----------
    ticker_list : iterable
        Ticker symbols to extract.
    from_date, to_date :
        Inclusive bounds applied to the `date` column (converted with `str`).

    Returns
    -------
    list of pandas.DataFrame
        One frame per ticker that could be retrieved, sorted by `date` ascending.
        A ticker whose retrieval fails is reported and skipped, so the list can
        be shorter than `ticker_list`.
    """
    sentiment = spark.read.jdbc(url=url, table='sentiment', properties=properties).dropDuplicates()
    finance = spark.read.jdbc(url=url, table='company_data', properties=properties).dropDuplicates()
    # Joining on column names keeps a single `date` / `ticker` column; joining on
    # an equality expression keeps both copies and makes later references to
    # `ticker` and `date` ambiguous.
    full_data = finance.join(sentiment, on=['date', 'ticker']).fillna(0)
    full_data.createOrReplaceTempView('dataset')
    df_list = []
    for ticker in ticker_list:
        try:
            # Column expressions instead of a concatenated SQL string: the old
            # query left the ticker and dates unquoted and used `&` for AND.
            working_data = full_data.filter(
                (full_data['ticker'] == str(ticker))
                & full_data['date'].between(str(from_date), str(to_date))
            )
            # Spark is lazy, so the read actually happens (and can fail) here.
            ticker_frame = working_data.toPandas()
        except Exception as exc:
            print("Failed to Retrieve Data from Database for ticker " + str(ticker) + ". Please load necessary data and retry query")
            print("Reason: " + repr(exc))
            continue
        df_list.append(ticker_frame.sort_values(by='date', ascending=True))
    return df_list
dfs = return_data(['MSFT', 'GOOG'], "2018-01-01", "2023-02-02")

## Scale Numeric Columns

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
target_scaler = MinMaxScaler()
def scale(df_list):
    """Min-max scale the numeric feature columns and the target of a frame.

    Every frame in `df_list` is processed, but the scalers are refitted each
    time and only the result for the LAST frame is returned, so callers that
    want one specific stock should pass a one-element list.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames with at least the columns `ticker` and `target`.

    Returns
    -------
    (X_feat, target_scaler, target)
        X_feat: DataFrame of the numeric feature columns scaled to [0, 1],
        indexed by `date` when the frame has such a column.
        target_scaler: the MinMaxScaler fitted on `target` (for inverse transforms).
        target: the scaled target as an array with one column.

    Raises
    ------
    ValueError
        If `df_list` is empty.
    """
    if len(df_list) == 0:
        raise ValueError("scale() needs at least one DataFrame")
    feature_scaler = MinMaxScaler(feature_range=(0, 1))
    fitted_target_scaler = MinMaxScaler()
    for df in df_list:
        data = df.drop(['ticker'], axis=1)
        # Keep the dates as the index so they label rows without being scaled.
        if 'date' in data.columns:
            data = data.set_index('date')
        # scikit-learn transformers expect 2-D input, hence the double brackets.
        target = fitted_target_scaler.fit_transform(data[['target']])
        # Only numeric columns can be scaled (and later fed to the network).
        X_feat = data.drop(['target'], axis=1).select_dtypes(include='number').copy()
        for col in X_feat.columns:
            X_feat[col] = feature_scaler.fit_transform(X_feat[[col]]).ravel()
    return X_feat, fitted_target_scaler, target

# scale() returns three values and keeps only the last frame it is given. Later
# cells treat X_train / X_test as the Microsoft data and Google as the held-out
# test set, so pass the MSFT frame explicitly instead of the whole list.
msft_frames = [df for df in dfs if (df['ticker'] == 'MSFT').any()]
X_feat, target_scaler, target = scale(msft_frames)

## Format Training Data for LSTM

In [None]:
# Creating a data structure with 10 time-steps and 1 output
 
# Split data into X_train and y_train data sets
def lstm_split(data,target,steps,window=10):
    """Cut a series into fixed-length look-back windows and next-step labels.

    For every position i in range(window, steps) one sample is produced:
    the `window` rows of `data` that precede i, labelled with `target[i:i+1]`.

    Parameters
    ----------
    data : array-like
        Feature rows (first axis is time).
    target : array-like
        Label per row, aligned with `data`.
    steps : int
        Exclusive upper bound of the positions to label. It is clamped to the
        length of `data` and `target`, because windows read past the end would
        be shorter than the others and could not be stacked.
    window : int, default 10
        Number of rows in each look-back window.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        X stacks the windows, y stacks the one-element label slices. Both are
        empty when fewer than window + 1 rows are available.

    Raises
    ------
    ValueError
        If `window` is not positive.
    """
    if window <= 0:
        raise ValueError("window must be a positive integer")
    features = np.asarray(data)
    labels = np.asarray(target)
    stop = min(steps, len(features), len(labels))
    positions = range(window, stop)
    X = [features[i - window:i] for i in positions]
    y = [labels[i:i + 1] for i in positions]
    return np.array(X),np.array(y)
 
X1,y1 = lstm_split(X_feat,target,len(X_feat))

train_split = 0.9
split_idx = int(np.ceil(len(X1)*train_split))
date_index = X_feat.index
# Each sample is labelled with the row that follows its look-back window, so the
# first len(X_feat) - len(X1) index entries have no sample. Skip them; otherwise
# the date labels are shifted against the samples and the test labels come out
# longer than X_test.
sample_dates = date_index[len(X_feat) - len(X1):]

X_train,X_test = X1[:split_idx],X1[split_idx:]
y_train,y_test = y1[:split_idx],y1[split_idx:]
X_train_date,X_test_date = sample_dates[:split_idx],sample_dates[split_idx:]
assert len(X_train_date) == len(X_train) and len(X_test_date) == len(X_test)

print(X1.shape,X_train.shape,X_test.shape,y_test.shape,y_train.shape)
# X_test is a NumPy array, which has no .head(); slice the first windows instead.
print(X_test[:5])

## Constructing the Primary LSTM Model

In [None]:
#LSTM Framework
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.metrics import Precision
from keras.optimizers import Adam
#!pip install keras_tuner
import keras_tuner

#define function to create model, optional hyperparameters included to be selected during training
LR = 0.05
def build_model(hp):
  """Build and compile one candidate LSTM regressor for the keras_tuner search.

  Tunable through `hp`: the number of extra stacked LSTM layers (0-3), the width
  and activation of the first LSTM, and the dropout rate and width of each extra
  layer. The input shape is taken from the module-level `X_train`, and the
  learning rate from the module-level `LR` at the time the model is built.

  Parameters
  ----------
  hp : keras_tuner HyperParameters
      Hyperparameter container supplied by the tuner.

  Returns
  -------
  A compiled Sequential model with a single sigmoid output unit and MSE loss.
  """
  model = Sequential()
  hidden = hp.Choice('n_hidden', [0,1,2,3])
  model.add(LSTM(units = hp.Int('neurons_visible', min_value = X_train.shape[2], max_value = 100, step = 20),
                activation = hp.Choice('activate1', ['sigmoid', 'relu']),
                input_shape = (X_train.shape[1], X_train.shape[2]),
                # the first layer must emit the full sequence only if another LSTM follows
                return_sequences = hidden > 0))
  #Configure hidden layers based on random search determined hidden layer number
  for num in range(hidden):
    model.add(Dropout(hp.Float('dropout' +str(num+1), min_value = 0.1, max_value = 0.9, step = 0.3)))
    # Only the LAST stacked LSTM may collapse the sequence. `num` runs from 0 to
    # hidden-1, so the old test `num != hidden` was always true and the Dense
    # head received a 3-D sequence instead of one vector per sample.
    model.add(LSTM(units = hp.Int('neurons_hidden'+str(num+1), min_value = 20, max_value = 50, step = 10),
                   activation = 'relu', return_sequences = num < hidden - 1))

  model.add(Dropout(0.5))
  model.add(Dense(units = 1, activation = 'sigmoid'))

  #compile the model
  # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
  model.compile(loss = 'mean_squared_error', optimizer = Adam(learning_rate = LR), metrics = ['mean_squared_error'])

  return model

#set learning rate and early stopping callbacks
LR_decay = ReduceLROnPlateau('loss', patience=1, verbose=0,
                             factor=0.5, min_lr=1e-8)
# `baseline=0` was dropped: Keras only resets the patience counter when the
# monitored value improves on the baseline, and a squared-error loss cannot go
# below 0, so every trial would have stopped after `patience` epochs regardless
# of progress.
Early_stop = EarlyStopping(monitor='loss', min_delta=0,
                           patience=25, verbose=1, mode='auto',
                           restore_best_weights=True)
#arrange random search class
tune = keras_tuner.RandomSearch(build_model, objective = 'val_loss', max_trials = 50, seed = 1)

#complete training
tune.search(X_train, y_train, epochs = 200, batch_size = 24, validation_data = (X_test, y_test), callbacks = [LR_decay, Early_stop])

# Keep the best model found by the search and persist it for the later cells.
LSTM_model = tune.get_best_models()[0]
LSTM_model.save('LSTM_model.h5')
hyperparameters = tune.get_best_hyperparameters(1)[0]
print(hyperparameters.values)

## Constructing the Prophet Model

In [None]:
from prophet import Prophet
from sklearn.metrics import mean_squared_error
from prophet.diagnostics import cross_validation, performance_metrics
from sklearn.model_selection import ParameterGrid

In [None]:
# `finance_data` was not defined by any cell, so this line raised NameError on a
# fresh kernel. It is rebuilt here from the per-ticker frames loaded by
# return_data().
# NOTE(review): those frames hold the finance rows joined with sentiment; if the
# raw `company_data` table was intended instead, load that here — confirm.
finance_data = pd.concat(dfs, ignore_index=True)
MSFT_Data = finance_data[finance_data['ticker']=='MSFT']
# Reshape a price frame into the two-column ds / y layout that is passed to Prophet.
def data_preparation(df):
  """Return a new frame with only `date` and `adj_close`, renamed to `ds` and `y`."""
  column_map = {'date': 'ds', 'adj_close': 'y'}
  return df[list(column_map)].rename(columns=column_map)
# Candidate Prophet settings; every combination of these values is tried.
param_grid = dict(
    seasonality_mode=['additive', 'multiplicative'],
    changepoint_prior_scale=[0.01, 0.1, 1.0],
    seasonality_prior_scale=[0.01, 0.1, 1.0],
)

# Best result so far: start with "nothing found yet".
min_error = float('inf')
best_params = {}

for params in ParameterGrid(param_grid):
    print('Testing parameters:', params)

    # Fit a fresh Prophet model with this combination on the Microsoft series.
    model = Prophet(**params)
    model.fit(data_preparation(MSFT_Data))

    # Rolling-origin cross-validation: 1000 days of initial history, then a
    # 10-day forecast horizon evaluated every 10 days.
    df_cv = cross_validation(model=model, initial='1000 days', horizon='10 days', period='10 days')
    df_metrics = performance_metrics(df_cv)

    # Score the combination by its average MSE over the horizon.
    mean_cv_error = df_metrics['mse'].mean()

    # Keep only strict improvements, so the first of any tied combinations wins.
    if not (mean_cv_error < min_error):
        continue
    min_error = mean_cv_error
    best_params = params

# Report the winning combination and its score.
print('Best parameters:', best_params)
print('Minimum cross-validation error:', min_error)

In [None]:
# Refit Prophet on the Microsoft series using the best parameters from the grid search.
final_model = Prophet(**best_params)
msft_history = data_preparation(MSFT_Data)
final_model.fit(msft_history)

# Forecast over the fitted history plus 100 additional periods.
future = final_model.make_future_dataframe(periods=100)
validation_predict = final_model.predict(future)

# Show the point forecast with its uncertainty bounds, then the standard plots.
forecast_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
print(validation_predict[forecast_columns])
fig = final_model.plot(validation_predict)
fig1 = final_model.plot_components(validation_predict)

## Format Predictions of Sub-Models to Create Final Training Data

In [None]:
# `load` was never defined; Keras' loader for saved models is `load_model`.
from keras.models import load_model

lstm = load_model('LSTM_model.h5')

# The original cell was unfinished: the Prophet entry was assigned `f` (the
# config file handle from the first cell) and `hybrid_train` had no right-hand
# side. Below, the Prophet column is the fitted model's forecast for the dates
# of the LSTM training samples.
# NOTE(review): this needs X_train_date to hold real dates in ascending order
# (Prophet appears to return its forecast sorted by `ds`) — confirm.
train_dates = pd.Index(X_train_date)
if pd.api.types.is_numeric_dtype(train_dates.dtype):
    raise TypeError("X_train_date must hold dates, not row numbers, to look up Prophet forecasts")
prophet_train = final_model.predict(pd.DataFrame({'ds': pd.to_datetime(train_dates)}))

preds_dict = {}
# predict() returns one column per output unit; flatten to one value per sample.
preds_dict['lstm_pred'] = lstm.predict(X_train).ravel()
preds_dict['prophet_pred_microsoft'] = prophet_train['yhat'].to_numpy()

# One row per training sample, one column per sub-model.
hybrid_train = pd.DataFrame(preds_dict)

## Constructing the Hybrid Model

In [None]:
LR = 0.01
#General LSTM model based on randomsearch results
def multiple_models(x_train, y_train):
    """Build, compile and fit the fixed three-layer LSTM regressor.

    Parameters
    ----------
    x_train : array-like
        Either 3-D (samples, time steps, features), or 2-D (samples, features).
        A 2-D table is fed as sequences of length 1, so the same input layout
        must be used when calling `predict` on the returned model.
    y_train : array-like
        One target per sample; reshaped to (samples, -1).

    Returns
    -------
    The fitted Sequential model (linear single-unit output, MSE loss).
    """
    x_train = np.asarray(x_train, dtype='float32')
    if x_train.ndim == 2:
        # e.g. a frame with one column per sub-model prediction
        x_train = x_train[:, np.newaxis, :]
    y_train = np.asarray(y_train, dtype='float32').reshape(len(x_train), -1)

    hybrid = Sequential()
    # The input shape comes from the data actually passed in; it used to be read
    # from the global X_train, which need not match `x_train`.
    hybrid.add(LSTM(units = 8, activation = "relu", input_shape = x_train.shape[1:], return_sequences = True))
    hybrid.add(Dropout(0.5))
    hybrid.add(LSTM(units = 12, activation = 'relu', return_sequences = True))
    hybrid.add(Dropout(0.5))
    hybrid.add(LSTM(units = 12))
    hybrid.add(Dropout(0.5))
    hybrid.add(Dense(units = 1))  # closing parenthesis was missing (SyntaxError)
    hybrid.compile(loss = 'mean_squared_error', optimizer = Adam(learning_rate = LR), metrics = ["mean_squared_error"])
    # fit() below gets no validation data, so 'val_loss' is never reported and
    # early stopping could not trigger; monitor the training loss instead.
    # `baseline=0` is dropped because a squared-error loss cannot improve on 0.
    Early_stop = EarlyStopping(monitor='loss', min_delta=0,
                              patience=25, verbose=1, mode='auto',
                              restore_best_weights=True)
    hybrid.fit(x_train, y_train, epochs = 200, batch_size = 24, callbacks = [Early_stop])
    return hybrid
#train the hybrid model
hybrid1 = multiple_models(hybrid_train, y_train)

# Model Testing: Single-Company Hybrid Model vs. LSTM

In [None]:
# prep google test data
# `frames` was not defined by any cell. The Google frame is taken from `dfs` by
# its ticker value, so it does not depend on the order of the list.
# NOTE(review): confirm `dfs` is the intended source of the per-ticker frames.
google = next(df for df in dfs if (df['ticker'] == 'GOOG').any())
google_target_scaler = MinMaxScaler()
google_target = google_target_scaler.fit_transform(google[['target']])
google_score = google['score']
# Only numeric columns can be min-max scaled and fed to the network.
test_ft = google.drop(['target', 'score', 'ticker'], axis = 1).select_dtypes(include='number').copy()

# Normalise the data

for col in test_ft.columns:
    scaler = MinMaxScaler(feature_range=(0,1))
    # Fixed: the scaled values were written into X_feat (the training features),
    # which corrupted the training frame and left the test features unscaled.
    test_ft[col] = scaler.fit_transform(test_ft[[col]]).ravel()
# NOTE(review): `score` is appended unscaled as the last column here — confirm
# this matches the column order and scaling used for the training features.
test_ft['score'] = google_score
test_ft = np.array(test_ft)
google_ft, google_target = lstm_split(test_ft,google_target,len(google_score))

In [None]:
# NOTE(review): `evaluate_model` does not look like a method of a Keras model —
# the built-in scorer is `evaluate(x, y)`. Presumably these two calls were meant
# to score `lstm` and `hybrid1` on the Google test arrays (`google_ft`,
# `google_target`); confirm the intended inputs for each model and replace both
# calls. TODO.
lstm.evaluate_model()
hybrid1.evaluate_model()

# Expanded Model: Multiple Companies

## Retrieve Data from DB

In [None]:
# Reload the joined data, now for five tickers over the same date range.
# This rebinds `dfs`, replacing the two-ticker list loaded earlier.
dfs = return_data(['MSFT', 'GOOG', 'AMZN', "NFLX", "TSLA"], "2018-01-01", "2023-02-02")

## Create Sub-models

In [None]:
def create_lists(list_of_df):
  """Turn each frame into LSTM windows and next-step targets.

  For every frame: the numeric feature columns (everything except `target`,
  `ticker` and `score`) are min-max scaled one column at a time, `score` is
  appended unscaled as the last column, and the result is cut into windows with
  `lstm_split` over the whole frame.

  Parameters
  ----------
  list_of_df : list of pandas.DataFrame
      Frames with at least the columns `target`, `ticker` and `score`.

  Returns
  -------
  (training_list, target_list)
      Two lists aligned with `list_of_df`, holding the window arrays and the
      target arrays respectively.
  """
  training_list = []
  target_list = []
  # Iterate over the frames themselves; the old code used each frame as an index
  # into the undefined name `frames`.
  for data in list_of_df:
    score = data['score']
    target = data['target']
    training = data.drop(['target', 'ticker', 'score'], axis = 1).select_dtypes(include='number').copy()
    for col in training.columns:
      scaler = MinMaxScaler()
      # scikit-learn expects 2-D input, hence the double brackets.
      training[col] = scaler.fit_transform(training[[col]]).ravel()
    training['score'] = score
    # `steps` is the exclusive end position: passing 10 produced no samples at all.
    data_x, data_y = lstm_split(training, target, len(training))
    training_list.append(data_x)
    target_list.append(data_y)
  return training_list, target_list

#Train each of the lstm models on training companies
# The training tickers are defined here because this cell runs before the one
# that used to define `ticker_list`; Google stays out as the held-out test stock.
ticker_list = ["MSFT", "NFLX", "AMZN", "TSLA"]
# Pair every ticker with its own frame instead of relying on list order.
# NOTE(review): `frames` was undefined; `dfs` is assumed to be the source — confirm.
frames_by_ticker = {df['ticker'].iloc[0]: df for df in dfs if len(df) > 0}
train_frames = [frames_by_ticker[tick] for tick in ticker_list]
training_list, target_list = create_lists(train_frames)
# Loop names differ from the global X_train / y_train so they are not overwritten.
for tick, ticker_x, ticker_y in zip(ticker_list, training_list, target_list):
  # multiple_models takes (x_train, y_train); the ticker is only used for the file name.
  model = multiple_models(ticker_x, ticker_y)
  model.save('/models/'+tick+'_lstm.h5')

In [None]:
#Create Train Data for Hybrid Model Using Microsoft Predictions
# NOTE(review): this function reuses the name `hybrid_train`, which an earlier
# cell binds to a DataFrame; re-running that earlier cell afterwards swaps the
# two. Consider renaming one of them.
def hybrid_train(direct, ticker_list, hybrid_train):
    """Collect the predictions of every saved per-ticker LSTM as one table.

    Parameters
    ----------
    direct : str
        Directory holding the saved models, named `<ticker>_lstm.h5`.
    ticker_list : list of str
        Tickers whose models are loaded; also the column names of the result.
    hybrid_train : array-like
        Input windows passed to each model's `predict`.

    Returns
    -------
    pandas.DataFrame
        One row per input sample and one column per ticker.
    """
    import os
    from keras.models import load_model

    preds_dict = {}
    for tick in ticker_list:
        # Build the path from the ticker: a directory listing returns bare file
        # names in arbitrary order, so pairing it with `ticker_list` by position
        # could label a model's predictions with the wrong ticker.
        model = load_model(os.path.join(direct, tick + '_lstm.h5'))
        # predict() returns one column per output unit; flatten to one value per sample.
        preds_dict[tick] = model.predict(hybrid_train).ravel()
    preds_df = pd.DataFrame(preds_dict)
    return preds_df

ticker_list = ["MSFT", "NFLX", "AMZN", "TSLA"]
hybrid_training_data =  hybrid_train("/models/", ticker_list, X_train)

# The original assignment had no right-hand side. The Prophet column is the
# fitted model's forecast for the dates of the training samples.
# NOTE(review): this needs X_train_date to hold real dates in ascending order
# (Prophet appears to return its forecast sorted by `ds`) — confirm.
train_dates = pd.Index(X_train_date)
if pd.api.types.is_numeric_dtype(train_dates.dtype):
    raise TypeError("X_train_date must hold dates, not row numbers, to look up Prophet forecasts")
hybrid_training_data['prophet'] = final_model.predict(
    pd.DataFrame({'ds': pd.to_datetime(train_dates)})
)['yhat'].to_numpy()

In [None]:
# Fit the hybrid model on the table of sub-model predictions.
# Uses the module-level `y_train`; make sure it still holds the targets that
# match the rows of `hybrid_training_data` when this cell runs.
hybrid_expanded = multiple_models(hybrid_training_data, y_train)

## Test Hybrid Model Using Microsoft Validation Data and Google Test Data

In [None]:
from os import listdir
# `load_model` was not imported by any cell.
from keras.models import load_model

google_test = {}
microsoft_test = {}
path = '/models/'
# Load each model by its ticker-based file name. The old loop called
# `os.listdir` without importing `os`, passed bare file names to the loader, and
# matched files to tickers by listing order.
for tick in ticker_list:
  model = load_model(path + tick + '_lstm.h5')
  # predict() returns one column per output unit; flatten to one value per sample.
  google_test[tick] = model.predict(google_ft).ravel()
  microsoft_test[tick] = model.predict(X_test).ravel()
google_test = pd.DataFrame(google_test)
microsoft_test = pd.DataFrame(microsoft_test)

# Both `prophet` assignments were left without a right-hand side.
# Microsoft: forecast of the fitted Prophet model for the test-sample dates.
# NOTE(review): this needs X_test_date to hold real dates in ascending order
# (Prophet appears to return its forecast sorted by `ds`) — confirm.
test_dates = pd.Index(X_test_date)
if pd.api.types.is_numeric_dtype(test_dates.dtype):
    raise TypeError("X_test_date must hold dates, not row numbers, to look up Prophet forecasts")
microsoft_test['prophet'] = final_model.predict(
    pd.DataFrame({'ds': pd.to_datetime(test_dates)})
)['yhat'].to_numpy()
# TODO(review): no Prophet model is fitted on Google data anywhere in this
# notebook, so `google_test['prophet']` cannot be filled in from existing
# objects. Decide where that forecast should come from and add the column before
# passing `google_test` to the hybrid model.