Main idea: Fetch multiple datasets of indexes and stocks correlated news to that index. (News only go back one year)
Then we train the model on the multiple datasets and test it on the last dataset.

1. Fetch the stock data from Yahoo Finance
2. Fetch the news data from NewsAPI
3. Preprocess stock data. We will give the model open, close, high, low
4. Preprocess news data to sentiment labels. then backfill the sentiment labels so that there are no missing values
5. Merge the stock data and news data
6. Train the model on the multiple datasets
7. Test the model on the last dataset
8. Evaluate the model
9. Save the model
10. Make predictions

In [227]:
#Data fetching 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
from datetime import datetime
from pygooglenews import GoogleNews

In [228]:
List_of_indexes = ['^GSPC', '^DJI', '^IXIC', '^RUT', '^FTSE', '^N225', '^HSI', '^GDAXI', '^OMX']
#List_of_indexes = ['^GSPC', '^DJI', '^IXIC', '^RUT', '^FTSE', '^N225', '^HSI', '^GDAXI', '^OMX', 'XLE', 'AAPL','MSFT','GOOGL','BRK-A','JPM','JNJ','V','WMT','PG','UNH','MA','INTC','VZ','HD','DIS','MRK','KO','PFE','BAC','PEP','CSCO','CMCSA','NFLX','T','ADBE','ABT','XOM','CRM','TMO','ACN','CVX','NKE']


#List_of_indexes = ['^GSPC', '^DJI', '^IXIC', '^RUT', '^FTSE', '^N225', '^HSI', '^GDAXI', '^OMX', 'XLE', 'AAPL','MSFT','AMZN','GOOGL','TSLA','BRK-A','JPM','JNJ','V','WMT','PG','UNH','MA','INTC','VZ','HD','DIS','MRK','KO','PFE','BAC','PEP','CSCO','CMCSA','NFLX','T','NVDA','ADBE','ABT','XOM','CRM','TMO','ACN','CVX','NKE','LLY','COST','ABBV','DHR','NEE','AVGO','MDT','QCOM','TXN','UNP','LIN','UPS','HON','SBUX','AMT','ORCL','LOW','IBM','MO','AMD','AMGN','C','CAT','BA','MMM','GE','GS','MS','FDX','PYPL','CHTR','AMAT','LMT']

#List_of_indexes = ['^GSPC', '^DJI', '^IXIC', '^RUT', '^FTSE', '^N225', '^HSI', '^GDAXI', '^OMX', 'XLE', 'AAPL', 'MSFT', 'AMZN', 'GOOGL', 'TSLA', 'BRK-A', 'JPM', 'JNJ', 'V', 'WMT', 'PG', 'UNH', 'MA', 'INTC', 'VZ', 'HD', 'DIS', 'MRK', 'KO', 'PFE', 'BAC', 'PEP', 'CSCO', 'CMCSA', 'NFLX', 'T', 'NVDA', 'ADBE', 'ABT', 'XOM', 'CRM', 'TMO', 'ACN', 'CVX', 'NKE', 'LLY', 'COST', 'ABBV', 'DHR', 'NEE', 'AVGO', 'MDT', 'QCOM', 'TXN', 'UNP', 'LIN', 'UPS', 'HON', 'SBUX', 'AMT', 'ORCL', 'LOW', 'IBM', 'MO', 'AMD', 'AMGN', 'C', 'CAT', 'BA', 'MMM', 'GE', 'GS', 'MS', 'FDX', 'PYPL', 'CHTR', 'AMAT', 'LMT', 'META', 'SNOW', 'PANW', 'SHOP', 'UBER', 'BP', 'CVX', 'SLB', 'COP', 'ISRG', 'GILD', 'BIIB', 'REGN', 'MCD', 'TGT', 'ROST', 'SCHW', 'BLK', 'AXP', 'BK', 'TSM', 'BABA', 'NTES', 'JD', 'TCEHY', 'NESN.SW', 'RACE', 'ASML', 'SAP.DE', 'PBR', 'VALE', 'ITUB', 'NIO', 'LI', 'RIVN', 'LCID', 'MU', 'ENPH', 'SEDG', 'FSLR', '^VIX', '^RUA', '^AEX', '^STOXX50E', '^BSESN', '^KS11', 'XLV','XLF', 'XLK', 'XLY', 'XLP', 'XLU', 'XLB', 'XLI']

In [229]:
#Fetch stock data
stock_data = {}
for index in List_of_indexes:
    #Manually create the dataframe
    collected_data = yf.download(index, period ="1y")
    df = pd.DataFrame()
    df['Open'] = collected_data['Open']
    df['Close'] = collected_data['Close']
    df['High'] = collected_data['High']
    df['Low'] = collected_data['Low']
    stock_data[index] = df
    


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [230]:
#Fetch news data
news_data = {}
gn = GoogleNews()
for index in List_of_indexes:
    news_data[index] = gn.search(index, when='1y')
    print("Fetched news data for: ", index)
    print("Number of articles: ", len(news_data[index]['entries']))
    #Convert to dataframe
    news_data[index] = pd.DataFrame(news_data[index]['entries'])




Fetched news data for:  ^GSPC
Number of articles:  75
Fetched news data for:  ^DJI
Number of articles:  98
Fetched news data for:  ^IXIC
Number of articles:  100
Fetched news data for:  ^RUT
Number of articles:  96
Fetched news data for:  ^FTSE
Number of articles:  99
Fetched news data for:  ^N225
Number of articles:  56
Fetched news data for:  ^HSI
Number of articles:  100
Fetched news data for:  ^GDAXI
Number of articles:  15
Fetched news data for:  ^OMX
Number of articles:  99


In [231]:
#Clean up the news data wich contains: title,title_detail,links,link,id,guidislink,published,published_parsed,summary,summary_detail,source,sub_articles
#We will put the title through a sentiment analysis model and then put all none existing values as 0
#Finbert model
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

#Test the model
from transformers import pipeline

classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)


news_sentiments = {}

#Add date and sentiment to the dataframe
for index in List_of_indexes:
    news_sentiments[index] = []
    print("Processing news data for: ", index)
    for i in range(len(news_data[index])):
        #print(news_data[index].iloc[i]['title'])
        sentiment = classifier(news_data[index].iloc[i]['title'], top_k=None)
        postive_score = sentiment[0]['score']
        negative_score = sentiment[1]['score']
        neutral_score = sentiment[2]['score']
        #Convert date to dtype='datetime64[ns] so that we can later can merge the dataframes
        reformmated_time_stamp = pd.to_datetime(news_data[index].iloc[i]['published']).date()
        news_sentiments[index] = news_sentiments[index] + [{'Date': reformmated_time_stamp, 'positive_score': postive_score, 'negative_score': negative_score, 'neutral_score': neutral_score}]
    news_sentiments[index] = pd.DataFrame(news_sentiments[index])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Device set to use mps:0


Processing news data for:  ^GSPC
Processing news data for:  ^DJI
Processing news data for:  ^IXIC
Processing news data for:  ^RUT
Processing news data for:  ^FTSE
Processing news data for:  ^N225
Processing news data for:  ^HSI
Processing news data for:  ^GDAXI
Processing news data for:  ^OMX


In [232]:
# Average sentiment for days with multiple articles

for index in List_of_indexes:
    # Group by date and calculate mean sentiment scores
    news_sentiments[index] = (
        news_sentiments[index]
        .groupby('Date')
        .mean()
        .fillna(0)  # Handle NaN values
        .reset_index()  # Ensure 'date' becomes a column
        .set_index('Date')  # Set 'date' as the index
        .sort_index()  # Ensure the index is sorted
    )
    
    
# Define a common date range
common_index = pd.date_range(
    start=pd.Timestamp(min(pd.Timestamp(stock_data[index].index.min()), pd.Timestamp(news_sentiments[index].index.min()))),
    end=pd.Timestamp(max(pd.Timestamp(stock_data[index].index.max()), pd.Timestamp(news_sentiments[index].index.max()))),
    freq='D'  # Daily frequency
)

# Reindex both dataframes so they have the same index
stock_data[index] = stock_data[index].reindex(common_index).fillna(-1)  # Fill missing stock data with 0
news_sentiments[index] = news_sentiments[index].reindex(common_index).fillna(0)  # Fill missing sentiment data with 0

# Combine stock data and news sentiment into one DataFrame
combined_data = {}


# Reindex both dataframes

for index in List_of_indexes:
    # Ensure stock_data and news_sentiments have compatible indices
    stock_data[index].index = pd.to_datetime(stock_data[index].index).normalize()  # Normalize index to date only
    news_sentiments[index].index = pd.to_datetime(news_sentiments[index].index).normalize()  # Normalize index to date only
    
    # Merge stock data and news data
    combined_data[index] = pd.merge(
        stock_data[index], 
        news_sentiments[index], 
        how='left', 
        left_index=True, 
        right_index=True
    )
    
    #Drop all close values that are -1
    combined_data[index] = combined_data[index][combined_data[index].Close != -1]
    
    #Fill all missing values with 0
    combined_data[index] = combined_data[index].fillna(0)
    


In [233]:
#Check size of the dataframes
for index in List_of_indexes:
    print("Size of dataframe for ", index, " is: ", combined_data[index].shape)

Size of dataframe for  ^GSPC  is:  (251, 7)
Size of dataframe for  ^DJI  is:  (251, 7)
Size of dataframe for  ^IXIC  is:  (251, 7)
Size of dataframe for  ^RUT  is:  (251, 7)
Size of dataframe for  ^FTSE  is:  (254, 7)
Size of dataframe for  ^N225  is:  (245, 7)
Size of dataframe for  ^HSI  is:  (246, 7)
Size of dataframe for  ^GDAXI  is:  (254, 7)
Size of dataframe for  ^OMX  is:  (250, 7)


In [234]:
#Reformat the dataframes to be able to train the model
#We will use the last 30 days as testa data and the rest as training data as the data comes from diffrent indexes
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping


def create_sequences(input_data, output_data, timesteps, output_dim):
    xs, ys = [], []
    for i in range(len(input_data) - output_dim - timesteps):
        x = input_data[i:(i + timesteps)]
        y = output_data[i + timesteps:i + output_dim + timesteps]
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

timesteps = 30
input_dim = 7
output_dim = 5 #Predict 5 days ahead


# Initialize global scalers
input_scaler = MinMaxScaler()
output_scaler = MinMaxScaler()

# Gather all data for global scaling
xs = []
ys = []
for index in List_of_indexes: 
    data = combined_data[index]
    # Normalize data
    input_features = data[['Open', 'Close', 'High', 'Low', 'positive_score', 'negative_score', 'neutral_score']].values
    output_feature = data[['Close']].values
    scaler = MinMaxScaler()
    input_features_scaled = scaler.fit_transform(input_features)
    output_feature_scaled = scaler.fit_transform(output_feature)
    
    # Create sequences
    x, y = create_sequences(input_features_scaled, output_feature_scaled, timesteps, output_dim)
    xs.append(x)
    ys.append(y)
    
# Concatenate all sequences
xs = np.concatenate(xs)
ys = np.concatenate(ys)

print(xs.shape, ys.shape)

# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(xs, ys, test_size=0.2, shuffle=False)

print(xs.shape, ys.shape)

# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(xs, ys, test_size=0.1, shuffle=False)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, shuffle=False)


(1938, 30, 7) (1938, 5, 1)
(1938, 30, 7) (1938, 5, 1)


In [235]:
model = Sequential() 
model.add(LSTM(256, input_shape=(timesteps, input_dim)))


model.add(Dense(output_dim))


model.compile(optimizer='adam', loss='mean_squared_error')

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

model.fit(x_train, y_train, epochs=100, batch_size=32, validation_data=(x_val, y_val), callbacks=[early_stopping])





ValueError: Input 0 of layer "lstm_15" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 256)

In [None]:
#Evaluate the model
test_results = model.evaluate(x_test, y_test)
test_loss = test_results
print(f'Test Loss: {test_loss}')

In [None]:
# Make predictions
predictions = model.predict(x_test)

# Reshape y_test from (390, 5, 1) to (390, 5)
y_test_reshaped = y_test.reshape(y_test.shape[0], y_test.shape[1])

# Inverse transform predictions and y_test
predictions_unscaled = scaler.inverse_transform(predictions)
y_test_unscaled = scaler.inverse_transform(y_test_reshaped)

# Plot predictions vs. true values
plt.figure(figsize=(14, 7))

# Plot for each of the 5 prediction days
for i in range(5):  # 5 prediction steps
    plt.plot(
        range(len(predictions_unscaled[:, i])),
        predictions_unscaled[:, i],
        label=f'Predicted Day {i + 1}',
    )
    plt.plot(
        range(len(y_test_unscaled[:, i])),
        y_test_unscaled[:, i],
        linestyle='--',
        label=f'True Day {i + 1}',
    )

plt.ylabel('Close Price')
plt.xlabel('Days')
plt.title('Predicted vs True Close Price Over 5-Day Horizon')
plt.legend()
plt.show()


In [None]:
# Plot predictions and true values in separate subplots
plt.figure(figsize=(14, 20))

start = 0
end = 100



for i in range(5):  # 5 prediction steps
    plt.subplot(5, 1, i + 1)  # Create 5 subplots, one for each prediction day
    plt.plot(
        range(start, end),
        predictions_unscaled[start:end, i],
        label=f'Predicted Day {i + 1}',
    )
    plt.plot(
        range(start, end),
        y_test_unscaled[start:end, i],
        linestyle='--',
        label=f'True Day {i + 1}',
    )
    plt.ylabel('Close Price')
    plt.xlabel('Days')
    plt.title(f'Prediction for Day {i + 1}')
    plt.legend()

plt.tight_layout()
plt.show()


In [None]:
#Save the model to hopsworks
import hopsworks
import os
import warnings
from hsml.schema import Schema
from hsml.model_schema import ModelSchema


In [None]:
with open('./project_api_key.txt', 'r') as file:
    os.environ["HOPSWORKS_API_KEY"] = file.read().rstrip()
    print("API Key is set")
    print("First 5 characters of API Key: ", os.environ["HOPSWORKS_API_KEY"][:5])

In [None]:
model_dir = "FinanceModel"
if not os.path.exists(model_dir):
    os.mkdir(model_dir)
images_dir = model_dir + "/images"
if not os.path.exists(images_dir):
    os.mkdir(images_dir)

In [None]:
#Get loss of model 


In [None]:
project = hopsworks.login() 

model_dir = "Resources/Models"

# Creating input and output schemas using the 'Schema' class for features (X) and target variable (y)
input_schema = Schema(x_train)
output_schema = Schema(y_train)

# Creating a model schema using 'ModelSchema' with the input and output schemas
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

# Converting the model schema to a dictionary representation
schema_dict = model_schema.to_dict()
model.save(model_dir + "/model.keras") 

#Save the scalers as well
import joblib

joblib.dump(scaler, model_dir + "/scaler.pkl") 


mr = project.get_model_registry()

#Get model metrics

res_dict = {
    "MSE" : test_loss
}

aq_model = mr.python.create_model(
    name="FinanceModel", 
    metrics= res_dict,
    model_schema=model_schema,
    description="Predicting stock prices using LSTM model trained on multiple indexes.",
)

# Saving the model artifacts to the 'air_quality_model' directory in the model registry
aq_model.save(model_dir)
